aboutsummaryrefslogtreecommitdiff
path: root/src/core/NEON/kernels/arm_conv
diff options
context:
space:
mode:
Diffstat (limited to 'src/core/NEON/kernels/arm_conv')
-rw-r--r--src/core/NEON/kernels/arm_conv/addressing.cpp333
-rw-r--r--src/core/NEON/kernels/arm_conv/addressing.hpp263
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/depthfirst_driver.hpp308
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/depthwise_common.cpp66
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst.hpp700
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst_generic.hpp315
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst_multiplier.hpp604
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/depthwise_fp16.cpp356
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/depthwise_fp32.cpp539
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/depthwise_implementation.hpp148
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/depthwise_implementation_constraints.hpp165
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/depthwise_planar.hpp408
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/depthwise_s8q.cpp351
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/depthwise_strategies_common.cpp62
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/depthwise_strategies_common.hpp97
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/depthwise_u8q.cpp362
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/depthwise_u8s8u8q.cpp234
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/interleaves/a64_s8q_3x3_dot.cpp244
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/interleaves/a64_u8q_3x3_dot.cpp244
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/interleaves/generic.cpp152
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/interleaves/generic.hpp82
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/interleaves/generic_quantized_dot_product.cpp161
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/interleaves/generic_quantized_dot_product.hpp53
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp59
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/interleaves/sve_s8q_3x3_dot.cpp135
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/interleaves/sve_u8q_3x3_dot.cpp135
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp71
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp723
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp697
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp71
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp1158
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp1291
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp71
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp1736
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp2007
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp71
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp895
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp897
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp71
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_direct.cpp1387
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_indirect.cpp1427
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_generic_output9_mla_depthfirst.hpp51
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_generic_output9_mla_depthfirst/generic.cpp520
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp52
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp1044
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp71
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp527
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp513
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp71
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp828
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp905
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp71
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp1232
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp1397
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp71
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp615
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp629
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp71
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_direct.cpp991
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_indirect.cpp1043
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_generic_output9_mla_depthfirst.hpp51
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_generic_output9_mla_depthfirst/generic.cpp376
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst.hpp61
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst/generic.cpp533
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst.hpp61
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst/generic.cpp917
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp52
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp850
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp76
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp1658
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp71
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp1166
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp72
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp1397
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp73
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp2187
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_generic_output9_mla_depthfirst.hpp51
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_generic_output9_mla_depthfirst/generic.cpp618
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst.hpp61
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst/generic.cpp519
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst.hpp61
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst/generic.cpp640
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp52
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp1480
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp76
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp1484
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp76
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp1658
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp71
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp1166
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp71
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp1397
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp71
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp2187
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_generic_output9_mla_depthfirst.hpp51
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_generic_output9_mla_depthfirst/generic.cpp618
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst.hpp61
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst/generic.cpp519
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst.hpp61
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst/generic.cpp640
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp52
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp1480
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8qa_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp71
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8qa_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp1164
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8qa_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp71
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8qa_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp1395
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8qa_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp71
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8qa_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp2185
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp71
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp1166
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp73
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp1397
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp71
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp2187
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_generic_output9_mla_depthfirst.hpp51
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_generic_output9_mla_depthfirst/generic.cpp618
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp52
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp1480
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp67
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp336
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp277
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp67
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp483
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp444
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp67
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp672
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp653
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp67
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp374
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp318
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp67
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_direct.cpp586
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_indirect.cpp537
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp71
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp336
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp277
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp71
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp483
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp444
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp71
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp672
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp653
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp71
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp374
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp318
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_planar_3x3_s1_4rows_mla_za.hpp74
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_planar_3x3_s1_4rows_mla_za/generic.cpp455
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_planar_3x3_s2_4rows_mla_za.hpp74
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_planar_3x3_s2_4rows_mla_za/generic.cpp650
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_planar_5x5_s1_4rows_mla_za.hpp74
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_planar_5x5_s1_4rows_mla_za/generic.cpp883
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_planar_5x5_s2_4rows_mla_za.hpp74
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_planar_5x5_s2_4rows_mla_za/generic.cpp1172
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32bf16fp32_planar_3x3_s1_4rows_dot_za.hpp74
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32bf16fp32_planar_3x3_s1_4rows_dot_za/generic.cpp560
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32bf16fp32_planar_3x3_s2_4rows_dot_za.hpp74
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32bf16fp32_planar_3x3_s2_4rows_dot_za/generic.cpp763
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32bf16fp32_planar_5x5_s1_4rows_dot_za.hpp74
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32bf16fp32_planar_5x5_s1_4rows_dot_za/generic.cpp1151
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32bf16fp32_planar_5x5_s2_4rows_dot_za.hpp74
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32bf16fp32_planar_5x5_s2_4rows_dot_za/generic.cpp1246
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_3x3_s1_4rows_dot_za.hpp72
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_3x3_s1_4rows_dot_za/generic.cpp664
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_3x3_s2_4rows_dot_za.hpp72
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_3x3_s2_4rows_dot_za/generic.cpp881
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_5x5_s1_4rows_dot_za.hpp72
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_5x5_s1_4rows_dot_za/generic.cpp1204
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_5x5_s2_4rows_dot_za.hpp72
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_5x5_s2_4rows_dot_za/generic.cpp1354
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_3x3_s1_4rows_dot_za.hpp72
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_3x3_s1_4rows_dot_za/generic.cpp664
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_3x3_s2_4rows_dot_za.hpp72
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_3x3_s2_4rows_dot_za/generic.cpp881
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_5x5_s1_4rows_dot_za.hpp72
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_5x5_s1_4rows_dot_za/generic.cpp1204
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_5x5_s2_4rows_dot_za.hpp72
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_5x5_s2_4rows_dot_za/generic.cpp1354
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_3x3_s1_4rows_dot_za.hpp72
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_3x3_s1_4rows_dot_za/generic.cpp664
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_3x3_s2_4rows_dot_za.hpp72
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_3x3_s2_4rows_dot_za/generic.cpp881
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_5x5_s1_4rows_dot_za.hpp72
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_5x5_s1_4rows_dot_za/generic.cpp1204
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_5x5_s2_4rows_dot_za.hpp72
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_5x5_s2_4rows_dot_za/generic.cpp1354
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp71
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp316
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp296
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp71
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp460
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp477
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp71
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp656
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp714
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp71
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp337
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp337
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp71
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_direct.cpp523
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_indirect.cpp551
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp71
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp316
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp296
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp71
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp460
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp477
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp71
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp656
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp714
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp71
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp337
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp337
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp71
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_direct.cpp523
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_indirect.cpp551
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_generic_output9_mla_depthfirst.hpp51
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_generic_output9_mla_depthfirst/generic.cpp166
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst.hpp61
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst/generic.cpp259
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst.hpp61
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst/generic.cpp392
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp52
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp454
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp76
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp497
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp62
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp410
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp62
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp451
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp62
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp652
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst.hpp61
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst/generic.cpp339
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst.hpp61
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst/generic.cpp402
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp76
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp436
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp76
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp497
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp62
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp410
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp62
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp451
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp62
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp652
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst.hpp61
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst/generic.cpp339
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst.hpp61
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst/generic.cpp402
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp62
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp410
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp62
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp451
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp62
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp652
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/premultiply.cpp84
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/working_space.hpp461
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/depthfirst_driver.hpp299
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst.hpp31
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst/generic.cpp295
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_avg_generic_depthfirst.hpp16
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_avg_generic_depthfirst/generic.cpp409
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_max_2x2_s1_output2x2_depthfirst.hpp31
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp179
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_max_generic_depthfirst.hpp16
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_max_generic_depthfirst/generic.cpp399
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst.hpp31
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst/generic.cpp295
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_avg_generic_depthfirst.hpp16
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_avg_generic_depthfirst/generic.cpp361
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_max_2x2_s1_output2x2_depthfirst.hpp31
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp179
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_max_generic_depthfirst.hpp16
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_max_generic_depthfirst/generic.cpp351
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8_nhwc_avg_generic_depthfirst.hpp16
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8_nhwc_avg_generic_depthfirst/generic.cpp428
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8_nhwc_max_2x2_s1_output2x2_depthfirst.hpp31
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp179
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8_nhwc_max_generic_depthfirst.hpp16
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8_nhwc_max_generic_depthfirst/generic.cpp491
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8q_nhwc_avg_generic_depthfirst.hpp16
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8q_nhwc_avg_generic_depthfirst/generic.cpp434
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8q_nhwc_max_generic_depthfirst.hpp16
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8q_nhwc_max_generic_depthfirst/generic.cpp734
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8_nhwc_avg_generic_depthfirst.hpp16
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8_nhwc_avg_generic_depthfirst/generic.cpp424
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8_nhwc_max_2x2_s1_output2x2_depthfirst.hpp31
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp179
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8_nhwc_max_generic_depthfirst.hpp16
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8_nhwc_max_generic_depthfirst/generic.cpp491
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8q_nhwc_avg_generic_depthfirst.hpp16
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8q_nhwc_avg_generic_depthfirst/generic.cpp493
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8q_nhwc_max_generic_depthfirst.hpp16
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8q_nhwc_max_generic_depthfirst/generic.cpp916
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/cpp_nhwc_1x1_stride_any_depthfirst.hpp15
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/cpp_nhwc_1x1_stride_any_depthfirst/generic.cpp16
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst.hpp51
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst/generic.cpp209
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp16_nhwc_avg_generic_depthfirst.hpp46
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp16_nhwc_avg_generic_depthfirst/generic.cpp233
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp16_nhwc_max_2x2_s1_output2x2_depthfirst.hpp51
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp16_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp148
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp16_nhwc_max_generic_depthfirst.hpp46
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp16_nhwc_max_generic_depthfirst/generic.cpp225
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst.hpp51
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst/generic.cpp209
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp32_nhwc_avg_generic_depthfirst.hpp46
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp32_nhwc_avg_generic_depthfirst/generic.cpp233
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp32_nhwc_max_2x2_s1_output2x2_depthfirst.hpp51
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp32_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp148
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp32_nhwc_max_generic_depthfirst.hpp46
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp32_nhwc_max_generic_depthfirst/generic.cpp225
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8_nhwc_avg_generic_depthfirst.hpp46
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8_nhwc_avg_generic_depthfirst/generic.cpp419
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8_nhwc_max_2x2_s1_output2x2_depthfirst.hpp51
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp148
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8_nhwc_max_generic_depthfirst.hpp46
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8_nhwc_max_generic_depthfirst/generic.cpp225
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8q_nhwc_avg_generic_depthfirst.hpp46
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8q_nhwc_avg_generic_depthfirst/generic.cpp460
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8q_nhwc_max_generic_depthfirst.hpp46
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8q_nhwc_max_generic_depthfirst/generic.cpp388
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8_nhwc_avg_generic_depthfirst.hpp46
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8_nhwc_avg_generic_depthfirst/generic.cpp419
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8_nhwc_max_2x2_s1_output2x2_depthfirst.hpp51
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp148
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8_nhwc_max_generic_depthfirst.hpp46
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8_nhwc_max_generic_depthfirst/generic.cpp225
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8q_nhwc_avg_generic_depthfirst.hpp46
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8q_nhwc_avg_generic_depthfirst/generic.cpp489
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8q_nhwc_max_generic_depthfirst.hpp46
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8q_nhwc_max_generic_depthfirst/generic.cpp418
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst.hpp31
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst/generic.cpp208
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp16_nhwc_avg_generic_depthfirst.hpp20
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp16_nhwc_avg_generic_depthfirst/generic.cpp264
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp16_nhwc_max_2x2_s1_output2x2_depthfirst.hpp31
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp16_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp132
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp16_nhwc_max_generic_depthfirst.hpp20
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp16_nhwc_max_generic_depthfirst/generic.cpp276
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst.hpp31
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst/generic.cpp208
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp32_nhwc_avg_generic_depthfirst.hpp20
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp32_nhwc_avg_generic_depthfirst/generic.cpp264
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp32_nhwc_max_2x2_s1_output2x2_depthfirst.hpp31
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp32_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp132
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp32_nhwc_max_generic_depthfirst.hpp20
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp32_nhwc_max_generic_depthfirst/generic.cpp276
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8_nhwc_avg_generic_depthfirst.hpp20
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8_nhwc_avg_generic_depthfirst/generic.cpp365
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8_nhwc_max_2x2_s1_output2x2_depthfirst.hpp31
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp132
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8_nhwc_max_generic_depthfirst.hpp20
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8_nhwc_max_generic_depthfirst/generic.cpp276
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8q_nhwc_avg_generic_depthfirst.hpp20
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8q_nhwc_avg_generic_depthfirst/generic.cpp449
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8q_nhwc_max_generic_depthfirst.hpp20
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8q_nhwc_max_generic_depthfirst/generic.cpp560
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8_nhwc_avg_generic_depthfirst.hpp20
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8_nhwc_avg_generic_depthfirst/generic.cpp365
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8_nhwc_max_2x2_s1_output2x2_depthfirst.hpp31
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp132
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8_nhwc_max_generic_depthfirst.hpp20
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8_nhwc_max_generic_depthfirst/generic.cpp276
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8q_nhwc_avg_generic_depthfirst.hpp20
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8q_nhwc_avg_generic_depthfirst/generic.cpp461
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8q_nhwc_max_generic_depthfirst.hpp20
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8q_nhwc_max_generic_depthfirst/generic.cpp658
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/pooling_depthfirst.hpp454
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/pooling_depthfirst_cache_oblivious.hpp312
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/pooling_depthfirst_generic.hpp412
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/pooling_depthfirst_generic_quantized.hpp256
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/pooling_fp16.cpp126
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/pooling_fp32.cpp126
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/pooling_implementation.hpp23
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/pooling_s8.cpp107
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/pooling_s8q.cpp74
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/pooling_u8.cpp113
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/pooling_u8q.cpp74
379 files changed, 130509 insertions, 8063 deletions
diff --git a/src/core/NEON/kernels/arm_conv/addressing.cpp b/src/core/NEON/kernels/arm_conv/addressing.cpp
new file mode 100644
index 0000000000..2460398880
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/addressing.cpp
@@ -0,0 +1,333 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "addressing.hpp"
+#include "utils.hpp"
+#include <algorithm>
+#include <cstring>
+
+namespace arm_conv {
+namespace addressing {
+
+void fill_pointer_array(
+ size_t element_size,
+ void **dest_raw, const unsigned int array_rows, const unsigned int array_cols,
+ void *base_ptr_raw, size_t ld_row, size_t ld_col,
+ void *pad_buffer_raw,
+ const unsigned int pad_top, const unsigned int valid_rows,
+ const unsigned int pad_left, const unsigned int valid_cols
+)
+{
+ auto dest = reinterpret_cast<char **>(dest_raw);
+ auto base_ptr = reinterpret_cast<char *>(base_ptr_raw);
+ auto pad_buffer = reinterpret_cast<char *>(pad_buffer_raw);
+ ld_row *= element_size;
+ ld_col *= element_size;
+
+ const auto last_valid_row = std::min(pad_top + valid_rows, array_rows);
+ const auto last_valid_col = std::min(pad_left + valid_cols, array_cols);
+
+ unsigned int i = 0;
+ for (; i < pad_top; i++)
+ {
+ for (unsigned int j = 0; j < array_cols; j++)
+ {
+ *(dest++) = pad_buffer;
+ }
+ }
+ for (; i < last_valid_row; i++)
+ {
+ unsigned int j = 0;
+ auto colptr = base_ptr;
+ base_ptr += ld_row;
+
+ for (; j < pad_left; j++)
+ {
+ *(dest++) = pad_buffer;
+ }
+ for (; j < last_valid_col; j++)
+ {
+ *(dest++) = colptr;
+ colptr += ld_col;
+ }
+ for (; j < array_cols; j++)
+ {
+ *(dest++) = pad_buffer;
+ }
+ }
+ for (; i < array_rows; i++)
+ {
+ for (unsigned int j = 0; j < array_cols; j++)
+ {
+ *(dest++) = pad_buffer;
+ }
+ }
+}
+
+
+void fill_pointer_array_generic_kernel(
+ const size_t element_size,
+ void **dest_raw,
+ const unsigned int output_rows, const unsigned int output_cols,
+ const unsigned int kernel_rows, const unsigned int kernel_cols,
+ const unsigned int stride_rows, const unsigned int stride_cols,
+ void *base_ptr_raw, size_t ld_row, size_t ld_col,
+ void *pad_buffer_raw,
+ const unsigned int pad_top, const unsigned int valid_rows,
+ const unsigned int pad_left, const unsigned int valid_cols
+)
+{
+ auto dest = reinterpret_cast<char **>(dest_raw);
+ auto base_ptr = reinterpret_cast<char *>(base_ptr_raw);
+ auto pad_buffer = reinterpret_cast<char *>(pad_buffer_raw);
+ ld_row *= element_size;
+ ld_col *= element_size;
+
+ const auto last_valid_row = pad_top + valid_rows;
+ const auto last_valid_col = pad_left + valid_cols;
+ const auto point_stride = output_rows * output_cols;
+
+ // Iterate over the output points, after every point increment the pointer
+ // into the address array.
+ for (unsigned int oi = 0; oi < output_rows; oi++)
+ {
+ for (unsigned int oj = 0; oj < output_cols; oj++)
+ {
+ auto point_dest = dest;
+ dest++;
+
+ // Iterate over kernel points and fill in the pointer array.
+ unsigned int ki = 0, ii = oi*stride_rows;
+ for (; ii < pad_top && ki < kernel_rows; ii++, ki++)
+ {
+ // Fill with padding
+ for (unsigned int j = 0; j < kernel_cols; j++)
+ {
+ *point_dest = pad_buffer;
+ point_dest += point_stride;
+ }
+ }
+ for (; ii < last_valid_row && ki < kernel_rows; ii++, ki++)
+ {
+ unsigned int kj = 0, ij = oj*stride_cols;
+ for (; ij < pad_left && kj < kernel_cols; ij++, kj++)
+ {
+ // Padding
+ *point_dest = pad_buffer;
+ point_dest += point_stride;
+ }
+ for (; ij < last_valid_col && kj < kernel_cols; ij++, kj++)
+ {
+ *point_dest = base_ptr + (ii - pad_top)*ld_row + (ij - pad_left)*ld_col;
+ point_dest += point_stride;
+ }
+ for (; kj < kernel_cols; kj++)
+ {
+ // Padding
+ *point_dest = pad_buffer;
+ point_dest += point_stride;
+ }
+ }
+ for (; ki < kernel_rows; ki++)
+ {
+ // Fill with padding
+ for (unsigned int j = 0; j < kernel_cols; j++)
+ {
+ *point_dest = pad_buffer;
+ point_dest += point_stride;
+ }
+ }
+ }
+ }
+}
+
+/* Patch array constructor
+ *
+ * Some depthwise kernels require an NCHW-ordered patch of input. Here we
+ * construct such a patch, and fill in an array of pointers to the rows of the
+ * patch.
+ */
+void fill_nchw_patch_array(
+ size_t element_size,
+ const void **dest_row_pointers_raw, // Array of pointers to each row of the patch
+ void *dest_patch_raw, // Pointer to space which can be used to construct the patch
+ const unsigned int patch_rows, unsigned int patch_cols, // Patch size
+ const void *src_ptr_raw, size_t ld_row, size_t ld_col, // Source tensor
+ const void *pad_row, // Pointer to a row of padding values
+ const unsigned int pad_top, const unsigned int valid_rows,
+ const unsigned int pad_left, const unsigned int valid_cols
+)
+{
+ // Convert into more useful types
+ auto row_pointers = reinterpret_cast<const char **>(dest_row_pointers_raw);
+ auto dest_patch = reinterpret_cast<char *>(dest_patch_raw);
+ auto src = reinterpret_cast<const char *>(src_ptr_raw);
+ ld_row *= element_size;
+ ld_col *= element_size;
+
+ // Round up the patch columns to be a full quad
+ patch_cols = arm_gemm::roundup<unsigned int>(patch_cols, 16 / element_size);
+
+ const auto last_valid_row = std::min(pad_top + valid_rows, patch_rows);
+ const auto last_valid_col = std::min(pad_left + valid_cols, patch_cols);
+
+ // Construct the patch and row pointer array together
+ unsigned int i = 0;
+ for (; i < pad_top; i++)
+ {
+ // Insert pointers into the padding row
+ *(row_pointers++) = reinterpret_cast<const char *>(pad_row);
+ }
+ for (; i < last_valid_row; i++)
+ {
+ // Get a copy of the pointer for this row
+ auto colptr = src;
+ src += ld_row;
+
+ // If the input is already in NCHW format (ld_col == element_size) AND
+ // there is no padding, then we just use a pointer to the source tensor;
+ // otherwise we need to construct a patch and provide a pointer to it.
+ if (ld_col == element_size && pad_left == 0 && last_valid_col == patch_cols)
+ {
+ *(row_pointers++) = colptr;
+ }
+ else
+ {
+ auto patch_col = dest_patch;
+ *(row_pointers++) = dest_patch;
+ dest_patch += element_size * patch_cols; // Move the patch pointer on
+
+ // Construct the patch; fill the entirety with padding and then copy in
+ // the valid elements.
+ memcpy(patch_col, pad_row, element_size * patch_cols);
+ patch_col += pad_left * element_size; // Move over the left padding
+
+ if (ld_col == element_size)
+ {
+ // If the input is NCHW then copy across as many columns as we can.
+ memcpy(patch_col, colptr, (last_valid_col - pad_left) * element_size);
+ }
+ else
+ {
+ // If the input is NHWC then copy columns across in turn.
+ for (auto j = pad_left; j < last_valid_col; j++)
+ {
+ memcpy(patch_col, colptr, element_size); // Copy the valid element
+ patch_col += element_size; // Progress the patch destination
+ colptr += ld_col; // Progress the patch source
+ }
+ }
+ }
+ }
+ for (; i < patch_rows; i++)
+ {
+ // Insert pointers into the padding row
+ *(row_pointers++) = reinterpret_cast<const char *>(pad_row);
+ }
+}
+
+
+/* Patch array constructor (generic kernels)
+ *
+ * Construct an array of pointers; one pointer for each output row for each
+ * kernel point. Pointers should point at a whole number of QUADS containing an
+ * input point for each output point. If the kernel column stride is 1 and the
+ * data is NCHW then the input tensor might be addressed directly, otherwise a
+ * new patch sample might need to be constructed.
+ */
+void fill_patch_array_generic_kernel(
+ size_t element_size,
+ const void **dest_pointers_raw, // Pointers: one per output row per kernel point
+ void *patch_raw, // Pointer to space which can be used to construct the patch
+ const unsigned int output_rows, const unsigned int output_cols,
+ const unsigned int kernel_rows, const unsigned int kernel_cols,
+ const unsigned int stride_rows, const unsigned int stride_cols,
+ const void *src_ptr_raw, size_t ld_row, size_t ld_col, // Source tensor
+ const void *pad_row, // Pointer to a row of padding values
+ const unsigned int pad_top, const unsigned int valid_rows,
+ const unsigned int pad_left, const unsigned int valid_cols
+)
+{
+ auto dest = reinterpret_cast<const char **>(dest_pointers_raw);
+ auto patch = reinterpret_cast<char *>(patch_raw);
+ auto src_ptr = reinterpret_cast<const char *>(src_ptr_raw);
+ ld_row *= element_size;
+ ld_col *= element_size;
+
+ // Round up the patch columns to a multiple of quad-length
+ const auto patch_cols = arm_gemm::roundup<unsigned int>(output_cols, 16 / element_size);
+
+ const auto input_rows = kernel_rows + (output_rows - 1) * stride_rows;
+ const auto last_valid_row = std::min(pad_top + valid_rows, input_rows);
+
+ const auto input_cols = kernel_cols + (output_cols - 1) * stride_cols;
+ const auto last_valid_col = std::min(pad_left + valid_cols, input_cols);
+
+ for (auto ki = 0u; ki < kernel_rows; ki++)
+ {
+ for (auto kj = 0u; kj < kernel_cols; kj++)
+ {
+ auto oi = 0u, ii = ki;
+ for (; oi < output_rows && ii < pad_top; oi++, ii += stride_rows)
+ {
+ // Insert a pointer to the padding row
+ *(dest++) = reinterpret_cast<const char *>(pad_row);
+ }
+ for (; oi < output_rows && ii < last_valid_row; oi++, ii += stride_rows)
+ {
+ auto rowptr = src_ptr + (ii - pad_top) * ld_row;
+
+ // Construct a sample of the input here
+ auto patch_pos = patch;
+ *(dest++) = patch;
+ patch += patch_cols * element_size;
+
+ // Fill with padding
+ memcpy(patch_pos, pad_row, patch_cols * element_size);
+
+ // Fill in the valid elements
+ auto oj = 0u, ij = kj;
+ for (; oj < patch_cols && ij < pad_left; oj++, ij += stride_cols)
+ {
+ // Do nothing for padding
+ patch_pos += element_size;
+ }
+ for (; oj < patch_cols && ij < last_valid_col; oj++, ij += stride_cols)
+ {
+ // Copy from the source tensor
+ memcpy(patch_pos, rowptr + (ij - pad_left)*ld_col, element_size);
+ patch_pos += element_size;
+ }
+ // No action required for right-hand padding
+ }
+ for (; oi < output_rows; oi++)
+ {
+ *(dest++) = reinterpret_cast<const char *>(pad_row);
+ }
+ }
+ }
+}
+
+} // namespace addressing
+} // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/addressing.hpp b/src/core/NEON/kernels/arm_conv/addressing.hpp
new file mode 100644
index 0000000000..35715a3764
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/addressing.hpp
@@ -0,0 +1,263 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+/* arm_conv kernels share a lot of similarities in how they address input and
+ * output tensors. Consequently, this file contains common approaches to
+ * preparing these tensor descriptions. Generic (i.e., untyped) methods are
+ * contained within the `arm_conv::addressing` namespace, and typed wrappers
+ * are provided within an anonymous namespace within `arm_conv`. The various
+ * methods are described below.
+ */
+
+#include <cstddef>
+
+namespace arm_conv {
+namespace addressing {
+
+/* Pointer array
+ * -------------
+ *
+ * Constructs an array of pointers which point to a `array_rows` x `array_cols`
+ * chunk of a tensor. The array of pointers will be written into `dest`.
+ *
+ * `base_ptr` should point at the first VALID element of the chunk of tensor
+ * (i.e., if there's one padded row, and one padded column, then `base_ptr`
+ * should point at the element which will be at position (1, 1) in the array).
+ * `ld_row` and `ld_col` are in bytes, and describe the strides over rows and
+ * columns (respectively) of the NHWC-ordered tensor. `pad_buffer` should point
+ * at a suitably sized (and initialised) area of memory which can be addressed
+ * by elements of the array which represent padding.
+ *
+ * `pad_top` and `pad_left` describe the padding on the top and left of the
+ * array, respectively, and `valid_rows` and `valid_cols` describe the number
+ * of rows and columns between the element pointed to by `base_ptr` and the
+ * edge of the image (that is `valid_rows` may be greater than `array_rows` and
+ * likewise for the columns).
+ */
+void fill_pointer_array(
+ size_t element_size,
+ void **dest, unsigned int array_rows, unsigned int array_cols,
+ void *base_ptr, size_t ld_row, size_t ld_col,
+ void *pad_buffer,
+ unsigned int pad_top, unsigned int valid_rows,
+ unsigned int pad_left, unsigned int valid_cols
+);
+
+/* Interleaved multi-point pointer array
+ * -------------------------------------
+ *
+ * For each point in a `output_rows` x `output_cols` array, constructs
+ * `kernel_rows` x `kernel_cols` array of pointers. The pointers are
+ * interleaved thusly:
+ *
+ * for ki in kernel_rows:
+ * for kj in kernel_cols:
+ * for oi in output_rows:
+ * for oj in output_cols:
+ * get pointer for point (oi*stride_rows + ki, oj*stride_cols + kj)
+ *
+ * Other arguments are as for `fill_pointer_array`.
+ *
+ * The name reflects that this is the form of addressing mode used by "generic"
+ * depthwise and pooling kernels.
+ */
+void fill_pointer_array_generic_kernel(
+ size_t element_size,
+ void **dest,
+ unsigned int output_rows, unsigned int output_cols,
+ unsigned int kernel_rows, unsigned int kernel_cols,
+ unsigned int stride_rows, unsigned int stride_cols,
+ void *base_ptr, size_t ld_row, size_t ld_col,
+ void *pad_buffer,
+ unsigned int pad_top, unsigned int valid_rows,
+ unsigned int pad_left, unsigned int valid_cols
+);
+
+/* NCHW-patch addressed by row
+ * ---------------------------
+ *
+ * Construct an array of pointers, each of which points at a row of an
+ * NCHW-ordered patch of a tensor. Memory addressed by the pointers may be
+ * outside of the original tensor, and should therefore not be written to
+ * (modifications will be lost).
+ *
+ * `dest_row_pointers` should point at a `patch_rows` list of pointers; each of
+ * which will point at a 1 x `patch_cols` NCHW-ordered sample of the source
+ * tensor.
+ *
+ * `dest_patch` should point to a `element_size * patch_rows * patch_cols` area
+ * of memory which can be written to by this function to form samples of the
+ * source tensor.
+ *
+ * `src_ptr` should point at the first VALID element of the chunk of tensor
+ * (i.e., if there's one padded row, and one padded column, then `src_ptr`
+ * should point at the element which will be at position (1, 1) in the array).
+ * `ld_row` and `ld_col` are in bytes, and describe the strides over rows and
+ * columns (respectively) of the NHWC-ordered tensor. If `ld_col` ==
+ * `element_size` then copies from the source tensor will be elided and source
+ * data may be addressed directly.
+ *
+ * `pad_row` should point to a `patch_cols` array of (appropriately
+ * initialised) padding values.
+ *
+ * Other arguments are as for `fill_pointer_array`.
+ */
+void fill_nchw_patch_array(
+ size_t element_size,
+ const void **dest_row_pointers, // Array of pointers to each row of the patch
+ void *dest_patch, // Pointer to space which can be used to construct the patch
+ unsigned int patch_rows, unsigned int patch_cols, // Patch size
+ const void *src_ptr, size_t ld_row, size_t ld_col, // Source tensor
+ const void *pad_row, // Pointer to a row of padding values
+ unsigned int pad_top, unsigned int valid_rows,
+ unsigned int pad_left, unsigned int valid_cols
+);
+
+void fill_patch_array_generic_kernel(
+ size_t element_size,
+ const void **dest_pointers, // Pointers: one per output row per kernel point
+ void *dest_patch, // Pointer to space which can be used to construct the patch
+ unsigned int output_rows, unsigned int output_cols,
+ unsigned int kernel_rows, unsigned int kernel_cols,
+ unsigned int stride_rows, unsigned int stride_cols,
+ const void *src_ptr, size_t ld_row, size_t ld_col, // Source tensor
+ const void *pad_row, // Pointer to a row of padding values
+ unsigned int pad_top, unsigned int valid_rows,
+ unsigned int pad_left, unsigned int valid_cols
+);
+
+} // namespace addressing
+
+namespace {
+
+/* Pointer array
+ * -------------
+ *
+ * See `addressing::fill_pointer_array`. No copies are made by this method,
+ * memory pointed to by the pointer array is contained within the base tensor
+ * and the padding buffer.
+ */
+template <typename T>
+inline void fill_pointer_array(
+ T **dest, unsigned int array_rows, unsigned int array_cols,
+ T *base_ptr, size_t ld_row, size_t ld_col,
+ T *pad_buffer,
+ unsigned int pad_top, unsigned int valid_rows,
+ unsigned int pad_left, unsigned int valid_cols
+)
+{
+ addressing::fill_pointer_array(
+ sizeof(T), (void **) dest, array_rows, array_cols,
+ (void *) base_ptr, ld_row, ld_col,
+ (void *) pad_buffer,
+ pad_top, valid_rows,
+ pad_left, valid_cols
+ );
+}
+
+
+/* Interleaved multi-point pointer array
+ * -------------------------------------
+ *
+ * See `addressing::fill_pointer_array_generic_kernel`. No copies are made by
+ * this method, memory pointed to by the pointer array is contained within the
+ * base tensor and the padding buffer.
+ */
+template <typename T>
+inline void fill_pointer_array_generic_kernel(
+ T **dest,
+ unsigned int output_rows, unsigned int output_cols,
+ unsigned int kernel_rows, unsigned int kernel_cols,
+ unsigned int stride_rows, unsigned int stride_cols,
+ T *base_ptr, size_t ld_row, size_t ld_col,
+ T *pad_buffer,
+ unsigned int pad_top, unsigned int valid_rows,
+ unsigned int pad_left, unsigned int valid_cols
+)
+{
+ addressing::fill_pointer_array_generic_kernel(
+ sizeof(T),
+ (void **) dest,
+ output_rows, output_cols,
+ kernel_rows, kernel_cols,
+ stride_rows, stride_cols,
+ (void *) base_ptr, ld_row, ld_col,
+ (void *) pad_buffer,
+ pad_top, valid_rows,
+ pad_left, valid_cols
+ );
+}
+
+template <typename T>
+inline void fill_nchw_patch_array(
+ const T **dest_row_pointers, // Array of pointers to each row of the patch
+ T *dest_patch, // Pointer to space which can be used to construct the patch
+ unsigned int patch_rows, unsigned int patch_cols, // Patch size
+ const T *src_ptr, size_t ld_row, size_t ld_col, // Source tensor
+ const T *pad_row, // Pointer to a row of padding values
+ unsigned int pad_top, unsigned int valid_rows,
+ unsigned int pad_left, unsigned int valid_cols
+)
+{
+ addressing::fill_nchw_patch_array(
+ sizeof(T),
+ reinterpret_cast<const void **>(dest_row_pointers),
+ reinterpret_cast<void *>(dest_patch),
+ patch_rows, patch_cols,
+ reinterpret_cast<const void *>(src_ptr), ld_row, ld_col,
+ reinterpret_cast<const void *>(pad_row),
+ pad_top, valid_rows,
+ pad_left, valid_cols
+ );
+}
+
+template <typename T>
+inline void fill_patch_array_generic_kernel(
+ const T **dest_pointers, // Pointers: one per output row per kernel point
+ T *dest_patch, // Pointer to space which can be used to construct the patch
+ unsigned int output_rows, unsigned int output_cols,
+ unsigned int kernel_rows, unsigned int kernel_cols,
+ unsigned int stride_rows, unsigned int stride_cols,
+ const T *src_ptr, size_t ld_row, size_t ld_col, // Source tensor
+ const T *pad_row, // Pointer to a row of padding values
+ unsigned int pad_top, unsigned int valid_rows,
+ unsigned int pad_left, unsigned int valid_cols
+)
+{
+ addressing::fill_patch_array_generic_kernel(
+ sizeof(T),
+ reinterpret_cast<const void **>(dest_pointers),
+ reinterpret_cast<void *>(dest_patch),
+ output_rows, output_cols,
+ kernel_rows, kernel_cols,
+ stride_rows, stride_cols,
+ reinterpret_cast<const void *>(src_ptr), ld_row, ld_col,
+ reinterpret_cast<const void *>(pad_row),
+ pad_top, valid_rows,
+ pad_left, valid_cols
+ );
+}
+
+} // namespace {anonymous}
+} // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/depthfirst_driver.hpp b/src/core/NEON/kernels/arm_conv/depthwise/depthfirst_driver.hpp
new file mode 100644
index 0000000000..95ece8cdc8
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/depthfirst_driver.hpp
@@ -0,0 +1,308 @@
+/*
+ * Copyright (c) 2022-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#include "depthwise.hpp"
+#include "utils.hpp"
+
+namespace arm_conv {
+namespace depthwise {
+
+// Trait: default accumulator type for a given input element type.
+// Quantized (u)int8 inputs accumulate into int32_t; everything else
+// accumulates in its own type.
+template <typename T> struct DefaultTAccum { using Type = T; };
+template <> struct DefaultTAccum<int8_t> { using Type = int32_t; };
+template <> struct DefaultTAccum<uint8_t> { using Type = int32_t; };
+
+// Trait: default output stage for a given output element type.
+// Quantized (u)int8 outputs use arm_gemm::Requantize32; everything else
+// uses the empty `Nothing` stage.
+template <typename T> struct DefaultOutputStage { using Type = Nothing; };
+template <> struct DefaultOutputStage<int8_t> { using Type = arm_gemm::Requantize32; };
+template <> struct DefaultOutputStage<uint8_t> { using Type = arm_gemm::Requantize32; };
+
+// Minimal interface a depthfirst strategy must expose to the driver: the
+// fixed input/output tile geometry (in rows and columns) of its kernel.
+class IDepthfirstStrategy
+{
+ public:
+ virtual ~IDepthfirstStrategy() = default;
+
+ // Number of input rows/cols consumed per tile.
+ virtual unsigned int get_input_rows() const = 0;
+ virtual unsigned int get_input_cols() const = 0;
+
+ // Number of output rows/cols produced per tile.
+ virtual unsigned int get_output_rows() const = 0;
+ virtual unsigned int get_output_cols() const = 0;
+};
+
+
+// Lightweight view of a tensor: a base pointer (T is the pointer type)
+// plus row and column strides. Does not own the underlying storage.
+template <typename T>
+struct TensorSpec
+{
+ T base;
+ size_t ld_row, ld_col;
+
+ TensorSpec(T ptr, size_t ld_row, size_t ld_col)
+ : base(ptr), ld_row(ld_row), ld_col(ld_col) {}
+};
+
+
+// Driver for depthfirst depthwise convolution: tiles the output plane using
+// the strategy's fixed tile geometry and dispatches each tile to the virtual
+// compute_* hooks implemented by derived classes. Threading is striped over
+// output-tile rows (or over batches when there is only one output row).
+template <typename TInput, typename TWeight, typename TOutput>
+class DepthfirstDriver : public DepthwiseCommon<TInput, TWeight, TOutput>
+{
+ protected:
+ using Parent = DepthwiseCommon<TInput, TWeight, TOutput>;
+
+ // The strategy which we're applying to solve the depthwise convolution.
+ // Owned by the driver (transferred in via the constructor's raw pointer).
+ std::unique_ptr<const IDepthfirstStrategy> m_strat;
+
+ /* Compute the amount of working space required for a single thread. */
+ virtual size_t get_working_size_per_thread() const = 0;
+
+ /* Initialise the working space for a thread. */
+ virtual void initialise_working_space(void *) const = 0;
+
+ /* Compute a portion of the output tensor with padding. */
+ virtual void compute_tile_padded(
+ const DepthwiseArgs &args,
+ unsigned int output_i, unsigned int output_j,
+ unsigned int output_channel_start, unsigned int output_channel_end,
+ const TensorSpec<const TInput *> &input,
+ const TensorSpec<TOutput *> &output,
+ const void *parameters,
+ void *working_space
+ ) const = 0;
+
+ /* Compute a portion of the work with only top/bottom padding.
+ *
+ * The default implementation of this repeatedly calls into the padded tile
+ * variant.
+ */
+ virtual void compute_row_padded_tile_row(
+ const DepthwiseArgs &args,
+ const unsigned int output_i, unsigned int output_j, unsigned int n_tile_cols,
+ const unsigned int output_channel_start, const unsigned int output_channel_end,
+ const TensorSpec<const TInput *> &input,
+ const TensorSpec<TOutput *> &output,
+ const void *parameters,
+ void *working_space
+ ) const
+ {
+ // Walk one tile-row left to right; each tile advances output_j by the
+ // strategy's output-column count.
+ for (; n_tile_cols; n_tile_cols--, output_j += m_strat->get_output_cols())
+ {
+ this->compute_tile_padded(
+ args,
+ output_i, output_j, output_channel_start, output_channel_end,
+ input, output, parameters, working_space
+ );
+ }
+ }
+
+ /* Compute a portion of the output tensor with no padding.
+ *
+ * The default implementation of this repeatedly calls into the padded
+ * variant.
+ */
+ virtual void compute_tiles_unpadded(
+ const DepthwiseArgs &args,
+ unsigned int start_output_i, unsigned int start_output_j,
+ unsigned int n_tile_rows, unsigned int n_tile_cols,
+ unsigned int output_channel_start, unsigned int output_channel_end,
+ const TensorSpec<const TInput *> &input,
+ const TensorSpec<TOutput *> &output,
+ const void *parameters,
+ void *working_space
+ ) const
+ {
+ for (unsigned int tile_i = 0; tile_i < n_tile_rows; tile_i++)
+ {
+ unsigned int row_start_output_j = start_output_j;
+ for (unsigned int tile_j = 0; tile_j < n_tile_cols; tile_j++)
+ {
+ this->compute_tile_padded(
+ args,
+ start_output_i, row_start_output_j,
+ output_channel_start, output_channel_end,
+ input, output, parameters, working_space
+ );
+ row_start_output_j += m_strat->get_output_cols();
+ }
+ start_output_i += m_strat->get_output_rows();
+ }
+ }
+
+ // Main entry point (DepthwiseCommon override): partitions the work for
+ // this thread and walks batches, tile-rows and tile-columns, choosing the
+ // cheapest compute path (unpadded / row-padded / fully padded) per region.
+ void execute_internal(
+ const DepthwiseArgs &args,
+ const void *input,
+ size_t ld_input_col,
+ size_t ld_input_row,
+ size_t ld_input_batch,
+ const void *parameters,
+ void *output,
+ size_t ld_output_col,
+ size_t ld_output_row,
+ size_t ld_output_batch,
+ void *working_space,
+ unsigned int thread_id,
+ unsigned int n_threads
+ ) const override
+ {
+ // Get and initialise the working space for this thread.
+ void *thread_working_space =
+ static_cast<uint8_t *>(working_space) + thread_id * this->get_working_size_per_thread();
+ this->initialise_working_space(thread_working_space);
+
+ // Construct convenient representations of the input/output tensors.
+ TensorSpec<const TInput *> input_tensor(reinterpret_cast<const TInput *>(input), ld_input_row, ld_input_col);
+ TensorSpec<TOutput *> output_tensor(reinterpret_cast<TOutput *>(output), ld_output_row, ld_output_col);
+
+ const auto n_output_channels = args.input_channels * args.channel_multiplier;
+
+ // By default we parallelize over the rows, but if there's only 1 row, we
+ // try to parallelize over batches
+ auto thread_id_for_rows = thread_id;
+ auto n_threads_for_rows = n_threads;
+ auto thread_id_for_batches = 0;
+ auto n_threads_for_batches = 1;
+ if (args.output_rows == 1) {
+ thread_id_for_rows = 0;
+ n_threads_for_rows = 1;
+ thread_id_for_batches = thread_id;
+ n_threads_for_batches = n_threads;
+ }
+
+ // Progress the pointers for the first batch.
+ input_tensor.base += ld_input_batch*thread_id_for_batches;
+ output_tensor.base += ld_output_batch*thread_id_for_batches;
+ for (unsigned int batch = thread_id_for_batches;
+ batch < args.n_batches;
+ batch += n_threads_for_batches)
+ {
+ // Iterate over rows of the output tensor; we stripe over the tiles.
+ for (unsigned int start_output_i = thread_id_for_rows * m_strat->get_output_rows();
+ start_output_i < args.output_rows;
+ start_output_i += n_threads_for_rows * m_strat->get_output_rows())
+ {
+ // Determine what (if any padding) is required on the top/bottom of
+ // this row of the convolution.
+ const auto end_output_i = start_output_i + m_strat->get_output_rows();
+ const bool pad_output_bottom = args.output_rows < end_output_i;
+
+ const int start_input_i = start_output_i * args.stride_rows - args.padding.top;
+ const bool pad_input_top = start_input_i < 0;
+ const int end_input_i = start_input_i + m_strat->get_input_rows();
+ const bool pad_input_bottom = static_cast<int>(args.input_rows) < end_input_i;
+ // We only need to account for input padding if direct padding is not supported.
+ const bool pad_row = ((pad_input_top || pad_input_bottom) && !this->supports_direct_padding())
+ || pad_output_bottom;
+
+ // Iterate over the columns of the output tensor; we attempt to grab as
+ // much as possible of the unpadded regions, so the loop structure is a
+ // bit odd.
+ unsigned int start_output_j = 0;
+ while (start_output_j < args.output_cols)
+ {
+ const int start_in_j = start_output_j * args.stride_cols - args.padding.left;
+ const bool pad_input_left = start_in_j < 0;
+
+ // Determine if we can process a number of unpadded tiles in one go.
+ int n_unpadded_tiles = 0;
+ if ((!pad_input_left) || this->supports_direct_padding())
+ {
+ // Determine the maximum number of tiles we could handle.
+ n_unpadded_tiles = (args.output_cols - start_output_j) / m_strat->get_output_cols();
+
+ // Handle padding on the right hand edge
+ const int tile_stride = m_strat->get_output_cols() * args.stride_cols;
+ int end_output_j = start_output_j + n_unpadded_tiles * m_strat->get_output_cols();
+ int end_input_j = start_in_j + m_strat->get_input_cols() + (n_unpadded_tiles - 1)*tile_stride;
+
+ // Shrink the run until neither the output nor the input overruns
+ // the right-hand edge of its tensor.
+ while (n_unpadded_tiles > 0 &&
+ (static_cast<int>(args.output_cols) < end_output_j ||
+ static_cast<int>(args.input_cols) < end_input_j))
+ {
+ n_unpadded_tiles--;
+ end_output_j -= m_strat->get_output_cols();
+ end_input_j -= tile_stride;
+ }
+ }
+
+ // Process unpadded tiles, if possible, otherwise process a padded tile.
+ if (n_unpadded_tiles)
+ {
+ if (!pad_row)
+ {
+ // Completely unpadded execution
+ this->compute_tiles_unpadded(
+ args,
+ start_output_i, start_output_j,
+ 1, n_unpadded_tiles, // Compute a row of unpadded tiles
+ 0, n_output_channels, // Compute all channels
+ input_tensor, output_tensor, parameters, thread_working_space
+ );
+ }
+ else
+ {
+ // Top/bottom padding only
+ this->compute_row_padded_tile_row(
+ args,
+ start_output_i, start_output_j, n_unpadded_tiles,
+ 0, n_output_channels, // Compute all channels
+ input_tensor, output_tensor, parameters, thread_working_space
+ );
+ }
+ start_output_j += n_unpadded_tiles * m_strat->get_output_cols();
+ }
+ else
+ {
+ this->compute_tile_padded(
+ args,
+ start_output_i, start_output_j,
+ 0, n_output_channels, // Compute all channels
+ input_tensor, output_tensor, parameters, thread_working_space
+ );
+ start_output_j += m_strat->get_output_cols();
+ }
+ }
+ }
+
+ // Progress the pointers for the next batch.
+ input_tensor.base += ld_input_batch*n_threads_for_batches;
+ output_tensor.base += ld_output_batch*n_threads_for_batches;
+ }
+ }
+
+ public:
+ // Takes ownership of `strategy`.
+ DepthfirstDriver(IDepthfirstStrategy *strategy, const DepthwiseArgs &args)
+ : Parent(args), m_strat(strategy)
+ {
+ }
+
+ // Total working space: one per-thread slice for each thread.
+ size_t get_working_size(unsigned int n_threads) const override final
+ {
+ return n_threads * this->get_working_size_per_thread();
+ }
+
+ // Whether the implementation can consume tiles whose input would need
+ // padding without the driver accounting for it. Defaults to false;
+ // derived classes may override.
+ virtual bool supports_direct_padding() const
+ {
+ return false;
+ }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_common.cpp b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_common.cpp
new file mode 100644
index 0000000000..2950d5e957
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_common.cpp
@@ -0,0 +1,66 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "depthwise_common.hpp"
+
+#include "utils.hpp"
+
+using arm_gemm::iceildiv;
+
+namespace arm_conv {
+namespace depthwise {
+
+// For the d-th slice of a dilated convolution (0 <= d < dilation_factor),
+// compute the reduced (non-dilated) view of the problem. Returns a tuple of
+// (reduced output size, reduced input size, input start offset, padding
+// before, padding after) so that the slice can be executed as an ordinary
+// convolution with the given kernel size and stride.
+std::tuple<size_t, size_t, size_t, size_t, size_t>
+get_reduced_view_for_dilation(size_t out_size, size_t in_size, const size_t d,
+ const size_t dilation_factor,
+ const size_t kernel_size, const size_t stride,
+ const size_t orig_pad_before) {
+ // Get the valid output range
+ out_size = iceildiv(out_size - d, dilation_factor);
+
+ // Compute the start offset and the amount of padding which applies to this
+ // portion of the work.
+ size_t start_pos = d * stride, pad_before = 0;
+ if (start_pos < orig_pad_before) {
+ pad_before = iceildiv(orig_pad_before - start_pos, dilation_factor);
+ }
+ // NOTE(review): relies on pad_before*dilation_factor >= orig_pad_before -
+ // start_pos (guaranteed by the iceildiv above) so this cannot underflow.
+ start_pos += pad_before * dilation_factor - orig_pad_before;
+
+ // Hence compute the valid input range
+ in_size = start_pos < in_size
+ ? iceildiv(in_size - start_pos, dilation_factor)
+ : 0;
+
+ // Finally, compute the "after" padding
+ const size_t reqd_input = (out_size - 1) * stride + kernel_size;
+ size_t pad_after = 0;
+ if (reqd_input > (pad_before + in_size)) {
+ pad_after = reqd_input - (pad_before + in_size);
+ }
+
+ return std::make_tuple(out_size, in_size, start_pos, pad_before, pad_after);
+}
+
+} // namespace depthwise
+} // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst.hpp
new file mode 100644
index 0000000000..7b00c9a7af
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst.hpp
@@ -0,0 +1,700 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#include "src/core/NEON/kernels/arm_conv/addressing.hpp"
+#include "depthwise_strategies_common.hpp"
+#include "working_space.hpp"
+
+#ifdef CYCLE_PROFILING
+#include "profiler.hpp"
+#endif
+
+#include <limits>
+
+namespace arm_conv {
+namespace depthwise {
+
+// Strategy base that simply stores and reports the tile geometry (output
+// size, kernel size, stride); input geometry is derived by the parent.
+template <typename TInput, typename TWeight, typename TOutput, typename TAccum,
+ typename OutputStage>
+class DepthwiseDepthfirstStrategyCommon
+ : public DepthfirstStrategy<TInput, TWeight, TOutput, TAccum, OutputStage>
+{
+ protected:
+ unsigned int m_output_rows, m_output_cols;
+ unsigned int m_kernel_rows, m_kernel_cols;
+ unsigned int m_stride_rows, m_stride_cols;
+
+ public:
+ DepthwiseDepthfirstStrategyCommon(
+ unsigned int output_rows, unsigned int output_cols,
+ unsigned int kernel_rows, unsigned int kernel_cols,
+ unsigned int stride_rows=1, unsigned int stride_cols=1
+ ) : m_output_rows(output_rows), m_output_cols(output_cols),
+ m_kernel_rows(kernel_rows), m_kernel_cols(kernel_cols),
+ m_stride_rows(stride_rows), m_stride_cols(stride_cols)
+ {
+ }
+
+ // Square-tile convenience constructor: same size/stride in both dimensions.
+ DepthwiseDepthfirstStrategyCommon(unsigned int output_size, unsigned int kernel_size, unsigned int stride=1)
+ : DepthwiseDepthfirstStrategyCommon(output_size, output_size, kernel_size, kernel_size, stride, stride)
+ {
+ }
+
+ virtual ~DepthwiseDepthfirstStrategyCommon() {}
+
+ unsigned int get_output_rows() const override { return m_output_rows; }
+ unsigned int get_output_cols() const override { return m_output_cols; }
+
+ unsigned int get_kernel_rows() const override { return m_kernel_rows; }
+ unsigned int get_kernel_cols() const override { return m_kernel_cols; }
+
+ unsigned int get_stride_rows() const override { return m_stride_rows; }
+ unsigned int get_stride_cols() const override { return m_stride_cols; }
+};
+
+// Strategy interface for non-quantized execution: exposes two kernel entry
+// points - an "indirect" kernel driven through pointer arrays, and a
+// "direct" kernel driven by base pointers plus strides over whole tiles.
+template <typename TInput, typename TWeight, typename TOutput, typename TAccum, typename OutputStage=typename DefaultOutputStage<TOutput>::Type>
+class DepthwiseDepthfirstStrategy : public DepthwiseDepthfirstStrategyCommon<TInput, TWeight, TOutput, TAccum, OutputStage>
+{
+ using Parent = DepthwiseDepthfirstStrategyCommon<TInput, TWeight, TOutput, TAccum, OutputStage>;
+
+ public:
+ using Parent::Parent;
+
+ // Kernel consuming per-element input/output pointer arrays.
+ typedef void (*IndirectKernelType)(
+ const TInput *const *input_ptrs,
+ TOutput *const *output_ptrs,
+ const void *params,
+ unsigned int n_channels,
+ const TAccum activation_min,
+ const TAccum activation_max
+ );
+ virtual IndirectKernelType get_indirect_kernel(void) const = 0;
+
+ // Kernel consuming base pointers + strides for a grid of unpadded tiles.
+ typedef void (*DirectKernelType)(
+ const unsigned int n_tile_rows, const unsigned int n_tile_cols,
+ const TInput *inptr_base, int64_t ld_input_row, int64_t ld_input_col,
+ TOutput *outptr_base, int64_t ld_output_row, int64_t ld_output_col,
+ const void *params, unsigned int n_channels,
+ const TAccum activation_min,
+ const TAccum activation_max
+ );
+ virtual DirectKernelType get_direct_kernel(void) const = 0;
+};
+
+// Partial specialisation for int32_t accumulators (quantized execution with
+// arm_gemm::Requantize32): a single requantizing kernel replaces the
+// indirect/direct pair, and parameter packing goes through the generic
+// interleave helpers.
+template <typename TInput, typename TWeight, typename TOutput>
+class DepthwiseDepthfirstStrategy<TInput, TWeight, TOutput, int32_t>
+: public DepthwiseDepthfirstStrategyCommon<TInput, TWeight, TOutput, int32_t, arm_gemm::Requantize32>
+{
+ using Parent = DepthwiseDepthfirstStrategyCommon<TInput, TWeight, TOutput, int32_t, arm_gemm::Requantize32>;
+
+ protected:
+ // Describe how this strategy's weights are to be interleaved/packed.
+ interleaves::PackingArguments get_packing_args(void) const
+ {
+ return interleaves::PackingArguments(
+ this->get_kernel_rows(), this->get_kernel_cols(), sizeof(TWeight),
+ false, sizeof(int32_t), this->uses_premultiply(), // Don't pack the bias
+ this->get_vl_type(), sizeof(int32_t), this->get_accumulator_depth_vl(),
+ [this] (unsigned int idx, unsigned int &x, unsigned int &y) -> bool
+ { return this->get_kernel_packing_point(idx, x, y); }
+ );
+ }
+
+ public:
+ using Parent::Parent;
+
+ // Requantizing kernel: bias, quantization parameters and per-channel
+ // multipliers/shifts are passed alongside the pointer arrays.
+ typedef void (*KernelType)(
+ unsigned int, // n_channels,
+ const TInput *const *, // inptrs
+ const TWeight *, // weights
+ const int32_t *, // bias,
+ const arm_gemm::Requantize32 &,
+ const int32_t *, const int32_t *, // requant_muls and requant_shifts
+ TOutput *const * // outptrs
+ );
+ virtual KernelType get_kernel() const = 0;
+
+ size_t get_storage_size(const DepthwiseArgs &args) const override
+ {
+ return interleaves::get_storage_size_generic(get_packing_args(), args);
+ }
+
+ void pack_parameters(
+ const DepthwiseArgs &args, void *buffer,
+ const void *biases, const arm_gemm::Requantize32 &,
+ const void *weights, size_t ld_weight_col, size_t ld_weight_row
+ ) const override
+ {
+ interleaves::pack_parameters_generic(
+ get_packing_args(), args, buffer, biases, weights, ld_weight_col, ld_weight_row);
+ }
+};
+
+// Driver base shared by the depthfirst implementations: owns the output
+// stage and implements input-pointer-array setup, including the optional
+// channel-premultiplication path through an intermediate buffer.
+template <typename TInput, typename TWeight, typename TOutput, typename TAccum, typename OutputStage>
+class DepthwiseDepthfirstCommon : public DepthfirstDriver<TInput, TWeight, TOutput>
+{
+ using StratType = DepthwiseDepthfirstStrategyCommon<TInput, TWeight, TOutput, TAccum, OutputStage>;
+ OutputStage m_os;
+
+ protected:
+ inline OutputStage &get_output_stage(void) { return m_os; }
+ inline const OutputStage &get_output_stage(void) const { return m_os; }
+
+ // True when the input must first be expanded channel-wise (channel
+ // multiplier > 1 and the strategy premultiplies) into an intermediate
+ // array before the kernel runs.
+ bool uses_intermediate_array() const
+ {
+ return this->m_args.channel_multiplier != 1 && this->uses_premultiply();
+ }
+
+ // Hook for derived classes: populate `inptr_array` from the given input
+ // view, substituting `input_buffer` for padded elements.
+ virtual void fill_inptr_array(const DepthwiseArgs &args,
+ const TensorSpec<const TInput *> &input,
+ const TInput **inptr_array, TInput *input_buffer,
+ const unsigned int input_i, const unsigned int input_j,
+ const unsigned int input_pad_top, const unsigned int input_pad_left) const = 0;
+
+ // Build the input view for one tile (either a premultiplied copy in
+ // `intermediate_buffer`, or a strided view straight into the source
+ // tensor) and then delegate to fill_inptr_array.
+ void initialise_inptr_array(const DepthwiseArgs &args,
+ unsigned int output_channel_start, unsigned int output_channel_end,
+ const TensorSpec<const TInput *> &input,
+ const TInput **inptr_array, TInput *input_buffer, TInput *intermediate_buffer,
+ const unsigned int input_i, const unsigned int input_j,
+ const unsigned int input_pad_top, const unsigned int input_pad_left,
+ Tile<TInput> &multiplied_input
+ ) const
+ {
+ // Compute the input pointer array
+ const auto input_channel_start = output_channel_start / args.channel_multiplier;
+
+ // Clamp the tile extent to the valid (non-padded) region of the input.
+ const auto last_valid_row = std::min(input_pad_top + args.input_rows - input_i, this->m_strat->get_input_rows());
+ const auto last_valid_col = std::min(input_pad_left + args.input_cols - input_j, this->m_strat->get_input_cols());
+
+ const auto tile_rows = last_valid_row - input_pad_top;
+ const auto tile_cols = last_valid_col - input_pad_left;
+
+ const auto tile_channels = output_channel_end - output_channel_start;
+
+ TensorSpec<const TInput *> tile_tensor(0, 0, 0);
+ if (this->uses_intermediate_array()) {
+ // Premultiply the input channels into the intermediate buffer and
+ // view that buffer as a densely-packed tile.
+ multiplied_input = Tile<TInput>(intermediate_buffer, tile_rows, tile_cols, tile_channels);
+ multiplied_input.load_from(input.base, input.ld_row, input.ld_col,
+ args.input_rows, args.input_cols,
+ input_i, input_j, args.channel_multiplier);
+
+ tile_tensor = TensorSpec<const TInput *>(
+ multiplied_input.array,
+ tile_cols * tile_channels, tile_channels
+ );
+ } else {
+ // View directly into the source tensor at this tile's origin.
+ tile_tensor = TensorSpec<const TInput *>(
+ input.base + input_i*input.ld_row + input_j*input.ld_col + input_channel_start,
+ input.ld_row, input.ld_col
+ );
+ }
+
+ fill_inptr_array(args,
+ tile_tensor,
+ inptr_array, input_buffer,
+ input_i, input_j,
+ input_pad_top,
+ input_pad_left
+ );
+ }
+
+ public:
+ // Takes ownership of `strat` (via the DepthfirstDriver base).
+ DepthwiseDepthfirstCommon(StratType *const strat, const DepthwiseArgs &args, const OutputStage &os)
+ : DepthfirstDriver<TInput, TWeight, TOutput>(strat, args), m_os(os)
+ {
+ }
+
+ DepthwiseDepthfirstCommon(DepthwiseDepthfirstCommon &) = delete;
+ DepthwiseDepthfirstCommon &operator=(DepthwiseDepthfirstCommon &) = delete;
+
+ size_t get_storage_size(void) const override
+ {
+ return reinterpret_cast<const StratType *>(this->m_strat.get())->
+ get_storage_size(this->m_args);
+ }
+
+ void pack_parameters(void *buffer, const void *biases, const void *weights, size_t ld_weight_col, size_t ld_weight_row) override
+ {
+ reinterpret_cast<const StratType *>(this->m_strat.get())->
+ pack_parameters(this->m_args, buffer, biases, m_os, weights, ld_weight_col, ld_weight_row);
+ }
+};
+
+namespace depthwise_depthfirst {
+
+/* Workspace Element for an array of input pointers as consumed by the
+ * specialised depthwise kernels.
+ */
+template <typename T>
+class InputArrayElement
+{
+ public:
+ struct Workspace
+ {
+ const T **inptr_array;
+ };
+
+ // Size of the pointer array: one pointer per input-tile element.
+ template <class OutputStage>
+ static size_t get_element_size(const WorkspaceArgs<IDepthfirstStrategy, OutputStage> &args)
+ {
+ // NOTE(review): sizeof(T **) is used where the array actually stores
+ // `const T *`; both are object-pointer types of identical size, so the
+ // computed size is correct either way.
+ return sizeof(T **) * args.strategy->get_input_rows() * args.strategy->get_input_cols();
+ }
+
+ // Carve this element's pointer array out of `buffer`; returns the pointer
+ // just past the space consumed.
+ template <class WorkspaceType, class OutputStage>
+ static void *initialise(WorkspaceType *ws, void *buffer, const WorkspaceArgs<IDepthfirstStrategy, OutputStage> &args)
+ {
+ ws->inptr_array = reinterpret_cast<const T**>(buffer);
+ return reinterpret_cast<char *>(buffer) + get_element_size(args);
+ }
+};
+
+// Selects the final workspace element: activation bounds by default, or the
+// requantization parameters for non-dot-product int32/Requantize32 execution.
+template <typename TAccum, typename OutputStage, bool IsDot=false>
+struct WorkspaceFinalElement
+{
+ using Element = ActivationsElement<TAccum, OutputStage>;
+};
+
+template <>
+struct WorkspaceFinalElement<int32_t, arm_gemm::Requantize32, false>
+{
+ using Element = RequantizationParametersElement;
+};
+
+// Adapter which marshals workspace contents into the strategy's kernel
+// signature. The primary template covers non-requantized execution and
+// supports both the indirect and direct kernel forms.
+template <typename TInput, typename TWeight, typename TOutput, typename TAccum, typename OutputStage>
+struct Invoke
+{
+ constexpr static bool supports_direct_kernel = true;
+
+ // Call the pointer-array kernel; activation bounds come from the workspace.
+ template <typename Strat, typename Workspace>
+ static inline void indirect(const Strat *strat, const Workspace *ws, const OutputStage &, const void *params, const TAccum *, unsigned int n_channels)
+ {
+ strat->get_indirect_kernel()(
+ ws->inptr_array,
+ ws->outptr_array,
+ params, n_channels,
+ ws->activation_min, ws->activation_max
+ );
+ }
+
+ // Call the strided direct kernel over a grid of unpadded tiles.
+ template <typename Strat, typename Workspace>
+ static void direct(
+ const Strat *strat, const Workspace *ws, const OutputStage &,
+ unsigned int n_tile_rows, unsigned int n_tile_cols,
+ const TInput *inptr, size_t ld_in_row, size_t ld_in_col,
+ TOutput *outptr, size_t ld_out_row, size_t ld_out_col,
+ const void *params, unsigned int n_channels
+ )
+ {
+ strat->get_direct_kernel()(
+ n_tile_rows, n_tile_cols,
+ inptr, ld_in_row, ld_in_col,
+ outptr, ld_out_row, ld_out_col,
+ params, n_channels, ws->activation_min, ws->activation_max
+ );
+ }
+};
+
+// Specialisation for requantized execution: only the indirect form exists;
+// `direct` is a stub kept so callers can compile a single code path guarded
+// by `supports_direct_kernel`.
+template <typename TInput, typename TWeight, typename TOutput, typename TAccum>
+struct Invoke<TInput, TWeight, TOutput, TAccum, arm_gemm::Requantize32>
+{
+ constexpr static bool supports_direct_kernel = false;
+
+ template <typename Strat, typename Workspace>
+ static inline void indirect(const Strat *strat, const Workspace *ws, const arm_gemm::Requantize32 &qp, const void *params, const TAccum *, unsigned int n_channels)
+ {
+ strat->get_kernel()(
+ n_channels, ws->inptr_array,
+ reinterpret_cast<const TWeight *>(params), ws->bias,
+ qp, ws->requant_muls, ws->requant_shifts,
+ ws->outptr_array
+ );
+ }
+
+ template <typename Strat, typename Workspace>
+ static inline void direct(
+ const Strat *, const Workspace *, const arm_gemm::Requantize32 &,
+ unsigned int, unsigned int, // n_tile_rows, n_tile_cols
+ const TInput *, size_t, size_t, // Input pointer, row stride, column stride
+ TOutput *, size_t, size_t, // Output pointer, row stride, column stride
+ const void *, unsigned int // Parameters, number of channels
+ )
+ {
+ // Do nothing - this should never be reached because entry to it is guarded
+ // by an `if` on a `constexpr static bool`.
+ }
+};
+
+namespace
+{
+
+// Record the bias pointer inside the output stage when it is a
+// Requantize32; a no-op for every other output stage.
+template <typename OutputStage>
+inline void stash_bias(OutputStage &, const void *) {}
+
+// Forward declaration of the specialisation, marked unused to silence
+// warnings in translation units which never requantize.
+template <>
+inline void stash_bias(arm_gemm::Requantize32 &qp, const void *bias) __attribute__ ((unused));
+
+template <>
+inline void stash_bias(arm_gemm::Requantize32 &qp, const void *bias)
+{
+ qp.bias = reinterpret_cast<const int32_t *>(bias);
+}
+
+}
+
+} // namespace depthwise_depthfirst
+
+template <typename TInput,
+ typename TWeight=TInput,
+ typename TOutput=TInput,
+ typename TAccum=typename DefaultTAccum<TInput>::Type,
+ typename OutputStage=typename DefaultOutputStage<TOutput>::Type>
+class DepthwiseDepthfirst
+: public DepthwiseDepthfirstCommon<TInput, TWeight, TOutput, TAccum, OutputStage>
+{
+ using StratType = DepthwiseDepthfirstStrategy<TInput, TWeight, TOutput, TAccum>;
+ using Parent = DepthwiseDepthfirstCommon<TInput, TWeight, TOutput, TAccum, OutputStage>;
+ using WorkspaceManager = Workspace<
+ OutputArrayElement<TOutput>,
+ depthwise_depthfirst::InputArrayElement<TInput>,
+ InputBufferElement<TInput>,
+ IntermediateBufferElement<TInput>,
+ typename depthwise_depthfirst::WorkspaceFinalElement<TAccum, OutputStage>::Element
+ >;
+ using WorkingSpace = typename WorkspaceManager::WorkspaceType;
+
+ // We keep a copy of the bias and output stage
+ const TAccum *m_bias;
+
+ public:
+ DepthwiseDepthfirst(StratType *const strat, const DepthwiseArgs &args, const OutputStage &os = {})
+ : Parent(strat, args, os), m_bias(nullptr)
+ {
+ }
+
+ DepthwiseDepthfirst(DepthwiseDepthfirst &) = delete;
+ DepthwiseDepthfirst &operator=(DepthwiseDepthfirst &) = delete;
+
+ void pack_parameters(void *buffer, const void *biases, const void *weights, size_t ld_weight_col, size_t ld_weight_row) override
+ {
+ reinterpret_cast<const StratType *>(this->m_strat.get())->pack_parameters(
+ this->m_args, buffer, biases, this->get_output_stage(),
+ weights, ld_weight_col, ld_weight_row
+ );
+ m_bias = reinterpret_cast<const TAccum *>(biases);
+ depthwise_depthfirst::stash_bias(this->get_output_stage(), biases);
+ }
+
+ size_t get_working_size_per_thread() const override
+ {
+ DepthwiseArgs args(this->m_args);
+ return WorkspaceManager::get_sizeof_workspace(
+ WorkspaceArgs<IDepthfirstStrategy, OutputStage>(this->m_strat.get(), args, this->get_output_stage())
+ );
+ }
+
+ void initialise_working_space(void *buffer) const override
+ {
+ DepthwiseArgs args(this->m_args);
+ WorkspaceManager::initialise(
+ buffer, WorkspaceArgs<IDepthfirstStrategy, OutputStage>(this->m_strat.get(), args, this->get_output_stage())
+ );
+ }
+
+ virtual bool supports_direct_padding() const override
+ {
+ using Invoker = depthwise_depthfirst::Invoke<TInput, TWeight, TOutput, TAccum, OutputStage>;
+ return Invoker::supports_direct_kernel && this->uses_intermediate_array();
+ }
+
+ protected:
+
+ void fill_inptr_array(const DepthwiseArgs &args,
+ const TensorSpec<const TInput *> &input,
+ const TInput **inptr_array, TInput *input_buffer,
+ const unsigned int input_i, const unsigned int input_j,
+ const unsigned int input_pad_top, const unsigned int input_pad_left) const override
+ {
+ fill_pointer_array<const TInput>(
+ inptr_array, this->m_strat->get_input_rows(), this->m_strat->get_input_cols(),
+ input.base,
+ input.ld_row, input.ld_col,
+ input_buffer,
+ input_pad_top, args.input_rows - input_i,
+ input_pad_left, args.input_cols - input_j
+ );
+ }
+
+ void compute_tile_padded(
+ const DepthwiseArgs &args,
+ unsigned int output_i, unsigned int output_j,
+ unsigned int output_channel_start, unsigned int output_channel_end,
+ const TensorSpec<const TInput *> &input,
+ const TensorSpec<TOutput *> &output,
+ const void *parameters,
+ void *working_space_raw
+ ) const override
+ {
+ // Get the working space
+ auto ws = reinterpret_cast<WorkingSpace *>(working_space_raw);
+
+ // Compute the input pointer array
+ const int ii = static_cast<int>(output_i * args.stride_rows) - args.padding.top;
+ const auto input_pad_top = static_cast<unsigned int>(ii < 0 ? -ii : 0);
+ const auto input_i = static_cast<unsigned int>(ii < 0 ? 0 : ii);
+
+ const int ij = static_cast<int>(output_j * args.stride_cols) - args.padding.left;
+ const auto input_pad_left = static_cast<unsigned int>(ij < 0 ? -ij : 0);
+ const auto input_j = static_cast<unsigned int>(ij < 0 ? 0 : ij);
+
+ Tile<TInput> multiplied_input;
+ this->initialise_inptr_array(args, output_channel_start, output_channel_end, input,
+ ws->inptr_array, ws->input_buffer, ws->intermediate_buffer,
+ input_i, input_j, input_pad_top, input_pad_left, multiplied_input);
+
+ // Compute the output pointer array
+ fill_pointer_array(
+ ws->outptr_array, this->m_strat->get_output_rows(), this->m_strat->get_output_cols(),
+ output.base + output_i*output.ld_row + output_j*output.ld_col + output_channel_start,
+ output.ld_row, output.ld_col,
+ ws->output_buffer,
+ 0, args.output_rows - output_i, // Top padding, # valid rows
+ 0, args.output_cols - output_j // Left padding, # valid columns
+ );
+
+ // Execute the kernel
+ depthwise_depthfirst::Invoke<TInput, TWeight, TOutput, TAccum, OutputStage>::indirect(
+ reinterpret_cast<const StratType *>(this->m_strat.get()),
+ ws, this->get_output_stage(), parameters, m_bias, output_channel_end - output_channel_start
+ );
+ }
+
+ // Compute a full row of output tiles for which only top/bottom (row)
+ // padding can occur; there is no left/right padding within this row.
+ // The input/output pointer arrays are filled once for the first tile
+ // and then advanced by fixed point-strides for each subsequent tile,
+ // avoiding recomputing the arrays per tile.
+ void compute_row_padded_tile_row(
+ const DepthwiseArgs &args,
+ const unsigned int output_i, unsigned int output_j, unsigned int n_tile_cols,
+ const unsigned int output_channel_start, const unsigned int output_channel_end,
+ const TensorSpec<const TInput *> &input,
+ const TensorSpec<TOutput *> &output,
+ const void *parameters,
+ void *working_space
+ ) const override
+ {
+ using Invoker = depthwise_depthfirst::Invoke<TInput, TWeight, TOutput, TAccum, OutputStage>;
+ auto ws = reinterpret_cast<WorkingSpace *>(working_space);
+ const auto strat = reinterpret_cast<const StratType *>(this->m_strat.get());
+ const auto os = this->get_output_stage();
+
+ // Compute top and bottom padding; hence fill in the initial pointer arrays.
+ const int ii = static_cast<int>(output_i * args.stride_rows) - args.padding.top;
+ const auto input_pad_top = static_cast<unsigned int>(ii < 0 ? -ii : 0);
+
+ const auto input_i = static_cast<unsigned int>(ii < 0 ? 0 : ii);
+ auto input_j = output_j * args.stride_cols - args.padding.left;
+
+ // Valid input rows is the smallest of the input rows that aren't padding for this tile, and the number of rows
+ // available.
+ const auto valid_input_rows = std::min(strat->get_input_rows() - input_pad_top, args.input_rows - input_i);
+ const auto valid_output_rows = std::min(strat->get_output_rows(), args.output_rows - output_i);
+
+ // Byte/element distance each pointer moves when advancing one tile to
+ // the right (input moves by stride_cols input columns per output column).
+ const auto input_point_stride = input.ld_col * this->m_strat->get_output_cols() * args.stride_cols;
+ const auto output_point_stride = output.ld_col * this->m_strat->get_output_cols();
+
+ Tile<TInput> multiplied_input;
+ this->initialise_inptr_array(args, output_channel_start, output_channel_end, input,
+ ws->inptr_array, ws->input_buffer, ws->intermediate_buffer,
+ input_i, input_j, input_pad_top, 0, multiplied_input);
+
+ fill_pointer_array(
+ ws->outptr_array, this->m_strat->get_output_rows(), this->m_strat->get_output_cols(),
+ output.base + output_i*output.ld_row + output_j*output.ld_col + output_channel_start,
+ output.ld_row, output.ld_col,
+ ws->output_buffer,
+ 0, args.output_rows - output_i, // Top padding, # valid rows
+ 0, args.output_cols - output_j // Left padding, # valid columns
+ );
+
+ for (; n_tile_cols; n_tile_cols--)
+ {
+ // Execute the kernel
+ Invoker::indirect(
+ strat, ws, os, parameters, m_bias, output_channel_end - output_channel_start
+ );
+
+ // Update all unpadded pointers
+ if (this->uses_intermediate_array()) {
+ // When the input is pre-multiplied into the intermediate buffer the
+ // pointer array targets that buffer, so reload the buffer for the
+ // next tile position instead of bumping the pointers.
+ input_j += input_point_stride / input.ld_col;
+ multiplied_input.load_from(input.base,
+ input.ld_row, input.ld_col,
+ args.input_rows, args.input_cols,
+ input_i, input_j, args.channel_multiplier);
+ } else {
+ {
+ // Advance only the pointers for non-padding rows; padded rows keep
+ // pointing at the (zero-filled) input buffer.
+ auto ptr = ws->inptr_array + strat->get_input_cols() * input_pad_top;
+ for (auto n = input_pad_top; n < (valid_input_rows + input_pad_top); n++)
+ {
+ for (auto m = 0u; m < strat->get_input_cols(); m++)
+ {
+ *(ptr++) += input_point_stride;
+ }
+ }
+ }
+ }
+
+ {
+ // Advance the output pointers for the valid (non-clipped) rows only.
+ auto ptr = ws->outptr_array;
+ for (auto n = 0u; n < valid_output_rows * strat->get_output_cols(); n++)
+ {
+ *(ptr++) += output_point_stride;
+ }
+ }
+ }
+ }
+
+ // Compute a rectangle of tiles which require no input or output padding.
+ // If the strategy supplies a "direct" kernel, the whole rectangle is
+ // dispatched in a single call; otherwise the padded (indirect) kernel is
+ // invoked per tile, with pointer arrays updated in-place rather than
+ // rebuilt for each tile.
+ void compute_tiles_unpadded(
+ const DepthwiseArgs &args,
+ unsigned int output_i, const unsigned int output_j,
+ unsigned int n_tile_rows, unsigned int n_tile_cols,
+ unsigned int output_channel_start, unsigned int output_channel_end,
+ const TensorSpec<const TInput *> &input,
+ const TensorSpec<TOutput *> &output,
+ const void *parameters,
+ void *working_space_raw
+ ) const override
+ {
+ using Invoker = depthwise_depthfirst::Invoke<TInput, TWeight, TOutput, TAccum, OutputStage>;
+ auto ws = reinterpret_cast<WorkingSpace *>(working_space_raw);
+ const auto strat = reinterpret_cast<const StratType *>(this->m_strat.get());
+ const auto os = this->get_output_stage();
+
+ if (Invoker::supports_direct_kernel)
+ {
+ // NOTE(review): this assumes "same"-style padding of kernel_size/2 on
+ // each edge when sizing the intermediate tile — confirm against the
+ // planner that only such cases reach the direct path.
+ PaddingValues tile_padding = {
+ args.kernel_cols / 2,
+ args.kernel_rows / 2,
+ args.kernel_cols / 2,
+ args.kernel_rows / 2
+ };
+
+ // If the direct kernel is supported, then use it.
+ // Compute the base pointers we'll use in the tile.
+ auto outptr = output.base + output_channel_start + output_i * output.ld_row + output_j * output.ld_col;
+ const int start_input_i = output_i * args.stride_rows - args.padding.top;
+ const int start_input_j = output_j * args.stride_cols - args.padding.left;
+ auto inptr = input.base + output_channel_start + start_input_i * input.ld_row + start_input_j * input.ld_col;
+
+ auto ld_row = input.ld_row;
+ auto ld_col = input.ld_col;
+
+ const auto tile_rows = this->m_strat->get_output_rows() * args.stride_rows * n_tile_rows + tile_padding.top + tile_padding.bottom;
+ const auto tile_cols = this->m_strat->get_output_cols() * args.stride_cols * n_tile_cols + tile_padding.left + tile_padding.right;
+ const auto tile_channels = output_channel_end - output_channel_start;
+
+ Tile<TInput> multiplied_input;
+ if (this->uses_intermediate_array()) {
+ // Pre-multiply the input channels into the intermediate buffer and
+ // redirect the kernel at that densely-packed tile.
+ multiplied_input = Tile<TInput>(ws->intermediate_buffer, tile_rows, tile_cols, tile_channels);
+ multiplied_input.load_from(input.base,
+ input.ld_row, input.ld_col,
+ args.input_rows, args.input_cols,
+ start_input_i, start_input_j, args.channel_multiplier);
+
+ // Leading dimensions of the packed intermediate tile (NHWC, dense).
+ ld_row = tile_cols * tile_channels;
+ ld_col = tile_channels;
+ inptr = multiplied_input.array;
+ }
+
+ // Execute the kernel
+ Invoker::direct(
+ strat, ws, os,
+ n_tile_rows, n_tile_cols,
+ inptr, ld_row, ld_col,
+ outptr, output.ld_row, output.ld_col,
+ parameters, output_channel_end - output_channel_start
+ );
+ }
+ else
+ {
+ // Otherwise, we repeatedly call the padded kernel but use our knowledge
+ // of the tensor structure to avoid recomputing the pointer array.
+
+ const auto n_input_pointers = this->m_strat->get_input_rows() * this->m_strat->get_input_cols();
+ const auto input_point_stride = input.ld_col * this->m_strat->get_output_cols() * args.stride_cols;
+ const auto n_output_pointers = this->m_strat->get_output_rows() * this->m_strat->get_output_cols();
+ const auto output_point_stride = output.ld_col * this->m_strat->get_output_cols();
+
+ // For each tile row, initialise the input and output pointer arrays. For
+ // each subsequent tile we simply update the pointers.
+ for (unsigned int tile_i = 0; tile_i < n_tile_rows; tile_i++)
+ {
+ const int input_i = static_cast<int>(output_i * args.stride_rows) - args.padding.top;
+ int input_j = static_cast<int>(output_j * args.stride_cols) - args.padding.left;
+
+ Tile<TInput> multiplied_input;
+ this->initialise_inptr_array(args, output_channel_start, output_channel_end, input,
+ ws->inptr_array, ws->input_buffer, ws->intermediate_buffer,
+ input_i, input_j, 0, 0, multiplied_input);
+
+ // Compute the output pointer array
+ fill_pointer_array(
+ ws->outptr_array, this->m_strat->get_output_rows(), this->m_strat->get_output_cols(),
+ output.base + output_i*output.ld_row + output_j*output.ld_col + output_channel_start,
+ output.ld_row, output.ld_col,
+ ws->output_buffer,
+ 0, args.output_rows,
+ 0, args.output_cols
+ );
+
+ for (unsigned int tile_j = 0; tile_j < n_tile_cols; tile_j++)
+ {
+ // Invoke the indirect kernel for this tile
+ depthwise_depthfirst::Invoke<TInput, TWeight, TOutput, TAccum, OutputStage>::indirect(
+ strat, ws, os, parameters, m_bias, output_channel_end - output_channel_start
+ );
+
+ // Progress the pointers
+ if (this->uses_intermediate_array()) {
+ // Reload the pre-multiplied tile for the next column position;
+ // the pointer array still targets the intermediate buffer.
+ input_j += input_point_stride / input.ld_col;
+ multiplied_input.load_from(input.base,
+ input.ld_row, input.ld_col,
+ args.input_rows, args.input_cols, input_i, input_j, args.channel_multiplier);
+ } else {
+ for (auto i = 0u; i < n_input_pointers; i++)
+ {
+ ws->inptr_array[i] += input_point_stride;
+ }
+ }
+
+ for (auto i = 0u; i < n_output_pointers; i++)
+ {
+ ws->outptr_array[i] += output_point_stride;
+ }
+ }
+
+ output_i += this->m_strat->get_output_rows();
+ }
+ }
+ }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst_generic.hpp b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst_generic.hpp
new file mode 100644
index 0000000000..e2d05560a1
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst_generic.hpp
@@ -0,0 +1,315 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#include "depthwise_depthfirst.hpp"
+
+namespace arm_conv {
+namespace depthwise {
+
+// Selects the signature of the "generic" depthwise kernel based on the
+// accumulator type.  The primary template (float-like accumulators) passes
+// activation min/max clamps; the int32_t specialisation below instead
+// passes requantisation parameters.
+template <typename TInput, typename TOutput, typename TAccum>
+struct GenericDepthfirstKernelStrategyFunctionType
+{
+ using KernelType = std::function<void(const TInput *const *const, TOutput *const *const, const void *, const void *, const unsigned int, const unsigned int, const TAccum, const TAccum)>;
+};
+
+// Quantized variant: the kernel receives a Requantize32 descriptor rather
+// than activation clamps.
+template <typename TInput, typename TOutput>
+struct GenericDepthfirstKernelStrategyFunctionType<TInput, TOutput, int32_t>
+{
+ using KernelType = std::function<void(const TInput *const *const, TOutput *const *const, const void *, const arm_gemm::Requantize32 &, unsigned int, unsigned int)>;
+};
+
+// Abstract description of a "generic" depthwise kernel: how many output
+// points it produces per call, which vector length type it targets, and
+// how many vector registers deep its accumulators are.  Concrete
+// strategies implement get_kernel() to return the callable kernel.
+template <typename TInput, typename TWeight, typename TOutput, typename TAccum>
+class GenericDepthfirstKernelStrategy
+{
+ unsigned int m_n_output_points;
+ arm_gemm::VLType m_vl_type;
+ unsigned int m_accumulator_depth_vl;
+
+ public:
+ GenericDepthfirstKernelStrategy(unsigned int n_output_points, arm_gemm::VLType vl_type, unsigned int accumulator_depth_vl=1)
+ : m_n_output_points(n_output_points), m_vl_type(vl_type), m_accumulator_depth_vl(accumulator_depth_vl)
+ {
+ }
+
+ virtual ~GenericDepthfirstKernelStrategy() = default;
+
+ virtual arm_gemm::VLType get_vl_type() const { return m_vl_type; }
+ virtual unsigned int get_accumulator_depth_vl() const { return m_accumulator_depth_vl; }
+ virtual unsigned int get_n_output_points() const { return m_n_output_points; }
+
+ // Kernel signature varies with the accumulator type; see
+ // GenericDepthfirstKernelStrategyFunctionType.
+ using KernelType = typename GenericDepthfirstKernelStrategyFunctionType<TInput, TOutput, TAccum>::KernelType;
+ virtual KernelType get_kernel(void) const = 0;
+};
+
+// Depthfirst strategy wrapping a GenericDepthfirstKernelStrategy.  Owns the
+// kernel strategy and delegates parameter packing to the generic
+// interleaving routines (bias is NOT packed into the buffer here).
+template <typename TInput,
+ typename TWeight=TInput,
+ typename TOutput=TInput,
+ typename TAccum=typename DefaultTAccum<TInput>::Type,
+ typename OutputStage=typename DefaultOutputStage<TOutput>::Type>
+class GenericDepthfirstStrategy : public DepthwiseDepthfirstStrategyCommon<TInput, TWeight, TOutput, TAccum, OutputStage>
+{
+ protected:
+ using KernelStrategyType = GenericDepthfirstKernelStrategy<TInput, TWeight, TOutput, TAccum>;
+ std::unique_ptr<KernelStrategyType> m_strategy;
+
+ public:
+ // Takes ownership of `strat` (stored in a unique_ptr).
+ GenericDepthfirstStrategy(
+ KernelStrategyType *strat, unsigned int n_output_rows, unsigned int n_output_cols,
+ const DepthwiseArgs &args
+ )
+ : DepthwiseDepthfirstStrategyCommon<TInput, TWeight, TOutput, TAccum, OutputStage>(
+ n_output_rows, n_output_cols,
+ args.kernel_rows, args.kernel_cols,
+ args.stride_rows, args.stride_cols
+ ),
+ m_strategy(strat)
+ {
+ }
+
+ // Non-copyable (owns the kernel strategy).
+ // NOTE(review): copy-assignment is declared to return by value rather than
+ // by reference; harmless since it is deleted, but unconventional.
+ GenericDepthfirstStrategy(GenericDepthfirstStrategy &) = delete;
+ GenericDepthfirstStrategy operator=(GenericDepthfirstStrategy &) = delete;
+
+ arm_gemm::VLType get_vl_type(void) const override { return m_strategy->get_vl_type(); }
+ unsigned int get_accumulator_depth_vl(void) const override { return m_strategy->get_accumulator_depth_vl(); }
+
+ // Size (bytes) of the packed-parameter buffer required for `args`.
+ size_t get_storage_size(const DepthwiseArgs &args) const override
+ {
+ interleaves::PackingArguments packing_args(
+ this->get_kernel_rows(), this->get_kernel_cols(), sizeof(TWeight),
+ false, sizeof(TAccum), this->uses_premultiply(), // Don't pack the bias
+ this->get_vl_type(), sizeof(TAccum), this->get_accumulator_depth_vl(),
+ [this] (unsigned int idx, unsigned int &x, unsigned int &y) -> bool
+ { return this->get_kernel_packing_point(idx, x, y); }
+ );
+ return interleaves::get_storage_size_generic(packing_args, args);
+ }
+
+ // Interleave weights into `buffer`; the bias and output stage are unused
+ // here because the generic kernels take the bias separately at run time.
+ void pack_parameters(
+ const DepthwiseArgs &args, void *buffer,
+ const void *biases, const OutputStage &,
+ const void *weights, size_t ld_weight_col, size_t ld_weight_row
+ ) const override
+ {
+ interleaves::PackingArguments packing_args(
+ this->get_kernel_rows(), this->get_kernel_cols(), sizeof(TWeight),
+ false, sizeof(TAccum), this->uses_premultiply(), // Don't pack the bias
+ this->get_vl_type(), sizeof(TAccum), this->get_accumulator_depth_vl(),
+ [this] (unsigned int idx, unsigned int &x, unsigned int &y) -> bool
+ { return this->get_kernel_packing_point(idx, x, y); }
+ );
+ interleaves::pack_parameters_generic(
+ packing_args, args, buffer, biases, weights, ld_weight_col, ld_weight_row);
+ }
+
+ const typename KernelStrategyType::KernelType get_kernel() const { return m_strategy->get_kernel(); }
+};
+
+// Use a templated function to marshal arguments when executing the kernel.
+template <typename OutputStage> struct DepthwiseDepthfirstGenericKernelCall;
+
+// No output stage: pass the bias pointer and the activation clamps held in
+// the working space straight through to the kernel.
+template <>
+struct DepthwiseDepthfirstGenericKernelCall<Nothing>
+{
+ template <typename StratType, typename WorkspaceType, typename TAccum>
+ static void execute(
+ const StratType *strat, const WorkspaceType *ws, const Nothing &,
+ const TAccum *bias, const void *params,
+ const unsigned int n_kernel_points, const unsigned int n_output_channels
+ )
+ {
+ strat->get_kernel()(
+ ws->inptr_array,
+ ws->outptr_array,
+ params, bias,
+ n_kernel_points, n_output_channels,
+ ws->activation_min, ws->activation_max
+ );
+ }
+};
+
+// Requantized path: the bias argument is ignored (it is folded into the
+// quantisation parameters) and the Requantize32 descriptor is forwarded.
+template <>
+struct DepthwiseDepthfirstGenericKernelCall<arm_gemm::Requantize32>
+{
+ template <typename StratType, typename WorkspaceType>
+ static void execute(
+ const StratType *strat, const WorkspaceType *ws, const arm_gemm::Requantize32 &qp,
+ const int32_t *, const void *params,
+ const unsigned int n_kernel_points, const unsigned int n_output_channels
+ )
+ {
+ strat->get_kernel()(
+ ws->inptr_array,
+ ws->outptr_array,
+ params, qp,
+ n_kernel_points, n_output_channels
+ );
+ }
+};
+
+
+/* Workspace Element for an array of input pointers as consumed by the
+ * "Generic" depthwise kernels.
+ */
+template <typename T>
+class GenericInputArrayElement
+{
+ public:
+ struct Workspace
+ {
+ // One input pointer per (output point, kernel point) pair.
+ const T **inptr_array;
+ };
+
+ // Bytes needed for the pointer array: output points x kernel points.
+ template <class OutputStage>
+ static size_t get_element_size(const WorkspaceArgs<IDepthfirstStrategy, OutputStage> &args)
+ {
+ const auto kernel_points = args.depthwise_args.kernel_rows * args.depthwise_args.kernel_cols;
+ // NOTE(review): sizeof(T **) is used where sizeof(T *) is meant; the two
+ // are the same size on all supported targets, so this is benign.
+ return sizeof(T **) * args.strategy->get_output_rows() * args.strategy->get_output_cols() * kernel_points;
+ }
+
+ // Carve this element out of `buffer` and return the advanced pointer.
+ template <class WorkspaceType, class OutputStage>
+ static void *initialise(WorkspaceType *ws, void *buffer, const WorkspaceArgs<IDepthfirstStrategy, OutputStage> &args)
+ {
+ ws->inptr_array = reinterpret_cast<const T**>(buffer);
+ return reinterpret_cast<char *>(buffer) + get_element_size(args);
+ }
+};
+
+// Depthwise depthfirst driver using the "generic" (any kernel size) kernels.
+// Differs from the specialised driver in that the input pointer array holds
+// one pointer per (output point, kernel point) and the bias is kept aside
+// and passed to the kernel at execution time.
+template <typename TInput, typename TWeight=TInput, typename TOutput=TInput,
+ typename TAccum=typename DefaultTAccum<TInput>::Type,
+ typename OutputStage=typename DefaultOutputStage<TOutput>::Type>
+class DepthwiseDepthfirstGeneric : public DepthwiseDepthfirstCommon<TInput, TWeight, TOutput, TAccum, OutputStage>
+{
+ using StratType = GenericDepthfirstStrategy<TInput, TWeight, TOutput, TAccum, OutputStage>;
+ using Parent = DepthwiseDepthfirstCommon<TInput, TWeight, TOutput, TAccum, OutputStage>;
+ // Layout of the per-thread working space: output pointers, generic input
+ // pointer array, padding input buffer, premultiply buffer, activations.
+ using WorkspaceManager = Workspace<
+ OutputArrayElement<TOutput>,
+ GenericInputArrayElement<TInput>,
+ InputBufferElement<TInput>,
+ IntermediateBufferElement<TInput>,
+ ActivationsElement<TAccum, OutputStage>
+ >;
+ using WorkingSpace = typename WorkspaceManager::WorkspaceType;
+ // Raw bias pointer captured in pack_parameters; nullptr until then.
+ const TAccum *m_bias = nullptr;
+
+ public:
+ DepthwiseDepthfirstGeneric(StratType *const strat, const DepthwiseArgs &args, const OutputStage &os={})
+ : Parent(strat, args, os)
+ {
+ }
+
+ DepthwiseDepthfirstGeneric(DepthwiseDepthfirstGeneric &) = delete;
+ DepthwiseDepthfirstGeneric &operator=(DepthwiseDepthfirstGeneric &) = delete;
+
+ // Pack weights via the parent, then stash the bias pointer for run time
+ // (the generic kernels take the bias separately from the packed buffer).
+ void pack_parameters(
+ void *buffer, const void *biases,
+ const void *weights, size_t ld_weight_col, size_t ld_weight_row
+ ) override
+ {
+ Parent::pack_parameters(buffer, biases, weights, ld_weight_col, ld_weight_row);
+ m_bias = reinterpret_cast<const TAccum *>(biases); // Get a copy of the biases
+ depthwise_depthfirst::stash_bias(this->get_output_stage(), m_bias);
+ }
+
+ size_t get_working_size_per_thread() const override
+ {
+ DepthwiseArgs args(this->m_args);
+ return WorkspaceManager::get_sizeof_workspace(WorkspaceArgs<IDepthfirstStrategy, OutputStage>(this->m_strat.get(), args, this->get_output_stage()));
+ }
+
+ void initialise_working_space(void *buffer) const override
+ {
+ DepthwiseArgs args(this->m_args);
+ return WorkspaceManager::initialise(buffer, WorkspaceArgs<IDepthfirstStrategy, OutputStage>(this->m_strat.get(), args, this->get_output_stage()));
+ }
+
+ protected:
+ // Fill the generic-kernel input pointer array: one entry per
+ // (output point, kernel point), with padded points redirected into
+ // `input_buffer`.
+ void fill_inptr_array(const DepthwiseArgs &args,
+ const TensorSpec<const TInput *> &input,
+ const TInput **inptr_array, TInput *input_buffer,
+ const unsigned int input_i, const unsigned int input_j,
+ const unsigned int input_pad_top, const unsigned int input_pad_left) const override
+ {
+ fill_pointer_array_generic_kernel<const TInput>(
+ inptr_array,
+ this->m_strat->get_output_rows(), this->m_strat->get_output_cols(),
+ args.kernel_rows, args.kernel_cols,
+ args.stride_rows, args.stride_cols,
+ input.base,
+ input.ld_row, input.ld_col,
+ input_buffer,
+ input_pad_top, args.input_rows - input_i,
+ input_pad_left, args.input_cols - input_j
+ );
+ }
+
+ // Compute one (possibly padded) output tile for a channel range.
+ void compute_tile_padded(
+ const DepthwiseArgs &args,
+ unsigned int output_i, unsigned int output_j,
+ unsigned int channel_start, unsigned int channel_end,
+ const TensorSpec<const TInput *> &input,
+ const TensorSpec<TOutput *> &output,
+ const void *parameters,
+ void *working_space_raw
+ ) const override
+ {
+ // Get the working space
+ WorkingSpace *ws = reinterpret_cast<WorkingSpace *>(working_space_raw);
+
+ // Clamp the top-left input coordinate to the tensor and record how much
+ // implicit (padding) input precedes it.
+ const int ii = static_cast<int>(output_i * args.stride_rows) - args.padding.top;
+ const auto input_pad_top = static_cast<unsigned int>(ii < 0 ? -ii : 0);
+ const auto input_i = static_cast<unsigned int>(ii < 0 ? 0 : ii);
+
+ const int ij = static_cast<int>(output_j * args.stride_cols) - args.padding.left;
+ const auto input_pad_left = static_cast<unsigned int>(ij < 0 ? -ij : 0);
+ const auto input_j = static_cast<unsigned int>(ij < 0 ? 0 : ij);
+
+ Tile<TInput> multiplied_input;
+ this->initialise_inptr_array(args, channel_start, channel_end, input,
+ ws->inptr_array, ws->input_buffer, ws->intermediate_buffer,
+ input_i, input_j, input_pad_top, input_pad_left, multiplied_input);
+
+ // Compute the output pointer array
+ fill_pointer_array<TOutput>(
+ ws->outptr_array, this->m_strat->get_output_rows(), this->m_strat->get_output_cols(),
+ output.base + output_i*output.ld_row + output_j*output.ld_col + channel_start,
+ output.ld_row, output.ld_col,
+ ws->output_buffer,
+ 0, args.output_rows - output_i, // Top padding, # valid rows
+ 0, args.output_cols - output_j // Left padding, # valid columns
+ );
+
+ // Execute the kernel
+ DepthwiseDepthfirstGenericKernelCall<OutputStage>::execute(
+ reinterpret_cast<const StratType *>(this->m_strat.get()), ws,
+ this->get_output_stage(), m_bias, parameters,
+ args.kernel_rows * args.kernel_cols,
+ channel_end - channel_start
+ );
+ }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst_multiplier.hpp b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst_multiplier.hpp
new file mode 100644
index 0000000000..b93caa2aaa
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst_multiplier.hpp
@@ -0,0 +1,604 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#include "depthwise_depthfirst.hpp"
+#include "interleaves/generic_quantized_dot_product.hpp"
+
+#include <limits>
+
+namespace arm_conv {
+namespace depthwise {
+
+// Strategy base for channel-multiplier depthwise kernels (non-quantized).
+// Parameter packing ravels bias and weights together (note `true` below),
+// walking the kernel in row-major order via the packing-point lambda.
+template <typename TInput, typename TWeight, typename TOutput, typename TAccum>
+class DepthfirstMultiplierStrategy : public DepthwiseDepthfirstStrategyCommon<TInput, TWeight, TOutput, TAccum, Nothing>
+{
+ using Parent = DepthwiseDepthfirstStrategyCommon<TInput, TWeight, TOutput, TAccum, Nothing>;
+
+ protected:
+ virtual interleaves::PackingArguments get_packing_args(const DepthwiseArgs &args) const
+ {
+ return interleaves::PackingArguments(
+ args.kernel_rows, args.kernel_cols, sizeof(TWeight),
+ true, sizeof(TAccum), this->uses_premultiply(),
+ this->get_vl_type(),
+ sizeof(TAccum), 1,
+ // Enumerate kernel points row-major; x is the row, y the column.
+ [args] (unsigned int pos, unsigned int &x, unsigned int &y) -> bool
+ {
+ if (pos < args.kernel_rows * args.kernel_cols)
+ {
+ y = pos % args.kernel_cols;
+ x = pos / args.kernel_cols;
+ return true;
+ }
+ return false;
+ }
+ );
+ }
+
+ // Multiplier kernels consume the raw input directly; no premultiply pass.
+ bool uses_premultiply() const override {
+ return false;
+ }
+
+ public:
+ using Parent::Parent;
+
+ size_t get_storage_size(const DepthwiseArgs &args) const override
+ {
+ return interleaves::get_storage_size_generic(this->get_packing_args(args), args);
+ }
+
+ void pack_parameters(const DepthwiseArgs &args, void *buffer, const void *biases, const Nothing &, const void *weights, size_t ld_weight_col, size_t ld_weight_row) const override
+ {
+ interleaves::pack_parameters_generic(
+ this->get_packing_args(args), args,
+ buffer, biases, weights, ld_weight_col, ld_weight_row
+ );
+ }
+
+ using KernelType = std::function<void(
+ const TInput *const *, // Input pointers
+ TOutput *const *, // Output pointers
+ const void *, // Ravelled bias, weights, and quantization parameters
+ unsigned int, // # output channels
+ TAccum, TAccum // Min and max activation clamps
+ )>;
+ virtual KernelType get_kernel(void) const = 0;
+};
+
+
+// Quantized (int32 accumulator) specialisation: packing goes through the
+// quantized dot-product interleaver and the kernel takes a Requantize32
+// descriptor instead of activation clamps.
+template <typename TInput, typename TWeight, typename TOutput>
+class DepthfirstMultiplierStrategy<TInput, TWeight, TOutput, int32_t> : public DepthwiseDepthfirstStrategyCommon<TInput, TWeight, TOutput, int32_t, arm_gemm::Requantize32>
+{
+ using Parent = DepthwiseDepthfirstStrategyCommon<TInput, TWeight, TOutput, int32_t, arm_gemm::Requantize32>;
+
+ public:
+ using Parent::Parent;
+
+ size_t get_storage_size(const DepthwiseArgs &args) const override
+ {
+ return interleaves::quantized::get_storage_size(args, this->get_vl_type(), this->get_accumulator_depth_vl());
+ }
+
+ void pack_parameters(const DepthwiseArgs &args, void *buffer, const void *biases, const arm_gemm::Requantize32 &qp, const void *weights, size_t ld_weight_col, size_t ld_weight_row) const override
+ {
+ interleaves::quantized::pack_parameters<TWeight>(
+ buffer, reinterpret_cast<const int32_t *>(biases),
+ reinterpret_cast<const TWeight *>(weights), ld_weight_col, ld_weight_row,
+ args, qp, this->get_vl_type(), this->get_accumulator_depth_vl()
+ );
+ }
+
+ using KernelType = std::function<void(
+ const TInput *const *, // Input pointers
+ TOutput *const *, // Output pointers
+ const void *, // Ravelled bias, weights, and quantization parameters
+ unsigned int, // # output channels
+ const arm_gemm::Requantize32 &
+ )>;
+ virtual KernelType get_kernel(void) const = 0;
+};
+
+
+// Kernel description for the generic (any kernel size) multiplier kernels:
+// records output tile shape and vector length; concrete strategies supply
+// the callable via get_kernel().
+template <typename TInput, typename TWeight, typename TOutput, typename TAccum>
+class GenericDepthfirstMultiplierKernelStrategy
+{
+ const arm_gemm::VLType m_vl_type;
+ const unsigned int m_output_rows, m_output_cols;
+
+ public:
+ GenericDepthfirstMultiplierKernelStrategy(unsigned int output_rows, unsigned int output_cols, arm_gemm::VLType vl_type)
+ : m_vl_type(vl_type), m_output_rows(output_rows), m_output_cols(output_cols)
+ {
+ }
+
+ virtual ~GenericDepthfirstMultiplierKernelStrategy() = default;
+
+ arm_gemm::VLType get_vl_type(void) const { return m_vl_type; }
+ unsigned int get_output_rows(void) const { return m_output_rows; }
+ unsigned int get_output_cols(void) const { return m_output_cols; }
+
+ using KernelType = std::function<void(
+ const TInput *const *, // Input pointers
+ TOutput *const *, // Output pointers
+ const TWeight *, // Ravelled weight parameters
+ const TAccum *, // Bias,
+ unsigned int, unsigned int, // Number of kernel points, number of output channels
+ TAccum, TAccum // Activation minimum and maximum
+ )>;
+ virtual KernelType get_kernel(void) const = 0;
+};
+
+// Quantized specialisation: kernel additionally receives per-channel
+// left-shifts, multipliers and right-shifts plus the Requantize32
+// descriptor, in place of activation clamps.
+template <typename TInput, typename TWeight, typename TOutput>
+class GenericDepthfirstMultiplierKernelStrategy<TInput, TWeight, TOutput, int32_t>
+{
+ const arm_gemm::VLType m_vl_type;
+ const unsigned int m_output_rows, m_output_cols;
+
+ public:
+ GenericDepthfirstMultiplierKernelStrategy(unsigned int output_rows, unsigned int output_cols, arm_gemm::VLType vl_type)
+ : m_vl_type(vl_type), m_output_rows(output_rows), m_output_cols(output_cols)
+ {
+ }
+
+ virtual ~GenericDepthfirstMultiplierKernelStrategy() = default;
+
+ arm_gemm::VLType get_vl_type(void) const { return m_vl_type; }
+ unsigned int get_output_rows(void) const { return m_output_rows; }
+ unsigned int get_output_cols(void) const { return m_output_cols; }
+
+ using KernelType = std::function<void(
+ const TInput *const *, // Input pointers
+ TOutput *const *, // Output pointers
+ const TWeight *, // Ravelled weight parameters
+ const int32_t *, // Bias,
+ unsigned int, unsigned int, // Number of kernel points, number of output channels
+ const int32_t *, const int32_t *, const int32_t *, // Per-channel left-shifts, multipliers, right-shifts (need to account for start channel)
+ const arm_gemm::Requantize32 &
+ )>;
+ virtual KernelType get_kernel(void) const = 0;
+};
+
+// Strategy wrapping a GenericDepthfirstMultiplierKernelStrategy; output tile
+// shape is taken from the kernel strategy.  Packing does NOT ravel the bias
+// into the buffer (note `false` below) — the kernel takes it separately.
+template <typename TInput,
+ typename TWeight=TInput,
+ typename TOutput=TInput,
+ typename TAccum=typename DefaultTAccum<TInput>::Type,
+ typename OutputStage=typename DefaultOutputStage<TOutput>::Type>
+class GenericDepthfirstMultiplierStrategy : public DepthwiseDepthfirstStrategyCommon<TInput, TWeight, TOutput, TAccum, OutputStage>
+{
+ using KernelStrategyType = GenericDepthfirstMultiplierKernelStrategy<TInput, TWeight, TOutput, TAccum>;
+ std::unique_ptr<KernelStrategyType> m_kern;
+
+ protected:
+ virtual interleaves::PackingArguments get_packing_args(const DepthwiseArgs &args) const
+ {
+ return interleaves::PackingArguments(
+ args.kernel_rows, args.kernel_cols, sizeof(TWeight),
+ false, sizeof(TAccum), this->uses_premultiply(),
+ this->get_vl_type(),
+ sizeof(TAccum), 1,
+ // Enumerate kernel points row-major; x is the row, y the column.
+ [args] (unsigned int pos, unsigned int &x, unsigned int &y) -> bool
+ {
+ if (pos < args.kernel_rows * args.kernel_cols)
+ {
+ y = pos % args.kernel_cols;
+ x = pos / args.kernel_cols;
+ return true;
+ }
+ return false;
+ }
+ );
+ }
+
+ bool uses_premultiply() const override {
+ return false;
+ }
+
+ public:
+ // Takes ownership of `kern`.
+ GenericDepthfirstMultiplierStrategy(KernelStrategyType *kern, const DepthwiseArgs &args)
+ : DepthwiseDepthfirstStrategyCommon<TInput, TWeight, TOutput, TAccum, OutputStage>(
+ kern->get_output_rows(), kern->get_output_cols(),
+ args.kernel_rows, args.kernel_cols,
+ args.stride_rows, args.stride_cols
+ ),
+ m_kern(kern)
+ {
+ }; // NOTE(review): stray ';' after the constructor body — harmless.
+
+ arm_gemm::VLType get_vl_type(void) const override { return m_kern->get_vl_type(); }
+ const typename KernelStrategyType::KernelType get_kernel(void) const { return m_kern->get_kernel(); }
+
+ size_t get_storage_size(const DepthwiseArgs &args) const override
+ {
+ return interleaves::get_storage_size_generic(this->get_packing_args(args), args);
+ }
+
+ void pack_parameters(const DepthwiseArgs &args, void *buffer, const void *biases, const OutputStage &, const void *weights, size_t ld_weight_col, size_t ld_weight_row) const override
+ {
+ interleaves::pack_parameters_generic(
+ this->get_packing_args(args), args,
+ buffer, biases, weights, ld_weight_col, ld_weight_row
+ );
+ }
+};
+
+// Specialise elements of the wrapper based on the type of kernel.
+namespace depthfirst_multiplier {
+
+/* Working space element which contains a pointer for each row of input, a row
+ * of padding, and a space which can be used to construct an NCHW-ordered patch
+ * of input.
+ */
+template <typename T, bool IsGeneric=false, typename OutputStage=Nothing>
+class InputPatchElement
+{
+ public:
+ struct Workspace
+ {
+ constexpr static bool InputPatchIsGeneric = IsGeneric;
+ const T **input_rows; // Pointer per input row (or per output/kernel point when generic)
+ T *input_padding; // One padding row, filled with the quantized-zero value
+ T *input_patch; // Scratch for the NCHW-ordered input patch
+ };
+
+ // Total bytes for this element: row pointers + padding row + patch.
+ static size_t get_element_size(const WorkspaceArgs<IDepthfirstStrategy, OutputStage> &args)
+ {
+ return sizeof_input_rows(args) + sizeof_input_padding(args) + sizeof_input_patch(args);
+ }
+
+ // Carve the three sub-buffers out of `buffer`, initialise the padding row
+ // with the output-stage's fill value, and return the advanced pointer.
+ template <class WorkspaceType>
+ static void *initialise(WorkspaceType *ws, void *buffer, const WorkspaceArgs<IDepthfirstStrategy, OutputStage> &args)
+ {
+ auto buffer_bytes = reinterpret_cast<char *>(buffer);
+
+ ws->input_rows = reinterpret_cast<const T **>(buffer_bytes);
+ buffer_bytes += sizeof_input_rows(args);
+
+ ws->input_padding = reinterpret_cast<T*>(buffer_bytes);
+ buffer_bytes += sizeof_input_padding(args);
+
+ ws->input_patch = reinterpret_cast<T*>(buffer_bytes);
+ buffer_bytes += sizeof_input_patch(args);
+
+ // Initialise the padding
+ memset(ws->input_padding,
+ get_input_buffer_fill_value(args.output_stage),
+ sizeof_input_padding(args));
+
+ return buffer_bytes;
+ }
+
+ protected:
+ static size_t sizeof_input_rows(const WorkspaceArgs<IDepthfirstStrategy, OutputStage> &args)
+ {
+ // Generic kernels index by (output row, kernel point); specialised
+ // kernels index by input row only.
+ if (IsGeneric)
+ {
+ return sizeof(T *) * args.strategy->get_output_rows() * args.depthwise_args.kernel_rows * args.depthwise_args.kernel_cols;
+ }
+ else
+ {
+ return sizeof(T *) * args.strategy->get_input_rows();
+ }
+ }
+
+ static size_t sizeof_input_padding(const WorkspaceArgs<IDepthfirstStrategy, OutputStage> &args)
+ {
+ // Round-up the number of columns to be a whole number of QUADS
+ auto input_cols = arm_gemm::roundup<size_t>(args.strategy->get_input_cols(), 16 / sizeof(T));
+ return sizeof(T) * input_cols;
+ }
+
+ static size_t sizeof_input_patch(const WorkspaceArgs<IDepthfirstStrategy, OutputStage> &args)
+ {
+ if (IsGeneric)
+ {
+ // Round-up the number of columns to be a whole number of QUADS
+ auto output_cols = arm_gemm::roundup<size_t>(args.strategy->get_output_cols(), 16 / sizeof(T));
+ const auto kernel_points = args.depthwise_args.kernel_rows * args.depthwise_args.kernel_cols;
+ return sizeof(T) * kernel_points * args.strategy->get_output_rows() * output_cols;
+ }
+ else
+ {
+ // Round-up the number of columns to be a whole number of QUADS
+ auto input_cols = arm_gemm::roundup<size_t>(args.strategy->get_input_cols(), 16 / sizeof(T));
+ return sizeof(T) * args.strategy->get_input_rows() * input_cols;
+ }
+ }
+};
+
+// Compile-time dispatcher that selects the strategy type and kernel-call
+// convention for the channel-multiplier drivers. Primary template: a
+// specialised (non-generic) kernel with activation min/max clamping taken
+// from the initialised workspace.
+template <bool IsGeneric, typename TInput, typename TWeight, typename TOutput, typename TAccum, typename OutputStage>
+struct StrategyType
+{
+ using Type = DepthfirstMultiplierStrategy<TInput, TWeight, TOutput, TAccum>;
+
+ // Invokes the kernel with pre-gathered input row pointers, the output
+ // pointer array and the packed parameters. The output-stage and
+ // start-channel arguments are unused by this (non-quantized) variant.
+ template <typename WorkspaceType>
+ static void execute(
+ const DepthwiseArgs &args, const WorkspaceType *ws, const Type *strat,
+ const OutputStage &, const unsigned int,
+ const void *parameters, const void *
+ )
+ {
+ strat->get_kernel()(
+ ws->input_rows,
+ ws->outptr_array,
+ parameters, args.channel_multiplier,
+ ws->activation_min, ws->activation_max
+ );
+ }
+};
+
+// Generic (any kernel size) variant: passes the raw weights, the bias
+// (offset to the first channel handled by this call) and the number of
+// kernel points explicitly, since the kernel is not shape-specialised.
+template <typename TInput, typename TWeight, typename TOutput, typename TAccum, typename OutputStage>
+struct StrategyType<true, TInput, TWeight, TOutput, TAccum, OutputStage>
+{
+ using Type = GenericDepthfirstMultiplierStrategy<TInput, TWeight, TOutput, TAccum, OutputStage>;
+
+ template <typename WorkspaceType>
+ static void execute(
+ const DepthwiseArgs &args, const WorkspaceType *ws, const Type *strat,
+ const OutputStage &, const unsigned int start_output_channel,
+ const void *parameters, const void *bias
+ )
+ {
+ strat->get_kernel()(
+ ws->input_rows, ws->outptr_array,
+ reinterpret_cast<const TWeight *>(parameters),
+ // Bias may legitimately be absent; otherwise offset it to the
+ // first output channel covered by this invocation.
+ bias == nullptr ? nullptr : reinterpret_cast<const TAccum *>(bias) + start_output_channel,
+ strat->get_kernel_rows() * strat->get_kernel_cols(),
+ args.channel_multiplier,
+ ws->activation_min, ws->activation_max
+ );
+ }
+};
+
+// Quantized (Requantize32) specialised variant: the requantization
+// parameters are passed straight through to the kernel in place of the
+// floating-point activation bounds.
+template <typename TInput, typename TWeight, typename TOutput>
+struct StrategyType<false, TInput, TWeight, TOutput, int32_t, arm_gemm::Requantize32>
+{
+ using Type = DepthfirstMultiplierStrategy<TInput, TWeight, TOutput, int32_t>;
+
+ template <typename WorkspaceType>
+ static void execute(
+ const DepthwiseArgs &args, const WorkspaceType *ws, const Type *strat,
+ const arm_gemm::Requantize32 &qp, const unsigned int,
+ const void *parameters, const void *
+ )
+ {
+ strat->get_kernel()(
+ ws->input_rows,
+ ws->outptr_array,
+ parameters, args.channel_multiplier,
+ qp
+ );
+ }
+};
+
+// Quantized generic variant: bias and per-channel quantization arrays are
+// read from the Requantize32 parameters (the separate bias argument is
+// ignored), each offset to the first channel handled by this call.
+template <typename TInput, typename TWeight, typename TOutput>
+struct StrategyType<true, TInput, TWeight, TOutput, int32_t, arm_gemm::Requantize32>
+{
+ using Type = GenericDepthfirstMultiplierStrategy<TInput, TWeight, TOutput, int32_t, arm_gemm::Requantize32>;
+
+ template <typename WorkspaceType>
+ static void execute(
+ const DepthwiseArgs &args, const WorkspaceType *ws, const Type *strat,
+ const arm_gemm::Requantize32 &qp, const unsigned int start_output_channel,
+ const void *parameters, const void *
+ )
+ {
+ // Offsets an optional per-channel array to this call's start channel;
+ // nullptr (per-layer quantization) is propagated unchanged.
+ auto get_ptr = [start_output_channel] (const int32_t *ptr) -> const int32_t *
+ {
+ return ptr == nullptr ? nullptr : ptr + start_output_channel;
+ };
+
+ strat->get_kernel()(
+ ws->input_rows, ws->outptr_array,
+ reinterpret_cast<const TWeight *>(parameters),
+ get_ptr(qp.bias),
+ strat->get_kernel_rows() * strat->get_kernel_cols(),
+ args.channel_multiplier,
+ get_ptr(qp.per_channel_left_shifts),
+ get_ptr(qp.per_channel_muls),
+ get_ptr(qp.per_channel_right_shifts),
+ qp
+ );
+ }
+};
+
+// Builds the staged input patch for one input channel before a kernel call;
+// specialised on whether the kernel is generic.
+template <bool IsGeneric> struct PrepareInputSample;
+
+// Non-generic kernels: stage a dense NCHW-style patch covering the whole
+// input window, substituting the shared padding row outside the valid area.
+template <> struct PrepareInputSample<false>
+{
+ template <typename WorkspaceType, typename StrategyType, typename T>
+ static void execute(
+ const DepthwiseArgs &, WorkspaceType *ws, const StrategyType *strat,
+ T *base_ptr, size_t ld_row, size_t ld_col,
+ const unsigned int input_pad_top, const unsigned int valid_rows,
+ const unsigned int input_pad_left, const unsigned int valid_cols
+ )
+ {
+ fill_nchw_patch_array(
+ ws->input_rows, ws->input_patch, strat->get_input_rows(), strat->get_input_cols(),
+ base_ptr, ld_row, ld_col,
+ ws->input_padding,
+ input_pad_top, valid_rows,
+ input_pad_left, valid_cols
+ );
+ }
+};
+
+// Generic kernels: stage one patch row per kernel point and output row,
+// so the kernel shape and strides must be passed explicitly.
+template <> struct PrepareInputSample<true>
+{
+ template <typename WorkspaceType, typename StrategyType, typename T>
+ static void execute(
+ const DepthwiseArgs &args, WorkspaceType *ws, const StrategyType *strat,
+ T *base_ptr, size_t ld_row, size_t ld_col,
+ const unsigned int input_pad_top, const unsigned int valid_rows,
+ const unsigned int input_pad_left, const unsigned int valid_cols
+ )
+ {
+ fill_patch_array_generic_kernel(
+ ws->input_rows, ws->input_patch,
+ strat->get_output_rows(), strat->get_output_cols(),
+ args.kernel_rows, args.kernel_cols,
+ args.stride_rows, args.stride_cols,
+ base_ptr, ld_row, ld_col,
+ ws->input_padding,
+ input_pad_top, valid_rows,
+ input_pad_left, valid_cols
+ );
+ }
+};
+
+} // namespace depthfirst_multiplier
+
+// Driver for depthwise convolutions with a channel multiplier (each input
+// channel produces `channel_multiplier` output channels). Tiles the output
+// depth-first; for every input channel it stages an input patch and calls
+// either a specialised or a generic multiplier kernel, selected at compile
+// time via depthfirst_multiplier::StrategyType<is_generic, ...>.
+template <typename TInput,
+ typename TWeight=TInput,
+ typename TOutput=TInput,
+ typename TAccum=typename DefaultTAccum<TInput>::Type,
+ bool is_generic=false,
+ typename OutputStage=typename DefaultOutputStage<TOutput>::Type>
+class DepthwiseDepthfirstMultiplier : public DepthfirstDriver<TInput, TWeight, TOutput>
+{
+ protected:
+ using StratType = typename depthfirst_multiplier::StrategyType<is_generic, TInput, TWeight, TOutput, TAccum, OutputStage>::Type;
+ // Per-thread workspace layout: output pointer array, then the staged
+ // input-patch element, then the activation values.
+ using WorkspaceManager = Workspace<
+ OutputArrayElement<TOutput>,
+ depthfirst_multiplier::InputPatchElement<TInput, is_generic, OutputStage>,
+ ActivationsElement<TOutput, OutputStage>
+ >;
+ using WorkingSpace = typename WorkspaceManager::WorkspaceType;
+
+ OutputStage m_os; // Copy of the output parameters
+ const void *m_bias = nullptr; // Copy of the bias (should we need it)
+
+ // Input premultiplication is never used; the multiplier kernels handle
+ // the channel multiplier themselves.
+ bool uses_premultiply() const override {
+ return false;
+ }
+
+ public:
+ // Takes ownership of `strat` (held by the DepthfirstDriver base).
+ DepthwiseDepthfirstMultiplier(StratType *const strat, const DepthwiseArgs &args, const OutputStage &os = {})
+ : DepthfirstDriver<TInput, TWeight, TOutput>(strat, args), m_os(os)
+ {
+ }
+
+ // Non-copyable: owns the strategy and cached output-stage state.
+ DepthwiseDepthfirstMultiplier(DepthwiseDepthfirstMultiplier &) = delete;
+ DepthwiseDepthfirstMultiplier &operator=(DepthwiseDepthfirstMultiplier &) = delete;
+
+ // Bytes required for the packed parameter (weights/bias) buffer.
+ size_t get_storage_size(void) const override
+ {
+ return reinterpret_cast<const StratType *>(this->m_strat.get())
+ ->get_storage_size(this->m_args);
+ }
+
+ // Packs weights and biases into `buffer`, and retains the raw bias
+ // pointer for kernels which take it separately from the output stage.
+ void pack_parameters(void *buffer, const void *biases, const void *weights, size_t ld_weight_col, size_t ld_weight_row) override
+ {
+ reinterpret_cast<const StratType *>(this->m_strat.get())
+ ->pack_parameters(this->m_args, buffer, biases, m_os, weights, ld_weight_col, ld_weight_row);
+ m_bias = biases;
+ depthwise_depthfirst::stash_bias(m_os, biases);
+ }
+
+ // Bytes of scratch space each worker thread needs.
+ size_t get_working_size_per_thread() const override
+ {
+ // NOTE(review): the local copy of m_args is not modified before use —
+ // presumably kept for symmetry with related drivers; confirm.
+ DepthwiseArgs args(this->m_args);
+ return WorkspaceManager::get_sizeof_workspace(WorkspaceArgs<IDepthfirstStrategy, OutputStage>(this->m_strat.get(), args, m_os));
+ }
+
+ // Lays out and initialises (e.g. fills padding) a thread's workspace.
+ void initialise_working_space(void *buffer) const override
+ {
+ DepthwiseArgs args(this->m_args);
+ return WorkspaceManager::initialise(buffer, WorkspaceArgs<IDepthfirstStrategy, OutputStage>(this->m_strat.get(), args, m_os));
+ }
+
+ // Computes one output tile, handling padding at the tensor borders.
+ // Iterates over [output_channel_start, output_channel_end) in steps of
+ // the channel multiplier: one input channel, one packed-parameter slice
+ // and one kernel call per step.
+ void compute_tile_padded(
+ const DepthwiseArgs &args,
+ unsigned int output_i, unsigned int output_j,
+ unsigned int output_channel_start, unsigned int output_channel_end,
+ const TensorSpec<const TInput *> &input,
+ const TensorSpec<TOutput *> &output,
+ const void *parameters,
+ void *working_space_raw
+ ) const override
+ {
+ // Get the working space
+ auto ws = reinterpret_cast<WorkingSpace *>(working_space_raw);
+
+ // Map the output tile origin back to input coordinates; a negative
+ // coordinate becomes a top/left padding amount instead.
+ const int ii = static_cast<int>(output_i * args.stride_rows) - args.padding.top;
+ const auto input_pad_top = static_cast<unsigned int>(ii < 0 ? -ii : 0);
+ const auto input_i = static_cast<unsigned int>(ii < 0 ? 0 : ii);
+
+ const int ij = static_cast<int>(output_j * args.stride_cols) - args.padding.left;
+ const auto input_pad_left = static_cast<unsigned int>(ij < 0 ? -ij : 0);
+ const auto input_j = static_cast<unsigned int>(ij < 0 ? 0 : ij);
+
+ // Compute the output pointer array. We'll update this array after every
+ // invocation of the kernel.
+ fill_pointer_array(
+ ws->outptr_array, this->m_strat->get_output_rows(), this->m_strat->get_output_cols(),
+ output.base + output_i*output.ld_row + output_j*output.ld_col + output_channel_start,
+ output.ld_row, output.ld_col,
+ ws->output_buffer,
+ 0, args.output_rows - output_i, // Top padding, # valid rows
+ 0, args.output_cols - output_j // Left padding, # valid columns
+ );
+
+ // Compute the parameter stride
+ // (the packed storage consumed by a single input channel).
+ DepthwiseArgs single_iter(args);
+ single_iter.input_channels = 1;
+ const size_t parameter_stride = reinterpret_cast<const StratType *>(this->m_strat.get())
+ ->get_storage_size(single_iter);
+
+ for (; output_channel_start < output_channel_end;
+ output_channel_start += args.channel_multiplier)
+ {
+ // Compute the input pointer array
+ const auto input_channel = output_channel_start / args.channel_multiplier;
+
+ // Construct the input patch
+ depthfirst_multiplier::PrepareInputSample<is_generic>::execute(
+ args, ws, this->m_strat.get(),
+ input.base + input_channel + input_i*input.ld_row + input_j*input.ld_col, input.ld_row, input.ld_col,
+ input_pad_top, args.input_rows - input_i,
+ input_pad_left, args.input_cols - input_j
+ );
+
+ // Execute the kernel
+ depthfirst_multiplier::StrategyType<is_generic, TInput, TWeight, TOutput, TAccum, OutputStage>::execute(
+ args, ws, reinterpret_cast<const StratType *>(this->m_strat.get()), m_os, output_channel_start,
+ parameters, m_bias
+ );
+
+ // Update the output pointers
+ // (advance every pointer past the channels just written).
+ for (unsigned int n = 0; n < this->m_strat->get_output_rows() * this->m_strat->get_output_cols(); n++)
+ {
+ ws->outptr_array[n] += args.channel_multiplier;
+ }
+
+ // Progress the parameters
+ parameters = reinterpret_cast<const char *>(parameters) + parameter_stride;
+ }
+ }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_fp16.cpp b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_fp16.cpp
new file mode 100644
index 0000000000..8fef6f8ae0
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_fp16.cpp
@@ -0,0 +1,356 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_gemm_local.hpp"
+
+#include "depthwise_implementation.hpp"
+#include "depthwise_depthfirst.hpp"
+#include "depthwise_depthfirst_generic.hpp"
+#include "depthwise_depthfirst_multiplier.hpp"
+#include "depthwise_planar.hpp"
+
+#include "depthwise_implementation_constraints.hpp"
+
+// This can only be built if the target/compiler supports FP16 arguments.
+#if defined(__ARM_FP16_ARGS)
+
+#if defined(__aarch64__)
+#if defined(ARM_COMPUTE_ENABLE_SME2)
+#include "kernels/sme2_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp"
+#include "kernels/sme2_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp"
+#include "kernels/sme2_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp"
+#include "kernels/sme2_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp"
+#include "kernels/sme2_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp"
+#endif // defined(ARM_COMPUTE_ENABLE_SME2)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+#include "kernels/sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp"
+#include "kernels/sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp"
+#include "kernels/sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp"
+#include "kernels/sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp"
+#include "kernels/sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp"
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
+#if defined(ENABLE_FP16_KERNELS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+#include "kernels/a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp"
+#include "kernels/a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp"
+#include "kernels/a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp"
+#include "kernels/a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp"
+#include "kernels/a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp"
+#include "kernels/a64_fp16_nhwc_generic_output9_mla_depthfirst.hpp"
+#include "kernels/a64_fp16_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp"
+#endif // defined(ENABLE_FP16_KERNELS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+#endif // defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+// File-local heuristics used to rank the fp16 kernel table below.
+namespace
+{
+#if defined(__aarch64__)
+#if defined(ENABLE_FP16_KERNELS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+ // For square kernels/strides, decide whether premultiplying the input and
+ // using a non-multiplier kernel is expected to beat a dedicated
+ // channel-multiplier kernel. Thresholds are presumably empirically tuned
+ // for fp16 — confirm against benchmarks before changing.
+ bool prefer_premultiply(const DepthwiseArgs &args) {
+ if ((args.stride_rows != args.stride_cols) || (args.kernel_rows != args.kernel_cols))
+ {
+ return false;
+ }
+
+ unsigned int threshold;
+
+ if (args.stride_rows == 1 && args.kernel_rows == 3)
+ {
+ threshold = 30;
+ }
+ else if (args.stride_rows == 1 && args.kernel_rows == 5)
+ {
+ threshold = 31;
+ }
+ else if (args.stride_rows == 2 && args.kernel_rows == 3)
+ {
+ threshold = 11;
+ }
+ else if (args.stride_rows == 2 && args.kernel_rows == 5)
+ {
+ threshold = 19;
+ } else
+ {
+ return false;
+ }
+
+ return args.channel_multiplier <= threshold;
+ }
+
+ // Ranks a depthfirst strategy by the number of (rounded-up) output tiles
+ // times vector-width channel blocks; returns UINT_MAX to reject the
+ // strategy when a multiplier kernel is preferable.
+ template <class Strategy>
+ unsigned int cycle_estimate(const DepthwiseArgs &args, const Nothing &)
+ {
+ if (args.channel_multiplier > 1 && !prefer_premultiply(args))
+ {
+ return std::numeric_limits<unsigned int>::max();
+ }
+
+ // First-pass: compute the number of output pixels which will be computed.
+ return arm_gemm::roundup(args.output_rows, Strategy::output_rows) *
+ arm_gemm::roundup(args.output_cols, Strategy::output_cols) *
+ arm_gemm::iceildiv(
+ (long unsigned) args.input_channels * args.channel_multiplier,
+ arm_gemm::utils::get_vector_length<typename Strategy::return_type>(Strategy::vl_type)
+ );
+ }
+
+ // As cycle_estimate, but planar kernels process full-width rows so only
+ // the row count is rounded up. NOTE(review): no planar kernels appear in
+ // this file's table — presumably kept for symmetry with depthwise_fp32;
+ // confirm whether it should carry __attribute__((unused)).
+ template <class Strategy>
+ unsigned int planar_cycle_estimate(const DepthwiseArgs &args, const Nothing &)
+ {
+ // First-pass: compute the number of output pixels which will be computed.
+ return arm_gemm::roundup(args.output_rows, Strategy::output_rows) *
+ args.output_cols *
+ arm_gemm::iceildiv(
+ (long unsigned) args.input_channels * args.channel_multiplier,
+ arm_gemm::utils::get_vector_length<typename Strategy::return_type>(Strategy::vl_type)
+ );
+ }
+
+ // Multiplier kernels cost "0" (always chosen) unless premultiplication
+ // is preferred, in which case they are rejected outright.
+ unsigned int multiplier_cycle_estimate(const DepthwiseArgs &args, const Nothing &)
+ {
+ return prefer_premultiply(args)? std::numeric_limits<unsigned int>::max() : 0;
+ }
+
+ // Sentinel estimate for last-resort kernels; UINT_MAX ranks them below
+ // every other candidate. Marked unused as some builds compile out all
+ // call sites.
+ unsigned int not_preferred(const DepthwiseArgs &, const Nothing &) __attribute__ ((unused));
+ unsigned int not_preferred(const DepthwiseArgs &, const Nothing &)
+ {
+ return std::numeric_limits<unsigned int>::max();
+ }
+#endif // defined(ENABLE_FP16_KERNELS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+#endif // defined(__aarch64__)
+}
+
+// Candidate fp16 depthwise implementations, in priority order; the selector
+// picks the first applicable entry with the best cycle estimate. Each entry
+// is {method, name, constraint predicate, cycle estimate, factory}.
+static const DepthwiseImplementation<__fp16, __fp16> depthwise_fp16_methods[] = {
+#if defined(__aarch64__)
+#if defined(ENABLE_FP16_KERNELS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SME2)
+ // SME2 kernels — preferred when the CPU supports SME2.
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "sme2_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst",
+ constraint(is_supported<sme2_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst>,
+ cpu_has_sme2),
+ cycle_estimate<sme2_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst>,
+ [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<__fp16, __fp16, __fp16> * {
+ auto strat = new sme2_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirst<__fp16>(strat, args);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "sme2_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst",
+ constraint(is_supported<sme2_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst>,
+ cpu_has_sme2),
+ cycle_estimate<sme2_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst>,
+ [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<__fp16, __fp16, __fp16> * {
+ auto strat = new sme2_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirst<__fp16>(strat, args);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "sme2_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst",
+ constraint(is_supported<sme2_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst>,
+ cpu_has_sme2),
+ cycle_estimate<sme2_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst>,
+ [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<__fp16, __fp16, __fp16> * {
+ auto strat = new sme2_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirst<__fp16>(strat, args);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "sme2_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst",
+ constraint(is_supported<sme2_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst>,
+ cpu_has_sme2),
+ cycle_estimate<sme2_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst>,
+ [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<__fp16, __fp16, __fp16> * {
+ auto strat = new sme2_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirst<__fp16>(strat, args);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "sme2_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst",
+ constraint(is_supported<sme2_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst>,
+ cpu_has_sme2),
+ cycle_estimate<sme2_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst>,
+ [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<__fp16, __fp16, __fp16> * {
+ auto strat = new sme2_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirst<__fp16>(strat, args);
+ },
+ },
+#endif // defined(ARM_COMPUTE_ENABLE_SME2)
+ // SVE kernels.
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst",
+ constraint(is_supported<sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst>,
+ cpu_has_sve),
+ cycle_estimate<sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst>,
+ [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<__fp16, __fp16, __fp16> * {
+ auto strat = new sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirst<__fp16>(strat, args);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst",
+ constraint(is_supported<sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst>,
+ cpu_has_sve),
+ cycle_estimate<sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst>,
+ [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<__fp16, __fp16, __fp16> * {
+ auto strat = new sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirst<__fp16>(strat, args);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst",
+ constraint(is_supported<sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst>,
+ cpu_has_sve),
+ cycle_estimate<sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst>,
+ [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<__fp16, __fp16, __fp16> * {
+ auto strat = new sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirst<__fp16>(strat, args);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst",
+ constraint(is_supported<sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst>,
+ cpu_has_sve),
+ cycle_estimate<sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst>,
+ [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<__fp16, __fp16, __fp16> * {
+ auto strat = new sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirst<__fp16>(strat, args);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst",
+ constraint(is_supported<sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst>,
+ cpu_has_sve),
+ cycle_estimate<sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst>,
+ [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<__fp16, __fp16, __fp16> * {
+ auto strat = new sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirst<__fp16>(strat, args);
+ },
+ },
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
+ // NEON (A64) kernels, plus the generic and channel-multiplier fallbacks.
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst",
+ constraint(is_supported<a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst>,
+ cpu_has_fp16),
+ cycle_estimate<a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst>,
+ [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<__fp16, __fp16, __fp16> * {
+ auto strat = new a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirst<__fp16>(strat, args);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst",
+ constraint(is_supported<a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst>,
+ cpu_has_fp16),
+ cycle_estimate<a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst>,
+ [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<__fp16, __fp16, __fp16> * {
+ auto strat = new a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirst<__fp16>(strat, args);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst",
+ constraint(is_supported<a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst>,
+ cpu_has_fp16),
+ cycle_estimate<a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst>,
+ [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<__fp16, __fp16, __fp16> * {
+ auto strat = new a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirst<__fp16>(strat, args);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst",
+ constraint(is_supported<a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst>,
+ cpu_has_fp16),
+ cycle_estimate<a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst>,
+ [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<__fp16, __fp16, __fp16> * {
+ auto strat = new a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirst<__fp16>(strat, args);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst",
+ constraint(is_supported<a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst>,
+ cpu_has_fp16),
+ cycle_estimate<a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst>,
+ [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<__fp16, __fp16, __fp16> * {
+ auto strat = new a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirst<__fp16>(strat, args);
+ },
+ },
+ // Catch-all kernel for unsupported shapes (never preferred on cost).
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "a64_fp16_nhwc_generic_output3x3_mla_depthfirst",
+ constraint(cpu_has_fp16),
+ not_preferred,
+ [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<__fp16, __fp16, __fp16> * {
+ auto kern = new a64_fp16_nhwc_generic_output9_mla_depthfirst(args.cpu_info);
+ auto strat = new GenericDepthfirstStrategy<__fp16>(kern, 3, 3, args);
+ return new DepthwiseDepthfirstGeneric<__fp16>(strat, args);
+ },
+ },
+ // Channel-multiplier kernel; selected when a multiplier is requested
+ // and premultiplication is not preferred.
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "a64_fp16_nhwc_generic_with_multiplier_output2x8_mla_depthfirst",
+ constraint(cpu_has_fp16, has_channel_multiplier),
+ multiplier_cycle_estimate,
+ [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<__fp16, __fp16, __fp16> * {
+ auto kern = new a64_fp16_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst(args.cpu_info);
+ auto strat = new GenericDepthfirstMultiplierStrategy<__fp16>(kern, args);
+ return new DepthwiseDepthfirstMultiplier<__fp16, __fp16, __fp16, __fp16, true>(strat, args);
+ },
+ },
+#endif // defined(ENABLE_FP16_KERNELS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+#endif // defined(__aarch64__)
+ { DepthwiseMethod::DEFAULT, "", nullptr, nullptr, nullptr }, // End of list
+};
+
+// Registers the table above as the implementation list for __fp16.
+template <>
+const DepthwiseImplementation<__fp16> *depthwise_implementation_list()
+{
+ return depthwise_fp16_methods;
+}
+
+// Explicit instantiations of the public entry points for __fp16.
+template UniqueDepthwiseCommon<__fp16> depthwise(const DepthwiseArgs &, const Nothing &);
+template std::vector<KernelDescription> get_compatible_kernels<__fp16>(const DepthwiseArgs &, const Nothing &);
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__ARM_FP16_ARGS)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_fp32.cpp b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_fp32.cpp
new file mode 100644
index 0000000000..760328f3ba
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_fp32.cpp
@@ -0,0 +1,539 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_gemm_local.hpp"
+
+#include "depthwise_implementation.hpp"
+#include "depthwise_depthfirst.hpp"
+#include "depthwise_depthfirst_generic.hpp"
+#include "depthwise_depthfirst_multiplier.hpp"
+#include "depthwise_planar.hpp"
+
+#include "depthwise_implementation_constraints.hpp"
+
+#include "interleaves/list.hpp"
+
+#if defined(__aarch64__)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SME2)
+#include "kernels/sme2_fp32bf16fp32_planar_3x3_s1_4rows_dot_za.hpp"
+#include "kernels/sme2_fp32bf16fp32_planar_3x3_s2_4rows_dot_za.hpp"
+#include "kernels/sme2_fp32bf16fp32_planar_5x5_s1_4rows_dot_za.hpp"
+#include "kernels/sme2_fp32bf16fp32_planar_5x5_s2_4rows_dot_za.hpp"
+
+#include "kernels/sme2_fp32_planar_3x3_s1_4rows_mla_za.hpp"
+#include "kernels/sme2_fp32_planar_3x3_s2_4rows_mla_za.hpp"
+#include "kernels/sme2_fp32_planar_5x5_s1_4rows_mla_za.hpp"
+#include "kernels/sme2_fp32_planar_5x5_s2_4rows_mla_za.hpp"
+
+#include "kernels/sme2_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp"
+#include "kernels/sme2_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp"
+#include "kernels/sme2_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp"
+#include "kernels/sme2_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp"
+#endif // defined(ARM_COMPUTE_ENABLE_SME2)
+
+#include "kernels/sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp"
+#include "kernels/sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp"
+#include "kernels/sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp"
+#include "kernels/sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp"
+#include "kernels/sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp"
+#include "kernels/sve_fp32_nhwc_generic_output9_mla_depthfirst.hpp"
+#include "kernels/sve_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst.hpp"
+#include "kernels/sve_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst.hpp"
+#include "kernels/sve_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp"
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
+#include "kernels/a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp"
+#include "kernels/a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp"
+#include "kernels/a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp"
+#include "kernels/a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp"
+#include "kernels/a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp"
+#include "kernels/a64_fp32_nhwc_generic_output9_mla_depthfirst.hpp"
+#include "kernels/a64_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst.hpp"
+#include "kernels/a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst.hpp"
+#include "kernels/a64_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp"
+#endif // defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+// File-local heuristics used to rank the fp32 kernel table below.
+namespace
+{
+#if defined(__aarch64__)
+ // For square kernels/strides, decide whether premultiplying the input and
+ // using a non-multiplier kernel is expected to beat a dedicated
+ // channel-multiplier kernel. Thresholds are presumably empirically tuned
+ // for fp32 (they differ from the fp16 variant) — confirm with benchmarks
+ // before changing.
+ bool prefer_premultiply(const DepthwiseArgs &args) {
+ if ((args.stride_rows != args.stride_cols) || (args.kernel_rows != args.kernel_cols))
+ {
+ return false;
+ }
+
+ unsigned int threshold;
+
+ if (args.stride_rows == 1 && args.kernel_rows == 3)
+ {
+ threshold = 18;
+ }
+ else if (args.stride_rows == 1 && args.kernel_rows == 5)
+ {
+ threshold = 5;
+ }
+ else if (args.stride_rows == 2 && args.kernel_rows == 3)
+ {
+ threshold = 5;
+ }
+ else if (args.stride_rows == 2 && args.kernel_rows == 5)
+ {
+ threshold = 12;
+ } else
+ {
+ return false;
+ }
+
+ return args.channel_multiplier <= threshold;
+ }
+
+ // Ranks a depthfirst strategy by the number of (rounded-up) output tiles
+ // times vector-width channel blocks; returns UINT_MAX to reject the
+ // strategy when a multiplier kernel is preferable.
+ template <class Strategy>
+ unsigned int cycle_estimate(const DepthwiseArgs &args, const Nothing &)
+ {
+ if (args.channel_multiplier > 1 && !prefer_premultiply(args))
+ {
+ return std::numeric_limits<unsigned int>::max();
+ }
+
+ // First-pass: compute the number of output pixels which will be computed.
+ return arm_gemm::roundup(args.output_rows, Strategy::output_rows) *
+ arm_gemm::roundup(args.output_cols, Strategy::output_cols) *
+ arm_gemm::iceildiv(
+ (long unsigned) args.input_channels * args.channel_multiplier,
+ arm_gemm::utils::get_vector_length<typename Strategy::return_type>(Strategy::vl_type)
+ );
+ }
+
+ // As cycle_estimate, but planar kernels process full-width rows so only
+ // the row count is rounded up.
+ template <class Strategy>
+ unsigned int planar_cycle_estimate(const DepthwiseArgs &args, const Nothing &)
+ {
+ // First-pass: compute the number of output pixels which will be computed.
+ return arm_gemm::roundup(args.output_rows, Strategy::output_rows) *
+ args.output_cols *
+ arm_gemm::iceildiv(
+ (long unsigned) args.input_channels * args.channel_multiplier,
+ arm_gemm::utils::get_vector_length<typename Strategy::return_type>(Strategy::vl_type)
+ );
+ }
+
+ // As cycle_estimate, but discounted by 2/3 for kernels that trade
+ // precision for speed (e.g. bf16 accumulation) in fast mode.
+ template <class Strategy>
+ unsigned int fast_mode_cycle_estimate(const DepthwiseArgs &args, const Nothing &)
+ {
+ // First-pass: compute the number of output pixels which will be computed.
+ return arm_gemm::roundup(args.output_rows, Strategy::output_rows) *
+ arm_gemm::roundup(args.output_cols, Strategy::output_cols) *
+ arm_gemm::iceildiv(
+ (long unsigned) args.input_channels * args.channel_multiplier,
+ arm_gemm::utils::get_vector_length<typename Strategy::return_type>(Strategy::vl_type)
+ ) * 2 / 3;
+ }
+
+ // Multiplier kernels cost "0" (always chosen) unless premultiplication
+ // is preferred, in which case they are rejected outright.
+ unsigned int multiplier_cycle_estimate(const DepthwiseArgs &args, const Nothing &)
+ {
+ return prefer_premultiply(args)? std::numeric_limits<unsigned int>::max() : 0;
+ }
+
+ // Sentinel estimate for last-resort kernels; UINT_MAX ranks them below
+ // every other candidate.
+ unsigned int not_preferred(const DepthwiseArgs &, const Nothing &)
+ {
+ return std::numeric_limits<unsigned int>::max();
+ }
+
+ // Constraint predicate: true when the caller allows reduced-precision
+ // "fast mode" kernels. Marked unused as some configurations compile out
+ // all call sites.
+ bool fast_mode_enabled(const DepthwiseArgs &args, const void *) __attribute__ ((unused));
+ bool fast_mode_enabled(const DepthwiseArgs &args, const void *)
+ {
+ return args.fast_mode;
+ }
+#endif // defined(__aarch64__)
+}
+
+// Table of FP32 depthwise implementations, grouped by architecture tier
+// (SME2 planar, SME2 depth-first, SVE, then plain AArch64/NEON). Selection
+// walks the list and picks the supported entry with the lowest cycle
+// estimate; each entry supplies a method, a name, an optional constraint,
+// an optional cycle estimate (nullptr => cost 0, i.e. select immediately)
+// and a factory lambda. The DEFAULT sentinel terminates the list.
+static const DepthwiseImplementation<float, float> depthwise_fp32_methods[] = {
+#if defined(__aarch64__)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SME2)
+ // SME2 planar kernels: bf16 "fast mode" variants first, then fp32.
+ {
+ DepthwiseMethod::PLANAR,
+ "sme2_fp32bf16fp32_planar_3x3_s1_4rows_dot_za",
+ constraint(fast_mode_enabled,
+ cpu_has_sme, cpu_has_sme2,
+ is_supported<sme2_fp32bf16fp32_planar_3x3_s1_4rows_dot_za>,
+ has_no_channel_multiplier, no_prime_right_pad),
+ nullptr,
+ [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
+ auto strat = new sme2_fp32bf16fp32_planar_3x3_s1_4rows_dot_za(args.cpu_info);
+ return new DepthwisePlanar<float>(strat, args);
+ },
+ },
+ {
+ DepthwiseMethod::PLANAR,
+ "sme2_fp32bf16fp32_planar_3x3_s2_4rows_dot_za",
+ constraint(fast_mode_enabled,
+ cpu_has_sme, cpu_has_sme2,
+ is_supported<sme2_fp32bf16fp32_planar_3x3_s2_4rows_dot_za>,
+ has_no_channel_multiplier, no_prime_right_pad),
+ nullptr,
+ [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
+ auto strat = new sme2_fp32bf16fp32_planar_3x3_s2_4rows_dot_za(args.cpu_info);
+ return new DepthwisePlanar<float>(strat, args);
+ },
+ },
+ {
+ DepthwiseMethod::PLANAR,
+ "sme2_fp32bf16fp32_planar_5x5_s1_4rows_dot_za",
+ constraint(fast_mode_enabled,
+ cpu_has_sme, cpu_has_sme2,
+ is_supported<sme2_fp32bf16fp32_planar_5x5_s1_4rows_dot_za>,
+ has_no_channel_multiplier, no_prime_right_pad),
+ nullptr,
+ [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
+ auto strat = new sme2_fp32bf16fp32_planar_5x5_s1_4rows_dot_za(args.cpu_info);
+ return new DepthwisePlanar<float>(strat, args);
+ },
+ },
+ {
+ DepthwiseMethod::PLANAR,
+ "sme2_fp32bf16fp32_planar_5x5_s2_4rows_dot_za",
+ constraint(fast_mode_enabled,
+ cpu_has_sme, cpu_has_sme2,
+ is_supported<sme2_fp32bf16fp32_planar_5x5_s2_4rows_dot_za>,
+ has_no_channel_multiplier, no_prime_right_pad),
+ nullptr,
+ [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
+ auto strat = new sme2_fp32bf16fp32_planar_5x5_s2_4rows_dot_za(args.cpu_info);
+ return new DepthwisePlanar<float>(strat, args);
+ },
+ },
+
+ {
+ DepthwiseMethod::PLANAR,
+ "sme2_fp32_planar_3x3_s1_4rows_mla_za",
+ constraint(cpu_has_sme, cpu_has_sme2,
+ is_supported<sme2_fp32_planar_3x3_s1_4rows_mla_za>,
+ has_no_channel_multiplier, no_prime_right_pad),
+ [] (const DepthwiseArgs &args, const Nothing &os) -> unsigned int {
+ // Heuristic, don't prefer this kernel unless the input plane is greater
+ // than the number of channels.
+ if (args.input_rows * args.input_cols < args.input_channels)
+ return UINT32_MAX;
+
+ return planar_cycle_estimate<sme2_fp32_planar_3x3_s1_4rows_mla_za>(args, os);
+ },
+ [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
+ auto strat = new sme2_fp32_planar_3x3_s1_4rows_mla_za(args.cpu_info);
+ return new DepthwisePlanar<float>(strat, args);
+ },
+ },
+ {
+ DepthwiseMethod::PLANAR,
+ "sme2_fp32_planar_3x3_s2_4rows_mla_za",
+ constraint(cpu_has_sme, cpu_has_sme2,
+ is_supported<sme2_fp32_planar_3x3_s2_4rows_mla_za>,
+ has_no_channel_multiplier, no_prime_right_pad),
+ planar_cycle_estimate<sme2_fp32_planar_3x3_s2_4rows_mla_za>,
+ [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
+ auto strat = new sme2_fp32_planar_3x3_s2_4rows_mla_za(args.cpu_info);
+ return new DepthwisePlanar<float>(strat, args);
+ },
+ },
+ {
+ DepthwiseMethod::PLANAR,
+ "sme2_fp32_planar_5x5_s1_4rows_mla_za",
+ constraint(cpu_has_sme, cpu_has_sme2,
+ is_supported<sme2_fp32_planar_5x5_s1_4rows_mla_za>,
+ has_no_channel_multiplier, no_prime_right_pad),
+ nullptr,
+ [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
+ auto strat = new sme2_fp32_planar_5x5_s1_4rows_mla_za(args.cpu_info);
+ return new DepthwisePlanar<float>(strat, args);
+ },
+ },
+ {
+ DepthwiseMethod::PLANAR,
+ "sme2_fp32_planar_5x5_s2_4rows_mla_za",
+ constraint(cpu_has_sme, cpu_has_sme2,
+ is_supported<sme2_fp32_planar_5x5_s2_4rows_mla_za>,
+ has_no_channel_multiplier, no_prime_right_pad),
+ nullptr,
+ [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
+ auto strat = new sme2_fp32_planar_5x5_s2_4rows_mla_za(args.cpu_info);
+ return new DepthwisePlanar<float>(strat, args);
+ },
+ },
+
+ // SME2 depth-first kernels (largest output tile first).
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "sme2_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst",
+ constraint(cpu_has_sme, cpu_has_sme2,
+ is_supported<sme2_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst>),
+ cycle_estimate<sme2_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst>,
+ [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
+ auto strat = new sme2_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirst<float, float, float, float>(strat, args);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "sme2_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst",
+ constraint(cpu_has_sme, cpu_has_sme2,
+ is_supported<sme2_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst>),
+ cycle_estimate<sme2_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst>,
+ [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
+ auto strat = new sme2_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirst<float, float, float, float>(strat, args);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "sme2_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst",
+ constraint(cpu_has_sme, cpu_has_sme2,
+ is_supported<sme2_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst>),
+ cycle_estimate<sme2_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst>,
+ [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
+ auto strat = new sme2_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirst<float, float, float, float>(strat, args);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "sme2_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst",
+ constraint(cpu_has_sme, cpu_has_sme2,
+ is_supported<sme2_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst>),
+ cycle_estimate<sme2_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst>,
+ [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
+ auto strat = new sme2_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirst<float, float, float, float>(strat, args);
+ },
+ },
+#endif // defined(ARM_COMPUTE_ENABLE_SME2)
+ // SVE depth-first kernels.
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst",
+ constraint(is_supported<sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst>,
+ cpu_has_sve),
+ cycle_estimate<sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst>,
+ [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
+ auto strat = new sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirst<float>(strat, args);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst",
+ constraint(is_supported<sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst>,
+ cpu_has_sve),
+ cycle_estimate<sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst>,
+ [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
+ auto strat = new sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirst<float>(strat, args);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst",
+ constraint(is_supported<sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst>,
+ cpu_has_sve),
+ cycle_estimate<sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst>,
+ [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
+ auto strat = new sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirst<float>(strat, args);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst",
+ constraint(is_supported<sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst>,
+ cpu_has_sve),
+ cycle_estimate<sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst>,
+ [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
+ auto strat = new sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirst<float>(strat, args);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst",
+ constraint(is_supported<sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst>,
+ cpu_has_sve),
+ cycle_estimate<sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst>,
+ [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
+ auto strat = new sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirst<float>(strat, args);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "sve_fp32_nhwc_generic_output3x3_mla_depthfirst",
+ constraint(cpu_has_sve),
+ not_preferred,
+ [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
+ auto kern = new sve_fp32_nhwc_generic_output9_mla_depthfirst(args.cpu_info);
+ auto strat = new GenericDepthfirstStrategy<float>(kern, 3, 3, args);
+ return new DepthwiseDepthfirstGeneric<float>(strat, args);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "sve_fp32_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst",
+ constraint(is_supported<sve_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst>,
+ cpu_has_sve, has_channel_multiplier),
+ multiplier_cycle_estimate,
+ [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
+ auto strat = new sve_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirstMultiplier<float>(strat, args);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "sve_fp32_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst",
+ constraint(is_supported<sve_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst>,
+ cpu_has_sve, has_channel_multiplier),
+ multiplier_cycle_estimate,
+ [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
+ auto strat = new sve_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirstMultiplier<float>(strat, args);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "sve_fp32_nhwc_generic_with_multiplier_output2x8_mla_depthfirst",
+ constraint(cpu_has_sve, has_channel_multiplier),
+ multiplier_cycle_estimate,
+ [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
+ auto kern = new sve_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst(args.cpu_info);
+ auto strat = new GenericDepthfirstMultiplierStrategy<float>(kern, args);
+ return new DepthwiseDepthfirstMultiplier<float, float, float, float, true>(strat, args);
+ },
+ },
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
+ // Plain AArch64 (NEON) depth-first kernels.
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst",
+ constraint(is_supported<a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst>),
+ cycle_estimate<a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst>,
+ [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
+ auto strat = new a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirst<float>(strat, args);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst",
+ constraint(is_supported<a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst>),
+ cycle_estimate<a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst>,
+ [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
+ auto strat = new a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirst<float>(strat, args);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst",
+ constraint(is_supported<a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst>),
+ cycle_estimate<a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst>,
+ [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
+ auto strat = new a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirst<float>(strat, args);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst",
+ constraint(is_supported<a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst>),
+ cycle_estimate<a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst>,
+ [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
+ auto strat = new a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirst<float>(strat, args);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst",
+ constraint(is_supported<a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst>),
+ cycle_estimate<a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst>,
+ [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
+ auto strat = new a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirst<float>(strat, args);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "a64_fp32_nhwc_generic_output3x3_mla_depthfirst",
+ nullptr,
+ not_preferred,
+ [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
+ auto kern = new a64_fp32_nhwc_generic_output9_mla_depthfirst(args.cpu_info);
+ auto strat = new GenericDepthfirstStrategy<float>(kern, 3, 3, args);
+ return new DepthwiseDepthfirstGeneric<float>(strat, args);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "a64_fp32_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst",
+ constraint(is_supported<a64_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst>,
+ has_channel_multiplier),
+ multiplier_cycle_estimate,
+ [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
+ auto strat = new a64_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirstMultiplier<float>(strat, args);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "a64_fp32_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst",
+ constraint(is_supported<a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst>,
+ has_channel_multiplier),
+ multiplier_cycle_estimate,
+ [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
+ auto strat = new a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirstMultiplier<float>(strat, args);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "a64_fp32_nhwc_generic_with_multiplier_output2x8_mla_depthfirst",
+ constraint(has_channel_multiplier),
+ multiplier_cycle_estimate,
+ [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
+ auto kern = new a64_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst(args.cpu_info);
+ auto strat = new GenericDepthfirstMultiplierStrategy<float>(kern, args);
+ return new DepthwiseDepthfirstMultiplier<float, float, float, float, true>(strat, args);
+ },
+ },
+#endif // defined(__aarch64__)
+ { DepthwiseMethod::DEFAULT, "", nullptr, nullptr, nullptr }, // End of list
+};
+
+// Specialisation of the kernel-list accessor for FP32: exposes the table
+// above to the generic implementation-selection machinery.
+template <>
+const DepthwiseImplementation<float> *depthwise_implementation_list()
+{
+ return depthwise_fp32_methods;
+}
+
+// Explicit instantiations of the public FP32 entry points.
+template UniqueDepthwiseCommon<float> depthwise(const DepthwiseArgs &, const Nothing &);
+template std::vector<KernelDescription> get_compatible_kernels<float>(const DepthwiseArgs &, const Nothing &);
+
+} // namespace depthwise
+} // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_implementation.hpp b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_implementation.hpp
new file mode 100644
index 0000000000..82821af1e6
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_implementation.hpp
@@ -0,0 +1,148 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#include "depthwise.hpp"
+
+#include <cstddef>
+#include <functional>
+
+using arm_gemm::Nothing;
+
+namespace arm_conv {
+namespace depthwise {
+
+// One entry in a kernel-selection table: identifies a depthwise method by
+// name and carries optional predicates for support checking, cost
+// estimation, and construction of the kernel object.
+template <typename TInput, typename TWeight = TInput, typename TOutput = TInput, class OutputStage = Nothing>
+struct DepthwiseImplementation
+{
+ const DepthwiseMethod method;
+ const char *name;
+ std::function<bool(const DepthwiseArgs &, const OutputStage &)> is_supported;
+ std::function<uint64_t(const DepthwiseArgs &, const OutputStage &)> cycle_estimate;
+ std::function<DepthwiseCommon<TInput, TWeight, TOutput> *(const DepthwiseArgs &, const OutputStage &)> initialise;
+
+ // A null is_supported predicate means "always supported".
+ bool get_is_supported(const DepthwiseArgs &args, const OutputStage &os) const
+ {
+ return (is_supported == nullptr) ? true : is_supported(args, os);
+ }
+
+ // A null cycle_estimate means "zero cost", i.e. select immediately.
+ uint64_t get_cycle_estimate(const DepthwiseArgs &args, const OutputStage &os) const
+ {
+ return (cycle_estimate == nullptr) ? 0 : cycle_estimate(args, os);
+ }
+
+ // Construct the kernel (caller owns the returned pointer) and label it
+ // with this entry's name so it can be identified later.
+ DepthwiseCommon<TInput, TWeight, TOutput> *get_instance(const DepthwiseArgs &args, const OutputStage &os) const
+ {
+ auto impl = initialise(args, os);
+ impl->set_name(std::string(name));
+ return impl;
+ }
+};
+
+/**
+ * \relates DepthwiseImplementation
+ *
+ * Returns the sentinel-terminated list of implementations available for the
+ * given type combination; specialised per data type in the depthwise_*.cpp
+ * translation units.
+ */
+template <typename TInput, typename TWeight = TInput, typename TOutput = TInput, class OutputStage = Nothing>
+const DepthwiseImplementation<TInput, TWeight, TOutput, OutputStage> *depthwise_implementation_list();
+
+// Walk the implementation list for this type combination and select the
+// supported entry with the lowest cycle estimate, honouring any method /
+// name-substring filter supplied via args.config. Returns true iff a
+// usable implementation was found; on failure `selected` is left null.
+template <typename TInput, typename TWeight = TInput, typename TOutput = TInput, class OutputStage = Nothing>
+bool find_implementation(
+ const DepthwiseArgs &args,
+ const OutputStage &os,
+ const DepthwiseImplementation<TInput, TWeight, TOutput, OutputStage> * &selected
+)
+{
+ selected = nullptr;
+ uint64_t best_cycle_estimate = UINT64_MAX;
+
+ const auto *impl = depthwise_implementation_list<TInput, TWeight, TOutput, OutputStage>();
+ for (; impl->method != DepthwiseMethod::DEFAULT; impl++)
+ {
+ const bool has_cfg = (args.config != nullptr);
+ const auto &cfg = args.config;
+
+ if (
+ !impl->get_is_supported(args, os) || // Problem is unsupported
+ (has_cfg && cfg->method != DepthwiseMethod::DEFAULT && cfg->method != impl->method) ||
+ (has_cfg && cfg->filter != "" && !std::strstr(impl->name, cfg->filter.c_str()))
+ )
+ {
+ continue;
+ }
+
+ const auto cycle_estimate = impl->get_cycle_estimate(args, os);
+
+ // A zero estimate means "select this implementation immediately";
+ // entries earlier in the list therefore take priority.
+ if (cycle_estimate == 0)
+ {
+ selected = impl;
+ break;
+ }
+
+ if (selected == nullptr || cycle_estimate < best_cycle_estimate)
+ {
+ selected = impl;
+ best_cycle_estimate = cycle_estimate;
+ }
+ }
+
+ return (selected != nullptr);
+}
+
+// Enumerate every implementation that supports the given problem, recording
+// each entry's method, name, cycle estimate, and whether it is the one the
+// default selection heuristic would choose.
+template <typename TInput, typename TWeight, typename TOutput, class OutputStage>
+std::vector<KernelDescription> get_compatible_kernels(const DepthwiseArgs &args, const OutputStage &os)
+{
+ std::vector<KernelDescription> kerns;
+
+ // Find the default implementation so we can flag it accordingly
+ const DepthwiseImplementation<TInput, TWeight, TOutput, OutputStage> *default_impl;
+ find_implementation<TInput, TWeight, TOutput, OutputStage>(args, os, default_impl);
+
+ for (auto impl = depthwise_implementation_list<TInput, TWeight, TOutput, OutputStage>();
+ impl->method != DepthwiseMethod::DEFAULT; impl++)
+ {
+ if (!impl->get_is_supported(args, os))
+ {
+ continue;
+ }
+
+ kerns.emplace_back(
+ impl->method, impl->name, impl == default_impl,
+ impl->get_cycle_estimate(args, os)
+ );
+ }
+
+ return kerns;
+}
+
+// Factory entry point: select the best implementation for the given
+// arguments and output stage, and return an owning pointer to a freshly
+// constructed kernel (null if no implementation is usable).
+template <typename TInput, typename TWeight, typename TOutput, class OutputStage>
+UniqueDepthwiseCommon<TInput, TWeight, TOutput> depthwise(const DepthwiseArgs &args, const OutputStage &os)
+{
+ const DepthwiseImplementation<TInput, TWeight, TOutput, OutputStage> *impl = nullptr;
+ const bool success = find_implementation<TInput, TWeight, TOutput, OutputStage>(args, os, impl);
+ return UniqueDepthwiseCommon<TInput, TWeight, TOutput>(success ? impl->get_instance(args, os) : nullptr);
+}
+
+} // namespace depthwise
+} // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_implementation_constraints.hpp b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_implementation_constraints.hpp
new file mode 100644
index 0000000000..15064aeedc
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_implementation_constraints.hpp
@@ -0,0 +1,165 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+/* Utilities for constructing functions which constrain which kernels are
+ * selected for a given depthwise problem.
+ *
+ * It is expected that this will be included in the files which list the
+ * available kernels. To avoid multiple definitions, an anonymous namespace is
+ * used.
+ */
+
+#pragma once
+
+#include "arm_gemm.hpp"
+#include "depthwise.hpp"
+
+namespace arm_conv
+{
+namespace depthwise
+{
+namespace
+{
+
+// Typed and type-erased signatures for a kernel-selection constraint.
+template <class OutputStage>
+using ConstraintFn = std::function<bool(const DepthwiseArgs &, const OutputStage &)>;
+
+using GenericConstraintFn = std::function<bool(const DepthwiseArgs &, const void *)>;
+
+// Base case: a single constraint is returned unchanged.
+GenericConstraintFn make_constraint(const GenericConstraintFn &f) __attribute__ ((unused));
+GenericConstraintFn make_constraint(const GenericConstraintFn &f)
+{
+ return f;
+}
+
+// Recursive case: fold a list of constraints into one predicate that is the
+// logical AND of all of them, evaluated left-to-right with short-circuiting.
+template <typename ... Fs>
+GenericConstraintFn make_constraint(const GenericConstraintFn &f, Fs ... fs)
+{
+ return [f, fs...] (const DepthwiseArgs &args, const void *os) -> bool {
+ return f(args, os) && make_constraint(fs...)(args, os);
+ };
+}
+
+// Adapt a set of type-erased constraints to a typed output stage; the stage
+// is forwarded to the generic constraints as an opaque pointer.
+template <typename OutputStage=Nothing, typename ... Fs>
+ConstraintFn<OutputStage> constraint(Fs ... fs)
+{
+ return [fs...] (const DepthwiseArgs &args, const OutputStage &os) -> bool {
+ return make_constraint(fs...)(args, &os);
+ };
+}
+
+// Some useful constraints
+// True iff the problem's kernel size and stride exactly match those the
+// strategy was written for.
+template <class Strategy>
+bool is_supported(const DepthwiseArgs &args, const void *)
+{
+ return ((args.kernel_rows == Strategy::kernel_rows) &&
+ (args.kernel_cols == Strategy::kernel_cols) &&
+ (args.stride_rows == Strategy::stride_rows) &&
+ (args.stride_cols == Strategy::stride_cols));
+}
+
+// CPU feature gates: each reports whether the CPU described by the args
+// supports the relevant ISA extension. All are marked unused because any
+// one kernel list references only a subset of these helpers.
+bool cpu_has_dot_product(const DepthwiseArgs &args, const void *) __attribute__ ((unused));
+bool cpu_has_dot_product(const DepthwiseArgs &args, const void *)
+{
+ return args.cpu_info->has_dotprod();
+}
+
+bool cpu_has_sme(const DepthwiseArgs &args, const void *) __attribute__ ((unused));
+bool cpu_has_sme(const DepthwiseArgs &args, const void *)
+{
+ return args.cpu_info->has_sme();
+}
+
+bool cpu_has_sme2(const DepthwiseArgs &args, const void *) __attribute__ ((unused));
+bool cpu_has_sme2(const DepthwiseArgs &args, const void *)
+{
+ return args.cpu_info->has_sme2();
+}
+
+bool cpu_has_sve(const DepthwiseArgs &args, const void *) __attribute__ ((unused));
+bool cpu_has_sve(const DepthwiseArgs &args, const void *)
+{
+ return args.cpu_info->has_sve();
+}
+
+bool cpu_has_sve2(const DepthwiseArgs &args, const void *) __attribute__ ((unused));
+bool cpu_has_sve2(const DepthwiseArgs &args, const void *)
+{
+ return args.cpu_info->has_sve2();
+}
+
+bool cpu_has_fp16(const DepthwiseArgs &args, const void *) __attribute__ ((unused));
+bool cpu_has_fp16(const DepthwiseArgs &args, const void *)
+{
+ return args.cpu_info->has_fp16();
+}
+
+// True iff the depthwise channel multiplier is exactly one.
+bool has_no_channel_multiplier(const DepthwiseArgs &args, const void *) __attribute__ ((unused));
+bool has_no_channel_multiplier(const DepthwiseArgs &args, const void *)
+{
+ return args.channel_multiplier == 1;
+}
+
+// True iff a channel multiplier greater than one is requested.
+bool has_channel_multiplier(const DepthwiseArgs &args, const void *) __attribute__ ((unused));
+bool has_channel_multiplier(const DepthwiseArgs &args, const void *)
+{
+ return args.channel_multiplier > 1;
+}
+
+// Planar kernels require a "priming" step before the main processing loop. The kernels can prime with left padding
+// or input data, but not right padding - which could be needed in some extreme cases such as a 5x5 kernel, width 1
+// padding 2. These are rare enough and can be handled with other kernels anyway, so filter them out with this.
+bool no_prime_right_pad(const DepthwiseArgs &args, const void *) __attribute__ ((unused));
+bool no_prime_right_pad(const DepthwiseArgs &args, const void *)
+{
+ return (args.input_cols + args.padding.left) >= (args.kernel_cols - 1);
+}
+
+// Requantisation-parameter constraints: for quantised kernels the opaque
+// output-stage pointer is an arm_gemm::Requantize32.
+
+// True iff requantisation never applies a left shift (per-channel or
+// per-layer, depending on the quantisation mode).
+bool qp_has_no_left_shift(const DepthwiseArgs &args, const void *_qp) __attribute__ ((unused));
+bool qp_has_no_left_shift(const DepthwiseArgs &, const void *_qp)
+{
+ const auto qp = static_cast<const arm_gemm::Requantize32 *>(_qp);
+ return qp->per_channel_requant ?
+ (qp->per_channel_left_shifts == nullptr) :
+ (qp->per_layer_left_shift == 0);
+}
+
+// True iff the activation (input) zero point is zero.
+bool qp_zero_a_offset(const DepthwiseArgs &args, const void *_qp) __attribute__ ((unused));
+bool qp_zero_a_offset(const DepthwiseArgs &, const void *_qp)
+{
+ const auto qp = static_cast<const arm_gemm::Requantize32 *>(_qp);
+ return qp->a_offset == 0;
+}
+
+// True iff the clamp range covers the whole of T, so clamping is a no-op
+// and may be skipped by the kernel.
+template <typename T> bool qp_skip_clamp(const DepthwiseArgs &args, const void *_qp) __attribute__ ((unused));
+template <typename T> bool qp_skip_clamp(const DepthwiseArgs &, const void *_qp)
+{
+ const auto qp = static_cast<const arm_gemm::Requantize32 *>(_qp);
+ return (qp->minval == std::numeric_limits<T>::min() &&
+ qp->maxval == std::numeric_limits<T>::max());
+}
+
+} // namespace
+} // namespace depthwise
+} // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_planar.hpp b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_planar.hpp
new file mode 100644
index 0000000000..c3daaf04fe
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_planar.hpp
@@ -0,0 +1,408 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#include "depthfirst_driver.hpp"
+#include "interleaves/generic.hpp"
+
+namespace arm_conv {
+namespace depthwise {
+
+// Interface to a planar depthwise strategy, templated on the output stage
+// type (Nothing for non-quantised paths, arm_gemm::Requantize32 for
+// quantised paths).
+template <typename OutputStage>
+class IPlanarStrategy
+{
+ public:
+ virtual ~IPlanarStrategy() = default;
+ // Number of output rows produced by a single kernel invocation.
+ virtual unsigned int get_output_rows(void) const = 0;
+ // Vector length type used by the kernel.
+ virtual arm_gemm::VLType get_vl_type(void) const = 0;
+
+ // Bytes required for the packed parameter buffer for these arguments.
+ virtual size_t get_storage_size(const DepthwiseArgs &) const = 0;
+ // Pack weights (and biases) into `buffer` in the kernel's expected layout.
+ virtual void pack_parameters(
+ const DepthwiseArgs &args, void *buffer,
+ const void *biases, const OutputStage &,
+ const void *weights, size_t ld_weight_col, size_t ld_weight_row
+ ) const = 0;
+};
+
+
+// Traits mapping the (input, weight, output, accumulator) type tuple and the
+// output stage onto the planar kernel's function-pointer type; specialised
+// below for each supported output stage.
+template <typename TInput, typename TWeight, typename TOutput, typename TAccum,
+ typename OutputStage>
+struct PlanarKernelType;
+
+// Kernel signature for strategies with no output stage: the activation bounds
+// (act_min/act_max) are passed directly to the kernel.
+template <typename TInput, typename TWeight, typename TOutput, typename TAccum>
+struct PlanarKernelType<TInput, TWeight, TOutput, TAccum, Nothing>
+{
+ typedef void (*Type)(
+ const TInput *, size_t ld_in_row, size_t ld_in_col, size_t ld_in_vl,
+ unsigned int pad_top, unsigned int valid_input_rows,
+ unsigned int pad_left, unsigned int valid_input_cols,
+ const TWeight *, const TAccum *,
+ TOutput **, const size_t *, const size_t *, unsigned int output_cols,
+ unsigned int start_channels, unsigned int valid_channels,
+ TAccum act_min, TAccum act_max
+ );
+
+ // Forward to the kernel, pulling the activation bounds from the workspace.
+ template <typename WorkspaceType>
+ static inline void execute(
+ const Type fn,
+ const TInput *inptr, size_t ld_in_row, size_t ld_in_col, size_t ld_in_vl,
+ unsigned int pad_top, unsigned int valid_input_rows,
+ unsigned int pad_left, unsigned int valid_input_cols,
+ const TWeight *weights, const TAccum *bias,
+ TOutput **outptrs, const size_t *outlds, const size_t *outvllds, unsigned int output_cols,
+ unsigned int start_channel, unsigned int valid_channels,
+ const Nothing &, const WorkspaceType *ws
+ )
+ {
+ fn(
+ inptr, ld_in_row, ld_in_col, ld_in_vl,
+ pad_top, valid_input_rows,
+ pad_left, valid_input_cols,
+ weights, bias,
+ outptrs, outlds, outvllds, output_cols,
+ start_channel, valid_channels,
+ ws->activation_min, ws->activation_max
+ );
+ }
+};
+
+// Kernel signature for quantised (Requantize32) strategies: the quantisation
+// parameters are passed straight to the kernel, and the separate bias pointer
+// is ignored here (note the unnamed int32_t argument in execute()).
+template <typename TInput, typename TWeight, typename TOutput>
+struct PlanarKernelType<TInput, TWeight, TOutput, int32_t, arm_gemm::Requantize32>
+{
+ typedef void (*Type)(
+ const TInput *, size_t ld_in_row, size_t ld_in_col, size_t ld_in_vl,
+ unsigned int pad_top, unsigned int valid_input_rows,
+ unsigned int pad_left, unsigned int valid_input_cols,
+ const TWeight *,
+ TOutput **, const size_t *, const size_t *, unsigned int output_cols,
+ unsigned int start_channel, unsigned int valid_channels,
+ const arm_gemm::Requantize32 &
+ );
+
+ // Forward to the kernel; the workspace is not needed on this path.
+ template <typename WorkspaceType>
+ static inline void execute(
+ const Type fn,
+ const TInput *inptr, size_t ld_in_row, size_t ld_in_col, size_t ld_in_vl,
+ unsigned int pad_top, unsigned int valid_input_rows,
+ unsigned int pad_left, unsigned int valid_input_cols,
+ const TWeight *weights, const int32_t *,
+ TOutput **outptrs, const size_t *outlds, const size_t *outldvls, unsigned int output_cols,
+ unsigned int first_channel, unsigned int valid_channels,
+ const arm_gemm::Requantize32 &qp, const WorkspaceType *
+ )
+ {
+ fn(
+ inptr, ld_in_row, ld_in_col, ld_in_vl,
+ pad_top, valid_input_rows,
+ pad_left, valid_input_cols,
+ weights,
+ outptrs, outlds, outldvls, output_cols,
+ first_channel, valid_channels,
+ qp
+ );
+ }
+};
+
+
+// Common base for planar strategies: stores kernel/stride geometry and
+// implements parameter packing via the generic interleave helpers; concrete
+// strategies provide the kernel function through get_kernel().
+template <typename TInput, typename TWeight=TInput, typename TOutput=TInput,
+ typename TAccum=typename DefaultTAccum<TOutput>::Type,
+ typename OutputStage=typename DefaultOutputStage<TOutput>::Type>
+class PlanarStrategy : public IPlanarStrategy<OutputStage>
+{
+ unsigned int m_kernel_rows, m_kernel_cols;
+ unsigned int m_stride_rows, m_stride_cols;
+ unsigned int m_output_rows;
+ arm_gemm::VLType m_vl_type;
+
+ protected:
+ virtual bool get_kernel_packing_point(const unsigned int index, unsigned int &x, unsigned int &y) const
+ {
+ // Get the kernel point to pack at the given index; return false to
+ // indicate that this index (and all greater indices) is out of range.
+ if (m_kernel_rows * m_kernel_cols <= index)
+ return false;
+
+ y = index % m_kernel_cols;
+ x = index / m_kernel_cols;
+ return true;
+ }
+
+ // Describe how the generic interleave code should lay out the weights.
+ virtual interleaves::PackingArguments get_kernel_packing_arguments(void) const
+ {
+ return interleaves::PackingArguments(
+ m_kernel_rows, m_kernel_cols, sizeof(TWeight),
+ false, sizeof(TAccum), true, // Don't pack the bias
+ m_vl_type, sizeof(TAccum), 1, // Accumulator depth of 1 TODO
+ [this] (unsigned int idx, unsigned int &x, unsigned int &y) -> bool
+ { return this->get_kernel_packing_point(idx, x, y); }
+ );
+ }
+
+ public:
+ PlanarStrategy(
+ unsigned int kernel_rows, unsigned int kernel_cols,
+ unsigned int stride_rows, unsigned int stride_cols,
+ unsigned int output_rows,
+ arm_gemm::VLType vl_type
+ ) : m_kernel_rows(kernel_rows), m_kernel_cols(kernel_cols),
+ m_stride_rows(stride_rows), m_stride_cols(stride_cols),
+ m_output_rows(output_rows), m_vl_type(vl_type)
+ {
+ }
+
+ unsigned int get_output_rows(void) const override { return m_output_rows; }
+ arm_gemm::VLType get_vl_type(void) const override { return m_vl_type; }
+
+ size_t get_storage_size(const DepthwiseArgs &args) const override
+ {
+ return interleaves::get_storage_size_generic(this->get_kernel_packing_arguments(), args);
+ }
+
+ void pack_parameters(
+ const DepthwiseArgs &args, void *buffer,
+ const void *biases, const OutputStage &,
+ const void *weights, size_t ld_weight_col, size_t ld_weight_row
+ ) const override
+ {
+ interleaves::pack_parameters_generic(
+ this->get_kernel_packing_arguments(), args,
+ buffer, biases, weights, ld_weight_col, ld_weight_row
+ );
+ }
+
+ // Concrete strategies return their kernel function pointer here.
+ using KernelType = typename PlanarKernelType<TInput, TWeight, TOutput, TAccum, OutputStage>::Type;
+ virtual KernelType get_kernel(void) const = 0;
+};
+
+
+namespace {
+
+// Workspace element providing, per output row, a pointer and two strides,
+// plus a small buffer into which out-of-range (padded) stores can be
+// redirected.
+template <typename T>
+struct OutputRowPtrsElement
+{
+ struct Workspace
+ {
+ T **output_row_ptrs;
+ size_t *output_ld_cols;
+ size_t *output_ld_vls; // Stride between vectors of channels
+ T *output_padding_buffer;
+ };
+
+ template <typename OutputStage>
+ static size_t get_element_size(const WorkspaceArgs<IPlanarStrategy<OutputStage>, OutputStage> &args)
+ {
+ // We need one pointer and stride for each row of output, and an additional
+ // blob of memory into which padded stores can go.
+ return args.strategy->get_output_rows() * (sizeof(T *) + 2*sizeof(size_t)) +
+ get_vector_length<char>(args.strategy->get_vl_type());
+ }
+
+ // Carve the pointer/stride arrays and the padding buffer out of `buffer`;
+ // returns the first byte following this element.
+ template <typename WorkspaceType, typename OutputStage>
+ static void *initialise(WorkspaceType *ws, void *buffer,
+ const WorkspaceArgs<IPlanarStrategy<OutputStage>, OutputStage> &args)
+ {
+ const auto n_rows = args.strategy->get_output_rows();
+ ws->output_row_ptrs = reinterpret_cast<T **>(buffer);
+ ws->output_ld_cols = reinterpret_cast<size_t *>(ws->output_row_ptrs + n_rows);
+ ws->output_ld_vls = ws->output_ld_cols + n_rows;
+ ws->output_padding_buffer = reinterpret_cast<T *>(ws->output_ld_vls + n_rows);
+ return ws->output_padding_buffer + get_vector_length<T>(args.strategy->get_vl_type());
+ }
+};
+
+} // namespace {anonymous}
+
+
+// Depthwise driver for planar strategies: each kernel invocation computes a
+// full-width strip of output rows; batches are iterated sequentially and the
+// row strips within a batch are striped across threads.
+template <typename TInput, typename TWeight=TInput, typename TOutput=TInput,
+ typename TAccum=typename DefaultTAccum<TOutput>::Type,
+ typename OutputStage=typename DefaultOutputStage<TOutput>::Type>
+class DepthwisePlanar : public DepthwiseCommon<TInput, TWeight, TOutput>
+{
+ using Parent = DepthwiseCommon<TInput, TWeight, TOutput>;
+ using StrategyType = IPlanarStrategy<OutputStage>;
+ using WorkspaceManager = Workspace<
+ OutputRowPtrsElement<TOutput>,
+ ActivationsElement<TAccum, OutputStage>
+ >;
+ using WorkspaceType = typename WorkspaceManager::WorkspaceType;
+
+ std::unique_ptr<StrategyType> m_strat;
+ const TAccum *m_bias;
+ OutputStage m_os;
+
+ public:
+ // Takes ownership of `strat`.
+ DepthwisePlanar(StrategyType *const strat, const DepthwiseArgs &args, const OutputStage &os = {})
+ : Parent(args), m_strat(strat), m_bias(nullptr), m_os(os)
+ {
+ }
+
+ DepthwisePlanar(DepthwisePlanar &) = delete;
+ DepthwisePlanar &operator=(DepthwisePlanar &) = delete;
+
+ size_t get_storage_size(void) const override
+ {
+ return m_strat->get_storage_size(this->m_args);
+ }
+
+ // Pack parameters and retain the bias pointer for later kernel calls.
+ void pack_parameters(
+ void *buffer, const void *biases,
+ const void *weights, size_t ld_weight_col, size_t ld_weight_row
+ ) override
+ {
+ m_strat->pack_parameters(this->m_args, buffer, biases, {}, weights, ld_weight_col, ld_weight_row);
+ this->m_bias = reinterpret_cast<const TAccum *>(biases);
+ depthwise_depthfirst::stash_bias(this->m_os, biases);
+ }
+
+ size_t get_working_size(unsigned int n_threads) const override
+ {
+ return this->get_working_size_per_thread() * n_threads;
+ }
+
+ protected:
+ /* Compute the amount of working space required for a single thread. */
+ virtual size_t get_working_size_per_thread(void) const
+ {
+ return WorkspaceManager::get_sizeof_workspace(
+ WorkspaceArgs<IPlanarStrategy<OutputStage>, OutputStage>(m_strat.get(), this->m_args, m_os));
+ }
+
+ /* Initialise the working space for a thread. */
+ virtual void initialise_working_space(void *buffer) const
+ {
+ WorkspaceManager::initialise(
+ buffer,
+ WorkspaceArgs<IPlanarStrategy<OutputStage>, OutputStage>(m_strat.get(), this->m_args, m_os)
+ );
+ }
+
+ /* Execute the kernel for a given chunk of work. */
+ virtual void execute_kernel(
+ const TInput *inptr, size_t ld_in_row, size_t ld_in_col, size_t ld_in_vl,
+ unsigned int pad_top, unsigned int valid_input_rows,
+ unsigned int pad_left, unsigned int valid_input_cols,
+ const TWeight *weights, const TAccum *bias,
+ TOutput *outptr, size_t ld_out_row, size_t ld_out_col, size_t ld_out_vl,
+ unsigned int valid_output_rows, unsigned int valid_output_cols,
+ unsigned int first_channel, unsigned int valid_channels,
+ WorkspaceType *ws
+ ) const
+ {
+ // Initialise the output pointers
+ for (auto i = 0u; i < m_strat->get_output_rows(); i++)
+ {
+ // Point at the output tensor for all valid rows; otherwise point at the
+ // padding buffer.
+ ws->output_row_ptrs[i] = i < valid_output_rows ? outptr : ws->output_padding_buffer;
+ ws->output_ld_cols[i] = i < valid_output_rows ? ld_out_col : 0;
+ ws->output_ld_vls[i] = i < valid_output_rows ? ld_out_vl : 0;
+ outptr += ld_out_row;
+ }
+
+ // Execute the kernel
+ PlanarKernelType<TInput, TWeight, TOutput, TAccum, OutputStage>::template execute<WorkspaceType>(
+ reinterpret_cast<const PlanarStrategy<TInput, TWeight, TOutput, TAccum, OutputStage> *>(m_strat.get())->get_kernel(),
+ inptr, ld_in_row, ld_in_col, ld_in_vl,
+ pad_top, valid_input_rows, pad_left, valid_input_cols,
+ weights, bias,
+ ws->output_row_ptrs, ws->output_ld_cols, ws->output_ld_vls,
+ valid_output_cols, first_channel, valid_channels,
+ this->m_os, ws
+ );
+ }
+
+ void execute_internal(
+ const DepthwiseArgs &args,
+ const void *input,
+ size_t ld_input_col,
+ size_t ld_input_row,
+ size_t ld_input_batch,
+ const void *parameters,
+ void *output,
+ size_t ld_output_col,
+ size_t ld_output_row,
+ size_t ld_output_batch,
+ void *working_space,
+ unsigned int thread_id,
+ unsigned int n_threads
+ ) const override
+ {
+ // Get and initialise the working space for this thread.
+ void *thread_working_space =
+ static_cast<uint8_t *>(working_space) + thread_id * this->get_working_size_per_thread();
+ this->initialise_working_space(thread_working_space);
+ auto ws = reinterpret_cast<WorkspaceType *>(thread_working_space);
+
+ const auto n_output_channels = args.input_channels * args.channel_multiplier;
+ const auto vl = get_vector_length<TAccum>(m_strat->get_vl_type());
+
+ // Get typed pointers
+ auto input_batch = reinterpret_cast<const TInput *>(input);
+ auto output_batch = reinterpret_cast<TOutput *>(output);
+ auto weights = reinterpret_cast<const TWeight *>(parameters);
+
+ // Iterate over batches
+ for (auto batches = args.n_batches; batches; batches--)
+ {
+ // NOTE: Other loop orderings are possible and it would be worth
+ // investigating them.
+
+ // Within a batch, stripe threads across rows.
+ for (auto start_output_i = thread_id * m_strat->get_output_rows();
+ start_output_i < args.output_rows;
+ start_output_i += n_threads * m_strat->get_output_rows())
+ {
+ // Determine what (if any padding) is required on the top/bottom of
+ // this row of the convolution.
+ const int start_input_i = start_output_i * args.stride_rows - args.padding.top;
+ const unsigned int input_pad_top = start_input_i < 0 ? -start_input_i : 0;
+ const unsigned int input_i = start_input_i < 0 ? 0 : start_input_i;
+ const unsigned int valid_input_rows = input_i > args.input_rows ? 0 : args.input_rows - input_i;
+ const unsigned int valid_output_rows = args.output_rows - start_output_i;
+
+ auto inptr_row = input_batch + input_i*ld_input_row;
+ auto outptr_row = output_batch + start_output_i * ld_output_row;
+
+ // Execute the kernel
+ this->execute_kernel(
+ inptr_row, ld_input_row, ld_input_col, vl,
+ input_pad_top, valid_input_rows, args.padding.left, args.input_cols,
+ weights, this->m_bias,
+ outptr_row, ld_output_row, ld_output_col, vl,
+ valid_output_rows, args.output_cols,
+ 0 /* first channel */, n_output_channels,
+ ws
+ );
+ }
+
+ // Update the input and output pointers to account for batch
+ input_batch += ld_input_batch;
+ output_batch += ld_output_batch;
+ }
+ }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_s8q.cpp b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_s8q.cpp
new file mode 100644
index 0000000000..6ecdc36bf0
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_s8q.cpp
@@ -0,0 +1,351 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_gemm_local.hpp"
+
+#include "depthwise_implementation.hpp"
+#include "depthwise_depthfirst.hpp"
+#include "depthwise_depthfirst_generic.hpp"
+#include "depthwise_depthfirst_multiplier.hpp"
+#include "depthwise_planar.hpp"
+
+#include "depthwise_implementation_constraints.hpp"
+
+#if defined(__aarch64__)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SME2)
+#include "kernels/sme2_s8q_planar_3x3_s1_4rows_dot_za.hpp"
+#include "kernels/sme2_s8q_planar_3x3_s2_4rows_dot_za.hpp"
+#include "kernels/sme2_s8q_planar_5x5_s1_4rows_dot_za.hpp"
+#include "kernels/sme2_s8q_planar_5x5_s2_4rows_dot_za.hpp"
+#endif // defined(ARM_COMPUTE_ENABLE_SME2)
+
+#include "kernels/sve_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp"
+#include "kernels/sve_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp"
+#include "kernels/sve_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp"
+#include "kernels/sve_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp"
+#include "kernels/sve_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp"
+#include "kernels/sve_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst.hpp"
+#include "kernels/sve_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst.hpp"
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
+#include "kernels/a64_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp"
+#include "kernels/a64_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp"
+#include "kernels/a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp"
+#include "kernels/a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp"
+#include "kernels/a64_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp"
+#include "kernels/a64_s8q_nhwc_generic_output9_mla_depthfirst.hpp"
+#include "kernels/a64_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst.hpp"
+#include "kernels/a64_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst.hpp"
+#include "kernels/a64_s8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp"
+#endif // defined(__aarch64__)
+
+#include <cstdint>
+
+using arm_gemm::Requantize32;
+
+namespace arm_conv {
+namespace depthwise {
+
+namespace
+{
+#if defined(__aarch64__)
+// Constraint: the weight zero point is zero (symmetric weight quantisation).
+bool qp_weights_are_symmetric(const DepthwiseArgs &, const void *_qp)
+{
+ const auto qp = static_cast<const arm_gemm::Requantize32 *>(_qp);
+ return qp->b_offset == 0;
+}
+
+// Cycle estimate which makes a method the last-resort choice: it is only
+// selected when no better-scoring candidate satisfies its constraints.
+uint64_t not_preferred(const DepthwiseArgs &, const Requantize32 &)
+{
+ return std::numeric_limits<uint64_t>::max();
+}
+#endif // defined(__aarch64__)
+}
+
+// Table of candidate s8q depthwise implementations, listed in decreasing
+// order of preference; the selection code walks the table and picks the
+// first entry whose constraints are satisfied (subject to cycle estimates).
+static const DepthwiseImplementation<int8_t, int8_t, int8_t, Requantize32> depthwise_s8q_methods[] = {
+#if defined(__aarch64__)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SME2)
+ {
+ DepthwiseMethod::PLANAR,
+ "sme2_s8q_planar_3x3_s1_4rows_dot_za",
+ constraint<Requantize32>(cpu_has_sme, cpu_has_sme2,
+ is_supported<sme2_s8q_planar_3x3_s1_4rows_dot_za>,
+ has_no_channel_multiplier,
+ qp_has_no_left_shift, no_prime_right_pad),
+ nullptr,
+ [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<int8_t, int8_t, int8_t> * {
+ auto strat = new sme2_s8q_planar_3x3_s1_4rows_dot_za(args.cpu_info);
+ return new DepthwisePlanar<int8_t>(strat, args, qp);
+ },
+ },
+ {
+ DepthwiseMethod::PLANAR,
+ "sme2_s8q_planar_3x3_s2_4rows_dot_za",
+ constraint<Requantize32>(cpu_has_sme, cpu_has_sme2,
+ is_supported<sme2_s8q_planar_3x3_s2_4rows_dot_za>,
+ has_no_channel_multiplier,
+ qp_has_no_left_shift, no_prime_right_pad),
+ nullptr,
+ [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<int8_t, int8_t, int8_t> * {
+ auto strat = new sme2_s8q_planar_3x3_s2_4rows_dot_za(args.cpu_info);
+ return new DepthwisePlanar<int8_t>(strat, args, qp);
+ },
+ },
+ {
+ DepthwiseMethod::PLANAR,
+ "sme2_s8q_planar_5x5_s1_4rows_dot_za",
+ constraint<Requantize32>(cpu_has_sme, cpu_has_sme2,
+ is_supported<sme2_s8q_planar_5x5_s1_4rows_dot_za>,
+ has_no_channel_multiplier,
+ qp_has_no_left_shift, no_prime_right_pad),
+ nullptr,
+ [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<int8_t, int8_t, int8_t> * {
+ auto strat = new sme2_s8q_planar_5x5_s1_4rows_dot_za(args.cpu_info);
+ return new DepthwisePlanar<int8_t>(strat, args, qp);
+ },
+ },
+ {
+ DepthwiseMethod::PLANAR,
+ "sme2_s8q_planar_5x5_s2_4rows_dot_za",
+ constraint<Requantize32>(cpu_has_sme, cpu_has_sme2,
+ is_supported<sme2_s8q_planar_5x5_s2_4rows_dot_za>,
+ has_no_channel_multiplier,
+ qp_has_no_left_shift, no_prime_right_pad),
+ nullptr,
+ [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<int8_t, int8_t, int8_t> * {
+ auto strat = new sme2_s8q_planar_5x5_s2_4rows_dot_za(args.cpu_info);
+ return new DepthwisePlanar<int8_t>(strat, args, qp);
+ },
+ },
+#endif // defined(ARM_COMPUTE_ENABLE_SME2)
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "sve_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst",
+ constraint<Requantize32>(is_supported<sve_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst>,
+ qp_has_no_left_shift,
+ qp_weights_are_symmetric,
+ cpu_has_sve2),
+ nullptr,
+ [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<int8_t, int8_t, int8_t> * {
+ auto strat = new sve_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirst<int8_t>(strat, args, qp);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "sve_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst",
+ constraint<Requantize32>(is_supported<sve_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst>,
+ qp_has_no_left_shift,
+ cpu_has_sve2),
+ nullptr,
+ [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<int8_t, int8_t, int8_t> * {
+ auto strat = new sve_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirst<int8_t>(strat, args, qp);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "sve_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst",
+ constraint<Requantize32>(is_supported<sve_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst>,
+ qp_has_no_left_shift,
+ cpu_has_sve2),
+ nullptr,
+ [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<int8_t, int8_t, int8_t> * {
+ auto strat = new sve_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirst<int8_t>(strat, args, qp);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "sve_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst",
+ constraint<Requantize32>(is_supported<sve_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst>,
+ qp_has_no_left_shift,
+ cpu_has_sve2),
+ nullptr,
+ [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<int8_t, int8_t, int8_t> * {
+ auto strat = new sve_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirst<int8_t>(strat, args, qp);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "sve_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst",
+ constraint<Requantize32>(is_supported<sve_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst>,
+ qp_has_no_left_shift,
+ cpu_has_sve2),
+ nullptr,
+ [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<int8_t, int8_t, int8_t> * {
+ auto strat = new sve_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirst<int8_t>(strat, args, qp);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "sve_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst",
+ constraint<Requantize32>(is_supported<sve_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst>,
+ qp_has_no_left_shift,
+ has_channel_multiplier,
+ cpu_has_sve2),
+ not_preferred,
+ [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<int8_t, int8_t, int8_t> * {
+ auto strat = new sve_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirstMultiplier<int8_t, int8_t, int8_t, int32_t, false>(strat, args, qp);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "sve_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst",
+ constraint<Requantize32>(is_supported<sve_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst>,
+ qp_has_no_left_shift,
+ has_channel_multiplier,
+ cpu_has_sve2),
+ not_preferred,
+ [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<int8_t, int8_t, int8_t> * {
+ auto strat = new sve_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirstMultiplier<int8_t, int8_t, int8_t, int32_t, false>(strat, args, qp);
+ },
+ },
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "a64_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst",
+ constraint<Requantize32>(is_supported<a64_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst>,
+ qp_weights_are_symmetric,
+ qp_has_no_left_shift,
+ cpu_has_dot_product),
+ nullptr,
+ [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<int8_t, int8_t, int8_t> * {
+ auto strat = new a64_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirst<int8_t>(strat, args, qp);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "a64_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst",
+ constraint<Requantize32>(is_supported<a64_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst>,
+ qp_has_no_left_shift,
+ cpu_has_dot_product),
+ nullptr,
+ [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<int8_t, int8_t, int8_t> * {
+ auto strat = new a64_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirst<int8_t>(strat, args, qp);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst",
+ constraint<Requantize32>(is_supported<a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst>,
+ qp_has_no_left_shift),
+ nullptr,
+ [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<int8_t, int8_t, int8_t> * {
+ auto strat = new a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirst<int8_t>(strat, args, qp);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst",
+ constraint<Requantize32>(is_supported<a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst>,
+ qp_has_no_left_shift),
+ nullptr,
+ [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<int8_t, int8_t, int8_t> * {
+ auto strat = new a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirst<int8_t>(strat, args, qp);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "a64_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst",
+ constraint<Requantize32>(is_supported<a64_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst>,
+ qp_has_no_left_shift),
+ nullptr,
+ [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<int8_t, int8_t, int8_t> * {
+ auto strat = new a64_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirst<int8_t>(strat, args, qp);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "a64_s8q_nhwc_generic_output3x3_mla_depthfirst",
+ nullptr,
+ nullptr,
+ [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<int8_t, int8_t, int8_t> * {
+ auto kernel = new a64_s8q_nhwc_generic_output9_mla_depthfirst(args.cpu_info);
+ auto strat = new GenericDepthfirstStrategy<int8_t>(kernel, 3, 3, args);
+ return new DepthwiseDepthfirstGeneric<int8_t>(strat, args, qp);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "a64_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst",
+ constraint<Requantize32>(is_supported<a64_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst>,
+ qp_has_no_left_shift,
+ has_channel_multiplier,
+ cpu_has_dot_product),
+ not_preferred,
+ [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<int8_t, int8_t, int8_t> * {
+ auto strat = new a64_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirstMultiplier<int8_t, int8_t, int8_t, int32_t, false>(strat, args, qp);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "a64_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst",
+ constraint<Requantize32>(is_supported<a64_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst>,
+ qp_has_no_left_shift,
+ has_channel_multiplier,
+ cpu_has_dot_product),
+ not_preferred,
+ [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<int8_t, int8_t, int8_t> * {
+ auto strat = new a64_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirstMultiplier<int8_t, int8_t, int8_t, int32_t, false>(strat, args, qp);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "a64_s8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst",
+ constraint<Requantize32>(has_channel_multiplier),
+ not_preferred,
+ [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<int8_t, int8_t, int8_t> * {
+ auto kern = new a64_s8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst(args.cpu_info);
+ auto strat = new GenericDepthfirstMultiplierStrategy<int8_t>(kern, args);
+ return new DepthwiseDepthfirstMultiplier<int8_t, int8_t, int8_t, int32_t, true>(strat, args, qp);
+ },
+ },
+#endif // defined(__aarch64__)
+ { DepthwiseMethod::DEFAULT, "", nullptr, nullptr, nullptr }, // End of list
+};
+
+// Provide the s8q method table to the generic implementation-selection code.
+template <>
+const DepthwiseImplementation<int8_t, int8_t, int8_t, Requantize32> *depthwise_implementation_list()
+{
+ return depthwise_s8q_methods;
+}
+
+// Explicit instantiations for the s8q type combination.
+template UniqueDepthwiseCommon<int8_t, int8_t, int8_t> depthwise(const DepthwiseArgs &, const Requantize32 &);
+template std::vector<KernelDescription> get_compatible_kernels<int8_t, int8_t, int8_t, Requantize32>(const DepthwiseArgs &, const Requantize32 &);
+
+} // namespace depthwise
+} // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_strategies_common.cpp b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_strategies_common.cpp
new file mode 100644
index 0000000000..37892b6963
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_strategies_common.cpp
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "depthwise_strategies_common.hpp"
+
+namespace arm_conv {
+namespace depthwise {
+
+// Number of input rows consumed to produce one tile of output rows.
+unsigned int DepthfirstStrategyUntyped::get_input_rows() const
+{
+ return this->get_kernel_rows() + (this->get_output_rows() - 1) * this->get_stride_rows();
+}
+
+// Number of input columns consumed to produce one tile of output columns.
+unsigned int DepthfirstStrategyUntyped::get_input_cols() const
+{
+ return this->get_kernel_cols() + (this->get_output_cols() - 1) * this->get_stride_cols();
+}
+
+// Convenience products of the row/column counts above.
+unsigned int DepthfirstStrategyUntyped::get_n_input_points() const { return this->get_input_rows() * this->get_input_cols(); }
+unsigned int DepthfirstStrategyUntyped::get_n_output_points() const { return this->get_output_rows() * this->get_output_cols(); }
+unsigned int DepthfirstStrategyUntyped::get_n_kernel_points() const { return this->get_kernel_rows() * this->get_kernel_cols(); }
+
+// By default, depthfirst strategies take the premultiplied-input path.
+bool DepthfirstStrategyUntyped::uses_premultiply() const { return true; }
+
+// Default accumulator depth: one vector's worth.
+unsigned int DepthfirstStrategyUntyped::get_accumulator_depth_vl() const { return 1; }
+
+bool DepthfirstStrategyUntyped::get_kernel_packing_point(const unsigned int index, unsigned int &x, unsigned int &y) const
+{
+ // Get the kernel point to pack at the given index; return false to
+ // indicate that this index, and all greater indices, is out of range.
+ if (index < (this->get_kernel_cols() * this->get_kernel_rows()))
+ {
+ y = index % this->get_kernel_cols();
+ x = index / this->get_kernel_cols();
+ return true;
+ }
+ return false;
+}
+
+} // namespace depthwise
+} // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_strategies_common.hpp b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_strategies_common.hpp
new file mode 100644
index 0000000000..19cf26dd2f
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_strategies_common.hpp
@@ -0,0 +1,97 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#include "utils.hpp"
+#include "interleaves/generic.hpp"
+#include "depthfirst_driver.hpp"
+
+namespace arm_conv {
+namespace depthwise {
+
+class DepthfirstStrategyUntyped : public IDepthfirstStrategy
+{
+ public:
+ virtual arm_gemm::VLType get_vl_type() const = 0;
+
+ virtual unsigned int get_kernel_rows() const = 0;
+ virtual unsigned int get_kernel_cols() const = 0;
+
+ virtual unsigned int get_stride_rows() const = 0;
+ virtual unsigned int get_stride_cols() const = 0;
+
+ virtual unsigned int get_input_rows() const override;
+ virtual unsigned int get_input_cols() const override;
+
+ virtual unsigned int get_n_input_points() const;
+ virtual unsigned int get_n_output_points() const;
+ virtual unsigned int get_n_kernel_points() const;
+
+ virtual bool uses_premultiply() const;
+
+ // Get the number of VLs used in the accumulator, this defaults to 1.
+ virtual unsigned int get_accumulator_depth_vl() const;
+
+ // Get the order in which to pack the weights, this defaults to a row-major
+ // sweep over the weight tensor.
+ virtual bool get_kernel_packing_point(const unsigned int index, unsigned int &x, unsigned int &y) const;
+};
+
+template <typename TInput, typename TWeight, typename TOutput, typename TAccum, typename OutputStage>
+class DepthfirstStrategy : public DepthfirstStrategyUntyped
+{
+ public:
+ virtual size_t get_storage_size(const DepthwiseArgs &args) const
+ {
+ interleaves::PackingArguments packing_args(
+ this->get_kernel_rows(), this->get_kernel_cols(), sizeof(TWeight),
+ true, sizeof(TAccum), this->uses_premultiply(),
+ this->get_vl_type(), sizeof(TAccum), this->get_accumulator_depth_vl(),
+ [this] (unsigned int idx, unsigned int &x, unsigned int &y) -> bool
+ { return this->get_kernel_packing_point(idx, x, y); }
+ );
+ return interleaves::get_storage_size_generic(packing_args, args);
+ }
+
+ virtual void pack_parameters(
+ const DepthwiseArgs &args, void *buffer,
+ const void *biases, const OutputStage &,
+ const void *weights, size_t ld_weight_col, size_t ld_weight_row
+ ) const
+ {
+ interleaves::PackingArguments packing_args(
+ this->get_kernel_rows(), this->get_kernel_cols(), sizeof(TWeight),
+ true, sizeof(TAccum), this->uses_premultiply(),
+ this->get_vl_type(), sizeof(TAccum), this->get_accumulator_depth_vl(),
+ [this] (unsigned int idx, unsigned int &x, unsigned int &y) -> bool
+ { return this->get_kernel_packing_point(idx, x, y); }
+ );
+ interleaves::pack_parameters_generic(
+ packing_args, args, buffer, biases, weights, ld_weight_col, ld_weight_row);
+ }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_u8q.cpp b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_u8q.cpp
new file mode 100644
index 0000000000..236930ee26
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_u8q.cpp
@@ -0,0 +1,362 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_gemm_local.hpp"
+
+#include "depthwise_implementation.hpp"
+#include "depthwise_depthfirst.hpp"
+#include "depthwise_depthfirst_generic.hpp"
+#include "depthwise_depthfirst_multiplier.hpp"
+#include "depthwise_planar.hpp"
+
+#include "depthwise_implementation_constraints.hpp"
+
+#if defined(__aarch64__)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SME2)
+#include "kernels/sme2_u8q_planar_3x3_s1_4rows_dot_za.hpp"
+#include "kernels/sme2_u8q_planar_3x3_s2_4rows_dot_za.hpp"
+#include "kernels/sme2_u8q_planar_5x5_s1_4rows_dot_za.hpp"
+#include "kernels/sme2_u8q_planar_5x5_s2_4rows_dot_za.hpp"
+#endif // defined(ARM_COMPUTE_ENABLE_SME2)
+
+#include "kernels/sve_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp"
+#include "kernels/sve_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp"
+#include "kernels/sve_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp"
+#include "kernels/sve_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp"
+#include "kernels/sve_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst.hpp"
+#include "kernels/sve_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst.hpp"
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
+#include "kernels/a64_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp"
+
+#include "kernels/a64_u8qa_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp"
+#include "kernels/a64_u8qa_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp"
+#include "kernels/a64_u8qa_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp"
+
+#include "kernels/a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp"
+#include "kernels/a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp"
+#include "kernels/a64_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp"
+#include "kernels/a64_u8q_nhwc_generic_output9_mla_depthfirst.hpp"
+#include "kernels/a64_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst.hpp"
+#include "kernels/a64_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst.hpp"
+#include "kernels/a64_u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp"
+
+#endif // defined(__aarch64__)
+
+#include <cstdint>
+
+using arm_gemm::Requantize32;
+
+namespace arm_conv {
+namespace depthwise {
+
+namespace
+{
+#if defined(__aarch64__)
+uint64_t not_preferred(const DepthwiseArgs &, const Requantize32 &)
+{
+ return std::numeric_limits<uint64_t>::max();
+}
+#endif // defined(__aarch64__)
+}
+
+static const DepthwiseImplementation<uint8_t, uint8_t, uint8_t, Requantize32> depthwise_u8q_methods[] = {
+#if defined(__aarch64__)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SME2)
+ {
+ DepthwiseMethod::PLANAR,
+ "sme2_u8q_planar_3x3_s1_4rows_dot_za",
+ constraint<Requantize32>(cpu_has_sme, cpu_has_sme2,
+ is_supported<sme2_u8q_planar_3x3_s1_4rows_dot_za>,
+ has_no_channel_multiplier,
+ qp_has_no_left_shift, no_prime_right_pad),
+ nullptr,
+ [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, uint8_t, uint8_t> * {
+ auto strat = new sme2_u8q_planar_3x3_s1_4rows_dot_za(args.cpu_info);
+ return new DepthwisePlanar<uint8_t>(strat, args, qp);
+ },
+ },
+ {
+ DepthwiseMethod::PLANAR,
+ "sme2_u8q_planar_3x3_s2_4rows_dot_za",
+ constraint<Requantize32>(cpu_has_sme, cpu_has_sme2,
+ is_supported<sme2_u8q_planar_3x3_s2_4rows_dot_za>,
+ has_no_channel_multiplier,
+ qp_has_no_left_shift, no_prime_right_pad),
+ nullptr,
+ [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, uint8_t, uint8_t> * {
+ auto strat = new sme2_u8q_planar_3x3_s2_4rows_dot_za(args.cpu_info);
+ return new DepthwisePlanar<uint8_t>(strat, args, qp);
+ },
+ },
+ {
+ DepthwiseMethod::PLANAR,
+ "sme2_u8q_planar_5x5_s1_4rows_dot_za",
+ constraint<Requantize32>(cpu_has_sme, cpu_has_sme2,
+ is_supported<sme2_u8q_planar_5x5_s1_4rows_dot_za>,
+ has_no_channel_multiplier,
+ qp_has_no_left_shift, no_prime_right_pad),
+ nullptr,
+ [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, uint8_t, uint8_t> * {
+ auto strat = new sme2_u8q_planar_5x5_s1_4rows_dot_za(args.cpu_info);
+ return new DepthwisePlanar<uint8_t>(strat, args, qp);
+ },
+ },
+ {
+ DepthwiseMethod::PLANAR,
+ "sme2_u8q_planar_5x5_s2_4rows_dot_za",
+ constraint<Requantize32>(cpu_has_sme, cpu_has_sme2,
+ is_supported<sme2_u8q_planar_5x5_s2_4rows_dot_za>,
+ has_no_channel_multiplier,
+ qp_has_no_left_shift, no_prime_right_pad),
+ nullptr,
+ [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, uint8_t, uint8_t> * {
+ auto strat = new sme2_u8q_planar_5x5_s2_4rows_dot_za(args.cpu_info);
+ return new DepthwisePlanar<uint8_t>(strat, args, qp);
+ },
+ },
+#endif // defined(ARM_COMPUTE_ENABLE_SME2)
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "sve_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst",
+ constraint<Requantize32>(is_supported<sve_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst>,
+ qp_has_no_left_shift,
+ cpu_has_sve2),
+ nullptr,
+ [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, uint8_t, uint8_t> * {
+ auto strat = new sve_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirst<uint8_t>(strat, args, qp);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "sve_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst",
+ constraint<Requantize32>(is_supported<sve_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst>,
+ qp_has_no_left_shift,
+ cpu_has_sve2),
+ nullptr,
+ [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, uint8_t, uint8_t> * {
+ auto strat = new sve_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirst<uint8_t>(strat, args, qp);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "sve_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst",
+ constraint<Requantize32>(is_supported<sve_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst>,
+ qp_has_no_left_shift,
+ cpu_has_sve2),
+ nullptr,
+ [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, uint8_t, uint8_t> * {
+ auto strat = new sve_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirst<uint8_t>(strat, args, qp);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "sve_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst",
+ constraint<Requantize32>(is_supported<sve_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst>,
+ qp_has_no_left_shift,
+ cpu_has_sve2),
+ nullptr,
+ [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, uint8_t, uint8_t> * {
+ auto strat = new sve_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirst<uint8_t>(strat, args, qp);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "sve_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst",
+ constraint<Requantize32>(is_supported<sve_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst>,
+ qp_has_no_left_shift,
+ has_channel_multiplier,
+ cpu_has_sve2),
+ not_preferred,
+ [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, uint8_t, uint8_t> * {
+ auto strat = new sve_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirstMultiplier<uint8_t, uint8_t, uint8_t, int32_t, false>(strat, args, qp);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "sve_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst",
+ constraint<Requantize32>(is_supported<sve_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst>,
+ qp_has_no_left_shift,
+ has_channel_multiplier,
+ cpu_has_sve2),
+ not_preferred,
+ [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, uint8_t, uint8_t> * {
+ auto strat = new sve_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirstMultiplier<uint8_t, uint8_t, uint8_t, int32_t, false>(strat, args, qp);
+ },
+ },
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "a64_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst",
+ constraint<Requantize32>(is_supported<a64_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst>,
+ cpu_has_dot_product,
+ qp_has_no_left_shift),
+ nullptr,
+ [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, uint8_t, uint8_t> * {
+ auto strat = new a64_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirst<uint8_t>(strat, args, qp);
+ },
+ },
+
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "a64_u8qa_nhwc_3x3_s1_output2x2_mla_depthfirst",
+ constraint<Requantize32>(is_supported<a64_u8qa_nhwc_3x3_s1_output2x2_mla_depthfirst>,
+ qp_zero_a_offset,
+ qp_has_no_left_shift),
+ nullptr,
+ [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, uint8_t, uint8_t> * {
+ auto strat = new a64_u8qa_nhwc_3x3_s1_output2x2_mla_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirst<uint8_t>(strat, args, qp);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "a64_u8qa_nhwc_3x3_s2_output2x2_mla_depthfirst",
+ constraint<Requantize32>(is_supported<a64_u8qa_nhwc_3x3_s2_output2x2_mla_depthfirst>,
+ qp_zero_a_offset,
+ qp_has_no_left_shift),
+ nullptr,
+ [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, uint8_t, uint8_t> * {
+ auto strat = new a64_u8qa_nhwc_3x3_s2_output2x2_mla_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirst<uint8_t>(strat, args, qp);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "a64_u8qa_nhwc_5x5_s1_output2x2_mla_depthfirst",
+ constraint<Requantize32>(is_supported<a64_u8qa_nhwc_5x5_s1_output2x2_mla_depthfirst>,
+ qp_zero_a_offset,
+ qp_has_no_left_shift),
+ nullptr,
+ [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, uint8_t, uint8_t> * {
+ auto strat = new a64_u8qa_nhwc_5x5_s1_output2x2_mla_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirst<uint8_t>(strat, args, qp);
+ },
+ },
+
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst",
+ constraint<Requantize32>(is_supported<a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst>,
+ qp_has_no_left_shift),
+ nullptr,
+ [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, uint8_t, uint8_t> * {
+ auto strat = new a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirst<uint8_t>(strat, args, qp);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst",
+ constraint<Requantize32>(is_supported<a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst>,
+ qp_has_no_left_shift),
+ nullptr,
+ [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, uint8_t, uint8_t> * {
+ auto strat = new a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirst<uint8_t>(strat, args, qp);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "a64_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst",
+ constraint<Requantize32>(is_supported<a64_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst>,
+ qp_has_no_left_shift),
+ nullptr,
+ [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, uint8_t, uint8_t> * {
+ auto strat = new a64_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirst<uint8_t>(strat, args, qp);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "a64_u8q_nhwc_generic_output3x3_mla_depthfirst",
+ nullptr,
+ nullptr,
+ [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, uint8_t, uint8_t> * {
+ auto kernel = new a64_u8q_nhwc_generic_output9_mla_depthfirst(args.cpu_info);
+ auto strat = new GenericDepthfirstStrategy<uint8_t>(kernel, 3, 3, args);
+ return new DepthwiseDepthfirstGeneric<uint8_t>(strat, args, qp);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "a64_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst",
+ constraint<Requantize32>(is_supported<a64_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst>,
+ cpu_has_dot_product,
+ has_channel_multiplier,
+ qp_has_no_left_shift),
+ not_preferred,
+ [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, uint8_t, uint8_t> * {
+ auto strat = new a64_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirstMultiplier<uint8_t, uint8_t, uint8_t, int32_t, false>(strat, args, qp);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "a64_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst",
+ constraint<Requantize32>(is_supported<a64_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst>,
+ cpu_has_dot_product,
+ has_channel_multiplier,
+ qp_has_no_left_shift),
+ not_preferred,
+ [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, uint8_t, uint8_t> * {
+ auto strat = new a64_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirstMultiplier<uint8_t, uint8_t, uint8_t, int32_t, false>(strat, args, qp);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "a64_u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst",
+ constraint<Requantize32>(has_channel_multiplier),
+ not_preferred,
+ [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, uint8_t, uint8_t> * {
+ auto kern = new a64_u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst(args.cpu_info);
+ auto strat = new GenericDepthfirstMultiplierStrategy<uint8_t>(kern, args);
+ return new DepthwiseDepthfirstMultiplier<uint8_t, uint8_t, uint8_t, int32_t, true>(strat, args, qp);
+ },
+ },
+
+#endif // defined(__aarch64__)
+ { DepthwiseMethod::DEFAULT, "", nullptr, nullptr, nullptr }, // End of list
+};
+
+template <>
+const DepthwiseImplementation<uint8_t, uint8_t, uint8_t, Requantize32> *depthwise_implementation_list()
+{
+ return depthwise_u8q_methods;
+}
+
+template UniqueDepthwiseCommon<uint8_t, uint8_t, uint8_t> depthwise(const DepthwiseArgs &, const Requantize32 &);
+template std::vector<KernelDescription> get_compatible_kernels<uint8_t, uint8_t, uint8_t, Requantize32>(const DepthwiseArgs &, const Requantize32 &);
+
+} // namespace depthwise
+} // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_u8s8u8q.cpp b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_u8s8u8q.cpp
new file mode 100644
index 0000000000..a888958b76
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_u8s8u8q.cpp
@@ -0,0 +1,234 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_gemm_local.hpp"
+
+#include "depthwise_implementation.hpp"
+#include "depthwise_depthfirst.hpp"
+#include "depthwise_depthfirst_generic.hpp"
+#include "depthwise_depthfirst_multiplier.hpp"
+#include "depthwise_planar.hpp"
+
+#include "depthwise_implementation_constraints.hpp"
+
+#if defined(__aarch64__)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SME2)
+#include "kernels/sme2_u8s8u8q_planar_3x3_s1_4rows_dot_za.hpp"
+#include "kernels/sme2_u8s8u8q_planar_3x3_s2_4rows_dot_za.hpp"
+#include "kernels/sme2_u8s8u8q_planar_5x5_s1_4rows_dot_za.hpp"
+#include "kernels/sme2_u8s8u8q_planar_5x5_s2_4rows_dot_za.hpp"
+#endif // defined(ARM_COMPUTE_ENABLE_SME2)
+#include "kernels/sve_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp"
+#include "kernels/sve_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp"
+#include "kernels/sve_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp"
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
+#include "kernels/a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp"
+#include "kernels/a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp"
+#include "kernels/a64_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp"
+#include "kernels/a64_u8s8u8q_nhwc_generic_output9_mla_depthfirst.hpp"
+#include "kernels/a64_u8s8u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp"
+#endif // defined(__aarch64__)
+
+#include <cstdint>
+
+using arm_gemm::Requantize32;
+
+namespace arm_conv {
+namespace depthwise {
+
+namespace
+{
+#if defined(__aarch64__)
+uint64_t not_preferred(const DepthwiseArgs &, const Requantize32 &)
+{
+ return std::numeric_limits<uint64_t>::max();
+}
+#endif // defined(__aarch64__)
+}
+
+static const DepthwiseImplementation<uint8_t, int8_t, uint8_t, Requantize32> depthwise_u8q_methods[] = {
+#if defined(__aarch64__)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SME2)
+ {
+ DepthwiseMethod::PLANAR,
+ "sme2_u8s8u8q_planar_3x3_s1_4rows_dot_za",
+ constraint<Requantize32>(cpu_has_sme, cpu_has_sme2,
+ is_supported<sme2_u8s8u8q_planar_3x3_s1_4rows_dot_za>,
+ has_no_channel_multiplier,
+ qp_has_no_left_shift),
+ nullptr,
+ [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, int8_t, uint8_t> * {
+ auto strat = new sme2_u8s8u8q_planar_3x3_s1_4rows_dot_za(args.cpu_info);
+ return new DepthwisePlanar<uint8_t, int8_t>(strat, args, qp);
+ },
+ },
+ {
+ DepthwiseMethod::PLANAR,
+ "sme2_u8s8u8q_planar_3x3_s2_4rows_dot_za",
+ constraint<Requantize32>(cpu_has_sme, cpu_has_sme2,
+ is_supported<sme2_u8s8u8q_planar_3x3_s2_4rows_dot_za>,
+ has_no_channel_multiplier,
+ qp_has_no_left_shift),
+ nullptr,
+ [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, int8_t, uint8_t> * {
+ auto strat = new sme2_u8s8u8q_planar_3x3_s2_4rows_dot_za(args.cpu_info);
+ return new DepthwisePlanar<uint8_t, int8_t>(strat, args, qp);
+ },
+ },
+ {
+ DepthwiseMethod::PLANAR,
+ "sme2_u8s8u8q_planar_5x5_s1_4rows_dot_za",
+ constraint<Requantize32>(cpu_has_sme, cpu_has_sme2,
+ is_supported<sme2_u8s8u8q_planar_5x5_s1_4rows_dot_za>,
+ has_no_channel_multiplier,
+ qp_has_no_left_shift),
+ nullptr,
+ [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, int8_t, uint8_t> * {
+ auto strat = new sme2_u8s8u8q_planar_5x5_s1_4rows_dot_za(args.cpu_info);
+ return new DepthwisePlanar<uint8_t, int8_t>(strat, args, qp);
+ },
+ },
+ {
+ DepthwiseMethod::PLANAR,
+ "sme2_u8s8u8q_planar_5x5_s2_4rows_dot_za",
+ constraint<Requantize32>(cpu_has_sme, cpu_has_sme2,
+ is_supported<sme2_u8s8u8q_planar_5x5_s2_4rows_dot_za>,
+ has_no_channel_multiplier,
+ qp_has_no_left_shift),
+ nullptr,
+ [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, int8_t, uint8_t> * {
+ auto strat = new sme2_u8s8u8q_planar_5x5_s2_4rows_dot_za(args.cpu_info);
+ return new DepthwisePlanar<uint8_t, int8_t>(strat, args, qp);
+ },
+ },
+#endif // defined(ARM_COMPUTE_ENABLE_SME2)
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "sve_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst",
+ constraint<Requantize32>(is_supported<sve_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst>,
+ qp_has_no_left_shift,
+ cpu_has_sve2),
+ nullptr,
+ [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, int8_t, uint8_t> * {
+ auto strat = new sve_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirst<uint8_t, int8_t>(strat, args, qp);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "sve_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst",
+ constraint<Requantize32>(is_supported<sve_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst>,
+ qp_has_no_left_shift,
+ cpu_has_sve2),
+ nullptr,
+ [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, int8_t, uint8_t> * {
+ auto strat = new sve_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirst<uint8_t, int8_t>(strat, args, qp);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "sve_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst",
+ constraint<Requantize32>(is_supported<sve_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst>,
+ qp_has_no_left_shift,
+ cpu_has_sve2),
+ nullptr,
+ [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, int8_t, uint8_t> * {
+ auto strat = new sve_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirst<uint8_t, int8_t>(strat, args, qp);
+ },
+ },
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst",
+ constraint<Requantize32>(is_supported<a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst>,
+ qp_has_no_left_shift),
+ nullptr,
+ [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, int8_t, uint8_t> * {
+ auto strat = new a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirst<uint8_t, int8_t>(strat, args, qp);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst",
+ constraint<Requantize32>(is_supported<a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst>,
+ qp_has_no_left_shift),
+ nullptr,
+ [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, int8_t, uint8_t> * {
+ auto strat = new a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirst<uint8_t, int8_t>(strat, args, qp);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "a64_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst",
+ constraint<Requantize32>(is_supported<a64_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst>,
+ qp_has_no_left_shift),
+ nullptr,
+ [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, int8_t, uint8_t> * {
+ auto strat = new a64_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirst<uint8_t, int8_t>(strat, args, qp);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "a64_u8s8u8q_nhwc_generic_output3x3_mla_depthfirst",
+ nullptr,
+ nullptr,
+ [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, int8_t, uint8_t> * {
+ auto kernel = new a64_u8s8u8q_nhwc_generic_output9_mla_depthfirst(args.cpu_info);
+ auto strat = new GenericDepthfirstStrategy<uint8_t, int8_t>(kernel, 3, 3, args);
+ return new DepthwiseDepthfirstGeneric<uint8_t, int8_t>(strat, args, qp);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "a64_u8s8u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst",
+ constraint<Requantize32>(has_channel_multiplier),
+ not_preferred,
+ [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, int8_t, uint8_t> * {
+ auto kern = new a64_u8s8u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst(args.cpu_info);
+ auto strat = new GenericDepthfirstMultiplierStrategy<uint8_t, int8_t>(kern, args);
+ return new DepthwiseDepthfirstMultiplier<uint8_t, int8_t, uint8_t, int32_t, true>(strat, args, qp);
+ },
+ },
+#endif // defined(__aarch64__)
+ { DepthwiseMethod::DEFAULT, "", nullptr, nullptr, nullptr }, // End of list
+};
+
+template <>
+const DepthwiseImplementation<uint8_t, int8_t, uint8_t, Requantize32> *depthwise_implementation_list()
+{
+ return depthwise_u8q_methods;
+}
+
+template UniqueDepthwiseCommon<uint8_t, int8_t, uint8_t> depthwise(const DepthwiseArgs &, const Requantize32 &);
+template std::vector<KernelDescription> get_compatible_kernels<uint8_t, int8_t, uint8_t, Requantize32>(const DepthwiseArgs &, const Requantize32 &);
+
+} // namespace depthwise
+} // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/interleaves/a64_s8q_3x3_dot.cpp b/src/core/NEON/kernels/arm_conv/depthwise/interleaves/a64_s8q_3x3_dot.cpp
new file mode 100644
index 0000000000..3de4bdc1fb
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/interleaves/a64_s8q_3x3_dot.cpp
@@ -0,0 +1,244 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if defined(__aarch64__)
+
+#include "arm_gemm.hpp"
+#include "utils.hpp"
+#include "depthwise.hpp"
+#include <cstdint>
+
+namespace arm_conv {
+namespace depthwise {
+
+// Parameter-packing routines for the AArch64 s8q 3x3 dot-product depthwise
+// kernel: compute the packed-buffer size and interleave bias, weights and
+// requantisation parameters into it.
+struct interleave_a64_s8q_3x3_dot
+{
+ // Bytes of packed storage required for the given depthwise problem.
+ static size_t get_packed_size(const DepthwiseArgs &);
+ // Pack bias (may be null), weights and requantisation parameters from `qp`
+ // into `outptr`; stride arguments of zero select dense defaults.
+ static void pack_parameters(unsigned int n_channels, void *outptr, const int32_t *bias, const int8_t *weights, const arm_gemm::Requantize32 &qp, size_t ld_weight_col, size_t ld_weight_row);
+};
+
+size_t interleave_a64_s8q_3x3_dot::get_packed_size(const DepthwiseArgs &args)
+{
+ // Seven vectors of storage are reserved for every <vector_of_ints>-sized
+ // block of output channels; the block count is rounded up to a multiple
+ // of four.
+ const auto n_output_channels = (long unsigned int) args.input_channels * args.channel_multiplier;
+ const auto vl_accum = get_vector_length<int32_t>(arm_gemm::VLType::None);
+ const unsigned int n_blocks = arm_gemm::roundup(arm_gemm::iceildiv(n_output_channels, vl_accum), 4lu);
+ return n_blocks * 7 * get_vector_length<int8_t>(arm_gemm::VLType::None);
+}
+
+// Pack the bias, 3x3 weights and requantisation parameters for the AArch64
+// s8q dot-product depthwise kernel.
+//
+// Stride arguments equal to zero are resolved to dense defaults:
+// ld_weight_col := n_channels and ld_weight_row := 3 * ld_weight_col
+// (see the leading CMP/CSEL pairs).
+//
+// For each block of four channels the main loop stores:
+//  * one vector of corrected accumulators,
+//      bias - input_offset * sum(weights) + 9 * input_offset * weights_offset
+//    (the bias is treated as zero when the pointer is null);
+//  * three vectors of weights, one per kernel row, zipped so that each
+//    row's three s8 values are padded with a zero byte to suit SDOT;
+//  * the per-channel requantise multiplier and right-shift vectors, or
+//    broadcasts of qp.per_layer_mul / qp.per_layer_right_shift when
+//    qp.per_channel_muls is null.
+// The "Oddments" path handles a trailing block of 1-3 channels.
+void interleave_a64_s8q_3x3_dot::pack_parameters(unsigned int n_channels, void *outptr, const int32_t *bias, const int8_t *weights, const arm_gemm::Requantize32 &qp, size_t ld_weight_col, size_t ld_weight_row)
+{
+ __asm__ __volatile__(
+ "cmp %x[ld_weight_col], XZR\n"
+ "csel %x[ld_weight_col], %x[ld_weight_col], %x[n_channels], NE\n"
+ "movi v16.4s, #0x9\n"
+ "movi v31.16b, #0x0\n"
+ "mov x21, #0x3\n"
+ "mul x21, %x[ld_weight_col], x21\n"
+ "add x20, %x[qp], %[offsetof_input_offset]\n"
+ "ld1r { v30.4s }, [x20]\n"
+ "add x20, %x[qp], %[offsetof_weights_offset]\n"
+ "ld1r { v29.4s }, [x20]\n"
+ "cmp %x[ld_weight_row], XZR\n"
+ "mul v29.4s, v29.4s, v30.4s\n"
+ "csel %x[ld_weight_row], %x[ld_weight_row], x21, NE\n"
+ "lsr x21, %x[n_channels], #0x2\n"
+ "movi v28.16b, #0x1\n"
+ "mul v29.4s, v29.4s, v16.4s\n"
+ "add x25, %x[weights], %x[ld_weight_row]\n"
+ "add x20, %x[qp], %[offsetof_per_layer_mul]\n"
+ "ld1r { v27.4s }, [x20]\n"
+ "add x20, %x[qp], %[offsetof_per_layer_right_shift]\n"
+ "ld1r { v26.4s }, [x20]\n"
+ "add x24, x25, %x[ld_weight_row]\n"
+ "add x23, %x[ld_weight_col], %x[ld_weight_col]\n"
+ "mov x22, #0x0\n"
+ "cbz x21, 4f\n"
+ "1:" // Loop
+ "movi v25.4s, #0x0\n"
+ "cbz %x[bias], 2f\n"
+ "ldr q25, [%x[bias], x22]\n"
+ "2:" // Loop: Skip bias load
+ "ldr s19, [%x[weights], #0x0]\n"
+ "ldr s16, [%x[weights], %x[ld_weight_col]]\n"
+ "zip1 v17.16b, v16.16b, v31.16b\n"
+ "movi v21.4s, #0x0\n"
+ "ldr s16, [%x[weights], x23]\n"
+ "ldr s18, [x25, #0x0]\n"
+ "zip1 v16.16b, v19.16b, v16.16b\n"
+ "zip1 v20.16b, v16.16b, v17.16b\n"
+ "ldr s17, [x25, %x[ld_weight_col]]\n"
+ "ldr s16, [x25, x23]\n"
+ "zip1 v18.16b, v18.16b, v16.16b\n"
+ "zip1 v16.16b, v17.16b, v31.16b\n"
+ "ldr s17, [x24, #0x0]\n"
+ "ldr s19, [x24, %x[ld_weight_col]]\n"
+ ".inst 0x4e949795 // sdot v21.4s, v28.16b, v20.16b\n"
+ "zip1 v18.16b, v18.16b, v16.16b\n"
+ "ldr s16, [x24, x23]\n"
+ "zip1 v17.16b, v17.16b, v16.16b\n"
+ "zip1 v16.16b, v19.16b, v31.16b\n"
+ ".inst 0x4e929795 // sdot v21.4s, v28.16b, v18.16b\n"
+ "zip1 v16.16b, v17.16b, v16.16b\n"
+ ".inst 0x4e909795 // sdot v21.4s, v28.16b, v16.16b\n"
+ "add %x[weights], %x[weights], #0x4\n"
+ "add x25, x25, #0x4\n"
+ "mls v25.4s, v21.4s, v30.4s\n"
+ "add x24, x24, #0x4\n"
+ "add v25.4s, v25.4s, v29.4s\n"
+ "str q25, [%x[outptr], #0x0]\n"
+ "str q20, [%x[outptr], #0x10]\n"
+ "str q18, [%x[outptr], #0x20]\n"
+ "str q16, [%x[outptr], #0x30]\n"
+ "add %x[outptr], %x[outptr], #0x40\n"
+ "cbz %x[rq_mul_perchannel], 3f\n"
+ "ldr q27, [%x[rq_mul_perchannel], x22]\n"
+ "ldr q26, [%x[rq_shift_perchannel], x22]\n"
+ "3:" // Loop: Quantisation parameters: Store
+ "subs x21, x21, #0x1\n"
+ "str q27, [%x[outptr], #0x0]\n"
+ "add x22, x22, #0x10\n"
+ "str q26, [%x[outptr], #0x10]\n"
+ "add %x[outptr], %x[outptr], #0x20\n"
+ "bgt 1b\n"
+ "tst %x[n_channels], #0x3\n"
+ "beq 13f\n"
+ "4:" // Oddments
+ "movi v25.4s, #0x0\n"
+ "cbz %x[bias], 7f\n"
+ "add %x[bias], %x[bias], x22\n"
+ "tbz %x[n_channels], #1, 5f\n"
+ "ld1 { v25.d }[0], [%x[bias]], #0x8\n"
+ "tbz %x[n_channels], #0, 6f\n"
+ "ld1 { v25.s }[2], [%x[bias]], #0x4\n"
+ "b 6f\n"
+ "5:" // Oddments: Load bias: Bit 1: Unset
+ "ld1 { v25.s }[0], [%x[bias]], #0x4\n"
+ "6:" // Oddments: Load bias: Bit 1: End
+ "7:" // Oddments: Skip bias load
+ "tbz %x[n_channels], #1, 8f\n"
+ "ld1 { v17.h }[0], [%x[weights]]\n"
+ "ld1 { v24.h }[0], [x25]\n"
+ "add x21, %x[weights], %x[ld_weight_col]\n"
+ "add x20, %x[weights], x23\n"
+ "ld1 { v20.h }[0], [x21]\n"
+ "ld1 { v16.h }[0], [x20]\n"
+ "add x21, x25, %x[ld_weight_col]\n"
+ "add x20, x25, x23\n"
+ "ld1 { v19.h }[0], [x21]\n"
+ "ld1 { v18.h }[0], [x20]\n"
+ "add x21, x24, %x[ld_weight_col]\n"
+ "add x20, x24, x23\n"
+ "ld1 { v23.h }[0], [x24]\n"
+ "ld1 { v22.h }[0], [x21]\n"
+ "add %x[weights], %x[weights], #0x2\n"
+ "add x25, x25, #0x2\n"
+ "ld1 { v21.h }[0], [x20]\n"
+ "add x24, x24, #0x2\n"
+ "tbz %x[n_channels], #0, 9f\n"
+ "ld1 { v17.b }[2], [%x[weights]]\n"
+ "ld1 { v24.b }[2], [x25]\n"
+ "add x21, %x[weights], %x[ld_weight_col]\n"
+ "add x20, %x[weights], x23\n"
+ "ld1 { v20.b }[2], [x21]\n"
+ "ld1 { v16.b }[2], [x20]\n"
+ "add x21, x25, %x[ld_weight_col]\n"
+ "add x20, x25, x23\n"
+ "ld1 { v19.b }[2], [x21]\n"
+ "ld1 { v18.b }[2], [x20]\n"
+ "add x21, x24, %x[ld_weight_col]\n"
+ "add x20, x24, x23\n"
+ "ld1 { v23.b }[2], [x24]\n"
+ "ld1 { v22.b }[2], [x21]\n"
+ "add %x[weights], %x[weights], #0x1\n"
+ "ld1 { v21.b }[2], [x20]\n"
+ "b 9f\n"
+ "8:" // Oddments: Load weights: Bit 1: Unset
+ "ld1 { v17.b }[0], [%x[weights]]\n"
+ "ld1 { v24.b }[0], [x25]\n"
+ "add x21, %x[weights], %x[ld_weight_col]\n"
+ "add x20, %x[weights], x23\n"
+ "ld1 { v20.b }[0], [x21]\n"
+ "ld1 { v16.b }[0], [x20]\n"
+ "add x21, x25, %x[ld_weight_col]\n"
+ "add x20, x25, x23\n"
+ "ld1 { v19.b }[0], [x21]\n"
+ "ld1 { v18.b }[0], [x20]\n"
+ "add x21, x24, %x[ld_weight_col]\n"
+ "add x20, x24, x23\n"
+ "ld1 { v23.b }[0], [x24]\n"
+ "ld1 { v22.b }[0], [x21]\n"
+ "add %x[weights], %x[weights], #0x1\n"
+ "ld1 { v21.b }[0], [x20]\n"
+ "9:" // Oddments: Load weights: Bit 1: End
+ "zip1 v17.16b, v17.16b, v16.16b\n"
+ "zip1 v16.16b, v20.16b, v31.16b\n"
+ "zip1 v20.16b, v17.16b, v16.16b\n"
+ "zip1 v17.16b, v24.16b, v18.16b\n"
+ "zip1 v16.16b, v19.16b, v31.16b\n"
+ "movi v19.4s, #0x0\n"
+ ".inst 0x4e949793 // sdot v19.4s, v28.16b, v20.16b\n"
+ "zip1 v18.16b, v17.16b, v16.16b\n"
+ "zip1 v17.16b, v23.16b, v21.16b\n"
+ ".inst 0x4e929793 // sdot v19.4s, v28.16b, v18.16b\n"
+ "zip1 v16.16b, v22.16b, v31.16b\n"
+ "zip1 v16.16b, v17.16b, v16.16b\n"
+ ".inst 0x4e909793 // sdot v19.4s, v28.16b, v16.16b\n"
+ "mls v25.4s, v19.4s, v30.4s\n"
+ "add v25.4s, v25.4s, v29.4s\n"
+ "str q25, [%x[outptr], #0x0]\n"
+ "str q20, [%x[outptr], #0x10]\n"
+ "str q18, [%x[outptr], #0x20]\n"
+ "str q16, [%x[outptr], #0x30]\n"
+ "add %x[outptr], %x[outptr], #0x40\n"
+ "cbz %x[rq_mul_perchannel], 12f\n"
+ "add x21, %x[rq_mul_perchannel], x22\n"
+ "add x20, %x[rq_shift_perchannel], x22\n"
+ "tbz %x[n_channels], #1, 10f\n"
+ "ld1 { v27.d }[0], [x21], #0x8\n"
+ "ld1 { v26.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 11f\n"
+ "ld1 { v27.s }[2], [x21], #0x4\n"
+ "ld1 { v26.s }[2], [x20], #0x4\n"
+ "b 11f\n"
+ "10:" // Oddments: Quantisation parameters: Load quant params: Bit 1: Unset
+ "ld1 { v27.s }[0], [x21], #0x4\n"
+ "ld1 { v26.s }[0], [x20], #0x4\n"
+ "11:" // Oddments: Quantisation parameters: Load quant params: Bit 1: End
+ "12:" // Oddments: Quantisation parameters: Store
+ "str q27, [%x[outptr], #0x0]\n"
+ "str q26, [%x[outptr], #0x10]\n"
+ "add %x[outptr], %x[outptr], #0x20\n"
+ "13:" // End
+ : [bias] "+&r" (bias), [ld_weight_col] "+&r" (ld_weight_col), [ld_weight_row] "+&r" (ld_weight_row), [outptr] "+&r" (outptr), [weights] "+&r" (weights)
+ : [n_channels] "r" (n_channels), [offsetof_input_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_per_layer_mul] "I" (offsetof(arm_gemm::Requantize32, per_layer_mul)), [offsetof_per_layer_right_shift] "I" (offsetof(arm_gemm::Requantize32, per_layer_right_shift)), [offsetof_weights_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [qp] "r" (&qp), [rq_mul_perchannel] "r" (qp.per_channel_muls), [rq_shift_perchannel] "r" (qp.per_channel_right_shifts)
+ : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22", "x23", "x24", "x25"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/interleaves/a64_u8q_3x3_dot.cpp b/src/core/NEON/kernels/arm_conv/depthwise/interleaves/a64_u8q_3x3_dot.cpp
new file mode 100644
index 0000000000..19264c9fce
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/interleaves/a64_u8q_3x3_dot.cpp
@@ -0,0 +1,244 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if defined(__aarch64__)
+
+#include "arm_gemm.hpp"
+#include "utils.hpp"
+#include "depthwise.hpp"
+#include <cstdint>
+
+namespace arm_conv {
+namespace depthwise {
+
+// Parameter-packing routines for the AArch64 u8q 3x3 dot-product depthwise
+// kernel: compute the packed-buffer size and interleave bias, weights and
+// requantisation parameters into it.
+struct interleave_a64_u8q_3x3_dot
+{
+ // Bytes of packed storage required for the given depthwise problem.
+ static size_t get_packed_size(const DepthwiseArgs &);
+ // Pack bias (may be null), weights and requantisation parameters from `qp`
+ // into `outptr`; stride arguments of zero select dense defaults.
+ static void pack_parameters(unsigned int n_channels, void *outptr, const int32_t *bias, const uint8_t *weights, const arm_gemm::Requantize32 &qp, size_t ld_weight_col, size_t ld_weight_row);
+};
+
+size_t interleave_a64_u8q_3x3_dot::get_packed_size(const DepthwiseArgs &args)
+{
+ // Seven vectors of storage are reserved for every <vector_of_ints>-sized
+ // block of output channels; the block count is rounded up to a multiple
+ // of four.
+ const auto n_output_channels = (long unsigned int) args.input_channels * args.channel_multiplier;
+ const auto vl_accum = get_vector_length<int32_t>(arm_gemm::VLType::None);
+ const unsigned int n_blocks = arm_gemm::roundup(arm_gemm::iceildiv(n_output_channels, vl_accum), 4lu);
+ return n_blocks * 7 * get_vector_length<uint8_t>(arm_gemm::VLType::None);
+}
+
+// Pack the bias, 3x3 weights and requantisation parameters for the AArch64
+// u8q dot-product depthwise kernel.
+//
+// Stride arguments equal to zero are resolved to dense defaults:
+// ld_weight_col := n_channels and ld_weight_row := 3 * ld_weight_col
+// (see the leading CMP/CSEL pairs).
+//
+// For each block of four channels the main loop stores:
+//  * one vector of corrected accumulators,
+//      bias - input_offset * sum(weights) + 9 * input_offset * weights_offset
+//    (the bias is treated as zero when the pointer is null);
+//  * three vectors of weights, one per kernel row, zipped so that each
+//    row's three u8 values are padded with a zero byte to suit UDOT;
+//  * the per-channel requantise multiplier and right-shift vectors, or
+//    broadcasts of qp.per_layer_mul / qp.per_layer_right_shift when
+//    qp.per_channel_muls is null.
+// The "Oddments" path handles a trailing block of 1-3 channels.
+void interleave_a64_u8q_3x3_dot::pack_parameters(unsigned int n_channels, void *outptr, const int32_t *bias, const uint8_t *weights, const arm_gemm::Requantize32 &qp, size_t ld_weight_col, size_t ld_weight_row)
+{
+ __asm__ __volatile__(
+ "cmp %x[ld_weight_col], XZR\n"
+ "csel %x[ld_weight_col], %x[ld_weight_col], %x[n_channels], NE\n"
+ "movi v16.4s, #0x9\n"
+ "movi v31.16b, #0x0\n"
+ "mov x21, #0x3\n"
+ "mul x21, %x[ld_weight_col], x21\n"
+ "add x20, %x[qp], %[offsetof_input_offset]\n"
+ "ld1r { v30.4s }, [x20]\n"
+ "add x20, %x[qp], %[offsetof_weights_offset]\n"
+ "ld1r { v29.4s }, [x20]\n"
+ "cmp %x[ld_weight_row], XZR\n"
+ "mul v29.4s, v29.4s, v30.4s\n"
+ "csel %x[ld_weight_row], %x[ld_weight_row], x21, NE\n"
+ "lsr x21, %x[n_channels], #0x2\n"
+ "movi v28.16b, #0x1\n"
+ "mul v29.4s, v29.4s, v16.4s\n"
+ "add x25, %x[weights], %x[ld_weight_row]\n"
+ "add x20, %x[qp], %[offsetof_per_layer_mul]\n"
+ "ld1r { v27.4s }, [x20]\n"
+ "add x20, %x[qp], %[offsetof_per_layer_right_shift]\n"
+ "ld1r { v26.4s }, [x20]\n"
+ "add x24, x25, %x[ld_weight_row]\n"
+ "add x23, %x[ld_weight_col], %x[ld_weight_col]\n"
+ "mov x22, #0x0\n"
+ "cbz x21, 4f\n"
+ "1:" // Loop
+ "movi v25.4s, #0x0\n"
+ "cbz %x[bias], 2f\n"
+ "ldr q25, [%x[bias], x22]\n"
+ "2:" // Loop: Skip bias load
+ "ldr s19, [%x[weights], #0x0]\n"
+ "ldr s16, [%x[weights], %x[ld_weight_col]]\n"
+ "zip1 v17.16b, v16.16b, v31.16b\n"
+ "movi v21.4s, #0x0\n"
+ "ldr s16, [%x[weights], x23]\n"
+ "ldr s18, [x25, #0x0]\n"
+ "zip1 v16.16b, v19.16b, v16.16b\n"
+ "zip1 v20.16b, v16.16b, v17.16b\n"
+ "ldr s17, [x25, %x[ld_weight_col]]\n"
+ "ldr s16, [x25, x23]\n"
+ "zip1 v18.16b, v18.16b, v16.16b\n"
+ "zip1 v16.16b, v17.16b, v31.16b\n"
+ "ldr s17, [x24, #0x0]\n"
+ "ldr s19, [x24, %x[ld_weight_col]]\n"
+ ".inst 0x6e949795 // udot v21.4s, v28.16b, v20.16b\n"
+ "zip1 v18.16b, v18.16b, v16.16b\n"
+ "ldr s16, [x24, x23]\n"
+ "zip1 v17.16b, v17.16b, v16.16b\n"
+ "zip1 v16.16b, v19.16b, v31.16b\n"
+ ".inst 0x6e929795 // udot v21.4s, v28.16b, v18.16b\n"
+ "zip1 v16.16b, v17.16b, v16.16b\n"
+ ".inst 0x6e909795 // udot v21.4s, v28.16b, v16.16b\n"
+ "add %x[weights], %x[weights], #0x4\n"
+ "add x25, x25, #0x4\n"
+ "mls v25.4s, v21.4s, v30.4s\n"
+ "add x24, x24, #0x4\n"
+ "add v25.4s, v25.4s, v29.4s\n"
+ "str q25, [%x[outptr], #0x0]\n"
+ "str q20, [%x[outptr], #0x10]\n"
+ "str q18, [%x[outptr], #0x20]\n"
+ "str q16, [%x[outptr], #0x30]\n"
+ "add %x[outptr], %x[outptr], #0x40\n"
+ "cbz %x[rq_mul_perchannel], 3f\n"
+ "ldr q27, [%x[rq_mul_perchannel], x22]\n"
+ "ldr q26, [%x[rq_shift_perchannel], x22]\n"
+ "3:" // Loop: Quantisation parameters: Store
+ "subs x21, x21, #0x1\n"
+ "str q27, [%x[outptr], #0x0]\n"
+ "add x22, x22, #0x10\n"
+ "str q26, [%x[outptr], #0x10]\n"
+ "add %x[outptr], %x[outptr], #0x20\n"
+ "bgt 1b\n"
+ "tst %x[n_channels], #0x3\n"
+ "beq 13f\n"
+ "4:" // Oddments
+ "movi v25.4s, #0x0\n"
+ "cbz %x[bias], 7f\n"
+ "add %x[bias], %x[bias], x22\n"
+ "tbz %x[n_channels], #1, 5f\n"
+ "ld1 { v25.d }[0], [%x[bias]], #0x8\n"
+ "tbz %x[n_channels], #0, 6f\n"
+ "ld1 { v25.s }[2], [%x[bias]], #0x4\n"
+ "b 6f\n"
+ "5:" // Oddments: Load bias: Bit 1: Unset
+ "ld1 { v25.s }[0], [%x[bias]], #0x4\n"
+ "6:" // Oddments: Load bias: Bit 1: End
+ "7:" // Oddments: Skip bias load
+ "tbz %x[n_channels], #1, 8f\n"
+ "ld1 { v17.h }[0], [%x[weights]]\n"
+ "ld1 { v24.h }[0], [x25]\n"
+ "add x21, %x[weights], %x[ld_weight_col]\n"
+ "add x20, %x[weights], x23\n"
+ "ld1 { v20.h }[0], [x21]\n"
+ "ld1 { v16.h }[0], [x20]\n"
+ "add x21, x25, %x[ld_weight_col]\n"
+ "add x20, x25, x23\n"
+ "ld1 { v19.h }[0], [x21]\n"
+ "ld1 { v18.h }[0], [x20]\n"
+ "add x21, x24, %x[ld_weight_col]\n"
+ "add x20, x24, x23\n"
+ "ld1 { v23.h }[0], [x24]\n"
+ "ld1 { v22.h }[0], [x21]\n"
+ "add %x[weights], %x[weights], #0x2\n"
+ "add x25, x25, #0x2\n"
+ "ld1 { v21.h }[0], [x20]\n"
+ "add x24, x24, #0x2\n"
+ "tbz %x[n_channels], #0, 9f\n"
+ "ld1 { v17.b }[2], [%x[weights]]\n"
+ "ld1 { v24.b }[2], [x25]\n"
+ "add x21, %x[weights], %x[ld_weight_col]\n"
+ "add x20, %x[weights], x23\n"
+ "ld1 { v20.b }[2], [x21]\n"
+ "ld1 { v16.b }[2], [x20]\n"
+ "add x21, x25, %x[ld_weight_col]\n"
+ "add x20, x25, x23\n"
+ "ld1 { v19.b }[2], [x21]\n"
+ "ld1 { v18.b }[2], [x20]\n"
+ "add x21, x24, %x[ld_weight_col]\n"
+ "add x20, x24, x23\n"
+ "ld1 { v23.b }[2], [x24]\n"
+ "ld1 { v22.b }[2], [x21]\n"
+ "add %x[weights], %x[weights], #0x1\n"
+ "ld1 { v21.b }[2], [x20]\n"
+ "b 9f\n"
+ "8:" // Oddments: Load weights: Bit 1: Unset
+ "ld1 { v17.b }[0], [%x[weights]]\n"
+ "ld1 { v24.b }[0], [x25]\n"
+ "add x21, %x[weights], %x[ld_weight_col]\n"
+ "add x20, %x[weights], x23\n"
+ "ld1 { v20.b }[0], [x21]\n"
+ "ld1 { v16.b }[0], [x20]\n"
+ "add x21, x25, %x[ld_weight_col]\n"
+ "add x20, x25, x23\n"
+ "ld1 { v19.b }[0], [x21]\n"
+ "ld1 { v18.b }[0], [x20]\n"
+ "add x21, x24, %x[ld_weight_col]\n"
+ "add x20, x24, x23\n"
+ "ld1 { v23.b }[0], [x24]\n"
+ "ld1 { v22.b }[0], [x21]\n"
+ "add %x[weights], %x[weights], #0x1\n"
+ "ld1 { v21.b }[0], [x20]\n"
+ "9:" // Oddments: Load weights: Bit 1: End
+ "zip1 v17.16b, v17.16b, v16.16b\n"
+ "zip1 v16.16b, v20.16b, v31.16b\n"
+ "zip1 v20.16b, v17.16b, v16.16b\n"
+ "zip1 v17.16b, v24.16b, v18.16b\n"
+ "zip1 v16.16b, v19.16b, v31.16b\n"
+ "movi v19.4s, #0x0\n"
+ ".inst 0x6e949793 // udot v19.4s, v28.16b, v20.16b\n"
+ "zip1 v18.16b, v17.16b, v16.16b\n"
+ "zip1 v17.16b, v23.16b, v21.16b\n"
+ ".inst 0x6e929793 // udot v19.4s, v28.16b, v18.16b\n"
+ "zip1 v16.16b, v22.16b, v31.16b\n"
+ "zip1 v16.16b, v17.16b, v16.16b\n"
+ ".inst 0x6e909793 // udot v19.4s, v28.16b, v16.16b\n"
+ "mls v25.4s, v19.4s, v30.4s\n"
+ "add v25.4s, v25.4s, v29.4s\n"
+ "str q25, [%x[outptr], #0x0]\n"
+ "str q20, [%x[outptr], #0x10]\n"
+ "str q18, [%x[outptr], #0x20]\n"
+ "str q16, [%x[outptr], #0x30]\n"
+ "add %x[outptr], %x[outptr], #0x40\n"
+ "cbz %x[rq_mul_perchannel], 12f\n"
+ "add x21, %x[rq_mul_perchannel], x22\n"
+ "add x20, %x[rq_shift_perchannel], x22\n"
+ "tbz %x[n_channels], #1, 10f\n"
+ "ld1 { v27.d }[0], [x21], #0x8\n"
+ "ld1 { v26.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 11f\n"
+ "ld1 { v27.s }[2], [x21], #0x4\n"
+ "ld1 { v26.s }[2], [x20], #0x4\n"
+ "b 11f\n"
+ "10:" // Oddments: Quantisation parameters: Load quant params: Bit 1: Unset
+ "ld1 { v27.s }[0], [x21], #0x4\n"
+ "ld1 { v26.s }[0], [x20], #0x4\n"
+ "11:" // Oddments: Quantisation parameters: Load quant params: Bit 1: End
+ "12:" // Oddments: Quantisation parameters: Store
+ "str q27, [%x[outptr], #0x0]\n"
+ "str q26, [%x[outptr], #0x10]\n"
+ "add %x[outptr], %x[outptr], #0x20\n"
+ "13:" // End
+ : [bias] "+&r" (bias), [ld_weight_col] "+&r" (ld_weight_col), [ld_weight_row] "+&r" (ld_weight_row), [outptr] "+&r" (outptr), [weights] "+&r" (weights)
+ : [n_channels] "r" (n_channels), [offsetof_input_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_per_layer_mul] "I" (offsetof(arm_gemm::Requantize32, per_layer_mul)), [offsetof_per_layer_right_shift] "I" (offsetof(arm_gemm::Requantize32, per_layer_right_shift)), [offsetof_weights_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [qp] "r" (&qp), [rq_mul_perchannel] "r" (qp.per_channel_muls), [rq_shift_perchannel] "r" (qp.per_channel_right_shifts)
+ : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22", "x23", "x24", "x25"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/interleaves/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/interleaves/generic.cpp
new file mode 100644
index 0000000000..dc505a013d
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/interleaves/generic.cpp
@@ -0,0 +1,152 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "generic.hpp"
+
+#include <algorithm>
+#include <cstdint>
+#include <cstring>
+#include <functional>
+#include <utility>
+
+namespace arm_conv {
+namespace depthwise {
+namespace interleaves {
+
+// Capture the geometry and element sizes needed by the generic parameter
+// packing routines (get_storage_size_generic / pack_parameters_generic).
+//
+// @param kernel_rows, kernel_cols    Kernel spatial extent.
+// @param weight_element_size         Bytes per packed weight element.
+// @param include_bias                Whether a bias vector is packed per channel group.
+// @param bias_element_size           Bytes per bias element.
+// @param premultiply                 When set, disables the per-input-channel repeated
+//                                    packing otherwise used for channel multipliers > 1.
+// @param vl_type                     Vector-length type used to size channel groups.
+// @param accumulator_element_size    Bytes per accumulator element.
+// @param accumulator_depth_vl        Accumulator vectors per channel group.
+// @param get_weight_pos              Maps a kernel-point index to (row, col) out-params;
+//                                    returns false once the index is exhausted.
+PackingArguments::PackingArguments(
+  unsigned int kernel_rows, unsigned int kernel_cols, size_t weight_element_size,
+  bool include_bias, size_t bias_element_size, bool premultiply,
+  arm_gemm::VLType vl_type, size_t accumulator_element_size, unsigned int accumulator_depth_vl,
+  std::function<bool(unsigned int, unsigned int &, unsigned int &)> get_weight_pos
+) : kernel_rows(kernel_rows), kernel_cols(kernel_cols), weight_element_size(weight_element_size),
+    include_bias(include_bias), bias_element_size(bias_element_size), premultiply(premultiply),
+    vl_type(vl_type), accumulator_element_size(accumulator_element_size), accumulator_depth_vl(accumulator_depth_vl),
+    get_weight_pos(std::move(get_weight_pos))  // move rather than copy the std::function's state
+{
+}
+
+size_t get_storage_size_generic(const PackingArguments &packing_args, const DepthwiseArgs &args)
+{
+ // With a channel multiplier (and no premultiply), the packing is simply
+ // repeated once per input channel on a multiplier-sized sub-problem.
+ const bool repeat_per_input_channel = args.channel_multiplier > 1 && !packing_args.premultiply;
+ if (repeat_per_input_channel)
+ {
+ DepthwiseArgs sub_args(args);
+ sub_args.input_channels = args.channel_multiplier;
+ sub_args.channel_multiplier = 1;
+ return args.input_channels * get_storage_size_generic(packing_args, sub_args);
+ }
+
+ // Channels are packed in groups of `vl`; each group holds an optional
+ // bias element plus one weight element per kernel point, per channel.
+ const unsigned int vl =
+ packing_args.accumulator_depth_vl *
+ arm_gemm::utils::get_vector_length<uint8_t>(packing_args.vl_type) / packing_args.accumulator_element_size;
+ const unsigned int n_groups = arm_gemm::iceildiv(args.input_channels * args.channel_multiplier, vl);
+ const size_t bias_bytes = packing_args.include_bias ? packing_args.bias_element_size : 0;
+ const size_t group_stride = bias_bytes + packing_args.kernel_points() * packing_args.weight_element_size;
+ return n_groups * group_stride * vl;
+}
+
+// Pack biases and weights into `buffer_raw` in the layout consumed by the
+// generic depthwise kernels: channels are processed in groups of `vl`, and
+// each group stores an (optional) bias vector followed by one weight vector
+// per kernel point, matching get_storage_size_generic().
+//
+// @param buffer_raw   Destination; must be at least get_storage_size_generic() bytes.
+// @param biases_raw   May be null, in which case zero biases are written.
+// @param weights_raw  Source weights indexed as row*ld_weight_row + col*ld_weight_col + channel
+//                     (in elements of weight_element_size).
+// @param ld_weight_col, ld_weight_row
+//                     Element strides between kernel columns/rows; zero selects the
+//                     dense defaults (total channels, and kernel_cols * ld_weight_col).
+void pack_parameters_generic(
+ const PackingArguments &packing_args,
+ const DepthwiseArgs &args,
+ void *buffer_raw,
+ const void *biases_raw,
+ const void *weights_raw,
+ size_t ld_weight_col,
+ size_t ld_weight_row
+)
+{
+ // Cast the pointers to byte sizes
+ auto *buffer = static_cast<uint8_t *>(buffer_raw);
+ auto *biases = static_cast<const uint8_t *>(biases_raw);
+ auto *weights = static_cast<const uint8_t *>(weights_raw);
+
+ // If the channel multiplier is greater than one, then we treat this as a
+ // repeated packing of `channel_multiplier`-sized problems.
+ if (args.channel_multiplier > 1 && !packing_args.premultiply)
+ {
+ // Get a modified copy of the depthwise arguments
+ DepthwiseArgs args_per_input_channel(args);
+ args_per_input_channel.input_channels = args.channel_multiplier;
+ args_per_input_channel.channel_multiplier = 1;
+
+ // Resolve the strides here
+ ld_weight_col = ld_weight_col ? ld_weight_col : args.input_channels * args.channel_multiplier;
+ ld_weight_row = ld_weight_row ? ld_weight_row : ld_weight_col * packing_args.kernel_cols;
+
+ auto per_input_channel_size = get_storage_size_generic(packing_args, args_per_input_channel);
+
+ for (unsigned int c = 0; c < args.input_channels; c++)
+ {
+ pack_parameters_generic(
+ packing_args, args_per_input_channel, buffer, biases, weights, ld_weight_col, ld_weight_row);
+
+ // Update the pointers
+ // (biases may be null; the ternary keeps the pointer untouched then)
+ buffer += per_input_channel_size;
+ biases += (biases == nullptr) ? 0 : packing_args.bias_element_size * args.channel_multiplier;
+ weights += packing_args.weight_element_size * args.channel_multiplier;
+ }
+ return;
+ }
+
+ auto input_channels = args.input_channels * args.channel_multiplier;
+
+ // Finalise the weight strides
+ ld_weight_col = (ld_weight_col == 0) ? input_channels : ld_weight_col;
+ ld_weight_row = (ld_weight_row == 0) ? packing_args.kernel_cols * ld_weight_col : ld_weight_row;
+
+ // Number of channels handled per group: accumulator vectors scaled to bytes
+ // of uint8_t vector length, divided by the accumulator element size.
+ const unsigned int vl =
+ packing_args.accumulator_depth_vl *
+ arm_gemm::utils::get_vector_length<uint8_t>(packing_args.vl_type) / packing_args.accumulator_element_size;
+
+ for (unsigned int n = 0; n < input_channels; n += vl)
+ {
+ // `todo` < vl only for the final, partial channel group.
+ const unsigned int todo = std::min(vl, input_channels - n);
+
+ if (packing_args.include_bias)
+ {
+ if (biases != nullptr)
+ {
+ // NOTE(review): only `todo` bias bytes are copied for a partial group;
+ // the remaining (vl - todo) bytes of the bias vector are left
+ // unwritten -- presumably the kernels never read them; confirm.
+ memcpy(buffer, biases, todo * packing_args.bias_element_size);
+ biases += todo * packing_args.bias_element_size;
+ }
+ else
+ {
+ memset(buffer, 0, vl * packing_args.bias_element_size);
+ }
+
+ buffer += vl * packing_args.bias_element_size;
+ }
+
+ // Copy each of the weights in turn
+ // (iteration order over kernel points is dictated by get_weight_pos)
+ unsigned int kx, ky;
+ for (int kindex = 0; packing_args.get_weight_pos(kindex, kx, ky); kindex++)
+ {
+ const auto src_ptr = weights + (kx*ld_weight_row + ky*ld_weight_col + n) * packing_args.weight_element_size;
+ memcpy(buffer, src_ptr, todo * packing_args.weight_element_size);
+ buffer += vl * packing_args.weight_element_size;
+ }
+ }
+}
+
+} // namespace interleaves
+} // namespace depthwise
+} // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/interleaves/generic.hpp b/src/core/NEON/kernels/arm_conv/depthwise/interleaves/generic.hpp
new file mode 100644
index 0000000000..1842f10150
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/interleaves/generic.hpp
@@ -0,0 +1,82 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#include "utils.hpp"
+#include "depthwise.hpp"
+
+#include <functional>
+
+namespace arm_conv {
+namespace depthwise {
+namespace interleaves {
+
+// Value bag describing how depthwise parameters (bias + weights) are packed;
+// consumed by get_storage_size_generic() and pack_parameters_generic().
+struct PackingArguments
+{
+ const unsigned int kernel_rows;  // kernel height
+ const unsigned int kernel_cols;  // kernel width
+ const size_t weight_element_size;  // bytes per packed weight element
+ const bool include_bias;  // pack a bias vector per channel group?
+ const size_t bias_element_size;  // bytes per bias element
+ const bool premultiply;  // disables per-input-channel repeat for channel multipliers
+ arm_gemm::VLType vl_type;  // vector-length type used to size channel groups
+ const size_t accumulator_element_size;  // bytes per accumulator element
+ const unsigned int accumulator_depth_vl;  // accumulator vectors per channel group
+ // Maps a kernel-point index to (row, col) out-params; returns false once
+ // the index is exhausted.
+ std::function<bool(unsigned int, unsigned int &, unsigned int &)> get_weight_pos;
+
+ // Total number of kernel points (rows x cols).
+ unsigned int kernel_points(void) const { return kernel_cols * kernel_rows; }
+
+ PackingArguments(
+ unsigned int kernel_rows,
+ unsigned int kernel_cols,
+ size_t weight_element_size,
+ bool include_bias,
+ size_t bias_element_size,
+ bool premultiply,
+ arm_gemm::VLType vl_type,
+ size_t accumulator_element_size,
+ unsigned int accumulator_depth_vl,
+ std::function<bool(unsigned int, unsigned int &, unsigned int &)> get_weight_pos
+ );
+};
+
+size_t get_storage_size_generic(
+ const PackingArguments &packing_args,
+ const DepthwiseArgs &args
+);
+
+void pack_parameters_generic(
+ const PackingArguments &packing_args,
+ const DepthwiseArgs &args,
+ void *buffer_raw,
+ const void *biases_raw,
+ const void *weights_raw,
+ size_t ld_weight_col,
+ size_t ld_weight_row
+);
+
+} // namespace interleaves
+} // namespace depthwise
+} // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/interleaves/generic_quantized_dot_product.cpp b/src/core/NEON/kernels/arm_conv/depthwise/interleaves/generic_quantized_dot_product.cpp
new file mode 100644
index 0000000000..a6389054d1
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/interleaves/generic_quantized_dot_product.cpp
@@ -0,0 +1,161 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "generic_quantized_dot_product.hpp"
+#include <cstdint>
+
+namespace arm_conv {
+namespace depthwise {
+namespace interleaves {
+namespace quantized {
+
+// Returns the buffer size in bytes required by quantized::pack_parameters()
+// for a depthwise convolution of shape `args` on a target with the given
+// vector-length type and accumulator depth.
+size_t get_storage_size(
+ const DepthwiseArgs &args,
+ const arm_gemm::VLType vl_type,
+ const unsigned int accumulator_depth_vl
+)
+{
+ // We produce VL<int32_t> channels at a time, for each of these blocks of
+ // channels we store a vector of biases, weights (complicated) and
+ // requantize parameters.
+ const unsigned int iter_length = accumulator_depth_vl * arm_gemm::utils::get_vector_length<int32_t>(vl_type);
+ const unsigned int n_iters = args.input_channels * arm_gemm::iceildiv(args.channel_multiplier, iter_length);
+
+ // Compute the cost of storing the weights
+ // Each dot-product instruction consumes four weights, so kernel columns
+ // are padded up to a multiple of four.
+ const unsigned int n_dots_per_kernel_row = arm_gemm::iceildiv(args.kernel_cols, 4u);
+
+ return n_iters * iter_length * (
+ sizeof(int32_t) + // Bias
+ 4 * n_dots_per_kernel_row * args.kernel_rows * sizeof(int8_t) + // Weights
+ 2 * sizeof(int32_t) // Requantisation parameters
+ );
+}
+
+// Interleave biases, weights and requantisation parameters for a quantized
+// dot-product depthwise kernel.
+//
+// For each input channel, the `args.channel_multiplier` output channels are
+// processed in blocks of `iter_length`. Each block is a fixed-size "frame"
+// of `iter_stride` bytes holding: a vector of zero-point-corrected biases,
+// the weights zero-padded and arranged for 4-way dot-product instructions,
+// then per-channel requantisation multiplier and shift vectors.
+//
+// `_buffer` must be at least get_storage_size(args, vl_type,
+// accumulator_depth_vl) bytes. `biases` may be nullptr (zero bias is used).
+// Zero values of `ld_weight_col` / `ld_weight_row` select the default dense
+// strides computed below.
+template <typename T>
+void pack_parameters(
+ void *_buffer, const int32_t *biases,
+ const T *weights, size_t ld_weight_col, size_t ld_weight_row,
+ const DepthwiseArgs &args,
+ const arm_gemm::Requantize32 &qp,
+ const arm_gemm::VLType vl_type,
+ const unsigned int accumulator_depth_vl
+)
+{
+ auto buffer = static_cast<uint8_t *>(_buffer);
+ auto requant_muls = qp.per_channel_muls;
+ auto requant_shifts = qp.per_channel_right_shifts;
+
+ // Number of channels handled per iteration: a whole number of
+ // int32 accumulator vectors.
+ const unsigned int iter_length = accumulator_depth_vl * arm_gemm::utils::get_vector_length<int32_t>(vl_type);
+ const unsigned int n_iters_per_input_channel = arm_gemm::iceildiv(args.channel_multiplier, iter_length);
+ // Each dot product consumes four weights; pad columns up to a multiple of four.
+ const unsigned int n_dots_per_kernel_row = arm_gemm::iceildiv(args.kernel_cols, 4u);
+
+ // Bytes occupied by one frame (must agree with get_storage_size above).
+ const size_t iter_stride = iter_length * (
+ sizeof(int32_t) + // Bias
+ 4 * n_dots_per_kernel_row * args.kernel_rows * sizeof(T) + // Weights
+ 2 * sizeof(int32_t) // Requantisation parameters
+ );
+
+ // Zero strides select the dense default layout.
+ ld_weight_col = (ld_weight_col == 0) ? args.input_channels * args.channel_multiplier : ld_weight_col;
+ ld_weight_row = (ld_weight_row == 0) ? args.kernel_cols * ld_weight_col : ld_weight_row;
+
+ for (unsigned int input_channel = 0; input_channel < args.input_channels; input_channel++)
+ {
+ auto buffer_input_channel = buffer + input_channel * n_iters_per_input_channel * iter_stride;
+ auto weights_input_channel = weights + input_channel * args.channel_multiplier;
+
+ for (unsigned int iter = 0; iter < n_iters_per_input_channel; iter++)
+ {
+ // Get a pointer to the start of this portion of the buffer; consequently
+ // derive pointers to the bias, weight and requantisation portions of
+ // this frame.
+ auto buffer_base = buffer_input_channel + iter_stride * iter;
+ auto buffer_biases = reinterpret_cast<int32_t *>(buffer_base);
+ auto buffer_weights = buffer_base + sizeof(int32_t) * iter_length;
+ auto buffer_requant_mul = reinterpret_cast<int32_t *>(
+ buffer_weights + args.kernel_rows * n_dots_per_kernel_row * 4 * iter_length);
+ auto buffer_requant_shift = buffer_requant_mul + iter_length;
+ auto weights_base = weights_input_channel + iter * iter_length;
+
+ // Hence work through the data for this iteration, on a
+ // channel-by-channel basis.
+ const auto this_iter_length = std::min<unsigned int>(
+ iter_length, args.channel_multiplier - iter * iter_length
+ );
+ for (unsigned int i = 0; i < this_iter_length; i++)
+ {
+ auto weights_channel = weights_base + i;
+
+ // Read the bias value, we modify this as we read the weights.
+ auto bias_value = biases == nullptr ? 0 : *(biases++);
+ int32_t elements_sum = 0;
+
+ // Read through the kernel; for each row, marshal together as many dot
+ // product terms as are required.
+ for (unsigned int ki = 0; ki < args.kernel_rows; ki++)
+ {
+ auto buffer_row = buffer_weights + i*4 + ki * 4 * n_dots_per_kernel_row * iter_length;
+ auto weights_row = weights_channel + ki * ld_weight_row;
+
+ unsigned int kj = 0;
+ for (; kj < args.kernel_cols; kj++)
+ {
+ // Determine which element to which we're writing
+ const auto dot = kj / 4;
+ const auto elem = kj % 4;
+
+ // Copy the value; include in the sum
+ const auto val = weights_row[kj * ld_weight_col];
+ buffer_row[dot * 4 * iter_length + elem] = val;
+ elements_sum += val;
+ }
+ // Zero-pad any remaining elements of the final dot-product block.
+ for (; kj < 4 * n_dots_per_kernel_row; kj++)
+ {
+ const auto dot = kj / 4;
+ const auto elem = kj % 4;
+ buffer_row[dot * 4 * iter_length + elem] = 0;
+ }
+
+ // NOTE(review): this increment is redundant - buffer_row is
+ // recomputed from ki at the top of each row iteration.
+ buffer_row += 4 * n_dots_per_kernel_row * iter_length;
+ }
+
+ // Write back the bias and offset values
+ // Fold the zero-point correction into the bias:
+ //   bias - a_offset * sum(weights) + kernel_points * a_offset * b_offset.
+ *(buffer_biases++) =
+ bias_value - qp.a_offset * elements_sum +
+ args.kernel_rows * args.kernel_cols * qp.a_offset * qp.b_offset;
+
+ // Write out the requantisation parameters
+ *(buffer_requant_mul++) = qp.per_channel_requant ? *(requant_muls++) : qp.per_layer_mul;
+ *(buffer_requant_shift++) = qp.per_channel_requant ? *(requant_shifts++) : qp.per_layer_right_shift;
+ }
+ }
+ }
+}
+
+// Explicit instantiations for the two supported quantized weight types.
+template void pack_parameters(void *, const int32_t *, const int8_t *, size_t, size_t, const DepthwiseArgs &, const arm_gemm::Requantize32 &, arm_gemm::VLType, unsigned int);
+template void pack_parameters(void *, const int32_t *, const uint8_t *, size_t, size_t, const DepthwiseArgs &, const arm_gemm::Requantize32 &, arm_gemm::VLType, unsigned int);
+
+} // namespace quantized
+} // namespace interleaves
+} // namespace depthwise
+} // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/interleaves/generic_quantized_dot_product.hpp b/src/core/NEON/kernels/arm_conv/depthwise/interleaves/generic_quantized_dot_product.hpp
new file mode 100644
index 0000000000..779d67d3f4
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/interleaves/generic_quantized_dot_product.hpp
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#include "generic.hpp"
+
+namespace arm_conv {
+namespace depthwise {
+namespace interleaves {
+namespace quantized {
+
+// Returns the buffer size in bytes required by pack_parameters() below.
+size_t get_storage_size(
+ const DepthwiseArgs &args,
+ arm_gemm::VLType vl_type,
+ unsigned int accumulator_depth_vl=1
+);
+
+// Interleaves biases (`biases` may be nullptr), weights and requantisation
+// parameters from `qp` into `buffer` for a quantized dot-product depthwise
+// kernel. Zero weight strides select the default dense layout. Instantiated
+// for T = int8_t and T = uint8_t in the corresponding .cpp file.
+template <typename T>
+void pack_parameters(
+ void *buffer, const int32_t *biases,
+ const T *weights, size_t ld_weight_col, size_t ld_weight_row,
+ const DepthwiseArgs &args,
+ const arm_gemm::Requantize32 &qp,
+ arm_gemm::VLType vl_type,
+ unsigned int accumulator_depth_vl
+);
+
+} // namespace quantized
+} // namespace interleaves
+} // namespace depthwise
+} // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp b/src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp
new file mode 100644
index 0000000000..76f38eb335
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) 2021-2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+namespace arm_conv {
+namespace depthwise {
+
+// Interleave strategies for the 3x3 dot-product depthwise kernels. Each
+// struct pairs a parameter packer with its packed-size query; definitions
+// live in the per-architecture .cpp files in this directory.
+// NOTE(review): this header has no includes of its own - it relies on the
+// including translation unit to provide int32_t, DepthwiseArgs and
+// arm_gemm::Requantize32.
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+
+// SVE, unsigned 8-bit quantized.
+struct interleave_sve_u8q_3x3_dot
+{
+ static void pack_parameters(unsigned int, void *, const int32_t *, const uint8_t *, const arm_gemm::Requantize32 &, size_t, size_t);
+ static size_t get_packed_size(const DepthwiseArgs &);
+};
+
+// SVE, signed 8-bit quantized.
+struct interleave_sve_s8q_3x3_dot
+{
+ static void pack_parameters(unsigned int, void *, const int32_t *, const int8_t *, const arm_gemm::Requantize32 &, size_t, size_t);
+ static size_t get_packed_size(const DepthwiseArgs &);
+};
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
+
+// A64 (fixed-width NEON), unsigned 8-bit quantized.
+struct interleave_a64_u8q_3x3_dot
+{
+ static void pack_parameters(unsigned int, void *, const int32_t *, const uint8_t *, const arm_gemm::Requantize32 &, size_t, size_t);
+ static size_t get_packed_size(const DepthwiseArgs &);
+};
+
+// A64 (fixed-width NEON), signed 8-bit quantized.
+struct interleave_a64_s8q_3x3_dot
+{
+ static void pack_parameters(unsigned int, void *, const int32_t *, const int8_t *, const arm_gemm::Requantize32 &, size_t, size_t);
+ static size_t get_packed_size(const DepthwiseArgs &);
+};
+
+} // namespace depthwise
+} // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/interleaves/sve_s8q_3x3_dot.cpp b/src/core/NEON/kernels/arm_conv/depthwise/interleaves/sve_s8q_3x3_dot.cpp
new file mode 100644
index 0000000000..5d7b54f235
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/interleaves/sve_s8q_3x3_dot.cpp
@@ -0,0 +1,135 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+
+#include "arm_gemm.hpp"
+#include "utils.hpp"
+#include "depthwise.hpp"
+#include <cstdint>
+
+namespace arm_conv {
+namespace depthwise {
+
+// Local declaration of the SVE s8 3x3 dot-product interleave strategy; the
+// member signatures must match the declarations in interleaves/list.hpp.
+struct interleave_sve_s8q_3x3_dot
+{
+ static size_t get_packed_size(const DepthwiseArgs &);
+ static void pack_parameters(unsigned int n_channels, void *outptr, const int32_t *bias, const int8_t *weights, const arm_gemm::Requantize32 &qp, size_t ld_weight_col, size_t ld_weight_row);
+};
+
+// Returns the buffer size in bytes needed by pack_parameters() below. The
+// channel count is split into SVE int32-vector-sized blocks and the block
+// count is rounded up to a multiple of four.
+size_t interleave_sve_s8q_3x3_dot::get_packed_size(const DepthwiseArgs &args)
+{
+ // We store 7 vectors for every <vector_of_ints> of channels.
+ // NOTE(review): the packing loop in pack_parameters() stores 6 vectors per
+ // block (bias + 3 weight + 2 requantisation); the 7 here presumably leaves
+ // deliberate headroom - confirm before tightening this bound.
+ const unsigned int n = arm_gemm::roundup(
+ arm_gemm::iceildiv((long unsigned int) args.input_channels * args.channel_multiplier,
+ get_vector_length<int32_t>(arm_gemm::VLType::SVE)), 4lu
+ );
+ return n * 7 * get_vector_length<int8_t>(arm_gemm::VLType::SVE);
+}
+
+// Interleave parameters for the SVE s8 3x3 dot-product depthwise kernel.
+// Per block of VL<int32> channels the loop stores: one vector of biases with
+// the zero-point correction folded in (bias - a_offset*sum(weights) +
+// 9*a_offset*b_offset), three vectors of zipped weights arranged for SDOT,
+// and two vectors of requantisation parameters (multipliers then shifts).
+// `bias` may be nullptr (p8 stays false so zero biases are loaded). A zero
+// `ld_weight_col` defaults to `n_channels`; a zero `ld_weight_row` defaults
+// to 3 * ld_weight_col.
+void interleave_sve_s8q_3x3_dot::pack_parameters(unsigned int n_channels, void *outptr, const int32_t *bias, const int8_t *weights, const arm_gemm::Requantize32 &qp, size_t ld_weight_col, size_t ld_weight_row)
+{
+ __asm__ __volatile__(
+ // Apply default weight strides; set up constants and the per-layer
+ // requantisation parameters (z24 = mul, z23 = shift).
+ "cmp %x[ld_weight_col], XZR\n"
+ "csel %x[ld_weight_col], %x[ld_weight_col], %x[n_channels], NE\n"
+ "mov z16.s, #0x9\n"
+ "mov z28.b, #0x0\n"
+ "mov x20, #0x3\n"
+ "ptrue p2.b\n"
+ "mul x20, %x[ld_weight_col], x20\n"
+ "ld1rw { z27.s }, p2/Z, [%x[qp], %[offsetof_input_offset]]\n"
+ "ld1rw { z26.s }, p2/Z, [%x[qp], %[offsetof_weights_offset]]\n"
+ "cmp %x[ld_weight_row], XZR\n"
+ "csel %x[ld_weight_row], %x[ld_weight_row], x20, NE\n"
+ "mov z25.b, #0x1\n"
+ // z26 = 9 * a_offset * b_offset, added to every corrected bias below.
+ "mul z26.s, p2/M, z26.s, z27.s\n"
+ "add x24, %x[weights], %x[ld_weight_row]\n"
+ "ld1rw { z24.s }, p2/Z, [%x[qp], %[offsetof_per_layer_mul]]\n"
+ "ld1rw { z23.s }, p2/Z, [%x[qp], %[offsetof_per_layer_right_shift]]\n"
+ "add x23, x24, %x[ld_weight_row]\n"
+ "add x22, %x[ld_weight_col], %x[ld_weight_col]\n"
+ "whilelt p1.s, XZR, %x[n_channels]\n"
+ "mov x21, #0x0\n"
+ "mul z26.s, p2/M, z26.s, z16.s\n"
+ // p8 gates the bias load: all-false when bias == nullptr.
+ "pfalse p8.b\n"
+ "cbz %x[bias], 1f\n"
+ "ptrue p8.s\n"
+ "1:" // No bias
+ "2:" // Loop
+ // Load the three kernel rows, zip the nine weights into three vectors
+ // and accumulate their per-channel sum via SDOT with a vector of ones.
+ "cntp x20, p2, p1.s\n"
+ "whilelt p0.b, XZR, x20\n"
+ "ld1b { z18.b }, p0/Z, [%x[weights]]\n"
+ "ld1b { z17.b }, p0/Z, [%x[weights], %x[ld_weight_col]]\n"
+ "ld1b { z16.b }, p0/Z, [%x[weights], x22]\n"
+ "zip1 z20.b, z18.b, z16.b\n"
+ "zip1 z19.b, z17.b, z28.b\n"
+ "ld1b { z18.b }, p0/Z, [x24]\n"
+ "ld1b { z17.b }, p0/Z, [x24, %x[ld_weight_col]]\n"
+ "ld1b { z16.b }, p0/Z, [x24, x22]\n"
+ "zip1 z22.b, z20.b, z19.b\n"
+ "zip1 z21.b, z18.b, z16.b\n"
+ "zip1 z19.b, z17.b, z28.b\n"
+ "mov z20.s, #0x0\n"
+ "ld1b { z18.b }, p0/Z, [x23]\n"
+ "ld1b { z17.b }, p0/Z, [x23, %x[ld_weight_col]]\n"
+ "ld1b { z16.b }, p0/Z, [x23, x22]\n"
+ "sdot z20.s, z25.b, z22.b\n"
+ "zip1 z19.b, z21.b, z19.b\n"
+ "sdot z20.s, z25.b, z19.b\n"
+ "zip1 z18.b, z18.b, z16.b\n"
+ "zip1 z16.b, z17.b, z28.b\n"
+ "and p0.b, p2/Z, p8.b, p1.b\n"
+ "ld1w { z17.s }, p0/Z, [%x[bias], x21, LSL #2]\n"
+ "zip1 z16.b, z18.b, z16.b\n"
+ "sdot z20.s, z25.b, z16.b\n"
+ // Correct the bias: z17 -= sum(weights) * a_offset, then += z26.
+ "mls z17.s, p2/M, z20.s, z27.s\n"
+ "add %x[weights], %x[weights], x20\n"
+ "add x24, x24, x20\n"
+ "add x23, x23, x20\n"
+ "add z17.s, z17.s, z26.s\n"
+ // Store one bias vector and three zipped weight vectors.
+ "st1w { z17.s }, p2, [%x[outptr]]\n"
+ "st1b { z22.b }, p2, [%x[outptr], #1, MUL VL]\n"
+ "st1b { z19.b }, p2, [%x[outptr], #2, MUL VL]\n"
+ "st1b { z16.b }, p2, [%x[outptr], #3, MUL VL]\n"
+ "addvl %x[outptr], %x[outptr], #4\n"
+ // If per-channel requantisation parameters exist, overwrite the
+ // per-layer defaults in z24/z23 before storing them.
+ "cbz %x[rq_mul_perchannel], 3f\n"
+ "ld1w { z24.s }, p1/Z, [%x[rq_mul_perchannel], x21, LSL #2]\n"
+ "ld1w { z23.s }, p1/Z, [%x[rq_shift_perchannel], x21, LSL #2]\n"
+ "3:" // Loop: Quantisation parameters: Store
+ "incw x21\n"
+ "whilelt p1.s, x21, %x[n_channels]\n"
+ "st1w { z24.s }, p2, [%x[outptr]]\n"
+ "st1w { z23.s }, p2, [%x[outptr], #1, MUL VL]\n"
+ "addvl %x[outptr], %x[outptr], #2\n"
+ "b.any 2b\n"
+ : [ld_weight_col] "+&r" (ld_weight_col), [ld_weight_row] "+&r" (ld_weight_row), [outptr] "+&r" (outptr), [weights] "+&r" (weights)
+ : [bias] "r" (bias), [n_channels] "r" (n_channels), [offsetof_input_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_per_layer_mul] "I" (offsetof(arm_gemm::Requantize32, per_layer_mul)), [offsetof_per_layer_right_shift] "I" (offsetof(arm_gemm::Requantize32, per_layer_right_shift)), [offsetof_weights_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [qp] "r" (&qp), [rq_mul_perchannel] "r" (qp.per_channel_muls), [rq_shift_perchannel] "r" (qp.per_channel_right_shifts)
+ : "cc", "memory", "p0", "p1", "p2", "p8", "x20", "x21", "x22", "x23", "x24", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/interleaves/sve_u8q_3x3_dot.cpp b/src/core/NEON/kernels/arm_conv/depthwise/interleaves/sve_u8q_3x3_dot.cpp
new file mode 100644
index 0000000000..c3da81448b
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/interleaves/sve_u8q_3x3_dot.cpp
@@ -0,0 +1,135 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+
+#include "arm_gemm.hpp"
+#include "utils.hpp"
+#include "depthwise.hpp"
+#include <cstdint>
+
+namespace arm_conv {
+namespace depthwise {
+
+// Local declaration of the SVE u8 3x3 dot-product interleave strategy; the
+// member signatures must match the declarations in interleaves/list.hpp.
+struct interleave_sve_u8q_3x3_dot
+{
+ static size_t get_packed_size(const DepthwiseArgs &);
+ static void pack_parameters(unsigned int n_channels, void *outptr, const int32_t *bias, const uint8_t *weights, const arm_gemm::Requantize32 &qp, size_t ld_weight_col, size_t ld_weight_row);
+};
+
+// Returns the buffer size in bytes needed by pack_parameters() below. The
+// channel count is split into SVE int32-vector-sized blocks and the block
+// count is rounded up to a multiple of four.
+size_t interleave_sve_u8q_3x3_dot::get_packed_size(const DepthwiseArgs &args)
+{
+ // We store 7 vectors for every <vector_of_ints> of channels.
+ // NOTE(review): the packing loop in pack_parameters() stores 6 vectors per
+ // block (bias + 3 weight + 2 requantisation); the 7 here presumably leaves
+ // deliberate headroom - confirm before tightening this bound.
+ const unsigned int n = arm_gemm::roundup(
+ arm_gemm::iceildiv((long unsigned int) args.input_channels * args.channel_multiplier,
+ get_vector_length<int32_t>(arm_gemm::VLType::SVE)), 4lu
+ );
+ return n * 7 * get_vector_length<uint8_t>(arm_gemm::VLType::SVE);
+}
+
+// Interleave parameters for the SVE u8 3x3 dot-product depthwise kernel.
+// Identical in structure to the s8 variant, but sums weights with UDOT. Per
+// block of VL<int32> channels the loop stores: one vector of biases with the
+// zero-point correction folded in (bias - a_offset*sum(weights) +
+// 9*a_offset*b_offset), three vectors of zipped weights arranged for UDOT,
+// and two vectors of requantisation parameters (multipliers then shifts).
+// `bias` may be nullptr (p8 stays false so zero biases are loaded). A zero
+// `ld_weight_col` defaults to `n_channels`; a zero `ld_weight_row` defaults
+// to 3 * ld_weight_col.
+void interleave_sve_u8q_3x3_dot::pack_parameters(unsigned int n_channels, void *outptr, const int32_t *bias, const uint8_t *weights, const arm_gemm::Requantize32 &qp, size_t ld_weight_col, size_t ld_weight_row)
+{
+ __asm__ __volatile__(
+ // Apply default weight strides; set up constants and the per-layer
+ // requantisation parameters (z24 = mul, z23 = shift).
+ "cmp %x[ld_weight_col], XZR\n"
+ "csel %x[ld_weight_col], %x[ld_weight_col], %x[n_channels], NE\n"
+ "mov z16.s, #0x9\n"
+ "mov z28.b, #0x0\n"
+ "mov x20, #0x3\n"
+ "ptrue p2.b\n"
+ "mul x20, %x[ld_weight_col], x20\n"
+ "ld1rw { z27.s }, p2/Z, [%x[qp], %[offsetof_input_offset]]\n"
+ "ld1rw { z26.s }, p2/Z, [%x[qp], %[offsetof_weights_offset]]\n"
+ "cmp %x[ld_weight_row], XZR\n"
+ "csel %x[ld_weight_row], %x[ld_weight_row], x20, NE\n"
+ "mov z25.b, #0x1\n"
+ // z26 = 9 * a_offset * b_offset, added to every corrected bias below.
+ "mul z26.s, p2/M, z26.s, z27.s\n"
+ "add x24, %x[weights], %x[ld_weight_row]\n"
+ "ld1rw { z24.s }, p2/Z, [%x[qp], %[offsetof_per_layer_mul]]\n"
+ "ld1rw { z23.s }, p2/Z, [%x[qp], %[offsetof_per_layer_right_shift]]\n"
+ "add x23, x24, %x[ld_weight_row]\n"
+ "add x22, %x[ld_weight_col], %x[ld_weight_col]\n"
+ "whilelt p1.s, XZR, %x[n_channels]\n"
+ "mov x21, #0x0\n"
+ "mul z26.s, p2/M, z26.s, z16.s\n"
+ // p8 gates the bias load: all-false when bias == nullptr.
+ "pfalse p8.b\n"
+ "cbz %x[bias], 1f\n"
+ "ptrue p8.s\n"
+ "1:" // No bias
+ "2:" // Loop
+ // Load the three kernel rows, zip the nine weights into three vectors
+ // and accumulate their per-channel sum via UDOT with a vector of ones.
+ "cntp x20, p2, p1.s\n"
+ "whilelt p0.b, XZR, x20\n"
+ "ld1b { z18.b }, p0/Z, [%x[weights]]\n"
+ "ld1b { z17.b }, p0/Z, [%x[weights], %x[ld_weight_col]]\n"
+ "ld1b { z16.b }, p0/Z, [%x[weights], x22]\n"
+ "zip1 z20.b, z18.b, z16.b\n"
+ "zip1 z19.b, z17.b, z28.b\n"
+ "ld1b { z18.b }, p0/Z, [x24]\n"
+ "ld1b { z17.b }, p0/Z, [x24, %x[ld_weight_col]]\n"
+ "ld1b { z16.b }, p0/Z, [x24, x22]\n"
+ "zip1 z22.b, z20.b, z19.b\n"
+ "zip1 z21.b, z18.b, z16.b\n"
+ "zip1 z19.b, z17.b, z28.b\n"
+ "mov z20.s, #0x0\n"
+ "ld1b { z18.b }, p0/Z, [x23]\n"
+ "ld1b { z17.b }, p0/Z, [x23, %x[ld_weight_col]]\n"
+ "ld1b { z16.b }, p0/Z, [x23, x22]\n"
+ "udot z20.s, z25.b, z22.b\n"
+ "zip1 z19.b, z21.b, z19.b\n"
+ "udot z20.s, z25.b, z19.b\n"
+ "zip1 z18.b, z18.b, z16.b\n"
+ "zip1 z16.b, z17.b, z28.b\n"
+ "and p0.b, p2/Z, p8.b, p1.b\n"
+ "ld1w { z17.s }, p0/Z, [%x[bias], x21, LSL #2]\n"
+ "zip1 z16.b, z18.b, z16.b\n"
+ "udot z20.s, z25.b, z16.b\n"
+ // Correct the bias: z17 -= sum(weights) * a_offset, then += z26.
+ "mls z17.s, p2/M, z20.s, z27.s\n"
+ "add %x[weights], %x[weights], x20\n"
+ "add x24, x24, x20\n"
+ "add x23, x23, x20\n"
+ "add z17.s, z17.s, z26.s\n"
+ // Store one bias vector and three zipped weight vectors.
+ "st1w { z17.s }, p2, [%x[outptr]]\n"
+ "st1b { z22.b }, p2, [%x[outptr], #1, MUL VL]\n"
+ "st1b { z19.b }, p2, [%x[outptr], #2, MUL VL]\n"
+ "st1b { z16.b }, p2, [%x[outptr], #3, MUL VL]\n"
+ "addvl %x[outptr], %x[outptr], #4\n"
+ // If per-channel requantisation parameters exist, overwrite the
+ // per-layer defaults in z24/z23 before storing them.
+ "cbz %x[rq_mul_perchannel], 3f\n"
+ "ld1w { z24.s }, p1/Z, [%x[rq_mul_perchannel], x21, LSL #2]\n"
+ "ld1w { z23.s }, p1/Z, [%x[rq_shift_perchannel], x21, LSL #2]\n"
+ "3:" // Loop: Quantisation parameters: Store
+ "incw x21\n"
+ "whilelt p1.s, x21, %x[n_channels]\n"
+ "st1w { z24.s }, p2, [%x[outptr]]\n"
+ "st1w { z23.s }, p2, [%x[outptr], #1, MUL VL]\n"
+ "addvl %x[outptr], %x[outptr], #2\n"
+ "b.any 2b\n"
+ : [ld_weight_col] "+&r" (ld_weight_col), [ld_weight_row] "+&r" (ld_weight_row), [outptr] "+&r" (outptr), [weights] "+&r" (weights)
+ : [bias] "r" (bias), [n_channels] "r" (n_channels), [offsetof_input_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_per_layer_mul] "I" (offsetof(arm_gemm::Requantize32, per_layer_mul)), [offsetof_per_layer_right_shift] "I" (offsetof(arm_gemm::Requantize32, per_layer_right_shift)), [offsetof_weights_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [qp] "r" (&qp), [rq_mul_perchannel] "r" (qp.per_channel_muls), [rq_shift_perchannel] "r" (qp.per_channel_right_shifts)
+ : "cc", "memory", "p0", "p1", "p2", "p8", "x20", "x21", "x22", "x23", "x24", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
new file mode 100644
index 0000000000..6beaba841f
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__aarch64__) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+
+namespace arm_conv {
+namespace depthwise {
+
+// Kernel entry points; implementations are provided in separate translation
+// units (see the directory of the same name).
+void a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_indirect_impl(const __fp16 *const *const input_ptrs, __fp16 *const *const outptrs, const void *params, unsigned int n_channels, const __fp16 activation_min, const __fp16 activation_max);
+void a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl(const unsigned int n_tile_rows, const unsigned int n_tile_cols, const __fp16 *inptr, int64_t ld_input_row, int64_t ld_input_col, __fp16 *outptr, int64_t ld_output_row, int64_t ld_output_col, const void *params, unsigned int n_channels, const __fp16 activation_min, const __fp16 activation_max);
+
+// Strategy object describing a 3x3, stride-1, fp16 NHWC depthwise
+// multiply-accumulate kernel which computes a 2x2 output tile per call.
+class a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy<__fp16, __fp16, __fp16, __fp16>
+{
+ private:
+ using Parent = DepthwiseDepthfirstStrategy<__fp16, __fp16, __fp16, __fp16>;
+ // Function pointers to the two kernel variants declared above.
+ Parent::IndirectKernelType m_indirect_kernel = a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_indirect_impl;
+ Parent::DirectKernelType m_direct_kernel = a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl;
+
+ public:
+ using return_type = __fp16;
+ // VLType::None: not a scalable-vector (SVE) kernel.
+ constexpr static auto vl_type = arm_gemm::VLType::None;
+
+ constexpr static unsigned int kernel_rows = 3;
+ constexpr static unsigned int kernel_cols = 3;
+
+ constexpr static unsigned int stride_rows = 1;
+ constexpr static unsigned int stride_cols = 1;
+
+ constexpr static unsigned int output_rows = 2;
+ constexpr static unsigned int output_cols = 2;
+
+ a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst(const CPUInfo *)
+ : Parent(output_rows, output_cols, kernel_rows, kernel_cols, stride_rows, stride_cols) {}
+
+ arm_gemm::VLType get_vl_type(void) const override { return vl_type; }
+
+ Parent::IndirectKernelType get_indirect_kernel() const override { return m_indirect_kernel; }
+ Parent::DirectKernelType get_direct_kernel() const override { return m_direct_kernel; }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp
new file mode 100644
index 0000000000..d8ca3d7437
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp
@@ -0,0 +1,723 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__aarch64__) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+
+namespace arm_conv {
+namespace depthwise {
+
+// Direct (pointer-arithmetic) entry point for the AArch64 FP16 NHWC
+// 3x3 stride-1 depthwise kernel producing a 2x2 output tile per pass.
+// Walks an n_tile_rows x n_tile_cols grid of tiles (tile_i/tile_j are
+// kept in the Args block and updated by the assembly between tiles),
+// deriving each tile's input/output addresses from the row/column
+// strides, and clamps every result to [activation_min, activation_max]
+// (v27 = min, v26 = max).  Per tile the assembly runs a vectorised
+// channel loop over n_channels / 8 groups of eight fp16 lanes, a
+// channel tail, and an "oddments" section that services the remaining
+// n_channels % 8 lanes with element-wise loads and stores.
+void a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl(
+ const unsigned int n_tile_rows,
+ const unsigned int n_tile_cols,
+ const __fp16 *inptr,
+ int64_t ld_input_row,
+ int64_t ld_input_col,
+ __fp16 *outptr,
+ int64_t ld_output_row,
+ int64_t ld_output_col,
+ const void *params,
+ unsigned int n_channels,
+ const __fp16 activation_min,
+ const __fp16 activation_max
+)
+{
+ // Argument block read by the assembly through offsetof(); field order
+ // and types must stay in sync with the asm operand list at the bottom.
+ struct Args
+ {
+ const uint64_t n_tile_rows, n_tile_cols;
+ const __fp16 *inptr;
+ const uint64_t ld_input_row;
+ const uint64_t ld_input_col;
+ __fp16 *outptr;
+ const uint64_t ld_output_row;
+ const uint64_t ld_output_col;
+ const void *params;
+ const __fp16 min, max;
+
+ // Current tile coordinates; stored/reloaded by the assembly's tile loop.
+ uint64_t tile_i = 0, tile_j = 0;
+
+ // NOTE(review): activation_min/max are taken here as float although the
+ // members and the outer function's parameters are __fp16 — the implicit
+ // narrowing conversion appears to be an artefact of the kernel
+ // generator; confirm it is intentional upstream.
+ Args(
+ const unsigned int n_tile_rows,
+ const unsigned int n_tile_cols,
+ const __fp16 *inptr,
+ int64_t ld_input_row,
+ int64_t ld_input_col,
+ __fp16 *outptr,
+ int64_t ld_output_row,
+ int64_t ld_output_col,
+ const void *params,
+ const float activation_min,
+ const float activation_max
+ ) : n_tile_rows(n_tile_rows), n_tile_cols(n_tile_cols), inptr(inptr),
+ ld_input_row(ld_input_row), ld_input_col(ld_input_col), outptr(outptr),
+ ld_output_row(ld_output_row), ld_output_col(ld_output_col),
+ params(params), min(activation_min), max(activation_max)
+ {
+ }
+ };
+
+ Args params_struct(
+ n_tile_rows, n_tile_cols,
+ inptr, ld_input_row, ld_input_col,
+ outptr, ld_output_row, ld_output_col,
+ params, activation_min, activation_max
+ );
+
+ // Label map: 1 = per-tile loop head (address computation), 2 = channel
+ // loop (8 lanes/iteration), 3 = channel tail, 4..56 = oddment
+ // (n_channels % 8) load/compute/store paths selected by tbz on the low
+ // bits of n_channels, 57 = advance tile_j/tile_i and loop or fall out.
+ __asm__ __volatile__(
+ "mov x23, #0x0\n"
+ "mov x22, #0x0\n"
+ "1:" // Tile loop
+ "str x23, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "mov x27, #0x2\n"
+ "mov x26, #0x2\n"
+ "str x22, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "ldr x25, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
+ "ldr x24, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
+ "mul x21, x23, x25\n" // offset = tile_i * ld_input_row
+ "ldr x15, [%x[params_struct], %[offsetof_args_ld_input_col]]\n"
+ "ldr x14, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
+ "mul x20, x23, x24\n" // offset = tile_i * ld_output_row
+ "mov x23, #0x10\n" // cntb _, ALL, #1
+ "madd x21, x22, x15, x21\n" // offset += tile_j * ld_input_col
+ "ldr x13, [%x[params_struct], %[offsetof_args_inptr]]\n"
+ "lsl x15, x15, #0x1\n"
+ "ldr x12, [%x[params_struct], %[offsetof_args_outptr]]\n"
+ "madd x20, x22, x14, x20\n" // offset += tile_j * ld_output_col
+ "lsr x22, %x[n_channels], #0x3\n"
+ "add x11, x15, x15\n"
+ "ldr x10, [%x[params_struct], %[offsetof_args_params]]\n"
+ "mul x21, x21, x27\n" // offset *= kernel_stride * output_size
+ "add x13, x13, x21, LSL #1\n" // inptr[0] += offset * sizeof(__fp16)
+ "add x9, x13, x25, LSL #1\n"
+ "mul x20, x20, x26\n" // offset *= output_tile_size
+ "add x28, x9, x25, LSL #1\n"
+ "add x12, x12, x20, LSL #1\n" // outptrs[0] += offset * sizeof(__fp16)
+ "add x20, %x[params_struct], %[offsetof_args_min]\n"
+ "ld1r { v27.8h }, [x20]\n"
+ "add x20, %x[params_struct], %[offsetof_args_max]\n"
+ "ld1r { v26.8h }, [x20]\n"
+ "add x27, x28, x25, LSL #1\n"
+ "add x26, x11, x15\n"
+ "add x25, x12, x24, LSL #1\n"
+ "lsl x14, x14, #0x1\n"
+ "mov x21, #0x0\n"
+ "sub x20, XZR, x23\n"
+ "cbz x22, 4f\n"
+ "ldr q25, [x10, #0x0]\n"
+ "ldr q0, [x10, #0x10]\n"
+ "cmp x23, x22, LSL #4\n"
+ "ldr q1, [x10, #0x20]\n"
+ "ldr q2, [x10, #0x30]\n"
+ "ldr q3, [x10, #0x40]\n"
+ "ldr q4, [x10, #0x50]\n"
+ "ldr q5, [x10, #0x60]\n"
+ "ldr q6, [x10, #0x70]\n"
+ "ldr q7, [x10, #0x80]\n"
+ "ldr q8, [x10, #0x90]\n"
+ "add x10, x10, #0xa0\n"
+ "ldr q9, [x9, x15]\n"
+ "ld1 { v10.8h }, [x13]\n"
+ "ldr q11, [x13, x26]\n"
+ "ldr q12, [x9, x11]\n"
+ "ldr q13, [x28, x15]\n"
+ "bge 3f\n"
+ "2:" // Tile loop: Channel loop
+ "mov v24.16b, v25.16b\n fmla v24.8h, v4.8h, v9.8h\n"
+ "mov v23.16b, v25.16b\n fmla v23.8h, v3.8h, v9.8h\n"
+ "add x23, x23, #0x10\n"
+ "cmp x23, x22, LSL #4\n"
+ "mov v22.16b, v25.16b\n fmla v22.8h, v1.8h, v9.8h\n"
+ "mov v21.16b, v25.16b\n fmla v21.8h, v0.8h, v9.8h\n"
+ "ld1 { v18.8h }, [x27]\n"
+ "ldr q25, [x10, #0x0]\n"
+ "fmla v24.8h, v0.8h, v10.8h\n"
+ "ldr q20, [x28, x11]\n"
+ "fmla v23.8h, v2.8h, v11.8h\n"
+ "ldr q17, [x27, x26]\n"
+ "fmla v22.8h, v2.8h, v12.8h\n"
+ "fmla v21.8h, v1.8h, v12.8h\n"
+ "add x20, x20, #0x10\n"
+ "add x21, x21, #0x10\n"
+ "fmla v24.8h, v5.8h, v12.8h\n"
+ "fmla v23.8h, v4.8h, v12.8h\n"
+ "ldr q16, [x13, x15]\n"
+ "fmla v22.8h, v6.8h, v18.8h\n"
+ "ldr q18, [x13, x11]\n"
+ "fmla v21.8h, v3.8h, v13.8h\n"
+ "add x13, x13, #0x10\n"
+ "fmla v24.8h, v7.8h, v13.8h\n"
+ "fmla v23.8h, v6.8h, v13.8h\n"
+ "fmla v22.8h, v4.8h, v13.8h\n"
+ "fmla v21.8h, v8.8h, v17.8h\n"
+ "ld1 { v17.8h }, [x9]\n"
+ "fmla v24.8h, v1.8h, v16.8h\n"
+ "fmla v23.8h, v0.8h, v16.8h\n"
+ "ldr q16, [x9, x26]\n"
+ "add x9, x9, #0x10\n"
+ "fmla v22.8h, v5.8h, v20.8h\n"
+ "fmla v21.8h, v4.8h, v20.8h\n"
+ "ldr q4, [x10, #0x50]\n"
+ "fmla v24.8h, v2.8h, v18.8h\n"
+ "fmla v23.8h, v1.8h, v18.8h\n"
+ "ld1 { v19.8h }, [x28]\n"
+ "ldr q1, [x10, #0x20]\n"
+ "fmla v22.8h, v0.8h, v17.8h\n"
+ "ldr q0, [x10, #0x10]\n"
+ "fmla v21.8h, v2.8h, v16.8h\n"
+ "ldr q2, [x10, #0x30]\n"
+ "fmla v24.8h, v8.8h, v20.8h\n"
+ "fmla v23.8h, v7.8h, v20.8h\n"
+ "ldr q18, [x28, x26]\n"
+ "add x28, x28, #0x10\n"
+ "ldr q13, [x28, x15]\n"
+ "fmla v22.8h, v3.8h, v19.8h\n"
+ "fmla v21.8h, v5.8h, v18.8h\n"
+ "fmla v24.8h, v3.8h, v17.8h\n"
+ "ldr q17, [x27, x15]\n"
+ "ldr q3, [x10, #0x40]\n"
+ "fmla v23.8h, v5.8h, v16.8h\n"
+ "ldr q16, [x27, x11]\n"
+ "ldr q5, [x10, #0x60]\n"
+ "fmla v22.8h, v7.8h, v17.8h\n"
+ "fmla v21.8h, v6.8h, v17.8h\n"
+ "ldr q11, [x13, x26]\n"
+ "fmla v24.8h, v6.8h, v19.8h\n"
+ "ldr q9, [x9, x15]\n"
+ "fmla v23.8h, v8.8h, v18.8h\n"
+ "ld1 { v10.8h }, [x13]\n"
+ "ldr q6, [x10, #0x70]\n"
+ "fmla v22.8h, v8.8h, v16.8h\n"
+ "fmla v21.8h, v7.8h, v16.8h\n"
+ "ldr q12, [x9, x11]\n"
+ "ldr q7, [x10, #0x80]\n"
+ "fmax v24.8h, v24.8h, v27.8h\n"
+ "fmax v23.8h, v23.8h, v27.8h\n"
+ "ldr q8, [x10, #0x90]\n"
+ "fmax v22.8h, v22.8h, v27.8h\n"
+ "fmax v21.8h, v21.8h, v27.8h\n"
+ "add x27, x27, #0x10\n"
+ "fmin v24.8h, v24.8h, v26.8h\n"
+ "fmin v23.8h, v23.8h, v26.8h\n"
+ "st1 { v24.8h }, [x12]\n"
+ "add x10, x10, #0xa0\n"
+ "fmin v22.8h, v22.8h, v26.8h\n"
+ "fmin v21.8h, v21.8h, v26.8h\n"
+ "str q23, [x12, x14]\n"
+ "add x12, x12, #0x10\n"
+ "st1 { v22.8h }, [x25]\n"
+ "str q21, [x25, x14]\n"
+ "add x25, x25, #0x10\n"
+ "blt 2b\n"
+ "3:" // Tile loop: Channel tail
+ "mov v24.16b, v25.16b\n fmla v24.8h, v4.8h, v9.8h\n"
+ "mov v23.16b, v25.16b\n fmla v23.8h, v3.8h, v9.8h\n"
+ "mov v22.16b, v25.16b\n fmla v22.8h, v1.8h, v9.8h\n"
+ "mov v21.16b, v25.16b\n fmla v21.8h, v0.8h, v9.8h\n"
+ "ld1 { v18.8h }, [x27]\n"
+ "fmla v24.8h, v0.8h, v10.8h\n"
+ "ldr q20, [x28, x11]\n"
+ "fmla v23.8h, v2.8h, v11.8h\n"
+ "ldr q17, [x27, x26]\n"
+ "fmla v22.8h, v2.8h, v12.8h\n"
+ "fmla v21.8h, v1.8h, v12.8h\n"
+ "fmla v24.8h, v5.8h, v12.8h\n"
+ "fmla v23.8h, v4.8h, v12.8h\n"
+ "ldr q16, [x13, x15]\n"
+ "fmla v22.8h, v6.8h, v18.8h\n"
+ "ldr q18, [x13, x11]\n"
+ "fmla v21.8h, v3.8h, v13.8h\n"
+ "add x13, x13, #0x10\n"
+ "fmla v24.8h, v7.8h, v13.8h\n"
+ "fmla v23.8h, v6.8h, v13.8h\n"
+ "fmla v22.8h, v4.8h, v13.8h\n"
+ "fmla v21.8h, v8.8h, v17.8h\n"
+ "ld1 { v17.8h }, [x9]\n"
+ "fmla v24.8h, v1.8h, v16.8h\n"
+ "fmla v23.8h, v0.8h, v16.8h\n"
+ "ldr q16, [x9, x26]\n"
+ "add x9, x9, #0x10\n"
+ "fmla v22.8h, v5.8h, v20.8h\n"
+ "fmla v21.8h, v4.8h, v20.8h\n"
+ "fmla v24.8h, v2.8h, v18.8h\n"
+ "fmla v23.8h, v1.8h, v18.8h\n"
+ "ld1 { v19.8h }, [x28]\n"
+ "fmla v22.8h, v0.8h, v17.8h\n"
+ "fmla v21.8h, v2.8h, v16.8h\n"
+ "fmla v24.8h, v8.8h, v20.8h\n"
+ "fmla v23.8h, v7.8h, v20.8h\n"
+ "ldr q18, [x28, x26]\n"
+ "add x28, x28, #0x10\n"
+ "fmla v22.8h, v3.8h, v19.8h\n"
+ "fmla v21.8h, v5.8h, v18.8h\n"
+ "fmla v24.8h, v3.8h, v17.8h\n"
+ "ldr q17, [x27, x15]\n"
+ "fmla v23.8h, v5.8h, v16.8h\n"
+ "ldr q16, [x27, x11]\n"
+ "fmla v22.8h, v7.8h, v17.8h\n"
+ "fmla v21.8h, v6.8h, v17.8h\n"
+ "add x27, x27, #0x10\n"
+ "fmla v24.8h, v6.8h, v19.8h\n"
+ "fmla v23.8h, v8.8h, v18.8h\n"
+ "fmax v24.8h, v24.8h, v27.8h\n"
+ "fmla v22.8h, v8.8h, v16.8h\n"
+ "fmla v21.8h, v7.8h, v16.8h\n"
+ "fmax v23.8h, v23.8h, v27.8h\n"
+ "fmax v22.8h, v22.8h, v27.8h\n"
+ "fmax v21.8h, v21.8h, v27.8h\n"
+ "fmin v24.8h, v24.8h, v26.8h\n"
+ "fmin v23.8h, v23.8h, v26.8h\n"
+ "st1 { v24.8h }, [x12]\n"
+ "fmin v22.8h, v22.8h, v26.8h\n"
+ "fmin v21.8h, v21.8h, v26.8h\n"
+ "str q23, [x12, x14]\n"
+ "add x12, x12, #0x10\n"
+ "st1 { v22.8h }, [x25]\n"
+ "str q21, [x25, x14]\n"
+ "add x25, x25, #0x10\n"
+ "4:" // Tile loop: Oddments
+ "tst %x[n_channels], #0x7\n"
+ "beq 57f\n"
+ "ldr q25, [x10, #0x0]\n"
+ "ldr q0, [x10, #0x10]\n"
+ "add x24, x9, x15\n"
+ "add x23, x13, XZR\n"
+ "ldr q1, [x10, #0x20]\n"
+ "ldr q2, [x10, #0x30]\n"
+ "add x22, x13, x26\n"
+ "add x21, x9, x11\n"
+ "ldr q3, [x10, #0x40]\n"
+ "ldr q4, [x10, #0x50]\n"
+ "add x20, x28, x15\n"
+ "ldr q5, [x10, #0x60]\n"
+ "ldr q6, [x10, #0x70]\n"
+ "ldr q7, [x10, #0x80]\n"
+ "ldr q8, [x10, #0x90]\n"
+ "tbz %x[n_channels], #2, 6f\n"
+ "ldr d9, [x24], #0x8\n"
+ "ldr d10, [x23], #0x8\n"
+ "ldr d11, [x22], #0x8\n"
+ "ldr d12, [x21], #0x8\n"
+ "ldr d13, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 5f\n"
+ "ld1 { v9.s }[2], [x24], #0x4\n"
+ "ld1 { v10.s }[2], [x23], #0x4\n"
+ "ld1 { v11.s }[2], [x22], #0x4\n"
+ "ld1 { v12.s }[2], [x21], #0x4\n"
+ "ld1 { v13.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 8f\n"
+ "ld1 { v9.h }[6], [x24]\n"
+ "ld1 { v10.h }[6], [x23]\n"
+ "ld1 { v11.h }[6], [x22]\n"
+ "ld1 { v12.h }[6], [x21]\n"
+ "ld1 { v13.h }[6], [x20]\n"
+ "b 8f\n"
+ "5:" // Tile loop: Oddments: Load inputs: (1, 1), (0, 0), (0, 3), (1, 2), (2, 1): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 8f\n"
+ "ld1 { v9.h }[4], [x24]\n"
+ "ld1 { v10.h }[4], [x23]\n"
+ "ld1 { v11.h }[4], [x22]\n"
+ "ld1 { v12.h }[4], [x21]\n"
+ "ld1 { v13.h }[4], [x20]\n"
+ "b 8f\n"
+ "6:" // Tile loop: Oddments: Load inputs: (1, 1), (0, 0), (0, 3), (1, 2), (2, 1): Bit 2: Unset
+ "tbz %x[n_channels], #1, 7f\n"
+ "ldr s9, [x24], #0x4\n"
+ "ldr s10, [x23], #0x4\n"
+ "ldr s11, [x22], #0x4\n"
+ "ldr s12, [x21], #0x4\n"
+ "ldr s13, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 8f\n"
+ "ld1 { v9.h }[2], [x24]\n"
+ "ld1 { v10.h }[2], [x23]\n"
+ "ld1 { v11.h }[2], [x22]\n"
+ "ld1 { v12.h }[2], [x21]\n"
+ "ld1 { v13.h }[2], [x20]\n"
+ "b 8f\n"
+ "7:" // Tile loop: Oddments: Load inputs: (1, 1), (0, 0), (0, 3), (1, 2), (2, 1): Bit 2: Unset: Bit 1: Unset
+ "ldr h9, [x24, #0x0]\n"
+ "ldr h10, [x23, #0x0]\n"
+ "ldr h11, [x22, #0x0]\n"
+ "ldr h12, [x21, #0x0]\n"
+ "ldr h13, [x20, #0x0]\n"
+ "8:" // Tile loop: Oddments: Load inputs: (1, 1), (0, 0), (0, 3), (1, 2), (2, 1): Bit 2: End
+ "mov v28.16b, v25.16b\n fmla v28.8h, v4.8h, v9.8h\n"
+ "mov v29.16b, v25.16b\n fmla v29.8h, v3.8h, v9.8h\n"
+ "add x20, x27, XZR\n"
+ "mov v30.16b, v25.16b\n fmla v30.8h, v1.8h, v9.8h\n"
+ "mov v31.16b, v25.16b\n fmla v31.8h, v0.8h, v9.8h\n"
+ "fmla v28.8h, v0.8h, v10.8h\n"
+ "fmla v29.8h, v2.8h, v11.8h\n"
+ "fmla v28.8h, v5.8h, v12.8h\n"
+ "fmla v29.8h, v4.8h, v12.8h\n"
+ "fmla v30.8h, v2.8h, v12.8h\n"
+ "fmla v31.8h, v1.8h, v12.8h\n"
+ "tbz %x[n_channels], #2, 10f\n"
+ "ldr d9, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 9f\n"
+ "ld1 { v9.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 12f\n"
+ "ld1 { v9.h }[6], [x20]\n"
+ "b 12f\n"
+ "9:" // Tile loop: Oddments: Load inputs: (3, 0): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 12f\n"
+ "ld1 { v9.h }[4], [x20]\n"
+ "b 12f\n"
+ "10:" // Tile loop: Oddments: Load inputs: (3, 0): Bit 2: Unset
+ "tbz %x[n_channels], #1, 11f\n"
+ "ldr s9, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 12f\n"
+ "ld1 { v9.h }[2], [x20]\n"
+ "b 12f\n"
+ "11:" // Tile loop: Oddments: Load inputs: (3, 0): Bit 2: Unset: Bit 1: Unset
+ "ldr h9, [x20, #0x0]\n"
+ "12:" // Tile loop: Oddments: Load inputs: (3, 0): Bit 2: End
+ "fmla v30.8h, v6.8h, v9.8h\n"
+ "fmla v28.8h, v7.8h, v13.8h\n"
+ "add x20, x27, x26\n"
+ "fmla v29.8h, v6.8h, v13.8h\n"
+ "fmla v30.8h, v4.8h, v13.8h\n"
+ "fmla v31.8h, v3.8h, v13.8h\n"
+ "tbz %x[n_channels], #2, 14f\n"
+ "ldr d11, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 13f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 16f\n"
+ "ld1 { v11.h }[6], [x20]\n"
+ "b 16f\n"
+ "13:" // Tile loop: Oddments: Load inputs: (3, 3): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 16f\n"
+ "ld1 { v11.h }[4], [x20]\n"
+ "b 16f\n"
+ "14:" // Tile loop: Oddments: Load inputs: (3, 3): Bit 2: Unset
+ "tbz %x[n_channels], #1, 15f\n"
+ "ldr s11, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 16f\n"
+ "ld1 { v11.h }[2], [x20]\n"
+ "b 16f\n"
+ "15:" // Tile loop: Oddments: Load inputs: (3, 3): Bit 2: Unset: Bit 1: Unset
+ "ldr h11, [x20, #0x0]\n"
+ "16:" // Tile loop: Oddments: Load inputs: (3, 3): Bit 2: End
+ "fmla v31.8h, v8.8h, v11.8h\n"
+ "add x20, x13, x15\n"
+ "tbz %x[n_channels], #2, 18f\n"
+ "ldr d12, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 17f\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 20f\n"
+ "ld1 { v12.h }[6], [x20]\n"
+ "b 20f\n"
+ "17:" // Tile loop: Oddments: Load inputs: (0, 1): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 20f\n"
+ "ld1 { v12.h }[4], [x20]\n"
+ "b 20f\n"
+ "18:" // Tile loop: Oddments: Load inputs: (0, 1): Bit 2: Unset
+ "tbz %x[n_channels], #1, 19f\n"
+ "ldr s12, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 20f\n"
+ "ld1 { v12.h }[2], [x20]\n"
+ "b 20f\n"
+ "19:" // Tile loop: Oddments: Load inputs: (0, 1): Bit 2: Unset: Bit 1: Unset
+ "ldr h12, [x20, #0x0]\n"
+ "20:" // Tile loop: Oddments: Load inputs: (0, 1): Bit 2: End
+ "fmla v28.8h, v1.8h, v12.8h\n"
+ "fmla v29.8h, v0.8h, v12.8h\n"
+ "add x20, x13, x11\n"
+ "tbz %x[n_channels], #2, 22f\n"
+ "ldr d9, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 21f\n"
+ "ld1 { v9.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 24f\n"
+ "ld1 { v9.h }[6], [x20]\n"
+ "b 24f\n"
+ "21:" // Tile loop: Oddments: Load inputs: (0, 2): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 24f\n"
+ "ld1 { v9.h }[4], [x20]\n"
+ "b 24f\n"
+ "22:" // Tile loop: Oddments: Load inputs: (0, 2): Bit 2: Unset
+ "tbz %x[n_channels], #1, 23f\n"
+ "ldr s9, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 24f\n"
+ "ld1 { v9.h }[2], [x20]\n"
+ "b 24f\n"
+ "23:" // Tile loop: Oddments: Load inputs: (0, 2): Bit 2: Unset: Bit 1: Unset
+ "ldr h9, [x20, #0x0]\n"
+ "24:" // Tile loop: Oddments: Load inputs: (0, 2): Bit 2: End
+ "fmla v28.8h, v2.8h, v9.8h\n"
+ "fmla v29.8h, v1.8h, v9.8h\n"
+ "add x20, x28, x11\n"
+ "tbz %x[n_channels], #2, 26f\n"
+ "ldr d10, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 25f\n"
+ "ld1 { v10.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 28f\n"
+ "ld1 { v10.h }[6], [x20]\n"
+ "b 28f\n"
+ "25:" // Tile loop: Oddments: Load inputs: (2, 2): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 28f\n"
+ "ld1 { v10.h }[4], [x20]\n"
+ "b 28f\n"
+ "26:" // Tile loop: Oddments: Load inputs: (2, 2): Bit 2: Unset
+ "tbz %x[n_channels], #1, 27f\n"
+ "ldr s10, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 28f\n"
+ "ld1 { v10.h }[2], [x20]\n"
+ "b 28f\n"
+ "27:" // Tile loop: Oddments: Load inputs: (2, 2): Bit 2: Unset: Bit 1: Unset
+ "ldr h10, [x20, #0x0]\n"
+ "28:" // Tile loop: Oddments: Load inputs: (2, 2): Bit 2: End
+ "fmla v28.8h, v8.8h, v10.8h\n"
+ "fmla v29.8h, v7.8h, v10.8h\n"
+ "add x20, x9, XZR\n"
+ "fmla v30.8h, v5.8h, v10.8h\n"
+ "fmla v31.8h, v4.8h, v10.8h\n"
+ "tbz %x[n_channels], #2, 30f\n"
+ "ldr d11, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 29f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 32f\n"
+ "ld1 { v11.h }[6], [x20]\n"
+ "b 32f\n"
+ "29:" // Tile loop: Oddments: Load inputs: (1, 0): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 32f\n"
+ "ld1 { v11.h }[4], [x20]\n"
+ "b 32f\n"
+ "30:" // Tile loop: Oddments: Load inputs: (1, 0): Bit 2: Unset
+ "tbz %x[n_channels], #1, 31f\n"
+ "ldr s11, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 32f\n"
+ "ld1 { v11.h }[2], [x20]\n"
+ "b 32f\n"
+ "31:" // Tile loop: Oddments: Load inputs: (1, 0): Bit 2: Unset: Bit 1: Unset
+ "ldr h11, [x20, #0x0]\n"
+ "32:" // Tile loop: Oddments: Load inputs: (1, 0): Bit 2: End
+ "fmla v28.8h, v3.8h, v11.8h\n"
+ "fmla v30.8h, v0.8h, v11.8h\n"
+ "add x20, x9, x26\n"
+ "tbz %x[n_channels], #2, 34f\n"
+ "ldr d12, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 33f\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 36f\n"
+ "ld1 { v12.h }[6], [x20]\n"
+ "b 36f\n"
+ "33:" // Tile loop: Oddments: Load inputs: (1, 3): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 36f\n"
+ "ld1 { v12.h }[4], [x20]\n"
+ "b 36f\n"
+ "34:" // Tile loop: Oddments: Load inputs: (1, 3): Bit 2: Unset
+ "tbz %x[n_channels], #1, 35f\n"
+ "ldr s12, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 36f\n"
+ "ld1 { v12.h }[2], [x20]\n"
+ "b 36f\n"
+ "35:" // Tile loop: Oddments: Load inputs: (1, 3): Bit 2: Unset: Bit 1: Unset
+ "ldr h12, [x20, #0x0]\n"
+ "36:" // Tile loop: Oddments: Load inputs: (1, 3): Bit 2: End
+ "fmla v29.8h, v5.8h, v12.8h\n"
+ "fmla v31.8h, v2.8h, v12.8h\n"
+ "add x20, x28, XZR\n"
+ "tbz %x[n_channels], #2, 38f\n"
+ "ldr d9, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 37f\n"
+ "ld1 { v9.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 40f\n"
+ "ld1 { v9.h }[6], [x20]\n"
+ "b 40f\n"
+ "37:" // Tile loop: Oddments: Load inputs: (2, 0): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 40f\n"
+ "ld1 { v9.h }[4], [x20]\n"
+ "b 40f\n"
+ "38:" // Tile loop: Oddments: Load inputs: (2, 0): Bit 2: Unset
+ "tbz %x[n_channels], #1, 39f\n"
+ "ldr s9, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 40f\n"
+ "ld1 { v9.h }[2], [x20]\n"
+ "b 40f\n"
+ "39:" // Tile loop: Oddments: Load inputs: (2, 0): Bit 2: Unset: Bit 1: Unset
+ "ldr h9, [x20, #0x0]\n"
+ "40:" // Tile loop: Oddments: Load inputs: (2, 0): Bit 2: End
+ "fmla v28.8h, v6.8h, v9.8h\n"
+ "fmla v30.8h, v3.8h, v9.8h\n"
+ "add x20, x28, x26\n"
+ "tbz %x[n_channels], #2, 42f\n"
+ "ldr d10, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 41f\n"
+ "ld1 { v10.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 44f\n"
+ "ld1 { v10.h }[6], [x20]\n"
+ "b 44f\n"
+ "41:" // Tile loop: Oddments: Load inputs: (2, 3): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 44f\n"
+ "ld1 { v10.h }[4], [x20]\n"
+ "b 44f\n"
+ "42:" // Tile loop: Oddments: Load inputs: (2, 3): Bit 2: Unset
+ "tbz %x[n_channels], #1, 43f\n"
+ "ldr s10, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 44f\n"
+ "ld1 { v10.h }[2], [x20]\n"
+ "b 44f\n"
+ "43:" // Tile loop: Oddments: Load inputs: (2, 3): Bit 2: Unset: Bit 1: Unset
+ "ldr h10, [x20, #0x0]\n"
+ "44:" // Tile loop: Oddments: Load inputs: (2, 3): Bit 2: End
+ "fmla v29.8h, v8.8h, v10.8h\n"
+ "fmla v31.8h, v5.8h, v10.8h\n"
+ "add x20, x27, x15\n"
+ "tbz %x[n_channels], #2, 46f\n"
+ "ldr d11, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 45f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 48f\n"
+ "ld1 { v11.h }[6], [x20]\n"
+ "b 48f\n"
+ "45:" // Tile loop: Oddments: Load inputs: (3, 1): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 48f\n"
+ "ld1 { v11.h }[4], [x20]\n"
+ "b 48f\n"
+ "46:" // Tile loop: Oddments: Load inputs: (3, 1): Bit 2: Unset
+ "tbz %x[n_channels], #1, 47f\n"
+ "ldr s11, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 48f\n"
+ "ld1 { v11.h }[2], [x20]\n"
+ "b 48f\n"
+ "47:" // Tile loop: Oddments: Load inputs: (3, 1): Bit 2: Unset: Bit 1: Unset
+ "ldr h11, [x20, #0x0]\n"
+ "48:" // Tile loop: Oddments: Load inputs: (3, 1): Bit 2: End
+ "fmla v30.8h, v7.8h, v11.8h\n"
+ "fmla v31.8h, v6.8h, v11.8h\n"
+ "add x20, x27, x11\n"
+ "tbz %x[n_channels], #2, 50f\n"
+ "ldr d12, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 49f\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 52f\n"
+ "ld1 { v12.h }[6], [x20]\n"
+ "b 52f\n"
+ "49:" // Tile loop: Oddments: Load inputs: (3, 2): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 52f\n"
+ "ld1 { v12.h }[4], [x20]\n"
+ "b 52f\n"
+ "50:" // Tile loop: Oddments: Load inputs: (3, 2): Bit 2: Unset
+ "tbz %x[n_channels], #1, 51f\n"
+ "ldr s12, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 52f\n"
+ "ld1 { v12.h }[2], [x20]\n"
+ "b 52f\n"
+ "51:" // Tile loop: Oddments: Load inputs: (3, 2): Bit 2: Unset: Bit 1: Unset
+ "ldr h12, [x20, #0x0]\n"
+ "52:" // Tile loop: Oddments: Load inputs: (3, 2): Bit 2: End
+ "fmla v30.8h, v8.8h, v12.8h\n"
+ "fmla v31.8h, v7.8h, v12.8h\n"
+ "fmax v28.8h, v28.8h, v27.8h\n"
+ "fmax v29.8h, v29.8h, v27.8h\n"
+ "fmax v30.8h, v30.8h, v27.8h\n"
+ "fmax v31.8h, v31.8h, v27.8h\n"
+ "fmin v28.8h, v28.8h, v26.8h\n"
+ "fmin v29.8h, v29.8h, v26.8h\n"
+ "fmin v30.8h, v30.8h, v26.8h\n"
+ "fmin v31.8h, v31.8h, v26.8h\n"
+ "tbz %x[n_channels], #2, 54f\n"
+ "mov x21, x12\n"
+ "mov x20, x25\n"
+ "st1 { v28.d }[0], [x21], x14\n"
+ "st1 { v30.d }[0], [x20], x14\n"
+ "add x12, x12, #0x8\n"
+ "add x25, x25, #0x8\n"
+ "st1 { v29.d }[0], [x21]\n"
+ "st1 { v31.d }[0], [x20]\n"
+ "tbz %x[n_channels], #1, 53f\n"
+ "mov x21, x12\n"
+ "mov x20, x25\n"
+ "st1 { v28.s }[2], [x21], x14\n"
+ "st1 { v30.s }[2], [x20], x14\n"
+ "add x12, x12, #0x4\n"
+ "add x25, x25, #0x4\n"
+ "st1 { v29.s }[2], [x21]\n"
+ "st1 { v31.s }[2], [x20]\n"
+ "tbz %x[n_channels], #0, 56f\n"
+ "mov x21, x12\n"
+ "mov x20, x25\n"
+ "st1 { v28.h }[6], [x21], x14\n"
+ "st1 { v30.h }[6], [x20], x14\n"
+ "st1 { v29.h }[6], [x21]\n"
+ "st1 { v31.h }[6], [x20]\n"
+ "b 56f\n"
+ "53:" // Tile loop: Oddments: Store: Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 56f\n"
+ "mov x21, x12\n"
+ "mov x20, x25\n"
+ "st1 { v28.h }[4], [x21], x14\n"
+ "st1 { v30.h }[4], [x20], x14\n"
+ "st1 { v29.h }[4], [x21]\n"
+ "st1 { v31.h }[4], [x20]\n"
+ "b 56f\n"
+ "54:" // Tile loop: Oddments: Store: Bit 2: Unset
+ "tbz %x[n_channels], #1, 55f\n"
+ "mov x21, x12\n"
+ "mov x20, x25\n"
+ "st1 { v28.s }[0], [x21], x14\n"
+ "st1 { v30.s }[0], [x20], x14\n"
+ "add x12, x12, #0x4\n"
+ "add x25, x25, #0x4\n"
+ "st1 { v29.s }[0], [x21]\n"
+ "st1 { v31.s }[0], [x20]\n"
+ "tbz %x[n_channels], #0, 56f\n"
+ "mov x21, x12\n"
+ "mov x20, x25\n"
+ "st1 { v28.h }[2], [x21], x14\n"
+ "st1 { v30.h }[2], [x20], x14\n"
+ "st1 { v29.h }[2], [x21]\n"
+ "st1 { v31.h }[2], [x20]\n"
+ "b 56f\n"
+ "55:" // Tile loop: Oddments: Store: Bit 2: Unset: Bit 1: Unset
+ "mov x21, x12\n"
+ "mov x20, x25\n"
+ "st1 { v28.h }[0], [x21], x14\n"
+ "st1 { v30.h }[0], [x20], x14\n"
+ "st1 { v29.h }[0], [x21]\n"
+ "st1 { v31.h }[0], [x20]\n"
+ "56:" // Tile loop: Oddments: Store: Bit 2: End
+ "57:" // Tile loop: End
+ "ldr x22, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "ldr x23, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "add x22, x22, #0x1\n"
+ "add x21, x23, #0x1\n"
+ "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+ "cmp x22, x20\n"
+ "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
+ "csel x23, x23, x21, LT\n"
+ "csel x22, x22, XZR, LT\n"
+ "cmp x23, x20\n"
+ "blt 1b\n"
+ :
+ : [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp
new file mode 100644
index 0000000000..c9a554e9ad
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp
@@ -0,0 +1,697 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__aarch64__) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_indirect_impl(
+ const __fp16 *const *const input_ptrs,
+ __fp16 *const *const outptrs,
+ const void *params,
+ unsigned int n_channels,
+ const __fp16 activation_min,
+ const __fp16 activation_max
+)
+{
+ struct Args
+ {
+ __fp16 *const *outptrs;
+ const void *params;
+ const __fp16 min, max;
+ const __fp16 *inptrs[16];
+
+ Args(
+ const __fp16 *const *const input_ptrs,
+ __fp16 *const *const outptrs,
+ const void *const params,
+ const __fp16 min,
+ const __fp16 max
+ ) : outptrs(outptrs), params(params), min(min), max(max)
+ {
+ inptrs[0] = input_ptrs[5];
+ inptrs[1] = input_ptrs[0];
+ inptrs[2] = input_ptrs[3];
+ inptrs[3] = input_ptrs[6];
+ inptrs[4] = input_ptrs[9];
+ inptrs[5] = input_ptrs[12];
+ inptrs[6] = input_ptrs[15];
+ inptrs[7] = input_ptrs[1];
+ inptrs[8] = input_ptrs[2];
+ inptrs[9] = input_ptrs[10];
+ inptrs[10] = input_ptrs[4];
+ inptrs[11] = input_ptrs[7];
+ inptrs[12] = input_ptrs[8];
+ inptrs[13] = input_ptrs[11];
+ inptrs[14] = input_ptrs[13];
+ inptrs[15] = input_ptrs[14];
+
+ }
+ };
+
+ Args params_struct(input_ptrs, outptrs, params,
+ activation_min, activation_max);
+
+ __asm__ __volatile__(
+ "ldr x21, [%x[params_struct], %[offsetof_args_outptrs]]\n"
+ "mov x16, #0x10\n" // cntb _, ALL, #1
+ "lsr x15, %x[n_channels], #0x3\n"
+ "ldr x14, [%x[params_struct], %[offsetof_args_params]]\n"
+ "add x20, %x[params_struct], %[offsetof_args_min]\n"
+ "ld1r { v27.8h }, [x20]\n"
+ "add x20, %x[params_struct], %[offsetof_args_max]\n"
+ "ld1r { v26.8h }, [x20]\n"
+ "add x13, %x[params_struct], %[offsetof_Args_inptrs]\n"
+ "ldp x12, x11, [x21, #0x0]\n"
+ "ldp x10, x9, [x21, #0x10]\n"
+ "mov x28, #0x0\n"
+ "sub x27, XZR, x16\n"
+ "cbz x15, 3f\n"
+ "ldr q25, [x14, #0x0]\n"
+ "ldr q0, [x14, #0x10]\n"
+ "cmp x16, x15, LSL #4\n"
+ "ldr q1, [x14, #0x20]\n"
+ "ldr q2, [x14, #0x30]\n"
+ "ldr q3, [x14, #0x40]\n"
+ "ldr q4, [x14, #0x50]\n"
+ "ldr q5, [x14, #0x60]\n"
+ "ldr q6, [x14, #0x70]\n"
+ "ldr q7, [x14, #0x80]\n"
+ "ldr q8, [x14, #0x90]\n"
+ "add x14, x14, #0xa0\n"
+ "ldp x21, x20, [x13, #0x0]\n"
+ "ldr q9, [x21, x28]\n"
+ "ldr q10, [x20, x28]\n"
+ "ldp x21, x20, [x13, #0x10]\n"
+ "ldr q11, [x21, x28]\n"
+ "ldr q12, [x20, x28]\n"
+ "ldr x20, [x13, #0x20]\n"
+ "ldr q13, [x20, x28]\n"
+ "bge 2f\n"
+ "1:" // Channel loop
+ "mov v24.16b, v25.16b\n fmla v24.8h, v4.8h, v9.8h\n"
+ "mov v23.16b, v25.16b\n fmla v23.8h, v3.8h, v9.8h\n"
+ "ldr x21, [x13, #0x28]\n"
+ "ldr x20, [x13, #0x30]\n"
+ "mov v22.16b, v25.16b\n fmla v22.8h, v1.8h, v9.8h\n"
+ "mov v21.16b, v25.16b\n fmla v21.8h, v0.8h, v9.8h\n"
+ "ldr q18, [x21, x28]\n"
+ "ldr q25, [x14, #0x0]\n"
+ "fmla v24.8h, v0.8h, v10.8h\n"
+ "fmla v23.8h, v2.8h, v11.8h\n"
+ "ldr q17, [x20, x28]\n"
+ "ldr x21, [x13, #0x38]\n"
+ "fmla v22.8h, v2.8h, v12.8h\n"
+ "fmla v21.8h, v1.8h, v12.8h\n"
+ "ldr x20, [x13, #0x48]\n"
+ "ldr q20, [x20, x28]\n"
+ "fmla v24.8h, v5.8h, v12.8h\n"
+ "fmla v23.8h, v4.8h, v12.8h\n"
+ "ldr q16, [x21, x28]\n"
+ "ldr x20, [x13, #0x40]\n"
+ "fmla v22.8h, v6.8h, v18.8h\n"
+ "ldr q18, [x20, x28]\n"
+ "fmla v21.8h, v3.8h, v13.8h\n"
+ "ldr x20, [x13, #0x50]\n"
+ "fmla v24.8h, v7.8h, v13.8h\n"
+ "fmla v23.8h, v6.8h, v13.8h\n"
+ "ldr x22, [x13, #0x58]\n"
+ "ldr x21, [x13, #0x60]\n"
+ "fmla v22.8h, v4.8h, v13.8h\n"
+ "fmla v21.8h, v8.8h, v17.8h\n"
+ "ldr q17, [x20, x28]\n"
+ "ldr x20, [x13, #0x68]\n"
+ "fmla v24.8h, v1.8h, v16.8h\n"
+ "fmla v23.8h, v0.8h, v16.8h\n"
+ "ldr q16, [x22, x28]\n"
+ "ldr x26, [x13, #0x70]\n"
+ "fmla v22.8h, v5.8h, v20.8h\n"
+ "fmla v21.8h, v4.8h, v20.8h\n"
+ "ldr q4, [x14, #0x50]\n"
+ "ldr x25, [x13, #0x78]\n"
+ "fmla v24.8h, v2.8h, v18.8h\n"
+ "fmla v23.8h, v1.8h, v18.8h\n"
+ "ldr q19, [x21, x28]\n"
+ "ldr q1, [x14, #0x20]\n"
+ "fmla v22.8h, v0.8h, v17.8h\n"
+ "ldr q0, [x14, #0x10]\n"
+ "fmla v21.8h, v2.8h, v16.8h\n"
+ "ldr q2, [x14, #0x30]\n"
+ "fmla v24.8h, v8.8h, v20.8h\n"
+ "fmla v23.8h, v7.8h, v20.8h\n"
+ "ldr q18, [x20, x28]\n"
+ "ldp x24, x23, [x13, #0x0]\n"
+ "fmla v22.8h, v3.8h, v19.8h\n"
+ "fmla v21.8h, v5.8h, v18.8h\n"
+ "ldp x22, x21, [x13, #0x10]\n"
+ "ldr x20, [x13, #0x20]\n"
+ "ldr q13, [x20, x16]\n"
+ "fmla v24.8h, v3.8h, v17.8h\n"
+ "ldr q17, [x26, x28]\n"
+ "fmla v23.8h, v5.8h, v16.8h\n"
+ "ldr q16, [x25, x28]\n"
+ "ldr q3, [x14, #0x40]\n"
+ "fmla v22.8h, v7.8h, v17.8h\n"
+ "fmla v21.8h, v6.8h, v17.8h\n"
+ "ldr q11, [x22, x16]\n"
+ "ldr q5, [x14, #0x60]\n"
+ "fmla v24.8h, v6.8h, v19.8h\n"
+ "fmla v23.8h, v8.8h, v18.8h\n"
+ "ldr q9, [x24, x16]\n"
+ "ldr q10, [x23, x16]\n"
+ "fmla v22.8h, v8.8h, v16.8h\n"
+ "fmla v21.8h, v7.8h, v16.8h\n"
+ "ldr q12, [x21, x16]\n"
+ "ldr q6, [x14, #0x70]\n"
+ "fmax v24.8h, v24.8h, v27.8h\n"
+ "fmax v23.8h, v23.8h, v27.8h\n"
+ "ldr q7, [x14, #0x80]\n"
+ "ldr q8, [x14, #0x90]\n"
+ "fmax v22.8h, v22.8h, v27.8h\n"
+ "fmax v21.8h, v21.8h, v27.8h\n"
+ "add x16, x16, #0x10\n"
+ "add x27, x27, #0x10\n"
+ "fmin v24.8h, v24.8h, v26.8h\n"
+ "fmin v23.8h, v23.8h, v26.8h\n"
+ "cmp x16, x15, LSL #4\n"
+ "fmin v22.8h, v22.8h, v26.8h\n"
+ "fmin v21.8h, v21.8h, v26.8h\n"
+ "add x28, x28, #0x10\n"
+ "str q24, [x12, x27]\n"
+ "add x14, x14, #0xa0\n"
+ "str q23, [x11, x27]\n"
+ "str q22, [x10, x27]\n"
+ "str q21, [x9, x27]\n"
+ "blt 1b\n"
+ "2:" // Channel tail
+ "mov v24.16b, v25.16b\n fmla v24.8h, v4.8h, v9.8h\n"
+ "mov v23.16b, v25.16b\n fmla v23.8h, v3.8h, v9.8h\n"
+ "ldr x21, [x13, #0x28]\n"
+ "ldr x20, [x13, #0x30]\n"
+ "mov v22.16b, v25.16b\n fmla v22.8h, v1.8h, v9.8h\n"
+ "mov v21.16b, v25.16b\n fmla v21.8h, v0.8h, v9.8h\n"
+ "ldr q18, [x21, x28]\n"
+ "ldr x21, [x13, #0x38]\n"
+ "fmla v24.8h, v0.8h, v10.8h\n"
+ "fmla v23.8h, v2.8h, v11.8h\n"
+ "ldr q17, [x20, x28]\n"
+ "ldr x20, [x13, #0x48]\n"
+ "ldr q20, [x20, x28]\n"
+ "fmla v22.8h, v2.8h, v12.8h\n"
+ "fmla v21.8h, v1.8h, v12.8h\n"
+ "ldr x20, [x13, #0x40]\n"
+ "fmla v24.8h, v5.8h, v12.8h\n"
+ "fmla v23.8h, v4.8h, v12.8h\n"
+ "ldr q16, [x21, x28]\n"
+ "ldr x21, [x13, #0x50]\n"
+ "fmla v22.8h, v6.8h, v18.8h\n"
+ "ldr q18, [x20, x28]\n"
+ "fmla v21.8h, v3.8h, v13.8h\n"
+ "ldr x20, [x13, #0x58]\n"
+ "fmla v24.8h, v7.8h, v13.8h\n"
+ "fmla v23.8h, v6.8h, v13.8h\n"
+ "ldr x23, [x13, #0x60]\n"
+ "ldr x22, [x13, #0x68]\n"
+ "fmla v22.8h, v4.8h, v13.8h\n"
+ "fmla v21.8h, v8.8h, v17.8h\n"
+ "ldr q17, [x21, x28]\n"
+ "ldr x21, [x13, #0x70]\n"
+ "fmla v24.8h, v1.8h, v16.8h\n"
+ "fmla v23.8h, v0.8h, v16.8h\n"
+ "ldr q16, [x20, x28]\n"
+ "ldr x20, [x13, #0x78]\n"
+ "fmla v22.8h, v5.8h, v20.8h\n"
+ "fmla v21.8h, v4.8h, v20.8h\n"
+ "add x27, x27, #0x10\n"
+ "fmla v24.8h, v2.8h, v18.8h\n"
+ "fmla v23.8h, v1.8h, v18.8h\n"
+ "ldr q19, [x23, x28]\n"
+ "fmla v22.8h, v0.8h, v17.8h\n"
+ "fmla v21.8h, v2.8h, v16.8h\n"
+ "fmla v24.8h, v8.8h, v20.8h\n"
+ "fmla v23.8h, v7.8h, v20.8h\n"
+ "ldr q18, [x22, x28]\n"
+ "fmla v22.8h, v3.8h, v19.8h\n"
+ "fmla v21.8h, v5.8h, v18.8h\n"
+ "fmla v24.8h, v3.8h, v17.8h\n"
+ "ldr q17, [x21, x28]\n"
+ "fmla v23.8h, v5.8h, v16.8h\n"
+ "ldr q16, [x20, x28]\n"
+ "fmla v22.8h, v7.8h, v17.8h\n"
+ "fmla v21.8h, v6.8h, v17.8h\n"
+ "add x28, x28, #0x10\n"
+ "fmla v24.8h, v6.8h, v19.8h\n"
+ "fmla v23.8h, v8.8h, v18.8h\n"
+ "fmax v24.8h, v24.8h, v27.8h\n"
+ "fmla v22.8h, v8.8h, v16.8h\n"
+ "fmla v21.8h, v7.8h, v16.8h\n"
+ "fmax v23.8h, v23.8h, v27.8h\n"
+ "fmax v22.8h, v22.8h, v27.8h\n"
+ "fmax v21.8h, v21.8h, v27.8h\n"
+ "fmin v24.8h, v24.8h, v26.8h\n"
+ "fmin v23.8h, v23.8h, v26.8h\n"
+ "str q24, [x12, x27]\n"
+ "fmin v22.8h, v22.8h, v26.8h\n"
+ "fmin v21.8h, v21.8h, v26.8h\n"
+ "str q23, [x11, x27]\n"
+ "str q22, [x10, x27]\n"
+ "str q21, [x9, x27]\n"
+ "3:" // Oddments
+ "tst %x[n_channels], #0x7\n"
+ "beq 56f\n"
+ "ldr q25, [x14, #0x0]\n"
+ "ldr q0, [x14, #0x10]\n"
+ "mov x20, x28\n"
+ "add x12, x12, x20\n"
+ "ldr q1, [x14, #0x20]\n"
+ "ldr q2, [x14, #0x30]\n"
+ "add x11, x11, x20\n"
+ "add x10, x10, x20\n"
+ "ldr q3, [x14, #0x40]\n"
+ "ldr q4, [x14, #0x50]\n"
+ "add x9, x9, x20\n"
+ "ldr q5, [x14, #0x60]\n"
+ "ldr q6, [x14, #0x70]\n"
+ "ldr q7, [x14, #0x80]\n"
+ "ldr q8, [x14, #0x90]\n"
+ "ldr x24, [x13, #0x0]\n"
+ "ldr x23, [x13, #0x8]\n"
+ "add x24, x24, x28\n"
+ "add x23, x23, x28\n"
+ "ldr x22, [x13, #0x10]\n"
+ "ldr x21, [x13, #0x18]\n"
+ "add x22, x22, x28\n"
+ "add x21, x21, x28\n"
+ "ldr x20, [x13, #0x20]\n"
+ "add x20, x20, x28\n"
+ "tbz %x[n_channels], #2, 5f\n"
+ "ld1 { v9.d }[0], [x24], #0x8\n"
+ "ld1 { v10.d }[0], [x23], #0x8\n"
+ "ld1 { v11.d }[0], [x22], #0x8\n"
+ "ld1 { v12.d }[0], [x21], #0x8\n"
+ "ld1 { v13.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 4f\n"
+ "ld1 { v9.s }[2], [x24], #0x4\n"
+ "ld1 { v10.s }[2], [x23], #0x4\n"
+ "ld1 { v11.s }[2], [x22], #0x4\n"
+ "ld1 { v12.s }[2], [x21], #0x4\n"
+ "ld1 { v13.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 7f\n"
+ "ld1 { v9.h }[6], [x24], #0x2\n"
+ "ld1 { v10.h }[6], [x23], #0x2\n"
+ "ld1 { v11.h }[6], [x22], #0x2\n"
+ "ld1 { v12.h }[6], [x21], #0x2\n"
+ "ld1 { v13.h }[6], [x20], #0x2\n"
+ "b 7f\n"
+ "4:" // Oddments: Load inputs (1, 1), (0, 0), (0, 3), (1, 2), (2, 1): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 7f\n"
+ "ld1 { v9.h }[4], [x24], #0x2\n"
+ "ld1 { v10.h }[4], [x23], #0x2\n"
+ "ld1 { v11.h }[4], [x22], #0x2\n"
+ "ld1 { v12.h }[4], [x21], #0x2\n"
+ "ld1 { v13.h }[4], [x20], #0x2\n"
+ "b 7f\n"
+ "5:" // Oddments: Load inputs (1, 1), (0, 0), (0, 3), (1, 2), (2, 1): Bit 2: Unset
+ "tbz %x[n_channels], #1, 6f\n"
+ "ld1 { v9.s }[0], [x24], #0x4\n"
+ "ld1 { v10.s }[0], [x23], #0x4\n"
+ "ld1 { v11.s }[0], [x22], #0x4\n"
+ "ld1 { v12.s }[0], [x21], #0x4\n"
+ "ld1 { v13.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 7f\n"
+ "ld1 { v9.h }[2], [x24], #0x2\n"
+ "ld1 { v10.h }[2], [x23], #0x2\n"
+ "ld1 { v11.h }[2], [x22], #0x2\n"
+ "ld1 { v12.h }[2], [x21], #0x2\n"
+ "ld1 { v13.h }[2], [x20], #0x2\n"
+ "b 7f\n"
+ "6:" // Oddments: Load inputs (1, 1), (0, 0), (0, 3), (1, 2), (2, 1): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v9.h }[0], [x24], #0x2\n"
+ "ld1 { v10.h }[0], [x23], #0x2\n"
+ "ld1 { v11.h }[0], [x22], #0x2\n"
+ "ld1 { v12.h }[0], [x21], #0x2\n"
+ "ld1 { v13.h }[0], [x20], #0x2\n"
+ "7:" // Oddments: Load inputs (1, 1), (0, 0), (0, 3), (1, 2), (2, 1): Bit 2: End
+ "mov v28.16b, v25.16b\n fmla v28.8h, v4.8h, v9.8h\n"
+ "mov v29.16b, v25.16b\n fmla v29.8h, v3.8h, v9.8h\n"
+ "ldr x20, [x13, #0x28]\n"
+ "add x20, x20, x28\n"
+ "mov v30.16b, v25.16b\n fmla v30.8h, v1.8h, v9.8h\n"
+ "mov v31.16b, v25.16b\n fmla v31.8h, v0.8h, v9.8h\n"
+ "fmla v28.8h, v0.8h, v10.8h\n"
+ "fmla v29.8h, v2.8h, v11.8h\n"
+ "fmla v28.8h, v5.8h, v12.8h\n"
+ "fmla v29.8h, v4.8h, v12.8h\n"
+ "fmla v30.8h, v2.8h, v12.8h\n"
+ "fmla v31.8h, v1.8h, v12.8h\n"
+ "tbz %x[n_channels], #2, 9f\n"
+ "ld1 { v9.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 8f\n"
+ "ld1 { v9.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 11f\n"
+ "ld1 { v9.h }[6], [x20], #0x2\n"
+ "b 11f\n"
+ "8:" // Oddments: Load input (3, 0): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 11f\n"
+ "ld1 { v9.h }[4], [x20], #0x2\n"
+ "b 11f\n"
+ "9:" // Oddments: Load input (3, 0): Bit 2: Unset
+ "tbz %x[n_channels], #1, 10f\n"
+ "ld1 { v9.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 11f\n"
+ "ld1 { v9.h }[2], [x20], #0x2\n"
+ "b 11f\n"
+ "10:" // Oddments: Load input (3, 0): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v9.h }[0], [x20], #0x2\n"
+ "11:" // Oddments: Load input (3, 0): Bit 2: End
+ "fmla v30.8h, v6.8h, v9.8h\n"
+ "ldr x20, [x13, #0x30]\n"
+ "fmla v28.8h, v7.8h, v13.8h\n"
+ "add x20, x20, x28\n"
+ "fmla v29.8h, v6.8h, v13.8h\n"
+ "fmla v30.8h, v4.8h, v13.8h\n"
+ "fmla v31.8h, v3.8h, v13.8h\n"
+ "tbz %x[n_channels], #2, 13f\n"
+ "ld1 { v11.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 12f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 15f\n"
+ "ld1 { v11.h }[6], [x20], #0x2\n"
+ "b 15f\n"
+ "12:" // Oddments: Load input (3, 3): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 15f\n"
+ "ld1 { v11.h }[4], [x20], #0x2\n"
+ "b 15f\n"
+ "13:" // Oddments: Load input (3, 3): Bit 2: Unset
+ "tbz %x[n_channels], #1, 14f\n"
+ "ld1 { v11.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 15f\n"
+ "ld1 { v11.h }[2], [x20], #0x2\n"
+ "b 15f\n"
+ "14:" // Oddments: Load input (3, 3): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v11.h }[0], [x20], #0x2\n"
+ "15:" // Oddments: Load input (3, 3): Bit 2: End
+ "ldr x20, [x13, #0x38]\n"
+ "fmla v31.8h, v8.8h, v11.8h\n"
+ "add x20, x20, x28\n"
+ "tbz %x[n_channels], #2, 17f\n"
+ "ld1 { v12.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 16f\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 19f\n"
+ "ld1 { v12.h }[6], [x20], #0x2\n"
+ "b 19f\n"
+ "16:" // Oddments: Load input (0, 1): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 19f\n"
+ "ld1 { v12.h }[4], [x20], #0x2\n"
+ "b 19f\n"
+ "17:" // Oddments: Load input (0, 1): Bit 2: Unset
+ "tbz %x[n_channels], #1, 18f\n"
+ "ld1 { v12.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 19f\n"
+ "ld1 { v12.h }[2], [x20], #0x2\n"
+ "b 19f\n"
+ "18:" // Oddments: Load input (0, 1): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v12.h }[0], [x20], #0x2\n"
+ "19:" // Oddments: Load input (0, 1): Bit 2: End
+ "ldr x20, [x13, #0x40]\n"
+ "fmla v28.8h, v1.8h, v12.8h\n"
+ "fmla v29.8h, v0.8h, v12.8h\n"
+ "add x20, x20, x28\n"
+ "tbz %x[n_channels], #2, 21f\n"
+ "ld1 { v9.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 20f\n"
+ "ld1 { v9.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 23f\n"
+ "ld1 { v9.h }[6], [x20], #0x2\n"
+ "b 23f\n"
+ "20:" // Oddments: Load input (0, 2): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 23f\n"
+ "ld1 { v9.h }[4], [x20], #0x2\n"
+ "b 23f\n"
+ "21:" // Oddments: Load input (0, 2): Bit 2: Unset
+ "tbz %x[n_channels], #1, 22f\n"
+ "ld1 { v9.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 23f\n"
+ "ld1 { v9.h }[2], [x20], #0x2\n"
+ "b 23f\n"
+ "22:" // Oddments: Load input (0, 2): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v9.h }[0], [x20], #0x2\n"
+ "23:" // Oddments: Load input (0, 2): Bit 2: End
+ "ldr x20, [x13, #0x48]\n"
+ "fmla v28.8h, v2.8h, v9.8h\n"
+ "fmla v29.8h, v1.8h, v9.8h\n"
+ "add x20, x20, x28\n"
+ "tbz %x[n_channels], #2, 25f\n"
+ "ld1 { v10.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 24f\n"
+ "ld1 { v10.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 27f\n"
+ "ld1 { v10.h }[6], [x20], #0x2\n"
+ "b 27f\n"
+ "24:" // Oddments: Load input (2, 2): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 27f\n"
+ "ld1 { v10.h }[4], [x20], #0x2\n"
+ "b 27f\n"
+ "25:" // Oddments: Load input (2, 2): Bit 2: Unset
+ "tbz %x[n_channels], #1, 26f\n"
+ "ld1 { v10.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 27f\n"
+ "ld1 { v10.h }[2], [x20], #0x2\n"
+ "b 27f\n"
+ "26:" // Oddments: Load input (2, 2): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v10.h }[0], [x20], #0x2\n"
+ "27:" // Oddments: Load input (2, 2): Bit 2: End
+ "ldr x20, [x13, #0x50]\n"
+ "fmla v28.8h, v8.8h, v10.8h\n"
+ "fmla v29.8h, v7.8h, v10.8h\n"
+ "add x20, x20, x28\n"
+ "fmla v30.8h, v5.8h, v10.8h\n"
+ "fmla v31.8h, v4.8h, v10.8h\n"
+ "tbz %x[n_channels], #2, 29f\n"
+ "ld1 { v11.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 28f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 31f\n"
+ "ld1 { v11.h }[6], [x20], #0x2\n"
+ "b 31f\n"
+ "28:" // Oddments: Load input (1, 0): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 31f\n"
+ "ld1 { v11.h }[4], [x20], #0x2\n"
+ "b 31f\n"
+ "29:" // Oddments: Load input (1, 0): Bit 2: Unset
+ "tbz %x[n_channels], #1, 30f\n"
+ "ld1 { v11.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 31f\n"
+ "ld1 { v11.h }[2], [x20], #0x2\n"
+ "b 31f\n"
+ "30:" // Oddments: Load input (1, 0): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v11.h }[0], [x20], #0x2\n"
+ "31:" // Oddments: Load input (1, 0): Bit 2: End
+ "ldr x20, [x13, #0x58]\n"
+ "fmla v28.8h, v3.8h, v11.8h\n"
+ "fmla v30.8h, v0.8h, v11.8h\n"
+ "add x20, x20, x28\n"
+ "tbz %x[n_channels], #2, 33f\n"
+ "ld1 { v12.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 32f\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 35f\n"
+ "ld1 { v12.h }[6], [x20], #0x2\n"
+ "b 35f\n"
+ "32:" // Oddments: Load input (1, 3): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 35f\n"
+ "ld1 { v12.h }[4], [x20], #0x2\n"
+ "b 35f\n"
+ "33:" // Oddments: Load input (1, 3): Bit 2: Unset
+ "tbz %x[n_channels], #1, 34f\n"
+ "ld1 { v12.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 35f\n"
+ "ld1 { v12.h }[2], [x20], #0x2\n"
+ "b 35f\n"
+ "34:" // Oddments: Load input (1, 3): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v12.h }[0], [x20], #0x2\n"
+ "35:" // Oddments: Load input (1, 3): Bit 2: End
+ "ldr x20, [x13, #0x60]\n"
+ "fmla v29.8h, v5.8h, v12.8h\n"
+ "fmla v31.8h, v2.8h, v12.8h\n"
+ "add x20, x20, x28\n"
+ "tbz %x[n_channels], #2, 37f\n"
+ "ld1 { v9.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 36f\n"
+ "ld1 { v9.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 39f\n"
+ "ld1 { v9.h }[6], [x20], #0x2\n"
+ "b 39f\n"
+ "36:" // Oddments: Load input (2, 0): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 39f\n"
+ "ld1 { v9.h }[4], [x20], #0x2\n"
+ "b 39f\n"
+ "37:" // Oddments: Load input (2, 0): Bit 2: Unset
+ "tbz %x[n_channels], #1, 38f\n"
+ "ld1 { v9.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 39f\n"
+ "ld1 { v9.h }[2], [x20], #0x2\n"
+ "b 39f\n"
+ "38:" // Oddments: Load input (2, 0): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v9.h }[0], [x20], #0x2\n"
+ "39:" // Oddments: Load input (2, 0): Bit 2: End
+ "ldr x20, [x13, #0x68]\n"
+ "fmla v28.8h, v6.8h, v9.8h\n"
+ "fmla v30.8h, v3.8h, v9.8h\n"
+ "add x20, x20, x28\n"
+ "tbz %x[n_channels], #2, 41f\n"
+ "ld1 { v10.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 40f\n"
+ "ld1 { v10.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 43f\n"
+ "ld1 { v10.h }[6], [x20], #0x2\n"
+ "b 43f\n"
+ "40:" // Oddments: Load input (2, 3): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 43f\n"
+ "ld1 { v10.h }[4], [x20], #0x2\n"
+ "b 43f\n"
+ "41:" // Oddments: Load input (2, 3): Bit 2: Unset
+ "tbz %x[n_channels], #1, 42f\n"
+ "ld1 { v10.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 43f\n"
+ "ld1 { v10.h }[2], [x20], #0x2\n"
+ "b 43f\n"
+ "42:" // Oddments: Load input (2, 3): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v10.h }[0], [x20], #0x2\n"
+ "43:" // Oddments: Load input (2, 3): Bit 2: End
+ "ldr x20, [x13, #0x70]\n"
+ "fmla v29.8h, v8.8h, v10.8h\n"
+ "fmla v31.8h, v5.8h, v10.8h\n"
+ "add x20, x20, x28\n"
+ "tbz %x[n_channels], #2, 45f\n"
+ "ld1 { v11.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 44f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 47f\n"
+ "ld1 { v11.h }[6], [x20], #0x2\n"
+ "b 47f\n"
+ "44:" // Oddments: Load input (3, 1): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 47f\n"
+ "ld1 { v11.h }[4], [x20], #0x2\n"
+ "b 47f\n"
+ "45:" // Oddments: Load input (3, 1): Bit 2: Unset
+ "tbz %x[n_channels], #1, 46f\n"
+ "ld1 { v11.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 47f\n"
+ "ld1 { v11.h }[2], [x20], #0x2\n"
+ "b 47f\n"
+ "46:" // Oddments: Load input (3, 1): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v11.h }[0], [x20], #0x2\n"
+ "47:" // Oddments: Load input (3, 1): Bit 2: End
+ "ldr x20, [x13, #0x78]\n"
+ "fmla v30.8h, v7.8h, v11.8h\n"
+ "fmla v31.8h, v6.8h, v11.8h\n"
+ "add x20, x20, x28\n"
+ "tbz %x[n_channels], #2, 49f\n"
+ "ld1 { v12.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 48f\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 51f\n"
+ "ld1 { v12.h }[6], [x20], #0x2\n"
+ "b 51f\n"
+ "48:" // Oddments: Load input (3, 2): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 51f\n"
+ "ld1 { v12.h }[4], [x20], #0x2\n"
+ "b 51f\n"
+ "49:" // Oddments: Load input (3, 2): Bit 2: Unset
+ "tbz %x[n_channels], #1, 50f\n"
+ "ld1 { v12.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 51f\n"
+ "ld1 { v12.h }[2], [x20], #0x2\n"
+ "b 51f\n"
+ "50:" // Oddments: Load input (3, 2): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v12.h }[0], [x20], #0x2\n"
+ "51:" // Oddments: Load input (3, 2): Bit 2: End
+ "fmla v30.8h, v8.8h, v12.8h\n"
+ "fmla v31.8h, v7.8h, v12.8h\n"
+ "fmax v28.8h, v28.8h, v27.8h\n"
+ "fmax v29.8h, v29.8h, v27.8h\n"
+ "fmax v30.8h, v30.8h, v27.8h\n"
+ "fmax v31.8h, v31.8h, v27.8h\n"
+ "fmin v28.8h, v28.8h, v26.8h\n"
+ "fmin v29.8h, v29.8h, v26.8h\n"
+ "fmin v30.8h, v30.8h, v26.8h\n"
+ "fmin v31.8h, v31.8h, v26.8h\n"
+ "tbz %x[n_channels], #2, 53f\n"
+ "st1 { v28.d }[0], [x12], #0x8\n"
+ "st1 { v29.d }[0], [x11], #0x8\n"
+ "st1 { v30.d }[0], [x10], #0x8\n"
+ "st1 { v31.d }[0], [x9], #0x8\n"
+ "tbz %x[n_channels], #1, 52f\n"
+ "st1 { v28.s }[2], [x12], #0x4\n"
+ "st1 { v29.s }[2], [x11], #0x4\n"
+ "st1 { v30.s }[2], [x10], #0x4\n"
+ "st1 { v31.s }[2], [x9], #0x4\n"
+ "tbz %x[n_channels], #0, 55f\n"
+ "st1 { v28.h }[6], [x12], #0x2\n"
+ "st1 { v29.h }[6], [x11], #0x2\n"
+ "st1 { v30.h }[6], [x10], #0x2\n"
+ "st1 { v31.h }[6], [x9], #0x2\n"
+ "b 55f\n"
+ "52:" // Oddments: Store: Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 55f\n"
+ "st1 { v28.h }[4], [x12], #0x2\n"
+ "st1 { v29.h }[4], [x11], #0x2\n"
+ "st1 { v30.h }[4], [x10], #0x2\n"
+ "st1 { v31.h }[4], [x9], #0x2\n"
+ "b 55f\n"
+ "53:" // Oddments: Store: Bit 2: Unset
+ "tbz %x[n_channels], #1, 54f\n"
+ "st1 { v28.s }[0], [x12], #0x4\n"
+ "st1 { v29.s }[0], [x11], #0x4\n"
+ "st1 { v30.s }[0], [x10], #0x4\n"
+ "st1 { v31.s }[0], [x9], #0x4\n"
+ "tbz %x[n_channels], #0, 55f\n"
+ "st1 { v28.h }[2], [x12], #0x2\n"
+ "st1 { v29.h }[2], [x11], #0x2\n"
+ "st1 { v30.h }[2], [x10], #0x2\n"
+ "st1 { v31.h }[2], [x9], #0x2\n"
+ "b 55f\n"
+ "54:" // Oddments: Store: Bit 2: Unset: Bit 1: Unset
+ "st1 { v28.h }[0], [x12], #0x2\n"
+ "st1 { v29.h }[0], [x11], #0x2\n"
+ "st1 { v30.h }[0], [x10], #0x2\n"
+ "st1 { v31.h }[0], [x9], #0x2\n"
+ "55:" // Oddments: Store: Bit 2: End
+ "56:" // End
+ :
+ : [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp
new file mode 100644
index 0000000000..6bbd3508cb
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__aarch64__) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(const __fp16 *const *const input_ptrs, __fp16 *const *const outptrs, const void *params, unsigned int n_channels, const __fp16 activation_min, const __fp16 activation_max);
+void a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl(const unsigned int n_tile_rows, const unsigned int n_tile_cols, const __fp16 *inptr, int64_t ld_input_row, int64_t ld_input_col, __fp16 *outptr, int64_t ld_output_row, int64_t ld_output_col, const void *params, unsigned int n_channels, const __fp16 activation_min, const __fp16 activation_max);
+
+class a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst : public DepthwiseDepthfirstStrategy<__fp16, __fp16, __fp16, __fp16>  // strategy descriptor: all four strategy types (per Parent template) are __fp16
+{
+  private:
+  using Parent = DepthwiseDepthfirstStrategy<__fp16, __fp16, __fp16, __fp16>;
+  Parent::IndirectKernelType m_indirect_kernel = a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl;  // variant driven by an array of per-point input pointers (see declaration above)
+  Parent::DirectKernelType m_direct_kernel = a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl;  // variant addressing the input via ld_input_row/ld_input_col strides over a tile grid
+
+  public:
+  using return_type = __fp16;
+  constexpr static auto vl_type = arm_gemm::VLType::None;  // fixed-width NEON code path; no scalable-vector (SVE/SME) requirement
+
+  constexpr static unsigned int kernel_rows = 3;  // 3x3 depthwise filter window
+  constexpr static unsigned int kernel_cols = 3;
+
+  constexpr static unsigned int stride_rows = 1;  // unit stride in both spatial dimensions
+  constexpr static unsigned int stride_cols = 1;
+
+  constexpr static unsigned int output_rows = 3;  // each kernel invocation produces a 3x3 output tile
+  constexpr static unsigned int output_cols = 3;
+
+  a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst(const CPUInfo *)  // CPUInfo intentionally unused: kernel selection is fixed for this strategy
+  : Parent(output_rows, output_cols, kernel_rows, kernel_cols, stride_rows, stride_cols) {}
+
+  arm_gemm::VLType get_vl_type(void) const override { return vl_type; }
+
+  Parent::IndirectKernelType get_indirect_kernel() const override { return m_indirect_kernel; }
+  Parent::DirectKernelType get_direct_kernel() const override { return m_direct_kernel; }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp
new file mode 100644
index 0000000000..4e64a2bf2b
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp
@@ -0,0 +1,1158 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__aarch64__) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl(
+ const unsigned int n_tile_rows,
+ const unsigned int n_tile_cols,
+ const __fp16 *inptr,
+ int64_t ld_input_row,
+ int64_t ld_input_col,
+ __fp16 *outptr,
+ int64_t ld_output_row,
+ int64_t ld_output_col,
+ const void *params,
+ unsigned int n_channels,
+ const __fp16 activation_min,
+ const __fp16 activation_max
+)
+{
+ struct Args
+ {
+ const uint64_t n_tile_rows, n_tile_cols;
+ const __fp16 *inptr;
+ const uint64_t ld_input_row;
+ const uint64_t ld_input_col;
+ __fp16 *outptr;
+ const uint64_t ld_output_row;
+ const uint64_t ld_output_col;
+ const void *params;
+ const __fp16 min, max;
+
+ uint64_t tile_i = 0, tile_j = 0;
+
+ Args(
+ const unsigned int n_tile_rows,
+ const unsigned int n_tile_cols,
+ const __fp16 *inptr,
+ int64_t ld_input_row,
+ int64_t ld_input_col,
+ __fp16 *outptr,
+ int64_t ld_output_row,
+ int64_t ld_output_col,
+ const void *params,
+ const float activation_min,
+ const float activation_max
+ ) : n_tile_rows(n_tile_rows), n_tile_cols(n_tile_cols), inptr(inptr),
+ ld_input_row(ld_input_row), ld_input_col(ld_input_col), outptr(outptr),
+ ld_output_row(ld_output_row), ld_output_col(ld_output_col),
+ params(params), min(activation_min), max(activation_max)
+ {
+ }
+ };
+
+ Args params_struct(
+ n_tile_rows, n_tile_cols,
+ inptr, ld_input_row, ld_input_col,
+ outptr, ld_output_row, ld_output_col,
+ params, activation_min, activation_max
+ );
+
+ __asm__ __volatile__(
+ "mov x24, #0x0\n"
+ "mov x23, #0x0\n"
+ "1:" // Tile loop
+ "str x24, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "mov x27, #0x3\n"
+ "mov x26, #0x3\n"
+ "str x23, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "ldr x25, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
+ "ldr x22, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
+ "mul x21, x24, x25\n" // offset = tile_i * ld_input_row
+ "ldr x8, [%x[params_struct], %[offsetof_args_ld_input_col]]\n"
+ "ldr x17, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
+ "mul x20, x24, x22\n" // offset = tile_i * ld_output_row
+ "mov x24, #0x10\n" // cntb _, ALL, #1
+ "madd x21, x23, x8, x21\n" // offset += tile_j * ld_input_col
+ "ldr x16, [%x[params_struct], %[offsetof_args_inptr]]\n"
+ "lsl x8, x8, #0x1\n"
+ "ldr x15, [%x[params_struct], %[offsetof_args_outptr]]\n"
+ "madd x20, x23, x17, x20\n" // offset += tile_j * ld_output_col
+ "lsl x17, x17, #0x1\n"
+ "lsr x23, %x[n_channels], #0x3\n"
+ "ldr x14, [%x[params_struct], %[offsetof_args_params]]\n"
+ "mul x21, x21, x27\n" // offset *= kernel_stride * output_size
+ "add x16, x16, x21, LSL #1\n" // inptr[0] += offset * sizeof(__fp16)
+ "add x13, x16, x25, LSL #1\n"
+ "mul x20, x20, x26\n" // offset *= output_tile_size
+ "add x12, x13, x25, LSL #1\n"
+ "add x11, x8, x8\n"
+ "add x15, x15, x20, LSL #1\n" // outptrs[0] += offset * sizeof(__fp16)
+ "add x10, x12, x25, LSL #1\n"
+ "add x9, x11, x8\n"
+ "add x28, x15, x22, LSL #1\n"
+ "add x20, %x[params_struct], %[offsetof_args_min]\n"
+ "ld1r { v15.8h }, [x20]\n"
+ "add x20, %x[params_struct], %[offsetof_args_max]\n"
+ "ld1r { v14.8h }, [x20]\n"
+ "add x27, x10, x25, LSL #1\n"
+ "add x26, x9, x8\n"
+ "add x25, x28, x22, LSL #1\n"
+ "add x22, x17, x17\n"
+ "mov x21, #0x0\n"
+ "sub x20, XZR, x24\n"
+ "cbz x23, 4f\n"
+ "ldr q31, [x14, #0x0]\n"
+ "ldr q0, [x14, #0x10]\n"
+ "cmp x24, x23, LSL #4\n"
+ "ldr q1, [x14, #0x20]\n"
+ "ldr q2, [x14, #0x30]\n"
+ "ldr q3, [x14, #0x40]\n"
+ "ldr q4, [x14, #0x50]\n"
+ "ldr q5, [x14, #0x60]\n"
+ "ldr q6, [x14, #0x70]\n"
+ "ldr q7, [x14, #0x80]\n"
+ "ldr q8, [x14, #0x90]\n"
+ "add x14, x14, #0xa0\n"
+ "ldr q9, [x12, x11]\n"
+ "ld1 { v10.8h }, [x16]\n"
+ "ldr q11, [x16, x26]\n"
+ "ld1 { v12.8h }, [x27]\n"
+ "ldr q13, [x13, x11]\n"
+ "bge 3f\n"
+ "2:" // Tile loop: Channel loop
+ "mov v29.16b, v31.16b\n fmla v29.8h, v7.8h, v9.8h\n"
+ "mov v28.16b, v31.16b\n fmla v28.8h, v8.8h, v9.8h\n"
+ "add x24, x24, #0x10\n"
+ "cmp x24, x23, LSL #4\n"
+ "mov v27.16b, v31.16b\n fmla v27.8h, v6.8h, v9.8h\n"
+ "fmla v29.8h, v4.8h, v13.8h\n"
+ "add x20, x20, #0x10\n"
+ "add x21, x21, #0x10\n"
+ "mov v26.16b, v31.16b\n fmla v26.8h, v5.8h, v9.8h\n"
+ "mov v25.16b, v31.16b\n fmla v25.8h, v4.8h, v9.8h\n"
+ "mov v24.16b, v31.16b\n fmla v24.8h, v3.8h, v9.8h\n"
+ "fmla v28.8h, v0.8h, v10.8h\n"
+ "ldr q23, [x12, x9]\n"
+ "fmla v27.8h, v2.8h, v11.8h\n"
+ "ldr q18, [x12, x8]\n"
+ "mov v22.16b, v31.16b\n fmla v22.8h, v2.8h, v9.8h\n"
+ "fmla v29.8h, v6.8h, v18.8h\n"
+ "mov v21.16b, v31.16b\n fmla v21.8h, v0.8h, v9.8h\n"
+ "fmla v28.8h, v5.8h, v13.8h\n"
+ "fmla v27.8h, v3.8h, v13.8h\n"
+ "fmla v26.8h, v2.8h, v13.8h\n"
+ "fmla v25.8h, v1.8h, v13.8h\n"
+ "fmla v24.8h, v0.8h, v13.8h\n"
+ "ldr q17, [x16, x8]\n"
+ "fmla v22.8h, v6.8h, v12.8h\n"
+ "ldr q16, [x27, x26]\n"
+ "mov v20.16b, v31.16b\n fmla v20.8h, v1.8h, v9.8h\n"
+ "ldr q31, [x14, #0x0]\n"
+ "fmla v29.8h, v0.8h, v17.8h\n"
+ "fmla v21.8h, v8.8h, v16.8h\n"
+ "ldr q16, [x16, x9]\n"
+ "fmla v28.8h, v7.8h, v18.8h\n"
+ "fmla v20.8h, v0.8h, v18.8h\n"
+ "fmla v26.8h, v4.8h, v18.8h\n"
+ "fmla v25.8h, v3.8h, v18.8h\n"
+ "fmla v22.8h, v1.8h, v18.8h\n"
+ "ld1 { v19.8h }, [x13]\n"
+ "fmla v29.8h, v2.8h, v16.8h\n"
+ "fmla v27.8h, v1.8h, v16.8h\n"
+ "ld1 { v18.8h }, [x10]\n"
+ "fmla v24.8h, v4.8h, v23.8h\n"
+ "fmla v28.8h, v1.8h, v17.8h\n"
+ "ldr q16, [x13, x26]\n"
+ "fmla v20.8h, v2.8h, v23.8h\n"
+ "fmla v21.8h, v1.8h, v23.8h\n"
+ "fmla v29.8h, v8.8h, v23.8h\n"
+ "fmla v27.8h, v7.8h, v23.8h\n"
+ "fmla v25.8h, v5.8h, v23.8h\n"
+ "ldr q17, [x10, x11]\n"
+ "fmla v26.8h, v0.8h, v19.8h\n"
+ "fmla v22.8h, v3.8h, v18.8h\n"
+ "fmla v24.8h, v2.8h, v16.8h\n"
+ "fmla v20.8h, v4.8h, v17.8h\n"
+ "fmla v21.8h, v3.8h, v17.8h\n"
+ "fmla v28.8h, v3.8h, v19.8h\n"
+ "ldr q19, [x10, x26]\n"
+ "fmla v27.8h, v5.8h, v16.8h\n"
+ "ldr q16, [x27, x8]\n"
+ "fmla v26.8h, v6.8h, v18.8h\n"
+ "ldr q18, [x13, x8]\n"
+ "fmla v25.8h, v7.8h, v17.8h\n"
+ "fmla v22.8h, v5.8h, v17.8h\n"
+ "fmla v24.8h, v6.8h, v17.8h\n"
+ "fmla v21.8h, v5.8h, v19.8h\n"
+ "fmla v20.8h, v6.8h, v16.8h\n"
+ "fmla v26.8h, v8.8h, v17.8h\n"
+ "fmla v22.8h, v7.8h, v16.8h\n"
+ "ldr q17, [x27, x9]\n"
+ "fmla v29.8h, v3.8h, v18.8h\n"
+ "fmla v25.8h, v0.8h, v18.8h\n"
+ "fmla v24.8h, v8.8h, v19.8h\n"
+ "ldr q16, [x13, x9]\n"
+ "fmla v20.8h, v8.8h, v17.8h\n"
+ "add x13, x13, #0x10\n"
+ "fmla v21.8h, v7.8h, v17.8h\n"
+ "ldr q19, [x10, x9]\n"
+ "fmla v28.8h, v4.8h, v18.8h\n"
+ "fmla v26.8h, v1.8h, v18.8h\n"
+ "ldr q17, [x10, x8]\n"
+ "fmla v29.8h, v5.8h, v16.8h\n"
+ "add x10, x10, #0x10\n"
+ "fmla v27.8h, v4.8h, v16.8h\n"
+ "fmla v25.8h, v2.8h, v16.8h\n"
+ "fmla v24.8h, v1.8h, v16.8h\n"
+ "ldr q16, [x16, x11]\n"
+ "fmla v22.8h, v4.8h, v17.8h\n"
+ "add x16, x16, #0x10\n"
+ "ld1 { v10.8h }, [x16]\n"
+ "fmla v20.8h, v3.8h, v17.8h\n"
+ "fmla v21.8h, v4.8h, v19.8h\n"
+ "ldr q4, [x14, #0x50]\n"
+ "fmla v26.8h, v7.8h, v17.8h\n"
+ "fmla v25.8h, v6.8h, v17.8h\n"
+ "ld1 { v18.8h }, [x12]\n"
+ "fmla v28.8h, v2.8h, v16.8h\n"
+ "fmla v29.8h, v1.8h, v16.8h\n"
+ "ldr q1, [x14, #0x20]\n"
+ "fmax v29.8h, v29.8h, v15.8h\n"
+ "fmla v27.8h, v0.8h, v16.8h\n"
+ "ldr q17, [x12, x26]\n"
+ "fmla v24.8h, v7.8h, v19.8h\n"
+ "add x12, x12, #0x10\n"
+ "ldr q9, [x12, x11]\n"
+ "fmla v20.8h, v5.8h, v19.8h\n"
+ "fmla v22.8h, v0.8h, v18.8h\n"
+ "ldr q0, [x14, #0x10]\n"
+ "fmla v21.8h, v2.8h, v17.8h\n"
+ "ldr q2, [x14, #0x30]\n"
+ "fmla v25.8h, v8.8h, v19.8h\n"
+ "ldr q16, [x27, x11]\n"
+ "fmla v28.8h, v6.8h, v18.8h\n"
+ "fmla v26.8h, v3.8h, v18.8h\n"
+ "ldr q3, [x14, #0x40]\n"
+ "fmax v28.8h, v28.8h, v15.8h\n"
+ "fmla v27.8h, v8.8h, v17.8h\n"
+ "fmla v24.8h, v5.8h, v17.8h\n"
+ "ldr q11, [x16, x26]\n"
+ "ldr q5, [x14, #0x60]\n"
+ "fmla v22.8h, v8.8h, v16.8h\n"
+ "ldr q8, [x14, #0x90]\n"
+ "fmla v20.8h, v7.8h, v16.8h\n"
+ "ldr q7, [x14, #0x80]\n"
+ "fmla v21.8h, v6.8h, v16.8h\n"
+ "ldr q13, [x13, x11]\n"
+ "ldr q6, [x14, #0x70]\n"
+ "fmax v27.8h, v27.8h, v15.8h\n"
+ "fmax v26.8h, v26.8h, v15.8h\n"
+ "fmax v25.8h, v25.8h, v15.8h\n"
+ "add x27, x27, #0x10\n"
+ "ld1 { v12.8h }, [x27]\n"
+ "fmax v24.8h, v24.8h, v15.8h\n"
+ "fmax v22.8h, v22.8h, v15.8h\n"
+ "add x14, x14, #0xa0\n"
+ "fmax v20.8h, v20.8h, v15.8h\n"
+ "fmax v21.8h, v21.8h, v15.8h\n"
+ "fmin v28.8h, v28.8h, v14.8h\n"
+ "fmin v29.8h, v29.8h, v14.8h\n"
+ "st1 { v28.8h }, [x15]\n"
+ "fmin v27.8h, v27.8h, v14.8h\n"
+ "fmin v26.8h, v26.8h, v14.8h\n"
+ "str q29, [x15, x17]\n"
+ "fmin v25.8h, v25.8h, v14.8h\n"
+ "fmin v24.8h, v24.8h, v14.8h\n"
+ "str q27, [x15, x22]\n"
+ "add x15, x15, #0x10\n"
+ "fmin v22.8h, v22.8h, v14.8h\n"
+ "fmin v20.8h, v20.8h, v14.8h\n"
+ "st1 { v26.8h }, [x28]\n"
+ "fmin v21.8h, v21.8h, v14.8h\n"
+ "str q25, [x28, x17]\n"
+ "str q24, [x28, x22]\n"
+ "add x28, x28, #0x10\n"
+ "st1 { v22.8h }, [x25]\n"
+ "str q20, [x25, x17]\n"
+ "str q21, [x25, x22]\n"
+ "add x25, x25, #0x10\n"
+ "blt 2b\n"
+ "3:" // Tile loop: Channel tail
+ "mov v29.16b, v31.16b\n fmla v29.8h, v7.8h, v9.8h\n"
+ "mov v28.16b, v31.16b\n fmla v28.8h, v8.8h, v9.8h\n"
+ "mov v27.16b, v31.16b\n fmla v27.8h, v6.8h, v9.8h\n"
+ "fmla v29.8h, v4.8h, v13.8h\n"
+ "mov v26.16b, v31.16b\n fmla v26.8h, v5.8h, v9.8h\n"
+ "mov v25.16b, v31.16b\n fmla v25.8h, v4.8h, v9.8h\n"
+ "mov v24.16b, v31.16b\n fmla v24.8h, v3.8h, v9.8h\n"
+ "fmla v28.8h, v0.8h, v10.8h\n"
+ "ldr q23, [x12, x9]\n"
+ "fmla v27.8h, v2.8h, v11.8h\n"
+ "ldr q18, [x12, x8]\n"
+ "mov v22.16b, v31.16b\n fmla v22.8h, v2.8h, v9.8h\n"
+ "fmla v29.8h, v6.8h, v18.8h\n"
+ "mov v21.16b, v31.16b\n fmla v21.8h, v0.8h, v9.8h\n"
+ "fmla v28.8h, v5.8h, v13.8h\n"
+ "fmla v27.8h, v3.8h, v13.8h\n"
+ "fmla v26.8h, v2.8h, v13.8h\n"
+ "fmla v25.8h, v1.8h, v13.8h\n"
+ "fmla v24.8h, v0.8h, v13.8h\n"
+ "ldr q17, [x16, x8]\n"
+ "fmla v22.8h, v6.8h, v12.8h\n"
+ "ldr q16, [x27, x26]\n"
+ "mov v20.16b, v31.16b\n fmla v20.8h, v1.8h, v9.8h\n"
+ "fmla v29.8h, v0.8h, v17.8h\n"
+ "fmla v21.8h, v8.8h, v16.8h\n"
+ "ldr q16, [x16, x9]\n"
+ "fmla v28.8h, v7.8h, v18.8h\n"
+ "fmla v20.8h, v0.8h, v18.8h\n"
+ "fmla v26.8h, v4.8h, v18.8h\n"
+ "fmla v25.8h, v3.8h, v18.8h\n"
+ "fmla v22.8h, v1.8h, v18.8h\n"
+ "ld1 { v19.8h }, [x13]\n"
+ "fmla v29.8h, v2.8h, v16.8h\n"
+ "fmla v27.8h, v1.8h, v16.8h\n"
+ "ld1 { v18.8h }, [x10]\n"
+ "fmla v24.8h, v4.8h, v23.8h\n"
+ "fmla v28.8h, v1.8h, v17.8h\n"
+ "ldr q16, [x13, x26]\n"
+ "fmla v20.8h, v2.8h, v23.8h\n"
+ "fmla v21.8h, v1.8h, v23.8h\n"
+ "fmla v29.8h, v8.8h, v23.8h\n"
+ "fmla v27.8h, v7.8h, v23.8h\n"
+ "fmla v25.8h, v5.8h, v23.8h\n"
+ "ldr q17, [x10, x11]\n"
+ "fmla v26.8h, v0.8h, v19.8h\n"
+ "fmla v22.8h, v3.8h, v18.8h\n"
+ "fmla v24.8h, v2.8h, v16.8h\n"
+ "fmla v20.8h, v4.8h, v17.8h\n"
+ "fmla v21.8h, v3.8h, v17.8h\n"
+ "fmla v28.8h, v3.8h, v19.8h\n"
+ "ldr q19, [x10, x26]\n"
+ "fmla v27.8h, v5.8h, v16.8h\n"
+ "ldr q16, [x27, x8]\n"
+ "fmla v26.8h, v6.8h, v18.8h\n"
+ "ldr q18, [x13, x8]\n"
+ "fmla v25.8h, v7.8h, v17.8h\n"
+ "fmla v22.8h, v5.8h, v17.8h\n"
+ "fmla v24.8h, v6.8h, v17.8h\n"
+ "fmla v21.8h, v5.8h, v19.8h\n"
+ "fmla v20.8h, v6.8h, v16.8h\n"
+ "fmla v26.8h, v8.8h, v17.8h\n"
+ "fmla v22.8h, v7.8h, v16.8h\n"
+ "ldr q17, [x27, x9]\n"
+ "fmla v29.8h, v3.8h, v18.8h\n"
+ "fmla v25.8h, v0.8h, v18.8h\n"
+ "fmla v24.8h, v8.8h, v19.8h\n"
+ "ldr q16, [x13, x9]\n"
+ "fmla v20.8h, v8.8h, v17.8h\n"
+ "add x13, x13, #0x10\n"
+ "fmla v21.8h, v7.8h, v17.8h\n"
+ "ldr q19, [x10, x9]\n"
+ "fmla v28.8h, v4.8h, v18.8h\n"
+ "fmla v26.8h, v1.8h, v18.8h\n"
+ "ldr q17, [x10, x8]\n"
+ "fmla v29.8h, v5.8h, v16.8h\n"
+ "add x10, x10, #0x10\n"
+ "fmla v27.8h, v4.8h, v16.8h\n"
+ "fmla v25.8h, v2.8h, v16.8h\n"
+ "fmla v24.8h, v1.8h, v16.8h\n"
+ "ldr q16, [x16, x11]\n"
+ "fmla v22.8h, v4.8h, v17.8h\n"
+ "add x16, x16, #0x10\n"
+ "fmla v20.8h, v3.8h, v17.8h\n"
+ "fmla v21.8h, v4.8h, v19.8h\n"
+ "fmla v26.8h, v7.8h, v17.8h\n"
+ "fmla v25.8h, v6.8h, v17.8h\n"
+ "ld1 { v18.8h }, [x12]\n"
+ "fmla v28.8h, v2.8h, v16.8h\n"
+ "fmla v29.8h, v1.8h, v16.8h\n"
+ "fmax v29.8h, v29.8h, v15.8h\n"
+ "fmla v27.8h, v0.8h, v16.8h\n"
+ "ldr q17, [x12, x26]\n"
+ "fmla v24.8h, v7.8h, v19.8h\n"
+ "fmin v29.8h, v29.8h, v14.8h\n"
+ "fmla v20.8h, v5.8h, v19.8h\n"
+ "fmla v22.8h, v0.8h, v18.8h\n"
+ "add x12, x12, #0x10\n"
+ "fmla v21.8h, v2.8h, v17.8h\n"
+ "fmla v25.8h, v8.8h, v19.8h\n"
+ "ldr q16, [x27, x11]\n"
+ "fmax v25.8h, v25.8h, v15.8h\n"
+ "fmla v28.8h, v6.8h, v18.8h\n"
+ "fmla v26.8h, v3.8h, v18.8h\n"
+ "fmax v28.8h, v28.8h, v15.8h\n"
+ "add x27, x27, #0x10\n"
+ "fmla v27.8h, v8.8h, v17.8h\n"
+ "fmla v24.8h, v5.8h, v17.8h\n"
+ "fmax v27.8h, v27.8h, v15.8h\n"
+ "fmla v22.8h, v8.8h, v16.8h\n"
+ "fmla v20.8h, v7.8h, v16.8h\n"
+ "fmax v26.8h, v26.8h, v15.8h\n"
+ "fmla v21.8h, v6.8h, v16.8h\n"
+ "fmax v24.8h, v24.8h, v15.8h\n"
+ "fmax v22.8h, v22.8h, v15.8h\n"
+ "fmax v20.8h, v20.8h, v15.8h\n"
+ "fmax v21.8h, v21.8h, v15.8h\n"
+ "fmin v28.8h, v28.8h, v14.8h\n"
+ "st1 { v28.8h }, [x15]\n"
+ "fmin v27.8h, v27.8h, v14.8h\n"
+ "fmin v26.8h, v26.8h, v14.8h\n"
+ "str q29, [x15, x17]\n"
+ "fmin v25.8h, v25.8h, v14.8h\n"
+ "fmin v24.8h, v24.8h, v14.8h\n"
+ "str q27, [x15, x22]\n"
+ "add x15, x15, #0x10\n"
+ "fmin v22.8h, v22.8h, v14.8h\n"
+ "fmin v20.8h, v20.8h, v14.8h\n"
+ "st1 { v26.8h }, [x28]\n"
+ "fmin v21.8h, v21.8h, v14.8h\n"
+ "str q25, [x28, x17]\n"
+ "str q24, [x28, x22]\n"
+ "add x28, x28, #0x10\n"
+ "st1 { v22.8h }, [x25]\n"
+ "str q20, [x25, x17]\n"
+ "str q21, [x25, x22]\n"
+ "add x25, x25, #0x10\n"
+ "4:" // Tile loop: Oddments
+ "tst %x[n_channels], #0x7\n"
+ "beq 93f\n"
+ "ldr q31, [x14, #0x0]\n"
+ "ldr q0, [x14, #0x10]\n"
+ "add x24, x12, x11\n"
+ "add x23, x16, XZR\n"
+ "ldr q1, [x14, #0x20]\n"
+ "ldr q2, [x14, #0x30]\n"
+ "add x22, x16, x26\n"
+ "add x21, x27, XZR\n"
+ "ldr q3, [x14, #0x40]\n"
+ "ldr q4, [x14, #0x50]\n"
+ "add x20, x13, x11\n"
+ "ldr q5, [x14, #0x60]\n"
+ "ldr q6, [x14, #0x70]\n"
+ "ldr q7, [x14, #0x80]\n"
+ "ldr q8, [x14, #0x90]\n"
+ "tbz %x[n_channels], #2, 6f\n"
+ "ldr d9, [x24], #0x8\n"
+ "ldr d10, [x23], #0x8\n"
+ "ldr d11, [x22], #0x8\n"
+ "ldr d12, [x21], #0x8\n"
+ "ldr d13, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 5f\n"
+ "ld1 { v9.s }[2], [x24], #0x4\n"
+ "ld1 { v10.s }[2], [x23], #0x4\n"
+ "ld1 { v11.s }[2], [x22], #0x4\n"
+ "ld1 { v12.s }[2], [x21], #0x4\n"
+ "ld1 { v13.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 8f\n"
+ "ld1 { v9.h }[6], [x24]\n"
+ "ld1 { v10.h }[6], [x23]\n"
+ "ld1 { v11.h }[6], [x22]\n"
+ "ld1 { v12.h }[6], [x21]\n"
+ "ld1 { v13.h }[6], [x20]\n"
+ "b 8f\n"
+ "5:" // Tile loop: Oddments: Load inputs: (2, 2), (0, 0), (0, 4), (4, 0), (1, 2): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 8f\n"
+ "ld1 { v9.h }[4], [x24]\n"
+ "ld1 { v10.h }[4], [x23]\n"
+ "ld1 { v11.h }[4], [x22]\n"
+ "ld1 { v12.h }[4], [x21]\n"
+ "ld1 { v13.h }[4], [x20]\n"
+ "b 8f\n"
+ "6:" // Tile loop: Oddments: Load inputs: (2, 2), (0, 0), (0, 4), (4, 0), (1, 2): Bit 2: Unset
+ "tbz %x[n_channels], #1, 7f\n"
+ "ldr s9, [x24], #0x4\n"
+ "ldr s10, [x23], #0x4\n"
+ "ldr s11, [x22], #0x4\n"
+ "ldr s12, [x21], #0x4\n"
+ "ldr s13, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 8f\n"
+ "ld1 { v9.h }[2], [x24]\n"
+ "ld1 { v10.h }[2], [x23]\n"
+ "ld1 { v11.h }[2], [x22]\n"
+ "ld1 { v12.h }[2], [x21]\n"
+ "ld1 { v13.h }[2], [x20]\n"
+ "b 8f\n"
+ "7:" // Tile loop: Oddments: Load inputs: (2, 2), (0, 0), (0, 4), (4, 0), (1, 2): Bit 2: Unset: Bit 1: Unset
+ "ldr h9, [x24, #0x0]\n"
+ "ldr h10, [x23, #0x0]\n"
+ "ldr h11, [x22, #0x0]\n"
+ "ldr h12, [x21, #0x0]\n"
+ "ldr h13, [x20, #0x0]\n"
+ "8:" // Tile loop: Oddments: Load inputs: (2, 2), (0, 0), (0, 4), (4, 0), (1, 2): Bit 2: End
+ "mov v23.16b, v31.16b\n fmla v23.8h, v8.8h, v9.8h\n"
+ "mov v25.16b, v31.16b\n fmla v25.8h, v6.8h, v9.8h\n"
+ "add x20, x27, x26\n"
+ "mov v24.16b, v31.16b\n fmla v24.8h, v7.8h, v9.8h\n"
+ "mov v26.16b, v31.16b\n fmla v26.8h, v5.8h, v9.8h\n"
+ "mov v27.16b, v31.16b\n fmla v27.8h, v4.8h, v9.8h\n"
+ "mov v28.16b, v31.16b\n fmla v28.8h, v3.8h, v9.8h\n"
+ "mov v29.16b, v31.16b\n fmla v29.8h, v2.8h, v9.8h\n"
+ "fmla v23.8h, v0.8h, v10.8h\n"
+ "fmla v25.8h, v2.8h, v11.8h\n"
+ "mov v30.16b, v31.16b\n fmla v30.8h, v1.8h, v9.8h\n"
+ "fmla v31.8h, v0.8h, v9.8h\n"
+ "fmla v29.8h, v6.8h, v12.8h\n"
+ "fmla v23.8h, v5.8h, v13.8h\n"
+ "fmla v24.8h, v4.8h, v13.8h\n"
+ "fmla v25.8h, v3.8h, v13.8h\n"
+ "fmla v26.8h, v2.8h, v13.8h\n"
+ "fmla v27.8h, v1.8h, v13.8h\n"
+ "fmla v28.8h, v0.8h, v13.8h\n"
+ "tbz %x[n_channels], #2, 10f\n"
+ "ldr d12, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 9f\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 12f\n"
+ "ld1 { v12.h }[6], [x20]\n"
+ "b 12f\n"
+ "9:" // Tile loop: Oddments: Load inputs: (4, 4): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 12f\n"
+ "ld1 { v12.h }[4], [x20]\n"
+ "b 12f\n"
+ "10:" // Tile loop: Oddments: Load inputs: (4, 4): Bit 2: Unset
+ "tbz %x[n_channels], #1, 11f\n"
+ "ldr s12, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 12f\n"
+ "ld1 { v12.h }[2], [x20]\n"
+ "b 12f\n"
+ "11:" // Tile loop: Oddments: Load inputs: (4, 4): Bit 2: Unset: Bit 1: Unset
+ "ldr h12, [x20, #0x0]\n"
+ "12:" // Tile loop: Oddments: Load inputs: (4, 4): Bit 2: End
+ "fmla v31.8h, v8.8h, v12.8h\n"
+ "add x20, x12, x8\n"
+ "tbz %x[n_channels], #2, 14f\n"
+ "ldr d11, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 13f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 16f\n"
+ "ld1 { v11.h }[6], [x20]\n"
+ "b 16f\n"
+ "13:" // Tile loop: Oddments: Load inputs: (2, 1): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 16f\n"
+ "ld1 { v11.h }[4], [x20]\n"
+ "b 16f\n"
+ "14:" // Tile loop: Oddments: Load inputs: (2, 1): Bit 2: Unset
+ "tbz %x[n_channels], #1, 15f\n"
+ "ldr s11, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 16f\n"
+ "ld1 { v11.h }[2], [x20]\n"
+ "b 16f\n"
+ "15:" // Tile loop: Oddments: Load inputs: (2, 1): Bit 2: Unset: Bit 1: Unset
+ "ldr h11, [x20, #0x0]\n"
+ "16:" // Tile loop: Oddments: Load inputs: (2, 1): Bit 2: End
+ "fmla v23.8h, v7.8h, v11.8h\n"
+ "fmla v24.8h, v6.8h, v11.8h\n"
+ "add x20, x16, x8\n"
+ "fmla v26.8h, v4.8h, v11.8h\n"
+ "fmla v27.8h, v3.8h, v11.8h\n"
+ "fmla v29.8h, v1.8h, v11.8h\n"
+ "fmla v30.8h, v0.8h, v11.8h\n"
+ "tbz %x[n_channels], #2, 18f\n"
+ "ldr d13, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 17f\n"
+ "ld1 { v13.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 20f\n"
+ "ld1 { v13.h }[6], [x20]\n"
+ "b 20f\n"
+ "17:" // Tile loop: Oddments: Load inputs: (0, 1): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 20f\n"
+ "ld1 { v13.h }[4], [x20]\n"
+ "b 20f\n"
+ "18:" // Tile loop: Oddments: Load inputs: (0, 1): Bit 2: Unset
+ "tbz %x[n_channels], #1, 19f\n"
+ "ldr s13, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 20f\n"
+ "ld1 { v13.h }[2], [x20]\n"
+ "b 20f\n"
+ "19:" // Tile loop: Oddments: Load inputs: (0, 1): Bit 2: Unset: Bit 1: Unset
+ "ldr h13, [x20, #0x0]\n"
+ "20:" // Tile loop: Oddments: Load inputs: (0, 1): Bit 2: End
+ "fmla v23.8h, v1.8h, v13.8h\n"
+ "fmla v24.8h, v0.8h, v13.8h\n"
+ "add x20, x16, x9\n"
+ "tbz %x[n_channels], #2, 22f\n"
+ "ldr d12, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 21f\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 24f\n"
+ "ld1 { v12.h }[6], [x20]\n"
+ "b 24f\n"
+ "21:" // Tile loop: Oddments: Load inputs: (0, 3): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 24f\n"
+ "ld1 { v12.h }[4], [x20]\n"
+ "b 24f\n"
+ "22:" // Tile loop: Oddments: Load inputs: (0, 3): Bit 2: Unset
+ "tbz %x[n_channels], #1, 23f\n"
+ "ldr s12, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 24f\n"
+ "ld1 { v12.h }[2], [x20]\n"
+ "b 24f\n"
+ "23:" // Tile loop: Oddments: Load inputs: (0, 3): Bit 2: Unset: Bit 1: Unset
+ "ldr h12, [x20, #0x0]\n"
+ "24:" // Tile loop: Oddments: Load inputs: (0, 3): Bit 2: End
+ "fmla v24.8h, v2.8h, v12.8h\n"
+ "fmla v25.8h, v1.8h, v12.8h\n"
+ "add x20, x12, x9\n"
+ "tbz %x[n_channels], #2, 26f\n"
+ "ldr d10, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 25f\n"
+ "ld1 { v10.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 28f\n"
+ "ld1 { v10.h }[6], [x20]\n"
+ "b 28f\n"
+ "25:" // Tile loop: Oddments: Load inputs: (2, 3): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 28f\n"
+ "ld1 { v10.h }[4], [x20]\n"
+ "b 28f\n"
+ "26:" // Tile loop: Oddments: Load inputs: (2, 3): Bit 2: Unset
+ "tbz %x[n_channels], #1, 27f\n"
+ "ldr s10, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 28f\n"
+ "ld1 { v10.h }[2], [x20]\n"
+ "b 28f\n"
+ "27:" // Tile loop: Oddments: Load inputs: (2, 3): Bit 2: Unset: Bit 1: Unset
+ "ldr h10, [x20, #0x0]\n"
+ "28:" // Tile loop: Oddments: Load inputs: (2, 3): Bit 2: End
+ "fmla v24.8h, v8.8h, v10.8h\n"
+ "fmla v25.8h, v7.8h, v10.8h\n"
+ "add x20, x13, XZR\n"
+ "fmla v27.8h, v5.8h, v10.8h\n"
+ "fmla v28.8h, v4.8h, v10.8h\n"
+ "fmla v30.8h, v2.8h, v10.8h\n"
+ "fmla v31.8h, v1.8h, v10.8h\n"
+ "tbz %x[n_channels], #2, 30f\n"
+ "ldr d11, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 29f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 32f\n"
+ "ld1 { v11.h }[6], [x20]\n"
+ "b 32f\n"
+ "29:" // Tile loop: Oddments: Load inputs: (1, 0): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 32f\n"
+ "ld1 { v11.h }[4], [x20]\n"
+ "b 32f\n"
+ "30:" // Tile loop: Oddments: Load inputs: (1, 0): Bit 2: Unset
+ "tbz %x[n_channels], #1, 31f\n"
+ "ldr s11, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 32f\n"
+ "ld1 { v11.h }[2], [x20]\n"
+ "b 32f\n"
+ "31:" // Tile loop: Oddments: Load inputs: (1, 0): Bit 2: Unset: Bit 1: Unset
+ "ldr h11, [x20, #0x0]\n"
+ "32:" // Tile loop: Oddments: Load inputs: (1, 0): Bit 2: End
+ "fmla v23.8h, v3.8h, v11.8h\n"
+ "fmla v26.8h, v0.8h, v11.8h\n"
+ "add x20, x13, x26\n"
+ "tbz %x[n_channels], #2, 34f\n"
+ "ldr d13, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 33f\n"
+ "ld1 { v13.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 36f\n"
+ "ld1 { v13.h }[6], [x20]\n"
+ "b 36f\n"
+ "33:" // Tile loop: Oddments: Load inputs: (1, 4): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 36f\n"
+ "ld1 { v13.h }[4], [x20]\n"
+ "b 36f\n"
+ "34:" // Tile loop: Oddments: Load inputs: (1, 4): Bit 2: Unset
+ "tbz %x[n_channels], #1, 35f\n"
+ "ldr s13, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 36f\n"
+ "ld1 { v13.h }[2], [x20]\n"
+ "b 36f\n"
+ "35:" // Tile loop: Oddments: Load inputs: (1, 4): Bit 2: Unset: Bit 1: Unset
+ "ldr h13, [x20, #0x0]\n"
+ "36:" // Tile loop: Oddments: Load inputs: (1, 4): Bit 2: End
+ "fmla v25.8h, v5.8h, v13.8h\n"
+ "fmla v28.8h, v2.8h, v13.8h\n"
+ "add x20, x10, XZR\n"
+ "tbz %x[n_channels], #2, 38f\n"
+ "ldr d12, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 37f\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 40f\n"
+ "ld1 { v12.h }[6], [x20]\n"
+ "b 40f\n"
+ "37:" // Tile loop: Oddments: Load inputs: (3, 0): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 40f\n"
+ "ld1 { v12.h }[4], [x20]\n"
+ "b 40f\n"
+ "38:" // Tile loop: Oddments: Load inputs: (3, 0): Bit 2: Unset
+ "tbz %x[n_channels], #1, 39f\n"
+ "ldr s12, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 40f\n"
+ "ld1 { v12.h }[2], [x20]\n"
+ "b 40f\n"
+ "39:" // Tile loop: Oddments: Load inputs: (3, 0): Bit 2: Unset: Bit 1: Unset
+ "ldr h12, [x20, #0x0]\n"
+ "40:" // Tile loop: Oddments: Load inputs: (3, 0): Bit 2: End
+ "fmla v26.8h, v6.8h, v12.8h\n"
+ "fmla v29.8h, v3.8h, v12.8h\n"
+ "add x20, x10, x11\n"
+ "tbz %x[n_channels], #2, 42f\n"
+ "ldr d10, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 41f\n"
+ "ld1 { v10.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 44f\n"
+ "ld1 { v10.h }[6], [x20]\n"
+ "b 44f\n"
+ "41:" // Tile loop: Oddments: Load inputs: (3, 2): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 44f\n"
+ "ld1 { v10.h }[4], [x20]\n"
+ "b 44f\n"
+ "42:" // Tile loop: Oddments: Load inputs: (3, 2): Bit 2: Unset
+ "tbz %x[n_channels], #1, 43f\n"
+ "ldr s10, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 44f\n"
+ "ld1 { v10.h }[2], [x20]\n"
+ "b 44f\n"
+ "43:" // Tile loop: Oddments: Load inputs: (3, 2): Bit 2: Unset: Bit 1: Unset
+ "ldr h10, [x20, #0x0]\n"
+ "44:" // Tile loop: Oddments: Load inputs: (3, 2): Bit 2: End
+ "fmla v26.8h, v8.8h, v10.8h\n"
+ "fmla v27.8h, v7.8h, v10.8h\n"
+ "add x20, x10, x26\n"
+ "fmla v28.8h, v6.8h, v10.8h\n"
+ "fmla v29.8h, v5.8h, v10.8h\n"
+ "fmla v30.8h, v4.8h, v10.8h\n"
+ "fmla v31.8h, v3.8h, v10.8h\n"
+ "tbz %x[n_channels], #2, 46f\n"
+ "ldr d11, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 45f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 48f\n"
+ "ld1 { v11.h }[6], [x20]\n"
+ "b 48f\n"
+ "45:" // Tile loop: Oddments: Load inputs: (3, 4): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 48f\n"
+ "ld1 { v11.h }[4], [x20]\n"
+ "b 48f\n"
+ "46:" // Tile loop: Oddments: Load inputs: (3, 4): Bit 2: Unset
+ "tbz %x[n_channels], #1, 47f\n"
+ "ldr s11, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 48f\n"
+ "ld1 { v11.h }[2], [x20]\n"
+ "b 48f\n"
+ "47:" // Tile loop: Oddments: Load inputs: (3, 4): Bit 2: Unset: Bit 1: Unset
+ "ldr h11, [x20, #0x0]\n"
+ "48:" // Tile loop: Oddments: Load inputs: (3, 4): Bit 2: End
+ "fmla v28.8h, v8.8h, v11.8h\n"
+ "fmla v31.8h, v5.8h, v11.8h\n"
+ "add x20, x27, x8\n"
+ "tbz %x[n_channels], #2, 50f\n"
+ "ldr d13, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 49f\n"
+ "ld1 { v13.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 52f\n"
+ "ld1 { v13.h }[6], [x20]\n"
+ "b 52f\n"
+ "49:" // Tile loop: Oddments: Load inputs: (4, 1): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 52f\n"
+ "ld1 { v13.h }[4], [x20]\n"
+ "b 52f\n"
+ "50:" // Tile loop: Oddments: Load inputs: (4, 1): Bit 2: Unset
+ "tbz %x[n_channels], #1, 51f\n"
+ "ldr s13, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 52f\n"
+ "ld1 { v13.h }[2], [x20]\n"
+ "b 52f\n"
+ "51:" // Tile loop: Oddments: Load inputs: (4, 1): Bit 2: Unset: Bit 1: Unset
+ "ldr h13, [x20, #0x0]\n"
+ "52:" // Tile loop: Oddments: Load inputs: (4, 1): Bit 2: End
+ "fmla v29.8h, v7.8h, v13.8h\n"
+ "fmla v30.8h, v6.8h, v13.8h\n"
+ "add x20, x13, x8\n"
+ "tbz %x[n_channels], #2, 54f\n"
+ "ldr d12, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 53f\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 56f\n"
+ "ld1 { v12.h }[6], [x20]\n"
+ "b 56f\n"
+ "53:" // Tile loop: Oddments: Load inputs: (1, 1): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 56f\n"
+ "ld1 { v12.h }[4], [x20]\n"
+ "b 56f\n"
+ "54:" // Tile loop: Oddments: Load inputs: (1, 1): Bit 2: Unset
+ "tbz %x[n_channels], #1, 55f\n"
+ "ldr s12, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 56f\n"
+ "ld1 { v12.h }[2], [x20]\n"
+ "b 56f\n"
+ "55:" // Tile loop: Oddments: Load inputs: (1, 1): Bit 2: Unset: Bit 1: Unset
+ "ldr h12, [x20, #0x0]\n"
+ "56:" // Tile loop: Oddments: Load inputs: (1, 1): Bit 2: End
+ "fmla v23.8h, v4.8h, v12.8h\n"
+ "fmla v24.8h, v3.8h, v12.8h\n"
+ "add x20, x13, x9\n"
+ "fmla v26.8h, v1.8h, v12.8h\n"
+ "fmla v27.8h, v0.8h, v12.8h\n"
+ "tbz %x[n_channels], #2, 58f\n"
+ "ldr d11, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 57f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 60f\n"
+ "ld1 { v11.h }[6], [x20]\n"
+ "b 60f\n"
+ "57:" // Tile loop: Oddments: Load inputs: (1, 3): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 60f\n"
+ "ld1 { v11.h }[4], [x20]\n"
+ "b 60f\n"
+ "58:" // Tile loop: Oddments: Load inputs: (1, 3): Bit 2: Unset
+ "tbz %x[n_channels], #1, 59f\n"
+ "ldr s11, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 60f\n"
+ "ld1 { v11.h }[2], [x20]\n"
+ "b 60f\n"
+ "59:" // Tile loop: Oddments: Load inputs: (1, 3): Bit 2: Unset: Bit 1: Unset
+ "ldr h11, [x20, #0x0]\n"
+ "60:" // Tile loop: Oddments: Load inputs: (1, 3): Bit 2: End
+ "fmla v24.8h, v5.8h, v11.8h\n"
+ "fmla v25.8h, v4.8h, v11.8h\n"
+ "add x20, x27, x9\n"
+ "fmla v27.8h, v2.8h, v11.8h\n"
+ "fmla v28.8h, v1.8h, v11.8h\n"
+ "tbz %x[n_channels], #2, 62f\n"
+ "ldr d13, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 61f\n"
+ "ld1 { v13.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 64f\n"
+ "ld1 { v13.h }[6], [x20]\n"
+ "b 64f\n"
+ "61:" // Tile loop: Oddments: Load inputs: (4, 3): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 64f\n"
+ "ld1 { v13.h }[4], [x20]\n"
+ "b 64f\n"
+ "62:" // Tile loop: Oddments: Load inputs: (4, 3): Bit 2: Unset
+ "tbz %x[n_channels], #1, 63f\n"
+ "ldr s13, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 64f\n"
+ "ld1 { v13.h }[2], [x20]\n"
+ "b 64f\n"
+ "63:" // Tile loop: Oddments: Load inputs: (4, 3): Bit 2: Unset: Bit 1: Unset
+ "ldr h13, [x20, #0x0]\n"
+ "64:" // Tile loop: Oddments: Load inputs: (4, 3): Bit 2: End
+ "fmla v30.8h, v8.8h, v13.8h\n"
+ "fmla v31.8h, v7.8h, v13.8h\n"
+ "add x20, x10, x8\n"
+ "tbz %x[n_channels], #2, 66f\n"
+ "ldr d12, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 65f\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 68f\n"
+ "ld1 { v12.h }[6], [x20]\n"
+ "b 68f\n"
+ "65:" // Tile loop: Oddments: Load inputs: (3, 1): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 68f\n"
+ "ld1 { v12.h }[4], [x20]\n"
+ "b 68f\n"
+ "66:" // Tile loop: Oddments: Load inputs: (3, 1): Bit 2: Unset
+ "tbz %x[n_channels], #1, 67f\n"
+ "ldr s12, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 68f\n"
+ "ld1 { v12.h }[2], [x20]\n"
+ "b 68f\n"
+ "67:" // Tile loop: Oddments: Load inputs: (3, 1): Bit 2: Unset: Bit 1: Unset
+ "ldr h12, [x20, #0x0]\n"
+ "68:" // Tile loop: Oddments: Load inputs: (3, 1): Bit 2: End
+ "fmla v26.8h, v7.8h, v12.8h\n"
+ "fmla v27.8h, v6.8h, v12.8h\n"
+ "add x20, x16, x11\n"
+ "fmla v29.8h, v4.8h, v12.8h\n"
+ "fmla v30.8h, v3.8h, v12.8h\n"
+ "tbz %x[n_channels], #2, 70f\n"
+ "ldr d11, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 69f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 72f\n"
+ "ld1 { v11.h }[6], [x20]\n"
+ "b 72f\n"
+ "69:" // Tile loop: Oddments: Load inputs: (0, 2): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 72f\n"
+ "ld1 { v11.h }[4], [x20]\n"
+ "b 72f\n"
+ "70:" // Tile loop: Oddments: Load inputs: (0, 2): Bit 2: Unset
+ "tbz %x[n_channels], #1, 71f\n"
+ "ldr s11, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 72f\n"
+ "ld1 { v11.h }[2], [x20]\n"
+ "b 72f\n"
+ "71:" // Tile loop: Oddments: Load inputs: (0, 2): Bit 2: Unset: Bit 1: Unset
+ "ldr h11, [x20, #0x0]\n"
+ "72:" // Tile loop: Oddments: Load inputs: (0, 2): Bit 2: End
+ "fmla v23.8h, v2.8h, v11.8h\n"
+ "fmla v24.8h, v1.8h, v11.8h\n"
+ "add x20, x10, x9\n"
+ "fmla v25.8h, v0.8h, v11.8h\n"
+ "tbz %x[n_channels], #2, 74f\n"
+ "ldr d13, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 73f\n"
+ "ld1 { v13.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 76f\n"
+ "ld1 { v13.h }[6], [x20]\n"
+ "b 76f\n"
+ "73:" // Tile loop: Oddments: Load inputs: (3, 3): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 76f\n"
+ "ld1 { v13.h }[4], [x20]\n"
+ "b 76f\n"
+ "74:" // Tile loop: Oddments: Load inputs: (3, 3): Bit 2: Unset
+ "tbz %x[n_channels], #1, 75f\n"
+ "ldr s13, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 76f\n"
+ "ld1 { v13.h }[2], [x20]\n"
+ "b 76f\n"
+ "75:" // Tile loop: Oddments: Load inputs: (3, 3): Bit 2: Unset: Bit 1: Unset
+ "ldr h13, [x20, #0x0]\n"
+ "76:" // Tile loop: Oddments: Load inputs: (3, 3): Bit 2: End
+ "fmla v27.8h, v8.8h, v13.8h\n"
+ "fmla v28.8h, v7.8h, v13.8h\n"
+ "add x20, x12, XZR\n"
+ "fmla v30.8h, v5.8h, v13.8h\n"
+ "fmla v31.8h, v4.8h, v13.8h\n"
+ "tbz %x[n_channels], #2, 78f\n"
+ "ldr d12, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 77f\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 80f\n"
+ "ld1 { v12.h }[6], [x20]\n"
+ "b 80f\n"
+ "77:" // Tile loop: Oddments: Load inputs: (2, 0): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 80f\n"
+ "ld1 { v12.h }[4], [x20]\n"
+ "b 80f\n"
+ "78:" // Tile loop: Oddments: Load inputs: (2, 0): Bit 2: Unset
+ "tbz %x[n_channels], #1, 79f\n"
+ "ldr s12, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 80f\n"
+ "ld1 { v12.h }[2], [x20]\n"
+ "b 80f\n"
+ "79:" // Tile loop: Oddments: Load inputs: (2, 0): Bit 2: Unset: Bit 1: Unset
+ "ldr h12, [x20, #0x0]\n"
+ "80:" // Tile loop: Oddments: Load inputs: (2, 0): Bit 2: End
+ "fmla v23.8h, v6.8h, v12.8h\n"
+ "fmla v26.8h, v3.8h, v12.8h\n"
+ "add x20, x12, x26\n"
+ "fmla v29.8h, v0.8h, v12.8h\n"
+ "tbz %x[n_channels], #2, 82f\n"
+ "ldr d11, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 81f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 84f\n"
+ "ld1 { v11.h }[6], [x20]\n"
+ "b 84f\n"
+ "81:" // Tile loop: Oddments: Load inputs: (2, 4): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 84f\n"
+ "ld1 { v11.h }[4], [x20]\n"
+ "b 84f\n"
+ "82:" // Tile loop: Oddments: Load inputs: (2, 4): Bit 2: Unset
+ "tbz %x[n_channels], #1, 83f\n"
+ "ldr s11, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 84f\n"
+ "ld1 { v11.h }[2], [x20]\n"
+ "b 84f\n"
+ "83:" // Tile loop: Oddments: Load inputs: (2, 4): Bit 2: Unset: Bit 1: Unset
+ "ldr h11, [x20, #0x0]\n"
+ "84:" // Tile loop: Oddments: Load inputs: (2, 4): Bit 2: End
+ "fmla v25.8h, v8.8h, v11.8h\n"
+ "fmla v28.8h, v5.8h, v11.8h\n"
+ "add x20, x27, x11\n"
+ "fmla v31.8h, v2.8h, v11.8h\n"
+ "tbz %x[n_channels], #2, 86f\n"
+ "ldr d13, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 85f\n"
+ "ld1 { v13.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 88f\n"
+ "ld1 { v13.h }[6], [x20]\n"
+ "b 88f\n"
+ "85:" // Tile loop: Oddments: Load inputs: (4, 2): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 88f\n"
+ "ld1 { v13.h }[4], [x20]\n"
+ "b 88f\n"
+ "86:" // Tile loop: Oddments: Load inputs: (4, 2): Bit 2: Unset
+ "tbz %x[n_channels], #1, 87f\n"
+ "ldr s13, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 88f\n"
+ "ld1 { v13.h }[2], [x20]\n"
+ "b 88f\n"
+ "87:" // Tile loop: Oddments: Load inputs: (4, 2): Bit 2: Unset: Bit 1: Unset
+ "ldr h13, [x20, #0x0]\n"
+ "88:" // Tile loop: Oddments: Load inputs: (4, 2): Bit 2: End
+ "fmla v29.8h, v8.8h, v13.8h\n"
+ "fmla v30.8h, v7.8h, v13.8h\n"
+ "fmax v23.8h, v23.8h, v15.8h\n"
+ "fmla v31.8h, v6.8h, v13.8h\n"
+ "fmax v24.8h, v24.8h, v15.8h\n"
+ "fmax v25.8h, v25.8h, v15.8h\n"
+ "fmax v26.8h, v26.8h, v15.8h\n"
+ "fmax v27.8h, v27.8h, v15.8h\n"
+ "fmax v28.8h, v28.8h, v15.8h\n"
+ "fmax v29.8h, v29.8h, v15.8h\n"
+ "fmax v30.8h, v30.8h, v15.8h\n"
+ "fmax v31.8h, v31.8h, v15.8h\n"
+ "fmin v23.8h, v23.8h, v14.8h\n"
+ "fmin v24.8h, v24.8h, v14.8h\n"
+ "fmin v25.8h, v25.8h, v14.8h\n"
+ "fmin v26.8h, v26.8h, v14.8h\n"
+ "fmin v27.8h, v27.8h, v14.8h\n"
+ "fmin v28.8h, v28.8h, v14.8h\n"
+ "fmin v29.8h, v29.8h, v14.8h\n"
+ "fmin v30.8h, v30.8h, v14.8h\n"
+ "fmin v31.8h, v31.8h, v14.8h\n"
+ "tbz %x[n_channels], #2, 90f\n"
+ "mov x22, x15\n"
+ "mov x21, x28\n"
+ "st1 { v23.d }[0], [x22], x17\n"
+ "mov x20, x25\n"
+ "st1 { v26.d }[0], [x21], x17\n"
+ "add x15, x15, #0x8\n"
+ "st1 { v29.d }[0], [x20], x17\n"
+ "add x28, x28, #0x8\n"
+ "add x25, x25, #0x8\n"
+ "st1 { v24.d }[0], [x22], x17\n"
+ "st1 { v27.d }[0], [x21], x17\n"
+ "st1 { v30.d }[0], [x20], x17\n"
+ "st1 { v25.d }[0], [x22]\n"
+ "st1 { v28.d }[0], [x21]\n"
+ "st1 { v31.d }[0], [x20]\n"
+ "tbz %x[n_channels], #1, 89f\n"
+ "mov x22, x15\n"
+ "mov x21, x28\n"
+ "st1 { v23.s }[2], [x22], x17\n"
+ "mov x20, x25\n"
+ "st1 { v26.s }[2], [x21], x17\n"
+ "add x15, x15, #0x4\n"
+ "st1 { v29.s }[2], [x20], x17\n"
+ "add x28, x28, #0x4\n"
+ "add x25, x25, #0x4\n"
+ "st1 { v24.s }[2], [x22], x17\n"
+ "st1 { v27.s }[2], [x21], x17\n"
+ "st1 { v30.s }[2], [x20], x17\n"
+ "st1 { v25.s }[2], [x22]\n"
+ "st1 { v28.s }[2], [x21]\n"
+ "st1 { v31.s }[2], [x20]\n"
+ "tbz %x[n_channels], #0, 92f\n"
+ "mov x22, x15\n"
+ "mov x21, x28\n"
+ "st1 { v23.h }[6], [x22], x17\n"
+ "mov x20, x25\n"
+ "st1 { v26.h }[6], [x21], x17\n"
+ "st1 { v29.h }[6], [x20], x17\n"
+ "st1 { v24.h }[6], [x22], x17\n"
+ "st1 { v27.h }[6], [x21], x17\n"
+ "st1 { v30.h }[6], [x20], x17\n"
+ "st1 { v25.h }[6], [x22]\n"
+ "st1 { v28.h }[6], [x21]\n"
+ "st1 { v31.h }[6], [x20]\n"
+ "b 92f\n"
+ "89:" // Tile loop: Oddments: Store: Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 92f\n"
+ "mov x22, x15\n"
+ "mov x21, x28\n"
+ "st1 { v23.h }[4], [x22], x17\n"
+ "mov x20, x25\n"
+ "st1 { v26.h }[4], [x21], x17\n"
+ "st1 { v29.h }[4], [x20], x17\n"
+ "st1 { v24.h }[4], [x22], x17\n"
+ "st1 { v27.h }[4], [x21], x17\n"
+ "st1 { v30.h }[4], [x20], x17\n"
+ "st1 { v25.h }[4], [x22]\n"
+ "st1 { v28.h }[4], [x21]\n"
+ "st1 { v31.h }[4], [x20]\n"
+ "b 92f\n"
+ "90:" // Tile loop: Oddments: Store: Bit 2: Unset
+ "tbz %x[n_channels], #1, 91f\n"
+ "mov x22, x15\n"
+ "mov x21, x28\n"
+ "st1 { v23.s }[0], [x22], x17\n"
+ "mov x20, x25\n"
+ "st1 { v26.s }[0], [x21], x17\n"
+ "add x15, x15, #0x4\n"
+ "st1 { v29.s }[0], [x20], x17\n"
+ "add x28, x28, #0x4\n"
+ "add x25, x25, #0x4\n"
+ "st1 { v24.s }[0], [x22], x17\n"
+ "st1 { v27.s }[0], [x21], x17\n"
+ "st1 { v30.s }[0], [x20], x17\n"
+ "st1 { v25.s }[0], [x22]\n"
+ "st1 { v28.s }[0], [x21]\n"
+ "st1 { v31.s }[0], [x20]\n"
+ "tbz %x[n_channels], #0, 92f\n"
+ "mov x22, x15\n"
+ "mov x21, x28\n"
+ "st1 { v23.h }[2], [x22], x17\n"
+ "mov x20, x25\n"
+ "st1 { v26.h }[2], [x21], x17\n"
+ "st1 { v29.h }[2], [x20], x17\n"
+ "st1 { v24.h }[2], [x22], x17\n"
+ "st1 { v27.h }[2], [x21], x17\n"
+ "st1 { v30.h }[2], [x20], x17\n"
+ "st1 { v25.h }[2], [x22]\n"
+ "st1 { v28.h }[2], [x21]\n"
+ "st1 { v31.h }[2], [x20]\n"
+ "b 92f\n"
+ "91:" // Tile loop: Oddments: Store: Bit 2: Unset: Bit 1: Unset
+ "mov x22, x15\n"
+ "mov x21, x28\n"
+ "st1 { v23.h }[0], [x22], x17\n"
+ "mov x20, x25\n"
+ "st1 { v26.h }[0], [x21], x17\n"
+ "st1 { v29.h }[0], [x20], x17\n"
+ "st1 { v24.h }[0], [x22], x17\n"
+ "st1 { v27.h }[0], [x21], x17\n"
+ "st1 { v30.h }[0], [x20], x17\n"
+ "st1 { v25.h }[0], [x22]\n"
+ "st1 { v28.h }[0], [x21]\n"
+ "st1 { v31.h }[0], [x20]\n"
+ "92:" // Tile loop: Oddments: Store: Bit 2: End
+ "93:" // Tile loop: End
+ "ldr x23, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "ldr x24, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "add x23, x23, #0x1\n"
+ "add x21, x24, #0x1\n"
+ "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+ "cmp x23, x20\n"
+ "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
+ "csel x24, x24, x21, LT\n"
+ "csel x23, x23, XZR, LT\n"
+ "cmp x24, x20\n"
+ "blt 1b\n"
+ :
+ : [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp
new file mode 100644
index 0000000000..72e68482c6
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp
@@ -0,0 +1,1291 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__aarch64__) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(
+ const __fp16 *const *const input_ptrs,
+ __fp16 *const *const outptrs,
+ const void *params,
+ unsigned int n_channels,
+ const __fp16 activation_min,
+ const __fp16 activation_max
+)
+{
+ struct Args
+ {
+ __fp16 *const *outptrs;
+ const void *params;
+ const __fp16 min, max;
+ const __fp16 *inptrs[25];
+
+ Args(
+ const __fp16 *const *const input_ptrs,
+ __fp16 *const *const outptrs,
+ const void *const params,
+ const __fp16 min,
+ const __fp16 max
+ ) : outptrs(outptrs), params(params), min(min), max(max)
+ {
+ inptrs[0] = input_ptrs[12];
+ inptrs[1] = input_ptrs[0];
+ inptrs[2] = input_ptrs[4];
+ inptrs[3] = input_ptrs[20];
+ inptrs[4] = input_ptrs[7];
+ inptrs[5] = input_ptrs[24];
+ inptrs[6] = input_ptrs[11];
+ inptrs[7] = input_ptrs[1];
+ inptrs[8] = input_ptrs[3];
+ inptrs[9] = input_ptrs[13];
+ inptrs[10] = input_ptrs[5];
+ inptrs[11] = input_ptrs[9];
+ inptrs[12] = input_ptrs[15];
+ inptrs[13] = input_ptrs[17];
+ inptrs[14] = input_ptrs[19];
+ inptrs[15] = input_ptrs[21];
+ inptrs[16] = input_ptrs[6];
+ inptrs[17] = input_ptrs[8];
+ inptrs[18] = input_ptrs[23];
+ inptrs[19] = input_ptrs[16];
+ inptrs[20] = input_ptrs[2];
+ inptrs[21] = input_ptrs[18];
+ inptrs[22] = input_ptrs[10];
+ inptrs[23] = input_ptrs[14];
+ inptrs[24] = input_ptrs[22];
+
+ }
+ };
+
+ Args params_struct(input_ptrs, outptrs, params,
+ activation_min, activation_max);
+
+ __asm__ __volatile__(
+ "mov x7, #0x10\n" // cntb _, ALL, #1
+ "lsr x8, %x[n_channels], #0x3\n"
+ "ldr x17, [%x[params_struct], %[offsetof_args_outptrs]]\n"
+ "ldr x16, [%x[params_struct], %[offsetof_args_params]]\n"
+ "add x20, %x[params_struct], %[offsetof_args_min]\n"
+ "ld1r { v15.8h }, [x20]\n"
+ "add x20, %x[params_struct], %[offsetof_args_max]\n"
+ "ld1r { v14.8h }, [x20]\n"
+ "add x15, %x[params_struct], %[offsetof_Args_inptrs]\n"
+ "mov x14, #0x0\n"
+ "sub x13, XZR, x7\n"
+ "cbz x8, 3f\n"
+ "ldr q31, [x16, #0x0]\n"
+ "ldr q0, [x16, #0x10]\n"
+ "cmp x7, x8, LSL #4\n"
+ "ldr q1, [x16, #0x20]\n"
+ "ldr q2, [x16, #0x30]\n"
+ "ldr q3, [x16, #0x40]\n"
+ "ldr q4, [x16, #0x50]\n"
+ "ldr q5, [x16, #0x60]\n"
+ "ldr q6, [x16, #0x70]\n"
+ "ldr q7, [x16, #0x80]\n"
+ "ldr q8, [x16, #0x90]\n"
+ "add x16, x16, #0xa0\n"
+ "ldp x21, x20, [x15, #0x0]\n"
+ "ldr q9, [x21, x14]\n"
+ "ldr q10, [x20, x14]\n"
+ "ldp x21, x20, [x15, #0x10]\n"
+ "ldr q11, [x21, x14]\n"
+ "ldr q12, [x20, x14]\n"
+ "ldr x20, [x15, #0x20]\n"
+ "ldr q13, [x20, x14]\n"
+ "bge 2f\n"
+ "1:" // Channel loop
+ "mov v29.16b, v31.16b\n fmla v29.8h, v8.8h, v9.8h\n"
+ "mov v28.16b, v31.16b\n fmla v28.8h, v7.8h, v9.8h\n"
+ "ldr x26, [x15, #0x30]\n"
+ "ldr x23, [x15, #0x38]\n"
+ "mov v27.16b, v31.16b\n fmla v27.8h, v6.8h, v9.8h\n"
+ "fmla v29.8h, v0.8h, v10.8h\n"
+ "ldr x22, [x15, #0x28]\n"
+ "ldr x20, [x15, #0x48]\n"
+ "ldr q19, [x20, x14]\n"
+ "fmla v28.8h, v4.8h, v13.8h\n"
+ "mov v26.16b, v31.16b\n fmla v26.8h, v5.8h, v9.8h\n"
+ "ldr x21, [x15, #0x40]\n"
+ "mov v25.16b, v31.16b\n fmla v25.8h, v4.8h, v9.8h\n"
+ "mov v24.16b, v31.16b\n fmla v24.8h, v3.8h, v9.8h\n"
+ "ldr x25, [x15, #0x50]\n"
+ "ldr x24, [x15, #0x58]\n"
+ "fmla v27.8h, v2.8h, v11.8h\n"
+ "ldr q17, [x26, x14]\n"
+ "mov v23.16b, v31.16b\n fmla v23.8h, v2.8h, v9.8h\n"
+ "ldr x20, [x15, #0x60]\n"
+ "fmla v29.8h, v5.8h, v13.8h\n"
+ "fmla v28.8h, v6.8h, v17.8h\n"
+ "ldr x12, [x15, #0x70]\n"
+ "ldr x11, [x15, #0x88]\n"
+ "mov v22.16b, v31.16b\n fmla v22.8h, v0.8h, v9.8h\n"
+ "fmla v27.8h, v3.8h, v13.8h\n"
+ "ldr x10, [x17, #0x0]\n"
+ "add x13, x13, #0x10\n"
+ "fmla v26.8h, v2.8h, v13.8h\n"
+ "fmla v25.8h, v1.8h, v13.8h\n"
+ "ldr x9, [x17, #0x8]\n"
+ "ldr x28, [x17, #0x10]\n"
+ "fmla v24.8h, v0.8h, v13.8h\n"
+ "ldr q18, [x23, x14]\n"
+ "fmla v23.8h, v6.8h, v12.8h\n"
+ "ldr q16, [x22, x14]\n"
+ "mov v21.16b, v31.16b\n fmla v21.8h, v1.8h, v9.8h\n"
+ "ldr q31, [x16, #0x0]\n"
+ "fmla v29.8h, v7.8h, v17.8h\n"
+ "ldr x23, [x15, #0x68]\n"
+ "fmla v28.8h, v0.8h, v18.8h\n"
+ "fmla v22.8h, v8.8h, v16.8h\n"
+ "ldr q16, [x21, x14]\n"
+ "ldr x22, [x15, #0x78]\n"
+ "fmla v26.8h, v4.8h, v17.8h\n"
+ "fmla v25.8h, v3.8h, v17.8h\n"
+ "ldr x21, [x15, #0x80]\n"
+ "ldr x27, [x17, #0x18]\n"
+ "fmla v21.8h, v0.8h, v17.8h\n"
+ "fmla v24.8h, v4.8h, v19.8h\n"
+ "fmla v23.8h, v1.8h, v17.8h\n"
+ "ldr q17, [x25, x14]\n"
+ "fmla v29.8h, v1.8h, v18.8h\n"
+ "ldr q20, [x24, x14]\n"
+ "fmla v28.8h, v2.8h, v16.8h\n"
+ "fmla v27.8h, v1.8h, v16.8h\n"
+ "ldr q16, [x20, x14]\n"
+ "ldr x26, [x15, #0x90]\n"
+ "fmla v25.8h, v5.8h, v19.8h\n"
+ "fmla v21.8h, v2.8h, v19.8h\n"
+ "ldr x25, [x15, #0xa0]\n"
+ "ldr x20, [x15, #0x98]\n"
+ "fmla v26.8h, v0.8h, v17.8h\n"
+ "fmla v24.8h, v2.8h, v20.8h\n"
+ "fmla v28.8h, v8.8h, v19.8h\n"
+ "fmla v27.8h, v7.8h, v19.8h\n"
+ "fmla v22.8h, v1.8h, v19.8h\n"
+ "ldr q19, [x23, x14]\n"
+ "fmla v23.8h, v3.8h, v16.8h\n"
+ "ldr x24, [x15, #0xa8]\n"
+ "fmla v26.8h, v6.8h, v16.8h\n"
+ "ldr q18, [x21, x14]\n"
+ "fmla v25.8h, v7.8h, v19.8h\n"
+ "ldr x23, [x15, #0xc0]\n"
+ "fmla v24.8h, v6.8h, v19.8h\n"
+ "fmla v21.8h, v4.8h, v19.8h\n"
+ "fmla v29.8h, v3.8h, v17.8h\n"
+ "ldr q17, [x12, x14]\n"
+ "fmla v27.8h, v5.8h, v20.8h\n"
+ "ldr q16, [x22, x14]\n"
+ "fmla v23.8h, v5.8h, v19.8h\n"
+ "fmla v22.8h, v3.8h, v19.8h\n"
+ "ldr x22, [x15, #0xb0]\n"
+ "ldr x21, [x15, #0xb8]\n"
+ "fmla v26.8h, v8.8h, v19.8h\n"
+ "fmla v24.8h, v8.8h, v17.8h\n"
+ "fmla v21.8h, v6.8h, v16.8h\n"
+ "fmla v28.8h, v3.8h, v18.8h\n"
+ "fmla v25.8h, v0.8h, v18.8h\n"
+ "fmla v22.8h, v5.8h, v17.8h\n"
+ "ldr q17, [x11, x14]\n"
+ "fmla v23.8h, v7.8h, v16.8h\n"
+ "ldr q16, [x26, x14]\n"
+ "fmla v29.8h, v4.8h, v18.8h\n"
+ "fmla v26.8h, v1.8h, v18.8h\n"
+ "ldr q18, [x20, x14]\n"
+ "fmla v28.8h, v5.8h, v17.8h\n"
+ "fmla v27.8h, v4.8h, v17.8h\n"
+ "fmla v25.8h, v2.8h, v17.8h\n"
+ "fmla v24.8h, v1.8h, v17.8h\n"
+ "ldr q17, [x25, x14]\n"
+ "fmla v21.8h, v8.8h, v16.8h\n"
+ "ldr x20, [x15, #0x20]\n"
+ "fmla v22.8h, v7.8h, v16.8h\n"
+ "ldr q16, [x24, x14]\n"
+ "fmla v29.8h, v2.8h, v17.8h\n"
+ "fmla v26.8h, v7.8h, v18.8h\n"
+ "fmla v25.8h, v6.8h, v18.8h\n"
+ "fmla v23.8h, v4.8h, v18.8h\n"
+ "fmla v21.8h, v3.8h, v18.8h\n"
+ "ldr q18, [x22, x14]\n"
+ "fmla v22.8h, v4.8h, v16.8h\n"
+ "ldr q4, [x16, #0x50]\n"
+ "fmla v28.8h, v1.8h, v17.8h\n"
+ "ldr q1, [x16, #0x20]\n"
+ "fmla v27.8h, v0.8h, v17.8h\n"
+ "ldr q17, [x21, x14]\n"
+ "fmla v29.8h, v6.8h, v18.8h\n"
+ "fmax v29.8h, v29.8h, v15.8h\n"
+ "fmla v24.8h, v7.8h, v16.8h\n"
+ "fmla v21.8h, v5.8h, v16.8h\n"
+ "fmin v29.8h, v29.8h, v14.8h\n"
+ "str q29, [x10, x13]\n"
+ "fmla v23.8h, v0.8h, v18.8h\n"
+ "ldr q0, [x16, #0x10]\n"
+ "fmla v22.8h, v2.8h, v17.8h\n"
+ "ldr q2, [x16, #0x30]\n"
+ "fmla v25.8h, v8.8h, v16.8h\n"
+ "ldr q16, [x23, x14]\n"
+ "fmla v26.8h, v3.8h, v18.8h\n"
+ "ldr q3, [x16, #0x40]\n"
+ "fmla v27.8h, v8.8h, v17.8h\n"
+ "fmla v24.8h, v5.8h, v17.8h\n"
+ "ldr q5, [x16, #0x60]\n"
+ "fmax v28.8h, v28.8h, v15.8h\n"
+ "fmla v23.8h, v8.8h, v16.8h\n"
+ "ldr q8, [x16, #0x90]\n"
+ "fmla v21.8h, v7.8h, v16.8h\n"
+ "ldr q7, [x16, #0x80]\n"
+ "fmla v22.8h, v6.8h, v16.8h\n"
+ "ldr q13, [x20, x7]\n"
+ "ldr q6, [x16, #0x70]\n"
+ "fmax v27.8h, v27.8h, v15.8h\n"
+ "fmax v26.8h, v26.8h, v15.8h\n"
+ "fmax v25.8h, v25.8h, v15.8h\n"
+ "ldr x24, [x17, #0x20]\n"
+ "ldp x21, x20, [x15, #0x0]\n"
+ "ldr q9, [x21, x7]\n"
+ "ldr q10, [x20, x7]\n"
+ "fmin v28.8h, v28.8h, v14.8h\n"
+ "fmin v27.8h, v27.8h, v14.8h\n"
+ "ldp x21, x20, [x15, #0x10]\n"
+ "ldr q11, [x21, x7]\n"
+ "fmin v26.8h, v26.8h, v14.8h\n"
+ "fmin v25.8h, v25.8h, v14.8h\n"
+ "ldr q12, [x20, x7]\n"
+ "fmax v24.8h, v24.8h, v15.8h\n"
+ "fmax v23.8h, v23.8h, v15.8h\n"
+ "str q28, [x9, x13]\n"
+ "fmax v21.8h, v21.8h, v15.8h\n"
+ "fmax v22.8h, v22.8h, v15.8h\n"
+ "str q27, [x28, x13]\n"
+ "ldr x23, [x17, #0x28]\n"
+ "str q26, [x27, x13]\n"
+ "ldr x22, [x17, #0x30]\n"
+ "ldr x21, [x17, #0x38]\n"
+ "add x7, x7, #0x10\n"
+ "str q25, [x24, x13]\n"
+ "ldr x20, [x17, #0x40]\n"
+ "cmp x7, x8, LSL #4\n"
+ "fmin v24.8h, v24.8h, v14.8h\n"
+ "fmin v23.8h, v23.8h, v14.8h\n"
+ "fmin v21.8h, v21.8h, v14.8h\n"
+ "add x14, x14, #0x10\n"
+ "str q24, [x23, x13]\n"
+ "fmin v22.8h, v22.8h, v14.8h\n"
+ "str q23, [x22, x13]\n"
+ "add x16, x16, #0xa0\n"
+ "str q21, [x21, x13]\n"
+ "str q22, [x20, x13]\n"
+ "blt 1b\n"
+ "2:" // Channel tail
+ "mov v29.16b, v31.16b\n fmla v29.8h, v8.8h, v9.8h\n"
+ "mov v28.16b, v31.16b\n fmla v28.8h, v7.8h, v9.8h\n"
+ "ldr x23, [x15, #0x30]\n"
+ "ldr x22, [x15, #0x38]\n"
+ "mov v27.16b, v31.16b\n fmla v27.8h, v6.8h, v9.8h\n"
+ "fmla v29.8h, v0.8h, v10.8h\n"
+ "ldr x21, [x15, #0x28]\n"
+ "ldr x20, [x15, #0x48]\n"
+ "ldr q19, [x20, x14]\n"
+ "fmla v28.8h, v4.8h, v13.8h\n"
+ "mov v26.16b, v31.16b\n fmla v26.8h, v5.8h, v9.8h\n"
+ "ldr x20, [x15, #0x40]\n"
+ "mov v25.16b, v31.16b\n fmla v25.8h, v4.8h, v9.8h\n"
+ "mov v24.16b, v31.16b\n fmla v24.8h, v3.8h, v9.8h\n"
+ "ldr x25, [x15, #0x50]\n"
+ "ldr x24, [x15, #0x58]\n"
+ "fmla v27.8h, v2.8h, v11.8h\n"
+ "ldr q17, [x23, x14]\n"
+ "mov v23.16b, v31.16b\n fmla v23.8h, v2.8h, v9.8h\n"
+ "ldr x23, [x15, #0x60]\n"
+ "fmla v29.8h, v5.8h, v13.8h\n"
+ "fmla v28.8h, v6.8h, v17.8h\n"
+ "ldr x12, [x15, #0x70]\n"
+ "ldr x11, [x15, #0x88]\n"
+ "mov v22.16b, v31.16b\n fmla v22.8h, v0.8h, v9.8h\n"
+ "fmla v27.8h, v3.8h, v13.8h\n"
+ "ldr x10, [x17, #0x0]\n"
+ "add x13, x13, #0x10\n"
+ "fmla v26.8h, v2.8h, v13.8h\n"
+ "fmla v25.8h, v1.8h, v13.8h\n"
+ "ldr x9, [x17, #0x8]\n"
+ "ldr x28, [x17, #0x10]\n"
+ "fmla v24.8h, v0.8h, v13.8h\n"
+ "ldr q18, [x22, x14]\n"
+ "fmla v23.8h, v6.8h, v12.8h\n"
+ "ldr q16, [x21, x14]\n"
+ "mov v21.16b, v31.16b\n fmla v21.8h, v1.8h, v9.8h\n"
+ "fmla v29.8h, v7.8h, v17.8h\n"
+ "ldr x22, [x15, #0x68]\n"
+ "ldr x21, [x15, #0x78]\n"
+ "fmla v28.8h, v0.8h, v18.8h\n"
+ "fmla v22.8h, v8.8h, v16.8h\n"
+ "ldr q16, [x20, x14]\n"
+ "ldr x20, [x15, #0x80]\n"
+ "fmla v26.8h, v4.8h, v17.8h\n"
+ "fmla v25.8h, v3.8h, v17.8h\n"
+ "ldr x27, [x17, #0x18]\n"
+ "fmla v21.8h, v0.8h, v17.8h\n"
+ "fmla v24.8h, v4.8h, v19.8h\n"
+ "fmla v23.8h, v1.8h, v17.8h\n"
+ "ldr q17, [x25, x14]\n"
+ "fmla v29.8h, v1.8h, v18.8h\n"
+ "ldr q20, [x24, x14]\n"
+ "fmla v28.8h, v2.8h, v16.8h\n"
+ "fmla v27.8h, v1.8h, v16.8h\n"
+ "ldr q16, [x23, x14]\n"
+ "ldr x26, [x15, #0x90]\n"
+ "fmla v25.8h, v5.8h, v19.8h\n"
+ "fmla v21.8h, v2.8h, v19.8h\n"
+ "ldr x25, [x15, #0xa0]\n"
+ "ldr x24, [x15, #0x98]\n"
+ "fmla v26.8h, v0.8h, v17.8h\n"
+ "fmla v24.8h, v2.8h, v20.8h\n"
+ "fmla v28.8h, v8.8h, v19.8h\n"
+ "fmla v27.8h, v7.8h, v19.8h\n"
+ "fmla v22.8h, v1.8h, v19.8h\n"
+ "ldr q19, [x22, x14]\n"
+ "fmla v23.8h, v3.8h, v16.8h\n"
+ "ldr x23, [x15, #0xa8]\n"
+ "fmla v26.8h, v6.8h, v16.8h\n"
+ "ldr q18, [x20, x14]\n"
+ "fmla v25.8h, v7.8h, v19.8h\n"
+ "ldr x22, [x15, #0xc0]\n"
+ "fmla v24.8h, v6.8h, v19.8h\n"
+ "fmla v21.8h, v4.8h, v19.8h\n"
+ "fmla v29.8h, v3.8h, v17.8h\n"
+ "ldr q17, [x12, x14]\n"
+ "fmla v27.8h, v5.8h, v20.8h\n"
+ "ldr q16, [x21, x14]\n"
+ "fmla v23.8h, v5.8h, v19.8h\n"
+ "fmla v22.8h, v3.8h, v19.8h\n"
+ "ldr x21, [x15, #0xb0]\n"
+ "ldr x20, [x15, #0xb8]\n"
+ "fmla v26.8h, v8.8h, v19.8h\n"
+ "fmla v24.8h, v8.8h, v17.8h\n"
+ "fmla v21.8h, v6.8h, v16.8h\n"
+ "fmla v28.8h, v3.8h, v18.8h\n"
+ "fmla v25.8h, v0.8h, v18.8h\n"
+ "fmla v22.8h, v5.8h, v17.8h\n"
+ "ldr q17, [x11, x14]\n"
+ "fmla v23.8h, v7.8h, v16.8h\n"
+ "ldr q16, [x26, x14]\n"
+ "fmla v29.8h, v4.8h, v18.8h\n"
+ "fmla v26.8h, v1.8h, v18.8h\n"
+ "ldr q18, [x24, x14]\n"
+ "fmla v28.8h, v5.8h, v17.8h\n"
+ "fmla v27.8h, v4.8h, v17.8h\n"
+ "fmla v25.8h, v2.8h, v17.8h\n"
+ "fmla v24.8h, v1.8h, v17.8h\n"
+ "ldr q17, [x25, x14]\n"
+ "fmla v21.8h, v8.8h, v16.8h\n"
+ "fmla v22.8h, v7.8h, v16.8h\n"
+ "ldr q16, [x23, x14]\n"
+ "fmla v29.8h, v2.8h, v17.8h\n"
+ "fmla v26.8h, v7.8h, v18.8h\n"
+ "fmla v25.8h, v6.8h, v18.8h\n"
+ "fmla v23.8h, v4.8h, v18.8h\n"
+ "fmla v21.8h, v3.8h, v18.8h\n"
+ "ldr q18, [x21, x14]\n"
+ "fmla v22.8h, v4.8h, v16.8h\n"
+ "fmla v28.8h, v1.8h, v17.8h\n"
+ "fmax v28.8h, v28.8h, v15.8h\n"
+ "fmla v27.8h, v0.8h, v17.8h\n"
+ "ldr q17, [x20, x14]\n"
+ "fmla v29.8h, v6.8h, v18.8h\n"
+ "fmax v29.8h, v29.8h, v15.8h\n"
+ "fmla v24.8h, v7.8h, v16.8h\n"
+ "fmla v21.8h, v5.8h, v16.8h\n"
+ "fmin v29.8h, v29.8h, v14.8h\n"
+ "str q29, [x10, x13]\n"
+ "fmla v23.8h, v0.8h, v18.8h\n"
+ "fmla v22.8h, v2.8h, v17.8h\n"
+ "ldr x20, [x17, #0x20]\n"
+ "fmin v28.8h, v28.8h, v14.8h\n"
+ "fmla v25.8h, v8.8h, v16.8h\n"
+ "ldr q16, [x22, x14]\n"
+ "fmla v26.8h, v3.8h, v18.8h\n"
+ "fmax v26.8h, v26.8h, v15.8h\n"
+ "fmla v27.8h, v8.8h, v17.8h\n"
+ "fmla v24.8h, v5.8h, v17.8h\n"
+ "fmax v27.8h, v27.8h, v15.8h\n"
+ "str q28, [x9, x13]\n"
+ "fmla v23.8h, v8.8h, v16.8h\n"
+ "fmla v21.8h, v7.8h, v16.8h\n"
+ "fmax v25.8h, v25.8h, v15.8h\n"
+ "ldr x23, [x17, #0x28]\n"
+ "fmla v22.8h, v6.8h, v16.8h\n"
+ "fmin v27.8h, v27.8h, v14.8h\n"
+ "str q27, [x28, x13]\n"
+ "ldr x22, [x17, #0x30]\n"
+ "fmin v26.8h, v26.8h, v14.8h\n"
+ "fmin v25.8h, v25.8h, v14.8h\n"
+ "str q26, [x27, x13]\n"
+ "ldr x21, [x17, #0x38]\n"
+ "fmax v24.8h, v24.8h, v15.8h\n"
+ "fmax v23.8h, v23.8h, v15.8h\n"
+ "str q25, [x20, x13]\n"
+ "ldr x20, [x17, #0x40]\n"
+ "fmax v21.8h, v21.8h, v15.8h\n"
+ "fmax v22.8h, v22.8h, v15.8h\n"
+ "add x14, x14, #0x10\n"
+ "fmin v24.8h, v24.8h, v14.8h\n"
+ "fmin v23.8h, v23.8h, v14.8h\n"
+ "str q24, [x23, x13]\n"
+ "fmin v21.8h, v21.8h, v14.8h\n"
+ "fmin v22.8h, v22.8h, v14.8h\n"
+ "str q23, [x22, x13]\n"
+ "str q21, [x21, x13]\n"
+ "str q22, [x20, x13]\n"
+ "3:" // Oddments
+ "tst %x[n_channels], #0x7\n"
+ "beq 92f\n"
+ "ldr q31, [x16, #0x0]\n"
+ "ldr q0, [x16, #0x10]\n"
+ "mov x13, x14\n"
+ "ldr q1, [x16, #0x20]\n"
+ "ldr q2, [x16, #0x30]\n"
+ "ldr q3, [x16, #0x40]\n"
+ "ldr q4, [x16, #0x50]\n"
+ "ldr q5, [x16, #0x60]\n"
+ "ldr q6, [x16, #0x70]\n"
+ "ldr q7, [x16, #0x80]\n"
+ "ldr q8, [x16, #0x90]\n"
+ "ldr x24, [x15, #0x0]\n"
+ "ldr x23, [x15, #0x8]\n"
+ "add x24, x24, x14\n"
+ "add x23, x23, x14\n"
+ "ldr x22, [x15, #0x10]\n"
+ "ldr x21, [x15, #0x18]\n"
+ "add x22, x22, x14\n"
+ "add x21, x21, x14\n"
+ "ldr x20, [x15, #0x20]\n"
+ "add x20, x20, x14\n"
+ "tbz %x[n_channels], #2, 5f\n"
+ "ld1 { v9.d }[0], [x24], #0x8\n"
+ "ld1 { v10.d }[0], [x23], #0x8\n"
+ "ld1 { v11.d }[0], [x22], #0x8\n"
+ "ld1 { v12.d }[0], [x21], #0x8\n"
+ "ld1 { v13.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 4f\n"
+ "ld1 { v9.s }[2], [x24], #0x4\n"
+ "ld1 { v10.s }[2], [x23], #0x4\n"
+ "ld1 { v11.s }[2], [x22], #0x4\n"
+ "ld1 { v12.s }[2], [x21], #0x4\n"
+ "ld1 { v13.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 7f\n"
+ "ld1 { v9.h }[6], [x24], #0x2\n"
+ "ld1 { v10.h }[6], [x23], #0x2\n"
+ "ld1 { v11.h }[6], [x22], #0x2\n"
+ "ld1 { v12.h }[6], [x21], #0x2\n"
+ "ld1 { v13.h }[6], [x20], #0x2\n"
+ "b 7f\n"
+ "4:" // Oddments: Load inputs (2, 2), (0, 0), (0, 4), (4, 0), (1, 2): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 7f\n"
+ "ld1 { v9.h }[4], [x24], #0x2\n"
+ "ld1 { v10.h }[4], [x23], #0x2\n"
+ "ld1 { v11.h }[4], [x22], #0x2\n"
+ "ld1 { v12.h }[4], [x21], #0x2\n"
+ "ld1 { v13.h }[4], [x20], #0x2\n"
+ "b 7f\n"
+ "5:" // Oddments: Load inputs (2, 2), (0, 0), (0, 4), (4, 0), (1, 2): Bit 2: Unset
+ "tbz %x[n_channels], #1, 6f\n"
+ "ld1 { v9.s }[0], [x24], #0x4\n"
+ "ld1 { v10.s }[0], [x23], #0x4\n"
+ "ld1 { v11.s }[0], [x22], #0x4\n"
+ "ld1 { v12.s }[0], [x21], #0x4\n"
+ "ld1 { v13.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 7f\n"
+ "ld1 { v9.h }[2], [x24], #0x2\n"
+ "ld1 { v10.h }[2], [x23], #0x2\n"
+ "ld1 { v11.h }[2], [x22], #0x2\n"
+ "ld1 { v12.h }[2], [x21], #0x2\n"
+ "ld1 { v13.h }[2], [x20], #0x2\n"
+ "b 7f\n"
+ "6:" // Oddments: Load inputs (2, 2), (0, 0), (0, 4), (4, 0), (1, 2): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v9.h }[0], [x24], #0x2\n"
+ "ld1 { v10.h }[0], [x23], #0x2\n"
+ "ld1 { v11.h }[0], [x22], #0x2\n"
+ "ld1 { v12.h }[0], [x21], #0x2\n"
+ "ld1 { v13.h }[0], [x20], #0x2\n"
+ "7:" // Oddments: Load inputs (2, 2), (0, 0), (0, 4), (4, 0), (1, 2): Bit 2: End
+ "mov v23.16b, v31.16b\n fmla v23.8h, v8.8h, v9.8h\n"
+ "mov v25.16b, v31.16b\n fmla v25.8h, v6.8h, v9.8h\n"
+ "ldr x20, [x15, #0x28]\n"
+ "add x20, x20, x14\n"
+ "mov v24.16b, v31.16b\n fmla v24.8h, v7.8h, v9.8h\n"
+ "mov v26.16b, v31.16b\n fmla v26.8h, v5.8h, v9.8h\n"
+ "mov v27.16b, v31.16b\n fmla v27.8h, v4.8h, v9.8h\n"
+ "mov v28.16b, v31.16b\n fmla v28.8h, v3.8h, v9.8h\n"
+ "mov v29.16b, v31.16b\n fmla v29.8h, v2.8h, v9.8h\n"
+ "fmla v23.8h, v0.8h, v10.8h\n"
+ "fmla v25.8h, v2.8h, v11.8h\n"
+ "mov v30.16b, v31.16b\n fmla v30.8h, v1.8h, v9.8h\n"
+ "fmla v31.8h, v0.8h, v9.8h\n"
+ "fmla v29.8h, v6.8h, v12.8h\n"
+ "fmla v23.8h, v5.8h, v13.8h\n"
+ "fmla v24.8h, v4.8h, v13.8h\n"
+ "fmla v25.8h, v3.8h, v13.8h\n"
+ "fmla v26.8h, v2.8h, v13.8h\n"
+ "fmla v27.8h, v1.8h, v13.8h\n"
+ "fmla v28.8h, v0.8h, v13.8h\n"
+ "tbz %x[n_channels], #2, 9f\n"
+ "ld1 { v12.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 8f\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 11f\n"
+ "ld1 { v12.h }[6], [x20], #0x2\n"
+ "b 11f\n"
+ "8:" // Oddments: Load input (4, 4): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 11f\n"
+ "ld1 { v12.h }[4], [x20], #0x2\n"
+ "b 11f\n"
+ "9:" // Oddments: Load input (4, 4): Bit 2: Unset
+ "tbz %x[n_channels], #1, 10f\n"
+ "ld1 { v12.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 11f\n"
+ "ld1 { v12.h }[2], [x20], #0x2\n"
+ "b 11f\n"
+ "10:" // Oddments: Load input (4, 4): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v12.h }[0], [x20], #0x2\n"
+ "11:" // Oddments: Load input (4, 4): Bit 2: End
+ "ldr x20, [x15, #0x30]\n"
+ "fmla v31.8h, v8.8h, v12.8h\n"
+ "add x20, x20, x14\n"
+ "tbz %x[n_channels], #2, 13f\n"
+ "ld1 { v11.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 12f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 15f\n"
+ "ld1 { v11.h }[6], [x20], #0x2\n"
+ "b 15f\n"
+ "12:" // Oddments: Load input (2, 1): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 15f\n"
+ "ld1 { v11.h }[4], [x20], #0x2\n"
+ "b 15f\n"
+ "13:" // Oddments: Load input (2, 1): Bit 2: Unset
+ "tbz %x[n_channels], #1, 14f\n"
+ "ld1 { v11.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 15f\n"
+ "ld1 { v11.h }[2], [x20], #0x2\n"
+ "b 15f\n"
+ "14:" // Oddments: Load input (2, 1): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v11.h }[0], [x20], #0x2\n"
+ "15:" // Oddments: Load input (2, 1): Bit 2: End
+ "ldr x20, [x15, #0x38]\n"
+ "fmla v23.8h, v7.8h, v11.8h\n"
+ "fmla v24.8h, v6.8h, v11.8h\n"
+ "add x20, x20, x14\n"
+ "fmla v26.8h, v4.8h, v11.8h\n"
+ "fmla v27.8h, v3.8h, v11.8h\n"
+ "fmla v29.8h, v1.8h, v11.8h\n"
+ "fmla v30.8h, v0.8h, v11.8h\n"
+ "tbz %x[n_channels], #2, 17f\n"
+ "ld1 { v13.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 16f\n"
+ "ld1 { v13.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 19f\n"
+ "ld1 { v13.h }[6], [x20], #0x2\n"
+ "b 19f\n"
+ "16:" // Oddments: Load input (0, 1): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 19f\n"
+ "ld1 { v13.h }[4], [x20], #0x2\n"
+ "b 19f\n"
+ "17:" // Oddments: Load input (0, 1): Bit 2: Unset
+ "tbz %x[n_channels], #1, 18f\n"
+ "ld1 { v13.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 19f\n"
+ "ld1 { v13.h }[2], [x20], #0x2\n"
+ "b 19f\n"
+ "18:" // Oddments: Load input (0, 1): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v13.h }[0], [x20], #0x2\n"
+ "19:" // Oddments: Load input (0, 1): Bit 2: End
+ "ldr x20, [x15, #0x40]\n"
+ "fmla v23.8h, v1.8h, v13.8h\n"
+ "fmla v24.8h, v0.8h, v13.8h\n"
+ "add x20, x20, x14\n"
+ "tbz %x[n_channels], #2, 21f\n"
+ "ld1 { v12.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 20f\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 23f\n"
+ "ld1 { v12.h }[6], [x20], #0x2\n"
+ "b 23f\n"
+ "20:" // Oddments: Load input (0, 3): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 23f\n"
+ "ld1 { v12.h }[4], [x20], #0x2\n"
+ "b 23f\n"
+ "21:" // Oddments: Load input (0, 3): Bit 2: Unset
+ "tbz %x[n_channels], #1, 22f\n"
+ "ld1 { v12.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 23f\n"
+ "ld1 { v12.h }[2], [x20], #0x2\n"
+ "b 23f\n"
+ "22:" // Oddments: Load input (0, 3): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v12.h }[0], [x20], #0x2\n"
+ "23:" // Oddments: Load input (0, 3): Bit 2: End
+ "ldr x20, [x15, #0x48]\n"
+ "fmla v24.8h, v2.8h, v12.8h\n"
+ "fmla v25.8h, v1.8h, v12.8h\n"
+ "add x20, x20, x14\n"
+ "tbz %x[n_channels], #2, 25f\n"
+ "ld1 { v10.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 24f\n"
+ "ld1 { v10.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 27f\n"
+ "ld1 { v10.h }[6], [x20], #0x2\n"
+ "b 27f\n"
+ "24:" // Oddments: Load input (2, 3): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 27f\n"
+ "ld1 { v10.h }[4], [x20], #0x2\n"
+ "b 27f\n"
+ "25:" // Oddments: Load input (2, 3): Bit 2: Unset
+ "tbz %x[n_channels], #1, 26f\n"
+ "ld1 { v10.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 27f\n"
+ "ld1 { v10.h }[2], [x20], #0x2\n"
+ "b 27f\n"
+ "26:" // Oddments: Load input (2, 3): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v10.h }[0], [x20], #0x2\n"
+ "27:" // Oddments: Load input (2, 3): Bit 2: End
+ "ldr x20, [x15, #0x50]\n"
+ "fmla v24.8h, v8.8h, v10.8h\n"
+ "fmla v25.8h, v7.8h, v10.8h\n"
+ "add x20, x20, x14\n"
+ "fmla v27.8h, v5.8h, v10.8h\n"
+ "fmla v28.8h, v4.8h, v10.8h\n"
+ "fmla v30.8h, v2.8h, v10.8h\n"
+ "fmla v31.8h, v1.8h, v10.8h\n"
+ "tbz %x[n_channels], #2, 29f\n"
+ "ld1 { v11.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 28f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 31f\n"
+ "ld1 { v11.h }[6], [x20], #0x2\n"
+ "b 31f\n"
+ "28:" // Oddments: Load input (1, 0): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 31f\n"
+ "ld1 { v11.h }[4], [x20], #0x2\n"
+ "b 31f\n"
+ "29:" // Oddments: Load input (1, 0): Bit 2: Unset
+ "tbz %x[n_channels], #1, 30f\n"
+ "ld1 { v11.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 31f\n"
+ "ld1 { v11.h }[2], [x20], #0x2\n"
+ "b 31f\n"
+ "30:" // Oddments: Load input (1, 0): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v11.h }[0], [x20], #0x2\n"
+ "31:" // Oddments: Load input (1, 0): Bit 2: End
+ "ldr x20, [x15, #0x58]\n"
+ "fmla v23.8h, v3.8h, v11.8h\n"
+ "fmla v26.8h, v0.8h, v11.8h\n"
+ "add x20, x20, x14\n"
+ "tbz %x[n_channels], #2, 33f\n"
+ "ld1 { v13.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 32f\n"
+ "ld1 { v13.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 35f\n"
+ "ld1 { v13.h }[6], [x20], #0x2\n"
+ "b 35f\n"
+ "32:" // Oddments: Load input (1, 4): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 35f\n"
+ "ld1 { v13.h }[4], [x20], #0x2\n"
+ "b 35f\n"
+ "33:" // Oddments: Load input (1, 4): Bit 2: Unset
+ "tbz %x[n_channels], #1, 34f\n"
+ "ld1 { v13.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 35f\n"
+ "ld1 { v13.h }[2], [x20], #0x2\n"
+ "b 35f\n"
+ "34:" // Oddments: Load input (1, 4): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v13.h }[0], [x20], #0x2\n"
+ "35:" // Oddments: Load input (1, 4): Bit 2: End
+ "ldr x20, [x15, #0x60]\n"
+ "fmla v25.8h, v5.8h, v13.8h\n"
+ "fmla v28.8h, v2.8h, v13.8h\n"
+ "add x20, x20, x14\n"
+ "tbz %x[n_channels], #2, 37f\n"
+ "ld1 { v12.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 36f\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 39f\n"
+ "ld1 { v12.h }[6], [x20], #0x2\n"
+ "b 39f\n"
+ "36:" // Oddments: Load input (3, 0): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 39f\n"
+ "ld1 { v12.h }[4], [x20], #0x2\n"
+ "b 39f\n"
+ "37:" // Oddments: Load input (3, 0): Bit 2: Unset
+ "tbz %x[n_channels], #1, 38f\n"
+ "ld1 { v12.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 39f\n"
+ "ld1 { v12.h }[2], [x20], #0x2\n"
+ "b 39f\n"
+ "38:" // Oddments: Load input (3, 0): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v12.h }[0], [x20], #0x2\n"
+ "39:" // Oddments: Load input (3, 0): Bit 2: End
+ "ldr x20, [x15, #0x68]\n"
+ "fmla v26.8h, v6.8h, v12.8h\n"
+ "fmla v29.8h, v3.8h, v12.8h\n"
+ "add x20, x20, x14\n"
+ "tbz %x[n_channels], #2, 41f\n"
+ "ld1 { v10.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 40f\n"
+ "ld1 { v10.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 43f\n"
+ "ld1 { v10.h }[6], [x20], #0x2\n"
+ "b 43f\n"
+ "40:" // Oddments: Load input (3, 2): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 43f\n"
+ "ld1 { v10.h }[4], [x20], #0x2\n"
+ "b 43f\n"
+ "41:" // Oddments: Load input (3, 2): Bit 2: Unset
+ "tbz %x[n_channels], #1, 42f\n"
+ "ld1 { v10.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 43f\n"
+ "ld1 { v10.h }[2], [x20], #0x2\n"
+ "b 43f\n"
+ "42:" // Oddments: Load input (3, 2): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v10.h }[0], [x20], #0x2\n"
+ "43:" // Oddments: Load input (3, 2): Bit 2: End
+ "ldr x20, [x15, #0x70]\n"
+ "fmla v26.8h, v8.8h, v10.8h\n"
+ "fmla v27.8h, v7.8h, v10.8h\n"
+ "add x20, x20, x14\n"
+ "fmla v28.8h, v6.8h, v10.8h\n"
+ "fmla v29.8h, v5.8h, v10.8h\n"
+ "fmla v30.8h, v4.8h, v10.8h\n"
+ "fmla v31.8h, v3.8h, v10.8h\n"
+ "tbz %x[n_channels], #2, 45f\n"
+ "ld1 { v11.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 44f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 47f\n"
+ "ld1 { v11.h }[6], [x20], #0x2\n"
+ "b 47f\n"
+ "44:" // Oddments: Load input (3, 4): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 47f\n"
+ "ld1 { v11.h }[4], [x20], #0x2\n"
+ "b 47f\n"
+ "45:" // Oddments: Load input (3, 4): Bit 2: Unset
+ "tbz %x[n_channels], #1, 46f\n"
+ "ld1 { v11.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 47f\n"
+ "ld1 { v11.h }[2], [x20], #0x2\n"
+ "b 47f\n"
+ "46:" // Oddments: Load input (3, 4): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v11.h }[0], [x20], #0x2\n"
+ "47:" // Oddments: Load input (3, 4): Bit 2: End
+ "ldr x20, [x15, #0x78]\n"
+ "fmla v28.8h, v8.8h, v11.8h\n"
+ "fmla v31.8h, v5.8h, v11.8h\n"
+ "add x20, x20, x14\n"
+ "tbz %x[n_channels], #2, 49f\n"
+ "ld1 { v13.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 48f\n"
+ "ld1 { v13.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 51f\n"
+ "ld1 { v13.h }[6], [x20], #0x2\n"
+ "b 51f\n"
+ "48:" // Oddments: Load input (4, 1): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 51f\n"
+ "ld1 { v13.h }[4], [x20], #0x2\n"
+ "b 51f\n"
+ "49:" // Oddments: Load input (4, 1): Bit 2: Unset
+ "tbz %x[n_channels], #1, 50f\n"
+ "ld1 { v13.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 51f\n"
+ "ld1 { v13.h }[2], [x20], #0x2\n"
+ "b 51f\n"
+ "50:" // Oddments: Load input (4, 1): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v13.h }[0], [x20], #0x2\n"
+ "51:" // Oddments: Load input (4, 1): Bit 2: End
+ "ldr x20, [x15, #0x80]\n"
+ "fmla v29.8h, v7.8h, v13.8h\n"
+ "fmla v30.8h, v6.8h, v13.8h\n"
+ "add x20, x20, x14\n"
+ "tbz %x[n_channels], #2, 53f\n"
+ "ld1 { v12.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 52f\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 55f\n"
+ "ld1 { v12.h }[6], [x20], #0x2\n"
+ "b 55f\n"
+ "52:" // Oddments: Load input (1, 1): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 55f\n"
+ "ld1 { v12.h }[4], [x20], #0x2\n"
+ "b 55f\n"
+ "53:" // Oddments: Load input (1, 1): Bit 2: Unset
+ "tbz %x[n_channels], #1, 54f\n"
+ "ld1 { v12.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 55f\n"
+ "ld1 { v12.h }[2], [x20], #0x2\n"
+ "b 55f\n"
+ "54:" // Oddments: Load input (1, 1): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v12.h }[0], [x20], #0x2\n"
+ "55:" // Oddments: Load input (1, 1): Bit 2: End
+ "ldr x20, [x15, #0x88]\n"
+ "fmla v23.8h, v4.8h, v12.8h\n"
+ "fmla v24.8h, v3.8h, v12.8h\n"
+ "add x20, x20, x14\n"
+ "fmla v26.8h, v1.8h, v12.8h\n"
+ "fmla v27.8h, v0.8h, v12.8h\n"
+ "tbz %x[n_channels], #2, 57f\n"
+ "ld1 { v11.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 56f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 59f\n"
+ "ld1 { v11.h }[6], [x20], #0x2\n"
+ "b 59f\n"
+ "56:" // Oddments: Load input (1, 3): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 59f\n"
+ "ld1 { v11.h }[4], [x20], #0x2\n"
+ "b 59f\n"
+ "57:" // Oddments: Load input (1, 3): Bit 2: Unset
+ "tbz %x[n_channels], #1, 58f\n"
+ "ld1 { v11.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 59f\n"
+ "ld1 { v11.h }[2], [x20], #0x2\n"
+ "b 59f\n"
+ "58:" // Oddments: Load input (1, 3): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v11.h }[0], [x20], #0x2\n"
+ "59:" // Oddments: Load input (1, 3): Bit 2: End
+ "ldr x20, [x15, #0x90]\n"
+ "fmla v24.8h, v5.8h, v11.8h\n"
+ "fmla v25.8h, v4.8h, v11.8h\n"
+ "add x20, x20, x14\n"
+ "fmla v27.8h, v2.8h, v11.8h\n"
+ "fmla v28.8h, v1.8h, v11.8h\n"
+ "tbz %x[n_channels], #2, 61f\n"
+ "ld1 { v13.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 60f\n"
+ "ld1 { v13.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 63f\n"
+ "ld1 { v13.h }[6], [x20], #0x2\n"
+ "b 63f\n"
+ "60:" // Oddments: Load input (4, 3): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 63f\n"
+ "ld1 { v13.h }[4], [x20], #0x2\n"
+ "b 63f\n"
+ "61:" // Oddments: Load input (4, 3): Bit 2: Unset
+ "tbz %x[n_channels], #1, 62f\n"
+ "ld1 { v13.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 63f\n"
+ "ld1 { v13.h }[2], [x20], #0x2\n"
+ "b 63f\n"
+ "62:" // Oddments: Load input (4, 3): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v13.h }[0], [x20], #0x2\n"
+ "63:" // Oddments: Load input (4, 3): Bit 2: End
+ "ldr x20, [x15, #0x98]\n"
+ "fmla v30.8h, v8.8h, v13.8h\n"
+ "fmla v31.8h, v7.8h, v13.8h\n"
+ "add x20, x20, x14\n"
+ "tbz %x[n_channels], #2, 65f\n"
+ "ld1 { v12.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 64f\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 67f\n"
+ "ld1 { v12.h }[6], [x20], #0x2\n"
+ "b 67f\n"
+ "64:" // Oddments: Load input (3, 1): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 67f\n"
+ "ld1 { v12.h }[4], [x20], #0x2\n"
+ "b 67f\n"
+ "65:" // Oddments: Load input (3, 1): Bit 2: Unset
+ "tbz %x[n_channels], #1, 66f\n"
+ "ld1 { v12.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 67f\n"
+ "ld1 { v12.h }[2], [x20], #0x2\n"
+ "b 67f\n"
+ "66:" // Oddments: Load input (3, 1): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v12.h }[0], [x20], #0x2\n"
+ "67:" // Oddments: Load input (3, 1): Bit 2: End
+ "ldr x20, [x15, #0xa0]\n"
+ "fmla v26.8h, v7.8h, v12.8h\n"
+ "fmla v27.8h, v6.8h, v12.8h\n"
+ "add x20, x20, x14\n"
+ "fmla v29.8h, v4.8h, v12.8h\n"
+ "fmla v30.8h, v3.8h, v12.8h\n"
+ "tbz %x[n_channels], #2, 69f\n"
+ "ld1 { v11.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 68f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 71f\n"
+ "ld1 { v11.h }[6], [x20], #0x2\n"
+ "b 71f\n"
+ "68:" // Oddments: Load input (0, 2): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 71f\n"
+ "ld1 { v11.h }[4], [x20], #0x2\n"
+ "b 71f\n"
+ "69:" // Oddments: Load input (0, 2): Bit 2: Unset
+ "tbz %x[n_channels], #1, 70f\n"
+ "ld1 { v11.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 71f\n"
+ "ld1 { v11.h }[2], [x20], #0x2\n"
+ "b 71f\n"
+ "70:" // Oddments: Load input (0, 2): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v11.h }[0], [x20], #0x2\n"
+ "71:" // Oddments: Load input (0, 2): Bit 2: End
+ "ldr x20, [x15, #0xa8]\n"
+ "fmla v23.8h, v2.8h, v11.8h\n"
+ "fmla v24.8h, v1.8h, v11.8h\n"
+ "add x20, x20, x14\n"
+ "fmla v25.8h, v0.8h, v11.8h\n"
+ "tbz %x[n_channels], #2, 73f\n"
+ "ld1 { v13.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 72f\n"
+ "ld1 { v13.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 75f\n"
+ "ld1 { v13.h }[6], [x20], #0x2\n"
+ "b 75f\n"
+ "72:" // Oddments: Load input (3, 3): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 75f\n"
+ "ld1 { v13.h }[4], [x20], #0x2\n"
+ "b 75f\n"
+ "73:" // Oddments: Load input (3, 3): Bit 2: Unset
+ "tbz %x[n_channels], #1, 74f\n"
+ "ld1 { v13.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 75f\n"
+ "ld1 { v13.h }[2], [x20], #0x2\n"
+ "b 75f\n"
+ "74:" // Oddments: Load input (3, 3): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v13.h }[0], [x20], #0x2\n"
+ "75:" // Oddments: Load input (3, 3): Bit 2: End
+ "ldr x20, [x15, #0xb0]\n"
+ "fmla v27.8h, v8.8h, v13.8h\n"
+ "fmla v28.8h, v7.8h, v13.8h\n"
+ "add x20, x20, x14\n"
+ "fmla v30.8h, v5.8h, v13.8h\n"
+ "fmla v31.8h, v4.8h, v13.8h\n"
+ "tbz %x[n_channels], #2, 77f\n"
+ "ld1 { v12.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 76f\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 79f\n"
+ "ld1 { v12.h }[6], [x20], #0x2\n"
+ "b 79f\n"
+ "76:" // Oddments: Load input (2, 0): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 79f\n"
+ "ld1 { v12.h }[4], [x20], #0x2\n"
+ "b 79f\n"
+ "77:" // Oddments: Load input (2, 0): Bit 2: Unset
+ "tbz %x[n_channels], #1, 78f\n"
+ "ld1 { v12.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 79f\n"
+ "ld1 { v12.h }[2], [x20], #0x2\n"
+ "b 79f\n"
+ "78:" // Oddments: Load input (2, 0): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v12.h }[0], [x20], #0x2\n"
+ "79:" // Oddments: Load input (2, 0): Bit 2: End
+ "ldr x20, [x15, #0xb8]\n"
+ "fmla v23.8h, v6.8h, v12.8h\n"
+ "fmla v26.8h, v3.8h, v12.8h\n"
+ "add x20, x20, x14\n"
+ "fmla v29.8h, v0.8h, v12.8h\n"
+ "tbz %x[n_channels], #2, 81f\n"
+ "ld1 { v11.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 80f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 83f\n"
+ "ld1 { v11.h }[6], [x20], #0x2\n"
+ "b 83f\n"
+ "80:" // Oddments: Load input (2, 4): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 83f\n"
+ "ld1 { v11.h }[4], [x20], #0x2\n"
+ "b 83f\n"
+ "81:" // Oddments: Load input (2, 4): Bit 2: Unset
+ "tbz %x[n_channels], #1, 82f\n"
+ "ld1 { v11.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 83f\n"
+ "ld1 { v11.h }[2], [x20], #0x2\n"
+ "b 83f\n"
+ "82:" // Oddments: Load input (2, 4): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v11.h }[0], [x20], #0x2\n"
+ "83:" // Oddments: Load input (2, 4): Bit 2: End
+ "ldr x20, [x15, #0xc0]\n"
+ "fmla v25.8h, v8.8h, v11.8h\n"
+ "fmla v28.8h, v5.8h, v11.8h\n"
+ "add x20, x20, x14\n"
+ "fmla v31.8h, v2.8h, v11.8h\n"
+ "tbz %x[n_channels], #2, 85f\n"
+ "ld1 { v13.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 84f\n"
+ "ld1 { v13.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 87f\n"
+ "ld1 { v13.h }[6], [x20], #0x2\n"
+ "b 87f\n"
+ "84:" // Oddments: Load input (4, 2): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 87f\n"
+ "ld1 { v13.h }[4], [x20], #0x2\n"
+ "b 87f\n"
+ "85:" // Oddments: Load input (4, 2): Bit 2: Unset
+ "tbz %x[n_channels], #1, 86f\n"
+ "ld1 { v13.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 87f\n"
+ "ld1 { v13.h }[2], [x20], #0x2\n"
+ "b 87f\n"
+ "86:" // Oddments: Load input (4, 2): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v13.h }[0], [x20], #0x2\n"
+ "87:" // Oddments: Load input (4, 2): Bit 2: End
+ "fmla v29.8h, v8.8h, v13.8h\n"
+ "fmla v30.8h, v7.8h, v13.8h\n"
+ "fmax v23.8h, v23.8h, v15.8h\n"
+ "fmla v31.8h, v6.8h, v13.8h\n"
+ "fmax v24.8h, v24.8h, v15.8h\n"
+ "fmax v25.8h, v25.8h, v15.8h\n"
+ "fmax v26.8h, v26.8h, v15.8h\n"
+ "fmax v27.8h, v27.8h, v15.8h\n"
+ "fmax v28.8h, v28.8h, v15.8h\n"
+ "fmax v29.8h, v29.8h, v15.8h\n"
+ "fmax v30.8h, v30.8h, v15.8h\n"
+ "fmax v31.8h, v31.8h, v15.8h\n"
+ "fmin v23.8h, v23.8h, v14.8h\n"
+ "fmin v24.8h, v24.8h, v14.8h\n"
+ "fmin v25.8h, v25.8h, v14.8h\n"
+ "fmin v26.8h, v26.8h, v14.8h\n"
+ "fmin v27.8h, v27.8h, v14.8h\n"
+ "fmin v28.8h, v28.8h, v14.8h\n"
+ "fmin v29.8h, v29.8h, v14.8h\n"
+ "fmin v30.8h, v30.8h, v14.8h\n"
+ "fmin v31.8h, v31.8h, v14.8h\n"
+ "tbz %x[n_channels], #2, 89f\n"
+ "ldr x20, [x17, #0x0]\n"
+ "add x20, x20, x13\n"
+ "st1 { v23.d }[0], [x20]\n"
+ "ldr x23, [x17, #0x8]\n"
+ "ldr x22, [x17, #0x10]\n"
+ "ldr x21, [x17, #0x18]\n"
+ "add x23, x23, x13\n"
+ "add x22, x22, x13\n"
+ "ldr x20, [x17, #0x20]\n"
+ "add x21, x21, x13\n"
+ "add x20, x20, x13\n"
+ "st1 { v24.d }[0], [x23]\n"
+ "st1 { v25.d }[0], [x22]\n"
+ "ldr x23, [x17, #0x28]\n"
+ "ldr x22, [x17, #0x30]\n"
+ "add x23, x23, x13\n"
+ "st1 { v26.d }[0], [x21]\n"
+ "ldr x21, [x17, #0x38]\n"
+ "add x22, x22, x13\n"
+ "add x21, x21, x13\n"
+ "st1 { v27.d }[0], [x20]\n"
+ "ldr x20, [x17, #0x40]\n"
+ "add x20, x20, x13\n"
+ "add x13, x13, #0x8\n"
+ "st1 { v28.d }[0], [x23]\n"
+ "st1 { v29.d }[0], [x22]\n"
+ "st1 { v30.d }[0], [x21]\n"
+ "st1 { v31.d }[0], [x20]\n"
+ "tbz %x[n_channels], #1, 88f\n"
+ "ldr x20, [x17, #0x0]\n"
+ "add x20, x20, x13\n"
+ "st1 { v23.s }[2], [x20]\n"
+ "ldr x23, [x17, #0x8]\n"
+ "ldr x22, [x17, #0x10]\n"
+ "ldr x21, [x17, #0x18]\n"
+ "add x23, x23, x13\n"
+ "add x22, x22, x13\n"
+ "ldr x20, [x17, #0x20]\n"
+ "add x21, x21, x13\n"
+ "add x20, x20, x13\n"
+ "st1 { v24.s }[2], [x23]\n"
+ "st1 { v25.s }[2], [x22]\n"
+ "ldr x23, [x17, #0x28]\n"
+ "ldr x22, [x17, #0x30]\n"
+ "add x23, x23, x13\n"
+ "st1 { v26.s }[2], [x21]\n"
+ "ldr x21, [x17, #0x38]\n"
+ "add x22, x22, x13\n"
+ "add x21, x21, x13\n"
+ "st1 { v27.s }[2], [x20]\n"
+ "ldr x20, [x17, #0x40]\n"
+ "add x20, x20, x13\n"
+ "add x13, x13, #0x4\n"
+ "st1 { v28.s }[2], [x23]\n"
+ "st1 { v29.s }[2], [x22]\n"
+ "st1 { v30.s }[2], [x21]\n"
+ "st1 { v31.s }[2], [x20]\n"
+ "tbz %x[n_channels], #0, 91f\n"
+ "ldr x20, [x17, #0x0]\n"
+ "add x20, x20, x13\n"
+ "st1 { v23.h }[6], [x20]\n"
+ "ldr x23, [x17, #0x8]\n"
+ "ldr x22, [x17, #0x10]\n"
+ "ldr x21, [x17, #0x18]\n"
+ "add x23, x23, x13\n"
+ "add x22, x22, x13\n"
+ "ldr x20, [x17, #0x20]\n"
+ "add x21, x21, x13\n"
+ "add x20, x20, x13\n"
+ "st1 { v24.h }[6], [x23]\n"
+ "st1 { v25.h }[6], [x22]\n"
+ "ldr x23, [x17, #0x28]\n"
+ "ldr x22, [x17, #0x30]\n"
+ "add x23, x23, x13\n"
+ "st1 { v26.h }[6], [x21]\n"
+ "ldr x21, [x17, #0x38]\n"
+ "add x22, x22, x13\n"
+ "add x21, x21, x13\n"
+ "st1 { v27.h }[6], [x20]\n"
+ "ldr x20, [x17, #0x40]\n"
+ "add x20, x20, x13\n"
+ "st1 { v28.h }[6], [x23]\n"
+ "st1 { v29.h }[6], [x22]\n"
+ "st1 { v30.h }[6], [x21]\n"
+ "st1 { v31.h }[6], [x20]\n"
+ "b 91f\n"
+ "88:" // Oddments: Store: Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 91f\n"
+ "ldr x20, [x17, #0x0]\n"
+ "add x20, x20, x13\n"
+ "st1 { v23.h }[4], [x20]\n"
+ "ldr x23, [x17, #0x8]\n"
+ "ldr x22, [x17, #0x10]\n"
+ "ldr x21, [x17, #0x18]\n"
+ "add x23, x23, x13\n"
+ "add x22, x22, x13\n"
+ "ldr x20, [x17, #0x20]\n"
+ "add x21, x21, x13\n"
+ "add x20, x20, x13\n"
+ "st1 { v24.h }[4], [x23]\n"
+ "st1 { v25.h }[4], [x22]\n"
+ "ldr x23, [x17, #0x28]\n"
+ "ldr x22, [x17, #0x30]\n"
+ "add x23, x23, x13\n"
+ "st1 { v26.h }[4], [x21]\n"
+ "ldr x21, [x17, #0x38]\n"
+ "add x22, x22, x13\n"
+ "add x21, x21, x13\n"
+ "st1 { v27.h }[4], [x20]\n"
+ "ldr x20, [x17, #0x40]\n"
+ "add x20, x20, x13\n"
+ "st1 { v28.h }[4], [x23]\n"
+ "st1 { v29.h }[4], [x22]\n"
+ "st1 { v30.h }[4], [x21]\n"
+ "st1 { v31.h }[4], [x20]\n"
+ "b 91f\n"
+ "89:" // Oddments: Store: Bit 2: Unset
+ "tbz %x[n_channels], #1, 90f\n"
+ "ldr x20, [x17, #0x0]\n"
+ "add x20, x20, x13\n"
+ "st1 { v23.s }[0], [x20]\n"
+ "ldr x23, [x17, #0x8]\n"
+ "ldr x22, [x17, #0x10]\n"
+ "ldr x21, [x17, #0x18]\n"
+ "add x23, x23, x13\n"
+ "add x22, x22, x13\n"
+ "ldr x20, [x17, #0x20]\n"
+ "add x21, x21, x13\n"
+ "add x20, x20, x13\n"
+ "st1 { v24.s }[0], [x23]\n"
+ "st1 { v25.s }[0], [x22]\n"
+ "ldr x23, [x17, #0x28]\n"
+ "ldr x22, [x17, #0x30]\n"
+ "add x23, x23, x13\n"
+ "st1 { v26.s }[0], [x21]\n"
+ "ldr x21, [x17, #0x38]\n"
+ "add x22, x22, x13\n"
+ "add x21, x21, x13\n"
+ "st1 { v27.s }[0], [x20]\n"
+ "ldr x20, [x17, #0x40]\n"
+ "add x20, x20, x13\n"
+ "add x13, x13, #0x4\n"
+ "st1 { v28.s }[0], [x23]\n"
+ "st1 { v29.s }[0], [x22]\n"
+ "st1 { v30.s }[0], [x21]\n"
+ "st1 { v31.s }[0], [x20]\n"
+ "tbz %x[n_channels], #0, 91f\n"
+ "ldr x20, [x17, #0x0]\n"
+ "add x20, x20, x13\n"
+ "st1 { v23.h }[2], [x20]\n"
+ "ldr x23, [x17, #0x8]\n"
+ "ldr x22, [x17, #0x10]\n"
+ "ldr x21, [x17, #0x18]\n"
+ "add x23, x23, x13\n"
+ "add x22, x22, x13\n"
+ "ldr x20, [x17, #0x20]\n"
+ "add x21, x21, x13\n"
+ "add x20, x20, x13\n"
+ "st1 { v24.h }[2], [x23]\n"
+ "st1 { v25.h }[2], [x22]\n"
+ "ldr x23, [x17, #0x28]\n"
+ "ldr x22, [x17, #0x30]\n"
+ "add x23, x23, x13\n"
+ "st1 { v26.h }[2], [x21]\n"
+ "ldr x21, [x17, #0x38]\n"
+ "add x22, x22, x13\n"
+ "add x21, x21, x13\n"
+ "st1 { v27.h }[2], [x20]\n"
+ "ldr x20, [x17, #0x40]\n"
+ "add x20, x20, x13\n"
+ "st1 { v28.h }[2], [x23]\n"
+ "st1 { v29.h }[2], [x22]\n"
+ "st1 { v30.h }[2], [x21]\n"
+ "st1 { v31.h }[2], [x20]\n"
+ "b 91f\n"
+ "90:" // Oddments: Store: Bit 2: Unset: Bit 1: Unset
+ "ldr x20, [x17, #0x0]\n"
+ "add x20, x20, x13\n"
+ "st1 { v23.h }[0], [x20]\n"
+ "ldr x23, [x17, #0x8]\n"
+ "ldr x22, [x17, #0x10]\n"
+ "ldr x21, [x17, #0x18]\n"
+ "add x23, x23, x13\n"
+ "add x22, x22, x13\n"
+ "ldr x20, [x17, #0x20]\n"
+ "add x21, x21, x13\n"
+ "add x20, x20, x13\n"
+ "st1 { v24.h }[0], [x23]\n"
+ "st1 { v25.h }[0], [x22]\n"
+ "ldr x23, [x17, #0x28]\n"
+ "ldr x22, [x17, #0x30]\n"
+ "add x23, x23, x13\n"
+ "st1 { v26.h }[0], [x21]\n"
+ "ldr x21, [x17, #0x38]\n"
+ "add x22, x22, x13\n"
+ "add x21, x21, x13\n"
+ "st1 { v27.h }[0], [x20]\n"
+ "ldr x20, [x17, #0x40]\n"
+ "add x20, x20, x13\n"
+ "st1 { v28.h }[0], [x23]\n"
+ "st1 { v29.h }[0], [x22]\n"
+ "st1 { v30.h }[0], [x21]\n"
+ "st1 { v31.h }[0], [x20]\n"
+ "91:" // Oddments: Store: Bit 2: End
+ "92:" // End
+ :
+ : [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp
new file mode 100644
index 0000000000..04fb532937
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__aarch64__) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(const __fp16 *const *const input_ptrs, __fp16 *const *const outptrs, const void *params, unsigned int n_channels, const __fp16 activation_min, const __fp16 activation_max);
+void a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(const unsigned int n_tile_rows, const unsigned int n_tile_cols, const __fp16 *inptr, int64_t ld_input_row, int64_t ld_input_col, __fp16 *outptr, int64_t ld_output_row, int64_t ld_output_col, const void *params, unsigned int n_channels, const __fp16 activation_min, const __fp16 activation_max);
+
+// Strategy descriptor for the AArch64 FP16 NHWC depthwise kernel:
+// 3x3 filter, stride 1, 4x4 output tile, computed with vector FMLA
+// ("mla") instructions. The class carries only static geometry and the
+// two hand-written assembly entry points; all scheduling/tiling logic
+// lives in the generic DepthwiseDepthfirstStrategy parent.
+class a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst : public DepthwiseDepthfirstStrategy<__fp16, __fp16, __fp16, __fp16>
+{
+  private:
+  using Parent = DepthwiseDepthfirstStrategy<__fp16, __fp16, __fp16, __fp16>;
+  // Kernel taking an array of per-point input pointers (padding handled
+  // via pointer substitution by the caller).
+  Parent::IndirectKernelType m_indirect_kernel = a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl;
+  // Kernel walking a dense tile grid directly from base pointer + strides.
+  Parent::DirectKernelType m_direct_kernel = a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl;
+
+  public:
+  using return_type = __fp16;
+  // Plain NEON (fixed 128-bit vectors); not an SVE/variable-length kernel.
+  constexpr static auto vl_type = arm_gemm::VLType::None;
+
+  constexpr static unsigned int kernel_rows = 3;
+  constexpr static unsigned int kernel_cols = 3;
+
+  constexpr static unsigned int stride_rows = 1;
+  constexpr static unsigned int stride_cols = 1;
+
+  constexpr static unsigned int output_rows = 4;
+  constexpr static unsigned int output_cols = 4;
+
+  // CPUInfo is unused: this strategy has fixed geometry and the kernel
+  // choice is made by the surrounding implementation list.
+  a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst(const CPUInfo *)
+  : Parent(output_rows, output_cols, kernel_rows, kernel_cols, stride_rows, stride_cols) {}
+
+  arm_gemm::VLType get_vl_type(void) const override { return vl_type; }
+
+  Parent::IndirectKernelType get_indirect_kernel() const override { return m_indirect_kernel; }
+  Parent::DirectKernelType get_direct_kernel() const override { return m_direct_kernel; }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp
new file mode 100644
index 0000000000..a1e1dd0e99
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp
@@ -0,0 +1,1736 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__aarch64__) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(
+ const unsigned int n_tile_rows,
+ const unsigned int n_tile_cols,
+ const __fp16 *inptr,
+ int64_t ld_input_row,
+ int64_t ld_input_col,
+ __fp16 *outptr,
+ int64_t ld_output_row,
+ int64_t ld_output_col,
+ const void *params,
+ unsigned int n_channels,
+ const __fp16 activation_min,
+ const __fp16 activation_max
+)
+{
+ struct Args
+ {
+ const uint64_t n_tile_rows, n_tile_cols;
+ const __fp16 *inptr;
+ const uint64_t ld_input_row;
+ const uint64_t ld_input_col;
+ __fp16 *outptr;
+ const uint64_t ld_output_row;
+ const uint64_t ld_output_col;
+ const void *params;
+ const __fp16 min, max;
+
+ uint64_t tile_i = 0, tile_j = 0;
+
+ Args(
+ const unsigned int n_tile_rows,
+ const unsigned int n_tile_cols,
+ const __fp16 *inptr,
+ int64_t ld_input_row,
+ int64_t ld_input_col,
+ __fp16 *outptr,
+ int64_t ld_output_row,
+ int64_t ld_output_col,
+ const void *params,
+ const float activation_min,
+ const float activation_max
+ ) : n_tile_rows(n_tile_rows), n_tile_cols(n_tile_cols), inptr(inptr),
+ ld_input_row(ld_input_row), ld_input_col(ld_input_col), outptr(outptr),
+ ld_output_row(ld_output_row), ld_output_col(ld_output_col),
+ params(params), min(activation_min), max(activation_max)
+ {
+ }
+ };
+
+ Args params_struct(
+ n_tile_rows, n_tile_cols,
+ inptr, ld_input_row, ld_input_col,
+ outptr, ld_output_row, ld_output_col,
+ params, activation_min, activation_max
+ );
+
+ __asm__ __volatile__(
+ "mov x27, #0x0\n"
+ "mov x26, #0x0\n"
+ "1:" // Tile loop
+ "str x27, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "mov x25, #0x4\n"
+ "mov x23, #0x4\n"
+ "str x26, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "ldr x24, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
+ "ldr x22, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
+ "mul x21, x27, x24\n" // offset = tile_i * ld_input_row
+ "ldr x4, [%x[params_struct], %[offsetof_args_ld_input_col]]\n"
+ "ldr x5, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
+ "mul x20, x27, x22\n" // offset = tile_i * ld_output_row
+ "mov x6, #0x10\n" // cntb _, ALL, #1
+ "madd x21, x26, x4, x21\n" // offset += tile_j * ld_input_col
+ "ldr x7, [%x[params_struct], %[offsetof_args_inptr]]\n"
+ "lsl x4, x4, #0x1\n"
+ "ldr x8, [%x[params_struct], %[offsetof_args_outptr]]\n"
+ "madd x20, x26, x5, x20\n" // offset += tile_j * ld_output_col
+ "lsl x5, x5, #0x1\n"
+ "add x17, x4, x4\n"
+ "ldr x16, [%x[params_struct], %[offsetof_args_params]]\n"
+ "mul x21, x21, x25\n" // offset *= kernel_stride * output_size
+ "add x7, x7, x21, LSL #1\n" // inptr[0] += offset * sizeof(__fp16)
+ "add x15, x7, x24, LSL #1\n"
+ "mul x20, x20, x23\n" // offset *= output_tile_size
+ "add x14, x15, x24, LSL #1\n"
+ "add x8, x8, x20, LSL #1\n" // outptrs[0] += offset * sizeof(__fp16)
+ "lsr x13, %x[n_channels], #0x3\n"
+ "add x12, x14, x24, LSL #1\n"
+ "add x11, x17, x4\n"
+ "add x10, x8, x22, LSL #1\n"
+ "add x9, x12, x24, LSL #1\n"
+ "add x28, x11, x4\n"
+ "add x27, x10, x22, LSL #1\n"
+ "add x23, x5, x5\n"
+ "add x20, %x[params_struct], %[offsetof_args_min]\n"
+ "ld1r { v13.8h }, [x20]\n"
+ "add x20, %x[params_struct], %[offsetof_args_max]\n"
+ "ld1r { v15.8h }, [x20]\n"
+ "add x26, x9, x24, LSL #1\n"
+ "add x25, x28, x4\n"
+ "add x24, x27, x22, LSL #1\n"
+ "add x22, x23, x5\n"
+ "mov x21, #0x0\n"
+ "sub x20, XZR, x6\n"
+ "cbz x13, 4f\n"
+ "ldr q14, [x16, #0x0]\n"
+ "ldr q0, [x16, #0x10]\n"
+ "cmp x6, x13, LSL #4\n"
+ "ldr q1, [x16, #0x20]\n"
+ "ldr q2, [x16, #0x30]\n"
+ "ldr q3, [x16, #0x40]\n"
+ "ldr q4, [x16, #0x50]\n"
+ "ldr q5, [x16, #0x60]\n"
+ "ldr q6, [x16, #0x70]\n"
+ "ldr q7, [x16, #0x80]\n"
+ "ldr q8, [x16, #0x90]\n"
+ "add x16, x16, #0xa0\n"
+ "ldr q9, [x14, x17]\n"
+ "ld1 { v10.8h }, [x7]\n"
+ "ldr q11, [x7, x25]\n"
+ "ldr q12, [x14, x11]\n"
+ "bge 3f\n"
+ "2:" // Tile loop: Channel loop
+ "mov v26.16b, v14.16b\n fmla v26.8h, v4.8h, v9.8h\n"
+ "mov v28.16b, v14.16b\n fmla v28.8h, v8.8h, v9.8h\n"
+ "add x6, x6, #0x10\n"
+ "cmp x6, x13, LSL #4\n"
+ "mov v16.16b, v14.16b\n fmla v16.8h, v3.8h, v9.8h\n"
+ "mov v22.16b, v14.16b\n fmla v22.8h, v1.8h, v9.8h\n"
+ "add x20, x20, #0x10\n"
+ "add x21, x21, #0x10\n"
+ "mov v23.16b, v14.16b\n fmla v23.8h, v0.8h, v9.8h\n"
+ "fmla v26.8h, v5.8h, v12.8h\n"
+ "mov v25.16b, v14.16b\n fmla v25.8h, v7.8h, v9.8h\n"
+ "mov v17.16b, v14.16b\n fmla v17.8h, v6.8h, v9.8h\n"
+ "mov v31.16b, v14.16b\n fmla v31.8h, v5.8h, v9.8h\n"
+ "mov v20.16b, v14.16b\n fmla v20.8h, v2.8h, v9.8h\n"
+ "ldr q9, [x12, x17]\n"
+ "fmla v28.8h, v0.8h, v10.8h\n"
+ "ld1 { v30.8h }, [x26]\n"
+ "mov v29.16b, v14.16b\n fmla v29.8h, v2.8h, v11.8h\n"
+ "ldr q27, [x26, x25]\n"
+ "fmla v16.8h, v4.8h, v12.8h\n"
+ "fmla v22.8h, v2.8h, v12.8h\n"
+ "fmla v23.8h, v1.8h, v12.8h\n"
+ "mov v21.16b, v14.16b\n fmla v21.8h, v6.8h, v30.8h\n"
+ "ldr q10, [x12, x11]\n"
+ "fmla v26.8h, v7.8h, v9.8h\n"
+ "fmla v25.8h, v8.8h, v12.8h\n"
+ "fmla v17.8h, v7.8h, v12.8h\n"
+ "fmla v29.8h, v6.8h, v12.8h\n"
+ "mov v24.16b, v14.16b\n fmla v24.8h, v3.8h, v12.8h\n"
+ "mov v19.16b, v14.16b\n fmla v19.8h, v0.8h, v12.8h\n"
+ "ldr q11, [x7, x4]\n"
+ "mov v30.16b, v14.16b\n fmla v30.8h, v8.8h, v27.8h\n"
+ "ldr q12, [x7, x28]\n"
+ "fmla v16.8h, v6.8h, v9.8h\n"
+ "fmla v22.8h, v4.8h, v9.8h\n"
+ "fmla v23.8h, v3.8h, v9.8h\n"
+ "mov v27.16b, v14.16b\n fmla v27.8h, v1.8h, v9.8h\n"
+ "mov v18.16b, v14.16b\n fmla v18.8h, v0.8h, v9.8h\n"
+ "ldr q14, [x16, #0x0]\n"
+ "fmla v31.8h, v8.8h, v9.8h\n"
+ "fmla v20.8h, v5.8h, v9.8h\n"
+ "fmla v21.8h, v2.8h, v9.8h\n"
+ "ld1 { v9.8h }, [x15]\n"
+ "fmla v26.8h, v8.8h, v10.8h\n"
+ "fmla v28.8h, v1.8h, v11.8h\n"
+ "fmla v25.8h, v0.8h, v11.8h\n"
+ "ldr q11, [x15, x25]\n"
+ "fmla v17.8h, v2.8h, v12.8h\n"
+ "fmla v29.8h, v1.8h, v12.8h\n"
+ "ld1 { v12.8h }, [x9]\n"
+ "fmla v16.8h, v7.8h, v10.8h\n"
+ "fmla v24.8h, v6.8h, v10.8h\n"
+ "fmla v22.8h, v5.8h, v10.8h\n"
+ "fmla v23.8h, v4.8h, v10.8h\n"
+ "fmla v19.8h, v3.8h, v10.8h\n"
+ "fmla v27.8h, v2.8h, v10.8h\n"
+ "fmla v18.8h, v1.8h, v10.8h\n"
+ "fmla v30.8h, v0.8h, v10.8h\n"
+ "ldr q10, [x15, x17]\n"
+ "fmla v31.8h, v0.8h, v9.8h\n"
+ "fmla v20.8h, v6.8h, v12.8h\n"
+ "fmla v21.8h, v3.8h, v12.8h\n"
+ "ldr q12, [x9, x25]\n"
+ "fmla v26.8h, v1.8h, v10.8h\n"
+ "fmla v28.8h, v3.8h, v9.8h\n"
+ "fmla v29.8h, v5.8h, v11.8h\n"
+ "fmla v24.8h, v2.8h, v11.8h\n"
+ "ldr q11, [x15, x11]\n"
+ "fmla v25.8h, v4.8h, v10.8h\n"
+ "fmla v17.8h, v3.8h, v10.8h\n"
+ "fmla v16.8h, v0.8h, v10.8h\n"
+ "fmla v19.8h, v8.8h, v12.8h\n"
+ "fmla v30.8h, v5.8h, v12.8h\n"
+ "ldr q9, [x26, x4]\n"
+ "fmla v31.8h, v2.8h, v10.8h\n"
+ "fmla v26.8h, v2.8h, v11.8h\n"
+ "fmla v28.8h, v5.8h, v10.8h\n"
+ "ldr q10, [x14, x4]\n"
+ "fmla v25.8h, v5.8h, v11.8h\n"
+ "fmla v17.8h, v4.8h, v11.8h\n"
+ "fmla v29.8h, v3.8h, v11.8h\n"
+ "fmla v16.8h, v1.8h, v11.8h\n"
+ "fmla v24.8h, v0.8h, v11.8h\n"
+ "ldr q11, [x14, x28]\n"
+ "fmla v21.8h, v7.8h, v9.8h\n"
+ "fmla v27.8h, v6.8h, v9.8h\n"
+ "ldr q12, [x26, x28]\n"
+ "fmla v31.8h, v4.8h, v10.8h\n"
+ "fmla v26.8h, v3.8h, v10.8h\n"
+ "fmla v20.8h, v1.8h, v10.8h\n"
+ "fmla v22.8h, v0.8h, v10.8h\n"
+ "fmla v28.8h, v7.8h, v10.8h\n"
+ "fmla v25.8h, v6.8h, v10.8h\n"
+ "ldr q10, [x7, x17]\n"
+ "fmla v18.8h, v8.8h, v12.8h\n"
+ "fmla v30.8h, v7.8h, v12.8h\n"
+ "ldr q9, [x12, x4]\n"
+ "fmla v17.8h, v8.8h, v11.8h\n"
+ "fmla v29.8h, v7.8h, v11.8h\n"
+ "fmla v16.8h, v5.8h, v11.8h\n"
+ "fmla v24.8h, v4.8h, v11.8h\n"
+ "fmla v23.8h, v2.8h, v11.8h\n"
+ "fmla v19.8h, v1.8h, v11.8h\n"
+ "ldr q12, [x7, x11]\n"
+ "add x7, x7, #0x10\n"
+ "fmla v31.8h, v7.8h, v9.8h\n"
+ "fmla v26.8h, v6.8h, v9.8h\n"
+ "fmla v20.8h, v4.8h, v9.8h\n"
+ "fmla v22.8h, v3.8h, v9.8h\n"
+ "fmla v21.8h, v1.8h, v9.8h\n"
+ "fmla v27.8h, v0.8h, v9.8h\n"
+ "ldr q9, [x12, x28]\n"
+ "fmla v28.8h, v2.8h, v10.8h\n"
+ "fmla v25.8h, v1.8h, v10.8h\n"
+ "fmla v17.8h, v0.8h, v10.8h\n"
+ "ld1 { v10.8h }, [x14]\n"
+ "fmla v18.8h, v2.8h, v9.8h\n"
+ "fmla v29.8h, v0.8h, v12.8h\n"
+ "fmla v31.8h, v3.8h, v10.8h\n"
+ "fmla v20.8h, v0.8h, v10.8h\n"
+ "fmla v16.8h, v8.8h, v9.8h\n"
+ "fmla v24.8h, v7.8h, v9.8h\n"
+ "fmla v23.8h, v5.8h, v9.8h\n"
+ "fmla v19.8h, v4.8h, v9.8h\n"
+ "fmla v30.8h, v1.8h, v9.8h\n"
+ "ldr q11, [x9, x17]\n"
+ "fmla v25.8h, v2.8h, v12.8h\n"
+ "fmla v17.8h, v1.8h, v12.8h\n"
+ "ldr q12, [x14, x25]\n"
+ "add x14, x14, #0x10\n"
+ "ldr q9, [x14, x17]\n"
+ "fmla v28.8h, v6.8h, v10.8h\n"
+ "ld1 { v10.8h }, [x12]\n"
+ "fmla v27.8h, v4.8h, v11.8h\n"
+ "fmla v18.8h, v3.8h, v11.8h\n"
+ "fmla v29.8h, v8.8h, v12.8h\n"
+ "fmla v24.8h, v5.8h, v12.8h\n"
+ "fmla v19.8h, v2.8h, v12.8h\n"
+ "ldr q12, [x12, x25]\n"
+ "add x12, x12, #0x10\n"
+ "fmla v31.8h, v6.8h, v10.8h\n"
+ "fmla v20.8h, v3.8h, v10.8h\n"
+ "fmla v21.8h, v0.8h, v10.8h\n"
+ "ldr q10, [x26, x17]\n"
+ "fmla v30.8h, v2.8h, v12.8h\n"
+ "fmla v27.8h, v7.8h, v10.8h\n"
+ "fmla v18.8h, v6.8h, v10.8h\n"
+ "fmla v20.8h, v8.8h, v11.8h\n"
+ "fmla v22.8h, v7.8h, v11.8h\n"
+ "fmla v23.8h, v6.8h, v11.8h\n"
+ "fmla v21.8h, v5.8h, v11.8h\n"
+ "ldr q11, [x9, x11]\n"
+ "fmla v19.8h, v5.8h, v12.8h\n"
+ "fmla v27.8h, v5.8h, v11.8h\n"
+ "fmla v18.8h, v4.8h, v11.8h\n"
+ "fmla v30.8h, v3.8h, v11.8h\n"
+ "fmla v24.8h, v8.8h, v12.8h\n"
+ "ldr q12, [x26, x11]\n"
+ "fmla v21.8h, v8.8h, v10.8h\n"
+ "ldr q10, [x15, x4]\n"
+ "fmla v22.8h, v8.8h, v11.8h\n"
+ "fmla v23.8h, v7.8h, v11.8h\n"
+ "add x26, x26, #0x10\n"
+ "fmla v19.8h, v6.8h, v11.8h\n"
+ "ldr q11, [x15, x28]\n"
+ "fmla v27.8h, v8.8h, v12.8h\n"
+ "add x15, x15, #0x10\n"
+ "fmla v18.8h, v7.8h, v12.8h\n"
+ "fmla v30.8h, v6.8h, v12.8h\n"
+ "ldr q12, [x9, x4]\n"
+ "fmla v28.8h, v4.8h, v10.8h\n"
+ "fmla v25.8h, v3.8h, v10.8h\n"
+ "fmax v28.8h, v28.8h, v13.8h\n"
+ "fmla v31.8h, v1.8h, v10.8h\n"
+ "fmla v26.8h, v0.8h, v10.8h\n"
+ "ldr q10, [x9, x28]\n"
+ "ldr q0, [x16, #0x10]\n"
+ "fmla v17.8h, v5.8h, v11.8h\n"
+ "fmla v29.8h, v4.8h, v11.8h\n"
+ "fmax v25.8h, v25.8h, v13.8h\n"
+ "add x9, x9, #0x10\n"
+ "fmla v16.8h, v2.8h, v11.8h\n"
+ "ldr q2, [x16, #0x30]\n"
+ "fmla v24.8h, v1.8h, v11.8h\n"
+ "ldr q11, [x7, x25]\n"
+ "ldr q1, [x16, #0x20]\n"
+ "fmla v20.8h, v7.8h, v12.8h\n"
+ "fmla v22.8h, v6.8h, v12.8h\n"
+ "ldr q6, [x16, #0x70]\n"
+ "fmla v21.8h, v4.8h, v12.8h\n"
+ "fmla v27.8h, v3.8h, v12.8h\n"
+ "ldr q12, [x14, x11]\n"
+ "ldr q3, [x16, #0x40]\n"
+ "fmla v23.8h, v8.8h, v10.8h\n"
+ "ldr q8, [x16, #0x90]\n"
+ "fmla v19.8h, v7.8h, v10.8h\n"
+ "ldr q7, [x16, #0x80]\n"
+ "fmla v18.8h, v5.8h, v10.8h\n"
+ "ldr q5, [x16, #0x60]\n"
+ "fmla v30.8h, v4.8h, v10.8h\n"
+ "ld1 { v10.8h }, [x7]\n"
+ "ldr q4, [x16, #0x50]\n"
+ "fmax v17.8h, v17.8h, v13.8h\n"
+ "fmax v29.8h, v29.8h, v13.8h\n"
+ "add x16, x16, #0xa0\n"
+ "fmax v31.8h, v31.8h, v13.8h\n"
+ "fmax v26.8h, v26.8h, v13.8h\n"
+ "fmax v16.8h, v16.8h, v13.8h\n"
+ "fmax v24.8h, v24.8h, v13.8h\n"
+ "fmax v20.8h, v20.8h, v13.8h\n"
+ "fmax v22.8h, v22.8h, v13.8h\n"
+ "fmax v23.8h, v23.8h, v13.8h\n"
+ "fmax v19.8h, v19.8h, v13.8h\n"
+ "fmax v21.8h, v21.8h, v13.8h\n"
+ "fmax v27.8h, v27.8h, v13.8h\n"
+ "fmax v18.8h, v18.8h, v13.8h\n"
+ "fmax v30.8h, v30.8h, v13.8h\n"
+ "fmin v28.8h, v28.8h, v15.8h\n"
+ "fmin v25.8h, v25.8h, v15.8h\n"
+ "st1 { v28.8h }, [x8]\n"
+ "fmin v17.8h, v17.8h, v15.8h\n"
+ "fmin v29.8h, v29.8h, v15.8h\n"
+ "str q25, [x8, x5]\n"
+ "fmin v31.8h, v31.8h, v15.8h\n"
+ "fmin v26.8h, v26.8h, v15.8h\n"
+ "str q17, [x8, x23]\n"
+ "fmin v16.8h, v16.8h, v15.8h\n"
+ "fmin v24.8h, v24.8h, v15.8h\n"
+ "str q29, [x8, x22]\n"
+ "add x8, x8, #0x10\n"
+ "fmin v20.8h, v20.8h, v15.8h\n"
+ "fmin v22.8h, v22.8h, v15.8h\n"
+ "st1 { v31.8h }, [x10]\n"
+ "fmin v23.8h, v23.8h, v15.8h\n"
+ "fmin v19.8h, v19.8h, v15.8h\n"
+ "str q26, [x10, x5]\n"
+ "fmin v21.8h, v21.8h, v15.8h\n"
+ "fmin v27.8h, v27.8h, v15.8h\n"
+ "str q16, [x10, x23]\n"
+ "fmin v18.8h, v18.8h, v15.8h\n"
+ "fmin v30.8h, v30.8h, v15.8h\n"
+ "str q24, [x10, x22]\n"
+ "add x10, x10, #0x10\n"
+ "st1 { v20.8h }, [x27]\n"
+ "str q22, [x27, x5]\n"
+ "str q23, [x27, x23]\n"
+ "str q19, [x27, x22]\n"
+ "add x27, x27, #0x10\n"
+ "st1 { v21.8h }, [x24]\n"
+ "str q27, [x24, x5]\n"
+ "str q18, [x24, x23]\n"
+ "str q30, [x24, x22]\n"
+ "add x24, x24, #0x10\n"
+ "blt 2b\n"
+ "3:" // Tile loop: Channel tail
+ "mov v16.16b, v14.16b\n fmla v16.8h, v4.8h, v9.8h\n"
+ "mov v23.16b, v14.16b\n fmla v23.8h, v8.8h, v9.8h\n"
+ "mov v31.16b, v14.16b\n fmla v31.8h, v3.8h, v9.8h\n"
+ "mov v30.16b, v14.16b\n fmla v30.8h, v1.8h, v9.8h\n"
+ "mov v18.16b, v14.16b\n fmla v18.8h, v0.8h, v9.8h\n"
+ "fmla v16.8h, v5.8h, v12.8h\n"
+ "mov v17.16b, v14.16b\n fmla v17.8h, v7.8h, v9.8h\n"
+ "mov v19.16b, v14.16b\n fmla v19.8h, v6.8h, v9.8h\n"
+ "mov v28.16b, v14.16b\n fmla v28.8h, v5.8h, v9.8h\n"
+ "mov v27.16b, v14.16b\n fmla v27.8h, v2.8h, v9.8h\n"
+ "ldr q24, [x12, x17]\n"
+ "fmla v23.8h, v0.8h, v10.8h\n"
+ "ld1 { v21.8h }, [x26]\n"
+ "mov v29.16b, v14.16b\n fmla v29.8h, v2.8h, v11.8h\n"
+ "ldr q20, [x26, x25]\n"
+ "fmla v31.8h, v4.8h, v12.8h\n"
+ "fmla v30.8h, v2.8h, v12.8h\n"
+ "fmla v18.8h, v1.8h, v12.8h\n"
+ "mov v26.16b, v14.16b\n fmla v26.8h, v6.8h, v21.8h\n"
+ "ldr q9, [x12, x11]\n"
+ "fmla v16.8h, v7.8h, v24.8h\n"
+ "fmla v17.8h, v8.8h, v12.8h\n"
+ "fmla v19.8h, v7.8h, v12.8h\n"
+ "fmla v29.8h, v6.8h, v12.8h\n"
+ "mov v11.16b, v14.16b\n fmla v11.8h, v3.8h, v12.8h\n"
+ "mov v10.16b, v14.16b\n fmla v10.8h, v0.8h, v12.8h\n"
+ "ldr q22, [x7, x4]\n"
+ "mov v25.16b, v14.16b\n fmla v25.8h, v8.8h, v20.8h\n"
+ "ldr q21, [x7, x28]\n"
+ "fmla v31.8h, v6.8h, v24.8h\n"
+ "fmla v30.8h, v4.8h, v24.8h\n"
+ "fmla v18.8h, v3.8h, v24.8h\n"
+ "mov v12.16b, v14.16b\n fmla v12.8h, v1.8h, v24.8h\n"
+ "fmla v14.8h, v0.8h, v24.8h\n"
+ "fmla v28.8h, v8.8h, v24.8h\n"
+ "fmla v27.8h, v5.8h, v24.8h\n"
+ "fmla v26.8h, v2.8h, v24.8h\n"
+ "ld1 { v24.8h }, [x15]\n"
+ "fmla v16.8h, v8.8h, v9.8h\n"
+ "fmla v23.8h, v1.8h, v22.8h\n"
+ "fmla v17.8h, v0.8h, v22.8h\n"
+ "ldr q22, [x15, x25]\n"
+ "fmla v19.8h, v2.8h, v21.8h\n"
+ "fmla v29.8h, v1.8h, v21.8h\n"
+ "ld1 { v20.8h }, [x9]\n"
+ "fmla v31.8h, v7.8h, v9.8h\n"
+ "fmla v11.8h, v6.8h, v9.8h\n"
+ "fmla v30.8h, v5.8h, v9.8h\n"
+ "fmla v18.8h, v4.8h, v9.8h\n"
+ "fmla v10.8h, v3.8h, v9.8h\n"
+ "fmla v12.8h, v2.8h, v9.8h\n"
+ "fmla v14.8h, v1.8h, v9.8h\n"
+ "fmla v25.8h, v0.8h, v9.8h\n"
+ "ldr q21, [x15, x17]\n"
+ "fmla v28.8h, v0.8h, v24.8h\n"
+ "fmla v27.8h, v6.8h, v20.8h\n"
+ "fmla v26.8h, v3.8h, v20.8h\n"
+ "ldr q20, [x9, x25]\n"
+ "fmla v16.8h, v1.8h, v21.8h\n"
+ "fmla v23.8h, v3.8h, v24.8h\n"
+ "fmla v29.8h, v5.8h, v22.8h\n"
+ "fmla v11.8h, v2.8h, v22.8h\n"
+ "ldr q22, [x15, x11]\n"
+ "fmla v17.8h, v4.8h, v21.8h\n"
+ "fmla v19.8h, v3.8h, v21.8h\n"
+ "fmla v31.8h, v0.8h, v21.8h\n"
+ "fmla v10.8h, v8.8h, v20.8h\n"
+ "fmla v25.8h, v5.8h, v20.8h\n"
+ "ldr q20, [x26, x4]\n"
+ "fmla v28.8h, v2.8h, v21.8h\n"
+ "fmla v16.8h, v2.8h, v22.8h\n"
+ "fmla v23.8h, v5.8h, v21.8h\n"
+ "ldr q21, [x14, x4]\n"
+ "fmla v17.8h, v5.8h, v22.8h\n"
+ "fmla v19.8h, v4.8h, v22.8h\n"
+ "fmla v29.8h, v3.8h, v22.8h\n"
+ "fmla v31.8h, v1.8h, v22.8h\n"
+ "fmla v11.8h, v0.8h, v22.8h\n"
+ "ldr q22, [x14, x28]\n"
+ "fmla v26.8h, v7.8h, v20.8h\n"
+ "fmla v12.8h, v6.8h, v20.8h\n"
+ "ldr q20, [x26, x28]\n"
+ "fmla v28.8h, v4.8h, v21.8h\n"
+ "fmla v16.8h, v3.8h, v21.8h\n"
+ "fmla v27.8h, v1.8h, v21.8h\n"
+ "fmla v30.8h, v0.8h, v21.8h\n"
+ "fmla v23.8h, v7.8h, v21.8h\n"
+ "fmla v17.8h, v6.8h, v21.8h\n"
+ "ldr q21, [x7, x17]\n"
+ "fmla v14.8h, v8.8h, v20.8h\n"
+ "fmla v25.8h, v7.8h, v20.8h\n"
+ "ldr q20, [x12, x4]\n"
+ "fmla v19.8h, v8.8h, v22.8h\n"
+ "fmla v29.8h, v7.8h, v22.8h\n"
+ "fmla v31.8h, v5.8h, v22.8h\n"
+ "fmla v11.8h, v4.8h, v22.8h\n"
+ "fmla v18.8h, v2.8h, v22.8h\n"
+ "fmla v10.8h, v1.8h, v22.8h\n"
+ "ldr q22, [x7, x11]\n"
+ "add x7, x7, #0x10\n"
+ "fmla v28.8h, v7.8h, v20.8h\n"
+ "fmla v16.8h, v6.8h, v20.8h\n"
+ "fmla v27.8h, v4.8h, v20.8h\n"
+ "fmla v30.8h, v3.8h, v20.8h\n"
+ "fmla v26.8h, v1.8h, v20.8h\n"
+ "fmla v12.8h, v0.8h, v20.8h\n"
+ "ldr q20, [x12, x28]\n"
+ "fmla v23.8h, v2.8h, v21.8h\n"
+ "fmla v17.8h, v1.8h, v21.8h\n"
+ "fmla v19.8h, v0.8h, v21.8h\n"
+ "ld1 { v21.8h }, [x14]\n"
+ "fmla v14.8h, v2.8h, v20.8h\n"
+ "fmla v29.8h, v0.8h, v22.8h\n"
+ "fmla v28.8h, v3.8h, v21.8h\n"
+ "fmla v27.8h, v0.8h, v21.8h\n"
+ "fmla v31.8h, v8.8h, v20.8h\n"
+ "fmla v11.8h, v7.8h, v20.8h\n"
+ "fmla v18.8h, v5.8h, v20.8h\n"
+ "fmla v10.8h, v4.8h, v20.8h\n"
+ "fmla v25.8h, v1.8h, v20.8h\n"
+ "ldr q24, [x9, x17]\n"
+ "fmla v17.8h, v2.8h, v22.8h\n"
+ "fmla v19.8h, v1.8h, v22.8h\n"
+ "ldr q20, [x14, x25]\n"
+ "add x14, x14, #0x10\n"
+ "fmla v23.8h, v6.8h, v21.8h\n"
+ "ld1 { v21.8h }, [x12]\n"
+ "fmla v12.8h, v4.8h, v24.8h\n"
+ "fmla v14.8h, v3.8h, v24.8h\n"
+ "fmla v29.8h, v8.8h, v20.8h\n"
+ "fmla v11.8h, v5.8h, v20.8h\n"
+ "fmla v10.8h, v2.8h, v20.8h\n"
+ "ldr q20, [x12, x25]\n"
+ "add x12, x12, #0x10\n"
+ "fmla v28.8h, v6.8h, v21.8h\n"
+ "fmla v27.8h, v3.8h, v21.8h\n"
+ "fmla v26.8h, v0.8h, v21.8h\n"
+ "ldr q22, [x26, x17]\n"
+ "fmla v25.8h, v2.8h, v20.8h\n"
+ "fmla v12.8h, v7.8h, v22.8h\n"
+ "fmla v14.8h, v6.8h, v22.8h\n"
+ "fmla v27.8h, v8.8h, v24.8h\n"
+ "fmla v30.8h, v7.8h, v24.8h\n"
+ "fmla v18.8h, v6.8h, v24.8h\n"
+ "fmla v26.8h, v5.8h, v24.8h\n"
+ "ldr q21, [x9, x11]\n"
+ "fmla v10.8h, v5.8h, v20.8h\n"
+ "fmla v12.8h, v5.8h, v21.8h\n"
+ "fmla v14.8h, v4.8h, v21.8h\n"
+ "fmla v25.8h, v3.8h, v21.8h\n"
+ "fmla v11.8h, v8.8h, v20.8h\n"
+ "ldr q20, [x26, x11]\n"
+ "fmla v26.8h, v8.8h, v22.8h\n"
+ "ldr q9, [x15, x4]\n"
+ "fmla v30.8h, v8.8h, v21.8h\n"
+ "fmla v18.8h, v7.8h, v21.8h\n"
+ "add x26, x26, #0x10\n"
+ "fmla v10.8h, v6.8h, v21.8h\n"
+ "ldr q21, [x15, x28]\n"
+ "fmla v12.8h, v8.8h, v20.8h\n"
+ "add x15, x15, #0x10\n"
+ "fmla v14.8h, v7.8h, v20.8h\n"
+ "fmla v25.8h, v6.8h, v20.8h\n"
+ "ldr q24, [x9, x4]\n"
+ "fmla v23.8h, v4.8h, v9.8h\n"
+ "fmla v17.8h, v3.8h, v9.8h\n"
+ "fmax v23.8h, v23.8h, v13.8h\n"
+ "fmla v28.8h, v1.8h, v9.8h\n"
+ "fmla v16.8h, v0.8h, v9.8h\n"
+ "ldr q0, [x9, x28]\n"
+ "fmax v17.8h, v17.8h, v13.8h\n"
+ "fmla v19.8h, v5.8h, v21.8h\n"
+ "fmla v29.8h, v4.8h, v21.8h\n"
+ "fmax v19.8h, v19.8h, v13.8h\n"
+ "add x9, x9, #0x10\n"
+ "fmla v31.8h, v2.8h, v21.8h\n"
+ "fmla v11.8h, v1.8h, v21.8h\n"
+ "fmax v29.8h, v29.8h, v13.8h\n"
+ "fmla v27.8h, v7.8h, v24.8h\n"
+ "fmla v30.8h, v6.8h, v24.8h\n"
+ "fmax v28.8h, v28.8h, v13.8h\n"
+ "fmla v26.8h, v4.8h, v24.8h\n"
+ "fmla v12.8h, v3.8h, v24.8h\n"
+ "fmax v16.8h, v16.8h, v13.8h\n"
+ "fmla v18.8h, v8.8h, v0.8h\n"
+ "fmla v10.8h, v7.8h, v0.8h\n"
+ "fmax v31.8h, v31.8h, v13.8h\n"
+ "fmla v14.8h, v5.8h, v0.8h\n"
+ "fmla v25.8h, v4.8h, v0.8h\n"
+ "fmax v11.8h, v11.8h, v13.8h\n"
+ "fmax v27.8h, v27.8h, v13.8h\n"
+ "fmax v30.8h, v30.8h, v13.8h\n"
+ "fmax v18.8h, v18.8h, v13.8h\n"
+ "fmax v10.8h, v10.8h, v13.8h\n"
+ "fmax v26.8h, v26.8h, v13.8h\n"
+ "fmax v12.8h, v12.8h, v13.8h\n"
+ "fmax v14.8h, v14.8h, v13.8h\n"
+ "fmax v25.8h, v25.8h, v13.8h\n"
+ "fmin v23.8h, v23.8h, v15.8h\n"
+ "fmin v17.8h, v17.8h, v15.8h\n"
+ "st1 { v23.8h }, [x8]\n"
+ "fmin v19.8h, v19.8h, v15.8h\n"
+ "fmin v29.8h, v29.8h, v15.8h\n"
+ "str q17, [x8, x5]\n"
+ "fmin v28.8h, v28.8h, v15.8h\n"
+ "fmin v16.8h, v16.8h, v15.8h\n"
+ "str q19, [x8, x23]\n"
+ "fmin v31.8h, v31.8h, v15.8h\n"
+ "fmin v11.8h, v11.8h, v15.8h\n"
+ "str q29, [x8, x22]\n"
+ "add x8, x8, #0x10\n"
+ "fmin v27.8h, v27.8h, v15.8h\n"
+ "fmin v30.8h, v30.8h, v15.8h\n"
+ "st1 { v28.8h }, [x10]\n"
+ "fmin v18.8h, v18.8h, v15.8h\n"
+ "fmin v10.8h, v10.8h, v15.8h\n"
+ "str q16, [x10, x5]\n"
+ "fmin v26.8h, v26.8h, v15.8h\n"
+ "fmin v12.8h, v12.8h, v15.8h\n"
+ "str q31, [x10, x23]\n"
+ "fmin v14.8h, v14.8h, v15.8h\n"
+ "fmin v25.8h, v25.8h, v15.8h\n"
+ "str q11, [x10, x22]\n"
+ "add x10, x10, #0x10\n"
+ "st1 { v27.8h }, [x27]\n"
+ "str q30, [x27, x5]\n"
+ "str q18, [x27, x23]\n"
+ "str q10, [x27, x22]\n"
+ "add x27, x27, #0x10\n"
+ "st1 { v26.8h }, [x24]\n"
+ "str q12, [x24, x5]\n"
+ "str q14, [x24, x23]\n"
+ "str q25, [x24, x22]\n"
+ "add x24, x24, #0x10\n"
+ "4:" // Tile loop: Oddments
+ "tst %x[n_channels], #0x7\n"
+ "beq 141f\n"
+ "ldr q14, [x16, #0x0]\n"
+ "ldr q0, [x16, #0x10]\n"
+ "add x23, x14, x17\n"
+ "add x22, x7, XZR\n"
+ "ldr q1, [x16, #0x20]\n"
+ "ldr q2, [x16, #0x30]\n"
+ "add x21, x7, x25\n"
+ "add x20, x14, x11\n"
+ "ldr q3, [x16, #0x40]\n"
+ "ldr q4, [x16, #0x50]\n"
+ "ldr q5, [x16, #0x60]\n"
+ "ldr q6, [x16, #0x70]\n"
+ "ldr q7, [x16, #0x80]\n"
+ "ldr q8, [x16, #0x90]\n"
+ "tbz %x[n_channels], #2, 6f\n"
+ "ldr d9, [x23], #0x8\n"
+ "ldr d10, [x22], #0x8\n"
+ "ldr d11, [x21], #0x8\n"
+ "ldr d12, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 5f\n"
+ "ld1 { v9.s }[2], [x23], #0x4\n"
+ "ld1 { v10.s }[2], [x22], #0x4\n"
+ "ld1 { v11.s }[2], [x21], #0x4\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 8f\n"
+ "ld1 { v9.h }[6], [x23]\n"
+ "ld1 { v10.h }[6], [x22]\n"
+ "ld1 { v11.h }[6], [x21]\n"
+ "ld1 { v12.h }[6], [x20]\n"
+ "b 8f\n"
+ "5:" // Tile loop: Oddments: Load inputs: (2, 2), (0, 0), (0, 5), (2, 3): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 8f\n"
+ "ld1 { v9.h }[4], [x23]\n"
+ "ld1 { v10.h }[4], [x22]\n"
+ "ld1 { v11.h }[4], [x21]\n"
+ "ld1 { v12.h }[4], [x20]\n"
+ "b 8f\n"
+ "6:" // Tile loop: Oddments: Load inputs: (2, 2), (0, 0), (0, 5), (2, 3): Bit 2: Unset
+ "tbz %x[n_channels], #1, 7f\n"
+ "ldr s9, [x23], #0x4\n"
+ "ldr s10, [x22], #0x4\n"
+ "ldr s11, [x21], #0x4\n"
+ "ldr s12, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 8f\n"
+ "ld1 { v9.h }[2], [x23]\n"
+ "ld1 { v10.h }[2], [x22]\n"
+ "ld1 { v11.h }[2], [x21]\n"
+ "ld1 { v12.h }[2], [x20]\n"
+ "b 8f\n"
+ "7:" // Tile loop: Oddments: Load inputs: (2, 2), (0, 0), (0, 5), (2, 3): Bit 2: Unset: Bit 1: Unset
+ "ldr h9, [x23, #0x0]\n"
+ "ldr h10, [x22, #0x0]\n"
+ "ldr h11, [x21, #0x0]\n"
+ "ldr h12, [x20, #0x0]\n"
+ "8:" // Tile loop: Oddments: Load inputs: (2, 2), (0, 0), (0, 5), (2, 3): Bit 2: End
+ "mov v16.16b, v14.16b\n fmla v16.8h, v8.8h, v9.8h\n"
+ "mov v17.16b, v14.16b\n fmla v17.8h, v7.8h, v9.8h\n"
+ "add x20, x26, XZR\n"
+ "mov v18.16b, v14.16b\n fmla v18.8h, v6.8h, v9.8h\n"
+ "mov v21.16b, v14.16b\n fmla v21.8h, v4.8h, v9.8h\n"
+ "mov v22.16b, v14.16b\n fmla v22.8h, v3.8h, v9.8h\n"
+ "mov v25.16b, v14.16b\n fmla v25.8h, v1.8h, v9.8h\n"
+ "mov v26.16b, v14.16b\n fmla v26.8h, v0.8h, v9.8h\n"
+ "mov v19.16b, v14.16b\n fmla v19.8h, v2.8h, v11.8h\n"
+ "mov v20.16b, v14.16b\n fmla v20.8h, v5.8h, v9.8h\n"
+ "mov v24.16b, v14.16b\n fmla v24.8h, v2.8h, v9.8h\n"
+ "fmla v16.8h, v0.8h, v10.8h\n"
+ "fmla v17.8h, v8.8h, v12.8h\n"
+ "fmla v18.8h, v7.8h, v12.8h\n"
+ "fmla v19.8h, v6.8h, v12.8h\n"
+ "fmla v21.8h, v5.8h, v12.8h\n"
+ "fmla v22.8h, v4.8h, v12.8h\n"
+ "mov v23.16b, v14.16b\n fmla v23.8h, v3.8h, v12.8h\n"
+ "fmla v25.8h, v2.8h, v12.8h\n"
+ "fmla v26.8h, v1.8h, v12.8h\n"
+ "mov v27.16b, v14.16b\n fmla v27.8h, v0.8h, v12.8h\n"
+ "tbz %x[n_channels], #2, 10f\n"
+ "ldr d10, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 9f\n"
+ "ld1 { v10.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 12f\n"
+ "ld1 { v10.h }[6], [x20]\n"
+ "b 12f\n"
+ "9:" // Tile loop: Oddments: Load inputs: (5, 0): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 12f\n"
+ "ld1 { v10.h }[4], [x20]\n"
+ "b 12f\n"
+ "10:" // Tile loop: Oddments: Load inputs: (5, 0): Bit 2: Unset
+ "tbz %x[n_channels], #1, 11f\n"
+ "ldr s10, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 12f\n"
+ "ld1 { v10.h }[2], [x20]\n"
+ "b 12f\n"
+ "11:" // Tile loop: Oddments: Load inputs: (5, 0): Bit 2: Unset: Bit 1: Unset
+ "ldr h10, [x20, #0x0]\n"
+ "12:" // Tile loop: Oddments: Load inputs: (5, 0): Bit 2: End
+ "mov v28.16b, v14.16b\n fmla v28.8h, v6.8h, v10.8h\n"
+ "add x20, x26, x25\n"
+ "tbz %x[n_channels], #2, 14f\n"
+ "ldr d11, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 13f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 16f\n"
+ "ld1 { v11.h }[6], [x20]\n"
+ "b 16f\n"
+ "13:" // Tile loop: Oddments: Load inputs: (5, 5): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 16f\n"
+ "ld1 { v11.h }[4], [x20]\n"
+ "b 16f\n"
+ "14:" // Tile loop: Oddments: Load inputs: (5, 5): Bit 2: Unset
+ "tbz %x[n_channels], #1, 15f\n"
+ "ldr s11, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 16f\n"
+ "ld1 { v11.h }[2], [x20]\n"
+ "b 16f\n"
+ "15:" // Tile loop: Oddments: Load inputs: (5, 5): Bit 2: Unset: Bit 1: Unset
+ "ldr h11, [x20, #0x0]\n"
+ "16:" // Tile loop: Oddments: Load inputs: (5, 5): Bit 2: End
+ "mov v31.16b, v14.16b\n fmla v31.8h, v8.8h, v11.8h\n"
+ "add x20, x12, x17\n"
+ "tbz %x[n_channels], #2, 18f\n"
+ "ldr d9, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 17f\n"
+ "ld1 { v9.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 20f\n"
+ "ld1 { v9.h }[6], [x20]\n"
+ "b 20f\n"
+ "17:" // Tile loop: Oddments: Load inputs: (3, 2): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 20f\n"
+ "ld1 { v9.h }[4], [x20]\n"
+ "b 20f\n"
+ "18:" // Tile loop: Oddments: Load inputs: (3, 2): Bit 2: Unset
+ "tbz %x[n_channels], #1, 19f\n"
+ "ldr s9, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 20f\n"
+ "ld1 { v9.h }[2], [x20]\n"
+ "b 20f\n"
+ "19:" // Tile loop: Oddments: Load inputs: (3, 2): Bit 2: Unset: Bit 1: Unset
+ "ldr h9, [x20, #0x0]\n"
+ "20:" // Tile loop: Oddments: Load inputs: (3, 2): Bit 2: End
+ "fmla v20.8h, v8.8h, v9.8h\n"
+ "fmla v21.8h, v7.8h, v9.8h\n"
+ "add x20, x7, x4\n"
+ "fmla v22.8h, v6.8h, v9.8h\n"
+ "fmla v24.8h, v5.8h, v9.8h\n"
+ "fmla v25.8h, v4.8h, v9.8h\n"
+ "fmla v26.8h, v3.8h, v9.8h\n"
+ "fmla v28.8h, v2.8h, v9.8h\n"
+ "mov v29.16b, v14.16b\n fmla v29.8h, v1.8h, v9.8h\n"
+ "mov v30.16b, v14.16b\n fmla v30.8h, v0.8h, v9.8h\n"
+ "tbz %x[n_channels], #2, 22f\n"
+ "ldr d12, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 21f\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 24f\n"
+ "ld1 { v12.h }[6], [x20]\n"
+ "b 24f\n"
+ "21:" // Tile loop: Oddments: Load inputs: (0, 1): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 24f\n"
+ "ld1 { v12.h }[4], [x20]\n"
+ "b 24f\n"
+ "22:" // Tile loop: Oddments: Load inputs: (0, 1): Bit 2: Unset
+ "tbz %x[n_channels], #1, 23f\n"
+ "ldr s12, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 24f\n"
+ "ld1 { v12.h }[2], [x20]\n"
+ "b 24f\n"
+ "23:" // Tile loop: Oddments: Load inputs: (0, 1): Bit 2: Unset: Bit 1: Unset
+ "ldr h12, [x20, #0x0]\n"
+ "24:" // Tile loop: Oddments: Load inputs: (0, 1): Bit 2: End
+ "fmla v16.8h, v1.8h, v12.8h\n"
+ "fmla v17.8h, v0.8h, v12.8h\n"
+ "add x20, x7, x28\n"
+ "tbz %x[n_channels], #2, 26f\n"
+ "ldr d11, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 25f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 28f\n"
+ "ld1 { v11.h }[6], [x20]\n"
+ "b 28f\n"
+ "25:" // Tile loop: Oddments: Load inputs: (0, 4): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 28f\n"
+ "ld1 { v11.h }[4], [x20]\n"
+ "b 28f\n"
+ "26:" // Tile loop: Oddments: Load inputs: (0, 4): Bit 2: Unset
+ "tbz %x[n_channels], #1, 27f\n"
+ "ldr s11, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 28f\n"
+ "ld1 { v11.h }[2], [x20]\n"
+ "b 28f\n"
+ "27:" // Tile loop: Oddments: Load inputs: (0, 4): Bit 2: Unset: Bit 1: Unset
+ "ldr h11, [x20, #0x0]\n"
+ "28:" // Tile loop: Oddments: Load inputs: (0, 4): Bit 2: End
+ "fmla v18.8h, v2.8h, v11.8h\n"
+ "fmla v19.8h, v1.8h, v11.8h\n"
+ "add x20, x12, x11\n"
+ "tbz %x[n_channels], #2, 30f\n"
+ "ldr d10, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 29f\n"
+ "ld1 { v10.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 32f\n"
+ "ld1 { v10.h }[6], [x20]\n"
+ "b 32f\n"
+ "29:" // Tile loop: Oddments: Load inputs: (3, 3): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 32f\n"
+ "ld1 { v10.h }[4], [x20]\n"
+ "b 32f\n"
+ "30:" // Tile loop: Oddments: Load inputs: (3, 3): Bit 2: Unset
+ "tbz %x[n_channels], #1, 31f\n"
+ "ldr s10, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 32f\n"
+ "ld1 { v10.h }[2], [x20]\n"
+ "b 32f\n"
+ "31:" // Tile loop: Oddments: Load inputs: (3, 3): Bit 2: Unset: Bit 1: Unset
+ "ldr h10, [x20, #0x0]\n"
+ "32:" // Tile loop: Oddments: Load inputs: (3, 3): Bit 2: End
+ "fmla v21.8h, v8.8h, v10.8h\n"
+ "fmla v22.8h, v7.8h, v10.8h\n"
+ "add x20, x15, XZR\n"
+ "fmla v23.8h, v6.8h, v10.8h\n"
+ "fmla v25.8h, v5.8h, v10.8h\n"
+ "fmla v26.8h, v4.8h, v10.8h\n"
+ "fmla v27.8h, v3.8h, v10.8h\n"
+ "fmla v29.8h, v2.8h, v10.8h\n"
+ "fmla v30.8h, v1.8h, v10.8h\n"
+ "fmla v31.8h, v0.8h, v10.8h\n"
+ "tbz %x[n_channels], #2, 34f\n"
+ "ldr d9, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 33f\n"
+ "ld1 { v9.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 36f\n"
+ "ld1 { v9.h }[6], [x20]\n"
+ "b 36f\n"
+ "33:" // Tile loop: Oddments: Load inputs: (1, 0): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 36f\n"
+ "ld1 { v9.h }[4], [x20]\n"
+ "b 36f\n"
+ "34:" // Tile loop: Oddments: Load inputs: (1, 0): Bit 2: Unset
+ "tbz %x[n_channels], #1, 35f\n"
+ "ldr s9, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 36f\n"
+ "ld1 { v9.h }[2], [x20]\n"
+ "b 36f\n"
+ "35:" // Tile loop: Oddments: Load inputs: (1, 0): Bit 2: Unset: Bit 1: Unset
+ "ldr h9, [x20, #0x0]\n"
+ "36:" // Tile loop: Oddments: Load inputs: (1, 0): Bit 2: End
+ "fmla v16.8h, v3.8h, v9.8h\n"
+ "fmla v20.8h, v0.8h, v9.8h\n"
+ "add x20, x15, x25\n"
+ "tbz %x[n_channels], #2, 38f\n"
+ "ldr d12, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 37f\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 40f\n"
+ "ld1 { v12.h }[6], [x20]\n"
+ "b 40f\n"
+ "37:" // Tile loop: Oddments: Load inputs: (1, 5): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 40f\n"
+ "ld1 { v12.h }[4], [x20]\n"
+ "b 40f\n"
+ "38:" // Tile loop: Oddments: Load inputs: (1, 5): Bit 2: Unset
+ "tbz %x[n_channels], #1, 39f\n"
+ "ldr s12, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 40f\n"
+ "ld1 { v12.h }[2], [x20]\n"
+ "b 40f\n"
+ "39:" // Tile loop: Oddments: Load inputs: (1, 5): Bit 2: Unset: Bit 1: Unset
+ "ldr h12, [x20, #0x0]\n"
+ "40:" // Tile loop: Oddments: Load inputs: (1, 5): Bit 2: End
+ "fmla v19.8h, v5.8h, v12.8h\n"
+ "fmla v23.8h, v2.8h, v12.8h\n"
+ "add x20, x9, XZR\n"
+ "tbz %x[n_channels], #2, 42f\n"
+ "ldr d11, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 41f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 44f\n"
+ "ld1 { v11.h }[6], [x20]\n"
+ "b 44f\n"
+ "41:" // Tile loop: Oddments: Load inputs: (4, 0): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 44f\n"
+ "ld1 { v11.h }[4], [x20]\n"
+ "b 44f\n"
+ "42:" // Tile loop: Oddments: Load inputs: (4, 0): Bit 2: Unset
+ "tbz %x[n_channels], #1, 43f\n"
+ "ldr s11, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 44f\n"
+ "ld1 { v11.h }[2], [x20]\n"
+ "b 44f\n"
+ "43:" // Tile loop: Oddments: Load inputs: (4, 0): Bit 2: Unset: Bit 1: Unset
+ "ldr h11, [x20, #0x0]\n"
+ "44:" // Tile loop: Oddments: Load inputs: (4, 0): Bit 2: End
+ "fmla v24.8h, v6.8h, v11.8h\n"
+ "fmla v28.8h, v3.8h, v11.8h\n"
+ "add x20, x15, x17\n"
+ "tbz %x[n_channels], #2, 46f\n"
+ "ldr d10, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 45f\n"
+ "ld1 { v10.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 48f\n"
+ "ld1 { v10.h }[6], [x20]\n"
+ "b 48f\n"
+ "45:" // Tile loop: Oddments: Load inputs: (1, 2): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 48f\n"
+ "ld1 { v10.h }[4], [x20]\n"
+ "b 48f\n"
+ "46:" // Tile loop: Oddments: Load inputs: (1, 2): Bit 2: Unset
+ "tbz %x[n_channels], #1, 47f\n"
+ "ldr s10, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 48f\n"
+ "ld1 { v10.h }[2], [x20]\n"
+ "b 48f\n"
+ "47:" // Tile loop: Oddments: Load inputs: (1, 2): Bit 2: Unset: Bit 1: Unset
+ "ldr h10, [x20, #0x0]\n"
+ "48:" // Tile loop: Oddments: Load inputs: (1, 2): Bit 2: End
+ "fmla v16.8h, v5.8h, v10.8h\n"
+ "fmla v17.8h, v4.8h, v10.8h\n"
+ "add x20, x9, x25\n"
+ "fmla v18.8h, v3.8h, v10.8h\n"
+ "fmla v20.8h, v2.8h, v10.8h\n"
+ "fmla v21.8h, v1.8h, v10.8h\n"
+ "fmla v22.8h, v0.8h, v10.8h\n"
+ "tbz %x[n_channels], #2, 50f\n"
+ "ldr d11, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 49f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 52f\n"
+ "ld1 { v11.h }[6], [x20]\n"
+ "b 52f\n"
+ "49:" // Tile loop: Oddments: Load inputs: (4, 5): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 52f\n"
+ "ld1 { v11.h }[4], [x20]\n"
+ "b 52f\n"
+ "50:" // Tile loop: Oddments: Load inputs: (4, 5): Bit 2: Unset
+ "tbz %x[n_channels], #1, 51f\n"
+ "ldr s11, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 52f\n"
+ "ld1 { v11.h }[2], [x20]\n"
+ "b 52f\n"
+ "51:" // Tile loop: Oddments: Load inputs: (4, 5): Bit 2: Unset: Bit 1: Unset
+ "ldr h11, [x20, #0x0]\n"
+ "52:" // Tile loop: Oddments: Load inputs: (4, 5): Bit 2: End
+ "fmla v27.8h, v8.8h, v11.8h\n"
+ "fmla v31.8h, v5.8h, v11.8h\n"
+ "add x20, x15, x11\n"
+ "tbz %x[n_channels], #2, 54f\n"
+ "ldr d12, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 53f\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 56f\n"
+ "ld1 { v12.h }[6], [x20]\n"
+ "b 56f\n"
+ "53:" // Tile loop: Oddments: Load inputs: (1, 3): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 56f\n"
+ "ld1 { v12.h }[4], [x20]\n"
+ "b 56f\n"
+ "54:" // Tile loop: Oddments: Load inputs: (1, 3): Bit 2: Unset
+ "tbz %x[n_channels], #1, 55f\n"
+ "ldr s12, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 56f\n"
+ "ld1 { v12.h }[2], [x20]\n"
+ "b 56f\n"
+ "55:" // Tile loop: Oddments: Load inputs: (1, 3): Bit 2: Unset: Bit 1: Unset
+ "ldr h12, [x20, #0x0]\n"
+ "56:" // Tile loop: Oddments: Load inputs: (1, 3): Bit 2: End
+ "fmla v17.8h, v5.8h, v12.8h\n"
+ "fmla v18.8h, v4.8h, v12.8h\n"
+ "add x20, x26, x4\n"
+ "fmla v19.8h, v3.8h, v12.8h\n"
+ "fmla v21.8h, v2.8h, v12.8h\n"
+ "fmla v22.8h, v1.8h, v12.8h\n"
+ "fmla v23.8h, v0.8h, v12.8h\n"
+ "tbz %x[n_channels], #2, 58f\n"
+ "ldr d11, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 57f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 60f\n"
+ "ld1 { v11.h }[6], [x20]\n"
+ "b 60f\n"
+ "57:" // Tile loop: Oddments: Load inputs: (5, 1): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 60f\n"
+ "ld1 { v11.h }[4], [x20]\n"
+ "b 60f\n"
+ "58:" // Tile loop: Oddments: Load inputs: (5, 1): Bit 2: Unset
+ "tbz %x[n_channels], #1, 59f\n"
+ "ldr s11, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 60f\n"
+ "ld1 { v11.h }[2], [x20]\n"
+ "b 60f\n"
+ "59:" // Tile loop: Oddments: Load inputs: (5, 1): Bit 2: Unset: Bit 1: Unset
+ "ldr h11, [x20, #0x0]\n"
+ "60:" // Tile loop: Oddments: Load inputs: (5, 1): Bit 2: End
+ "fmla v28.8h, v7.8h, v11.8h\n"
+ "fmla v29.8h, v6.8h, v11.8h\n"
+ "add x20, x14, x4\n"
+ "tbz %x[n_channels], #2, 62f\n"
+ "ldr d10, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 61f\n"
+ "ld1 { v10.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 64f\n"
+ "ld1 { v10.h }[6], [x20]\n"
+ "b 64f\n"
+ "61:" // Tile loop: Oddments: Load inputs: (2, 1): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 64f\n"
+ "ld1 { v10.h }[4], [x20]\n"
+ "b 64f\n"
+ "62:" // Tile loop: Oddments: Load inputs: (2, 1): Bit 2: Unset
+ "tbz %x[n_channels], #1, 63f\n"
+ "ldr s10, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 64f\n"
+ "ld1 { v10.h }[2], [x20]\n"
+ "b 64f\n"
+ "63:" // Tile loop: Oddments: Load inputs: (2, 1): Bit 2: Unset: Bit 1: Unset
+ "ldr h10, [x20, #0x0]\n"
+ "64:" // Tile loop: Oddments: Load inputs: (2, 1): Bit 2: End
+ "fmla v16.8h, v7.8h, v10.8h\n"
+ "fmla v17.8h, v6.8h, v10.8h\n"
+ "add x20, x26, x28\n"
+ "fmla v20.8h, v4.8h, v10.8h\n"
+ "fmla v21.8h, v3.8h, v10.8h\n"
+ "fmla v24.8h, v1.8h, v10.8h\n"
+ "fmla v25.8h, v0.8h, v10.8h\n"
+ "tbz %x[n_channels], #2, 66f\n"
+ "ldr d11, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 65f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 68f\n"
+ "ld1 { v11.h }[6], [x20]\n"
+ "b 68f\n"
+ "65:" // Tile loop: Oddments: Load inputs: (5, 4): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 68f\n"
+ "ld1 { v11.h }[4], [x20]\n"
+ "b 68f\n"
+ "66:" // Tile loop: Oddments: Load inputs: (5, 4): Bit 2: Unset
+ "tbz %x[n_channels], #1, 67f\n"
+ "ldr s11, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 68f\n"
+ "ld1 { v11.h }[2], [x20]\n"
+ "b 68f\n"
+ "67:" // Tile loop: Oddments: Load inputs: (5, 4): Bit 2: Unset: Bit 1: Unset
+ "ldr h11, [x20, #0x0]\n"
+ "68:" // Tile loop: Oddments: Load inputs: (5, 4): Bit 2: End
+ "fmla v30.8h, v8.8h, v11.8h\n"
+ "fmla v31.8h, v7.8h, v11.8h\n"
+ "add x20, x14, x28\n"
+ "tbz %x[n_channels], #2, 70f\n"
+ "ldr d12, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 69f\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 72f\n"
+ "ld1 { v12.h }[6], [x20]\n"
+ "b 72f\n"
+ "69:" // Tile loop: Oddments: Load inputs: (2, 4): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 72f\n"
+ "ld1 { v12.h }[4], [x20]\n"
+ "b 72f\n"
+ "70:" // Tile loop: Oddments: Load inputs: (2, 4): Bit 2: Unset
+ "tbz %x[n_channels], #1, 71f\n"
+ "ldr s12, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 72f\n"
+ "ld1 { v12.h }[2], [x20]\n"
+ "b 72f\n"
+ "71:" // Tile loop: Oddments: Load inputs: (2, 4): Bit 2: Unset: Bit 1: Unset
+ "ldr h12, [x20, #0x0]\n"
+ "72:" // Tile loop: Oddments: Load inputs: (2, 4): Bit 2: End
+ "fmla v18.8h, v8.8h, v12.8h\n"
+ "fmla v19.8h, v7.8h, v12.8h\n"
+ "add x20, x7, x17\n"
+ "fmla v22.8h, v5.8h, v12.8h\n"
+ "fmla v23.8h, v4.8h, v12.8h\n"
+ "fmla v26.8h, v2.8h, v12.8h\n"
+ "fmla v27.8h, v1.8h, v12.8h\n"
+ "tbz %x[n_channels], #2, 74f\n"
+ "ldr d10, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 73f\n"
+ "ld1 { v10.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 76f\n"
+ "ld1 { v10.h }[6], [x20]\n"
+ "b 76f\n"
+ "73:" // Tile loop: Oddments: Load inputs: (0, 2): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 76f\n"
+ "ld1 { v10.h }[4], [x20]\n"
+ "b 76f\n"
+ "74:" // Tile loop: Oddments: Load inputs: (0, 2): Bit 2: Unset
+ "tbz %x[n_channels], #1, 75f\n"
+ "ldr s10, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 76f\n"
+ "ld1 { v10.h }[2], [x20]\n"
+ "b 76f\n"
+ "75:" // Tile loop: Oddments: Load inputs: (0, 2): Bit 2: Unset: Bit 1: Unset
+ "ldr h10, [x20, #0x0]\n"
+ "76:" // Tile loop: Oddments: Load inputs: (0, 2): Bit 2: End
+ "fmla v16.8h, v2.8h, v10.8h\n"
+ "fmla v17.8h, v1.8h, v10.8h\n"
+ "add x20, x12, x4\n"
+ "fmla v18.8h, v0.8h, v10.8h\n"
+ "tbz %x[n_channels], #2, 78f\n"
+ "ldr d11, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 77f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 80f\n"
+ "ld1 { v11.h }[6], [x20]\n"
+ "b 80f\n"
+ "77:" // Tile loop: Oddments: Load inputs: (3, 1): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 80f\n"
+ "ld1 { v11.h }[4], [x20]\n"
+ "b 80f\n"
+ "78:" // Tile loop: Oddments: Load inputs: (3, 1): Bit 2: Unset
+ "tbz %x[n_channels], #1, 79f\n"
+ "ldr s11, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 80f\n"
+ "ld1 { v11.h }[2], [x20]\n"
+ "b 80f\n"
+ "79:" // Tile loop: Oddments: Load inputs: (3, 1): Bit 2: Unset: Bit 1: Unset
+ "ldr h11, [x20, #0x0]\n"
+ "80:" // Tile loop: Oddments: Load inputs: (3, 1): Bit 2: End
+ "fmla v20.8h, v7.8h, v11.8h\n"
+ "fmla v21.8h, v6.8h, v11.8h\n"
+ "add x20, x7, x11\n"
+ "fmla v24.8h, v4.8h, v11.8h\n"
+ "fmla v25.8h, v3.8h, v11.8h\n"
+ "fmla v28.8h, v1.8h, v11.8h\n"
+ "fmla v29.8h, v0.8h, v11.8h\n"
+ "tbz %x[n_channels], #2, 82f\n"
+ "ldr d12, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 81f\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 84f\n"
+ "ld1 { v12.h }[6], [x20]\n"
+ "b 84f\n"
+ "81:" // Tile loop: Oddments: Load inputs: (0, 3): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 84f\n"
+ "ld1 { v12.h }[4], [x20]\n"
+ "b 84f\n"
+ "82:" // Tile loop: Oddments: Load inputs: (0, 3): Bit 2: Unset
+ "tbz %x[n_channels], #1, 83f\n"
+ "ldr s12, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 84f\n"
+ "ld1 { v12.h }[2], [x20]\n"
+ "b 84f\n"
+ "83:" // Tile loop: Oddments: Load inputs: (0, 3): Bit 2: Unset: Bit 1: Unset
+ "ldr h12, [x20, #0x0]\n"
+ "84:" // Tile loop: Oddments: Load inputs: (0, 3): Bit 2: End
+ "fmla v17.8h, v2.8h, v12.8h\n"
+ "fmla v18.8h, v1.8h, v12.8h\n"
+ "add x20, x14, XZR\n"
+ "fmla v19.8h, v0.8h, v12.8h\n"
+ "tbz %x[n_channels], #2, 86f\n"
+ "ldr d10, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 85f\n"
+ "ld1 { v10.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 88f\n"
+ "ld1 { v10.h }[6], [x20]\n"
+ "b 88f\n"
+ "85:" // Tile loop: Oddments: Load inputs: (2, 0): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 88f\n"
+ "ld1 { v10.h }[4], [x20]\n"
+ "b 88f\n"
+ "86:" // Tile loop: Oddments: Load inputs: (2, 0): Bit 2: Unset
+ "tbz %x[n_channels], #1, 87f\n"
+ "ldr s10, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 88f\n"
+ "ld1 { v10.h }[2], [x20]\n"
+ "b 88f\n"
+ "87:" // Tile loop: Oddments: Load inputs: (2, 0): Bit 2: Unset: Bit 1: Unset
+ "ldr h10, [x20, #0x0]\n"
+ "88:" // Tile loop: Oddments: Load inputs: (2, 0): Bit 2: End
+ "fmla v16.8h, v6.8h, v10.8h\n"
+ "fmla v20.8h, v3.8h, v10.8h\n"
+ "add x20, x12, x28\n"
+ "fmla v24.8h, v0.8h, v10.8h\n"
+ "tbz %x[n_channels], #2, 90f\n"
+ "ldr d11, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 89f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 92f\n"
+ "ld1 { v11.h }[6], [x20]\n"
+ "b 92f\n"
+ "89:" // Tile loop: Oddments: Load inputs: (3, 4): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 92f\n"
+ "ld1 { v11.h }[4], [x20]\n"
+ "b 92f\n"
+ "90:" // Tile loop: Oddments: Load inputs: (3, 4): Bit 2: Unset
+ "tbz %x[n_channels], #1, 91f\n"
+ "ldr s11, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 92f\n"
+ "ld1 { v11.h }[2], [x20]\n"
+ "b 92f\n"
+ "91:" // Tile loop: Oddments: Load inputs: (3, 4): Bit 2: Unset: Bit 1: Unset
+ "ldr h11, [x20, #0x0]\n"
+ "92:" // Tile loop: Oddments: Load inputs: (3, 4): Bit 2: End
+ "fmla v22.8h, v8.8h, v11.8h\n"
+ "fmla v23.8h, v7.8h, v11.8h\n"
+ "add x20, x14, x25\n"
+ "fmla v26.8h, v5.8h, v11.8h\n"
+ "fmla v27.8h, v4.8h, v11.8h\n"
+ "fmla v30.8h, v2.8h, v11.8h\n"
+ "fmla v31.8h, v1.8h, v11.8h\n"
+ "tbz %x[n_channels], #2, 94f\n"
+ "ldr d12, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 93f\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 96f\n"
+ "ld1 { v12.h }[6], [x20]\n"
+ "b 96f\n"
+ "93:" // Tile loop: Oddments: Load inputs: (2, 5): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 96f\n"
+ "ld1 { v12.h }[4], [x20]\n"
+ "b 96f\n"
+ "94:" // Tile loop: Oddments: Load inputs: (2, 5): Bit 2: Unset
+ "tbz %x[n_channels], #1, 95f\n"
+ "ldr s12, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 96f\n"
+ "ld1 { v12.h }[2], [x20]\n"
+ "b 96f\n"
+ "95:" // Tile loop: Oddments: Load inputs: (2, 5): Bit 2: Unset: Bit 1: Unset
+ "ldr h12, [x20, #0x0]\n"
+ "96:" // Tile loop: Oddments: Load inputs: (2, 5): Bit 2: End
+ "fmla v19.8h, v8.8h, v12.8h\n"
+ "fmla v23.8h, v5.8h, v12.8h\n"
+ "add x20, x12, XZR\n"
+ "fmla v27.8h, v2.8h, v12.8h\n"
+ "tbz %x[n_channels], #2, 98f\n"
+ "ldr d10, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 97f\n"
+ "ld1 { v10.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 100f\n"
+ "ld1 { v10.h }[6], [x20]\n"
+ "b 100f\n"
+ "97:" // Tile loop: Oddments: Load inputs: (3, 0): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 100f\n"
+ "ld1 { v10.h }[4], [x20]\n"
+ "b 100f\n"
+ "98:" // Tile loop: Oddments: Load inputs: (3, 0): Bit 2: Unset
+ "tbz %x[n_channels], #1, 99f\n"
+ "ldr s10, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 100f\n"
+ "ld1 { v10.h }[2], [x20]\n"
+ "b 100f\n"
+ "99:" // Tile loop: Oddments: Load inputs: (3, 0): Bit 2: Unset: Bit 1: Unset
+ "ldr h10, [x20, #0x0]\n"
+ "100:" // Tile loop: Oddments: Load inputs: (3, 0): Bit 2: End
+ "fmla v20.8h, v6.8h, v10.8h\n"
+ "fmla v24.8h, v3.8h, v10.8h\n"
+ "add x20, x9, x17\n"
+ "fmla v28.8h, v0.8h, v10.8h\n"
+ "tbz %x[n_channels], #2, 102f\n"
+ "ldr d11, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 101f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 104f\n"
+ "ld1 { v11.h }[6], [x20]\n"
+ "b 104f\n"
+ "101:" // Tile loop: Oddments: Load inputs: (4, 2): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 104f\n"
+ "ld1 { v11.h }[4], [x20]\n"
+ "b 104f\n"
+ "102:" // Tile loop: Oddments: Load inputs: (4, 2): Bit 2: Unset
+ "tbz %x[n_channels], #1, 103f\n"
+ "ldr s11, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 104f\n"
+ "ld1 { v11.h }[2], [x20]\n"
+ "b 104f\n"
+ "103:" // Tile loop: Oddments: Load inputs: (4, 2): Bit 2: Unset: Bit 1: Unset
+ "ldr h11, [x20, #0x0]\n"
+ "104:" // Tile loop: Oddments: Load inputs: (4, 2): Bit 2: End
+ "fmla v24.8h, v8.8h, v11.8h\n"
+ "fmla v25.8h, v7.8h, v11.8h\n"
+ "add x20, x12, x25\n"
+ "fmla v26.8h, v6.8h, v11.8h\n"
+ "fmla v28.8h, v5.8h, v11.8h\n"
+ "fmla v29.8h, v4.8h, v11.8h\n"
+ "fmla v30.8h, v3.8h, v11.8h\n"
+ "tbz %x[n_channels], #2, 106f\n"
+ "ldr d12, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 105f\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 108f\n"
+ "ld1 { v12.h }[6], [x20]\n"
+ "b 108f\n"
+ "105:" // Tile loop: Oddments: Load inputs: (3, 5): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 108f\n"
+ "ld1 { v12.h }[4], [x20]\n"
+ "b 108f\n"
+ "106:" // Tile loop: Oddments: Load inputs: (3, 5): Bit 2: Unset
+ "tbz %x[n_channels], #1, 107f\n"
+ "ldr s12, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 108f\n"
+ "ld1 { v12.h }[2], [x20]\n"
+ "b 108f\n"
+ "107:" // Tile loop: Oddments: Load inputs: (3, 5): Bit 2: Unset: Bit 1: Unset
+ "ldr h12, [x20, #0x0]\n"
+ "108:" // Tile loop: Oddments: Load inputs: (3, 5): Bit 2: End
+ "fmla v23.8h, v8.8h, v12.8h\n"
+ "fmla v27.8h, v5.8h, v12.8h\n"
+ "add x20, x26, x17\n"
+ "fmla v31.8h, v2.8h, v12.8h\n"
+ "tbz %x[n_channels], #2, 110f\n"
+ "ldr d10, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 109f\n"
+ "ld1 { v10.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 112f\n"
+ "ld1 { v10.h }[6], [x20]\n"
+ "b 112f\n"
+ "109:" // Tile loop: Oddments: Load inputs: (5, 2): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 112f\n"
+ "ld1 { v10.h }[4], [x20]\n"
+ "b 112f\n"
+ "110:" // Tile loop: Oddments: Load inputs: (5, 2): Bit 2: Unset
+ "tbz %x[n_channels], #1, 111f\n"
+ "ldr s10, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 112f\n"
+ "ld1 { v10.h }[2], [x20]\n"
+ "b 112f\n"
+ "111:" // Tile loop: Oddments: Load inputs: (5, 2): Bit 2: Unset: Bit 1: Unset
+ "ldr h10, [x20, #0x0]\n"
+ "112:" // Tile loop: Oddments: Load inputs: (5, 2): Bit 2: End
+ "fmla v28.8h, v8.8h, v10.8h\n"
+ "fmla v29.8h, v7.8h, v10.8h\n"
+ "add x20, x9, x11\n"
+ "fmla v30.8h, v6.8h, v10.8h\n"
+ "tbz %x[n_channels], #2, 114f\n"
+ "ldr d11, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 113f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 116f\n"
+ "ld1 { v11.h }[6], [x20]\n"
+ "b 116f\n"
+ "113:" // Tile loop: Oddments: Load inputs: (4, 3): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 116f\n"
+ "ld1 { v11.h }[4], [x20]\n"
+ "b 116f\n"
+ "114:" // Tile loop: Oddments: Load inputs: (4, 3): Bit 2: Unset
+ "tbz %x[n_channels], #1, 115f\n"
+ "ldr s11, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 116f\n"
+ "ld1 { v11.h }[2], [x20]\n"
+ "b 116f\n"
+ "115:" // Tile loop: Oddments: Load inputs: (4, 3): Bit 2: Unset: Bit 1: Unset
+ "ldr h11, [x20, #0x0]\n"
+ "116:" // Tile loop: Oddments: Load inputs: (4, 3): Bit 2: End
+ "fmla v25.8h, v8.8h, v11.8h\n"
+ "fmla v26.8h, v7.8h, v11.8h\n"
+ "add x20, x26, x11\n"
+ "fmla v27.8h, v6.8h, v11.8h\n"
+ "fmla v29.8h, v5.8h, v11.8h\n"
+ "fmla v30.8h, v4.8h, v11.8h\n"
+ "fmla v31.8h, v3.8h, v11.8h\n"
+ "tbz %x[n_channels], #2, 118f\n"
+ "ldr d12, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 117f\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 120f\n"
+ "ld1 { v12.h }[6], [x20]\n"
+ "b 120f\n"
+ "117:" // Tile loop: Oddments: Load inputs: (5, 3): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 120f\n"
+ "ld1 { v12.h }[4], [x20]\n"
+ "b 120f\n"
+ "118:" // Tile loop: Oddments: Load inputs: (5, 3): Bit 2: Unset
+ "tbz %x[n_channels], #1, 119f\n"
+ "ldr s12, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 120f\n"
+ "ld1 { v12.h }[2], [x20]\n"
+ "b 120f\n"
+ "119:" // Tile loop: Oddments: Load inputs: (5, 3): Bit 2: Unset: Bit 1: Unset
+ "ldr h12, [x20, #0x0]\n"
+ "120:" // Tile loop: Oddments: Load inputs: (5, 3): Bit 2: End
+ "fmla v29.8h, v8.8h, v12.8h\n"
+ "fmla v30.8h, v7.8h, v12.8h\n"
+ "add x20, x15, x4\n"
+ "fmla v31.8h, v6.8h, v12.8h\n"
+ "tbz %x[n_channels], #2, 122f\n"
+ "ldr d10, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 121f\n"
+ "ld1 { v10.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 124f\n"
+ "ld1 { v10.h }[6], [x20]\n"
+ "b 124f\n"
+ "121:" // Tile loop: Oddments: Load inputs: (1, 1): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 124f\n"
+ "ld1 { v10.h }[4], [x20]\n"
+ "b 124f\n"
+ "122:" // Tile loop: Oddments: Load inputs: (1, 1): Bit 2: Unset
+ "tbz %x[n_channels], #1, 123f\n"
+ "ldr s10, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 124f\n"
+ "ld1 { v10.h }[2], [x20]\n"
+ "b 124f\n"
+ "123:" // Tile loop: Oddments: Load inputs: (1, 1): Bit 2: Unset: Bit 1: Unset
+ "ldr h10, [x20, #0x0]\n"
+ "124:" // Tile loop: Oddments: Load inputs: (1, 1): Bit 2: End
+ "fmla v16.8h, v4.8h, v10.8h\n"
+ "fmla v17.8h, v3.8h, v10.8h\n"
+ "add x20, x15, x28\n"
+ "fmla v20.8h, v1.8h, v10.8h\n"
+ "fmla v21.8h, v0.8h, v10.8h\n"
+ "tbz %x[n_channels], #2, 126f\n"
+ "ldr d11, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 125f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 128f\n"
+ "ld1 { v11.h }[6], [x20]\n"
+ "b 128f\n"
+ "125:" // Tile loop: Oddments: Load inputs: (1, 4): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 128f\n"
+ "ld1 { v11.h }[4], [x20]\n"
+ "b 128f\n"
+ "126:" // Tile loop: Oddments: Load inputs: (1, 4): Bit 2: Unset
+ "tbz %x[n_channels], #1, 127f\n"
+ "ldr s11, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 128f\n"
+ "ld1 { v11.h }[2], [x20]\n"
+ "b 128f\n"
+ "127:" // Tile loop: Oddments: Load inputs: (1, 4): Bit 2: Unset: Bit 1: Unset
+ "ldr h11, [x20, #0x0]\n"
+ "128:" // Tile loop: Oddments: Load inputs: (1, 4): Bit 2: End
+ "fmla v18.8h, v5.8h, v11.8h\n"
+ "fmla v19.8h, v4.8h, v11.8h\n"
+ "add x20, x9, x4\n"
+ "fmla v22.8h, v2.8h, v11.8h\n"
+ "fmla v23.8h, v1.8h, v11.8h\n"
+ "tbz %x[n_channels], #2, 130f\n"
+ "ldr d12, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 129f\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 132f\n"
+ "ld1 { v12.h }[6], [x20]\n"
+ "b 132f\n"
+ "129:" // Tile loop: Oddments: Load inputs: (4, 1): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 132f\n"
+ "ld1 { v12.h }[4], [x20]\n"
+ "b 132f\n"
+ "130:" // Tile loop: Oddments: Load inputs: (4, 1): Bit 2: Unset
+ "tbz %x[n_channels], #1, 131f\n"
+ "ldr s12, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 132f\n"
+ "ld1 { v12.h }[2], [x20]\n"
+ "b 132f\n"
+ "131:" // Tile loop: Oddments: Load inputs: (4, 1): Bit 2: Unset: Bit 1: Unset
+ "ldr h12, [x20, #0x0]\n"
+ "132:" // Tile loop: Oddments: Load inputs: (4, 1): Bit 2: End
+ "fmla v24.8h, v7.8h, v12.8h\n"
+ "fmla v25.8h, v6.8h, v12.8h\n"
+ "add x20, x9, x28\n"
+ "fmla v28.8h, v4.8h, v12.8h\n"
+ "fmla v29.8h, v3.8h, v12.8h\n"
+ "tbz %x[n_channels], #2, 134f\n"
+ "ldr d10, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 133f\n"
+ "ld1 { v10.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 136f\n"
+ "ld1 { v10.h }[6], [x20]\n"
+ "b 136f\n"
+ "133:" // Tile loop: Oddments: Load inputs: (4, 4): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 136f\n"
+ "ld1 { v10.h }[4], [x20]\n"
+ "b 136f\n"
+ "134:" // Tile loop: Oddments: Load inputs: (4, 4): Bit 2: Unset
+ "tbz %x[n_channels], #1, 135f\n"
+ "ldr s10, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 136f\n"
+ "ld1 { v10.h }[2], [x20]\n"
+ "b 136f\n"
+ "135:" // Tile loop: Oddments: Load inputs: (4, 4): Bit 2: Unset: Bit 1: Unset
+ "ldr h10, [x20, #0x0]\n"
+ "136:" // Tile loop: Oddments: Load inputs: (4, 4): Bit 2: End
+ "fmla v26.8h, v8.8h, v10.8h\n"
+ "fmla v27.8h, v7.8h, v10.8h\n"
+ "fmax v16.8h, v16.8h, v13.8h\n"
+ "fmla v30.8h, v5.8h, v10.8h\n"
+ "fmla v31.8h, v4.8h, v10.8h\n"
+ "fmax v17.8h, v17.8h, v13.8h\n"
+ "fmax v18.8h, v18.8h, v13.8h\n"
+ "fmax v19.8h, v19.8h, v13.8h\n"
+ "fmax v20.8h, v20.8h, v13.8h\n"
+ "fmax v21.8h, v21.8h, v13.8h\n"
+ "fmax v22.8h, v22.8h, v13.8h\n"
+ "fmax v23.8h, v23.8h, v13.8h\n"
+ "fmax v24.8h, v24.8h, v13.8h\n"
+ "fmax v25.8h, v25.8h, v13.8h\n"
+ "fmax v26.8h, v26.8h, v13.8h\n"
+ "fmax v27.8h, v27.8h, v13.8h\n"
+ "fmax v28.8h, v28.8h, v13.8h\n"
+ "fmax v29.8h, v29.8h, v13.8h\n"
+ "fmax v30.8h, v30.8h, v13.8h\n"
+ "fmax v31.8h, v31.8h, v13.8h\n"
+ "fmin v16.8h, v16.8h, v15.8h\n"
+ "fmin v17.8h, v17.8h, v15.8h\n"
+ "fmin v18.8h, v18.8h, v15.8h\n"
+ "fmin v19.8h, v19.8h, v15.8h\n"
+ "fmin v20.8h, v20.8h, v15.8h\n"
+ "fmin v21.8h, v21.8h, v15.8h\n"
+ "fmin v22.8h, v22.8h, v15.8h\n"
+ "fmin v23.8h, v23.8h, v15.8h\n"
+ "fmin v24.8h, v24.8h, v15.8h\n"
+ "fmin v25.8h, v25.8h, v15.8h\n"
+ "fmin v26.8h, v26.8h, v15.8h\n"
+ "fmin v27.8h, v27.8h, v15.8h\n"
+ "fmin v28.8h, v28.8h, v15.8h\n"
+ "fmin v29.8h, v29.8h, v15.8h\n"
+ "fmin v30.8h, v30.8h, v15.8h\n"
+ "fmin v31.8h, v31.8h, v15.8h\n"
+ "tbz %x[n_channels], #2, 138f\n"
+ "mov x23, x8\n"
+ "mov x22, x10\n"
+ "st1 { v16.d }[0], [x23], x5\n"
+ "mov x21, x27\n"
+ "mov x20, x24\n"
+ "st1 { v20.d }[0], [x22], x5\n"
+ "st1 { v24.d }[0], [x21], x5\n"
+ "add x8, x8, #0x8\n"
+ "add x10, x10, #0x8\n"
+ "st1 { v28.d }[0], [x20], x5\n"
+ "add x27, x27, #0x8\n"
+ "add x24, x24, #0x8\n"
+ "st1 { v17.d }[0], [x23], x5\n"
+ "st1 { v21.d }[0], [x22], x5\n"
+ "st1 { v25.d }[0], [x21], x5\n"
+ "st1 { v29.d }[0], [x20], x5\n"
+ "st1 { v18.d }[0], [x23], x5\n"
+ "st1 { v22.d }[0], [x22], x5\n"
+ "st1 { v26.d }[0], [x21], x5\n"
+ "st1 { v30.d }[0], [x20], x5\n"
+ "st1 { v19.d }[0], [x23]\n"
+ "st1 { v23.d }[0], [x22]\n"
+ "st1 { v27.d }[0], [x21]\n"
+ "st1 { v31.d }[0], [x20]\n"
+ "tbz %x[n_channels], #1, 137f\n"
+ "mov x23, x8\n"
+ "mov x22, x10\n"
+ "st1 { v16.s }[2], [x23], x5\n"
+ "mov x21, x27\n"
+ "mov x20, x24\n"
+ "st1 { v20.s }[2], [x22], x5\n"
+ "st1 { v24.s }[2], [x21], x5\n"
+ "add x8, x8, #0x4\n"
+ "add x10, x10, #0x4\n"
+ "st1 { v28.s }[2], [x20], x5\n"
+ "add x27, x27, #0x4\n"
+ "add x24, x24, #0x4\n"
+ "st1 { v17.s }[2], [x23], x5\n"
+ "st1 { v21.s }[2], [x22], x5\n"
+ "st1 { v25.s }[2], [x21], x5\n"
+ "st1 { v29.s }[2], [x20], x5\n"
+ "st1 { v18.s }[2], [x23], x5\n"
+ "st1 { v22.s }[2], [x22], x5\n"
+ "st1 { v26.s }[2], [x21], x5\n"
+ "st1 { v30.s }[2], [x20], x5\n"
+ "st1 { v19.s }[2], [x23]\n"
+ "st1 { v23.s }[2], [x22]\n"
+ "st1 { v27.s }[2], [x21]\n"
+ "st1 { v31.s }[2], [x20]\n"
+ "tbz %x[n_channels], #0, 140f\n"
+ "mov x23, x8\n"
+ "mov x22, x10\n"
+ "st1 { v16.h }[6], [x23], x5\n"
+ "mov x21, x27\n"
+ "mov x20, x24\n"
+ "st1 { v20.h }[6], [x22], x5\n"
+ "st1 { v24.h }[6], [x21], x5\n"
+ "st1 { v28.h }[6], [x20], x5\n"
+ "st1 { v17.h }[6], [x23], x5\n"
+ "st1 { v21.h }[6], [x22], x5\n"
+ "st1 { v25.h }[6], [x21], x5\n"
+ "st1 { v29.h }[6], [x20], x5\n"
+ "st1 { v18.h }[6], [x23], x5\n"
+ "st1 { v22.h }[6], [x22], x5\n"
+ "st1 { v26.h }[6], [x21], x5\n"
+ "st1 { v30.h }[6], [x20], x5\n"
+ "st1 { v19.h }[6], [x23]\n"
+ "st1 { v23.h }[6], [x22]\n"
+ "st1 { v27.h }[6], [x21]\n"
+ "st1 { v31.h }[6], [x20]\n"
+ "b 140f\n"
+ "137:" // Tile loop: Oddments: Store: Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 140f\n"
+ "mov x23, x8\n"
+ "mov x22, x10\n"
+ "st1 { v16.h }[4], [x23], x5\n"
+ "mov x21, x27\n"
+ "mov x20, x24\n"
+ "st1 { v20.h }[4], [x22], x5\n"
+ "st1 { v24.h }[4], [x21], x5\n"
+ "st1 { v28.h }[4], [x20], x5\n"
+ "st1 { v17.h }[4], [x23], x5\n"
+ "st1 { v21.h }[4], [x22], x5\n"
+ "st1 { v25.h }[4], [x21], x5\n"
+ "st1 { v29.h }[4], [x20], x5\n"
+ "st1 { v18.h }[4], [x23], x5\n"
+ "st1 { v22.h }[4], [x22], x5\n"
+ "st1 { v26.h }[4], [x21], x5\n"
+ "st1 { v30.h }[4], [x20], x5\n"
+ "st1 { v19.h }[4], [x23]\n"
+ "st1 { v23.h }[4], [x22]\n"
+ "st1 { v27.h }[4], [x21]\n"
+ "st1 { v31.h }[4], [x20]\n"
+ "b 140f\n"
+ "138:" // Tile loop: Oddments: Store: Bit 2: Unset
+ "tbz %x[n_channels], #1, 139f\n"
+ "mov x23, x8\n"
+ "mov x22, x10\n"
+ "st1 { v16.s }[0], [x23], x5\n"
+ "mov x21, x27\n"
+ "mov x20, x24\n"
+ "st1 { v20.s }[0], [x22], x5\n"
+ "st1 { v24.s }[0], [x21], x5\n"
+ "add x8, x8, #0x4\n"
+ "add x10, x10, #0x4\n"
+ "st1 { v28.s }[0], [x20], x5\n"
+ "add x27, x27, #0x4\n"
+ "add x24, x24, #0x4\n"
+ "st1 { v17.s }[0], [x23], x5\n"
+ "st1 { v21.s }[0], [x22], x5\n"
+ "st1 { v25.s }[0], [x21], x5\n"
+ "st1 { v29.s }[0], [x20], x5\n"
+ "st1 { v18.s }[0], [x23], x5\n"
+ "st1 { v22.s }[0], [x22], x5\n"
+ "st1 { v26.s }[0], [x21], x5\n"
+ "st1 { v30.s }[0], [x20], x5\n"
+ "st1 { v19.s }[0], [x23]\n"
+ "st1 { v23.s }[0], [x22]\n"
+ "st1 { v27.s }[0], [x21]\n"
+ "st1 { v31.s }[0], [x20]\n"
+ "tbz %x[n_channels], #0, 140f\n"
+ "mov x23, x8\n"
+ "mov x22, x10\n"
+ "st1 { v16.h }[2], [x23], x5\n"
+ "mov x21, x27\n"
+ "mov x20, x24\n"
+ "st1 { v20.h }[2], [x22], x5\n"
+ "st1 { v24.h }[2], [x21], x5\n"
+ "st1 { v28.h }[2], [x20], x5\n"
+ "st1 { v17.h }[2], [x23], x5\n"
+ "st1 { v21.h }[2], [x22], x5\n"
+ "st1 { v25.h }[2], [x21], x5\n"
+ "st1 { v29.h }[2], [x20], x5\n"
+ "st1 { v18.h }[2], [x23], x5\n"
+ "st1 { v22.h }[2], [x22], x5\n"
+ "st1 { v26.h }[2], [x21], x5\n"
+ "st1 { v30.h }[2], [x20], x5\n"
+ "st1 { v19.h }[2], [x23]\n"
+ "st1 { v23.h }[2], [x22]\n"
+ "st1 { v27.h }[2], [x21]\n"
+ "st1 { v31.h }[2], [x20]\n"
+ "b 140f\n"
+ "139:" // Tile loop: Oddments: Store: Bit 2: Unset: Bit 1: Unset
+ "mov x23, x8\n"
+ "mov x22, x10\n"
+ "st1 { v16.h }[0], [x23], x5\n"
+ "mov x21, x27\n"
+ "mov x20, x24\n"
+ "st1 { v20.h }[0], [x22], x5\n"
+ "st1 { v24.h }[0], [x21], x5\n"
+ "st1 { v28.h }[0], [x20], x5\n"
+ "st1 { v17.h }[0], [x23], x5\n"
+ "st1 { v21.h }[0], [x22], x5\n"
+ "st1 { v25.h }[0], [x21], x5\n"
+ "st1 { v29.h }[0], [x20], x5\n"
+ "st1 { v18.h }[0], [x23], x5\n"
+ "st1 { v22.h }[0], [x22], x5\n"
+ "st1 { v26.h }[0], [x21], x5\n"
+ "st1 { v30.h }[0], [x20], x5\n"
+ "st1 { v19.h }[0], [x23]\n"
+ "st1 { v23.h }[0], [x22]\n"
+ "st1 { v27.h }[0], [x21]\n"
+ "st1 { v31.h }[0], [x20]\n"
+ "140:" // Tile loop: Oddments: Store: Bit 2: End
+ "141:" // Tile loop: End
+ "ldr x26, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "ldr x27, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "add x26, x26, #0x1\n"
+ "add x21, x27, #0x1\n"
+ "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+ "cmp x26, x20\n"
+ "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
+ "csel x27, x27, x21, LT\n"
+ "csel x26, x26, XZR, LT\n"
+ "cmp x27, x20\n"
+ "blt 1b\n"
+ :
+ : [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp
new file mode 100644
index 0000000000..96feeeeece
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp
@@ -0,0 +1,2007 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__aarch64__) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
+ const __fp16 *const *const input_ptrs,
+ __fp16 *const *const outptrs,
+ const void *params,
+ unsigned int n_channels,
+ const __fp16 activation_min,
+ const __fp16 activation_max
+)
+{
+ struct Args
+ {
+ __fp16 *const *outptrs;
+ const void *params;
+ const __fp16 min, max;
+ const __fp16 *inptrs[36];
+
+ Args(
+ const __fp16 *const *const input_ptrs,
+ __fp16 *const *const outptrs,
+ const void *const params,
+ const __fp16 min,
+ const __fp16 max
+ ) : outptrs(outptrs), params(params), min(min), max(max)
+ {
+ inptrs[0] = input_ptrs[14];
+ inptrs[1] = input_ptrs[0];
+ inptrs[2] = input_ptrs[5];
+ inptrs[3] = input_ptrs[15];
+ inptrs[4] = input_ptrs[30];
+ inptrs[5] = input_ptrs[35];
+ inptrs[6] = input_ptrs[20];
+ inptrs[7] = input_ptrs[1];
+ inptrs[8] = input_ptrs[4];
+ inptrs[9] = input_ptrs[21];
+ inptrs[10] = input_ptrs[6];
+ inptrs[11] = input_ptrs[11];
+ inptrs[12] = input_ptrs[24];
+ inptrs[13] = input_ptrs[8];
+ inptrs[14] = input_ptrs[29];
+ inptrs[15] = input_ptrs[9];
+ inptrs[16] = input_ptrs[31];
+ inptrs[17] = input_ptrs[13];
+ inptrs[18] = input_ptrs[34];
+ inptrs[19] = input_ptrs[16];
+ inptrs[20] = input_ptrs[2];
+ inptrs[21] = input_ptrs[19];
+ inptrs[22] = input_ptrs[3];
+ inptrs[23] = input_ptrs[12];
+ inptrs[24] = input_ptrs[22];
+ inptrs[25] = input_ptrs[17];
+ inptrs[26] = input_ptrs[18];
+ inptrs[27] = input_ptrs[26];
+ inptrs[28] = input_ptrs[23];
+ inptrs[29] = input_ptrs[32];
+ inptrs[30] = input_ptrs[27];
+ inptrs[31] = input_ptrs[33];
+ inptrs[32] = input_ptrs[7];
+ inptrs[33] = input_ptrs[10];
+ inptrs[34] = input_ptrs[25];
+ inptrs[35] = input_ptrs[28];
+
+ }
+ };
+
+ Args params_struct(input_ptrs, outptrs, params,
+ activation_min, activation_max);
+
+ __asm__ __volatile__(
+ "mov x6, #0x10\n" // cntb _, ALL, #1
+ "lsr x7, %x[n_channels], #0x3\n"
+ "ldr x8, [%x[params_struct], %[offsetof_args_outptrs]]\n"
+ "ldr x17, [%x[params_struct], %[offsetof_args_params]]\n"
+ "add x20, %x[params_struct], %[offsetof_args_min]\n"
+ "ld1r { v13.8h }, [x20]\n"
+ "add x20, %x[params_struct], %[offsetof_args_max]\n"
+ "ld1r { v14.8h }, [x20]\n"
+ "add x16, %x[params_struct], %[offsetof_Args_inptrs]\n"
+ "mov x15, #0x0\n"
+ "sub x14, XZR, x6\n"
+ "cbz x7, 3f\n"
+ "ldr q30, [x17, #0x0]\n"
+ "ldr q0, [x17, #0x10]\n"
+ "cmp x6, x7, LSL #4\n"
+ "ldr q1, [x17, #0x20]\n"
+ "ldr q2, [x17, #0x30]\n"
+ "ldr q3, [x17, #0x40]\n"
+ "ldr q4, [x17, #0x50]\n"
+ "ldr q5, [x17, #0x60]\n"
+ "ldr q6, [x17, #0x70]\n"
+ "ldr q7, [x17, #0x80]\n"
+ "ldr q8, [x17, #0x90]\n"
+ "add x17, x17, #0xa0\n"
+ "ldp x21, x20, [x16, #0x0]\n"
+ "ldr q9, [x21, x15]\n"
+ "ldr q10, [x20, x15]\n"
+ "ldp x21, x20, [x16, #0x10]\n"
+ "ldr q11, [x21, x15]\n"
+ "ldr q12, [x20, x15]\n"
+ "bge 2f\n"
+ "1:" // Channel loop
+ "mov v23.16b, v30.16b\n fmla v23.8h, v4.8h, v9.8h\n"
+ "mov v17.16b, v30.16b\n fmla v17.8h, v8.8h, v9.8h\n"
+ "ldr x27, [x16, #0x20]\n"
+ "ldr x24, [x16, #0x30]\n"
+ "mov v25.16b, v30.16b\n fmla v25.8h, v3.8h, v9.8h\n"
+ "mov v28.16b, v30.16b\n fmla v28.8h, v1.8h, v9.8h\n"
+ "ldr x23, [x16, #0x28]\n"
+ "ldr x22, [x16, #0x38]\n"
+ "mov v20.16b, v30.16b\n fmla v20.8h, v0.8h, v9.8h\n"
+ "mov v16.16b, v30.16b\n fmla v16.8h, v7.8h, v9.8h\n"
+ "ldr x26, [x16, #0x40]\n"
+ "ldr x20, [x16, #0x48]\n"
+ "mov v15.16b, v30.16b\n fmla v15.8h, v6.8h, v9.8h\n"
+ "fmla v23.8h, v5.8h, v12.8h\n"
+ "ldr x25, [x16, #0x50]\n"
+ "ldr x21, [x16, #0x58]\n"
+ "mov v27.16b, v30.16b\n fmla v27.8h, v5.8h, v9.8h\n"
+ "mov v31.16b, v30.16b\n fmla v31.8h, v2.8h, v9.8h\n"
+ "ldr q9, [x24, x15]\n"
+ "ldr x13, [x16, #0x70]\n"
+ "fmla v17.8h, v0.8h, v10.8h\n"
+ "ldr q22, [x27, x15]\n"
+ "mov v10.16b, v30.16b\n fmla v10.8h, v2.8h, v11.8h\n"
+ "ldr q18, [x23, x15]\n"
+ "fmla v25.8h, v4.8h, v12.8h\n"
+ "fmla v28.8h, v2.8h, v12.8h\n"
+ "ldr x24, [x16, #0x60]\n"
+ "ldr x23, [x16, #0x68]\n"
+ "fmla v20.8h, v1.8h, v12.8h\n"
+ "fmla v16.8h, v8.8h, v12.8h\n"
+ "ldr x12, [x8, #0x0]\n"
+ "ldr x11, [x8, #0x8]\n"
+ "fmla v15.8h, v7.8h, v12.8h\n"
+ "mov v29.16b, v30.16b\n fmla v29.8h, v6.8h, v22.8h\n"
+ "ldr q22, [x20, x15]\n"
+ "ldr x28, [x16, #0x88]\n"
+ "fmla v23.8h, v7.8h, v9.8h\n"
+ "fmla v10.8h, v6.8h, v12.8h\n"
+ "ldr x10, [x8, #0x10]\n"
+ "ldr x9, [x8, #0x18]\n"
+ "mov v21.16b, v30.16b\n fmla v21.8h, v3.8h, v12.8h\n"
+ "mov v19.16b, v30.16b\n fmla v19.8h, v0.8h, v12.8h\n"
+ "ldr q11, [x22, x15]\n"
+ "ldr x22, [x16, #0x78]\n"
+ "mov v24.16b, v30.16b\n fmla v24.8h, v8.8h, v18.8h\n"
+ "ldr q12, [x26, x15]\n"
+ "fmla v25.8h, v6.8h, v9.8h\n"
+ "ldr x20, [x16, #0x80]\n"
+ "fmla v28.8h, v4.8h, v9.8h\n"
+ "fmla v20.8h, v3.8h, v9.8h\n"
+ "add x14, x14, #0x10\n"
+ "mov v26.16b, v30.16b\n fmla v26.8h, v1.8h, v9.8h\n"
+ "mov v18.16b, v30.16b\n fmla v18.8h, v0.8h, v9.8h\n"
+ "ldr q30, [x17, #0x0]\n"
+ "fmla v27.8h, v8.8h, v9.8h\n"
+ "fmla v31.8h, v5.8h, v9.8h\n"
+ "fmla v29.8h, v2.8h, v9.8h\n"
+ "ldr q9, [x25, x15]\n"
+ "fmla v17.8h, v1.8h, v11.8h\n"
+ "ldr x27, [x16, #0x90]\n"
+ "fmla v16.8h, v0.8h, v11.8h\n"
+ "ldr q11, [x21, x15]\n"
+ "fmla v15.8h, v2.8h, v12.8h\n"
+ "ldr x21, [x16, #0x98]\n"
+ "fmla v23.8h, v8.8h, v22.8h\n"
+ "fmla v10.8h, v1.8h, v12.8h\n"
+ "ldr q12, [x24, x15]\n"
+ "ldr x26, [x16, #0xa0]\n"
+ "fmla v25.8h, v7.8h, v22.8h\n"
+ "fmla v21.8h, v6.8h, v22.8h\n"
+ "fmla v28.8h, v5.8h, v22.8h\n"
+ "fmla v20.8h, v4.8h, v22.8h\n"
+ "fmla v19.8h, v3.8h, v22.8h\n"
+ "fmla v26.8h, v2.8h, v22.8h\n"
+ "fmla v18.8h, v1.8h, v22.8h\n"
+ "fmla v24.8h, v0.8h, v22.8h\n"
+ "ldr q22, [x23, x15]\n"
+ "ldr x25, [x16, #0xa8]\n"
+ "fmla v17.8h, v3.8h, v9.8h\n"
+ "fmla v27.8h, v0.8h, v9.8h\n"
+ "fmla v31.8h, v6.8h, v12.8h\n"
+ "fmla v29.8h, v3.8h, v12.8h\n"
+ "ldr q9, [x13, x15]\n"
+ "ldr x24, [x16, #0xb0]\n"
+ "fmla v16.8h, v4.8h, v22.8h\n"
+ "fmla v15.8h, v3.8h, v22.8h\n"
+ "fmla v23.8h, v1.8h, v22.8h\n"
+ "fmla v10.8h, v5.8h, v11.8h\n"
+ "fmla v21.8h, v2.8h, v11.8h\n"
+ "ldr q12, [x22, x15]\n"
+ "fmla v25.8h, v0.8h, v22.8h\n"
+ "ldr x23, [x16, #0xb8]\n"
+ "fmla v19.8h, v8.8h, v9.8h\n"
+ "fmla v24.8h, v5.8h, v9.8h\n"
+ "ldr q11, [x20, x15]\n"
+ "ldr x22, [x16, #0xc0]\n"
+ "fmla v17.8h, v5.8h, v22.8h\n"
+ "fmla v27.8h, v2.8h, v22.8h\n"
+ "ldr q22, [x28, x15]\n"
+ "ldr x20, [x16, #0xc8]\n"
+ "fmla v16.8h, v5.8h, v12.8h\n"
+ "fmla v15.8h, v4.8h, v12.8h\n"
+ "fmla v23.8h, v2.8h, v12.8h\n"
+ "fmla v10.8h, v3.8h, v12.8h\n"
+ "fmla v25.8h, v1.8h, v12.8h\n"
+ "fmla v21.8h, v0.8h, v12.8h\n"
+ "ldr q9, [x21, x15]\n"
+ "ldr x28, [x16, #0xd8]\n"
+ "fmla v29.8h, v7.8h, v11.8h\n"
+ "fmla v26.8h, v6.8h, v11.8h\n"
+ "ldr q12, [x27, x15]\n"
+ "ldr x21, [x16, #0xd0]\n"
+ "fmla v17.8h, v7.8h, v22.8h\n"
+ "fmla v16.8h, v6.8h, v22.8h\n"
+ "fmla v27.8h, v4.8h, v22.8h\n"
+ "fmla v23.8h, v3.8h, v22.8h\n"
+ "fmla v31.8h, v1.8h, v22.8h\n"
+ "fmla v28.8h, v0.8h, v22.8h\n"
+ "ldr q11, [x26, x15]\n"
+ "ldr x27, [x16, #0xe0]\n"
+ "fmla v15.8h, v8.8h, v9.8h\n"
+ "fmla v18.8h, v8.8h, v12.8h\n"
+ "fmla v24.8h, v7.8h, v12.8h\n"
+ "ldr q12, [x25, x15]\n"
+ "fmla v19.8h, v1.8h, v9.8h\n"
+ "ldr x26, [x16, #0xe8]\n"
+ "fmla v10.8h, v7.8h, v9.8h\n"
+ "fmla v25.8h, v5.8h, v9.8h\n"
+ "fmla v21.8h, v4.8h, v9.8h\n"
+ "fmla v20.8h, v2.8h, v9.8h\n"
+ "ldr q9, [x24, x15]\n"
+ "ldr x24, [x16, #0xf0]\n"
+ "fmla v17.8h, v2.8h, v11.8h\n"
+ "fmla v16.8h, v1.8h, v11.8h\n"
+ "fmla v15.8h, v0.8h, v11.8h\n"
+ "ldr q22, [x23, x15]\n"
+ "fmla v27.8h, v7.8h, v12.8h\n"
+ "ldr x25, [x16, #0xf8]\n"
+ "fmla v23.8h, v6.8h, v12.8h\n"
+ "fmla v31.8h, v4.8h, v12.8h\n"
+ "fmla v28.8h, v3.8h, v12.8h\n"
+ "fmla v29.8h, v1.8h, v12.8h\n"
+ "fmla v26.8h, v0.8h, v12.8h\n"
+ "ldr q11, [x22, x15]\n"
+ "fmla v19.8h, v4.8h, v11.8h\n"
+ "ldr x23, [x16, #0x100]\n"
+ "fmla v18.8h, v2.8h, v11.8h\n"
+ "fmla v16.8h, v2.8h, v9.8h\n"
+ "fmla v15.8h, v1.8h, v9.8h\n"
+ "fmla v10.8h, v0.8h, v9.8h\n"
+ "ldr q9, [x20, x15]\n"
+ "ldr x20, [x16, #0x108]\n"
+ "fmla v17.8h, v6.8h, v22.8h\n"
+ "fmla v27.8h, v3.8h, v22.8h\n"
+ "fmla v31.8h, v0.8h, v22.8h\n"
+ "ldr q22, [x21, x15]\n"
+ "fmla v25.8h, v8.8h, v11.8h\n"
+ "ldr x22, [x16, #0x110]\n"
+ "fmla v21.8h, v7.8h, v11.8h\n"
+ "fmla v20.8h, v5.8h, v11.8h\n"
+ "fmla v24.8h, v1.8h, v11.8h\n"
+ "ldr q12, [x28, x15]\n"
+ "fmla v19.8h, v2.8h, v9.8h\n"
+ "ldr x21, [x16, #0x118]\n"
+ "fmla v29.8h, v0.8h, v22.8h\n"
+ "fmla v26.8h, v4.8h, v12.8h\n"
+ "fmla v18.8h, v3.8h, v12.8h\n"
+ "fmla v10.8h, v8.8h, v9.8h\n"
+ "fmla v21.8h, v5.8h, v9.8h\n"
+ "ldr q11, [x27, x15]\n"
+ "fmla v27.8h, v6.8h, v22.8h\n"
+ "fmla v31.8h, v3.8h, v22.8h\n"
+ "ldr q22, [x26, x15]\n"
+ "fmla v28.8h, v7.8h, v12.8h\n"
+ "fmla v20.8h, v6.8h, v12.8h\n"
+ "fmla v29.8h, v5.8h, v12.8h\n"
+ "fmla v19.8h, v5.8h, v11.8h\n"
+ "fmla v24.8h, v2.8h, v11.8h\n"
+ "fmla v26.8h, v7.8h, v22.8h\n"
+ "fmla v18.8h, v6.8h, v22.8h\n"
+ "fmla v31.8h, v8.8h, v12.8h\n"
+ "ldr q12, [x24, x15]\n"
+ "fmla v29.8h, v8.8h, v22.8h\n"
+ "ldr q22, [x23, x15]\n"
+ "fmla v28.8h, v8.8h, v12.8h\n"
+ "fmla v20.8h, v7.8h, v12.8h\n"
+ "fmla v19.8h, v6.8h, v12.8h\n"
+ "fmla v26.8h, v5.8h, v12.8h\n"
+ "fmla v18.8h, v4.8h, v12.8h\n"
+ "fmla v24.8h, v3.8h, v12.8h\n"
+ "ldr q12, [x20, x15]\n"
+ "ldp x20, x24, [x16, #0x0]\n"
+ "ldr q9, [x20, x6]\n"
+ "fmla v21.8h, v8.8h, v11.8h\n"
+ "ldr q11, [x25, x15]\n"
+ "fmla v17.8h, v4.8h, v22.8h\n"
+ "fmla v16.8h, v3.8h, v22.8h\n"
+ "fmla v15.8h, v5.8h, v12.8h\n"
+ "fmax v17.8h, v17.8h, v13.8h\n"
+ "fmla v10.8h, v4.8h, v12.8h\n"
+ "fmla v26.8h, v8.8h, v11.8h\n"
+ "fmax v16.8h, v16.8h, v13.8h\n"
+ "fmla v18.8h, v7.8h, v11.8h\n"
+ "fmla v24.8h, v6.8h, v11.8h\n"
+ "ldr q11, [x22, x15]\n"
+ "fmax v15.8h, v15.8h, v13.8h\n"
+ "fmla v27.8h, v1.8h, v22.8h\n"
+ "fmla v23.8h, v0.8h, v22.8h\n"
+ "ldr q22, [x21, x15]\n"
+ "ldr q0, [x17, #0x10]\n"
+ "fmla v25.8h, v2.8h, v12.8h\n"
+ "ldr q2, [x17, #0x30]\n"
+ "fmla v21.8h, v1.8h, v12.8h\n"
+ "ldr q1, [x17, #0x20]\n"
+ "fmax v10.8h, v10.8h, v13.8h\n"
+ "fmla v31.8h, v7.8h, v11.8h\n"
+ "fmla v28.8h, v6.8h, v11.8h\n"
+ "ldr q6, [x17, #0x70]\n"
+ "fmla v20.8h, v8.8h, v22.8h\n"
+ "ldr q8, [x17, #0x90]\n"
+ "fmla v19.8h, v7.8h, v22.8h\n"
+ "ldr q7, [x17, #0x80]\n"
+ "fmin v17.8h, v17.8h, v14.8h\n"
+ "fmin v16.8h, v16.8h, v14.8h\n"
+ "str q17, [x12, x14]\n"
+ "ldr x23, [x8, #0x20]\n"
+ "fmin v15.8h, v15.8h, v14.8h\n"
+ "fmin v10.8h, v10.8h, v14.8h\n"
+ "str q16, [x11, x14]\n"
+ "ldr x22, [x8, #0x28]\n"
+ "fmax v27.8h, v27.8h, v13.8h\n"
+ "fmax v23.8h, v23.8h, v13.8h\n"
+ "str q15, [x10, x14]\n"
+ "ldr x21, [x8, #0x30]\n"
+ "fmax v25.8h, v25.8h, v13.8h\n"
+ "fmax v21.8h, v21.8h, v13.8h\n"
+ "str q10, [x9, x14]\n"
+ "ldr x20, [x8, #0x38]\n"
+ "fmla v29.8h, v4.8h, v11.8h\n"
+ "fmla v26.8h, v3.8h, v11.8h\n"
+ "ldr q3, [x17, #0x40]\n"
+ "fmin v27.8h, v27.8h, v14.8h\n"
+ "fmla v18.8h, v5.8h, v22.8h\n"
+ "ldr q5, [x17, #0x60]\n"
+ "fmla v24.8h, v4.8h, v22.8h\n"
+ "ldr q10, [x24, x6]\n"
+ "ldr q4, [x17, #0x50]\n"
+ "fmin v23.8h, v23.8h, v14.8h\n"
+ "fmin v25.8h, v25.8h, v14.8h\n"
+ "str q27, [x23, x14]\n"
+ "fmin v21.8h, v21.8h, v14.8h\n"
+ "fmax v31.8h, v31.8h, v13.8h\n"
+ "str q23, [x22, x14]\n"
+ "ldr x25, [x8, #0x40]\n"
+ "fmax v28.8h, v28.8h, v13.8h\n"
+ "fmax v20.8h, v20.8h, v13.8h\n"
+ "str q25, [x21, x14]\n"
+ "ldr x23, [x8, #0x48]\n"
+ "fmax v19.8h, v19.8h, v13.8h\n"
+ "str q21, [x20, x14]\n"
+ "ldr x22, [x8, #0x50]\n"
+ "ldr x24, [x8, #0x58]\n"
+ "ldp x21, x20, [x16, #0x10]\n"
+ "ldr q11, [x21, x6]\n"
+ "fmin v31.8h, v31.8h, v14.8h\n"
+ "fmin v28.8h, v28.8h, v14.8h\n"
+ "ldr q12, [x20, x6]\n"
+ "fmin v20.8h, v20.8h, v14.8h\n"
+ "fmin v19.8h, v19.8h, v14.8h\n"
+ "str q31, [x25, x14]\n"
+ "fmax v29.8h, v29.8h, v13.8h\n"
+ "fmax v26.8h, v26.8h, v13.8h\n"
+ "str q28, [x23, x14]\n"
+ "ldr x23, [x8, #0x60]\n"
+ "fmax v18.8h, v18.8h, v13.8h\n"
+ "fmax v24.8h, v24.8h, v13.8h\n"
+ "str q20, [x22, x14]\n"
+ "ldr x22, [x8, #0x68]\n"
+ "str q19, [x24, x14]\n"
+ "ldr x21, [x8, #0x70]\n"
+ "ldr x20, [x8, #0x78]\n"
+ "add x6, x6, #0x10\n"
+ "cmp x6, x7, LSL #4\n"
+ "fmin v29.8h, v29.8h, v14.8h\n"
+ "fmin v26.8h, v26.8h, v14.8h\n"
+ "add x15, x15, #0x10\n"
+ "fmin v18.8h, v18.8h, v14.8h\n"
+ "fmin v24.8h, v24.8h, v14.8h\n"
+ "str q29, [x23, x14]\n"
+ "add x17, x17, #0xa0\n"
+ "str q26, [x22, x14]\n"
+ "str q18, [x21, x14]\n"
+ "str q24, [x20, x14]\n"
+ "blt 1b\n"
+ "2:" // Channel tail
+ "mov v31.16b, v30.16b\n fmla v31.8h, v4.8h, v9.8h\n"
+ "mov v17.16b, v30.16b\n fmla v17.8h, v8.8h, v9.8h\n"
+ "ldr x27, [x16, #0x20]\n"
+ "ldr x24, [x16, #0x30]\n"
+ "mov v15.16b, v30.16b\n fmla v15.8h, v3.8h, v9.8h\n"
+ "mov v29.16b, v30.16b\n fmla v29.8h, v1.8h, v9.8h\n"
+ "ldr x23, [x16, #0x28]\n"
+ "ldr x22, [x16, #0x38]\n"
+ "mov v19.16b, v30.16b\n fmla v19.8h, v0.8h, v9.8h\n"
+ "mov v20.16b, v30.16b\n fmla v20.8h, v7.8h, v9.8h\n"
+ "ldr x26, [x16, #0x40]\n"
+ "ldr x21, [x16, #0x48]\n"
+ "mov v21.16b, v30.16b\n fmla v21.8h, v6.8h, v9.8h\n"
+ "fmla v31.8h, v5.8h, v12.8h\n"
+ "ldr x25, [x16, #0x50]\n"
+ "ldr x20, [x16, #0x58]\n"
+ "mov v18.16b, v30.16b\n fmla v18.8h, v5.8h, v9.8h\n"
+ "mov v27.16b, v30.16b\n fmla v27.8h, v2.8h, v9.8h\n"
+ "ldr q24, [x24, x15]\n"
+ "ldr x13, [x16, #0x70]\n"
+ "fmla v17.8h, v0.8h, v10.8h\n"
+ "ldr q22, [x27, x15]\n"
+ "mov v28.16b, v30.16b\n fmla v28.8h, v2.8h, v11.8h\n"
+ "ldr q16, [x23, x15]\n"
+ "fmla v15.8h, v4.8h, v12.8h\n"
+ "fmla v29.8h, v2.8h, v12.8h\n"
+ "ldr x24, [x16, #0x60]\n"
+ "ldr x23, [x16, #0x68]\n"
+ "fmla v19.8h, v1.8h, v12.8h\n"
+ "fmla v20.8h, v8.8h, v12.8h\n"
+ "ldr x12, [x8, #0x0]\n"
+ "ldr x11, [x8, #0x8]\n"
+ "fmla v21.8h, v7.8h, v12.8h\n"
+ "mov v10.16b, v30.16b\n fmla v10.8h, v6.8h, v22.8h\n"
+ "ldr q22, [x21, x15]\n"
+ "ldr x28, [x16, #0x88]\n"
+ "fmla v31.8h, v7.8h, v24.8h\n"
+ "fmla v28.8h, v6.8h, v12.8h\n"
+ "ldr x10, [x8, #0x10]\n"
+ "ldr x9, [x8, #0x18]\n"
+ "mov v9.16b, v30.16b\n fmla v9.8h, v3.8h, v12.8h\n"
+ "mov v11.16b, v30.16b\n fmla v11.8h, v0.8h, v12.8h\n"
+ "ldr q23, [x22, x15]\n"
+ "ldr x22, [x16, #0x78]\n"
+ "mov v12.16b, v30.16b\n fmla v12.8h, v8.8h, v16.8h\n"
+ "ldr q16, [x26, x15]\n"
+ "fmla v15.8h, v6.8h, v24.8h\n"
+ "ldr x21, [x16, #0x80]\n"
+ "fmla v29.8h, v4.8h, v24.8h\n"
+ "fmla v19.8h, v3.8h, v24.8h\n"
+ "add x14, x14, #0x10\n"
+ "mov v26.16b, v30.16b\n fmla v26.8h, v1.8h, v24.8h\n"
+ "mov v25.16b, v30.16b\n fmla v25.8h, v0.8h, v24.8h\n"
+ "fmla v18.8h, v8.8h, v24.8h\n"
+ "fmla v27.8h, v5.8h, v24.8h\n"
+ "fmla v10.8h, v2.8h, v24.8h\n"
+ "ldr q24, [x25, x15]\n"
+ "fmla v17.8h, v1.8h, v23.8h\n"
+ "ldr x27, [x16, #0x90]\n"
+ "fmla v20.8h, v0.8h, v23.8h\n"
+ "ldr q23, [x20, x15]\n"
+ "fmla v21.8h, v2.8h, v16.8h\n"
+ "ldr x20, [x16, #0x98]\n"
+ "fmla v31.8h, v8.8h, v22.8h\n"
+ "fmla v28.8h, v1.8h, v16.8h\n"
+ "ldr q16, [x24, x15]\n"
+ "ldr x26, [x16, #0xa0]\n"
+ "fmla v15.8h, v7.8h, v22.8h\n"
+ "fmla v9.8h, v6.8h, v22.8h\n"
+ "fmla v29.8h, v5.8h, v22.8h\n"
+ "fmla v19.8h, v4.8h, v22.8h\n"
+ "fmla v11.8h, v3.8h, v22.8h\n"
+ "fmla v26.8h, v2.8h, v22.8h\n"
+ "fmla v25.8h, v1.8h, v22.8h\n"
+ "fmla v12.8h, v0.8h, v22.8h\n"
+ "ldr q22, [x23, x15]\n"
+ "ldr x25, [x16, #0xa8]\n"
+ "fmla v17.8h, v3.8h, v24.8h\n"
+ "fmla v18.8h, v0.8h, v24.8h\n"
+ "fmla v27.8h, v6.8h, v16.8h\n"
+ "fmla v10.8h, v3.8h, v16.8h\n"
+ "ldr q16, [x13, x15]\n"
+ "ldr x24, [x16, #0xb0]\n"
+ "fmla v20.8h, v4.8h, v22.8h\n"
+ "fmla v21.8h, v3.8h, v22.8h\n"
+ "fmla v31.8h, v1.8h, v22.8h\n"
+ "fmla v28.8h, v5.8h, v23.8h\n"
+ "fmla v9.8h, v2.8h, v23.8h\n"
+ "ldr q23, [x22, x15]\n"
+ "fmla v15.8h, v0.8h, v22.8h\n"
+ "ldr x23, [x16, #0xb8]\n"
+ "fmla v11.8h, v8.8h, v16.8h\n"
+ "fmla v12.8h, v5.8h, v16.8h\n"
+ "ldr q16, [x21, x15]\n"
+ "ldr x22, [x16, #0xc0]\n"
+ "fmla v17.8h, v5.8h, v22.8h\n"
+ "fmla v18.8h, v2.8h, v22.8h\n"
+ "ldr q22, [x28, x15]\n"
+ "ldr x21, [x16, #0xc8]\n"
+ "fmla v20.8h, v5.8h, v23.8h\n"
+ "fmla v21.8h, v4.8h, v23.8h\n"
+ "fmla v31.8h, v2.8h, v23.8h\n"
+ "fmla v28.8h, v3.8h, v23.8h\n"
+ "fmla v15.8h, v1.8h, v23.8h\n"
+ "fmla v9.8h, v0.8h, v23.8h\n"
+ "ldr q23, [x20, x15]\n"
+ "ldr x28, [x16, #0xd8]\n"
+ "fmla v10.8h, v7.8h, v16.8h\n"
+ "fmla v26.8h, v6.8h, v16.8h\n"
+ "ldr q16, [x27, x15]\n"
+ "ldr x20, [x16, #0xd0]\n"
+ "fmla v17.8h, v7.8h, v22.8h\n"
+ "fmla v20.8h, v6.8h, v22.8h\n"
+ "fmla v18.8h, v4.8h, v22.8h\n"
+ "fmla v31.8h, v3.8h, v22.8h\n"
+ "fmla v27.8h, v1.8h, v22.8h\n"
+ "fmla v29.8h, v0.8h, v22.8h\n"
+ "ldr q22, [x26, x15]\n"
+ "ldr x27, [x16, #0xe0]\n"
+ "fmla v21.8h, v8.8h, v23.8h\n"
+ "fmla v25.8h, v8.8h, v16.8h\n"
+ "fmla v12.8h, v7.8h, v16.8h\n"
+ "ldr q16, [x25, x15]\n"
+ "fmla v11.8h, v1.8h, v23.8h\n"
+ "ldr x26, [x16, #0xe8]\n"
+ "fmla v28.8h, v7.8h, v23.8h\n"
+ "fmla v15.8h, v5.8h, v23.8h\n"
+ "fmla v9.8h, v4.8h, v23.8h\n"
+ "fmla v19.8h, v2.8h, v23.8h\n"
+ "ldr q23, [x24, x15]\n"
+ "ldr x25, [x16, #0xf0]\n"
+ "fmla v17.8h, v2.8h, v22.8h\n"
+ "fmla v20.8h, v1.8h, v22.8h\n"
+ "fmla v21.8h, v0.8h, v22.8h\n"
+ "ldr q22, [x23, x15]\n"
+ "fmla v18.8h, v7.8h, v16.8h\n"
+ "ldr x24, [x16, #0xf8]\n"
+ "fmla v31.8h, v6.8h, v16.8h\n"
+ "fmla v27.8h, v4.8h, v16.8h\n"
+ "fmla v29.8h, v3.8h, v16.8h\n"
+ "fmla v10.8h, v1.8h, v16.8h\n"
+ "fmla v26.8h, v0.8h, v16.8h\n"
+ "ldr q16, [x22, x15]\n"
+ "fmla v11.8h, v4.8h, v16.8h\n"
+ "ldr x23, [x16, #0x100]\n"
+ "fmla v25.8h, v2.8h, v16.8h\n"
+ "fmla v20.8h, v2.8h, v23.8h\n"
+ "fmla v21.8h, v1.8h, v23.8h\n"
+ "fmla v28.8h, v0.8h, v23.8h\n"
+ "ldr q23, [x21, x15]\n"
+ "ldr x22, [x16, #0x108]\n"
+ "fmla v17.8h, v6.8h, v22.8h\n"
+ "fmla v18.8h, v3.8h, v22.8h\n"
+ "fmla v27.8h, v0.8h, v22.8h\n"
+ "ldr q22, [x20, x15]\n"
+ "fmla v15.8h, v8.8h, v16.8h\n"
+ "ldr x21, [x16, #0x110]\n"
+ "fmla v9.8h, v7.8h, v16.8h\n"
+ "fmla v19.8h, v5.8h, v16.8h\n"
+ "fmla v12.8h, v1.8h, v16.8h\n"
+ "ldr q16, [x28, x15]\n"
+ "fmla v11.8h, v2.8h, v23.8h\n"
+ "ldr x20, [x16, #0x118]\n"
+ "fmla v10.8h, v0.8h, v22.8h\n"
+ "fmla v26.8h, v4.8h, v16.8h\n"
+ "fmla v25.8h, v3.8h, v16.8h\n"
+ "fmla v28.8h, v8.8h, v23.8h\n"
+ "fmla v9.8h, v5.8h, v23.8h\n"
+ "ldr q23, [x27, x15]\n"
+ "fmla v18.8h, v6.8h, v22.8h\n"
+ "fmla v27.8h, v3.8h, v22.8h\n"
+ "ldr q22, [x26, x15]\n"
+ "fmla v29.8h, v7.8h, v16.8h\n"
+ "fmla v19.8h, v6.8h, v16.8h\n"
+ "fmla v10.8h, v5.8h, v16.8h\n"
+ "fmla v11.8h, v5.8h, v23.8h\n"
+ "fmla v12.8h, v2.8h, v23.8h\n"
+ "fmla v26.8h, v7.8h, v22.8h\n"
+ "fmla v25.8h, v6.8h, v22.8h\n"
+ "fmla v27.8h, v8.8h, v16.8h\n"
+ "ldr q16, [x25, x15]\n"
+ "fmla v10.8h, v8.8h, v22.8h\n"
+ "ldr q30, [x23, x15]\n"
+ "fmla v29.8h, v8.8h, v16.8h\n"
+ "fmla v19.8h, v7.8h, v16.8h\n"
+ "fmla v11.8h, v6.8h, v16.8h\n"
+ "fmla v26.8h, v5.8h, v16.8h\n"
+ "fmla v25.8h, v4.8h, v16.8h\n"
+ "fmla v12.8h, v3.8h, v16.8h\n"
+ "ldr q24, [x22, x15]\n"
+ "fmla v9.8h, v8.8h, v23.8h\n"
+ "ldr q16, [x24, x15]\n"
+ "fmla v17.8h, v4.8h, v30.8h\n"
+ "fmax v17.8h, v17.8h, v13.8h\n"
+ "fmla v20.8h, v3.8h, v30.8h\n"
+ "fmla v21.8h, v5.8h, v24.8h\n"
+ "fmax v20.8h, v20.8h, v13.8h\n"
+ "fmla v28.8h, v4.8h, v24.8h\n"
+ "fmla v26.8h, v8.8h, v16.8h\n"
+ "fmax v21.8h, v21.8h, v13.8h\n"
+ "fmla v25.8h, v7.8h, v16.8h\n"
+ "fmla v12.8h, v6.8h, v16.8h\n"
+ "ldr q23, [x21, x15]\n"
+ "fmax v28.8h, v28.8h, v13.8h\n"
+ "fmla v18.8h, v1.8h, v30.8h\n"
+ "fmla v31.8h, v0.8h, v30.8h\n"
+ "ldr q16, [x20, x15]\n"
+ "fmin v17.8h, v17.8h, v14.8h\n"
+ "fmla v15.8h, v2.8h, v24.8h\n"
+ "fmla v9.8h, v1.8h, v24.8h\n"
+ "fmin v20.8h, v20.8h, v14.8h\n"
+ "str q17, [x12, x14]\n"
+ "fmla v27.8h, v7.8h, v23.8h\n"
+ "fmla v29.8h, v6.8h, v23.8h\n"
+ "fmin v21.8h, v21.8h, v14.8h\n"
+ "str q20, [x11, x14]\n"
+ "fmla v19.8h, v8.8h, v16.8h\n"
+ "fmla v11.8h, v7.8h, v16.8h\n"
+ "fmin v28.8h, v28.8h, v14.8h\n"
+ "str q21, [x10, x14]\n"
+ "fmax v18.8h, v18.8h, v13.8h\n"
+ "fmax v31.8h, v31.8h, v13.8h\n"
+ "str q28, [x9, x14]\n"
+ "ldr x23, [x8, #0x20]\n"
+ "fmax v15.8h, v15.8h, v13.8h\n"
+ "fmax v9.8h, v9.8h, v13.8h\n"
+ "ldr x22, [x8, #0x28]\n"
+ "ldr x21, [x8, #0x30]\n"
+ "ldr x20, [x8, #0x38]\n"
+ "fmla v10.8h, v4.8h, v23.8h\n"
+ "fmla v26.8h, v3.8h, v23.8h\n"
+ "fmin v18.8h, v18.8h, v14.8h\n"
+ "fmla v25.8h, v5.8h, v16.8h\n"
+ "fmla v12.8h, v4.8h, v16.8h\n"
+ "fmin v31.8h, v31.8h, v14.8h\n"
+ "str q18, [x23, x14]\n"
+ "fmin v15.8h, v15.8h, v14.8h\n"
+ "fmin v9.8h, v9.8h, v14.8h\n"
+ "str q31, [x22, x14]\n"
+ "ldr x23, [x8, #0x40]\n"
+ "fmax v27.8h, v27.8h, v13.8h\n"
+ "fmax v29.8h, v29.8h, v13.8h\n"
+ "str q15, [x21, x14]\n"
+ "ldr x22, [x8, #0x48]\n"
+ "fmax v19.8h, v19.8h, v13.8h\n"
+ "fmax v11.8h, v11.8h, v13.8h\n"
+ "str q9, [x20, x14]\n"
+ "ldr x21, [x8, #0x50]\n"
+ "ldr x20, [x8, #0x58]\n"
+ "fmin v27.8h, v27.8h, v14.8h\n"
+ "fmin v29.8h, v29.8h, v14.8h\n"
+ "str q27, [x23, x14]\n"
+ "fmin v19.8h, v19.8h, v14.8h\n"
+ "fmin v11.8h, v11.8h, v14.8h\n"
+ "str q29, [x22, x14]\n"
+ "ldr x23, [x8, #0x60]\n"
+ "fmax v10.8h, v10.8h, v13.8h\n"
+ "fmax v26.8h, v26.8h, v13.8h\n"
+ "str q19, [x21, x14]\n"
+ "ldr x22, [x8, #0x68]\n"
+ "fmax v25.8h, v25.8h, v13.8h\n"
+ "fmax v12.8h, v12.8h, v13.8h\n"
+ "str q11, [x20, x14]\n"
+ "ldr x21, [x8, #0x70]\n"
+ "ldr x20, [x8, #0x78]\n"
+ "fmin v10.8h, v10.8h, v14.8h\n"
+ "fmin v26.8h, v26.8h, v14.8h\n"
+ "str q10, [x23, x14]\n"
+ "fmin v25.8h, v25.8h, v14.8h\n"
+ "fmin v12.8h, v12.8h, v14.8h\n"
+ "str q26, [x22, x14]\n"
+ "add x15, x15, #0x10\n"
+ "str q25, [x21, x14]\n"
+ "str q12, [x20, x14]\n"
+ "3:" // Oddments
+ "tst %x[n_channels], #0x7\n"
+ "beq 140f\n"
+ "ldr q30, [x17, #0x0]\n"
+ "ldr q0, [x17, #0x10]\n"
+ "mov x14, x15\n"
+ "ldr q1, [x17, #0x20]\n"
+ "ldr q2, [x17, #0x30]\n"
+ "ldr q3, [x17, #0x40]\n"
+ "ldr q4, [x17, #0x50]\n"
+ "ldr q5, [x17, #0x60]\n"
+ "ldr q6, [x17, #0x70]\n"
+ "ldr q7, [x17, #0x80]\n"
+ "ldr q8, [x17, #0x90]\n"
+ "ldr x23, [x16, #0x0]\n"
+ "ldr x22, [x16, #0x8]\n"
+ "add x23, x23, x15\n"
+ "add x22, x22, x15\n"
+ "ldr x21, [x16, #0x10]\n"
+ "ldr x20, [x16, #0x18]\n"
+ "add x21, x21, x15\n"
+ "add x20, x20, x15\n"
+ "tbz %x[n_channels], #2, 5f\n"
+ "ld1 { v9.d }[0], [x23], #0x8\n"
+ "ld1 { v10.d }[0], [x22], #0x8\n"
+ "ld1 { v11.d }[0], [x21], #0x8\n"
+ "ld1 { v12.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 4f\n"
+ "ld1 { v9.s }[2], [x23], #0x4\n"
+ "ld1 { v10.s }[2], [x22], #0x4\n"
+ "ld1 { v11.s }[2], [x21], #0x4\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 7f\n"
+ "ld1 { v9.h }[6], [x23], #0x2\n"
+ "ld1 { v10.h }[6], [x22], #0x2\n"
+ "ld1 { v11.h }[6], [x21], #0x2\n"
+ "ld1 { v12.h }[6], [x20], #0x2\n"
+ "b 7f\n"
+ "4:" // Oddments: Load inputs (2, 2), (0, 0), (0, 5), (2, 3): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 7f\n"
+ "ld1 { v9.h }[4], [x23], #0x2\n"
+ "ld1 { v10.h }[4], [x22], #0x2\n"
+ "ld1 { v11.h }[4], [x21], #0x2\n"
+ "ld1 { v12.h }[4], [x20], #0x2\n"
+ "b 7f\n"
+ "5:" // Oddments: Load inputs (2, 2), (0, 0), (0, 5), (2, 3): Bit 2: Unset
+ "tbz %x[n_channels], #1, 6f\n"
+ "ld1 { v9.s }[0], [x23], #0x4\n"
+ "ld1 { v10.s }[0], [x22], #0x4\n"
+ "ld1 { v11.s }[0], [x21], #0x4\n"
+ "ld1 { v12.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 7f\n"
+ "ld1 { v9.h }[2], [x23], #0x2\n"
+ "ld1 { v10.h }[2], [x22], #0x2\n"
+ "ld1 { v11.h }[2], [x21], #0x2\n"
+ "ld1 { v12.h }[2], [x20], #0x2\n"
+ "b 7f\n"
+ "6:" // Oddments: Load inputs (2, 2), (0, 0), (0, 5), (2, 3): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v9.h }[0], [x23], #0x2\n"
+ "ld1 { v10.h }[0], [x22], #0x2\n"
+ "ld1 { v11.h }[0], [x21], #0x2\n"
+ "ld1 { v12.h }[0], [x20], #0x2\n"
+ "7:" // Oddments: Load inputs (2, 2), (0, 0), (0, 5), (2, 3): Bit 2: End
+ "mov v16.16b, v30.16b\n fmla v16.8h, v8.8h, v9.8h\n"
+ "mov v17.16b, v30.16b\n fmla v17.8h, v7.8h, v9.8h\n"
+ "ldr x20, [x16, #0x20]\n"
+ "add x20, x20, x15\n"
+ "mov v18.16b, v30.16b\n fmla v18.8h, v6.8h, v9.8h\n"
+ "mov v21.16b, v30.16b\n fmla v21.8h, v4.8h, v9.8h\n"
+ "mov v22.16b, v30.16b\n fmla v22.8h, v3.8h, v9.8h\n"
+ "mov v25.16b, v30.16b\n fmla v25.8h, v1.8h, v9.8h\n"
+ "mov v26.16b, v30.16b\n fmla v26.8h, v0.8h, v9.8h\n"
+ "mov v19.16b, v30.16b\n fmla v19.8h, v2.8h, v11.8h\n"
+ "mov v20.16b, v30.16b\n fmla v20.8h, v5.8h, v9.8h\n"
+ "mov v24.16b, v30.16b\n fmla v24.8h, v2.8h, v9.8h\n"
+ "fmla v16.8h, v0.8h, v10.8h\n"
+ "fmla v17.8h, v8.8h, v12.8h\n"
+ "fmla v18.8h, v7.8h, v12.8h\n"
+ "fmla v19.8h, v6.8h, v12.8h\n"
+ "fmla v21.8h, v5.8h, v12.8h\n"
+ "fmla v22.8h, v4.8h, v12.8h\n"
+ "mov v23.16b, v30.16b\n fmla v23.8h, v3.8h, v12.8h\n"
+ "fmla v25.8h, v2.8h, v12.8h\n"
+ "fmla v26.8h, v1.8h, v12.8h\n"
+ "mov v27.16b, v30.16b\n fmla v27.8h, v0.8h, v12.8h\n"
+ "tbz %x[n_channels], #2, 9f\n"
+ "ld1 { v10.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 8f\n"
+ "ld1 { v10.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 11f\n"
+ "ld1 { v10.h }[6], [x20], #0x2\n"
+ "b 11f\n"
+ "8:" // Oddments: Load input (5, 0): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 11f\n"
+ "ld1 { v10.h }[4], [x20], #0x2\n"
+ "b 11f\n"
+ "9:" // Oddments: Load input (5, 0): Bit 2: Unset
+ "tbz %x[n_channels], #1, 10f\n"
+ "ld1 { v10.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 11f\n"
+ "ld1 { v10.h }[2], [x20], #0x2\n"
+ "b 11f\n"
+ "10:" // Oddments: Load input (5, 0): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v10.h }[0], [x20], #0x2\n"
+ "11:" // Oddments: Load input (5, 0): Bit 2: End
+ "ldr x20, [x16, #0x28]\n"
+ "mov v28.16b, v30.16b\n fmla v28.8h, v6.8h, v10.8h\n"
+ "add x20, x20, x15\n"
+ "tbz %x[n_channels], #2, 13f\n"
+ "ld1 { v11.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 12f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 15f\n"
+ "ld1 { v11.h }[6], [x20], #0x2\n"
+ "b 15f\n"
+ "12:" // Oddments: Load input (5, 5): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 15f\n"
+ "ld1 { v11.h }[4], [x20], #0x2\n"
+ "b 15f\n"
+ "13:" // Oddments: Load input (5, 5): Bit 2: Unset
+ "tbz %x[n_channels], #1, 14f\n"
+ "ld1 { v11.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 15f\n"
+ "ld1 { v11.h }[2], [x20], #0x2\n"
+ "b 15f\n"
+ "14:" // Oddments: Load input (5, 5): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v11.h }[0], [x20], #0x2\n"
+ "15:" // Oddments: Load input (5, 5): Bit 2: End
+ "ldr x20, [x16, #0x30]\n"
+ "mov v31.16b, v30.16b\n fmla v31.8h, v8.8h, v11.8h\n"
+ "add x20, x20, x15\n"
+ "tbz %x[n_channels], #2, 17f\n"
+ "ld1 { v9.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 16f\n"
+ "ld1 { v9.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 19f\n"
+ "ld1 { v9.h }[6], [x20], #0x2\n"
+ "b 19f\n"
+ "16:" // Oddments: Load input (3, 2): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 19f\n"
+ "ld1 { v9.h }[4], [x20], #0x2\n"
+ "b 19f\n"
+ "17:" // Oddments: Load input (3, 2): Bit 2: Unset
+ "tbz %x[n_channels], #1, 18f\n"
+ "ld1 { v9.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 19f\n"
+ "ld1 { v9.h }[2], [x20], #0x2\n"
+ "b 19f\n"
+ "18:" // Oddments: Load input (3, 2): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v9.h }[0], [x20], #0x2\n"
+ "19:" // Oddments: Load input (3, 2): Bit 2: End
+ "ldr x20, [x16, #0x38]\n"
+ "fmla v20.8h, v8.8h, v9.8h\n"
+ "fmla v21.8h, v7.8h, v9.8h\n"
+ "add x20, x20, x15\n"
+ "fmla v22.8h, v6.8h, v9.8h\n"
+ "fmla v24.8h, v5.8h, v9.8h\n"
+ "fmla v25.8h, v4.8h, v9.8h\n"
+ "fmla v26.8h, v3.8h, v9.8h\n"
+ "fmla v28.8h, v2.8h, v9.8h\n"
+ "mov v29.16b, v30.16b\n fmla v29.8h, v1.8h, v9.8h\n"
+ "fmla v30.8h, v0.8h, v9.8h\n"
+ "tbz %x[n_channels], #2, 21f\n"
+ "ld1 { v12.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 20f\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 23f\n"
+ "ld1 { v12.h }[6], [x20], #0x2\n"
+ "b 23f\n"
+ "20:" // Oddments: Load input (0, 1): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 23f\n"
+ "ld1 { v12.h }[4], [x20], #0x2\n"
+ "b 23f\n"
+ "21:" // Oddments: Load input (0, 1): Bit 2: Unset
+ "tbz %x[n_channels], #1, 22f\n"
+ "ld1 { v12.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 23f\n"
+ "ld1 { v12.h }[2], [x20], #0x2\n"
+ "b 23f\n"
+ "22:" // Oddments: Load input (0, 1): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v12.h }[0], [x20], #0x2\n"
+ "23:" // Oddments: Load input (0, 1): Bit 2: End
+ "ldr x20, [x16, #0x40]\n"
+ "fmla v16.8h, v1.8h, v12.8h\n"
+ "fmla v17.8h, v0.8h, v12.8h\n"
+ "add x20, x20, x15\n"
+ "tbz %x[n_channels], #2, 25f\n"
+ "ld1 { v11.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 24f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 27f\n"
+ "ld1 { v11.h }[6], [x20], #0x2\n"
+ "b 27f\n"
+ "24:" // Oddments: Load input (0, 4): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 27f\n"
+ "ld1 { v11.h }[4], [x20], #0x2\n"
+ "b 27f\n"
+ "25:" // Oddments: Load input (0, 4): Bit 2: Unset
+ "tbz %x[n_channels], #1, 26f\n"
+ "ld1 { v11.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 27f\n"
+ "ld1 { v11.h }[2], [x20], #0x2\n"
+ "b 27f\n"
+ "26:" // Oddments: Load input (0, 4): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v11.h }[0], [x20], #0x2\n"
+ "27:" // Oddments: Load input (0, 4): Bit 2: End
+ "ldr x20, [x16, #0x48]\n"
+ "fmla v18.8h, v2.8h, v11.8h\n"
+ "fmla v19.8h, v1.8h, v11.8h\n"
+ "add x20, x20, x15\n"
+ "tbz %x[n_channels], #2, 29f\n"
+ "ld1 { v10.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 28f\n"
+ "ld1 { v10.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 31f\n"
+ "ld1 { v10.h }[6], [x20], #0x2\n"
+ "b 31f\n"
+ "28:" // Oddments: Load input (3, 3): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 31f\n"
+ "ld1 { v10.h }[4], [x20], #0x2\n"
+ "b 31f\n"
+ "29:" // Oddments: Load input (3, 3): Bit 2: Unset
+ "tbz %x[n_channels], #1, 30f\n"
+ "ld1 { v10.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 31f\n"
+ "ld1 { v10.h }[2], [x20], #0x2\n"
+ "b 31f\n"
+ "30:" // Oddments: Load input (3, 3): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v10.h }[0], [x20], #0x2\n"
+ "31:" // Oddments: Load input (3, 3): Bit 2: End
+ "ldr x20, [x16, #0x50]\n"
+ "fmla v21.8h, v8.8h, v10.8h\n"
+ "fmla v22.8h, v7.8h, v10.8h\n"
+ "add x20, x20, x15\n"
+ "fmla v23.8h, v6.8h, v10.8h\n"
+ "fmla v25.8h, v5.8h, v10.8h\n"
+ "fmla v26.8h, v4.8h, v10.8h\n"
+ "fmla v27.8h, v3.8h, v10.8h\n"
+ "fmla v29.8h, v2.8h, v10.8h\n"
+ "fmla v30.8h, v1.8h, v10.8h\n"
+ "fmla v31.8h, v0.8h, v10.8h\n"
+ "tbz %x[n_channels], #2, 33f\n"
+ "ld1 { v9.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 32f\n"
+ "ld1 { v9.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 35f\n"
+ "ld1 { v9.h }[6], [x20], #0x2\n"
+ "b 35f\n"
+ "32:" // Oddments: Load input (1, 0): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 35f\n"
+ "ld1 { v9.h }[4], [x20], #0x2\n"
+ "b 35f\n"
+ "33:" // Oddments: Load input (1, 0): Bit 2: Unset
+ "tbz %x[n_channels], #1, 34f\n"
+ "ld1 { v9.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 35f\n"
+ "ld1 { v9.h }[2], [x20], #0x2\n"
+ "b 35f\n"
+ "34:" // Oddments: Load input (1, 0): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v9.h }[0], [x20], #0x2\n"
+ "35:" // Oddments: Load input (1, 0): Bit 2: End
+ "ldr x20, [x16, #0x58]\n"
+ "fmla v16.8h, v3.8h, v9.8h\n"
+ "fmla v20.8h, v0.8h, v9.8h\n"
+ "add x20, x20, x15\n"
+ "tbz %x[n_channels], #2, 37f\n"
+ "ld1 { v12.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 36f\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 39f\n"
+ "ld1 { v12.h }[6], [x20], #0x2\n"
+ "b 39f\n"
+ "36:" // Oddments: Load input (1, 5): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 39f\n"
+ "ld1 { v12.h }[4], [x20], #0x2\n"
+ "b 39f\n"
+ "37:" // Oddments: Load input (1, 5): Bit 2: Unset
+ "tbz %x[n_channels], #1, 38f\n"
+ "ld1 { v12.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 39f\n"
+ "ld1 { v12.h }[2], [x20], #0x2\n"
+ "b 39f\n"
+ "38:" // Oddments: Load input (1, 5): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v12.h }[0], [x20], #0x2\n"
+ "39:" // Oddments: Load input (1, 5): Bit 2: End
+ "ldr x20, [x16, #0x60]\n"
+ "fmla v19.8h, v5.8h, v12.8h\n"
+ "fmla v23.8h, v2.8h, v12.8h\n"
+ "add x20, x20, x15\n"
+ "tbz %x[n_channels], #2, 41f\n"
+ "ld1 { v11.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 40f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 43f\n"
+ "ld1 { v11.h }[6], [x20], #0x2\n"
+ "b 43f\n"
+ "40:" // Oddments: Load input (4, 0): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 43f\n"
+ "ld1 { v11.h }[4], [x20], #0x2\n"
+ "b 43f\n"
+ "41:" // Oddments: Load input (4, 0): Bit 2: Unset
+ "tbz %x[n_channels], #1, 42f\n"
+ "ld1 { v11.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 43f\n"
+ "ld1 { v11.h }[2], [x20], #0x2\n"
+ "b 43f\n"
+ "42:" // Oddments: Load input (4, 0): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v11.h }[0], [x20], #0x2\n"
+ "43:" // Oddments: Load input (4, 0): Bit 2: End
+ "ldr x20, [x16, #0x68]\n"
+ "fmla v24.8h, v6.8h, v11.8h\n"
+ "fmla v28.8h, v3.8h, v11.8h\n"
+ "add x20, x20, x15\n"
+ "tbz %x[n_channels], #2, 45f\n"
+ "ld1 { v10.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 44f\n"
+ "ld1 { v10.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 47f\n"
+ "ld1 { v10.h }[6], [x20], #0x2\n"
+ "b 47f\n"
+ "44:" // Oddments: Load input (1, 2): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 47f\n"
+ "ld1 { v10.h }[4], [x20], #0x2\n"
+ "b 47f\n"
+ "45:" // Oddments: Load input (1, 2): Bit 2: Unset
+ "tbz %x[n_channels], #1, 46f\n"
+ "ld1 { v10.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 47f\n"
+ "ld1 { v10.h }[2], [x20], #0x2\n"
+ "b 47f\n"
+ "46:" // Oddments: Load input (1, 2): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v10.h }[0], [x20], #0x2\n"
+ "47:" // Oddments: Load input (1, 2): Bit 2: End
+ "ldr x20, [x16, #0x70]\n"
+ "fmla v16.8h, v5.8h, v10.8h\n"
+ "fmla v17.8h, v4.8h, v10.8h\n"
+ "add x20, x20, x15\n"
+ "fmla v18.8h, v3.8h, v10.8h\n"
+ "fmla v20.8h, v2.8h, v10.8h\n"
+ "fmla v21.8h, v1.8h, v10.8h\n"
+ "fmla v22.8h, v0.8h, v10.8h\n"
+ "tbz %x[n_channels], #2, 49f\n"
+ "ld1 { v11.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 48f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 51f\n"
+ "ld1 { v11.h }[6], [x20], #0x2\n"
+ "b 51f\n"
+ "48:" // Oddments: Load input (4, 5): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 51f\n"
+ "ld1 { v11.h }[4], [x20], #0x2\n"
+ "b 51f\n"
+ "49:" // Oddments: Load input (4, 5): Bit 2: Unset
+ "tbz %x[n_channels], #1, 50f\n"
+ "ld1 { v11.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 51f\n"
+ "ld1 { v11.h }[2], [x20], #0x2\n"
+ "b 51f\n"
+ "50:" // Oddments: Load input (4, 5): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v11.h }[0], [x20], #0x2\n"
+ "51:" // Oddments: Load input (4, 5): Bit 2: End
+ "ldr x20, [x16, #0x78]\n"
+ "fmla v27.8h, v8.8h, v11.8h\n"
+ "fmla v31.8h, v5.8h, v11.8h\n"
+ "add x20, x20, x15\n"
+ "tbz %x[n_channels], #2, 53f\n"
+ "ld1 { v12.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 52f\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 55f\n"
+ "ld1 { v12.h }[6], [x20], #0x2\n"
+ "b 55f\n"
+ "52:" // Oddments: Load input (1, 3): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 55f\n"
+ "ld1 { v12.h }[4], [x20], #0x2\n"
+ "b 55f\n"
+ "53:" // Oddments: Load input (1, 3): Bit 2: Unset
+ "tbz %x[n_channels], #1, 54f\n"
+ "ld1 { v12.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 55f\n"
+ "ld1 { v12.h }[2], [x20], #0x2\n"
+ "b 55f\n"
+ "54:" // Oddments: Load input (1, 3): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v12.h }[0], [x20], #0x2\n"
+ "55:" // Oddments: Load input (1, 3): Bit 2: End
+ "ldr x20, [x16, #0x80]\n"
+ "fmla v17.8h, v5.8h, v12.8h\n"
+ "fmla v18.8h, v4.8h, v12.8h\n"
+ "add x20, x20, x15\n"
+ "fmla v19.8h, v3.8h, v12.8h\n"
+ "fmla v21.8h, v2.8h, v12.8h\n"
+ "fmla v22.8h, v1.8h, v12.8h\n"
+ "fmla v23.8h, v0.8h, v12.8h\n"
+ "tbz %x[n_channels], #2, 57f\n"
+ "ld1 { v11.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 56f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 59f\n"
+ "ld1 { v11.h }[6], [x20], #0x2\n"
+ "b 59f\n"
+ "56:" // Oddments: Load input (5, 1): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 59f\n"
+ "ld1 { v11.h }[4], [x20], #0x2\n"
+ "b 59f\n"
+ "57:" // Oddments: Load input (5, 1): Bit 2: Unset
+ "tbz %x[n_channels], #1, 58f\n"
+ "ld1 { v11.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 59f\n"
+ "ld1 { v11.h }[2], [x20], #0x2\n"
+ "b 59f\n"
+ "58:" // Oddments: Load input (5, 1): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v11.h }[0], [x20], #0x2\n"
+ "59:" // Oddments: Load input (5, 1): Bit 2: End
+ "ldr x20, [x16, #0x88]\n"
+ "fmla v28.8h, v7.8h, v11.8h\n"
+ "fmla v29.8h, v6.8h, v11.8h\n"
+ "add x20, x20, x15\n"
+ "tbz %x[n_channels], #2, 61f\n"
+ "ld1 { v10.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 60f\n"
+ "ld1 { v10.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 63f\n"
+ "ld1 { v10.h }[6], [x20], #0x2\n"
+ "b 63f\n"
+ "60:" // Oddments: Load input (2, 1): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 63f\n"
+ "ld1 { v10.h }[4], [x20], #0x2\n"
+ "b 63f\n"
+ "61:" // Oddments: Load input (2, 1): Bit 2: Unset
+ "tbz %x[n_channels], #1, 62f\n"
+ "ld1 { v10.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 63f\n"
+ "ld1 { v10.h }[2], [x20], #0x2\n"
+ "b 63f\n"
+ "62:" // Oddments: Load input (2, 1): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v10.h }[0], [x20], #0x2\n"
+ "63:" // Oddments: Load input (2, 1): Bit 2: End
+ "ldr x20, [x16, #0x90]\n"
+ "fmla v16.8h, v7.8h, v10.8h\n"
+ "fmla v17.8h, v6.8h, v10.8h\n"
+ "add x20, x20, x15\n"
+ "fmla v20.8h, v4.8h, v10.8h\n"
+ "fmla v21.8h, v3.8h, v10.8h\n"
+ "fmla v24.8h, v1.8h, v10.8h\n"
+ "fmla v25.8h, v0.8h, v10.8h\n"
+ "tbz %x[n_channels], #2, 65f\n"
+ "ld1 { v11.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 64f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 67f\n"
+ "ld1 { v11.h }[6], [x20], #0x2\n"
+ "b 67f\n"
+ "64:" // Oddments: Load input (5, 4): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 67f\n"
+ "ld1 { v11.h }[4], [x20], #0x2\n"
+ "b 67f\n"
+ "65:" // Oddments: Load input (5, 4): Bit 2: Unset
+ "tbz %x[n_channels], #1, 66f\n"
+ "ld1 { v11.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 67f\n"
+ "ld1 { v11.h }[2], [x20], #0x2\n"
+ "b 67f\n"
+ "66:" // Oddments: Load input (5, 4): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v11.h }[0], [x20], #0x2\n"
+ "67:" // Oddments: Load input (5, 4): Bit 2: End
+ "ldr x20, [x16, #0x98]\n"
+ "fmla v30.8h, v8.8h, v11.8h\n"
+ "fmla v31.8h, v7.8h, v11.8h\n"
+ "add x20, x20, x15\n"
+ "tbz %x[n_channels], #2, 69f\n"
+ "ld1 { v12.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 68f\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 71f\n"
+ "ld1 { v12.h }[6], [x20], #0x2\n"
+ "b 71f\n"
+ "68:" // Oddments: Load input (2, 4): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 71f\n"
+ "ld1 { v12.h }[4], [x20], #0x2\n"
+ "b 71f\n"
+ "69:" // Oddments: Load input (2, 4): Bit 2: Unset
+ "tbz %x[n_channels], #1, 70f\n"
+ "ld1 { v12.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 71f\n"
+ "ld1 { v12.h }[2], [x20], #0x2\n"
+ "b 71f\n"
+ "70:" // Oddments: Load input (2, 4): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v12.h }[0], [x20], #0x2\n"
+ "71:" // Oddments: Load input (2, 4): Bit 2: End
+ "ldr x20, [x16, #0xa0]\n"
+ "fmla v18.8h, v8.8h, v12.8h\n"
+ "fmla v19.8h, v7.8h, v12.8h\n"
+ "add x20, x20, x15\n"
+ "fmla v22.8h, v5.8h, v12.8h\n"
+ "fmla v23.8h, v4.8h, v12.8h\n"
+ "fmla v26.8h, v2.8h, v12.8h\n"
+ "fmla v27.8h, v1.8h, v12.8h\n"
+ "tbz %x[n_channels], #2, 73f\n"
+ "ld1 { v10.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 72f\n"
+ "ld1 { v10.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 75f\n"
+ "ld1 { v10.h }[6], [x20], #0x2\n"
+ "b 75f\n"
+ "72:" // Oddments: Load input (0, 2): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 75f\n"
+ "ld1 { v10.h }[4], [x20], #0x2\n"
+ "b 75f\n"
+ "73:" // Oddments: Load input (0, 2): Bit 2: Unset
+ "tbz %x[n_channels], #1, 74f\n"
+ "ld1 { v10.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 75f\n"
+ "ld1 { v10.h }[2], [x20], #0x2\n"
+ "b 75f\n"
+ "74:" // Oddments: Load input (0, 2): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v10.h }[0], [x20], #0x2\n"
+ "75:" // Oddments: Load input (0, 2): Bit 2: End
+ "ldr x20, [x16, #0xa8]\n"
+ "fmla v16.8h, v2.8h, v10.8h\n"
+ "fmla v17.8h, v1.8h, v10.8h\n"
+ "add x20, x20, x15\n"
+ "fmla v18.8h, v0.8h, v10.8h\n"
+ "tbz %x[n_channels], #2, 77f\n"
+ "ld1 { v11.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 76f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 79f\n"
+ "ld1 { v11.h }[6], [x20], #0x2\n"
+ "b 79f\n"
+ "76:" // Oddments: Load input (3, 1): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 79f\n"
+ "ld1 { v11.h }[4], [x20], #0x2\n"
+ "b 79f\n"
+ "77:" // Oddments: Load input (3, 1): Bit 2: Unset
+ "tbz %x[n_channels], #1, 78f\n"
+ "ld1 { v11.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 79f\n"
+ "ld1 { v11.h }[2], [x20], #0x2\n"
+ "b 79f\n"
+ "78:" // Oddments: Load input (3, 1): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v11.h }[0], [x20], #0x2\n"
+ "79:" // Oddments: Load input (3, 1): Bit 2: End
+ "ldr x20, [x16, #0xb0]\n"
+ "fmla v20.8h, v7.8h, v11.8h\n"
+ "fmla v21.8h, v6.8h, v11.8h\n"
+ "add x20, x20, x15\n"
+ "fmla v24.8h, v4.8h, v11.8h\n"
+ "fmla v25.8h, v3.8h, v11.8h\n"
+ "fmla v28.8h, v1.8h, v11.8h\n"
+ "fmla v29.8h, v0.8h, v11.8h\n"
+ "tbz %x[n_channels], #2, 81f\n"
+ "ld1 { v12.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 80f\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 83f\n"
+ "ld1 { v12.h }[6], [x20], #0x2\n"
+ "b 83f\n"
+ "80:" // Oddments: Load input (0, 3): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 83f\n"
+ "ld1 { v12.h }[4], [x20], #0x2\n"
+ "b 83f\n"
+ "81:" // Oddments: Load input (0, 3): Bit 2: Unset
+ "tbz %x[n_channels], #1, 82f\n"
+ "ld1 { v12.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 83f\n"
+ "ld1 { v12.h }[2], [x20], #0x2\n"
+ "b 83f\n"
+ "82:" // Oddments: Load input (0, 3): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v12.h }[0], [x20], #0x2\n"
+ "83:" // Oddments: Load input (0, 3): Bit 2: End
+ "ldr x20, [x16, #0xb8]\n"
+ "fmla v17.8h, v2.8h, v12.8h\n"
+ "fmla v18.8h, v1.8h, v12.8h\n"
+ "add x20, x20, x15\n"
+ "fmla v19.8h, v0.8h, v12.8h\n"
+ "tbz %x[n_channels], #2, 85f\n"
+ "ld1 { v10.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 84f\n"
+ "ld1 { v10.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 87f\n"
+ "ld1 { v10.h }[6], [x20], #0x2\n"
+ "b 87f\n"
+ "84:" // Oddments: Load input (2, 0): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 87f\n"
+ "ld1 { v10.h }[4], [x20], #0x2\n"
+ "b 87f\n"
+ "85:" // Oddments: Load input (2, 0): Bit 2: Unset
+ "tbz %x[n_channels], #1, 86f\n"
+ "ld1 { v10.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 87f\n"
+ "ld1 { v10.h }[2], [x20], #0x2\n"
+ "b 87f\n"
+ "86:" // Oddments: Load input (2, 0): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v10.h }[0], [x20], #0x2\n"
+ "87:" // Oddments: Load input (2, 0): Bit 2: End
+ "ldr x20, [x16, #0xc0]\n"
+ "fmla v16.8h, v6.8h, v10.8h\n"
+ "fmla v20.8h, v3.8h, v10.8h\n"
+ "add x20, x20, x15\n"
+ "fmla v24.8h, v0.8h, v10.8h\n"
+ "tbz %x[n_channels], #2, 89f\n"
+ "ld1 { v11.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 88f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 91f\n"
+ "ld1 { v11.h }[6], [x20], #0x2\n"
+ "b 91f\n"
+ "88:" // Oddments: Load input (3, 4): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 91f\n"
+ "ld1 { v11.h }[4], [x20], #0x2\n"
+ "b 91f\n"
+ "89:" // Oddments: Load input (3, 4): Bit 2: Unset
+ "tbz %x[n_channels], #1, 90f\n"
+ "ld1 { v11.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 91f\n"
+ "ld1 { v11.h }[2], [x20], #0x2\n"
+ "b 91f\n"
+ "90:" // Oddments: Load input (3, 4): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v11.h }[0], [x20], #0x2\n"
+ "91:" // Oddments: Load input (3, 4): Bit 2: End
+ "ldr x20, [x16, #0xc8]\n"
+ "fmla v22.8h, v8.8h, v11.8h\n"
+ "fmla v23.8h, v7.8h, v11.8h\n"
+ "add x20, x20, x15\n"
+ "fmla v26.8h, v5.8h, v11.8h\n"
+ "fmla v27.8h, v4.8h, v11.8h\n"
+ "fmla v30.8h, v2.8h, v11.8h\n"
+ "fmla v31.8h, v1.8h, v11.8h\n"
+ "tbz %x[n_channels], #2, 93f\n"
+ "ld1 { v12.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 92f\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 95f\n"
+ "ld1 { v12.h }[6], [x20], #0x2\n"
+ "b 95f\n"
+ "92:" // Oddments: Load input (2, 5): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 95f\n"
+ "ld1 { v12.h }[4], [x20], #0x2\n"
+ "b 95f\n"
+ "93:" // Oddments: Load input (2, 5): Bit 2: Unset
+ "tbz %x[n_channels], #1, 94f\n"
+ "ld1 { v12.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 95f\n"
+ "ld1 { v12.h }[2], [x20], #0x2\n"
+ "b 95f\n"
+ "94:" // Oddments: Load input (2, 5): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v12.h }[0], [x20], #0x2\n"
+ "95:" // Oddments: Load input (2, 5): Bit 2: End
+ "ldr x20, [x16, #0xd0]\n"
+ "fmla v19.8h, v8.8h, v12.8h\n"
+ "fmla v23.8h, v5.8h, v12.8h\n"
+ "add x20, x20, x15\n"
+ "fmla v27.8h, v2.8h, v12.8h\n"
+ "tbz %x[n_channels], #2, 97f\n"
+ "ld1 { v10.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 96f\n"
+ "ld1 { v10.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 99f\n"
+ "ld1 { v10.h }[6], [x20], #0x2\n"
+ "b 99f\n"
+ "96:" // Oddments: Load input (3, 0): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 99f\n"
+ "ld1 { v10.h }[4], [x20], #0x2\n"
+ "b 99f\n"
+ "97:" // Oddments: Load input (3, 0): Bit 2: Unset
+ "tbz %x[n_channels], #1, 98f\n"
+ "ld1 { v10.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 99f\n"
+ "ld1 { v10.h }[2], [x20], #0x2\n"
+ "b 99f\n"
+ "98:" // Oddments: Load input (3, 0): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v10.h }[0], [x20], #0x2\n"
+ "99:" // Oddments: Load input (3, 0): Bit 2: End
+ "ldr x20, [x16, #0xd8]\n"
+ "fmla v20.8h, v6.8h, v10.8h\n"
+ "fmla v24.8h, v3.8h, v10.8h\n"
+ "add x20, x20, x15\n"
+ "fmla v28.8h, v0.8h, v10.8h\n"
+ "tbz %x[n_channels], #2, 101f\n"
+ "ld1 { v11.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 100f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 103f\n"
+ "ld1 { v11.h }[6], [x20], #0x2\n"
+ "b 103f\n"
+ "100:" // Oddments: Load input (4, 2): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 103f\n"
+ "ld1 { v11.h }[4], [x20], #0x2\n"
+ "b 103f\n"
+ "101:" // Oddments: Load input (4, 2): Bit 2: Unset
+ "tbz %x[n_channels], #1, 102f\n"
+ "ld1 { v11.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 103f\n"
+ "ld1 { v11.h }[2], [x20], #0x2\n"
+ "b 103f\n"
+ "102:" // Oddments: Load input (4, 2): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v11.h }[0], [x20], #0x2\n"
+ "103:" // Oddments: Load input (4, 2): Bit 2: End
+ "ldr x20, [x16, #0xe0]\n"
+ "fmla v24.8h, v8.8h, v11.8h\n"
+ "fmla v25.8h, v7.8h, v11.8h\n"
+ "add x20, x20, x15\n"
+ "fmla v26.8h, v6.8h, v11.8h\n"
+ "fmla v28.8h, v5.8h, v11.8h\n"
+ "fmla v29.8h, v4.8h, v11.8h\n"
+ "fmla v30.8h, v3.8h, v11.8h\n"
+ "tbz %x[n_channels], #2, 105f\n"
+ "ld1 { v12.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 104f\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 107f\n"
+ "ld1 { v12.h }[6], [x20], #0x2\n"
+ "b 107f\n"
+ "104:" // Oddments: Load input (3, 5): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 107f\n"
+ "ld1 { v12.h }[4], [x20], #0x2\n"
+ "b 107f\n"
+ "105:" // Oddments: Load input (3, 5): Bit 2: Unset
+ "tbz %x[n_channels], #1, 106f\n"
+ "ld1 { v12.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 107f\n"
+ "ld1 { v12.h }[2], [x20], #0x2\n"
+ "b 107f\n"
+ "106:" // Oddments: Load input (3, 5): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v12.h }[0], [x20], #0x2\n"
+ "107:" // Oddments: Load input (3, 5): Bit 2: End
+ "ldr x20, [x16, #0xe8]\n"
+ "fmla v23.8h, v8.8h, v12.8h\n"
+ "fmla v27.8h, v5.8h, v12.8h\n"
+ "add x20, x20, x15\n"
+ "fmla v31.8h, v2.8h, v12.8h\n"
+ "tbz %x[n_channels], #2, 109f\n"
+ "ld1 { v10.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 108f\n"
+ "ld1 { v10.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 111f\n"
+ "ld1 { v10.h }[6], [x20], #0x2\n"
+ "b 111f\n"
+ "108:" // Oddments: Load input (5, 2): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 111f\n"
+ "ld1 { v10.h }[4], [x20], #0x2\n"
+ "b 111f\n"
+ "109:" // Oddments: Load input (5, 2): Bit 2: Unset
+ "tbz %x[n_channels], #1, 110f\n"
+ "ld1 { v10.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 111f\n"
+ "ld1 { v10.h }[2], [x20], #0x2\n"
+ "b 111f\n"
+ "110:" // Oddments: Load input (5, 2): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v10.h }[0], [x20], #0x2\n"
+ "111:" // Oddments: Load input (5, 2): Bit 2: End
+ "ldr x20, [x16, #0xf0]\n"
+ "fmla v28.8h, v8.8h, v10.8h\n"
+ "fmla v29.8h, v7.8h, v10.8h\n"
+ "add x20, x20, x15\n"
+ "fmla v30.8h, v6.8h, v10.8h\n"
+ "tbz %x[n_channels], #2, 113f\n"
+ "ld1 { v11.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 112f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 115f\n"
+ "ld1 { v11.h }[6], [x20], #0x2\n"
+ "b 115f\n"
+ "112:" // Oddments: Load input (4, 3): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 115f\n"
+ "ld1 { v11.h }[4], [x20], #0x2\n"
+ "b 115f\n"
+ "113:" // Oddments: Load input (4, 3): Bit 2: Unset
+ "tbz %x[n_channels], #1, 114f\n"
+ "ld1 { v11.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 115f\n"
+ "ld1 { v11.h }[2], [x20], #0x2\n"
+ "b 115f\n"
+ "114:" // Oddments: Load input (4, 3): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v11.h }[0], [x20], #0x2\n"
+ "115:" // Oddments: Load input (4, 3): Bit 2: End
+ "ldr x20, [x16, #0xf8]\n"
+ "fmla v25.8h, v8.8h, v11.8h\n"
+ "fmla v26.8h, v7.8h, v11.8h\n"
+ "add x20, x20, x15\n"
+ "fmla v27.8h, v6.8h, v11.8h\n"
+ "fmla v29.8h, v5.8h, v11.8h\n"
+ "fmla v30.8h, v4.8h, v11.8h\n"
+ "fmla v31.8h, v3.8h, v11.8h\n"
+ "tbz %x[n_channels], #2, 117f\n"
+ "ld1 { v12.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 116f\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 119f\n"
+ "ld1 { v12.h }[6], [x20], #0x2\n"
+ "b 119f\n"
+ "116:" // Oddments: Load input (5, 3): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 119f\n"
+ "ld1 { v12.h }[4], [x20], #0x2\n"
+ "b 119f\n"
+ "117:" // Oddments: Load input (5, 3): Bit 2: Unset
+ "tbz %x[n_channels], #1, 118f\n"
+ "ld1 { v12.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 119f\n"
+ "ld1 { v12.h }[2], [x20], #0x2\n"
+ "b 119f\n"
+ "118:" // Oddments: Load input (5, 3): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v12.h }[0], [x20], #0x2\n"
+ "119:" // Oddments: Load input (5, 3): Bit 2: End
+ "ldr x20, [x16, #0x100]\n"
+ "fmla v29.8h, v8.8h, v12.8h\n"
+ "fmla v30.8h, v7.8h, v12.8h\n"
+ "add x20, x20, x15\n"
+ "fmla v31.8h, v6.8h, v12.8h\n"
+ "tbz %x[n_channels], #2, 121f\n"
+ "ld1 { v10.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 120f\n"
+ "ld1 { v10.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 123f\n"
+ "ld1 { v10.h }[6], [x20], #0x2\n"
+ "b 123f\n"
+ "120:" // Oddments: Load input (1, 1): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 123f\n"
+ "ld1 { v10.h }[4], [x20], #0x2\n"
+ "b 123f\n"
+ "121:" // Oddments: Load input (1, 1): Bit 2: Unset
+ "tbz %x[n_channels], #1, 122f\n"
+ "ld1 { v10.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 123f\n"
+ "ld1 { v10.h }[2], [x20], #0x2\n"
+ "b 123f\n"
+ "122:" // Oddments: Load input (1, 1): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v10.h }[0], [x20], #0x2\n"
+ "123:" // Oddments: Load input (1, 1): Bit 2: End
+ "ldr x20, [x16, #0x108]\n"
+ "fmla v16.8h, v4.8h, v10.8h\n"
+ "fmla v17.8h, v3.8h, v10.8h\n"
+ "add x20, x20, x15\n"
+ "fmla v20.8h, v1.8h, v10.8h\n"
+ "fmla v21.8h, v0.8h, v10.8h\n"
+ "tbz %x[n_channels], #2, 125f\n"
+ "ld1 { v11.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 124f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 127f\n"
+ "ld1 { v11.h }[6], [x20], #0x2\n"
+ "b 127f\n"
+ "124:" // Oddments: Load input (1, 4): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 127f\n"
+ "ld1 { v11.h }[4], [x20], #0x2\n"
+ "b 127f\n"
+ "125:" // Oddments: Load input (1, 4): Bit 2: Unset
+ "tbz %x[n_channels], #1, 126f\n"
+ "ld1 { v11.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 127f\n"
+ "ld1 { v11.h }[2], [x20], #0x2\n"
+ "b 127f\n"
+ "126:" // Oddments: Load input (1, 4): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v11.h }[0], [x20], #0x2\n"
+ "127:" // Oddments: Load input (1, 4): Bit 2: End
+ "ldr x20, [x16, #0x110]\n"
+ "fmla v18.8h, v5.8h, v11.8h\n"
+ "fmla v19.8h, v4.8h, v11.8h\n"
+ "add x20, x20, x15\n"
+ "fmla v22.8h, v2.8h, v11.8h\n"
+ "fmla v23.8h, v1.8h, v11.8h\n"
+ "tbz %x[n_channels], #2, 129f\n"
+ "ld1 { v12.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 128f\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 131f\n"
+ "ld1 { v12.h }[6], [x20], #0x2\n"
+ "b 131f\n"
+ "128:" // Oddments: Load input (4, 1): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 131f\n"
+ "ld1 { v12.h }[4], [x20], #0x2\n"
+ "b 131f\n"
+ "129:" // Oddments: Load input (4, 1): Bit 2: Unset
+ "tbz %x[n_channels], #1, 130f\n"
+ "ld1 { v12.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 131f\n"
+ "ld1 { v12.h }[2], [x20], #0x2\n"
+ "b 131f\n"
+ "130:" // Oddments: Load input (4, 1): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v12.h }[0], [x20], #0x2\n"
+ "131:" // Oddments: Load input (4, 1): Bit 2: End
+ "ldr x20, [x16, #0x118]\n"
+ "fmla v24.8h, v7.8h, v12.8h\n"
+ "fmla v25.8h, v6.8h, v12.8h\n"
+ "add x20, x20, x15\n"
+ "fmla v28.8h, v4.8h, v12.8h\n"
+ "fmla v29.8h, v3.8h, v12.8h\n"
+ "tbz %x[n_channels], #2, 133f\n"
+ "ld1 { v10.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 132f\n"
+ "ld1 { v10.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 135f\n"
+ "ld1 { v10.h }[6], [x20], #0x2\n"
+ "b 135f\n"
+ "132:" // Oddments: Load input (4, 4): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 135f\n"
+ "ld1 { v10.h }[4], [x20], #0x2\n"
+ "b 135f\n"
+ "133:" // Oddments: Load input (4, 4): Bit 2: Unset
+ "tbz %x[n_channels], #1, 134f\n"
+ "ld1 { v10.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 135f\n"
+ "ld1 { v10.h }[2], [x20], #0x2\n"
+ "b 135f\n"
+ "134:" // Oddments: Load input (4, 4): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v10.h }[0], [x20], #0x2\n"
+ "135:" // Oddments: Load input (4, 4): Bit 2: End
+ "fmla v26.8h, v8.8h, v10.8h\n"
+ "fmla v27.8h, v7.8h, v10.8h\n"
+ "fmax v16.8h, v16.8h, v13.8h\n"
+ "fmla v30.8h, v5.8h, v10.8h\n"
+ "fmla v31.8h, v4.8h, v10.8h\n"
+ "fmax v17.8h, v17.8h, v13.8h\n"
+ "fmax v18.8h, v18.8h, v13.8h\n"
+ "fmax v19.8h, v19.8h, v13.8h\n"
+ "fmax v20.8h, v20.8h, v13.8h\n"
+ "fmax v21.8h, v21.8h, v13.8h\n"
+ "fmax v22.8h, v22.8h, v13.8h\n"
+ "fmax v23.8h, v23.8h, v13.8h\n"
+ "fmax v24.8h, v24.8h, v13.8h\n"
+ "fmax v25.8h, v25.8h, v13.8h\n"
+ "fmax v26.8h, v26.8h, v13.8h\n"
+ "fmax v27.8h, v27.8h, v13.8h\n"
+ "fmax v28.8h, v28.8h, v13.8h\n"
+ "fmax v29.8h, v29.8h, v13.8h\n"
+ "fmax v30.8h, v30.8h, v13.8h\n"
+ "fmax v31.8h, v31.8h, v13.8h\n"
+ "fmin v16.8h, v16.8h, v14.8h\n"
+ "fmin v17.8h, v17.8h, v14.8h\n"
+ "fmin v18.8h, v18.8h, v14.8h\n"
+ "fmin v19.8h, v19.8h, v14.8h\n"
+ "fmin v20.8h, v20.8h, v14.8h\n"
+ "fmin v21.8h, v21.8h, v14.8h\n"
+ "fmin v22.8h, v22.8h, v14.8h\n"
+ "fmin v23.8h, v23.8h, v14.8h\n"
+ "fmin v24.8h, v24.8h, v14.8h\n"
+ "fmin v25.8h, v25.8h, v14.8h\n"
+ "fmin v26.8h, v26.8h, v14.8h\n"
+ "fmin v27.8h, v27.8h, v14.8h\n"
+ "fmin v28.8h, v28.8h, v14.8h\n"
+ "fmin v29.8h, v29.8h, v14.8h\n"
+ "fmin v30.8h, v30.8h, v14.8h\n"
+ "fmin v31.8h, v31.8h, v14.8h\n"
+ "tbz %x[n_channels], #2, 137f\n"
+ "ldr x23, [x8, #0x0]\n"
+ "ldr x22, [x8, #0x8]\n"
+ "add x23, x23, x14\n"
+ "add x22, x22, x14\n"
+ "ldr x21, [x8, #0x10]\n"
+ "ldr x20, [x8, #0x18]\n"
+ "add x21, x21, x14\n"
+ "add x20, x20, x14\n"
+ "st1 { v16.d }[0], [x23]\n"
+ "ldr x23, [x8, #0x20]\n"
+ "add x23, x23, x14\n"
+ "st1 { v17.d }[0], [x22]\n"
+ "ldr x22, [x8, #0x28]\n"
+ "add x22, x22, x14\n"
+ "st1 { v18.d }[0], [x21]\n"
+ "ldr x21, [x8, #0x30]\n"
+ "add x21, x21, x14\n"
+ "st1 { v19.d }[0], [x20]\n"
+ "ldr x20, [x8, #0x38]\n"
+ "add x20, x20, x14\n"
+ "st1 { v20.d }[0], [x23]\n"
+ "ldr x23, [x8, #0x40]\n"
+ "add x23, x23, x14\n"
+ "st1 { v21.d }[0], [x22]\n"
+ "ldr x22, [x8, #0x48]\n"
+ "add x22, x22, x14\n"
+ "st1 { v22.d }[0], [x21]\n"
+ "ldr x21, [x8, #0x50]\n"
+ "add x21, x21, x14\n"
+ "st1 { v23.d }[0], [x20]\n"
+ "ldr x20, [x8, #0x58]\n"
+ "add x20, x20, x14\n"
+ "st1 { v24.d }[0], [x23]\n"
+ "ldr x23, [x8, #0x60]\n"
+ "add x23, x23, x14\n"
+ "st1 { v25.d }[0], [x22]\n"
+ "ldr x22, [x8, #0x68]\n"
+ "add x22, x22, x14\n"
+ "st1 { v26.d }[0], [x21]\n"
+ "ldr x21, [x8, #0x70]\n"
+ "add x21, x21, x14\n"
+ "st1 { v27.d }[0], [x20]\n"
+ "ldr x20, [x8, #0x78]\n"
+ "add x20, x20, x14\n"
+ "add x14, x14, #0x8\n"
+ "st1 { v28.d }[0], [x23]\n"
+ "st1 { v29.d }[0], [x22]\n"
+ "st1 { v30.d }[0], [x21]\n"
+ "st1 { v31.d }[0], [x20]\n"
+ "tbz %x[n_channels], #1, 136f\n"
+ "ldr x23, [x8, #0x0]\n"
+ "ldr x22, [x8, #0x8]\n"
+ "add x23, x23, x14\n"
+ "add x22, x22, x14\n"
+ "ldr x21, [x8, #0x10]\n"
+ "ldr x20, [x8, #0x18]\n"
+ "add x21, x21, x14\n"
+ "add x20, x20, x14\n"
+ "st1 { v16.s }[2], [x23]\n"
+ "ldr x23, [x8, #0x20]\n"
+ "add x23, x23, x14\n"
+ "st1 { v17.s }[2], [x22]\n"
+ "ldr x22, [x8, #0x28]\n"
+ "add x22, x22, x14\n"
+ "st1 { v18.s }[2], [x21]\n"
+ "ldr x21, [x8, #0x30]\n"
+ "add x21, x21, x14\n"
+ "st1 { v19.s }[2], [x20]\n"
+ "ldr x20, [x8, #0x38]\n"
+ "add x20, x20, x14\n"
+ "st1 { v20.s }[2], [x23]\n"
+ "ldr x23, [x8, #0x40]\n"
+ "add x23, x23, x14\n"
+ "st1 { v21.s }[2], [x22]\n"
+ "ldr x22, [x8, #0x48]\n"
+ "add x22, x22, x14\n"
+ "st1 { v22.s }[2], [x21]\n"
+ "ldr x21, [x8, #0x50]\n"
+ "add x21, x21, x14\n"
+ "st1 { v23.s }[2], [x20]\n"
+ "ldr x20, [x8, #0x58]\n"
+ "add x20, x20, x14\n"
+ "st1 { v24.s }[2], [x23]\n"
+ "ldr x23, [x8, #0x60]\n"
+ "add x23, x23, x14\n"
+ "st1 { v25.s }[2], [x22]\n"
+ "ldr x22, [x8, #0x68]\n"
+ "add x22, x22, x14\n"
+ "st1 { v26.s }[2], [x21]\n"
+ "ldr x21, [x8, #0x70]\n"
+ "add x21, x21, x14\n"
+ "st1 { v27.s }[2], [x20]\n"
+ "ldr x20, [x8, #0x78]\n"
+ "add x20, x20, x14\n"
+ "add x14, x14, #0x4\n"
+ "st1 { v28.s }[2], [x23]\n"
+ "st1 { v29.s }[2], [x22]\n"
+ "st1 { v30.s }[2], [x21]\n"
+ "st1 { v31.s }[2], [x20]\n"
+ "tbz %x[n_channels], #0, 139f\n"
+ "ldr x23, [x8, #0x0]\n"
+ "ldr x22, [x8, #0x8]\n"
+ "add x23, x23, x14\n"
+ "add x22, x22, x14\n"
+ "ldr x21, [x8, #0x10]\n"
+ "ldr x20, [x8, #0x18]\n"
+ "add x21, x21, x14\n"
+ "add x20, x20, x14\n"
+ "st1 { v16.h }[6], [x23]\n"
+ "ldr x23, [x8, #0x20]\n"
+ "add x23, x23, x14\n"
+ "st1 { v17.h }[6], [x22]\n"
+ "ldr x22, [x8, #0x28]\n"
+ "add x22, x22, x14\n"
+ "st1 { v18.h }[6], [x21]\n"
+ "ldr x21, [x8, #0x30]\n"
+ "add x21, x21, x14\n"
+ "st1 { v19.h }[6], [x20]\n"
+ "ldr x20, [x8, #0x38]\n"
+ "add x20, x20, x14\n"
+ "st1 { v20.h }[6], [x23]\n"
+ "ldr x23, [x8, #0x40]\n"
+ "add x23, x23, x14\n"
+ "st1 { v21.h }[6], [x22]\n"
+ "ldr x22, [x8, #0x48]\n"
+ "add x22, x22, x14\n"
+ "st1 { v22.h }[6], [x21]\n"
+ "ldr x21, [x8, #0x50]\n"
+ "add x21, x21, x14\n"
+ "st1 { v23.h }[6], [x20]\n"
+ "ldr x20, [x8, #0x58]\n"
+ "add x20, x20, x14\n"
+ "st1 { v24.h }[6], [x23]\n"
+ "ldr x23, [x8, #0x60]\n"
+ "add x23, x23, x14\n"
+ "st1 { v25.h }[6], [x22]\n"
+ "ldr x22, [x8, #0x68]\n"
+ "add x22, x22, x14\n"
+ "st1 { v26.h }[6], [x21]\n"
+ "ldr x21, [x8, #0x70]\n"
+ "add x21, x21, x14\n"
+ "st1 { v27.h }[6], [x20]\n"
+ "ldr x20, [x8, #0x78]\n"
+ "add x20, x20, x14\n"
+ "st1 { v28.h }[6], [x23]\n"
+ "st1 { v29.h }[6], [x22]\n"
+ "st1 { v30.h }[6], [x21]\n"
+ "st1 { v31.h }[6], [x20]\n"
+ "b 139f\n"
+ "136:" // Oddments: Store: Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 139f\n"
+ "ldr x23, [x8, #0x0]\n"
+ "ldr x22, [x8, #0x8]\n"
+ "add x23, x23, x14\n"
+ "add x22, x22, x14\n"
+ "ldr x21, [x8, #0x10]\n"
+ "ldr x20, [x8, #0x18]\n"
+ "add x21, x21, x14\n"
+ "add x20, x20, x14\n"
+ "st1 { v16.h }[4], [x23]\n"
+ "ldr x23, [x8, #0x20]\n"
+ "add x23, x23, x14\n"
+ "st1 { v17.h }[4], [x22]\n"
+ "ldr x22, [x8, #0x28]\n"
+ "add x22, x22, x14\n"
+ "st1 { v18.h }[4], [x21]\n"
+ "ldr x21, [x8, #0x30]\n"
+ "add x21, x21, x14\n"
+ "st1 { v19.h }[4], [x20]\n"
+ "ldr x20, [x8, #0x38]\n"
+ "add x20, x20, x14\n"
+ "st1 { v20.h }[4], [x23]\n"
+ "ldr x23, [x8, #0x40]\n"
+ "add x23, x23, x14\n"
+ "st1 { v21.h }[4], [x22]\n"
+ "ldr x22, [x8, #0x48]\n"
+ "add x22, x22, x14\n"
+ "st1 { v22.h }[4], [x21]\n"
+ "ldr x21, [x8, #0x50]\n"
+ "add x21, x21, x14\n"
+ "st1 { v23.h }[4], [x20]\n"
+ "ldr x20, [x8, #0x58]\n"
+ "add x20, x20, x14\n"
+ "st1 { v24.h }[4], [x23]\n"
+ "ldr x23, [x8, #0x60]\n"
+ "add x23, x23, x14\n"
+ "st1 { v25.h }[4], [x22]\n"
+ "ldr x22, [x8, #0x68]\n"
+ "add x22, x22, x14\n"
+ "st1 { v26.h }[4], [x21]\n"
+ "ldr x21, [x8, #0x70]\n"
+ "add x21, x21, x14\n"
+ "st1 { v27.h }[4], [x20]\n"
+ "ldr x20, [x8, #0x78]\n"
+ "add x20, x20, x14\n"
+ "st1 { v28.h }[4], [x23]\n"
+ "st1 { v29.h }[4], [x22]\n"
+ "st1 { v30.h }[4], [x21]\n"
+ "st1 { v31.h }[4], [x20]\n"
+ "b 139f\n"
+ "137:" // Oddments: Store: Bit 2: Unset
+ "tbz %x[n_channels], #1, 138f\n"
+ "ldr x23, [x8, #0x0]\n"
+ "ldr x22, [x8, #0x8]\n"
+ "add x23, x23, x14\n"
+ "add x22, x22, x14\n"
+ "ldr x21, [x8, #0x10]\n"
+ "ldr x20, [x8, #0x18]\n"
+ "add x21, x21, x14\n"
+ "add x20, x20, x14\n"
+ "st1 { v16.s }[0], [x23]\n"
+ "ldr x23, [x8, #0x20]\n"
+ "add x23, x23, x14\n"
+ "st1 { v17.s }[0], [x22]\n"
+ "ldr x22, [x8, #0x28]\n"
+ "add x22, x22, x14\n"
+ "st1 { v18.s }[0], [x21]\n"
+ "ldr x21, [x8, #0x30]\n"
+ "add x21, x21, x14\n"
+ "st1 { v19.s }[0], [x20]\n"
+ "ldr x20, [x8, #0x38]\n"
+ "add x20, x20, x14\n"
+ "st1 { v20.s }[0], [x23]\n"
+ "ldr x23, [x8, #0x40]\n"
+ "add x23, x23, x14\n"
+ "st1 { v21.s }[0], [x22]\n"
+ "ldr x22, [x8, #0x48]\n"
+ "add x22, x22, x14\n"
+ "st1 { v22.s }[0], [x21]\n"
+ "ldr x21, [x8, #0x50]\n"
+ "add x21, x21, x14\n"
+ "st1 { v23.s }[0], [x20]\n"
+ "ldr x20, [x8, #0x58]\n"
+ "add x20, x20, x14\n"
+ "st1 { v24.s }[0], [x23]\n"
+ "ldr x23, [x8, #0x60]\n"
+ "add x23, x23, x14\n"
+ "st1 { v25.s }[0], [x22]\n"
+ "ldr x22, [x8, #0x68]\n"
+ "add x22, x22, x14\n"
+ "st1 { v26.s }[0], [x21]\n"
+ "ldr x21, [x8, #0x70]\n"
+ "add x21, x21, x14\n"
+ "st1 { v27.s }[0], [x20]\n"
+ "ldr x20, [x8, #0x78]\n"
+ "add x20, x20, x14\n"
+ "add x14, x14, #0x4\n"
+ "st1 { v28.s }[0], [x23]\n"
+ "st1 { v29.s }[0], [x22]\n"
+ "st1 { v30.s }[0], [x21]\n"
+ "st1 { v31.s }[0], [x20]\n"
+ "tbz %x[n_channels], #0, 139f\n"
+ "ldr x23, [x8, #0x0]\n"
+ "ldr x22, [x8, #0x8]\n"
+ "add x23, x23, x14\n"
+ "add x22, x22, x14\n"
+ "ldr x21, [x8, #0x10]\n"
+ "ldr x20, [x8, #0x18]\n"
+ "add x21, x21, x14\n"
+ "add x20, x20, x14\n"
+ "st1 { v16.h }[2], [x23]\n"
+ "ldr x23, [x8, #0x20]\n"
+ "add x23, x23, x14\n"
+ "st1 { v17.h }[2], [x22]\n"
+ "ldr x22, [x8, #0x28]\n"
+ "add x22, x22, x14\n"
+ "st1 { v18.h }[2], [x21]\n"
+ "ldr x21, [x8, #0x30]\n"
+ "add x21, x21, x14\n"
+ "st1 { v19.h }[2], [x20]\n"
+ "ldr x20, [x8, #0x38]\n"
+ "add x20, x20, x14\n"
+ "st1 { v20.h }[2], [x23]\n"
+ "ldr x23, [x8, #0x40]\n"
+ "add x23, x23, x14\n"
+ "st1 { v21.h }[2], [x22]\n"
+ "ldr x22, [x8, #0x48]\n"
+ "add x22, x22, x14\n"
+ "st1 { v22.h }[2], [x21]\n"
+ "ldr x21, [x8, #0x50]\n"
+ "add x21, x21, x14\n"
+ "st1 { v23.h }[2], [x20]\n"
+ "ldr x20, [x8, #0x58]\n"
+ "add x20, x20, x14\n"
+ "st1 { v24.h }[2], [x23]\n"
+ "ldr x23, [x8, #0x60]\n"
+ "add x23, x23, x14\n"
+ "st1 { v25.h }[2], [x22]\n"
+ "ldr x22, [x8, #0x68]\n"
+ "add x22, x22, x14\n"
+ "st1 { v26.h }[2], [x21]\n"
+ "ldr x21, [x8, #0x70]\n"
+ "add x21, x21, x14\n"
+ "st1 { v27.h }[2], [x20]\n"
+ "ldr x20, [x8, #0x78]\n"
+ "add x20, x20, x14\n"
+ "st1 { v28.h }[2], [x23]\n"
+ "st1 { v29.h }[2], [x22]\n"
+ "st1 { v30.h }[2], [x21]\n"
+ "st1 { v31.h }[2], [x20]\n"
+ "b 139f\n"
+ "138:" // Oddments: Store: Bit 2: Unset: Bit 1: Unset
+ "ldr x23, [x8, #0x0]\n"
+ "ldr x22, [x8, #0x8]\n"
+ "add x23, x23, x14\n"
+ "add x22, x22, x14\n"
+ "ldr x21, [x8, #0x10]\n"
+ "ldr x20, [x8, #0x18]\n"
+ "add x21, x21, x14\n"
+ "add x20, x20, x14\n"
+ "st1 { v16.h }[0], [x23]\n"
+ "ldr x23, [x8, #0x20]\n"
+ "add x23, x23, x14\n"
+ "st1 { v17.h }[0], [x22]\n"
+ "ldr x22, [x8, #0x28]\n"
+ "add x22, x22, x14\n"
+ "st1 { v18.h }[0], [x21]\n"
+ "ldr x21, [x8, #0x30]\n"
+ "add x21, x21, x14\n"
+ "st1 { v19.h }[0], [x20]\n"
+ "ldr x20, [x8, #0x38]\n"
+ "add x20, x20, x14\n"
+ "st1 { v20.h }[0], [x23]\n"
+ "ldr x23, [x8, #0x40]\n"
+ "add x23, x23, x14\n"
+ "st1 { v21.h }[0], [x22]\n"
+ "ldr x22, [x8, #0x48]\n"
+ "add x22, x22, x14\n"
+ "st1 { v22.h }[0], [x21]\n"
+ "ldr x21, [x8, #0x50]\n"
+ "add x21, x21, x14\n"
+ "st1 { v23.h }[0], [x20]\n"
+ "ldr x20, [x8, #0x58]\n"
+ "add x20, x20, x14\n"
+ "st1 { v24.h }[0], [x23]\n"
+ "ldr x23, [x8, #0x60]\n"
+ "add x23, x23, x14\n"
+ "st1 { v25.h }[0], [x22]\n"
+ "ldr x22, [x8, #0x68]\n"
+ "add x22, x22, x14\n"
+ "st1 { v26.h }[0], [x21]\n"
+ "ldr x21, [x8, #0x70]\n"
+ "add x21, x21, x14\n"
+ "st1 { v27.h }[0], [x20]\n"
+ "ldr x20, [x8, #0x78]\n"
+ "add x20, x20, x14\n"
+ "st1 { v28.h }[0], [x23]\n"
+ "st1 { v29.h }[0], [x22]\n"
+ "st1 { v30.h }[0], [x21]\n"
+ "st1 { v31.h }[0], [x20]\n"
+ "139:" // Oddments: Store: Bit 2: End
+ "140:" // End
+ :
+ : [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
new file mode 100644
index 0000000000..8ad6a37fea
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__aarch64__) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl(const __fp16 *const *const input_ptrs, __fp16 *const *const outptrs, const void *params, unsigned int n_channels, const __fp16 activation_min, const __fp16 activation_max);
+void a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl(const unsigned int n_tile_rows, const unsigned int n_tile_cols, const __fp16 *inptr, int64_t ld_input_row, int64_t ld_input_col, __fp16 *outptr, int64_t ld_output_row, int64_t ld_output_col, const void *params, unsigned int n_channels, const __fp16 activation_min, const __fp16 activation_max);
+
+// Strategy descriptor for the AArch64 FP16 NHWC 3x3 stride-2 depthwise
+// kernel producing a 2x2 output tile per invocation ("mla depthfirst").
+// It only wires the two generated kernel entry points (indirect/direct)
+// and the tile geometry into the generic DepthwiseDepthfirstStrategy
+// machinery; all computation lives in the *_impl functions.
+class a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy<__fp16, __fp16, __fp16, __fp16>
+{
+  private:
+  using Parent = DepthwiseDepthfirstStrategy<__fp16, __fp16, __fp16, __fp16>;
+  // Kernel taking an array of input-row pointers (used when padding /
+  // non-contiguous rows require indirection).
+  Parent::IndirectKernelType m_indirect_kernel = a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl;
+  // Kernel taking a base pointer plus row/column strides (dense interior tiles).
+  Parent::DirectKernelType m_direct_kernel = a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl;
+
+  public:
+  using return_type = __fp16;
+  // VLType::None: fixed-width (non-scalable) vector path — presumably the
+  // plain NEON variant as opposed to SVE; confirm against arm_gemm::VLType.
+  constexpr static auto vl_type = arm_gemm::VLType::None;
+
+  // 3x3 depthwise filter ...
+  constexpr static unsigned int kernel_rows = 3;
+  constexpr static unsigned int kernel_cols = 3;
+
+  // ... applied with stride 2 in both spatial dimensions ...
+  constexpr static unsigned int stride_rows = 2;
+  constexpr static unsigned int stride_cols = 2;
+
+  // ... yielding a 2x2 block of output points per kernel call.
+  constexpr static unsigned int output_rows = 2;
+  constexpr static unsigned int output_cols = 2;
+
+  // CPUInfo is unused here; the parameter exists to match the common
+  // strategy-constructor signature used by the implementation tables.
+  a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst(const CPUInfo *)
+  : Parent(output_rows, output_cols, kernel_rows, kernel_cols, stride_rows, stride_cols) {}
+
+  arm_gemm::VLType get_vl_type(void) const override { return vl_type; }
+
+  // Accessors consumed by the depthfirst driver to dispatch the right kernel.
+  Parent::IndirectKernelType get_indirect_kernel() const override { return m_indirect_kernel; }
+  Parent::DirectKernelType get_direct_kernel() const override { return m_direct_kernel; }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp
new file mode 100644
index 0000000000..8954999990
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp
@@ -0,0 +1,895 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__aarch64__) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl(
+ const unsigned int n_tile_rows,
+ const unsigned int n_tile_cols,
+ const __fp16 *inptr,
+ int64_t ld_input_row,
+ int64_t ld_input_col,
+ __fp16 *outptr,
+ int64_t ld_output_row,
+ int64_t ld_output_col,
+ const void *params,
+ unsigned int n_channels,
+ const __fp16 activation_min,
+ const __fp16 activation_max
+)
+{
+ struct Args
+ {
+ const uint64_t n_tile_rows, n_tile_cols;
+ const __fp16 *inptr;
+ const uint64_t ld_input_row;
+ const uint64_t ld_input_col;
+ __fp16 *outptr;
+ const uint64_t ld_output_row;
+ const uint64_t ld_output_col;
+ const void *params;
+ const __fp16 min, max;
+
+ uint64_t tile_i = 0, tile_j = 0;
+
+ Args(
+ const unsigned int n_tile_rows,
+ const unsigned int n_tile_cols,
+ const __fp16 *inptr,
+ int64_t ld_input_row,
+ int64_t ld_input_col,
+ __fp16 *outptr,
+ int64_t ld_output_row,
+ int64_t ld_output_col,
+ const void *params,
+ const float activation_min,
+ const float activation_max
+ ) : n_tile_rows(n_tile_rows), n_tile_cols(n_tile_cols), inptr(inptr),
+ ld_input_row(ld_input_row), ld_input_col(ld_input_col), outptr(outptr),
+ ld_output_row(ld_output_row), ld_output_col(ld_output_col),
+ params(params), min(activation_min), max(activation_max)
+ {
+ }
+ };
+
+ Args params_struct(
+ n_tile_rows, n_tile_cols,
+ inptr, ld_input_row, ld_input_col,
+ outptr, ld_output_row, ld_output_col,
+ params, activation_min, activation_max
+ );
+
+ __asm__ __volatile__(
+ "mov x23, #0x0\n"
+ "mov x27, #0x0\n"
+ "1:" // Tile loop
+ "str x23, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "mov x26, #0x4\n"
+ "mov x25, #0x2\n"
+ "str x27, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "ldr x24, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
+ "ldr x6, [%x[params_struct], %[offsetof_args_ld_input_col]]\n"
+ "mul x22, x23, x24\n" // offset = tile_i * ld_input_row
+ "ldr x21, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
+ "madd x22, x27, x6, x22\n" // offset += tile_j * ld_input_col
+ "ldr x7, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
+ "lsl x6, x6, #0x1\n"
+ "mul x20, x23, x21\n" // offset = tile_i * ld_output_row
+ "ldr x8, [%x[params_struct], %[offsetof_args_inptr]]\n"
+ "ldr x17, [%x[params_struct], %[offsetof_args_outptr]]\n"
+ "mov x23, #0x10\n" // cntb _, ALL, #1
+ "mul x22, x22, x26\n" // offset *= kernel_stride * output_size
+ "add x8, x8, x22, LSL #1\n" // inptr[0] += offset * sizeof(__fp16)
+ "add x16, x8, x24, LSL #1\n"
+ "ldr x15, [%x[params_struct], %[offsetof_args_params]]\n"
+ "madd x20, x27, x7, x20\n" // offset += tile_j * ld_output_col
+ "lsr x22, %x[n_channels], #0x3\n"
+ "add x14, x16, x24, LSL #1\n"
+ "mul x20, x20, x25\n" // offset *= output_tile_size
+ "add x13, x6, x6\n"
+ "add x12, x14, x24, LSL #1\n"
+ "add x11, x13, x6\n"
+ "add x17, x17, x20, LSL #1\n" // outptrs[0] += offset * sizeof(__fp16)
+ "add x20, %x[params_struct], %[offsetof_args_min]\n"
+ "ld1r { v26.8h }, [x20]\n"
+ "add x20, %x[params_struct], %[offsetof_args_max]\n"
+ "ld1r { v27.8h }, [x20]\n"
+ "add x10, x12, x24, LSL #1\n"
+ "add x9, x11, x6\n"
+ "add x28, x17, x21, LSL #1\n"
+ "lsl x7, x7, #0x1\n"
+ "mov x21, #0x0\n"
+ "sub x20, XZR, x23\n"
+ "cbz x22, 4f\n"
+ "ldr q31, [x15, #0x0]\n"
+ "ldr q0, [x15, #0x10]\n"
+ "cmp x23, x22, LSL #4\n"
+ "ldr q1, [x15, #0x20]\n"
+ "ldr q2, [x15, #0x30]\n"
+ "ldr q3, [x15, #0x40]\n"
+ "ldr q4, [x15, #0x50]\n"
+ "ldr q5, [x15, #0x60]\n"
+ "ldr q6, [x15, #0x70]\n"
+ "ldr q7, [x15, #0x80]\n"
+ "ldr q8, [x15, #0x90]\n"
+ "add x15, x15, #0xa0\n"
+ "ldr q9, [x14, x13]\n"
+ "ld1 { v10.8h }, [x8]\n"
+ "ldr q11, [x8, x6]\n"
+ "ldr q12, [x8, x11]\n"
+ "ldr q13, [x8, x9]\n"
+ "ld1 { v14.8h }, [x16]\n"
+ "ldr q15, [x16, x6]\n"
+ "ldr q16, [x8, x13]\n"
+ "bge 3f\n"
+ "2:" // Tile loop: Channel loop
+ "mov v29.16b, v31.16b\n fmla v29.8h, v8.8h, v9.8h\n"
+ "mov v28.16b, v31.16b\n fmla v28.8h, v6.8h, v9.8h\n"
+ "add x23, x23, #0x10\n"
+ "add x8, x8, #0x10\n"
+ "fmla v29.8h, v0.8h, v10.8h\n"
+ "ld1 { v10.8h }, [x8]\n"
+ "fmla v28.8h, v1.8h, v12.8h\n"
+ "ldr q21, [x16, x9]\n"
+ "fmla v29.8h, v1.8h, v11.8h\n"
+ "ldr q18, [x16, x11]\n"
+ "fmla v28.8h, v2.8h, v13.8h\n"
+ "ldr q17, [x16, x13]\n"
+ "fmla v29.8h, v3.8h, v14.8h\n"
+ "ld1 { v20.8h }, [x12]\n"
+ "fmla v28.8h, v0.8h, v16.8h\n"
+ "add x16, x16, #0x10\n"
+ "fmla v29.8h, v4.8h, v15.8h\n"
+ "ld1 { v25.8h }, [x14]\n"
+ "fmla v28.8h, v4.8h, v18.8h\n"
+ "ldr q19, [x12, x6]\n"
+ "fmla v29.8h, v2.8h, v16.8h\n"
+ "ldr q18, [x14, x6]\n"
+ "fmla v28.8h, v5.8h, v21.8h\n"
+ "ldr q24, [x14, x11]\n"
+ "mov v23.16b, v31.16b\n fmla v23.8h, v2.8h, v9.8h\n"
+ "mov v22.16b, v31.16b\n fmla v22.8h, v0.8h, v9.8h\n"
+ "ldr q31, [x15, #0x0]\n"
+ "cmp x23, x22, LSL #4\n"
+ "fmla v29.8h, v5.8h, v17.8h\n"
+ "fmla v28.8h, v3.8h, v17.8h\n"
+ "ldr q17, [x12, x11]\n"
+ "add x20, x20, #0x10\n"
+ "fmla v23.8h, v3.8h, v20.8h\n"
+ "ldr q16, [x12, x9]\n"
+ "fmla v22.8h, v4.8h, v17.8h\n"
+ "ldr q21, [x10, x6]\n"
+ "fmla v23.8h, v0.8h, v25.8h\n"
+ "ldr q0, [x15, #0x10]\n"
+ "fmla v22.8h, v1.8h, v24.8h\n"
+ "add x21, x21, #0x10\n"
+ "fmla v23.8h, v4.8h, v19.8h\n"
+ "ldr q20, [x14, x9]\n"
+ "ldr q4, [x15, #0x50]\n"
+ "fmla v22.8h, v5.8h, v16.8h\n"
+ "ldr q19, [x10, x11]\n"
+ "fmla v29.8h, v6.8h, v25.8h\n"
+ "ld1 { v17.8h }, [x10]\n"
+ "fmla v23.8h, v1.8h, v18.8h\n"
+ "ldr q1, [x15, #0x20]\n"
+ "fmla v22.8h, v2.8h, v20.8h\n"
+ "ldr q2, [x15, #0x30]\n"
+ "fmla v29.8h, v7.8h, v18.8h\n"
+ "ldr q16, [x12, x13]\n"
+ "fmla v23.8h, v6.8h, v17.8h\n"
+ "ldr q18, [x10, x13]\n"
+ "fmla v22.8h, v3.8h, v16.8h\n"
+ "ldr q3, [x15, #0x40]\n"
+ "fmla v23.8h, v7.8h, v21.8h\n"
+ "ldr q13, [x8, x9]\n"
+ "fmla v22.8h, v7.8h, v19.8h\n"
+ "ld1 { v14.8h }, [x16]\n"
+ "fmla v28.8h, v7.8h, v24.8h\n"
+ "ldr q12, [x8, x11]\n"
+ "fmla v23.8h, v5.8h, v16.8h\n"
+ "ldr q16, [x8, x13]\n"
+ "ldr q5, [x15, #0x60]\n"
+ "fmla v22.8h, v6.8h, v18.8h\n"
+ "fmla v28.8h, v8.8h, v20.8h\n"
+ "ldr q17, [x10, x9]\n"
+ "ldr q6, [x15, #0x70]\n"
+ "fmla v23.8h, v8.8h, v18.8h\n"
+ "fmla v22.8h, v8.8h, v17.8h\n"
+ "ldr q11, [x8, x6]\n"
+ "ldr q15, [x16, x6]\n"
+ "fmax v29.8h, v29.8h, v26.8h\n"
+ "fmax v28.8h, v28.8h, v26.8h\n"
+ "ldr q7, [x15, #0x80]\n"
+ "ldr q8, [x15, #0x90]\n"
+ "fmax v23.8h, v23.8h, v26.8h\n"
+ "fmax v22.8h, v22.8h, v26.8h\n"
+ "add x14, x14, #0x10\n"
+ "ldr q9, [x14, x13]\n"
+ "fmin v29.8h, v29.8h, v27.8h\n"
+ "fmin v28.8h, v28.8h, v27.8h\n"
+ "fmin v23.8h, v23.8h, v27.8h\n"
+ "fmin v22.8h, v22.8h, v27.8h\n"
+ "add x12, x12, #0x10\n"
+ "add x10, x10, #0x10\n"
+ "st1 { v29.8h }, [x17]\n"
+ "add x15, x15, #0xa0\n"
+ "str q28, [x17, x7]\n"
+ "add x17, x17, #0x10\n"
+ "st1 { v23.8h }, [x28]\n"
+ "str q22, [x28, x7]\n"
+ "add x28, x28, #0x10\n"
+ "blt 2b\n"
+ "3:" // Tile loop: Channel tail
+ "mov v29.16b, v31.16b\n fmla v29.8h, v8.8h, v9.8h\n"
+ "mov v28.16b, v31.16b\n fmla v28.8h, v6.8h, v9.8h\n"
+ "add x8, x8, #0x10\n"
+ "fmla v29.8h, v0.8h, v10.8h\n"
+ "fmla v28.8h, v1.8h, v12.8h\n"
+ "ldr q20, [x16, x9]\n"
+ "fmla v29.8h, v1.8h, v11.8h\n"
+ "ldr q18, [x16, x11]\n"
+ "fmla v28.8h, v2.8h, v13.8h\n"
+ "ldr q17, [x16, x13]\n"
+ "fmla v29.8h, v3.8h, v14.8h\n"
+ "ld1 { v19.8h }, [x12]\n"
+ "fmla v28.8h, v0.8h, v16.8h\n"
+ "add x16, x16, #0x10\n"
+ "fmla v29.8h, v4.8h, v15.8h\n"
+ "ld1 { v25.8h }, [x14]\n"
+ "fmla v28.8h, v4.8h, v18.8h\n"
+ "ldr q18, [x12, x6]\n"
+ "fmla v29.8h, v2.8h, v16.8h\n"
+ "ldr q24, [x14, x6]\n"
+ "fmla v28.8h, v5.8h, v20.8h\n"
+ "ldr q23, [x14, x11]\n"
+ "mov v22.16b, v31.16b\n fmla v22.8h, v2.8h, v9.8h\n"
+ "mov v21.16b, v31.16b\n fmla v21.8h, v0.8h, v9.8h\n"
+ "fmla v29.8h, v5.8h, v17.8h\n"
+ "fmla v28.8h, v3.8h, v17.8h\n"
+ "ldr q17, [x12, x11]\n"
+ "fmla v22.8h, v3.8h, v19.8h\n"
+ "ldr q16, [x12, x9]\n"
+ "fmla v21.8h, v4.8h, v17.8h\n"
+ "ldr q20, [x10, x6]\n"
+ "fmla v22.8h, v0.8h, v25.8h\n"
+ "fmla v21.8h, v1.8h, v23.8h\n"
+ "fmla v22.8h, v4.8h, v18.8h\n"
+ "ldr q19, [x14, x9]\n"
+ "fmla v21.8h, v5.8h, v16.8h\n"
+ "ldr q18, [x10, x11]\n"
+ "fmla v29.8h, v6.8h, v25.8h\n"
+ "ld1 { v17.8h }, [x10]\n"
+ "fmla v22.8h, v1.8h, v24.8h\n"
+ "add x14, x14, #0x10\n"
+ "fmla v21.8h, v2.8h, v19.8h\n"
+ "fmla v29.8h, v7.8h, v24.8h\n"
+ "ldr q16, [x12, x13]\n"
+ "fmax v29.8h, v29.8h, v26.8h\n"
+ "fmla v22.8h, v6.8h, v17.8h\n"
+ "ldr q17, [x10, x13]\n"
+ "fmla v21.8h, v3.8h, v16.8h\n"
+ "fmin v29.8h, v29.8h, v27.8h\n"
+ "fmla v22.8h, v7.8h, v20.8h\n"
+ "fmla v21.8h, v7.8h, v18.8h\n"
+ "st1 { v29.8h }, [x17]\n"
+ "add x12, x12, #0x10\n"
+ "fmla v28.8h, v7.8h, v23.8h\n"
+ "fmla v22.8h, v5.8h, v16.8h\n"
+ "fmla v21.8h, v6.8h, v17.8h\n"
+ "fmla v28.8h, v8.8h, v19.8h\n"
+ "ldr q16, [x10, x9]\n"
+ "fmax v28.8h, v28.8h, v26.8h\n"
+ "fmla v22.8h, v8.8h, v17.8h\n"
+ "fmla v21.8h, v8.8h, v16.8h\n"
+ "fmax v22.8h, v22.8h, v26.8h\n"
+ "add x10, x10, #0x10\n"
+ "fmax v21.8h, v21.8h, v26.8h\n"
+ "fmin v28.8h, v28.8h, v27.8h\n"
+ "str q28, [x17, x7]\n"
+ "add x17, x17, #0x10\n"
+ "fmin v22.8h, v22.8h, v27.8h\n"
+ "fmin v21.8h, v21.8h, v27.8h\n"
+ "st1 { v22.8h }, [x28]\n"
+ "str q21, [x28, x7]\n"
+ "add x28, x28, #0x10\n"
+ "4:" // Tile loop: Oddments
+ "tst %x[n_channels], #0x7\n"
+ "beq 81f\n"
+ "ldr q31, [x15, #0x0]\n"
+ "ldr q0, [x15, #0x10]\n"
+ "add x27, x14, x13\n"
+ "add x26, x8, XZR\n"
+ "ldr q1, [x15, #0x20]\n"
+ "ldr q2, [x15, #0x30]\n"
+ "add x25, x8, x6\n"
+ "add x24, x8, x11\n"
+ "ldr q3, [x15, #0x40]\n"
+ "ldr q4, [x15, #0x50]\n"
+ "add x23, x8, x9\n"
+ "add x22, x16, XZR\n"
+ "ldr q5, [x15, #0x60]\n"
+ "ldr q6, [x15, #0x70]\n"
+ "add x21, x16, x6\n"
+ "add x20, x8, x13\n"
+ "ldr q7, [x15, #0x80]\n"
+ "ldr q8, [x15, #0x90]\n"
+ "tbz %x[n_channels], #2, 6f\n"
+ "ldr d9, [x27], #0x8\n"
+ "ldr d10, [x26], #0x8\n"
+ "ldr d11, [x25], #0x8\n"
+ "ldr d12, [x24], #0x8\n"
+ "ldr d13, [x23], #0x8\n"
+ "ldr d14, [x22], #0x8\n"
+ "ldr d15, [x21], #0x8\n"
+ "ldr d16, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 5f\n"
+ "ld1 { v9.s }[2], [x27], #0x4\n"
+ "ld1 { v10.s }[2], [x26], #0x4\n"
+ "ld1 { v11.s }[2], [x25], #0x4\n"
+ "ld1 { v12.s }[2], [x24], #0x4\n"
+ "ld1 { v13.s }[2], [x23], #0x4\n"
+ "ld1 { v14.s }[2], [x22], #0x4\n"
+ "ld1 { v15.s }[2], [x21], #0x4\n"
+ "ld1 { v16.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 8f\n"
+ "ld1 { v9.h }[6], [x27]\n"
+ "ld1 { v10.h }[6], [x26]\n"
+ "ld1 { v11.h }[6], [x25]\n"
+ "ld1 { v12.h }[6], [x24]\n"
+ "ld1 { v13.h }[6], [x23]\n"
+ "ld1 { v14.h }[6], [x22]\n"
+ "ld1 { v15.h }[6], [x21]\n"
+ "ld1 { v16.h }[6], [x20]\n"
+ "b 8f\n"
+ "5:" // Tile loop: Oddments: Load inputs: (2, 2), (0, 0), (0, 1), (0, 3), (0, 4), (1, 0), (1, 1), (0, 2): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 8f\n"
+ "ld1 { v9.h }[4], [x27]\n"
+ "ld1 { v10.h }[4], [x26]\n"
+ "ld1 { v11.h }[4], [x25]\n"
+ "ld1 { v12.h }[4], [x24]\n"
+ "ld1 { v13.h }[4], [x23]\n"
+ "ld1 { v14.h }[4], [x22]\n"
+ "ld1 { v15.h }[4], [x21]\n"
+ "ld1 { v16.h }[4], [x20]\n"
+ "b 8f\n"
+ "6:" // Tile loop: Oddments: Load inputs: (2, 2), (0, 0), (0, 1), (0, 3), (0, 4), (1, 0), (1, 1), (0, 2): Bit 2: Unset
+ "tbz %x[n_channels], #1, 7f\n"
+ "ldr s9, [x27], #0x4\n"
+ "ldr s10, [x26], #0x4\n"
+ "ldr s11, [x25], #0x4\n"
+ "ldr s12, [x24], #0x4\n"
+ "ldr s13, [x23], #0x4\n"
+ "ldr s14, [x22], #0x4\n"
+ "ldr s15, [x21], #0x4\n"
+ "ldr s16, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 8f\n"
+ "ld1 { v9.h }[2], [x27]\n"
+ "ld1 { v10.h }[2], [x26]\n"
+ "ld1 { v11.h }[2], [x25]\n"
+ "ld1 { v12.h }[2], [x24]\n"
+ "ld1 { v13.h }[2], [x23]\n"
+ "ld1 { v14.h }[2], [x22]\n"
+ "ld1 { v15.h }[2], [x21]\n"
+ "ld1 { v16.h }[2], [x20]\n"
+ "b 8f\n"
+ "7:" // Tile loop: Oddments: Load inputs: (2, 2), (0, 0), (0, 1), (0, 3), (0, 4), (1, 0), (1, 1), (0, 2): Bit 2: Unset: Bit 1: Unset
+ "ldr h9, [x27, #0x0]\n"
+ "ldr h10, [x26, #0x0]\n"
+ "ldr h11, [x25, #0x0]\n"
+ "ldr h12, [x24, #0x0]\n"
+ "ldr h13, [x23, #0x0]\n"
+ "ldr h14, [x22, #0x0]\n"
+ "ldr h15, [x21, #0x0]\n"
+ "ldr h16, [x20, #0x0]\n"
+ "8:" // Tile loop: Oddments: Load inputs: (2, 2), (0, 0), (0, 1), (0, 3), (0, 4), (1, 0), (1, 1), (0, 2): Bit 2: End
+ "mov v28.16b, v31.16b\n fmla v28.8h, v8.8h, v9.8h\n"
+ "fmla v28.8h, v0.8h, v10.8h\n"
+ "add x20, x16, x11\n"
+ "mov v29.16b, v31.16b\n fmla v29.8h, v6.8h, v9.8h\n"
+ "fmla v28.8h, v1.8h, v11.8h\n"
+ "fmla v29.8h, v1.8h, v12.8h\n"
+ "fmla v28.8h, v3.8h, v14.8h\n"
+ "fmla v29.8h, v2.8h, v13.8h\n"
+ "fmla v28.8h, v4.8h, v15.8h\n"
+ "mov v30.16b, v31.16b\n fmla v30.8h, v2.8h, v9.8h\n"
+ "fmla v31.8h, v0.8h, v9.8h\n"
+ "fmla v28.8h, v2.8h, v16.8h\n"
+ "fmla v29.8h, v0.8h, v16.8h\n"
+ "tbz %x[n_channels], #2, 10f\n"
+ "ldr d11, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 9f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 12f\n"
+ "ld1 { v11.h }[6], [x20]\n"
+ "b 12f\n"
+ "9:" // Tile loop: Oddments: Load inputs: (1, 3): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 12f\n"
+ "ld1 { v11.h }[4], [x20]\n"
+ "b 12f\n"
+ "10:" // Tile loop: Oddments: Load inputs: (1, 3): Bit 2: Unset
+ "tbz %x[n_channels], #1, 11f\n"
+ "ldr s11, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 12f\n"
+ "ld1 { v11.h }[2], [x20]\n"
+ "b 12f\n"
+ "11:" // Tile loop: Oddments: Load inputs: (1, 3): Bit 2: Unset: Bit 1: Unset
+ "ldr h11, [x20, #0x0]\n"
+ "12:" // Tile loop: Oddments: Load inputs: (1, 3): Bit 2: End
+ "fmla v29.8h, v4.8h, v11.8h\n"
+ "add x20, x16, x9\n"
+ "tbz %x[n_channels], #2, 14f\n"
+ "ldr d12, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 13f\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 16f\n"
+ "ld1 { v12.h }[6], [x20]\n"
+ "b 16f\n"
+ "13:" // Tile loop: Oddments: Load inputs: (1, 4): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 16f\n"
+ "ld1 { v12.h }[4], [x20]\n"
+ "b 16f\n"
+ "14:" // Tile loop: Oddments: Load inputs: (1, 4): Bit 2: Unset
+ "tbz %x[n_channels], #1, 15f\n"
+ "ldr s12, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 16f\n"
+ "ld1 { v12.h }[2], [x20]\n"
+ "b 16f\n"
+ "15:" // Tile loop: Oddments: Load inputs: (1, 4): Bit 2: Unset: Bit 1: Unset
+ "ldr h12, [x20, #0x0]\n"
+ "16:" // Tile loop: Oddments: Load inputs: (1, 4): Bit 2: End
+ "fmla v29.8h, v5.8h, v12.8h\n"
+ "add x20, x16, x13\n"
+ "tbz %x[n_channels], #2, 18f\n"
+ "ldr d13, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 17f\n"
+ "ld1 { v13.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 20f\n"
+ "ld1 { v13.h }[6], [x20]\n"
+ "b 20f\n"
+ "17:" // Tile loop: Oddments: Load inputs: (1, 2): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 20f\n"
+ "ld1 { v13.h }[4], [x20]\n"
+ "b 20f\n"
+ "18:" // Tile loop: Oddments: Load inputs: (1, 2): Bit 2: Unset
+ "tbz %x[n_channels], #1, 19f\n"
+ "ldr s13, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 20f\n"
+ "ld1 { v13.h }[2], [x20]\n"
+ "b 20f\n"
+ "19:" // Tile loop: Oddments: Load inputs: (1, 2): Bit 2: Unset: Bit 1: Unset
+ "ldr h13, [x20, #0x0]\n"
+ "20:" // Tile loop: Oddments: Load inputs: (1, 2): Bit 2: End
+ "fmla v28.8h, v5.8h, v13.8h\n"
+ "fmla v29.8h, v3.8h, v13.8h\n"
+ "add x20, x12, XZR\n"
+ "tbz %x[n_channels], #2, 22f\n"
+ "ldr d14, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 21f\n"
+ "ld1 { v14.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 24f\n"
+ "ld1 { v14.h }[6], [x20]\n"
+ "b 24f\n"
+ "21:" // Tile loop: Oddments: Load inputs: (3, 0): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 24f\n"
+ "ld1 { v14.h }[4], [x20]\n"
+ "b 24f\n"
+ "22:" // Tile loop: Oddments: Load inputs: (3, 0): Bit 2: Unset
+ "tbz %x[n_channels], #1, 23f\n"
+ "ldr s14, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 24f\n"
+ "ld1 { v14.h }[2], [x20]\n"
+ "b 24f\n"
+ "23:" // Tile loop: Oddments: Load inputs: (3, 0): Bit 2: Unset: Bit 1: Unset
+ "ldr h14, [x20, #0x0]\n"
+ "24:" // Tile loop: Oddments: Load inputs: (3, 0): Bit 2: End
+ "fmla v30.8h, v3.8h, v14.8h\n"
+ "add x20, x14, XZR\n"
+ "tbz %x[n_channels], #2, 26f\n"
+ "ldr d15, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 25f\n"
+ "ld1 { v15.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 28f\n"
+ "ld1 { v15.h }[6], [x20]\n"
+ "b 28f\n"
+ "25:" // Tile loop: Oddments: Load inputs: (2, 0): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 28f\n"
+ "ld1 { v15.h }[4], [x20]\n"
+ "b 28f\n"
+ "26:" // Tile loop: Oddments: Load inputs: (2, 0): Bit 2: Unset
+ "tbz %x[n_channels], #1, 27f\n"
+ "ldr s15, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 28f\n"
+ "ld1 { v15.h }[2], [x20]\n"
+ "b 28f\n"
+ "27:" // Tile loop: Oddments: Load inputs: (2, 0): Bit 2: Unset: Bit 1: Unset
+ "ldr h15, [x20, #0x0]\n"
+ "28:" // Tile loop: Oddments: Load inputs: (2, 0): Bit 2: End
+ "fmla v28.8h, v6.8h, v15.8h\n"
+ "fmla v30.8h, v0.8h, v15.8h\n"
+ "add x20, x12, x6\n"
+ "tbz %x[n_channels], #2, 30f\n"
+ "ldr d11, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 29f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 32f\n"
+ "ld1 { v11.h }[6], [x20]\n"
+ "b 32f\n"
+ "29:" // Tile loop: Oddments: Load inputs: (3, 1): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 32f\n"
+ "ld1 { v11.h }[4], [x20]\n"
+ "b 32f\n"
+ "30:" // Tile loop: Oddments: Load inputs: (3, 1): Bit 2: Unset
+ "tbz %x[n_channels], #1, 31f\n"
+ "ldr s11, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 32f\n"
+ "ld1 { v11.h }[2], [x20]\n"
+ "b 32f\n"
+ "31:" // Tile loop: Oddments: Load inputs: (3, 1): Bit 2: Unset: Bit 1: Unset
+ "ldr h11, [x20, #0x0]\n"
+ "32:" // Tile loop: Oddments: Load inputs: (3, 1): Bit 2: End
+ "fmla v30.8h, v4.8h, v11.8h\n"
+ "add x20, x14, x6\n"
+ "tbz %x[n_channels], #2, 34f\n"
+ "ldr d16, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 33f\n"
+ "ld1 { v16.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 36f\n"
+ "ld1 { v16.h }[6], [x20]\n"
+ "b 36f\n"
+ "33:" // Tile loop: Oddments: Load inputs: (2, 1): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 36f\n"
+ "ld1 { v16.h }[4], [x20]\n"
+ "b 36f\n"
+ "34:" // Tile loop: Oddments: Load inputs: (2, 1): Bit 2: Unset
+ "tbz %x[n_channels], #1, 35f\n"
+ "ldr s16, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 36f\n"
+ "ld1 { v16.h }[2], [x20]\n"
+ "b 36f\n"
+ "35:" // Tile loop: Oddments: Load inputs: (2, 1): Bit 2: Unset: Bit 1: Unset
+ "ldr h16, [x20, #0x0]\n"
+ "36:" // Tile loop: Oddments: Load inputs: (2, 1): Bit 2: End
+ "fmla v28.8h, v7.8h, v16.8h\n"
+ "fmla v30.8h, v1.8h, v16.8h\n"
+ "add x20, x12, x11\n"
+ "tbz %x[n_channels], #2, 38f\n"
+ "ldr d13, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 37f\n"
+ "ld1 { v13.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 40f\n"
+ "ld1 { v13.h }[6], [x20]\n"
+ "b 40f\n"
+ "37:" // Tile loop: Oddments: Load inputs: (3, 3): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 40f\n"
+ "ld1 { v13.h }[4], [x20]\n"
+ "b 40f\n"
+ "38:" // Tile loop: Oddments: Load inputs: (3, 3): Bit 2: Unset
+ "tbz %x[n_channels], #1, 39f\n"
+ "ldr s13, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 40f\n"
+ "ld1 { v13.h }[2], [x20]\n"
+ "b 40f\n"
+ "39:" // Tile loop: Oddments: Load inputs: (3, 3): Bit 2: Unset: Bit 1: Unset
+ "ldr h13, [x20, #0x0]\n"
+ "40:" // Tile loop: Oddments: Load inputs: (3, 3): Bit 2: End
+ "fmla v31.8h, v4.8h, v13.8h\n"
+ "add x20, x14, x11\n"
+ "tbz %x[n_channels], #2, 42f\n"
+ "ldr d12, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 41f\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 44f\n"
+ "ld1 { v12.h }[6], [x20]\n"
+ "b 44f\n"
+ "41:" // Tile loop: Oddments: Load inputs: (2, 3): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 44f\n"
+ "ld1 { v12.h }[4], [x20]\n"
+ "b 44f\n"
+ "42:" // Tile loop: Oddments: Load inputs: (2, 3): Bit 2: Unset
+ "tbz %x[n_channels], #1, 43f\n"
+ "ldr s12, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 44f\n"
+ "ld1 { v12.h }[2], [x20]\n"
+ "b 44f\n"
+ "43:" // Tile loop: Oddments: Load inputs: (2, 3): Bit 2: Unset: Bit 1: Unset
+ "ldr h12, [x20, #0x0]\n"
+ "44:" // Tile loop: Oddments: Load inputs: (2, 3): Bit 2: End
+ "fmla v29.8h, v7.8h, v12.8h\n"
+ "fmla v31.8h, v1.8h, v12.8h\n"
+ "add x20, x12, x9\n"
+ "tbz %x[n_channels], #2, 46f\n"
+ "ldr d14, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 45f\n"
+ "ld1 { v14.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 48f\n"
+ "ld1 { v14.h }[6], [x20]\n"
+ "b 48f\n"
+ "45:" // Tile loop: Oddments: Load inputs: (3, 4): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 48f\n"
+ "ld1 { v14.h }[4], [x20]\n"
+ "b 48f\n"
+ "46:" // Tile loop: Oddments: Load inputs: (3, 4): Bit 2: Unset
+ "tbz %x[n_channels], #1, 47f\n"
+ "ldr s14, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 48f\n"
+ "ld1 { v14.h }[2], [x20]\n"
+ "b 48f\n"
+ "47:" // Tile loop: Oddments: Load inputs: (3, 4): Bit 2: Unset: Bit 1: Unset
+ "ldr h14, [x20, #0x0]\n"
+ "48:" // Tile loop: Oddments: Load inputs: (3, 4): Bit 2: End
+ "fmla v31.8h, v5.8h, v14.8h\n"
+ "add x20, x10, XZR\n"
+ "tbz %x[n_channels], #2, 50f\n"
+ "ldr d15, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 49f\n"
+ "ld1 { v15.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 52f\n"
+ "ld1 { v15.h }[6], [x20]\n"
+ "b 52f\n"
+ "49:" // Tile loop: Oddments: Load inputs: (4, 0): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 52f\n"
+ "ld1 { v15.h }[4], [x20]\n"
+ "b 52f\n"
+ "50:" // Tile loop: Oddments: Load inputs: (4, 0): Bit 2: Unset
+ "tbz %x[n_channels], #1, 51f\n"
+ "ldr s15, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 52f\n"
+ "ld1 { v15.h }[2], [x20]\n"
+ "b 52f\n"
+ "51:" // Tile loop: Oddments: Load inputs: (4, 0): Bit 2: Unset: Bit 1: Unset
+ "ldr h15, [x20, #0x0]\n"
+ "52:" // Tile loop: Oddments: Load inputs: (4, 0): Bit 2: End
+ "fmla v30.8h, v6.8h, v15.8h\n"
+ "add x20, x14, x9\n"
+ "tbz %x[n_channels], #2, 54f\n"
+ "ldr d11, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 53f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 56f\n"
+ "ld1 { v11.h }[6], [x20]\n"
+ "b 56f\n"
+ "53:" // Tile loop: Oddments: Load inputs: (2, 4): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 56f\n"
+ "ld1 { v11.h }[4], [x20]\n"
+ "b 56f\n"
+ "54:" // Tile loop: Oddments: Load inputs: (2, 4): Bit 2: Unset
+ "tbz %x[n_channels], #1, 55f\n"
+ "ldr s11, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 56f\n"
+ "ld1 { v11.h }[2], [x20]\n"
+ "b 56f\n"
+ "55:" // Tile loop: Oddments: Load inputs: (2, 4): Bit 2: Unset: Bit 1: Unset
+ "ldr h11, [x20, #0x0]\n"
+ "56:" // Tile loop: Oddments: Load inputs: (2, 4): Bit 2: End
+ "fmla v29.8h, v8.8h, v11.8h\n"
+ "fmla v31.8h, v2.8h, v11.8h\n"
+ "add x20, x10, x6\n"
+ "tbz %x[n_channels], #2, 58f\n"
+ "ldr d13, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 57f\n"
+ "ld1 { v13.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 60f\n"
+ "ld1 { v13.h }[6], [x20]\n"
+ "b 60f\n"
+ "57:" // Tile loop: Oddments: Load inputs: (4, 1): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 60f\n"
+ "ld1 { v13.h }[4], [x20]\n"
+ "b 60f\n"
+ "58:" // Tile loop: Oddments: Load inputs: (4, 1): Bit 2: Unset
+ "tbz %x[n_channels], #1, 59f\n"
+ "ldr s13, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 60f\n"
+ "ld1 { v13.h }[2], [x20]\n"
+ "b 60f\n"
+ "59:" // Tile loop: Oddments: Load inputs: (4, 1): Bit 2: Unset: Bit 1: Unset
+ "ldr h13, [x20, #0x0]\n"
+ "60:" // Tile loop: Oddments: Load inputs: (4, 1): Bit 2: End
+ "fmla v30.8h, v7.8h, v13.8h\n"
+ "add x20, x12, x13\n"
+ "tbz %x[n_channels], #2, 62f\n"
+ "ldr d16, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 61f\n"
+ "ld1 { v16.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 64f\n"
+ "ld1 { v16.h }[6], [x20]\n"
+ "b 64f\n"
+ "61:" // Tile loop: Oddments: Load inputs: (3, 2): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 64f\n"
+ "ld1 { v16.h }[4], [x20]\n"
+ "b 64f\n"
+ "62:" // Tile loop: Oddments: Load inputs: (3, 2): Bit 2: Unset
+ "tbz %x[n_channels], #1, 63f\n"
+ "ldr s16, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 64f\n"
+ "ld1 { v16.h }[2], [x20]\n"
+ "b 64f\n"
+ "63:" // Tile loop: Oddments: Load inputs: (3, 2): Bit 2: Unset: Bit 1: Unset
+ "ldr h16, [x20, #0x0]\n"
+ "64:" // Tile loop: Oddments: Load inputs: (3, 2): Bit 2: End
+ "fmla v30.8h, v5.8h, v16.8h\n"
+ "fmla v31.8h, v3.8h, v16.8h\n"
+ "add x20, x10, x11\n"
+ "tbz %x[n_channels], #2, 66f\n"
+ "ldr d14, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 65f\n"
+ "ld1 { v14.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 68f\n"
+ "ld1 { v14.h }[6], [x20]\n"
+ "b 68f\n"
+ "65:" // Tile loop: Oddments: Load inputs: (4, 3): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 68f\n"
+ "ld1 { v14.h }[4], [x20]\n"
+ "b 68f\n"
+ "66:" // Tile loop: Oddments: Load inputs: (4, 3): Bit 2: Unset
+ "tbz %x[n_channels], #1, 67f\n"
+ "ldr s14, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 68f\n"
+ "ld1 { v14.h }[2], [x20]\n"
+ "b 68f\n"
+ "67:" // Tile loop: Oddments: Load inputs: (4, 3): Bit 2: Unset: Bit 1: Unset
+ "ldr h14, [x20, #0x0]\n"
+ "68:" // Tile loop: Oddments: Load inputs: (4, 3): Bit 2: End
+ "fmla v31.8h, v7.8h, v14.8h\n"
+ "add x20, x10, x13\n"
+ "tbz %x[n_channels], #2, 70f\n"
+ "ldr d15, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 69f\n"
+ "ld1 { v15.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 72f\n"
+ "ld1 { v15.h }[6], [x20]\n"
+ "b 72f\n"
+ "69:" // Tile loop: Oddments: Load inputs: (4, 2): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 72f\n"
+ "ld1 { v15.h }[4], [x20]\n"
+ "b 72f\n"
+ "70:" // Tile loop: Oddments: Load inputs: (4, 2): Bit 2: Unset
+ "tbz %x[n_channels], #1, 71f\n"
+ "ldr s15, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 72f\n"
+ "ld1 { v15.h }[2], [x20]\n"
+ "b 72f\n"
+ "71:" // Tile loop: Oddments: Load inputs: (4, 2): Bit 2: Unset: Bit 1: Unset
+ "ldr h15, [x20, #0x0]\n"
+ "72:" // Tile loop: Oddments: Load inputs: (4, 2): Bit 2: End
+ "fmla v30.8h, v8.8h, v15.8h\n"
+ "fmla v31.8h, v6.8h, v15.8h\n"
+ "add x20, x10, x9\n"
+ "tbz %x[n_channels], #2, 74f\n"
+ "ldr d11, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 73f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 76f\n"
+ "ld1 { v11.h }[6], [x20]\n"
+ "b 76f\n"
+ "73:" // Tile loop: Oddments: Load inputs: (4, 4): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 76f\n"
+ "ld1 { v11.h }[4], [x20]\n"
+ "b 76f\n"
+ "74:" // Tile loop: Oddments: Load inputs: (4, 4): Bit 2: Unset
+ "tbz %x[n_channels], #1, 75f\n"
+ "ldr s11, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 76f\n"
+ "ld1 { v11.h }[2], [x20]\n"
+ "b 76f\n"
+ "75:" // Tile loop: Oddments: Load inputs: (4, 4): Bit 2: Unset: Bit 1: Unset
+ "ldr h11, [x20, #0x0]\n"
+ "76:" // Tile loop: Oddments: Load inputs: (4, 4): Bit 2: End
+ "fmla v31.8h, v8.8h, v11.8h\n"
+ "fmax v28.8h, v28.8h, v26.8h\n"
+ "fmax v29.8h, v29.8h, v26.8h\n"
+ "fmax v30.8h, v30.8h, v26.8h\n"
+ "fmax v31.8h, v31.8h, v26.8h\n"
+ "fmin v28.8h, v28.8h, v27.8h\n"
+ "fmin v29.8h, v29.8h, v27.8h\n"
+ "fmin v30.8h, v30.8h, v27.8h\n"
+ "fmin v31.8h, v31.8h, v27.8h\n"
+ "tbz %x[n_channels], #2, 78f\n"
+ "mov x21, x17\n"
+ "mov x20, x28\n"
+ "st1 { v28.d }[0], [x21], x7\n"
+ "st1 { v30.d }[0], [x20], x7\n"
+ "add x17, x17, #0x8\n"
+ "add x28, x28, #0x8\n"
+ "st1 { v29.d }[0], [x21]\n"
+ "st1 { v31.d }[0], [x20]\n"
+ "tbz %x[n_channels], #1, 77f\n"
+ "mov x21, x17\n"
+ "mov x20, x28\n"
+ "st1 { v28.s }[2], [x21], x7\n"
+ "st1 { v30.s }[2], [x20], x7\n"
+ "add x17, x17, #0x4\n"
+ "add x28, x28, #0x4\n"
+ "st1 { v29.s }[2], [x21]\n"
+ "st1 { v31.s }[2], [x20]\n"
+ "tbz %x[n_channels], #0, 80f\n"
+ "mov x21, x17\n"
+ "mov x20, x28\n"
+ "st1 { v28.h }[6], [x21], x7\n"
+ "st1 { v30.h }[6], [x20], x7\n"
+ "st1 { v29.h }[6], [x21]\n"
+ "st1 { v31.h }[6], [x20]\n"
+ "b 80f\n"
+ "77:" // Tile loop: Oddments: Store: Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 80f\n"
+ "mov x21, x17\n"
+ "mov x20, x28\n"
+ "st1 { v28.h }[4], [x21], x7\n"
+ "st1 { v30.h }[4], [x20], x7\n"
+ "st1 { v29.h }[4], [x21]\n"
+ "st1 { v31.h }[4], [x20]\n"
+ "b 80f\n"
+ "78:" // Tile loop: Oddments: Store: Bit 2: Unset
+ "tbz %x[n_channels], #1, 79f\n"
+ "mov x21, x17\n"
+ "mov x20, x28\n"
+ "st1 { v28.s }[0], [x21], x7\n"
+ "st1 { v30.s }[0], [x20], x7\n"
+ "add x17, x17, #0x4\n"
+ "add x28, x28, #0x4\n"
+ "st1 { v29.s }[0], [x21]\n"
+ "st1 { v31.s }[0], [x20]\n"
+ "tbz %x[n_channels], #0, 80f\n"
+ "mov x21, x17\n"
+ "mov x20, x28\n"
+ "st1 { v28.h }[2], [x21], x7\n"
+ "st1 { v30.h }[2], [x20], x7\n"
+ "st1 { v29.h }[2], [x21]\n"
+ "st1 { v31.h }[2], [x20]\n"
+ "b 80f\n"
+ "79:" // Tile loop: Oddments: Store: Bit 2: Unset: Bit 1: Unset
+ "mov x21, x17\n"
+ "mov x20, x28\n"
+ "st1 { v28.h }[0], [x21], x7\n"
+ "st1 { v30.h }[0], [x20], x7\n"
+ "st1 { v29.h }[0], [x21]\n"
+ "st1 { v31.h }[0], [x20]\n"
+ "80:" // Tile loop: Oddments: Store: Bit 2: End
+ "81:" // Tile loop: End
+ "ldr x27, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "ldr x23, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "add x27, x27, #0x1\n"
+ "add x21, x23, #0x1\n"
+ "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+ "cmp x27, x20\n"
+ "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
+ "csel x23, x23, x21, LT\n"
+ "csel x27, x27, XZR, LT\n"
+ "cmp x23, x20\n"
+ "blt 1b\n"
+ :
+ : [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp
new file mode 100644
index 0000000000..6ae0b30afd
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp
@@ -0,0 +1,897 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__aarch64__) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl(
+ const __fp16 *const *const input_ptrs,
+ __fp16 *const *const outptrs,
+ const void *params,
+ unsigned int n_channels,
+ const __fp16 activation_min,
+ const __fp16 activation_max
+)
+{
+ struct Args  // Marshals kernel arguments into one struct so the asm body reads everything through a single base pointer (params_struct).
+ {
+ __fp16 *const *outptrs;  // Array of output pointers; asm loads four of them (x12, x11, x10, x9).
+ const void *params;  // Parameter blob; asm loads 10 vectors from it (q31 then q0-q8), i.e. bias followed by the nine 3x3 weights.
+ const __fp16 min, max;  // Activation clamp bounds, broadcast into v26/v27 and applied via fmax/fmin.
+ const __fp16 *inptrs[25];  // Input pointers, permuted from the caller's row-major (row, col) order into the order the asm consumes them.
+
+ Args(
+ const __fp16 *const *const input_ptrs,
+ __fp16 *const *const outptrs,
+ const void *const params,
+ const __fp16 min,
+ const __fp16 max
+ ) : outptrs(outptrs), params(params), min(min), max(max)
+ {
+ inptrs[0] = input_ptrs[12];  // Centre input (2, 2) first -- it feeds all four accumulators, so the asm loads it up front.
+ inptrs[1] = input_ptrs[0];
+ inptrs[2] = input_ptrs[1];
+ inptrs[3] = input_ptrs[3];
+ inptrs[4] = input_ptrs[4];
+ inptrs[5] = input_ptrs[5];
+ inptrs[6] = input_ptrs[6];
+ inptrs[7] = input_ptrs[2];
+ inptrs[8] = input_ptrs[8];
+ inptrs[9] = input_ptrs[9];
+ inptrs[10] = input_ptrs[7];
+ inptrs[11] = input_ptrs[15];
+ inptrs[12] = input_ptrs[10];
+ inptrs[13] = input_ptrs[16];
+ inptrs[14] = input_ptrs[11];
+ inptrs[15] = input_ptrs[18];
+ inptrs[16] = input_ptrs[13];
+ inptrs[17] = input_ptrs[19];
+ inptrs[18] = input_ptrs[20];
+ inptrs[19] = input_ptrs[14];
+ inptrs[20] = input_ptrs[21];
+ inptrs[21] = input_ptrs[17];
+ inptrs[22] = input_ptrs[23];
+ inptrs[23] = input_ptrs[22];
+ inptrs[24] = input_ptrs[24];  // Remaining entries reorder the 5x5 patch (index = row*5 + col, per the asm's "(r, c)" labels) to match the kernel's load schedule.
+
+ }
+ };
+
+ Args params_struct(input_ptrs, outptrs, params,
+ activation_min, activation_max);
+
+ __asm__ __volatile__(
+ "ldr x21, [%x[params_struct], %[offsetof_args_outptrs]]\n"
+ "mov x25, #0x10\n" // cntb _, ALL, #1
+ "lsr x24, %x[n_channels], #0x3\n"
+ "ldr x23, [%x[params_struct], %[offsetof_args_params]]\n"
+ "add x20, %x[params_struct], %[offsetof_args_min]\n"
+ "ld1r { v26.8h }, [x20]\n"
+ "add x20, %x[params_struct], %[offsetof_args_max]\n"
+ "ld1r { v27.8h }, [x20]\n"
+ "add x13, %x[params_struct], %[offsetof_Args_inptrs]\n"
+ "ldp x12, x11, [x21, #0x0]\n"
+ "ldp x10, x9, [x21, #0x10]\n"
+ "mov x28, #0x0\n"
+ "sub x22, XZR, x25\n"
+ "cbz x24, 3f\n"
+ "ldr q31, [x23, #0x0]\n"
+ "ldr q0, [x23, #0x10]\n"
+ "cmp x25, x24, LSL #4\n"
+ "ldr q1, [x23, #0x20]\n"
+ "ldr q2, [x23, #0x30]\n"
+ "ldr q3, [x23, #0x40]\n"
+ "ldr q4, [x23, #0x50]\n"
+ "ldr q5, [x23, #0x60]\n"
+ "ldr q6, [x23, #0x70]\n"
+ "ldr q7, [x23, #0x80]\n"
+ "ldr q8, [x23, #0x90]\n"
+ "add x23, x23, #0xa0\n"
+ "ldp x21, x20, [x13, #0x0]\n"
+ "ldr q9, [x21, x28]\n"
+ "ldr q10, [x20, x28]\n"
+ "ldp x21, x20, [x13, #0x10]\n"
+ "ldr q11, [x21, x28]\n"
+ "ldr q12, [x20, x28]\n"
+ "ldp x21, x20, [x13, #0x20]\n"
+ "ldr q13, [x21, x28]\n"
+ "ldr q14, [x20, x28]\n"
+ "ldp x21, x20, [x13, #0x30]\n"
+ "ldr q15, [x21, x28]\n"
+ "ldr q16, [x20, x28]\n"
+ "bge 2f\n"
+ "1:" // Channel loop
+ "mov v24.16b, v31.16b\n fmla v24.8h, v8.8h, v9.8h\n"
+ "mov v23.16b, v31.16b\n fmla v23.8h, v6.8h, v9.8h\n"
+ "ldr x21, [x13, #0x40]\n"
+ "ldr x20, [x13, #0x48]\n"
+ "fmla v24.8h, v0.8h, v10.8h\n"
+ "fmla v23.8h, v1.8h, v12.8h\n"
+ "ldr q20, [x20, x28]\n"
+ "ldr x20, [x13, #0x50]\n"
+ "fmla v24.8h, v1.8h, v11.8h\n"
+ "ldr q19, [x21, x28]\n"
+ "fmla v23.8h, v2.8h, v13.8h\n"
+ "ldr q18, [x20, x28]\n"
+ "fmla v24.8h, v3.8h, v14.8h\n"
+ "fmla v23.8h, v0.8h, v16.8h\n"
+ "ldr x20, [x13, #0x58]\n"
+ "ldr q17, [x20, x28]\n"
+ "fmla v24.8h, v4.8h, v15.8h\n"
+ "fmla v23.8h, v4.8h, v19.8h\n"
+ "ldr x21, [x13, #0x78]\n"
+ "ldr x20, [x13, #0x60]\n"
+ "ldr q22, [x20, x28]\n"
+ "fmla v24.8h, v2.8h, v16.8h\n"
+ "fmla v23.8h, v5.8h, v20.8h\n"
+ "ldr x20, [x13, #0x80]\n"
+ "ldr q21, [x20, x28]\n"
+ "mov v20.16b, v31.16b\n fmla v20.8h, v2.8h, v9.8h\n"
+ "mov v19.16b, v31.16b\n fmla v19.8h, v0.8h, v9.8h\n"
+ "ldr q31, [x23, #0x0]\n"
+ "fmla v24.8h, v5.8h, v18.8h\n"
+ "fmla v23.8h, v3.8h, v18.8h\n"
+ "ldr q16, [x21, x28]\n"
+ "ldr x20, [x13, #0x68]\n"
+ "ldr q18, [x20, x28]\n"
+ "fmla v20.8h, v3.8h, v17.8h\n"
+ "fmla v19.8h, v4.8h, v16.8h\n"
+ "ldr x20, [x13, #0x88]\n"
+ "ldr q16, [x20, x28]\n"
+ "fmla v20.8h, v0.8h, v22.8h\n"
+ "ldr q0, [x23, #0x10]\n"
+ "fmla v19.8h, v1.8h, v21.8h\n"
+ "ldr x20, [x13, #0x70]\n"
+ "ldr q17, [x20, x28]\n"
+ "fmla v20.8h, v4.8h, v18.8h\n"
+ "fmla v19.8h, v5.8h, v16.8h\n"
+ "ldr q4, [x23, #0x50]\n"
+ "ldr x20, [x13, #0x98]\n"
+ "fmla v24.8h, v6.8h, v22.8h\n"
+ "fmla v20.8h, v1.8h, v17.8h\n"
+ "ldr q16, [x20, x28]\n"
+ "ldr q1, [x23, #0x20]\n"
+ "fmla v19.8h, v2.8h, v16.8h\n"
+ "fmla v24.8h, v7.8h, v17.8h\n"
+ "ldr q2, [x23, #0x30]\n"
+ "ldr x20, [x13, #0x90]\n"
+ "fmla v23.8h, v7.8h, v21.8h\n"
+ "fmla v23.8h, v8.8h, v16.8h\n"
+ "ldr q16, [x20, x28]\n"
+ "ldr x20, [x13, #0xa8]\n"
+ "fmla v20.8h, v6.8h, v16.8h\n"
+ "fmax v24.8h, v24.8h, v26.8h\n"
+ "ldr q17, [x20, x28]\n"
+ "ldr x20, [x13, #0xa0]\n"
+ "fmla v19.8h, v3.8h, v17.8h\n"
+ "fmax v23.8h, v23.8h, v26.8h\n"
+ "ldr q16, [x20, x28]\n"
+ "ldr q3, [x23, #0x40]\n"
+ "fmla v20.8h, v7.8h, v16.8h\n"
+ "fmla v20.8h, v5.8h, v17.8h\n"
+ "ldr q5, [x23, #0x60]\n"
+ "ldr x20, [x13, #0xb0]\n"
+ "add x22, x22, #0x10\n"
+ "fmin v24.8h, v24.8h, v27.8h\n"
+ "ldr q16, [x20, x28]\n"
+ "ldr x20, [x13, #0xb8]\n"
+ "fmla v19.8h, v7.8h, v16.8h\n"
+ "fmin v23.8h, v23.8h, v27.8h\n"
+ "ldr q16, [x20, x28]\n"
+ "ldr q7, [x23, #0x80]\n"
+ "fmla v19.8h, v6.8h, v16.8h\n"
+ "fmla v20.8h, v8.8h, v16.8h\n"
+ "ldr q6, [x23, #0x70]\n"
+ "ldr x20, [x13, #0xc0]\n"
+ "fmax v20.8h, v20.8h, v26.8h\n"
+ "fmin v20.8h, v20.8h, v27.8h\n"
+ "ldr q16, [x20, x28]\n"
+ "fmla v19.8h, v8.8h, v16.8h\n"
+ "ldr q8, [x23, #0x90]\n"
+ "fmax v19.8h, v19.8h, v26.8h\n"
+ "ldp x21, x20, [x13, #0x0]\n"
+ "ldr q9, [x21, x25]\n"
+ "fmin v19.8h, v19.8h, v27.8h\n"
+ "add x28, x28, #0x10\n"
+ "ldr q10, [x20, x25]\n"
+ "ldp x21, x20, [x13, #0x10]\n"
+ "str q24, [x12, x22]\n"
+ "add x23, x23, #0xa0\n"
+ "ldr q11, [x21, x25]\n"
+ "ldr q12, [x20, x25]\n"
+ "str q23, [x11, x22]\n"
+ "ldp x21, x20, [x13, #0x20]\n"
+ "ldr q13, [x21, x25]\n"
+ "str q20, [x10, x22]\n"
+ "ldr q14, [x20, x25]\n"
+ "ldp x21, x20, [x13, #0x30]\n"
+ "str q19, [x9, x22]\n"
+ "ldr q15, [x21, x25]\n"
+ "ldr q16, [x20, x25]\n"
+ "add x25, x25, #0x10\n"
+ "cmp x25, x24, LSL #4\n"
+ "blt 1b\n"
+ "2:" // Channel tail
+ "mov v25.16b, v31.16b\n fmla v25.8h, v8.8h, v9.8h\n"
+ "mov v24.16b, v31.16b\n fmla v24.8h, v6.8h, v9.8h\n"
+ "ldr x21, [x13, #0x40]\n"
+ "ldr x20, [x13, #0x48]\n"
+ "fmla v25.8h, v0.8h, v10.8h\n"
+ "fmla v24.8h, v1.8h, v12.8h\n"
+ "ldr q20, [x20, x28]\n"
+ "ldr x20, [x13, #0x50]\n"
+ "fmla v25.8h, v1.8h, v11.8h\n"
+ "ldr q18, [x21, x28]\n"
+ "fmla v24.8h, v2.8h, v13.8h\n"
+ "ldr q19, [x20, x28]\n"
+ "fmla v25.8h, v3.8h, v14.8h\n"
+ "fmla v24.8h, v0.8h, v16.8h\n"
+ "ldr x20, [x13, #0x58]\n"
+ "ldr q17, [x20, x28]\n"
+ "fmla v25.8h, v4.8h, v15.8h\n"
+ "fmla v24.8h, v4.8h, v18.8h\n"
+ "ldr x21, [x13, #0x78]\n"
+ "ldr x20, [x13, #0x60]\n"
+ "ldr q23, [x20, x28]\n"
+ "fmla v25.8h, v2.8h, v16.8h\n"
+ "fmla v24.8h, v5.8h, v20.8h\n"
+ "ldr x20, [x13, #0x80]\n"
+ "ldr q22, [x20, x28]\n"
+ "mov v21.16b, v31.16b\n fmla v21.8h, v2.8h, v9.8h\n"
+ "mov v20.16b, v31.16b\n fmla v20.8h, v0.8h, v9.8h\n"
+ "ldr x20, [x13, #0x68]\n"
+ "ldr q18, [x20, x28]\n"
+ "fmla v25.8h, v5.8h, v19.8h\n"
+ "fmla v24.8h, v3.8h, v19.8h\n"
+ "ldr q16, [x21, x28]\n"
+ "fmla v21.8h, v3.8h, v17.8h\n"
+ "fmla v20.8h, v4.8h, v16.8h\n"
+ "ldr x20, [x13, #0x88]\n"
+ "ldr q16, [x20, x28]\n"
+ "fmla v21.8h, v0.8h, v23.8h\n"
+ "fmla v20.8h, v1.8h, v22.8h\n"
+ "ldr x20, [x13, #0x70]\n"
+ "ldr q17, [x20, x28]\n"
+ "ldr x20, [x13, #0x98]\n"
+ "fmla v21.8h, v4.8h, v18.8h\n"
+ "ldr q19, [x20, x28]\n"
+ "fmla v20.8h, v5.8h, v16.8h\n"
+ "fmla v25.8h, v6.8h, v23.8h\n"
+ "ldr x20, [x13, #0x90]\n"
+ "ldr q16, [x20, x28]\n"
+ "fmla v21.8h, v1.8h, v17.8h\n"
+ "ldr x20, [x13, #0xa8]\n"
+ "fmla v20.8h, v2.8h, v19.8h\n"
+ "fmla v25.8h, v7.8h, v17.8h\n"
+ "ldr q18, [x20, x28]\n"
+ "ldr x20, [x13, #0xa0]\n"
+ "ldr q17, [x20, x28]\n"
+ "fmla v21.8h, v6.8h, v16.8h\n"
+ "fmla v20.8h, v3.8h, v18.8h\n"
+ "ldr x20, [x13, #0xb0]\n"
+ "ldr q16, [x20, x28]\n"
+ "fmla v21.8h, v7.8h, v17.8h\n"
+ "fmla v20.8h, v7.8h, v16.8h\n"
+ "ldr x20, [x13, #0xb8]\n"
+ "ldr q17, [x20, x28]\n"
+ "fmla v24.8h, v7.8h, v22.8h\n"
+ "fmla v21.8h, v5.8h, v18.8h\n"
+ "ldr x20, [x13, #0xc0]\n"
+ "fmla v20.8h, v6.8h, v17.8h\n"
+ "fmla v24.8h, v8.8h, v19.8h\n"
+ "ldr q16, [x20, x28]\n"
+ "fmla v21.8h, v8.8h, v17.8h\n"
+ "fmla v20.8h, v8.8h, v16.8h\n"
+ "fmax v25.8h, v25.8h, v26.8h\n"
+ "add x22, x22, #0x10\n"
+ "fmax v24.8h, v24.8h, v26.8h\n"
+ "fmax v21.8h, v21.8h, v26.8h\n"
+ "add x28, x28, #0x10\n"
+ "fmax v20.8h, v20.8h, v26.8h\n"
+ "fmin v25.8h, v25.8h, v27.8h\n"
+ "str q25, [x12, x22]\n"
+ "fmin v24.8h, v24.8h, v27.8h\n"
+ "fmin v21.8h, v21.8h, v27.8h\n"
+ "str q24, [x11, x22]\n"
+ "fmin v20.8h, v20.8h, v27.8h\n"
+ "str q21, [x10, x22]\n"
+ "str q20, [x9, x22]\n"
+ "3:" // Oddments
+ "tst %x[n_channels], #0x7\n"
+ "beq 80f\n"
+ "ldr q31, [x23, #0x0]\n"
+ "ldr q0, [x23, #0x10]\n"
+ "mov x20, x28\n"
+ "add x12, x12, x20\n"
+ "ldr q1, [x23, #0x20]\n"
+ "ldr q2, [x23, #0x30]\n"
+ "add x11, x11, x20\n"
+ "add x10, x10, x20\n"
+ "ldr q3, [x23, #0x40]\n"
+ "ldr q4, [x23, #0x50]\n"
+ "add x9, x9, x20\n"
+ "ldr q5, [x23, #0x60]\n"
+ "ldr q6, [x23, #0x70]\n"
+ "ldr q7, [x23, #0x80]\n"
+ "ldr q8, [x23, #0x90]\n"
+ "ldr x27, [x13, #0x0]\n"
+ "ldr x26, [x13, #0x8]\n"
+ "add x27, x27, x28\n"
+ "add x26, x26, x28\n"
+ "ldr x25, [x13, #0x10]\n"
+ "ldr x24, [x13, #0x18]\n"
+ "add x25, x25, x28\n"
+ "add x24, x24, x28\n"
+ "ldr x23, [x13, #0x20]\n"
+ "ldr x22, [x13, #0x28]\n"
+ "add x23, x23, x28\n"
+ "add x22, x22, x28\n"
+ "ldr x21, [x13, #0x30]\n"
+ "ldr x20, [x13, #0x38]\n"
+ "add x21, x21, x28\n"
+ "add x20, x20, x28\n"
+ "tbz %x[n_channels], #2, 5f\n"
+ "ld1 { v9.d }[0], [x27], #0x8\n"
+ "ld1 { v10.d }[0], [x26], #0x8\n"
+ "ld1 { v11.d }[0], [x25], #0x8\n"
+ "ld1 { v12.d }[0], [x24], #0x8\n"
+ "ld1 { v13.d }[0], [x23], #0x8\n"
+ "ld1 { v14.d }[0], [x22], #0x8\n"
+ "ld1 { v15.d }[0], [x21], #0x8\n"
+ "ld1 { v16.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 4f\n"
+ "ld1 { v9.s }[2], [x27], #0x4\n"
+ "ld1 { v10.s }[2], [x26], #0x4\n"
+ "ld1 { v11.s }[2], [x25], #0x4\n"
+ "ld1 { v12.s }[2], [x24], #0x4\n"
+ "ld1 { v13.s }[2], [x23], #0x4\n"
+ "ld1 { v14.s }[2], [x22], #0x4\n"
+ "ld1 { v15.s }[2], [x21], #0x4\n"
+ "ld1 { v16.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 7f\n"
+ "ld1 { v9.h }[6], [x27], #0x2\n"
+ "ld1 { v10.h }[6], [x26], #0x2\n"
+ "ld1 { v11.h }[6], [x25], #0x2\n"
+ "ld1 { v12.h }[6], [x24], #0x2\n"
+ "ld1 { v13.h }[6], [x23], #0x2\n"
+ "ld1 { v14.h }[6], [x22], #0x2\n"
+ "ld1 { v15.h }[6], [x21], #0x2\n"
+ "ld1 { v16.h }[6], [x20], #0x2\n"
+ "b 7f\n"
+ "4:" // Oddments: Load inputs (2, 2), (0, 0), (0, 1), (0, 3), (0, 4), (1, 0), (1, 1), (0, 2): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 7f\n"
+ "ld1 { v9.h }[4], [x27], #0x2\n"
+ "ld1 { v10.h }[4], [x26], #0x2\n"
+ "ld1 { v11.h }[4], [x25], #0x2\n"
+ "ld1 { v12.h }[4], [x24], #0x2\n"
+ "ld1 { v13.h }[4], [x23], #0x2\n"
+ "ld1 { v14.h }[4], [x22], #0x2\n"
+ "ld1 { v15.h }[4], [x21], #0x2\n"
+ "ld1 { v16.h }[4], [x20], #0x2\n"
+ "b 7f\n"
+ "5:" // Oddments: Load inputs (2, 2), (0, 0), (0, 1), (0, 3), (0, 4), (1, 0), (1, 1), (0, 2): Bit 2: Unset
+ "tbz %x[n_channels], #1, 6f\n"
+ "ld1 { v9.s }[0], [x27], #0x4\n"
+ "ld1 { v10.s }[0], [x26], #0x4\n"
+ "ld1 { v11.s }[0], [x25], #0x4\n"
+ "ld1 { v12.s }[0], [x24], #0x4\n"
+ "ld1 { v13.s }[0], [x23], #0x4\n"
+ "ld1 { v14.s }[0], [x22], #0x4\n"
+ "ld1 { v15.s }[0], [x21], #0x4\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 7f\n"
+ "ld1 { v9.h }[2], [x27], #0x2\n"
+ "ld1 { v10.h }[2], [x26], #0x2\n"
+ "ld1 { v11.h }[2], [x25], #0x2\n"
+ "ld1 { v12.h }[2], [x24], #0x2\n"
+ "ld1 { v13.h }[2], [x23], #0x2\n"
+ "ld1 { v14.h }[2], [x22], #0x2\n"
+ "ld1 { v15.h }[2], [x21], #0x2\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
+ "b 7f\n"
+ "6:" // Oddments: Load inputs (2, 2), (0, 0), (0, 1), (0, 3), (0, 4), (1, 0), (1, 1), (0, 2): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v9.h }[0], [x27], #0x2\n"
+ "ld1 { v10.h }[0], [x26], #0x2\n"
+ "ld1 { v11.h }[0], [x25], #0x2\n"
+ "ld1 { v12.h }[0], [x24], #0x2\n"
+ "ld1 { v13.h }[0], [x23], #0x2\n"
+ "ld1 { v14.h }[0], [x22], #0x2\n"
+ "ld1 { v15.h }[0], [x21], #0x2\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
+ "7:" // Oddments: Load inputs (2, 2), (0, 0), (0, 1), (0, 3), (0, 4), (1, 0), (1, 1), (0, 2): Bit 2: End
+ "mov v28.16b, v31.16b\n fmla v28.8h, v8.8h, v9.8h\n"
+ "fmla v28.8h, v0.8h, v10.8h\n"
+ "ldr x20, [x13, #0x40]\n"
+ "add x20, x20, x28\n"
+ "mov v29.16b, v31.16b\n fmla v29.8h, v6.8h, v9.8h\n"
+ "fmla v28.8h, v1.8h, v11.8h\n"
+ "fmla v29.8h, v1.8h, v12.8h\n"
+ "fmla v28.8h, v3.8h, v14.8h\n"
+ "fmla v29.8h, v2.8h, v13.8h\n"
+ "fmla v28.8h, v4.8h, v15.8h\n"
+ "mov v30.16b, v31.16b\n fmla v30.8h, v2.8h, v9.8h\n"
+ "fmla v31.8h, v0.8h, v9.8h\n"
+ "fmla v28.8h, v2.8h, v16.8h\n"
+ "fmla v29.8h, v0.8h, v16.8h\n"
+ "tbz %x[n_channels], #2, 9f\n"
+ "ld1 { v11.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 8f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 11f\n"
+ "ld1 { v11.h }[6], [x20], #0x2\n"
+ "b 11f\n"
+ "8:" // Oddments: Load input (1, 3): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 11f\n"
+ "ld1 { v11.h }[4], [x20], #0x2\n"
+ "b 11f\n"
+ "9:" // Oddments: Load input (1, 3): Bit 2: Unset
+ "tbz %x[n_channels], #1, 10f\n"
+ "ld1 { v11.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 11f\n"
+ "ld1 { v11.h }[2], [x20], #0x2\n"
+ "b 11f\n"
+ "10:" // Oddments: Load input (1, 3): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v11.h }[0], [x20], #0x2\n"
+ "11:" // Oddments: Load input (1, 3): Bit 2: End
+ "ldr x20, [x13, #0x48]\n"
+ "fmla v29.8h, v4.8h, v11.8h\n"
+ "add x20, x20, x28\n"
+ "tbz %x[n_channels], #2, 13f\n"
+ "ld1 { v12.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 12f\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 15f\n"
+ "ld1 { v12.h }[6], [x20], #0x2\n"
+ "b 15f\n"
+ "12:" // Oddments: Load input (1, 4): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 15f\n"
+ "ld1 { v12.h }[4], [x20], #0x2\n"
+ "b 15f\n"
+ "13:" // Oddments: Load input (1, 4): Bit 2: Unset
+ "tbz %x[n_channels], #1, 14f\n"
+ "ld1 { v12.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 15f\n"
+ "ld1 { v12.h }[2], [x20], #0x2\n"
+ "b 15f\n"
+ "14:" // Oddments: Load input (1, 4): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v12.h }[0], [x20], #0x2\n"
+ "15:" // Oddments: Load input (1, 4): Bit 2: End
+ "ldr x20, [x13, #0x50]\n"
+ "fmla v29.8h, v5.8h, v12.8h\n"
+ "add x20, x20, x28\n"
+ "tbz %x[n_channels], #2, 17f\n"
+ "ld1 { v13.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 16f\n"
+ "ld1 { v13.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 19f\n"
+ "ld1 { v13.h }[6], [x20], #0x2\n"
+ "b 19f\n"
+ "16:" // Oddments: Load input (1, 2): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 19f\n"
+ "ld1 { v13.h }[4], [x20], #0x2\n"
+ "b 19f\n"
+ "17:" // Oddments: Load input (1, 2): Bit 2: Unset
+ "tbz %x[n_channels], #1, 18f\n"
+ "ld1 { v13.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 19f\n"
+ "ld1 { v13.h }[2], [x20], #0x2\n"
+ "b 19f\n"
+ "18:" // Oddments: Load input (1, 2): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v13.h }[0], [x20], #0x2\n"
+ "19:" // Oddments: Load input (1, 2): Bit 2: End
+ "ldr x20, [x13, #0x58]\n"
+ "fmla v28.8h, v5.8h, v13.8h\n"
+ "fmla v29.8h, v3.8h, v13.8h\n"
+ "add x20, x20, x28\n"
+ "tbz %x[n_channels], #2, 21f\n"
+ "ld1 { v14.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 20f\n"
+ "ld1 { v14.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 23f\n"
+ "ld1 { v14.h }[6], [x20], #0x2\n"
+ "b 23f\n"
+ "20:" // Oddments: Load input (3, 0): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 23f\n"
+ "ld1 { v14.h }[4], [x20], #0x2\n"
+ "b 23f\n"
+ "21:" // Oddments: Load input (3, 0): Bit 2: Unset
+ "tbz %x[n_channels], #1, 22f\n"
+ "ld1 { v14.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 23f\n"
+ "ld1 { v14.h }[2], [x20], #0x2\n"
+ "b 23f\n"
+ "22:" // Oddments: Load input (3, 0): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v14.h }[0], [x20], #0x2\n"
+ "23:" // Oddments: Load input (3, 0): Bit 2: End
+ "ldr x20, [x13, #0x60]\n"
+ "fmla v30.8h, v3.8h, v14.8h\n"
+ "add x20, x20, x28\n"
+ "tbz %x[n_channels], #2, 25f\n"
+ "ld1 { v15.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 24f\n"
+ "ld1 { v15.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 27f\n"
+ "ld1 { v15.h }[6], [x20], #0x2\n"
+ "b 27f\n"
+ "24:" // Oddments: Load input (2, 0): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 27f\n"
+ "ld1 { v15.h }[4], [x20], #0x2\n"
+ "b 27f\n"
+ "25:" // Oddments: Load input (2, 0): Bit 2: Unset
+ "tbz %x[n_channels], #1, 26f\n"
+ "ld1 { v15.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 27f\n"
+ "ld1 { v15.h }[2], [x20], #0x2\n"
+ "b 27f\n"
+ "26:" // Oddments: Load input (2, 0): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v15.h }[0], [x20], #0x2\n"
+ "27:" // Oddments: Load input (2, 0): Bit 2: End
+ "ldr x20, [x13, #0x68]\n"
+ "fmla v28.8h, v6.8h, v15.8h\n"
+ "fmla v30.8h, v0.8h, v15.8h\n"
+ "add x20, x20, x28\n"
+ "tbz %x[n_channels], #2, 29f\n"
+ "ld1 { v11.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 28f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 31f\n"
+ "ld1 { v11.h }[6], [x20], #0x2\n"
+ "b 31f\n"
+ "28:" // Oddments: Load input (3, 1): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 31f\n"
+ "ld1 { v11.h }[4], [x20], #0x2\n"
+ "b 31f\n"
+ "29:" // Oddments: Load input (3, 1): Bit 2: Unset
+ "tbz %x[n_channels], #1, 30f\n"
+ "ld1 { v11.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 31f\n"
+ "ld1 { v11.h }[2], [x20], #0x2\n"
+ "b 31f\n"
+ "30:" // Oddments: Load input (3, 1): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v11.h }[0], [x20], #0x2\n"
+ "31:" // Oddments: Load input (3, 1): Bit 2: End
+ "ldr x20, [x13, #0x70]\n"
+ "fmla v30.8h, v4.8h, v11.8h\n"
+ "add x20, x20, x28\n"
+ "tbz %x[n_channels], #2, 33f\n"
+ "ld1 { v16.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 32f\n"
+ "ld1 { v16.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 35f\n"
+ "ld1 { v16.h }[6], [x20], #0x2\n"
+ "b 35f\n"
+ "32:" // Oddments: Load input (2, 1): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 35f\n"
+ "ld1 { v16.h }[4], [x20], #0x2\n"
+ "b 35f\n"
+ "33:" // Oddments: Load input (2, 1): Bit 2: Unset
+ "tbz %x[n_channels], #1, 34f\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 35f\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
+ "b 35f\n"
+ "34:" // Oddments: Load input (2, 1): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v16.h }[0], [x20], #0x2\n"
+ "35:" // Oddments: Load input (2, 1): Bit 2: End
+ "ldr x20, [x13, #0x78]\n"
+ "fmla v28.8h, v7.8h, v16.8h\n"
+ "fmla v30.8h, v1.8h, v16.8h\n"
+ "add x20, x20, x28\n"
+ "tbz %x[n_channels], #2, 37f\n"
+ "ld1 { v13.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 36f\n"
+ "ld1 { v13.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 39f\n"
+ "ld1 { v13.h }[6], [x20], #0x2\n"
+ "b 39f\n"
+ "36:" // Oddments: Load input (3, 3): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 39f\n"
+ "ld1 { v13.h }[4], [x20], #0x2\n"
+ "b 39f\n"
+ "37:" // Oddments: Load input (3, 3): Bit 2: Unset
+ "tbz %x[n_channels], #1, 38f\n"
+ "ld1 { v13.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 39f\n"
+ "ld1 { v13.h }[2], [x20], #0x2\n"
+ "b 39f\n"
+ "38:" // Oddments: Load input (3, 3): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v13.h }[0], [x20], #0x2\n"
+ "39:" // Oddments: Load input (3, 3): Bit 2: End
+ "ldr x20, [x13, #0x80]\n"
+ "fmla v31.8h, v4.8h, v13.8h\n"
+ "add x20, x20, x28\n"
+ "tbz %x[n_channels], #2, 41f\n"
+ "ld1 { v12.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 40f\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 43f\n"
+ "ld1 { v12.h }[6], [x20], #0x2\n"
+ "b 43f\n"
+ "40:" // Oddments: Load input (2, 3): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 43f\n"
+ "ld1 { v12.h }[4], [x20], #0x2\n"
+ "b 43f\n"
+ "41:" // Oddments: Load input (2, 3): Bit 2: Unset
+ "tbz %x[n_channels], #1, 42f\n"
+ "ld1 { v12.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 43f\n"
+ "ld1 { v12.h }[2], [x20], #0x2\n"
+ "b 43f\n"
+ "42:" // Oddments: Load input (2, 3): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v12.h }[0], [x20], #0x2\n"
+ "43:" // Oddments: Load input (2, 3): Bit 2: End
+ "ldr x20, [x13, #0x88]\n"
+ "fmla v29.8h, v7.8h, v12.8h\n"
+ "fmla v31.8h, v1.8h, v12.8h\n"
+ "add x20, x20, x28\n"
+ "tbz %x[n_channels], #2, 45f\n"
+ "ld1 { v14.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 44f\n"
+ "ld1 { v14.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 47f\n"
+ "ld1 { v14.h }[6], [x20], #0x2\n"
+ "b 47f\n"
+ "44:" // Oddments: Load input (3, 4): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 47f\n"
+ "ld1 { v14.h }[4], [x20], #0x2\n"
+ "b 47f\n"
+ "45:" // Oddments: Load input (3, 4): Bit 2: Unset
+ "tbz %x[n_channels], #1, 46f\n"
+ "ld1 { v14.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 47f\n"
+ "ld1 { v14.h }[2], [x20], #0x2\n"
+ "b 47f\n"
+ "46:" // Oddments: Load input (3, 4): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v14.h }[0], [x20], #0x2\n"
+ "47:" // Oddments: Load input (3, 4): Bit 2: End
+ "ldr x20, [x13, #0x90]\n"
+ "fmla v31.8h, v5.8h, v14.8h\n"
+ "add x20, x20, x28\n"
+ "tbz %x[n_channels], #2, 49f\n"
+ "ld1 { v15.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 48f\n"
+ "ld1 { v15.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 51f\n"
+ "ld1 { v15.h }[6], [x20], #0x2\n"
+ "b 51f\n"
+ "48:" // Oddments: Load input (4, 0): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 51f\n"
+ "ld1 { v15.h }[4], [x20], #0x2\n"
+ "b 51f\n"
+ "49:" // Oddments: Load input (4, 0): Bit 2: Unset
+ "tbz %x[n_channels], #1, 50f\n"
+ "ld1 { v15.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 51f\n"
+ "ld1 { v15.h }[2], [x20], #0x2\n"
+ "b 51f\n"
+ "50:" // Oddments: Load input (4, 0): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v15.h }[0], [x20], #0x2\n"
+ "51:" // Oddments: Load input (4, 0): Bit 2: End
+ "ldr x20, [x13, #0x98]\n"
+ "fmla v30.8h, v6.8h, v15.8h\n"
+ "add x20, x20, x28\n"
+ "tbz %x[n_channels], #2, 53f\n"
+ "ld1 { v11.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 52f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 55f\n"
+ "ld1 { v11.h }[6], [x20], #0x2\n"
+ "b 55f\n"
+ "52:" // Oddments: Load input (2, 4): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 55f\n"
+ "ld1 { v11.h }[4], [x20], #0x2\n"
+ "b 55f\n"
+ "53:" // Oddments: Load input (2, 4): Bit 2: Unset
+ "tbz %x[n_channels], #1, 54f\n"
+ "ld1 { v11.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 55f\n"
+ "ld1 { v11.h }[2], [x20], #0x2\n"
+ "b 55f\n"
+ "54:" // Oddments: Load input (2, 4): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v11.h }[0], [x20], #0x2\n"
+ "55:" // Oddments: Load input (2, 4): Bit 2: End
+ "ldr x20, [x13, #0xa0]\n"
+ "fmla v29.8h, v8.8h, v11.8h\n"
+ "fmla v31.8h, v2.8h, v11.8h\n"
+ "add x20, x20, x28\n"
+ "tbz %x[n_channels], #2, 57f\n"
+ "ld1 { v13.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 56f\n"
+ "ld1 { v13.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 59f\n"
+ "ld1 { v13.h }[6], [x20], #0x2\n"
+ "b 59f\n"
+ "56:" // Oddments: Load input (4, 1): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 59f\n"
+ "ld1 { v13.h }[4], [x20], #0x2\n"
+ "b 59f\n"
+ "57:" // Oddments: Load input (4, 1): Bit 2: Unset
+ "tbz %x[n_channels], #1, 58f\n"
+ "ld1 { v13.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 59f\n"
+ "ld1 { v13.h }[2], [x20], #0x2\n"
+ "b 59f\n"
+ "58:" // Oddments: Load input (4, 1): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v13.h }[0], [x20], #0x2\n"
+ "59:" // Oddments: Load input (4, 1): Bit 2: End
+ "ldr x20, [x13, #0xa8]\n"
+ "fmla v30.8h, v7.8h, v13.8h\n"
+ "add x20, x20, x28\n"
+ "tbz %x[n_channels], #2, 61f\n"
+ "ld1 { v16.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 60f\n"
+ "ld1 { v16.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 63f\n"
+ "ld1 { v16.h }[6], [x20], #0x2\n"
+ "b 63f\n"
+ "60:" // Oddments: Load input (3, 2): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 63f\n"
+ "ld1 { v16.h }[4], [x20], #0x2\n"
+ "b 63f\n"
+ "61:" // Oddments: Load input (3, 2): Bit 2: Unset
+ "tbz %x[n_channels], #1, 62f\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 63f\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
+ "b 63f\n"
+ "62:" // Oddments: Load input (3, 2): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v16.h }[0], [x20], #0x2\n"
+ "63:" // Oddments: Load input (3, 2): Bit 2: End
+ "ldr x20, [x13, #0xb0]\n"
+ "fmla v30.8h, v5.8h, v16.8h\n"
+ "fmla v31.8h, v3.8h, v16.8h\n"
+ "add x20, x20, x28\n"
+ "tbz %x[n_channels], #2, 65f\n"
+ "ld1 { v14.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 64f\n"
+ "ld1 { v14.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 67f\n"
+ "ld1 { v14.h }[6], [x20], #0x2\n"
+ "b 67f\n"
+ "64:" // Oddments: Load input (4, 3): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 67f\n"
+ "ld1 { v14.h }[4], [x20], #0x2\n"
+ "b 67f\n"
+ "65:" // Oddments: Load input (4, 3): Bit 2: Unset
+ "tbz %x[n_channels], #1, 66f\n"
+ "ld1 { v14.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 67f\n"
+ "ld1 { v14.h }[2], [x20], #0x2\n"
+ "b 67f\n"
+ "66:" // Oddments: Load input (4, 3): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v14.h }[0], [x20], #0x2\n"
+ "67:" // Oddments: Load input (4, 3): Bit 2: End
+ "ldr x20, [x13, #0xb8]\n"
+ "fmla v31.8h, v7.8h, v14.8h\n"
+ "add x20, x20, x28\n"
+ "tbz %x[n_channels], #2, 69f\n"
+ "ld1 { v15.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 68f\n"
+ "ld1 { v15.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 71f\n"
+ "ld1 { v15.h }[6], [x20], #0x2\n"
+ "b 71f\n"
+ "68:" // Oddments: Load input (4, 2): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 71f\n"
+ "ld1 { v15.h }[4], [x20], #0x2\n"
+ "b 71f\n"
+ "69:" // Oddments: Load input (4, 2): Bit 2: Unset
+ "tbz %x[n_channels], #1, 70f\n"
+ "ld1 { v15.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 71f\n"
+ "ld1 { v15.h }[2], [x20], #0x2\n"
+ "b 71f\n"
+ "70:" // Oddments: Load input (4, 2): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v15.h }[0], [x20], #0x2\n"
+ "71:" // Oddments: Load input (4, 2): Bit 2: End
+ "ldr x20, [x13, #0xc0]\n"
+ "fmla v30.8h, v8.8h, v15.8h\n"
+ "fmla v31.8h, v6.8h, v15.8h\n"
+ "add x20, x20, x28\n"
+ "tbz %x[n_channels], #2, 73f\n"
+ "ld1 { v11.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 72f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 75f\n"
+ "ld1 { v11.h }[6], [x20], #0x2\n"
+ "b 75f\n"
+ "72:" // Oddments: Load input (4, 4): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 75f\n"
+ "ld1 { v11.h }[4], [x20], #0x2\n"
+ "b 75f\n"
+ "73:" // Oddments: Load input (4, 4): Bit 2: Unset
+ "tbz %x[n_channels], #1, 74f\n"
+ "ld1 { v11.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 75f\n"
+ "ld1 { v11.h }[2], [x20], #0x2\n"
+ "b 75f\n"
+ "74:" // Oddments: Load input (4, 4): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v11.h }[0], [x20], #0x2\n"
+ "75:" // Oddments: Load input (4, 4): Bit 2: End
+ "fmla v31.8h, v8.8h, v11.8h\n"
+ "fmax v28.8h, v28.8h, v26.8h\n"
+ "fmax v29.8h, v29.8h, v26.8h\n"
+ "fmax v30.8h, v30.8h, v26.8h\n"
+ "fmax v31.8h, v31.8h, v26.8h\n"
+ "fmin v28.8h, v28.8h, v27.8h\n"
+ "fmin v29.8h, v29.8h, v27.8h\n"
+ "fmin v30.8h, v30.8h, v27.8h\n"
+ "fmin v31.8h, v31.8h, v27.8h\n"
+ "tbz %x[n_channels], #2, 77f\n"
+ "st1 { v28.d }[0], [x12], #0x8\n"
+ "st1 { v29.d }[0], [x11], #0x8\n"
+ "st1 { v30.d }[0], [x10], #0x8\n"
+ "st1 { v31.d }[0], [x9], #0x8\n"
+ "tbz %x[n_channels], #1, 76f\n"
+ "st1 { v28.s }[2], [x12], #0x4\n"
+ "st1 { v29.s }[2], [x11], #0x4\n"
+ "st1 { v30.s }[2], [x10], #0x4\n"
+ "st1 { v31.s }[2], [x9], #0x4\n"
+ "tbz %x[n_channels], #0, 79f\n"
+ "st1 { v28.h }[6], [x12], #0x2\n"
+ "st1 { v29.h }[6], [x11], #0x2\n"
+ "st1 { v30.h }[6], [x10], #0x2\n"
+ "st1 { v31.h }[6], [x9], #0x2\n"
+ "b 79f\n"
+ "76:" // Oddments: Store: Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 79f\n"
+ "st1 { v28.h }[4], [x12], #0x2\n"
+ "st1 { v29.h }[4], [x11], #0x2\n"
+ "st1 { v30.h }[4], [x10], #0x2\n"
+ "st1 { v31.h }[4], [x9], #0x2\n"
+ "b 79f\n"
+ "77:" // Oddments: Store: Bit 2: Unset
+ "tbz %x[n_channels], #1, 78f\n"
+ "st1 { v28.s }[0], [x12], #0x4\n"
+ "st1 { v29.s }[0], [x11], #0x4\n"
+ "st1 { v30.s }[0], [x10], #0x4\n"
+ "st1 { v31.s }[0], [x9], #0x4\n"
+ "tbz %x[n_channels], #0, 79f\n"
+ "st1 { v28.h }[2], [x12], #0x2\n"
+ "st1 { v29.h }[2], [x11], #0x2\n"
+ "st1 { v30.h }[2], [x10], #0x2\n"
+ "st1 { v31.h }[2], [x9], #0x2\n"
+ "b 79f\n"
+ "78:" // Oddments: Store: Bit 2: Unset: Bit 1: Unset
+ "st1 { v28.h }[0], [x12], #0x2\n"
+ "st1 { v29.h }[0], [x11], #0x2\n"
+ "st1 { v30.h }[0], [x10], #0x2\n"
+ "st1 { v31.h }[0], [x9], #0x2\n"
+ "79:" // Oddments: Store: Bit 2: End
+ "80:" // End
+ :
+ : [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
new file mode 100644
index 0000000000..1d1d491c28
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__aarch64__) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_indirect_impl(const __fp16 *const *const input_ptrs, __fp16 *const *const outptrs, const void *params, unsigned int n_channels, const __fp16 activation_min, const __fp16 activation_max);
+void a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl(const unsigned int n_tile_rows, const unsigned int n_tile_cols, const __fp16 *inptr, int64_t ld_input_row, int64_t ld_input_col, __fp16 *outptr, int64_t ld_output_row, int64_t ld_output_col, const void *params, unsigned int n_channels, const __fp16 activation_min, const __fp16 activation_max);
+
+// Strategy descriptor for the AArch64 FP16 NHWC 5x5, stride-1, 2x2-output
+// MLA depthfirst depthwise kernel. It only publishes the tile geometry and
+// the two generated kernel entry points declared above; all computation
+// lives in the *_impl functions.
+class a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy<__fp16, __fp16, __fp16, __fp16>
+{
+  private:
+  using Parent = DepthwiseDepthfirstStrategy<__fp16, __fp16, __fp16, __fp16>;
+  // Generated kernel entry points (declared earlier in this header).
+  Parent::IndirectKernelType m_indirect_kernel = a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_indirect_impl;
+  Parent::DirectKernelType m_direct_kernel = a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl;
+
+  public:
+  using return_type = __fp16;
+  // Plain NEON kernel: no scalable-vector (SVE/SME) length dependence.
+  constexpr static auto vl_type = arm_gemm::VLType::None;
+
+  // 5x5 filter window.
+  constexpr static unsigned int kernel_rows = 5;
+  constexpr static unsigned int kernel_cols = 5;
+
+  // Unit stride in both spatial dimensions.
+  constexpr static unsigned int stride_rows = 1;
+  constexpr static unsigned int stride_cols = 1;
+
+  // Each invocation produces a 2x2 spatial output tile.
+  constexpr static unsigned int output_rows = 2;
+  constexpr static unsigned int output_cols = 2;
+
+  // CPUInfo is unused: this strategy has no CPU-specific tuning.
+  a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst(const CPUInfo *)
+  : Parent(output_rows, output_cols, kernel_rows, kernel_cols, stride_rows, stride_cols) {}
+
+  arm_gemm::VLType get_vl_type(void) const override { return vl_type; }
+
+  Parent::IndirectKernelType get_indirect_kernel() const override { return m_indirect_kernel; }
+  Parent::DirectKernelType get_direct_kernel() const override { return m_direct_kernel; }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_direct.cpp
new file mode 100644
index 0000000000..cecaf79704
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_direct.cpp
@@ -0,0 +1,1387 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__aarch64__) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl(
+ const unsigned int n_tile_rows,
+ const unsigned int n_tile_cols,
+ const __fp16 *inptr,
+ int64_t ld_input_row,
+ int64_t ld_input_col,
+ __fp16 *outptr,
+ int64_t ld_output_row,
+ int64_t ld_output_col,
+ const void *params,
+ unsigned int n_channels,
+ const __fp16 activation_min,
+ const __fp16 activation_max
+)
+{
+ struct Args
+ {
+ const uint64_t n_tile_rows, n_tile_cols;
+ const __fp16 *inptr;
+ const uint64_t ld_input_row;
+ const uint64_t ld_input_col;
+ __fp16 *outptr;
+ const uint64_t ld_output_row;
+ const uint64_t ld_output_col;
+ const void *params;
+ const __fp16 min, max;
+
+ uint64_t tile_i = 0, tile_j = 0;
+
+ Args(
+ const unsigned int n_tile_rows,
+ const unsigned int n_tile_cols,
+ const __fp16 *inptr,
+ int64_t ld_input_row,
+ int64_t ld_input_col,
+ __fp16 *outptr,
+ int64_t ld_output_row,
+ int64_t ld_output_col,
+ const void *params,
+ const float activation_min,
+ const float activation_max
+ ) : n_tile_rows(n_tile_rows), n_tile_cols(n_tile_cols), inptr(inptr),
+ ld_input_row(ld_input_row), ld_input_col(ld_input_col), outptr(outptr),
+ ld_output_row(ld_output_row), ld_output_col(ld_output_col),
+ params(params), min(activation_min), max(activation_max)
+ {
+ }
+ };
+
+ Args params_struct(
+ n_tile_rows, n_tile_cols,
+ inptr, ld_input_row, ld_input_col,
+ outptr, ld_output_row, ld_output_col,
+ params, activation_min, activation_max
+ );
+
+ __asm__ __volatile__(
+ "mov x27, #0x0\n"
+ "mov x26, #0x0\n"
+ "1:" // Tile loop
+ "str x27, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "mov x23, #0x2\n"
+ "mov x25, #0x2\n"
+ "str x26, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "ldr x24, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
+ "ldr x2, [%x[params_struct], %[offsetof_args_ld_input_col]]\n"
+ "mul x22, x27, x24\n" // offset = tile_i * ld_input_row
+ "ldr x21, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
+ "madd x22, x26, x2, x22\n" // offset += tile_j * ld_input_col
+ "ldr x3, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
+ "lsl x2, x2, #0x1\n"
+ "mul x20, x27, x21\n" // offset = tile_i * ld_output_row
+ "ldr x4, [%x[params_struct], %[offsetof_args_inptr]]\n"
+ "ldr x5, [%x[params_struct], %[offsetof_args_outptr]]\n"
+ "add x6, x2, x2\n"
+ "mul x22, x22, x23\n" // offset *= kernel_stride * output_size
+ "add x4, x4, x22, LSL #1\n" // inptr[0] += offset * sizeof(__fp16)
+ "add x7, x4, x24, LSL #1\n"
+ "ldr x8, [%x[params_struct], %[offsetof_args_params]]\n"
+ "madd x20, x26, x3, x20\n" // offset += tile_j * ld_output_col
+ "add x17, x7, x24, LSL #1\n"
+ "mov x23, #0x10\n" // cntb _, ALL, #1
+ "mul x20, x20, x25\n" // offset *= output_tile_size
+ "lsr x22, %x[n_channels], #0x3\n"
+ "add x16, x17, x24, LSL #1\n"
+ "add x15, x6, x2\n"
+ "add x14, x16, x24, LSL #1\n"
+ "add x13, x15, x2\n"
+ "add x5, x5, x20, LSL #1\n" // outptrs[0] += offset * sizeof(__fp16)
+ "add x20, %x[params_struct], %[offsetof_args_min]\n"
+ "ld1r { v27.8h }, [x20]\n"
+ "add x20, %x[params_struct], %[offsetof_args_max]\n"
+ "ld1r { v15.8h }, [x20]\n"
+ "add x12, x14, x24, LSL #1\n"
+ "add x11, x13, x2\n"
+ "add x10, x5, x21, LSL #1\n"
+ "lsl x3, x3, #0x1\n"
+ "mov x21, #0x0\n"
+ "sub x20, XZR, x23\n"
+ "cbz x22, 4f\n"
+ "ldr q25, [x8, #0x0]\n"
+ "ldr q0, [x8, #0x10]\n"
+ "cmp x23, x22, LSL #4\n"
+ "ldr q1, [x8, #0x20]\n"
+ "ldr q2, [x8, #0x30]\n"
+ "ldr q3, [x8, #0x40]\n"
+ "ldr q4, [x8, #0x50]\n"
+ "add x8, x8, #0x60\n"
+ "ld1 { v5.8h }, [x4]\n"
+ "ldr q6, [x4, x2]\n"
+ "ld1 { v7.8h }, [x7]\n"
+ "ldr q8, [x7, x2]\n"
+ "ldr q9, [x4, x6]\n"
+ "ldr q13, [x7, x6]\n"
+ "ldr q11, [x4, x15]\n"
+ "ldr q12, [x4, x13]\n"
+ "ldr q10, [x7, x11]\n"
+ "ld1 { v14.8h }, [x17]\n"
+ "bge 3f\n"
+ "2:" // Tile loop: Channel loop
+ "mov v30.16b, v25.16b\n fmla v30.8h, v0.8h, v5.8h\n"
+ "ldr q23, [x7, x15]\n"
+ "mov v31.16b, v25.16b\n fmla v31.8h, v0.8h, v6.8h\n"
+ "add x23, x23, #0x10\n"
+ "mov v29.16b, v25.16b\n fmla v29.8h, v0.8h, v7.8h\n"
+ "mov v28.16b, v25.16b\n fmla v28.8h, v0.8h, v8.8h\n"
+ "ldr q19, [x8, #0x0]\n"
+ "ldr q25, [x8, #0x140]\n"
+ "fmla v30.8h, v1.8h, v6.8h\n"
+ "ldr q21, [x7, x13]\n"
+ "fmla v31.8h, v1.8h, v9.8h\n"
+ "add x7, x7, #0x10\n"
+ "fmla v29.8h, v1.8h, v8.8h\n"
+ "fmla v28.8h, v1.8h, v13.8h\n"
+ "ldr q1, [x8, #0x10]\n"
+ "cmp x23, x22, LSL #4\n"
+ "fmla v30.8h, v2.8h, v9.8h\n"
+ "ldr q18, [x4, x11]\n"
+ "fmla v31.8h, v2.8h, v11.8h\n"
+ "add x4, x4, #0x10\n"
+ "fmla v29.8h, v2.8h, v13.8h\n"
+ "fmla v28.8h, v2.8h, v23.8h\n"
+ "ldr q17, [x8, #0x20]\n"
+ "add x20, x20, #0x10\n"
+ "fmla v30.8h, v3.8h, v11.8h\n"
+ "ldr q6, [x17, x2]\n"
+ "fmla v31.8h, v3.8h, v12.8h\n"
+ "add x21, x21, #0x10\n"
+ "fmla v29.8h, v3.8h, v23.8h\n"
+ "fmla v28.8h, v3.8h, v21.8h\n"
+ "ldr q16, [x8, #0x30]\n"
+ "fmla v30.8h, v4.8h, v12.8h\n"
+ "ldr q2, [x17, x6]\n"
+ "fmla v31.8h, v4.8h, v18.8h\n"
+ "ldr q0, [x17, x15]\n"
+ "fmla v29.8h, v4.8h, v21.8h\n"
+ "fmla v28.8h, v4.8h, v10.8h\n"
+ "ldr q20, [x8, #0x40]\n"
+ "fmla v30.8h, v19.8h, v7.8h\n"
+ "ld1 { v7.8h }, [x7]\n"
+ "fmla v31.8h, v19.8h, v8.8h\n"
+ "fmla v29.8h, v19.8h, v14.8h\n"
+ "fmla v28.8h, v19.8h, v6.8h\n"
+ "ldr q19, [x8, #0x50]\n"
+ "fmla v30.8h, v1.8h, v8.8h\n"
+ "ldr q26, [x17, x11]\n"
+ "fmla v31.8h, v1.8h, v13.8h\n"
+ "fmla v29.8h, v1.8h, v6.8h\n"
+ "fmla v28.8h, v1.8h, v2.8h\n"
+ "ldr q18, [x8, #0x60]\n"
+ "fmla v30.8h, v17.8h, v13.8h\n"
+ "ldr q1, [x17, x13]\n"
+ "fmla v31.8h, v17.8h, v23.8h\n"
+ "add x17, x17, #0x10\n"
+ "fmla v29.8h, v17.8h, v2.8h\n"
+ "fmla v28.8h, v17.8h, v0.8h\n"
+ "ldr q17, [x8, #0x70]\n"
+ "fmla v30.8h, v16.8h, v23.8h\n"
+ "ld1 { v24.8h }, [x16]\n"
+ "fmla v31.8h, v16.8h, v21.8h\n"
+ "fmla v29.8h, v16.8h, v0.8h\n"
+ "fmla v28.8h, v16.8h, v1.8h\n"
+ "ldr q16, [x8, #0x80]\n"
+ "fmla v30.8h, v20.8h, v21.8h\n"
+ "ldr q23, [x16, x2]\n"
+ "fmla v31.8h, v20.8h, v10.8h\n"
+ "ldr q22, [x16, x6]\n"
+ "fmla v29.8h, v20.8h, v1.8h\n"
+ "fmla v28.8h, v20.8h, v26.8h\n"
+ "ldr q21, [x8, #0x90]\n"
+ "fmla v30.8h, v19.8h, v14.8h\n"
+ "ldr q5, [x16, x11]\n"
+ "fmla v31.8h, v19.8h, v6.8h\n"
+ "fmla v29.8h, v19.8h, v24.8h\n"
+ "fmla v28.8h, v19.8h, v23.8h\n"
+ "ldr q11, [x8, #0xa0]\n"
+ "fmla v30.8h, v18.8h, v6.8h\n"
+ "ldr q20, [x16, x15]\n"
+ "fmla v31.8h, v18.8h, v2.8h\n"
+ "fmla v29.8h, v18.8h, v23.8h\n"
+ "fmla v28.8h, v18.8h, v22.8h\n"
+ "ldr q18, [x8, #0xb0]\n"
+ "fmla v30.8h, v17.8h, v2.8h\n"
+ "ldr q19, [x16, x13]\n"
+ "fmla v31.8h, v17.8h, v0.8h\n"
+ "add x16, x16, #0x10\n"
+ "fmla v29.8h, v17.8h, v22.8h\n"
+ "fmla v28.8h, v17.8h, v20.8h\n"
+ "ldr q17, [x8, #0xc0]\n"
+ "fmla v30.8h, v16.8h, v0.8h\n"
+ "ld1 { v0.8h }, [x14]\n"
+ "fmla v31.8h, v16.8h, v1.8h\n"
+ "fmla v29.8h, v16.8h, v20.8h\n"
+ "fmla v28.8h, v16.8h, v19.8h\n"
+ "ldr q16, [x8, #0xd0]\n"
+ "fmla v30.8h, v21.8h, v1.8h\n"
+ "ldr q4, [x14, x2]\n"
+ "fmla v31.8h, v21.8h, v26.8h\n"
+ "ldr q12, [x14, x13]\n"
+ "fmla v29.8h, v21.8h, v19.8h\n"
+ "fmla v28.8h, v21.8h, v5.8h\n"
+ "ldr q13, [x8, #0xe0]\n"
+ "fmla v30.8h, v11.8h, v24.8h\n"
+ "ldr q6, [x14, x6]\n"
+ "fmla v31.8h, v11.8h, v23.8h\n"
+ "fmla v29.8h, v11.8h, v0.8h\n"
+ "fmla v28.8h, v11.8h, v4.8h\n"
+ "ldr q24, [x8, #0xf0]\n"
+ "fmla v30.8h, v18.8h, v23.8h\n"
+ "ldr q26, [x14, x15]\n"
+ "fmla v31.8h, v18.8h, v22.8h\n"
+ "fmla v29.8h, v18.8h, v4.8h\n"
+ "fmla v28.8h, v18.8h, v6.8h\n"
+ "ldr q23, [x8, #0x100]\n"
+ "fmla v30.8h, v17.8h, v22.8h\n"
+ "ldr q22, [x14, x11]\n"
+ "fmla v31.8h, v17.8h, v20.8h\n"
+ "add x14, x14, #0x10\n"
+ "fmla v29.8h, v17.8h, v6.8h\n"
+ "fmla v28.8h, v17.8h, v26.8h\n"
+ "ldr q21, [x8, #0x110]\n"
+ "fmla v30.8h, v16.8h, v20.8h\n"
+ "ld1 { v18.8h }, [x12]\n"
+ "fmla v31.8h, v16.8h, v19.8h\n"
+ "fmla v29.8h, v16.8h, v26.8h\n"
+ "fmla v28.8h, v16.8h, v12.8h\n"
+ "ldr q20, [x8, #0x120]\n"
+ "fmla v30.8h, v13.8h, v19.8h\n"
+ "ldr q17, [x12, x2]\n"
+ "fmla v31.8h, v13.8h, v5.8h\n"
+ "ld1 { v14.8h }, [x17]\n"
+ "fmla v29.8h, v13.8h, v12.8h\n"
+ "fmla v28.8h, v13.8h, v22.8h\n"
+ "ldr q19, [x8, #0x130]\n"
+ "fmla v30.8h, v24.8h, v0.8h\n"
+ "ldr q16, [x12, x6]\n"
+ "fmla v31.8h, v24.8h, v4.8h\n"
+ "fmla v29.8h, v24.8h, v18.8h\n"
+ "ldr q18, [x12, x15]\n"
+ "fmla v28.8h, v24.8h, v17.8h\n"
+ "ldr q0, [x8, #0x150]\n"
+ "fmla v30.8h, v23.8h, v4.8h\n"
+ "ldr q13, [x7, x6]\n"
+ "fmla v31.8h, v23.8h, v6.8h\n"
+ "fmla v29.8h, v23.8h, v17.8h\n"
+ "ldr q17, [x12, x13]\n"
+ "fmla v28.8h, v23.8h, v16.8h\n"
+ "ldr q1, [x8, #0x160]\n"
+ "fmla v30.8h, v21.8h, v6.8h\n"
+ "ld1 { v5.8h }, [x4]\n"
+ "fmla v31.8h, v21.8h, v26.8h\n"
+ "fmla v29.8h, v21.8h, v16.8h\n"
+ "ldr q16, [x12, x11]\n"
+ "fmla v28.8h, v21.8h, v18.8h\n"
+ "ldr q2, [x8, #0x170]\n"
+ "fmla v30.8h, v20.8h, v26.8h\n"
+ "ldr q6, [x4, x2]\n"
+ "fmla v31.8h, v20.8h, v12.8h\n"
+ "add x12, x12, #0x10\n"
+ "fmla v29.8h, v20.8h, v18.8h\n"
+ "ldr q11, [x4, x15]\n"
+ "fmla v28.8h, v20.8h, v17.8h\n"
+ "ldr q3, [x8, #0x180]\n"
+ "fmla v30.8h, v19.8h, v12.8h\n"
+ "ldr q8, [x7, x2]\n"
+ "fmla v31.8h, v19.8h, v22.8h\n"
+ "ldr q10, [x7, x11]\n"
+ "fmla v29.8h, v19.8h, v17.8h\n"
+ "ldr q12, [x4, x13]\n"
+ "fmla v28.8h, v19.8h, v16.8h\n"
+ "ldr q9, [x4, x6]\n"
+ "ldr q4, [x8, #0x190]\n"
+ "fmax v30.8h, v30.8h, v27.8h\n"
+ "fmax v31.8h, v31.8h, v27.8h\n"
+ "add x8, x8, #0x1a0\n"
+ "fmax v29.8h, v29.8h, v27.8h\n"
+ "fmax v28.8h, v28.8h, v27.8h\n"
+ "fmin v30.8h, v30.8h, v15.8h\n"
+ "fmin v31.8h, v31.8h, v15.8h\n"
+ "st1 { v30.8h }, [x5]\n"
+ "fmin v29.8h, v29.8h, v15.8h\n"
+ "fmin v28.8h, v28.8h, v15.8h\n"
+ "str q31, [x5, x3]\n"
+ "add x5, x5, #0x10\n"
+ "st1 { v29.8h }, [x10]\n"
+ "str q28, [x10, x3]\n"
+ "add x10, x10, #0x10\n"
+ "blt 2b\n"
+ "3:" // Tile loop: Channel tail
+ "mov v31.16b, v25.16b\n fmla v31.8h, v0.8h, v5.8h\n"
+ "ldr q22, [x7, x15]\n"
+ "mov v5.16b, v25.16b\n fmla v5.8h, v0.8h, v6.8h\n"
+ "mov v30.16b, v25.16b\n fmla v30.8h, v0.8h, v7.8h\n"
+ "mov v29.16b, v25.16b\n fmla v29.8h, v0.8h, v8.8h\n"
+ "ldr q19, [x8, #0x0]\n"
+ "fmla v31.8h, v1.8h, v6.8h\n"
+ "ldr q21, [x7, x13]\n"
+ "fmla v5.8h, v1.8h, v9.8h\n"
+ "add x7, x7, #0x10\n"
+ "fmla v30.8h, v1.8h, v8.8h\n"
+ "fmla v29.8h, v1.8h, v13.8h\n"
+ "ldr q18, [x8, #0x10]\n"
+ "fmla v31.8h, v2.8h, v9.8h\n"
+ "ldr q16, [x4, x11]\n"
+ "fmla v5.8h, v2.8h, v11.8h\n"
+ "add x4, x4, #0x10\n"
+ "fmla v30.8h, v2.8h, v13.8h\n"
+ "fmla v29.8h, v2.8h, v22.8h\n"
+ "ldr q17, [x8, #0x20]\n"
+ "fmla v31.8h, v3.8h, v11.8h\n"
+ "ldr q6, [x17, x2]\n"
+ "fmla v5.8h, v3.8h, v12.8h\n"
+ "fmla v30.8h, v3.8h, v22.8h\n"
+ "fmla v29.8h, v3.8h, v21.8h\n"
+ "ldr q20, [x8, #0x30]\n"
+ "fmla v31.8h, v4.8h, v12.8h\n"
+ "ldr q2, [x17, x6]\n"
+ "fmla v5.8h, v4.8h, v16.8h\n"
+ "ldr q28, [x17, x15]\n"
+ "fmla v30.8h, v4.8h, v21.8h\n"
+ "fmla v29.8h, v4.8h, v10.8h\n"
+ "ldr q16, [x8, #0x40]\n"
+ "fmla v31.8h, v19.8h, v7.8h\n"
+ "fmla v5.8h, v19.8h, v8.8h\n"
+ "fmla v30.8h, v19.8h, v14.8h\n"
+ "fmla v29.8h, v19.8h, v6.8h\n"
+ "ldr q19, [x8, #0x50]\n"
+ "fmla v31.8h, v18.8h, v8.8h\n"
+ "ldr q1, [x17, x11]\n"
+ "fmla v5.8h, v18.8h, v13.8h\n"
+ "fmla v30.8h, v18.8h, v6.8h\n"
+ "fmla v29.8h, v18.8h, v2.8h\n"
+ "ldr q18, [x8, #0x60]\n"
+ "fmla v31.8h, v17.8h, v13.8h\n"
+ "ldr q26, [x17, x13]\n"
+ "fmla v5.8h, v17.8h, v22.8h\n"
+ "add x17, x17, #0x10\n"
+ "fmla v30.8h, v17.8h, v2.8h\n"
+ "fmla v29.8h, v17.8h, v28.8h\n"
+ "ldr q17, [x8, #0x70]\n"
+ "fmla v31.8h, v20.8h, v22.8h\n"
+ "ld1 { v25.8h }, [x16]\n"
+ "fmla v5.8h, v20.8h, v21.8h\n"
+ "fmla v30.8h, v20.8h, v28.8h\n"
+ "fmla v29.8h, v20.8h, v26.8h\n"
+ "ldr q24, [x8, #0x80]\n"
+ "fmla v31.8h, v16.8h, v21.8h\n"
+ "ldr q23, [x16, x2]\n"
+ "fmla v5.8h, v16.8h, v10.8h\n"
+ "ldr q0, [x16, x6]\n"
+ "fmla v30.8h, v16.8h, v26.8h\n"
+ "fmla v29.8h, v16.8h, v1.8h\n"
+ "ldr q22, [x8, #0x90]\n"
+ "fmla v31.8h, v19.8h, v14.8h\n"
+ "ldr q16, [x16, x11]\n"
+ "fmla v5.8h, v19.8h, v6.8h\n"
+ "fmla v30.8h, v19.8h, v25.8h\n"
+ "fmla v29.8h, v19.8h, v23.8h\n"
+ "ldr q21, [x8, #0xa0]\n"
+ "fmla v31.8h, v18.8h, v6.8h\n"
+ "ldr q20, [x16, x15]\n"
+ "fmla v5.8h, v18.8h, v2.8h\n"
+ "fmla v30.8h, v18.8h, v23.8h\n"
+ "fmla v29.8h, v18.8h, v0.8h\n"
+ "ldr q18, [x8, #0xb0]\n"
+ "fmla v31.8h, v17.8h, v2.8h\n"
+ "ldr q19, [x16, x13]\n"
+ "fmla v5.8h, v17.8h, v28.8h\n"
+ "add x16, x16, #0x10\n"
+ "fmla v30.8h, v17.8h, v0.8h\n"
+ "fmla v29.8h, v17.8h, v20.8h\n"
+ "ldr q17, [x8, #0xc0]\n"
+ "fmla v31.8h, v24.8h, v28.8h\n"
+ "ld1 { v7.8h }, [x14]\n"
+ "fmla v5.8h, v24.8h, v26.8h\n"
+ "fmla v30.8h, v24.8h, v20.8h\n"
+ "fmla v29.8h, v24.8h, v19.8h\n"
+ "ldr q2, [x8, #0xd0]\n"
+ "fmla v31.8h, v22.8h, v26.8h\n"
+ "ldr q28, [x14, x2]\n"
+ "fmla v5.8h, v22.8h, v1.8h\n"
+ "ldr q13, [x14, x13]\n"
+ "fmla v30.8h, v22.8h, v19.8h\n"
+ "fmla v29.8h, v22.8h, v16.8h\n"
+ "ldr q14, [x8, #0xe0]\n"
+ "fmla v31.8h, v21.8h, v25.8h\n"
+ "ldr q26, [x14, x6]\n"
+ "fmla v5.8h, v21.8h, v23.8h\n"
+ "fmla v30.8h, v21.8h, v7.8h\n"
+ "fmla v29.8h, v21.8h, v28.8h\n"
+ "ldr q25, [x8, #0xf0]\n"
+ "fmla v31.8h, v18.8h, v23.8h\n"
+ "ldr q24, [x14, x15]\n"
+ "fmla v5.8h, v18.8h, v0.8h\n"
+ "fmla v30.8h, v18.8h, v28.8h\n"
+ "fmla v29.8h, v18.8h, v26.8h\n"
+ "ldr q23, [x8, #0x100]\n"
+ "fmla v31.8h, v17.8h, v0.8h\n"
+ "ldr q22, [x14, x11]\n"
+ "fmla v5.8h, v17.8h, v20.8h\n"
+ "add x14, x14, #0x10\n"
+ "fmla v30.8h, v17.8h, v26.8h\n"
+ "fmla v29.8h, v17.8h, v24.8h\n"
+ "ldr q21, [x8, #0x110]\n"
+ "fmla v31.8h, v2.8h, v20.8h\n"
+ "ld1 { v18.8h }, [x12]\n"
+ "fmla v5.8h, v2.8h, v19.8h\n"
+ "fmla v30.8h, v2.8h, v24.8h\n"
+ "fmla v29.8h, v2.8h, v13.8h\n"
+ "ldr q20, [x8, #0x120]\n"
+ "fmla v31.8h, v14.8h, v19.8h\n"
+ "ldr q17, [x12, x2]\n"
+ "fmla v5.8h, v14.8h, v16.8h\n"
+ "fmla v30.8h, v14.8h, v13.8h\n"
+ "fmla v29.8h, v14.8h, v22.8h\n"
+ "ldr q19, [x8, #0x130]\n"
+ "add x8, x8, #0x140\n"
+ "fmla v31.8h, v25.8h, v7.8h\n"
+ "ldr q16, [x12, x6]\n"
+ "fmla v5.8h, v25.8h, v28.8h\n"
+ "fmla v30.8h, v25.8h, v18.8h\n"
+ "ldr q18, [x12, x15]\n"
+ "fmla v29.8h, v25.8h, v17.8h\n"
+ "fmla v31.8h, v23.8h, v28.8h\n"
+ "fmla v5.8h, v23.8h, v26.8h\n"
+ "fmla v30.8h, v23.8h, v17.8h\n"
+ "ldr q17, [x12, x13]\n"
+ "fmla v29.8h, v23.8h, v16.8h\n"
+ "fmla v31.8h, v21.8h, v26.8h\n"
+ "fmla v5.8h, v21.8h, v24.8h\n"
+ "fmla v30.8h, v21.8h, v16.8h\n"
+ "ldr q16, [x12, x11]\n"
+ "fmla v29.8h, v21.8h, v18.8h\n"
+ "add x12, x12, #0x10\n"
+ "fmla v31.8h, v20.8h, v24.8h\n"
+ "fmla v5.8h, v20.8h, v13.8h\n"
+ "fmla v30.8h, v20.8h, v18.8h\n"
+ "fmla v29.8h, v20.8h, v17.8h\n"
+ "fmla v31.8h, v19.8h, v13.8h\n"
+ "fmla v5.8h, v19.8h, v22.8h\n"
+ "fmax v31.8h, v31.8h, v27.8h\n"
+ "fmla v30.8h, v19.8h, v17.8h\n"
+ "fmla v29.8h, v19.8h, v16.8h\n"
+ "fmax v5.8h, v5.8h, v27.8h\n"
+ "fmax v30.8h, v30.8h, v27.8h\n"
+ "fmax v29.8h, v29.8h, v27.8h\n"
+ "fmin v31.8h, v31.8h, v15.8h\n"
+ "fmin v5.8h, v5.8h, v15.8h\n"
+ "st1 { v31.8h }, [x5]\n"
+ "fmin v30.8h, v30.8h, v15.8h\n"
+ "fmin v29.8h, v29.8h, v15.8h\n"
+ "str q5, [x5, x3]\n"
+ "add x5, x5, #0x10\n"
+ "st1 { v30.8h }, [x10]\n"
+ "str q29, [x10, x3]\n"
+ "add x10, x10, #0x10\n"
+ "4:" // Tile loop: Oddments
+ "tst %x[n_channels], #0x7\n"
+ "beq 117f\n"
+ "ldr q25, [x8, #0x0]\n"
+ "ldr q0, [x8, #0x10]\n"
+ "add x9, x4, XZR\n"
+ "add x28, x4, x2\n"
+ "ldr q1, [x8, #0x20]\n"
+ "ldr q2, [x8, #0x30]\n"
+ "add x27, x7, XZR\n"
+ "add x26, x7, x2\n"
+ "ldr q3, [x8, #0x40]\n"
+ "ldr q4, [x8, #0x50]\n"
+ "add x25, x4, x6\n"
+ "add x24, x7, x6\n"
+ "add x23, x4, x15\n"
+ "add x22, x4, x13\n"
+ "add x21, x7, x11\n"
+ "add x20, x17, XZR\n"
+ "add x8, x8, #0x60\n"
+ "tbz %x[n_channels], #2, 6f\n"
+ "ldr d5, [x9], #0x8\n"
+ "ldr d6, [x28], #0x8\n"
+ "ldr d7, [x27], #0x8\n"
+ "ldr d8, [x26], #0x8\n"
+ "ldr d9, [x25], #0x8\n"
+ "ldr d13, [x24], #0x8\n"
+ "ldr d11, [x23], #0x8\n"
+ "ldr d12, [x22], #0x8\n"
+ "ldr d10, [x21], #0x8\n"
+ "ldr d14, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 5f\n"
+ "ld1 { v5.s }[2], [x9], #0x4\n"
+ "ld1 { v6.s }[2], [x28], #0x4\n"
+ "ld1 { v7.s }[2], [x27], #0x4\n"
+ "ld1 { v8.s }[2], [x26], #0x4\n"
+ "ld1 { v9.s }[2], [x25], #0x4\n"
+ "ld1 { v13.s }[2], [x24], #0x4\n"
+ "ld1 { v11.s }[2], [x23], #0x4\n"
+ "ld1 { v12.s }[2], [x22], #0x4\n"
+ "ld1 { v10.s }[2], [x21], #0x4\n"
+ "ld1 { v14.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 8f\n"
+ "ld1 { v5.h }[6], [x9]\n"
+ "ld1 { v6.h }[6], [x28]\n"
+ "ld1 { v7.h }[6], [x27]\n"
+ "ld1 { v8.h }[6], [x26]\n"
+ "ld1 { v9.h }[6], [x25]\n"
+ "ld1 { v13.h }[6], [x24]\n"
+ "ld1 { v11.h }[6], [x23]\n"
+ "ld1 { v12.h }[6], [x22]\n"
+ "ld1 { v10.h }[6], [x21]\n"
+ "ld1 { v14.h }[6], [x20]\n"
+ "b 8f\n"
+ "5:" // Tile loop: Oddments: Load inputs: (0, 0), (0, 1), (1, 0), (1, 1), (0, 2), (1, 2), (0, 3), (0, 4), (1, 5), (2, 0): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 8f\n"
+ "ld1 { v5.h }[4], [x9]\n"
+ "ld1 { v6.h }[4], [x28]\n"
+ "ld1 { v7.h }[4], [x27]\n"
+ "ld1 { v8.h }[4], [x26]\n"
+ "ld1 { v9.h }[4], [x25]\n"
+ "ld1 { v13.h }[4], [x24]\n"
+ "ld1 { v11.h }[4], [x23]\n"
+ "ld1 { v12.h }[4], [x22]\n"
+ "ld1 { v10.h }[4], [x21]\n"
+ "ld1 { v14.h }[4], [x20]\n"
+ "b 8f\n"
+ "6:" // Tile loop: Oddments: Load inputs: (0, 0), (0, 1), (1, 0), (1, 1), (0, 2), (1, 2), (0, 3), (0, 4), (1, 5), (2, 0): Bit 2: Unset
+ "tbz %x[n_channels], #1, 7f\n"
+ "ldr s5, [x9], #0x4\n"
+ "ldr s6, [x28], #0x4\n"
+ "ldr s7, [x27], #0x4\n"
+ "ldr s8, [x26], #0x4\n"
+ "ldr s9, [x25], #0x4\n"
+ "ldr s13, [x24], #0x4\n"
+ "ldr s11, [x23], #0x4\n"
+ "ldr s12, [x22], #0x4\n"
+ "ldr s10, [x21], #0x4\n"
+ "ldr s14, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 8f\n"
+ "ld1 { v5.h }[2], [x9]\n"
+ "ld1 { v6.h }[2], [x28]\n"
+ "ld1 { v7.h }[2], [x27]\n"
+ "ld1 { v8.h }[2], [x26]\n"
+ "ld1 { v9.h }[2], [x25]\n"
+ "ld1 { v13.h }[2], [x24]\n"
+ "ld1 { v11.h }[2], [x23]\n"
+ "ld1 { v12.h }[2], [x22]\n"
+ "ld1 { v10.h }[2], [x21]\n"
+ "ld1 { v14.h }[2], [x20]\n"
+ "b 8f\n"
+ "7:" // Tile loop: Oddments: Load inputs: (0, 0), (0, 1), (1, 0), (1, 1), (0, 2), (1, 2), (0, 3), (0, 4), (1, 5), (2, 0): Bit 2: Unset: Bit 1: Unset
+ "ldr h5, [x9, #0x0]\n"
+ "ldr h6, [x28, #0x0]\n"
+ "ldr h7, [x27, #0x0]\n"
+ "ldr h8, [x26, #0x0]\n"
+ "ldr h9, [x25, #0x0]\n"
+ "ldr h13, [x24, #0x0]\n"
+ "ldr h11, [x23, #0x0]\n"
+ "ldr h12, [x22, #0x0]\n"
+ "ldr h10, [x21, #0x0]\n"
+ "ldr h14, [x20, #0x0]\n"
+ "8:" // Tile loop: Oddments: Load inputs: (0, 0), (0, 1), (1, 0), (1, 1), (0, 2), (1, 2), (0, 3), (0, 4), (1, 5), (2, 0): Bit 2: End
+ "mov v28.16b, v25.16b\n fmla v28.8h, v0.8h, v5.8h\n"
+ "mov v29.16b, v25.16b\n fmla v29.8h, v0.8h, v6.8h\n"
+ "add x20, x7, x15\n"
+ "mov v30.16b, v25.16b\n fmla v30.8h, v0.8h, v7.8h\n"
+ "mov v31.16b, v25.16b\n fmla v31.8h, v0.8h, v8.8h\n"
+ "fmla v28.8h, v1.8h, v6.8h\n"
+ "fmla v29.8h, v1.8h, v9.8h\n"
+ "fmla v30.8h, v1.8h, v8.8h\n"
+ "fmla v31.8h, v1.8h, v13.8h\n"
+ "fmla v28.8h, v2.8h, v9.8h\n"
+ "fmla v29.8h, v2.8h, v11.8h\n"
+ "fmla v30.8h, v2.8h, v13.8h\n"
+ "tbz %x[n_channels], #2, 10f\n"
+ "ldr d5, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 9f\n"
+ "ld1 { v5.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 12f\n"
+ "ld1 { v5.h }[6], [x20]\n"
+ "b 12f\n"
+ "9:" // Tile loop: Oddments: Load inputs: (1, 3): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 12f\n"
+ "ld1 { v5.h }[4], [x20]\n"
+ "b 12f\n"
+ "10:" // Tile loop: Oddments: Load inputs: (1, 3): Bit 2: Unset
+ "tbz %x[n_channels], #1, 11f\n"
+ "ldr s5, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 12f\n"
+ "ld1 { v5.h }[2], [x20]\n"
+ "b 12f\n"
+ "11:" // Tile loop: Oddments: Load inputs: (1, 3): Bit 2: Unset: Bit 1: Unset
+ "ldr h5, [x20, #0x0]\n"
+ "12:" // Tile loop: Oddments: Load inputs: (1, 3): Bit 2: End
+ "fmla v31.8h, v2.8h, v5.8h\n"
+ "fmla v28.8h, v3.8h, v11.8h\n"
+ "add x20, x7, x13\n"
+ "fmla v29.8h, v3.8h, v12.8h\n"
+ "fmla v30.8h, v3.8h, v5.8h\n"
+ "tbz %x[n_channels], #2, 14f\n"
+ "ldr d6, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 13f\n"
+ "ld1 { v6.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 16f\n"
+ "ld1 { v6.h }[6], [x20]\n"
+ "b 16f\n"
+ "13:" // Tile loop: Oddments: Load inputs: (1, 4): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 16f\n"
+ "ld1 { v6.h }[4], [x20]\n"
+ "b 16f\n"
+ "14:" // Tile loop: Oddments: Load inputs: (1, 4): Bit 2: Unset
+ "tbz %x[n_channels], #1, 15f\n"
+ "ldr s6, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 16f\n"
+ "ld1 { v6.h }[2], [x20]\n"
+ "b 16f\n"
+ "15:" // Tile loop: Oddments: Load inputs: (1, 4): Bit 2: Unset: Bit 1: Unset
+ "ldr h6, [x20, #0x0]\n"
+ "16:" // Tile loop: Oddments: Load inputs: (1, 4): Bit 2: End
+ "fmla v31.8h, v3.8h, v6.8h\n"
+ "fmla v28.8h, v4.8h, v12.8h\n"
+ "add x20, x4, x11\n"
+ "tbz %x[n_channels], #2, 18f\n"
+ "ldr d9, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 17f\n"
+ "ld1 { v9.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 20f\n"
+ "ld1 { v9.h }[6], [x20]\n"
+ "b 20f\n"
+ "17:" // Tile loop: Oddments: Load inputs: (0, 5): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 20f\n"
+ "ld1 { v9.h }[4], [x20]\n"
+ "b 20f\n"
+ "18:" // Tile loop: Oddments: Load inputs: (0, 5): Bit 2: Unset
+ "tbz %x[n_channels], #1, 19f\n"
+ "ldr s9, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 20f\n"
+ "ld1 { v9.h }[2], [x20]\n"
+ "b 20f\n"
+ "19:" // Tile loop: Oddments: Load inputs: (0, 5): Bit 2: Unset: Bit 1: Unset
+ "ldr h9, [x20, #0x0]\n"
+ "20:" // Tile loop: Oddments: Load inputs: (0, 5): Bit 2: End
+ "ldr q0, [x8, #0x0]\n"
+ "fmla v29.8h, v4.8h, v9.8h\n"
+ "fmla v30.8h, v4.8h, v6.8h\n"
+ "add x20, x17, x2\n"
+ "fmla v31.8h, v4.8h, v10.8h\n"
+ "fmla v28.8h, v0.8h, v7.8h\n"
+ "add x8, x8, #0x10\n"
+ "fmla v29.8h, v0.8h, v8.8h\n"
+ "fmla v30.8h, v0.8h, v14.8h\n"
+ "tbz %x[n_channels], #2, 22f\n"
+ "ldr d11, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 21f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 24f\n"
+ "ld1 { v11.h }[6], [x20]\n"
+ "b 24f\n"
+ "21:" // Tile loop: Oddments: Load inputs: (2, 1): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 24f\n"
+ "ld1 { v11.h }[4], [x20]\n"
+ "b 24f\n"
+ "22:" // Tile loop: Oddments: Load inputs: (2, 1): Bit 2: Unset
+ "tbz %x[n_channels], #1, 23f\n"
+ "ldr s11, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 24f\n"
+ "ld1 { v11.h }[2], [x20]\n"
+ "b 24f\n"
+ "23:" // Tile loop: Oddments: Load inputs: (2, 1): Bit 2: Unset: Bit 1: Unset
+ "ldr h11, [x20, #0x0]\n"
+ "24:" // Tile loop: Oddments: Load inputs: (2, 1): Bit 2: End
+ "ldr q1, [x8, #0x0]\n"
+ "fmla v31.8h, v0.8h, v11.8h\n"
+ "fmla v28.8h, v1.8h, v8.8h\n"
+ "add x20, x17, x6\n"
+ "fmla v29.8h, v1.8h, v13.8h\n"
+ "fmla v30.8h, v1.8h, v11.8h\n"
+ "add x8, x8, #0x10\n"
+ "tbz %x[n_channels], #2, 26f\n"
+ "ldr d12, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 25f\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 28f\n"
+ "ld1 { v12.h }[6], [x20]\n"
+ "b 28f\n"
+ "25:" // Tile loop: Oddments: Load inputs: (2, 2): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 28f\n"
+ "ld1 { v12.h }[4], [x20]\n"
+ "b 28f\n"
+ "26:" // Tile loop: Oddments: Load inputs: (2, 2): Bit 2: Unset
+ "tbz %x[n_channels], #1, 27f\n"
+ "ldr s12, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 28f\n"
+ "ld1 { v12.h }[2], [x20]\n"
+ "b 28f\n"
+ "27:" // Tile loop: Oddments: Load inputs: (2, 2): Bit 2: Unset: Bit 1: Unset
+ "ldr h12, [x20, #0x0]\n"
+ "28:" // Tile loop: Oddments: Load inputs: (2, 2): Bit 2: End
+ "ldr q2, [x8, #0x0]\n"
+ "fmla v31.8h, v1.8h, v12.8h\n"
+ "fmla v28.8h, v2.8h, v13.8h\n"
+ "add x20, x17, x15\n"
+ "fmla v29.8h, v2.8h, v5.8h\n"
+ "fmla v30.8h, v2.8h, v12.8h\n"
+ "add x8, x8, #0x10\n"
+ "tbz %x[n_channels], #2, 30f\n"
+ "ldr d9, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 29f\n"
+ "ld1 { v9.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 32f\n"
+ "ld1 { v9.h }[6], [x20]\n"
+ "b 32f\n"
+ "29:" // Tile loop: Oddments: Load inputs: (2, 3): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 32f\n"
+ "ld1 { v9.h }[4], [x20]\n"
+ "b 32f\n"
+ "30:" // Tile loop: Oddments: Load inputs: (2, 3): Bit 2: Unset
+ "tbz %x[n_channels], #1, 31f\n"
+ "ldr s9, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 32f\n"
+ "ld1 { v9.h }[2], [x20]\n"
+ "b 32f\n"
+ "31:" // Tile loop: Oddments: Load inputs: (2, 3): Bit 2: Unset: Bit 1: Unset
+ "ldr h9, [x20, #0x0]\n"
+ "32:" // Tile loop: Oddments: Load inputs: (2, 3): Bit 2: End
+ "ldr q3, [x8, #0x0]\n"
+ "fmla v31.8h, v2.8h, v9.8h\n"
+ "fmla v28.8h, v3.8h, v5.8h\n"
+ "add x20, x17, x13\n"
+ "fmla v29.8h, v3.8h, v6.8h\n"
+ "fmla v30.8h, v3.8h, v9.8h\n"
+ "add x8, x8, #0x10\n"
+ "tbz %x[n_channels], #2, 34f\n"
+ "ldr d13, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 33f\n"
+ "ld1 { v13.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 36f\n"
+ "ld1 { v13.h }[6], [x20]\n"
+ "b 36f\n"
+ "33:" // Tile loop: Oddments: Load inputs: (2, 4): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 36f\n"
+ "ld1 { v13.h }[4], [x20]\n"
+ "b 36f\n"
+ "34:" // Tile loop: Oddments: Load inputs: (2, 4): Bit 2: Unset
+ "tbz %x[n_channels], #1, 35f\n"
+ "ldr s13, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 36f\n"
+ "ld1 { v13.h }[2], [x20]\n"
+ "b 36f\n"
+ "35:" // Tile loop: Oddments: Load inputs: (2, 4): Bit 2: Unset: Bit 1: Unset
+ "ldr h13, [x20, #0x0]\n"
+ "36:" // Tile loop: Oddments: Load inputs: (2, 4): Bit 2: End
+ "ldr q4, [x8, #0x0]\n"
+ "fmla v31.8h, v3.8h, v13.8h\n"
+ "fmla v28.8h, v4.8h, v6.8h\n"
+ "add x20, x17, x11\n"
+ "fmla v29.8h, v4.8h, v10.8h\n"
+ "fmla v30.8h, v4.8h, v13.8h\n"
+ "add x8, x8, #0x10\n"
+ "tbz %x[n_channels], #2, 38f\n"
+ "ldr d8, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 37f\n"
+ "ld1 { v8.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 40f\n"
+ "ld1 { v8.h }[6], [x20]\n"
+ "b 40f\n"
+ "37:" // Tile loop: Oddments: Load inputs: (2, 5): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 40f\n"
+ "ld1 { v8.h }[4], [x20]\n"
+ "b 40f\n"
+ "38:" // Tile loop: Oddments: Load inputs: (2, 5): Bit 2: Unset
+ "tbz %x[n_channels], #1, 39f\n"
+ "ldr s8, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 40f\n"
+ "ld1 { v8.h }[2], [x20]\n"
+ "b 40f\n"
+ "39:" // Tile loop: Oddments: Load inputs: (2, 5): Bit 2: Unset: Bit 1: Unset
+ "ldr h8, [x20, #0x0]\n"
+ "40:" // Tile loop: Oddments: Load inputs: (2, 5): Bit 2: End
+ "ldr q0, [x8, #0x0]\n"
+ "fmla v31.8h, v4.8h, v8.8h\n"
+ "fmla v28.8h, v0.8h, v14.8h\n"
+ "add x20, x16, XZR\n"
+ "fmla v29.8h, v0.8h, v11.8h\n"
+ "add x8, x8, #0x10\n"
+ "tbz %x[n_channels], #2, 42f\n"
+ "ldr d5, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 41f\n"
+ "ld1 { v5.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 44f\n"
+ "ld1 { v5.h }[6], [x20]\n"
+ "b 44f\n"
+ "41:" // Tile loop: Oddments: Load inputs: (3, 0): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 44f\n"
+ "ld1 { v5.h }[4], [x20]\n"
+ "b 44f\n"
+ "42:" // Tile loop: Oddments: Load inputs: (3, 0): Bit 2: Unset
+ "tbz %x[n_channels], #1, 43f\n"
+ "ldr s5, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 44f\n"
+ "ld1 { v5.h }[2], [x20]\n"
+ "b 44f\n"
+ "43:" // Tile loop: Oddments: Load inputs: (3, 0): Bit 2: Unset: Bit 1: Unset
+ "ldr h5, [x20, #0x0]\n"
+ "44:" // Tile loop: Oddments: Load inputs: (3, 0): Bit 2: End
+ "fmla v30.8h, v0.8h, v5.8h\n"
+ "add x20, x16, x2\n"
+ "tbz %x[n_channels], #2, 46f\n"
+ "ldr d6, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 45f\n"
+ "ld1 { v6.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 48f\n"
+ "ld1 { v6.h }[6], [x20]\n"
+ "b 48f\n"
+ "45:" // Tile loop: Oddments: Load inputs: (3, 1): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 48f\n"
+ "ld1 { v6.h }[4], [x20]\n"
+ "b 48f\n"
+ "46:" // Tile loop: Oddments: Load inputs: (3, 1): Bit 2: Unset
+ "tbz %x[n_channels], #1, 47f\n"
+ "ldr s6, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 48f\n"
+ "ld1 { v6.h }[2], [x20]\n"
+ "b 48f\n"
+ "47:" // Tile loop: Oddments: Load inputs: (3, 1): Bit 2: Unset: Bit 1: Unset
+ "ldr h6, [x20, #0x0]\n"
+ "48:" // Tile loop: Oddments: Load inputs: (3, 1): Bit 2: End
+ "ldr q1, [x8, #0x0]\n"
+ "fmla v31.8h, v0.8h, v6.8h\n"
+ "fmla v28.8h, v1.8h, v11.8h\n"
+ "add x20, x16, x6\n"
+ "fmla v29.8h, v1.8h, v12.8h\n"
+ "fmla v30.8h, v1.8h, v6.8h\n"
+ "add x8, x8, #0x10\n"
+ "tbz %x[n_channels], #2, 50f\n"
+ "ldr d10, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 49f\n"
+ "ld1 { v10.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 52f\n"
+ "ld1 { v10.h }[6], [x20]\n"
+ "b 52f\n"
+ "49:" // Tile loop: Oddments: Load inputs: (3, 2): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 52f\n"
+ "ld1 { v10.h }[4], [x20]\n"
+ "b 52f\n"
+ "50:" // Tile loop: Oddments: Load inputs: (3, 2): Bit 2: Unset
+ "tbz %x[n_channels], #1, 51f\n"
+ "ldr s10, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 52f\n"
+ "ld1 { v10.h }[2], [x20]\n"
+ "b 52f\n"
+ "51:" // Tile loop: Oddments: Load inputs: (3, 2): Bit 2: Unset: Bit 1: Unset
+ "ldr h10, [x20, #0x0]\n"
+ "52:" // Tile loop: Oddments: Load inputs: (3, 2): Bit 2: End
+ "ldr q2, [x8, #0x0]\n"
+ "fmla v31.8h, v1.8h, v10.8h\n"
+ "fmla v28.8h, v2.8h, v12.8h\n"
+ "add x20, x16, x15\n"
+ "fmla v29.8h, v2.8h, v9.8h\n"
+ "fmla v30.8h, v2.8h, v10.8h\n"
+ "add x8, x8, #0x10\n"
+ "tbz %x[n_channels], #2, 54f\n"
+ "ldr d11, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 53f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 56f\n"
+ "ld1 { v11.h }[6], [x20]\n"
+ "b 56f\n"
+ "53:" // Tile loop: Oddments: Load inputs: (3, 3): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 56f\n"
+ "ld1 { v11.h }[4], [x20]\n"
+ "b 56f\n"
+ "54:" // Tile loop: Oddments: Load inputs: (3, 3): Bit 2: Unset
+ "tbz %x[n_channels], #1, 55f\n"
+ "ldr s11, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 56f\n"
+ "ld1 { v11.h }[2], [x20]\n"
+ "b 56f\n"
+ "55:" // Tile loop: Oddments: Load inputs: (3, 3): Bit 2: Unset: Bit 1: Unset
+ "ldr h11, [x20, #0x0]\n"
+ "56:" // Tile loop: Oddments: Load inputs: (3, 3): Bit 2: End
+ "ldr q3, [x8, #0x0]\n"
+ "fmla v31.8h, v2.8h, v11.8h\n"
+ "fmla v28.8h, v3.8h, v9.8h\n"
+ "add x20, x16, x13\n"
+ "fmla v29.8h, v3.8h, v13.8h\n"
+ "fmla v30.8h, v3.8h, v11.8h\n"
+ "add x8, x8, #0x10\n"
+ "tbz %x[n_channels], #2, 58f\n"
+ "ldr d12, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 57f\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 60f\n"
+ "ld1 { v12.h }[6], [x20]\n"
+ "b 60f\n"
+ "57:" // Tile loop: Oddments: Load inputs: (3, 4): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 60f\n"
+ "ld1 { v12.h }[4], [x20]\n"
+ "b 60f\n"
+ "58:" // Tile loop: Oddments: Load inputs: (3, 4): Bit 2: Unset
+ "tbz %x[n_channels], #1, 59f\n"
+ "ldr s12, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 60f\n"
+ "ld1 { v12.h }[2], [x20]\n"
+ "b 60f\n"
+ "59:" // Tile loop: Oddments: Load inputs: (3, 4): Bit 2: Unset: Bit 1: Unset
+ "ldr h12, [x20, #0x0]\n"
+ "60:" // Tile loop: Oddments: Load inputs: (3, 4): Bit 2: End
+ "ldr q4, [x8, #0x0]\n"
+ "fmla v31.8h, v3.8h, v12.8h\n"
+ "fmla v28.8h, v4.8h, v13.8h\n"
+ "add x20, x16, x11\n"
+ "fmla v29.8h, v4.8h, v8.8h\n"
+ "fmla v30.8h, v4.8h, v12.8h\n"
+ "add x8, x8, #0x10\n"
+ "tbz %x[n_channels], #2, 62f\n"
+ "ldr d14, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 61f\n"
+ "ld1 { v14.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 64f\n"
+ "ld1 { v14.h }[6], [x20]\n"
+ "b 64f\n"
+ "61:" // Tile loop: Oddments: Load inputs: (3, 5): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 64f\n"
+ "ld1 { v14.h }[4], [x20]\n"
+ "b 64f\n"
+ "62:" // Tile loop: Oddments: Load inputs: (3, 5): Bit 2: Unset
+ "tbz %x[n_channels], #1, 63f\n"
+ "ldr s14, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 64f\n"
+ "ld1 { v14.h }[2], [x20]\n"
+ "b 64f\n"
+ "63:" // Tile loop: Oddments: Load inputs: (3, 5): Bit 2: Unset: Bit 1: Unset
+ "ldr h14, [x20, #0x0]\n"
+ "64:" // Tile loop: Oddments: Load inputs: (3, 5): Bit 2: End
+ "ldr q0, [x8, #0x0]\n"
+ "fmla v31.8h, v4.8h, v14.8h\n"
+ "fmla v28.8h, v0.8h, v5.8h\n"
+ "add x20, x14, XZR\n"
+ "fmla v29.8h, v0.8h, v6.8h\n"
+ "add x8, x8, #0x10\n"
+ "tbz %x[n_channels], #2, 66f\n"
+ "ldr d9, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 65f\n"
+ "ld1 { v9.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 68f\n"
+ "ld1 { v9.h }[6], [x20]\n"
+ "b 68f\n"
+ "65:" // Tile loop: Oddments: Load inputs: (4, 0): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 68f\n"
+ "ld1 { v9.h }[4], [x20]\n"
+ "b 68f\n"
+ "66:" // Tile loop: Oddments: Load inputs: (4, 0): Bit 2: Unset
+ "tbz %x[n_channels], #1, 67f\n"
+ "ldr s9, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 68f\n"
+ "ld1 { v9.h }[2], [x20]\n"
+ "b 68f\n"
+ "67:" // Tile loop: Oddments: Load inputs: (4, 0): Bit 2: Unset: Bit 1: Unset
+ "ldr h9, [x20, #0x0]\n"
+ "68:" // Tile loop: Oddments: Load inputs: (4, 0): Bit 2: End
+ "fmla v30.8h, v0.8h, v9.8h\n"
+ "add x20, x14, x2\n"
+ "tbz %x[n_channels], #2, 70f\n"
+ "ldr d13, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 69f\n"
+ "ld1 { v13.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 72f\n"
+ "ld1 { v13.h }[6], [x20]\n"
+ "b 72f\n"
+ "69:" // Tile loop: Oddments: Load inputs: (4, 1): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 72f\n"
+ "ld1 { v13.h }[4], [x20]\n"
+ "b 72f\n"
+ "70:" // Tile loop: Oddments: Load inputs: (4, 1): Bit 2: Unset
+ "tbz %x[n_channels], #1, 71f\n"
+ "ldr s13, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 72f\n"
+ "ld1 { v13.h }[2], [x20]\n"
+ "b 72f\n"
+ "71:" // Tile loop: Oddments: Load inputs: (4, 1): Bit 2: Unset: Bit 1: Unset
+ "ldr h13, [x20, #0x0]\n"
+ "72:" // Tile loop: Oddments: Load inputs: (4, 1): Bit 2: End
+ "ldr q1, [x8, #0x0]\n"
+ "fmla v31.8h, v0.8h, v13.8h\n"
+ "fmla v28.8h, v1.8h, v6.8h\n"
+ "add x20, x14, x6\n"
+ "fmla v29.8h, v1.8h, v10.8h\n"
+ "fmla v30.8h, v1.8h, v13.8h\n"
+ "add x8, x8, #0x10\n"
+ "tbz %x[n_channels], #2, 74f\n"
+ "ldr d5, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 73f\n"
+ "ld1 { v5.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 76f\n"
+ "ld1 { v5.h }[6], [x20]\n"
+ "b 76f\n"
+ "73:" // Tile loop: Oddments: Load inputs: (4, 2): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 76f\n"
+ "ld1 { v5.h }[4], [x20]\n"
+ "b 76f\n"
+ "74:" // Tile loop: Oddments: Load inputs: (4, 2): Bit 2: Unset
+ "tbz %x[n_channels], #1, 75f\n"
+ "ldr s5, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 76f\n"
+ "ld1 { v5.h }[2], [x20]\n"
+ "b 76f\n"
+ "75:" // Tile loop: Oddments: Load inputs: (4, 2): Bit 2: Unset: Bit 1: Unset
+ "ldr h5, [x20, #0x0]\n"
+ "76:" // Tile loop: Oddments: Load inputs: (4, 2): Bit 2: End
+ "ldr q2, [x8, #0x0]\n"
+ "fmla v31.8h, v1.8h, v5.8h\n"
+ "fmla v28.8h, v2.8h, v10.8h\n"
+ "add x20, x14, x15\n"
+ "fmla v29.8h, v2.8h, v11.8h\n"
+ "fmla v30.8h, v2.8h, v5.8h\n"
+ "add x8, x8, #0x10\n"
+ "tbz %x[n_channels], #2, 78f\n"
+ "ldr d6, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 77f\n"
+ "ld1 { v6.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 80f\n"
+ "ld1 { v6.h }[6], [x20]\n"
+ "b 80f\n"
+ "77:" // Tile loop: Oddments: Load inputs: (4, 3): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 80f\n"
+ "ld1 { v6.h }[4], [x20]\n"
+ "b 80f\n"
+ "78:" // Tile loop: Oddments: Load inputs: (4, 3): Bit 2: Unset
+ "tbz %x[n_channels], #1, 79f\n"
+ "ldr s6, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 80f\n"
+ "ld1 { v6.h }[2], [x20]\n"
+ "b 80f\n"
+ "79:" // Tile loop: Oddments: Load inputs: (4, 3): Bit 2: Unset: Bit 1: Unset
+ "ldr h6, [x20, #0x0]\n"
+ "80:" // Tile loop: Oddments: Load inputs: (4, 3): Bit 2: End
+ "ldr q3, [x8, #0x0]\n"
+ "fmla v31.8h, v2.8h, v6.8h\n"
+ "fmla v28.8h, v3.8h, v11.8h\n"
+ "add x20, x14, x13\n"
+ "fmla v29.8h, v3.8h, v12.8h\n"
+ "fmla v30.8h, v3.8h, v6.8h\n"
+ "add x8, x8, #0x10\n"
+ "tbz %x[n_channels], #2, 82f\n"
+ "ldr d8, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 81f\n"
+ "ld1 { v8.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 84f\n"
+ "ld1 { v8.h }[6], [x20]\n"
+ "b 84f\n"
+ "81:" // Tile loop: Oddments: Load inputs: (4, 4): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 84f\n"
+ "ld1 { v8.h }[4], [x20]\n"
+ "b 84f\n"
+ "82:" // Tile loop: Oddments: Load inputs: (4, 4): Bit 2: Unset
+ "tbz %x[n_channels], #1, 83f\n"
+ "ldr s8, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 84f\n"
+ "ld1 { v8.h }[2], [x20]\n"
+ "b 84f\n"
+ "83:" // Tile loop: Oddments: Load inputs: (4, 4): Bit 2: Unset: Bit 1: Unset
+ "ldr h8, [x20, #0x0]\n"
+ "84:" // Tile loop: Oddments: Load inputs: (4, 4): Bit 2: End
+ "ldr q4, [x8, #0x0]\n"
+ "fmla v31.8h, v3.8h, v8.8h\n"
+ "fmla v28.8h, v4.8h, v12.8h\n"
+ "add x20, x14, x11\n"
+ "fmla v29.8h, v4.8h, v14.8h\n"
+ "fmla v30.8h, v4.8h, v8.8h\n"
+ "add x8, x8, #0x10\n"
+ "tbz %x[n_channels], #2, 86f\n"
+ "ldr d10, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 85f\n"
+ "ld1 { v10.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 88f\n"
+ "ld1 { v10.h }[6], [x20]\n"
+ "b 88f\n"
+ "85:" // Tile loop: Oddments: Load inputs: (4, 5): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 88f\n"
+ "ld1 { v10.h }[4], [x20]\n"
+ "b 88f\n"
+ "86:" // Tile loop: Oddments: Load inputs: (4, 5): Bit 2: Unset
+ "tbz %x[n_channels], #1, 87f\n"
+ "ldr s10, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 88f\n"
+ "ld1 { v10.h }[2], [x20]\n"
+ "b 88f\n"
+ "87:" // Tile loop: Oddments: Load inputs: (4, 5): Bit 2: Unset: Bit 1: Unset
+ "ldr h10, [x20, #0x0]\n"
+ "88:" // Tile loop: Oddments: Load inputs: (4, 5): Bit 2: End
+ "ldr q0, [x8, #0x0]\n"
+ "fmla v31.8h, v4.8h, v10.8h\n"
+ "fmla v28.8h, v0.8h, v9.8h\n"
+ "add x20, x12, XZR\n"
+ "fmla v29.8h, v0.8h, v13.8h\n"
+ "add x8, x8, #0x10\n"
+ "tbz %x[n_channels], #2, 90f\n"
+ "ldr d11, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 89f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 92f\n"
+ "ld1 { v11.h }[6], [x20]\n"
+ "b 92f\n"
+ "89:" // Tile loop: Oddments: Load inputs: (5, 0): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 92f\n"
+ "ld1 { v11.h }[4], [x20]\n"
+ "b 92f\n"
+ "90:" // Tile loop: Oddments: Load inputs: (5, 0): Bit 2: Unset
+ "tbz %x[n_channels], #1, 91f\n"
+ "ldr s11, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 92f\n"
+ "ld1 { v11.h }[2], [x20]\n"
+ "b 92f\n"
+ "91:" // Tile loop: Oddments: Load inputs: (5, 0): Bit 2: Unset: Bit 1: Unset
+ "ldr h11, [x20, #0x0]\n"
+ "92:" // Tile loop: Oddments: Load inputs: (5, 0): Bit 2: End
+ "fmla v30.8h, v0.8h, v11.8h\n"
+ "add x20, x12, x2\n"
+ "tbz %x[n_channels], #2, 94f\n"
+ "ldr d12, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 93f\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 96f\n"
+ "ld1 { v12.h }[6], [x20]\n"
+ "b 96f\n"
+ "93:" // Tile loop: Oddments: Load inputs: (5, 1): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 96f\n"
+ "ld1 { v12.h }[4], [x20]\n"
+ "b 96f\n"
+ "94:" // Tile loop: Oddments: Load inputs: (5, 1): Bit 2: Unset
+ "tbz %x[n_channels], #1, 95f\n"
+ "ldr s12, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 96f\n"
+ "ld1 { v12.h }[2], [x20]\n"
+ "b 96f\n"
+ "95:" // Tile loop: Oddments: Load inputs: (5, 1): Bit 2: Unset: Bit 1: Unset
+ "ldr h12, [x20, #0x0]\n"
+ "96:" // Tile loop: Oddments: Load inputs: (5, 1): Bit 2: End
+ "ldr q1, [x8, #0x0]\n"
+ "fmla v31.8h, v0.8h, v12.8h\n"
+ "fmla v28.8h, v1.8h, v13.8h\n"
+ "add x20, x12, x6\n"
+ "fmla v29.8h, v1.8h, v5.8h\n"
+ "fmla v30.8h, v1.8h, v12.8h\n"
+ "add x8, x8, #0x10\n"
+ "tbz %x[n_channels], #2, 98f\n"
+ "ldr d9, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 97f\n"
+ "ld1 { v9.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 100f\n"
+ "ld1 { v9.h }[6], [x20]\n"
+ "b 100f\n"
+ "97:" // Tile loop: Oddments: Load inputs: (5, 2): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 100f\n"
+ "ld1 { v9.h }[4], [x20]\n"
+ "b 100f\n"
+ "98:" // Tile loop: Oddments: Load inputs: (5, 2): Bit 2: Unset
+ "tbz %x[n_channels], #1, 99f\n"
+ "ldr s9, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 100f\n"
+ "ld1 { v9.h }[2], [x20]\n"
+ "b 100f\n"
+ "99:" // Tile loop: Oddments: Load inputs: (5, 2): Bit 2: Unset: Bit 1: Unset
+ "ldr h9, [x20, #0x0]\n"
+ "100:" // Tile loop: Oddments: Load inputs: (5, 2): Bit 2: End
+ "ldr q2, [x8, #0x0]\n"
+ "fmla v31.8h, v1.8h, v9.8h\n"
+ "fmla v28.8h, v2.8h, v5.8h\n"
+ "add x20, x12, x15\n"
+ "fmla v29.8h, v2.8h, v6.8h\n"
+ "fmla v30.8h, v2.8h, v9.8h\n"
+ "add x8, x8, #0x10\n"
+ "tbz %x[n_channels], #2, 102f\n"
+ "ldr d11, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 101f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 104f\n"
+ "ld1 { v11.h }[6], [x20]\n"
+ "b 104f\n"
+ "101:" // Tile loop: Oddments: Load inputs: (5, 3): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 104f\n"
+ "ld1 { v11.h }[4], [x20]\n"
+ "b 104f\n"
+ "102:" // Tile loop: Oddments: Load inputs: (5, 3): Bit 2: Unset
+ "tbz %x[n_channels], #1, 103f\n"
+ "ldr s11, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 104f\n"
+ "ld1 { v11.h }[2], [x20]\n"
+ "b 104f\n"
+ "103:" // Tile loop: Oddments: Load inputs: (5, 3): Bit 2: Unset: Bit 1: Unset
+ "ldr h11, [x20, #0x0]\n"
+ "104:" // Tile loop: Oddments: Load inputs: (5, 3): Bit 2: End
+ "ldr q3, [x8, #0x0]\n"
+ "fmla v31.8h, v2.8h, v11.8h\n"
+ "fmla v28.8h, v3.8h, v6.8h\n"
+ "add x20, x12, x13\n"
+ "fmla v29.8h, v3.8h, v8.8h\n"
+ "fmla v30.8h, v3.8h, v11.8h\n"
+ "add x8, x8, #0x10\n"
+ "tbz %x[n_channels], #2, 106f\n"
+ "ldr d12, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 105f\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 108f\n"
+ "ld1 { v12.h }[6], [x20]\n"
+ "b 108f\n"
+ "105:" // Tile loop: Oddments: Load inputs: (5, 4): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 108f\n"
+ "ld1 { v12.h }[4], [x20]\n"
+ "b 108f\n"
+ "106:" // Tile loop: Oddments: Load inputs: (5, 4): Bit 2: Unset
+ "tbz %x[n_channels], #1, 107f\n"
+ "ldr s12, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 108f\n"
+ "ld1 { v12.h }[2], [x20]\n"
+ "b 108f\n"
+ "107:" // Tile loop: Oddments: Load inputs: (5, 4): Bit 2: Unset: Bit 1: Unset
+ "ldr h12, [x20, #0x0]\n"
+ "108:" // Tile loop: Oddments: Load inputs: (5, 4): Bit 2: End
+ "ldr q4, [x8, #0x0]\n"
+ "fmla v31.8h, v3.8h, v12.8h\n"
+ "fmla v28.8h, v4.8h, v8.8h\n"
+ "add x20, x12, x11\n"
+ "fmla v29.8h, v4.8h, v10.8h\n"
+ "fmla v30.8h, v4.8h, v12.8h\n"
+ "tbz %x[n_channels], #2, 110f\n"
+ "ldr d9, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 109f\n"
+ "ld1 { v9.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 112f\n"
+ "ld1 { v9.h }[6], [x20]\n"
+ "b 112f\n"
+ "109:" // Tile loop: Oddments: Load inputs: (5, 5): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 112f\n"
+ "ld1 { v9.h }[4], [x20]\n"
+ "b 112f\n"
+ "110:" // Tile loop: Oddments: Load inputs: (5, 5): Bit 2: Unset
+ "tbz %x[n_channels], #1, 111f\n"
+ "ldr s9, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 112f\n"
+ "ld1 { v9.h }[2], [x20]\n"
+ "b 112f\n"
+ "111:" // Tile loop: Oddments: Load inputs: (5, 5): Bit 2: Unset: Bit 1: Unset
+ "ldr h9, [x20, #0x0]\n"
+ "112:" // Tile loop: Oddments: Load inputs: (5, 5): Bit 2: End
+ "fmla v31.8h, v4.8h, v9.8h\n"
+ "fmax v28.8h, v28.8h, v27.8h\n"
+ "fmax v29.8h, v29.8h, v27.8h\n"
+ "fmax v30.8h, v30.8h, v27.8h\n"
+ "fmax v31.8h, v31.8h, v27.8h\n"
+ "fmin v28.8h, v28.8h, v15.8h\n"
+ "fmin v29.8h, v29.8h, v15.8h\n"
+ "fmin v30.8h, v30.8h, v15.8h\n"
+ "fmin v31.8h, v31.8h, v15.8h\n"
+ "tbz %x[n_channels], #2, 114f\n"
+ "mov x21, x5\n"
+ "mov x20, x10\n"
+ "st1 { v28.d }[0], [x21], x3\n"
+ "st1 { v30.d }[0], [x20], x3\n"
+ "add x5, x5, #0x8\n"
+ "add x10, x10, #0x8\n"
+ "st1 { v29.d }[0], [x21]\n"
+ "st1 { v31.d }[0], [x20]\n"
+ "tbz %x[n_channels], #1, 113f\n"
+ "mov x21, x5\n"
+ "mov x20, x10\n"
+ "st1 { v28.s }[2], [x21], x3\n"
+ "st1 { v30.s }[2], [x20], x3\n"
+ "add x5, x5, #0x4\n"
+ "add x10, x10, #0x4\n"
+ "st1 { v29.s }[2], [x21]\n"
+ "st1 { v31.s }[2], [x20]\n"
+ "tbz %x[n_channels], #0, 116f\n"
+ "mov x21, x5\n"
+ "mov x20, x10\n"
+ "st1 { v28.h }[6], [x21], x3\n"
+ "st1 { v30.h }[6], [x20], x3\n"
+ "st1 { v29.h }[6], [x21]\n"
+ "st1 { v31.h }[6], [x20]\n"
+ "b 116f\n"
+ "113:" // Tile loop: Oddments: Store: Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 116f\n"
+ "mov x21, x5\n"
+ "mov x20, x10\n"
+ "st1 { v28.h }[4], [x21], x3\n"
+ "st1 { v30.h }[4], [x20], x3\n"
+ "st1 { v29.h }[4], [x21]\n"
+ "st1 { v31.h }[4], [x20]\n"
+ "b 116f\n"
+ "114:" // Tile loop: Oddments: Store: Bit 2: Unset
+ "tbz %x[n_channels], #1, 115f\n"
+ "mov x21, x5\n"
+ "mov x20, x10\n"
+ "st1 { v28.s }[0], [x21], x3\n"
+ "st1 { v30.s }[0], [x20], x3\n"
+ "add x5, x5, #0x4\n"
+ "add x10, x10, #0x4\n"
+ "st1 { v29.s }[0], [x21]\n"
+ "st1 { v31.s }[0], [x20]\n"
+ "tbz %x[n_channels], #0, 116f\n"
+ "mov x21, x5\n"
+ "mov x20, x10\n"
+ "st1 { v28.h }[2], [x21], x3\n"
+ "st1 { v30.h }[2], [x20], x3\n"
+ "st1 { v29.h }[2], [x21]\n"
+ "st1 { v31.h }[2], [x20]\n"
+ "b 116f\n"
+ "115:" // Tile loop: Oddments: Store: Bit 2: Unset: Bit 1: Unset
+ "mov x21, x5\n"
+ "mov x20, x10\n"
+ "st1 { v28.h }[0], [x21], x3\n"
+ "st1 { v30.h }[0], [x20], x3\n"
+ "st1 { v29.h }[0], [x21]\n"
+ "st1 { v31.h }[0], [x20]\n"
+ "116:" // Tile loop: Oddments: Store: Bit 2: End
+ "117:" // Tile loop: End
+ "ldr x26, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "ldr x27, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "add x26, x26, #0x1\n"
+ "add x21, x27, #0x1\n"
+ "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+ "cmp x26, x20\n"
+ "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
+ "csel x27, x27, x21, LT\n"
+ "csel x26, x26, XZR, LT\n"
+ "cmp x27, x20\n"
+ "blt 1b\n"
+ :
+ : [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_indirect.cpp
new file mode 100644
index 0000000000..4913340c4c
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_indirect.cpp
@@ -0,0 +1,1427 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__aarch64__) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_indirect_impl(
+ const __fp16 *const *const input_ptrs,
+ __fp16 *const *const outptrs,
+ const void *params,
+ unsigned int n_channels,
+ const __fp16 activation_min,
+ const __fp16 activation_max
+)
+{
+ struct Args
+ {
+ __fp16 *const *outptrs;
+ const void *params;
+ const __fp16 min, max;
+ const __fp16 *inptrs[36];
+
+ Args(
+ const __fp16 *const *const input_ptrs,
+ __fp16 *const *const outptrs,
+ const void *const params,
+ const __fp16 min,
+ const __fp16 max
+ ) : outptrs(outptrs), params(params), min(min), max(max)
+ {
+ inptrs[0] = input_ptrs[0];
+ inptrs[1] = input_ptrs[1];
+ inptrs[2] = input_ptrs[6];
+ inptrs[3] = input_ptrs[7];
+ inptrs[4] = input_ptrs[2];
+ inptrs[5] = input_ptrs[8];
+ inptrs[6] = input_ptrs[3];
+ inptrs[7] = input_ptrs[4];
+ inptrs[8] = input_ptrs[11];
+ inptrs[9] = input_ptrs[12];
+ inptrs[10] = input_ptrs[9];
+ inptrs[11] = input_ptrs[10];
+ inptrs[12] = input_ptrs[5];
+ inptrs[13] = input_ptrs[13];
+ inptrs[14] = input_ptrs[14];
+ inptrs[15] = input_ptrs[15];
+ inptrs[16] = input_ptrs[16];
+ inptrs[17] = input_ptrs[17];
+ inptrs[18] = input_ptrs[18];
+ inptrs[19] = input_ptrs[19];
+ inptrs[20] = input_ptrs[20];
+ inptrs[21] = input_ptrs[21];
+ inptrs[22] = input_ptrs[22];
+ inptrs[23] = input_ptrs[23];
+ inptrs[24] = input_ptrs[24];
+ inptrs[25] = input_ptrs[25];
+ inptrs[26] = input_ptrs[26];
+ inptrs[27] = input_ptrs[27];
+ inptrs[28] = input_ptrs[28];
+ inptrs[29] = input_ptrs[29];
+ inptrs[30] = input_ptrs[30];
+ inptrs[31] = input_ptrs[31];
+ inptrs[32] = input_ptrs[32];
+ inptrs[33] = input_ptrs[33];
+ inptrs[34] = input_ptrs[34];
+ inptrs[35] = input_ptrs[35];
+
+ }
+ };
+
+ Args params_struct(input_ptrs, outptrs, params,
+ activation_min, activation_max);
+
+ __asm__ __volatile__(
+ "ldr x21, [%x[params_struct], %[offsetof_args_outptrs]]\n"
+ "mov x17, #0x10\n" // cntb _, ALL, #1
+ "lsr x9, %x[n_channels], #0x3\n"
+ "ldr x16, [%x[params_struct], %[offsetof_args_params]]\n"
+ "add x20, %x[params_struct], %[offsetof_args_min]\n"
+ "ld1r { v27.8h }, [x20]\n"
+ "add x20, %x[params_struct], %[offsetof_args_max]\n"
+ "ld1r { v15.8h }, [x20]\n"
+ "add x15, %x[params_struct], %[offsetof_Args_inptrs]\n"
+ "ldp x14, x13, [x21, #0x0]\n"
+ "ldp x12, x11, [x21, #0x10]\n"
+ "mov x10, #0x0\n"
+ "sub x28, XZR, x17\n"
+ "cbz x9, 3f\n"
+ "ldr q26, [x16, #0x0]\n"
+ "ldr q0, [x16, #0x10]\n"
+ "cmp x17, x9, LSL #4\n"
+ "ldr q1, [x16, #0x20]\n"
+ "ldr q2, [x16, #0x30]\n"
+ "ldr q3, [x16, #0x40]\n"
+ "ldr q4, [x16, #0x50]\n"
+ "add x16, x16, #0x60\n"
+ "ldp x21, x20, [x15, #0x0]\n"
+ "ldr q5, [x21, x10]\n"
+ "ldr q6, [x20, x10]\n"
+ "ldp x21, x20, [x15, #0x10]\n"
+ "ldr q7, [x21, x10]\n"
+ "ldr q8, [x20, x10]\n"
+ "ldp x21, x20, [x15, #0x20]\n"
+ "ldr q9, [x21, x10]\n"
+ "ldr q13, [x20, x10]\n"
+ "ldp x21, x20, [x15, #0x30]\n"
+ "ldr q11, [x21, x10]\n"
+ "ldr q12, [x20, x10]\n"
+ "ldp x21, x20, [x15, #0x40]\n"
+ "ldr q10, [x21, x10]\n"
+ "ldr q14, [x20, x10]\n"
+ "bge 2f\n"
+ "1:" // Channel loop
+ "mov v30.16b, v26.16b\n fmla v30.8h, v0.8h, v5.8h\n"
+ "mov v31.16b, v26.16b\n fmla v31.8h, v0.8h, v6.8h\n"
+ "ldr x20, [x15, #0x50]\n"
+ "ldr q24, [x20, x10]\n"
+ "mov v28.16b, v26.16b\n fmla v28.8h, v0.8h, v7.8h\n"
+ "mov v29.16b, v26.16b\n fmla v29.8h, v0.8h, v8.8h\n"
+ "ldr q23, [x16, #0x0]\n"
+ "ldr q26, [x16, #0x140]\n"
+ "fmla v30.8h, v1.8h, v6.8h\n"
+ "fmla v31.8h, v1.8h, v9.8h\n"
+ "ldr x20, [x15, #0x58]\n"
+ "ldr q22, [x20, x10]\n"
+ "fmla v28.8h, v1.8h, v8.8h\n"
+ "fmla v29.8h, v1.8h, v13.8h\n"
+ "ldr q21, [x16, #0x10]\n"
+ "ldr x20, [x15, #0x60]\n"
+ "fmla v30.8h, v2.8h, v9.8h\n"
+ "ldr q17, [x20, x10]\n"
+ "fmla v31.8h, v2.8h, v11.8h\n"
+ "ldr x20, [x15, #0x68]\n"
+ "fmla v28.8h, v2.8h, v13.8h\n"
+ "fmla v29.8h, v2.8h, v24.8h\n"
+ "ldr q16, [x16, #0x20]\n"
+ "ldr x22, [x15, #0x70]\n"
+ "fmla v30.8h, v3.8h, v11.8h\n"
+ "ldr q5, [x20, x10]\n"
+ "fmla v31.8h, v3.8h, v12.8h\n"
+ "ldr x20, [x15, #0x78]\n"
+ "fmla v28.8h, v3.8h, v24.8h\n"
+ "fmla v29.8h, v3.8h, v22.8h\n"
+ "ldr q20, [x16, #0x30]\n"
+ "ldr x21, [x15, #0x80]\n"
+ "fmla v30.8h, v4.8h, v12.8h\n"
+ "ldr q19, [x22, x10]\n"
+ "fmla v31.8h, v4.8h, v17.8h\n"
+ "ldr q2, [x20, x10]\n"
+ "fmla v28.8h, v4.8h, v22.8h\n"
+ "fmla v29.8h, v4.8h, v10.8h\n"
+ "ldr q18, [x16, #0x40]\n"
+ "ldr x20, [x15, #0x88]\n"
+ "fmla v30.8h, v23.8h, v7.8h\n"
+ "fmla v31.8h, v23.8h, v8.8h\n"
+ "ldr x23, [x15, #0x90]\n"
+ "ldr x26, [x15, #0x98]\n"
+ "fmla v28.8h, v23.8h, v14.8h\n"
+ "fmla v29.8h, v23.8h, v5.8h\n"
+ "ldr q1, [x16, #0x50]\n"
+ "ldr x22, [x15, #0xa0]\n"
+ "fmla v30.8h, v21.8h, v8.8h\n"
+ "ldr q25, [x20, x10]\n"
+ "fmla v31.8h, v21.8h, v13.8h\n"
+ "ldr x25, [x15, #0xa8]\n"
+ "fmla v28.8h, v21.8h, v5.8h\n"
+ "fmla v29.8h, v21.8h, v19.8h\n"
+ "ldr q17, [x16, #0x60]\n"
+ "ldr x24, [x15, #0xb0]\n"
+ "fmla v30.8h, v16.8h, v13.8h\n"
+ "ldr q8, [x21, x10]\n"
+ "fmla v31.8h, v16.8h, v24.8h\n"
+ "ldr x20, [x15, #0xb8]\n"
+ "fmla v28.8h, v16.8h, v19.8h\n"
+ "fmla v29.8h, v16.8h, v2.8h\n"
+ "ldr q16, [x16, #0x70]\n"
+ "ldr x21, [x15, #0xc0]\n"
+ "fmla v30.8h, v20.8h, v24.8h\n"
+ "ldr q24, [x23, x10]\n"
+ "fmla v31.8h, v20.8h, v22.8h\n"
+ "ldr x27, [x15, #0xc8]\n"
+ "fmla v28.8h, v20.8h, v2.8h\n"
+ "fmla v29.8h, v20.8h, v8.8h\n"
+ "ldr q23, [x16, #0x80]\n"
+ "ldr x23, [x15, #0xd0]\n"
+ "fmla v30.8h, v18.8h, v22.8h\n"
+ "ldr q22, [x26, x10]\n"
+ "fmla v31.8h, v18.8h, v10.8h\n"
+ "ldr q21, [x22, x10]\n"
+ "fmla v28.8h, v18.8h, v8.8h\n"
+ "fmla v29.8h, v18.8h, v25.8h\n"
+ "ldr q20, [x16, #0x90]\n"
+ "ldr x22, [x15, #0xd8]\n"
+ "fmla v30.8h, v1.8h, v14.8h\n"
+ "ldr q0, [x20, x10]\n"
+ "fmla v31.8h, v1.8h, v5.8h\n"
+ "ldr x20, [x15, #0xe0]\n"
+ "fmla v28.8h, v1.8h, v24.8h\n"
+ "fmla v29.8h, v1.8h, v22.8h\n"
+ "ldr q6, [x16, #0xa0]\n"
+ "ldr x26, [x15, #0xf8]\n"
+ "fmla v30.8h, v17.8h, v5.8h\n"
+ "ldr q1, [x25, x10]\n"
+ "fmla v31.8h, v17.8h, v19.8h\n"
+ "ldr x25, [x15, #0xe8]\n"
+ "fmla v28.8h, v17.8h, v22.8h\n"
+ "fmla v29.8h, v17.8h, v21.8h\n"
+ "ldr q18, [x16, #0xb0]\n"
+ "add x28, x28, #0x10\n"
+ "fmla v30.8h, v16.8h, v19.8h\n"
+ "ldr q19, [x24, x10]\n"
+ "fmla v31.8h, v16.8h, v2.8h\n"
+ "ldr x24, [x15, #0xf0]\n"
+ "fmla v28.8h, v16.8h, v21.8h\n"
+ "fmla v29.8h, v16.8h, v1.8h\n"
+ "ldr q17, [x16, #0xc0]\n"
+ "fmla v30.8h, v23.8h, v2.8h\n"
+ "ldr q16, [x21, x10]\n"
+ "fmla v31.8h, v23.8h, v8.8h\n"
+ "ldr x21, [x15, #0x100]\n"
+ "fmla v28.8h, v23.8h, v1.8h\n"
+ "fmla v29.8h, v23.8h, v19.8h\n"
+ "ldr q13, [x16, #0xd0]\n"
+ "fmla v30.8h, v20.8h, v8.8h\n"
+ "ldr q2, [x27, x10]\n"
+ "fmla v31.8h, v20.8h, v25.8h\n"
+ "ldr q10, [x20, x10]\n"
+ "fmla v28.8h, v20.8h, v19.8h\n"
+ "fmla v29.8h, v20.8h, v0.8h\n"
+ "ldr q9, [x16, #0xe0]\n"
+ "ldr x20, [x15, #0x108]\n"
+ "fmla v30.8h, v6.8h, v24.8h\n"
+ "ldr q5, [x23, x10]\n"
+ "fmla v31.8h, v6.8h, v22.8h\n"
+ "ldr x23, [x15, #0x110]\n"
+ "fmla v28.8h, v6.8h, v16.8h\n"
+ "fmla v29.8h, v6.8h, v2.8h\n"
+ "ldr q24, [x16, #0xf0]\n"
+ "fmla v30.8h, v18.8h, v22.8h\n"
+ "ldr q25, [x22, x10]\n"
+ "fmla v31.8h, v18.8h, v21.8h\n"
+ "ldr x22, [x15, #0x118]\n"
+ "fmla v28.8h, v18.8h, v2.8h\n"
+ "fmla v29.8h, v18.8h, v5.8h\n"
+ "ldr q23, [x16, #0x100]\n"
+ "fmla v30.8h, v17.8h, v21.8h\n"
+ "ldr q22, [x25, x10]\n"
+ "fmla v31.8h, v17.8h, v1.8h\n"
+ "fmla v28.8h, v17.8h, v5.8h\n"
+ "fmla v29.8h, v17.8h, v25.8h\n"
+ "ldr q21, [x16, #0x110]\n"
+ "fmla v30.8h, v13.8h, v1.8h\n"
+ "ldr q18, [x24, x10]\n"
+ "fmla v31.8h, v13.8h, v19.8h\n"
+ "fmla v28.8h, v13.8h, v25.8h\n"
+ "fmla v29.8h, v13.8h, v10.8h\n"
+ "ldr q20, [x16, #0x120]\n"
+ "fmla v30.8h, v9.8h, v19.8h\n"
+ "ldr q17, [x26, x10]\n"
+ "fmla v31.8h, v9.8h, v0.8h\n"
+ "fmla v28.8h, v9.8h, v10.8h\n"
+ "fmla v29.8h, v9.8h, v22.8h\n"
+ "ldr q19, [x16, #0x130]\n"
+ "fmla v30.8h, v24.8h, v16.8h\n"
+ "ldr q16, [x21, x10]\n"
+ "fmla v31.8h, v24.8h, v2.8h\n"
+ "fmla v28.8h, v24.8h, v18.8h\n"
+ "ldr q18, [x20, x10]\n"
+ "fmla v29.8h, v24.8h, v17.8h\n"
+ "ldr q0, [x16, #0x150]\n"
+ "fmla v30.8h, v23.8h, v2.8h\n"
+ "fmla v31.8h, v23.8h, v5.8h\n"
+ "ldp x21, x20, [x15, #0x0]\n"
+ "fmla v28.8h, v23.8h, v17.8h\n"
+ "ldr q17, [x23, x10]\n"
+ "fmla v29.8h, v23.8h, v16.8h\n"
+ "ldr q1, [x16, #0x160]\n"
+ "fmla v30.8h, v21.8h, v5.8h\n"
+ "ldr q5, [x21, x17]\n"
+ "fmla v31.8h, v21.8h, v25.8h\n"
+ "fmla v28.8h, v21.8h, v16.8h\n"
+ "ldr q16, [x22, x10]\n"
+ "fmla v29.8h, v21.8h, v18.8h\n"
+ "ldr q2, [x16, #0x170]\n"
+ "fmla v30.8h, v20.8h, v25.8h\n"
+ "ldr q6, [x20, x17]\n"
+ "fmla v31.8h, v20.8h, v10.8h\n"
+ "ldp x21, x20, [x15, #0x10]\n"
+ "ldr q7, [x21, x17]\n"
+ "fmla v28.8h, v20.8h, v18.8h\n"
+ "fmla v29.8h, v20.8h, v17.8h\n"
+ "ldr q3, [x16, #0x180]\n"
+ "fmla v30.8h, v19.8h, v10.8h\n"
+ "ldr q8, [x20, x17]\n"
+ "fmla v31.8h, v19.8h, v22.8h\n"
+ "ldp x21, x20, [x15, #0x20]\n"
+ "ldr q13, [x20, x17]\n"
+ "fmla v28.8h, v19.8h, v17.8h\n"
+ "fmla v29.8h, v19.8h, v16.8h\n"
+ "ldr q9, [x21, x17]\n"
+ "ldr q4, [x16, #0x190]\n"
+ "ldp x21, x20, [x15, #0x30]\n"
+ "fmax v30.8h, v30.8h, v27.8h\n"
+ "fmax v31.8h, v31.8h, v27.8h\n"
+ "ldr q11, [x21, x17]\n"
+ "ldr q12, [x20, x17]\n"
+ "fmax v28.8h, v28.8h, v27.8h\n"
+ "fmax v29.8h, v29.8h, v27.8h\n"
+ "ldp x21, x20, [x15, #0x40]\n"
+ "ldr q10, [x21, x17]\n"
+ "fmin v30.8h, v30.8h, v15.8h\n"
+ "fmin v31.8h, v31.8h, v15.8h\n"
+ "ldr q14, [x20, x17]\n"
+ "add x17, x17, #0x10\n"
+ "cmp x17, x9, LSL #4\n"
+ "fmin v28.8h, v28.8h, v15.8h\n"
+ "fmin v29.8h, v29.8h, v15.8h\n"
+ "add x10, x10, #0x10\n"
+ "str q30, [x14, x28]\n"
+ "add x16, x16, #0x1a0\n"
+ "str q31, [x13, x28]\n"
+ "str q28, [x12, x28]\n"
+ "str q29, [x11, x28]\n"
+ "blt 1b\n"
+ "2:" // Channel tail
+ "mov v31.16b, v26.16b\n fmla v31.8h, v0.8h, v5.8h\n"
+ "mov v5.16b, v26.16b\n fmla v5.8h, v0.8h, v6.8h\n"
+ "ldr x20, [x15, #0x50]\n"
+ "ldr q22, [x20, x10]\n"
+ "mov v30.16b, v26.16b\n fmla v30.8h, v0.8h, v7.8h\n"
+ "mov v29.16b, v26.16b\n fmla v29.8h, v0.8h, v8.8h\n"
+ "ldr q19, [x16, #0x0]\n"
+ "ldr x20, [x15, #0x58]\n"
+ "fmla v31.8h, v1.8h, v6.8h\n"
+ "ldr q21, [x20, x10]\n"
+ "fmla v5.8h, v1.8h, v9.8h\n"
+ "ldr x21, [x15, #0x60]\n"
+ "fmla v30.8h, v1.8h, v8.8h\n"
+ "fmla v29.8h, v1.8h, v13.8h\n"
+ "ldr q18, [x16, #0x10]\n"
+ "ldr x20, [x15, #0x68]\n"
+ "fmla v31.8h, v2.8h, v9.8h\n"
+ "ldr q16, [x21, x10]\n"
+ "fmla v5.8h, v2.8h, v11.8h\n"
+ "ldr x23, [x15, #0x70]\n"
+ "fmla v30.8h, v2.8h, v13.8h\n"
+ "fmla v29.8h, v2.8h, v22.8h\n"
+ "ldr q17, [x16, #0x20]\n"
+ "ldr x21, [x15, #0x78]\n"
+ "fmla v31.8h, v3.8h, v11.8h\n"
+ "ldr q6, [x20, x10]\n"
+ "fmla v5.8h, v3.8h, v12.8h\n"
+ "ldr x22, [x15, #0x80]\n"
+ "fmla v30.8h, v3.8h, v22.8h\n"
+ "fmla v29.8h, v3.8h, v21.8h\n"
+ "ldr q20, [x16, #0x30]\n"
+ "ldr x20, [x15, #0x88]\n"
+ "fmla v31.8h, v4.8h, v12.8h\n"
+ "ldr q2, [x23, x10]\n"
+ "fmla v5.8h, v4.8h, v16.8h\n"
+ "ldr q28, [x21, x10]\n"
+ "fmla v30.8h, v4.8h, v21.8h\n"
+ "fmla v29.8h, v4.8h, v10.8h\n"
+ "ldr q16, [x16, #0x40]\n"
+ "ldr x21, [x15, #0x90]\n"
+ "fmla v31.8h, v19.8h, v7.8h\n"
+ "fmla v5.8h, v19.8h, v8.8h\n"
+ "ldr x27, [x15, #0x98]\n"
+ "ldr x26, [x15, #0xa0]\n"
+ "fmla v30.8h, v19.8h, v14.8h\n"
+ "fmla v29.8h, v19.8h, v6.8h\n"
+ "ldr q19, [x16, #0x50]\n"
+ "ldr x25, [x15, #0xa8]\n"
+ "fmla v31.8h, v18.8h, v8.8h\n"
+ "ldr q1, [x20, x10]\n"
+ "fmla v5.8h, v18.8h, v13.8h\n"
+ "ldr x24, [x15, #0xb0]\n"
+ "fmla v30.8h, v18.8h, v6.8h\n"
+ "fmla v29.8h, v18.8h, v2.8h\n"
+ "ldr q18, [x16, #0x60]\n"
+ "ldr x20, [x15, #0xb8]\n"
+ "fmla v31.8h, v17.8h, v13.8h\n"
+ "ldr q26, [x22, x10]\n"
+ "fmla v5.8h, v17.8h, v22.8h\n"
+ "ldr x23, [x15, #0xc0]\n"
+ "fmla v30.8h, v17.8h, v2.8h\n"
+ "fmla v29.8h, v17.8h, v28.8h\n"
+ "ldr q17, [x16, #0x70]\n"
+ "ldr x22, [x15, #0xc8]\n"
+ "fmla v31.8h, v20.8h, v22.8h\n"
+ "ldr q25, [x21, x10]\n"
+ "fmla v5.8h, v20.8h, v21.8h\n"
+ "ldr x21, [x15, #0xd0]\n"
+ "fmla v30.8h, v20.8h, v28.8h\n"
+ "fmla v29.8h, v20.8h, v26.8h\n"
+ "ldr q24, [x16, #0x80]\n"
+ "add x28, x28, #0x10\n"
+ "fmla v31.8h, v16.8h, v21.8h\n"
+ "ldr q23, [x27, x10]\n"
+ "fmla v5.8h, v16.8h, v10.8h\n"
+ "ldr q0, [x26, x10]\n"
+ "fmla v30.8h, v16.8h, v26.8h\n"
+ "fmla v29.8h, v16.8h, v1.8h\n"
+ "ldr q22, [x16, #0x90]\n"
+ "ldr x27, [x15, #0xd8]\n"
+ "fmla v31.8h, v19.8h, v14.8h\n"
+ "ldr q16, [x20, x10]\n"
+ "fmla v5.8h, v19.8h, v6.8h\n"
+ "ldr x20, [x15, #0xe0]\n"
+ "fmla v30.8h, v19.8h, v25.8h\n"
+ "fmla v29.8h, v19.8h, v23.8h\n"
+ "ldr q21, [x16, #0xa0]\n"
+ "ldr x26, [x15, #0xf8]\n"
+ "fmla v31.8h, v18.8h, v6.8h\n"
+ "ldr q20, [x25, x10]\n"
+ "fmla v5.8h, v18.8h, v2.8h\n"
+ "ldr x25, [x15, #0xe8]\n"
+ "fmla v30.8h, v18.8h, v23.8h\n"
+ "fmla v29.8h, v18.8h, v0.8h\n"
+ "ldr q18, [x16, #0xb0]\n"
+ "fmla v31.8h, v17.8h, v2.8h\n"
+ "ldr q19, [x24, x10]\n"
+ "fmla v5.8h, v17.8h, v28.8h\n"
+ "ldr x24, [x15, #0xf0]\n"
+ "fmla v30.8h, v17.8h, v0.8h\n"
+ "fmla v29.8h, v17.8h, v20.8h\n"
+ "ldr q17, [x16, #0xc0]\n"
+ "fmla v31.8h, v24.8h, v28.8h\n"
+ "ldr q7, [x23, x10]\n"
+ "fmla v5.8h, v24.8h, v26.8h\n"
+ "ldr x23, [x15, #0x100]\n"
+ "fmla v30.8h, v24.8h, v20.8h\n"
+ "fmla v29.8h, v24.8h, v19.8h\n"
+ "ldr q3, [x16, #0xd0]\n"
+ "fmla v31.8h, v22.8h, v26.8h\n"
+ "ldr q28, [x22, x10]\n"
+ "fmla v5.8h, v22.8h, v1.8h\n"
+ "ldr q13, [x20, x10]\n"
+ "fmla v30.8h, v22.8h, v19.8h\n"
+ "fmla v29.8h, v22.8h, v16.8h\n"
+ "ldr q11, [x16, #0xe0]\n"
+ "ldr x22, [x15, #0x108]\n"
+ "fmla v31.8h, v21.8h, v25.8h\n"
+ "ldr q26, [x21, x10]\n"
+ "fmla v5.8h, v21.8h, v23.8h\n"
+ "ldr x21, [x15, #0x110]\n"
+ "fmla v30.8h, v21.8h, v7.8h\n"
+ "fmla v29.8h, v21.8h, v28.8h\n"
+ "ldr q25, [x16, #0xf0]\n"
+ "fmla v31.8h, v18.8h, v23.8h\n"
+ "ldr q24, [x27, x10]\n"
+ "fmla v5.8h, v18.8h, v0.8h\n"
+ "ldr x20, [x15, #0x118]\n"
+ "fmla v30.8h, v18.8h, v28.8h\n"
+ "fmla v29.8h, v18.8h, v26.8h\n"
+ "ldr q23, [x16, #0x100]\n"
+ "fmla v31.8h, v17.8h, v0.8h\n"
+ "ldr q22, [x25, x10]\n"
+ "fmla v5.8h, v17.8h, v20.8h\n"
+ "fmla v30.8h, v17.8h, v26.8h\n"
+ "fmla v29.8h, v17.8h, v24.8h\n"
+ "ldr q21, [x16, #0x110]\n"
+ "fmla v31.8h, v3.8h, v20.8h\n"
+ "ldr q18, [x24, x10]\n"
+ "fmla v5.8h, v3.8h, v19.8h\n"
+ "fmla v30.8h, v3.8h, v24.8h\n"
+ "fmla v29.8h, v3.8h, v13.8h\n"
+ "ldr q20, [x16, #0x120]\n"
+ "fmla v31.8h, v11.8h, v19.8h\n"
+ "ldr q17, [x26, x10]\n"
+ "fmla v5.8h, v11.8h, v16.8h\n"
+ "fmla v30.8h, v11.8h, v13.8h\n"
+ "fmla v29.8h, v11.8h, v22.8h\n"
+ "ldr q19, [x16, #0x130]\n"
+ "add x16, x16, #0x140\n"
+ "fmla v31.8h, v25.8h, v7.8h\n"
+ "ldr q16, [x23, x10]\n"
+ "fmla v5.8h, v25.8h, v28.8h\n"
+ "fmla v30.8h, v25.8h, v18.8h\n"
+ "ldr q18, [x22, x10]\n"
+ "fmla v29.8h, v25.8h, v17.8h\n"
+ "fmla v31.8h, v23.8h, v28.8h\n"
+ "fmla v5.8h, v23.8h, v26.8h\n"
+ "fmla v30.8h, v23.8h, v17.8h\n"
+ "ldr q17, [x21, x10]\n"
+ "fmla v29.8h, v23.8h, v16.8h\n"
+ "fmla v31.8h, v21.8h, v26.8h\n"
+ "fmla v5.8h, v21.8h, v24.8h\n"
+ "fmla v30.8h, v21.8h, v16.8h\n"
+ "ldr q16, [x20, x10]\n"
+ "fmla v29.8h, v21.8h, v18.8h\n"
+ "add x10, x10, #0x10\n"
+ "fmla v31.8h, v20.8h, v24.8h\n"
+ "fmla v5.8h, v20.8h, v13.8h\n"
+ "fmla v30.8h, v20.8h, v18.8h\n"
+ "fmla v29.8h, v20.8h, v17.8h\n"
+ "fmla v31.8h, v19.8h, v13.8h\n"
+ "fmla v5.8h, v19.8h, v22.8h\n"
+ "fmax v31.8h, v31.8h, v27.8h\n"
+ "fmla v30.8h, v19.8h, v17.8h\n"
+ "fmla v29.8h, v19.8h, v16.8h\n"
+ "fmax v5.8h, v5.8h, v27.8h\n"
+ "fmax v30.8h, v30.8h, v27.8h\n"
+ "fmax v29.8h, v29.8h, v27.8h\n"
+ "fmin v31.8h, v31.8h, v15.8h\n"
+ "fmin v5.8h, v5.8h, v15.8h\n"
+ "str q31, [x14, x28]\n"
+ "fmin v30.8h, v30.8h, v15.8h\n"
+ "fmin v29.8h, v29.8h, v15.8h\n"
+ "str q5, [x13, x28]\n"
+ "str q30, [x12, x28]\n"
+ "str q29, [x11, x28]\n"
+ "3:" // Oddments
+ "tst %x[n_channels], #0x7\n"
+ "beq 116f\n"
+ "ldr q26, [x16, #0x0]\n"
+ "ldr q0, [x16, #0x10]\n"
+ "mov x20, x10\n"
+ "add x14, x14, x20\n"
+ "ldr q1, [x16, #0x20]\n"
+ "ldr q2, [x16, #0x30]\n"
+ "add x13, x13, x20\n"
+ "add x12, x12, x20\n"
+ "ldr q3, [x16, #0x40]\n"
+ "ldr q4, [x16, #0x50]\n"
+ "add x11, x11, x20\n"
+ "ldr x9, [x15, #0x0]\n"
+ "ldr x28, [x15, #0x8]\n"
+ "add x9, x9, x10\n"
+ "add x28, x28, x10\n"
+ "ldr x27, [x15, #0x10]\n"
+ "ldr x26, [x15, #0x18]\n"
+ "add x27, x27, x10\n"
+ "add x26, x26, x10\n"
+ "ldr x25, [x15, #0x20]\n"
+ "ldr x24, [x15, #0x28]\n"
+ "add x25, x25, x10\n"
+ "add x24, x24, x10\n"
+ "ldr x23, [x15, #0x30]\n"
+ "ldr x22, [x15, #0x38]\n"
+ "add x23, x23, x10\n"
+ "add x22, x22, x10\n"
+ "ldr x21, [x15, #0x40]\n"
+ "ldr x20, [x15, #0x48]\n"
+ "add x21, x21, x10\n"
+ "add x20, x20, x10\n"
+ "add x16, x16, #0x60\n"
+ "tbz %x[n_channels], #2, 5f\n"
+ "ld1 { v5.d }[0], [x9], #0x8\n"
+ "ld1 { v6.d }[0], [x28], #0x8\n"
+ "ld1 { v7.d }[0], [x27], #0x8\n"
+ "ld1 { v8.d }[0], [x26], #0x8\n"
+ "ld1 { v9.d }[0], [x25], #0x8\n"
+ "ld1 { v13.d }[0], [x24], #0x8\n"
+ "ld1 { v11.d }[0], [x23], #0x8\n"
+ "ld1 { v12.d }[0], [x22], #0x8\n"
+ "ld1 { v10.d }[0], [x21], #0x8\n"
+ "ld1 { v14.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 4f\n"
+ "ld1 { v5.s }[2], [x9], #0x4\n"
+ "ld1 { v6.s }[2], [x28], #0x4\n"
+ "ld1 { v7.s }[2], [x27], #0x4\n"
+ "ld1 { v8.s }[2], [x26], #0x4\n"
+ "ld1 { v9.s }[2], [x25], #0x4\n"
+ "ld1 { v13.s }[2], [x24], #0x4\n"
+ "ld1 { v11.s }[2], [x23], #0x4\n"
+ "ld1 { v12.s }[2], [x22], #0x4\n"
+ "ld1 { v10.s }[2], [x21], #0x4\n"
+ "ld1 { v14.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 7f\n"
+ "ld1 { v5.h }[6], [x9], #0x2\n"
+ "ld1 { v6.h }[6], [x28], #0x2\n"
+ "ld1 { v7.h }[6], [x27], #0x2\n"
+ "ld1 { v8.h }[6], [x26], #0x2\n"
+ "ld1 { v9.h }[6], [x25], #0x2\n"
+ "ld1 { v13.h }[6], [x24], #0x2\n"
+ "ld1 { v11.h }[6], [x23], #0x2\n"
+ "ld1 { v12.h }[6], [x22], #0x2\n"
+ "ld1 { v10.h }[6], [x21], #0x2\n"
+ "ld1 { v14.h }[6], [x20], #0x2\n"
+ "b 7f\n"
+ "4:" // Oddments: Load inputs (0, 0), (0, 1), (1, 0), (1, 1), (0, 2), (1, 2), (0, 3), (0, 4), (1, 5), (2, 0): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 7f\n"
+ "ld1 { v5.h }[4], [x9], #0x2\n"
+ "ld1 { v6.h }[4], [x28], #0x2\n"
+ "ld1 { v7.h }[4], [x27], #0x2\n"
+ "ld1 { v8.h }[4], [x26], #0x2\n"
+ "ld1 { v9.h }[4], [x25], #0x2\n"
+ "ld1 { v13.h }[4], [x24], #0x2\n"
+ "ld1 { v11.h }[4], [x23], #0x2\n"
+ "ld1 { v12.h }[4], [x22], #0x2\n"
+ "ld1 { v10.h }[4], [x21], #0x2\n"
+ "ld1 { v14.h }[4], [x20], #0x2\n"
+ "b 7f\n"
+ "5:" // Oddments: Load inputs (0, 0), (0, 1), (1, 0), (1, 1), (0, 2), (1, 2), (0, 3), (0, 4), (1, 5), (2, 0): Bit 2: Unset
+ "tbz %x[n_channels], #1, 6f\n"
+ "ld1 { v5.s }[0], [x9], #0x4\n"
+ "ld1 { v6.s }[0], [x28], #0x4\n"
+ "ld1 { v7.s }[0], [x27], #0x4\n"
+ "ld1 { v8.s }[0], [x26], #0x4\n"
+ "ld1 { v9.s }[0], [x25], #0x4\n"
+ "ld1 { v13.s }[0], [x24], #0x4\n"
+ "ld1 { v11.s }[0], [x23], #0x4\n"
+ "ld1 { v12.s }[0], [x22], #0x4\n"
+ "ld1 { v10.s }[0], [x21], #0x4\n"
+ "ld1 { v14.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 7f\n"
+ "ld1 { v5.h }[2], [x9], #0x2\n"
+ "ld1 { v6.h }[2], [x28], #0x2\n"
+ "ld1 { v7.h }[2], [x27], #0x2\n"
+ "ld1 { v8.h }[2], [x26], #0x2\n"
+ "ld1 { v9.h }[2], [x25], #0x2\n"
+ "ld1 { v13.h }[2], [x24], #0x2\n"
+ "ld1 { v11.h }[2], [x23], #0x2\n"
+ "ld1 { v12.h }[2], [x22], #0x2\n"
+ "ld1 { v10.h }[2], [x21], #0x2\n"
+ "ld1 { v14.h }[2], [x20], #0x2\n"
+ "b 7f\n"
+ "6:" // Oddments: Load inputs (0, 0), (0, 1), (1, 0), (1, 1), (0, 2), (1, 2), (0, 3), (0, 4), (1, 5), (2, 0): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v5.h }[0], [x9], #0x2\n"
+ "ld1 { v6.h }[0], [x28], #0x2\n"
+ "ld1 { v7.h }[0], [x27], #0x2\n"
+ "ld1 { v8.h }[0], [x26], #0x2\n"
+ "ld1 { v9.h }[0], [x25], #0x2\n"
+ "ld1 { v13.h }[0], [x24], #0x2\n"
+ "ld1 { v11.h }[0], [x23], #0x2\n"
+ "ld1 { v12.h }[0], [x22], #0x2\n"
+ "ld1 { v10.h }[0], [x21], #0x2\n"
+ "ld1 { v14.h }[0], [x20], #0x2\n"
+ "7:" // Oddments: Load inputs (0, 0), (0, 1), (1, 0), (1, 1), (0, 2), (1, 2), (0, 3), (0, 4), (1, 5), (2, 0): Bit 2: End
+ "mov v28.16b, v26.16b\n fmla v28.8h, v0.8h, v5.8h\n"
+ "mov v29.16b, v26.16b\n fmla v29.8h, v0.8h, v6.8h\n"
+ "ldr x20, [x15, #0x50]\n"
+ "add x20, x20, x10\n"
+ "mov v30.16b, v26.16b\n fmla v30.8h, v0.8h, v7.8h\n"
+ "mov v31.16b, v26.16b\n fmla v31.8h, v0.8h, v8.8h\n"
+ "fmla v28.8h, v1.8h, v6.8h\n"
+ "fmla v29.8h, v1.8h, v9.8h\n"
+ "fmla v30.8h, v1.8h, v8.8h\n"
+ "fmla v31.8h, v1.8h, v13.8h\n"
+ "fmla v28.8h, v2.8h, v9.8h\n"
+ "fmla v29.8h, v2.8h, v11.8h\n"
+ "fmla v30.8h, v2.8h, v13.8h\n"
+ "tbz %x[n_channels], #2, 9f\n"
+ "ld1 { v5.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 8f\n"
+ "ld1 { v5.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 11f\n"
+ "ld1 { v5.h }[6], [x20], #0x2\n"
+ "b 11f\n"
+ "8:" // Oddments: Load input (1, 3): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 11f\n"
+ "ld1 { v5.h }[4], [x20], #0x2\n"
+ "b 11f\n"
+ "9:" // Oddments: Load input (1, 3): Bit 2: Unset
+ "tbz %x[n_channels], #1, 10f\n"
+ "ld1 { v5.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 11f\n"
+ "ld1 { v5.h }[2], [x20], #0x2\n"
+ "b 11f\n"
+ "10:" // Oddments: Load input (1, 3): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v5.h }[0], [x20], #0x2\n"
+ "11:" // Oddments: Load input (1, 3): Bit 2: End
+ "ldr x20, [x15, #0x58]\n"
+ "fmla v31.8h, v2.8h, v5.8h\n"
+ "fmla v28.8h, v3.8h, v11.8h\n"
+ "add x20, x20, x10\n"
+ "fmla v29.8h, v3.8h, v12.8h\n"
+ "fmla v30.8h, v3.8h, v5.8h\n"
+ "tbz %x[n_channels], #2, 13f\n"
+ "ld1 { v6.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 12f\n"
+ "ld1 { v6.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 15f\n"
+ "ld1 { v6.h }[6], [x20], #0x2\n"
+ "b 15f\n"
+ "12:" // Oddments: Load input (1, 4): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 15f\n"
+ "ld1 { v6.h }[4], [x20], #0x2\n"
+ "b 15f\n"
+ "13:" // Oddments: Load input (1, 4): Bit 2: Unset
+ "tbz %x[n_channels], #1, 14f\n"
+ "ld1 { v6.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 15f\n"
+ "ld1 { v6.h }[2], [x20], #0x2\n"
+ "b 15f\n"
+ "14:" // Oddments: Load input (1, 4): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v6.h }[0], [x20], #0x2\n"
+ "15:" // Oddments: Load input (1, 4): Bit 2: End
+ "ldr x20, [x15, #0x60]\n"
+ "fmla v31.8h, v3.8h, v6.8h\n"
+ "fmla v28.8h, v4.8h, v12.8h\n"
+ "add x20, x20, x10\n"
+ "tbz %x[n_channels], #2, 17f\n"
+ "ld1 { v9.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 16f\n"
+ "ld1 { v9.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 19f\n"
+ "ld1 { v9.h }[6], [x20], #0x2\n"
+ "b 19f\n"
+ "16:" // Oddments: Load input (0, 5): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 19f\n"
+ "ld1 { v9.h }[4], [x20], #0x2\n"
+ "b 19f\n"
+ "17:" // Oddments: Load input (0, 5): Bit 2: Unset
+ "tbz %x[n_channels], #1, 18f\n"
+ "ld1 { v9.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 19f\n"
+ "ld1 { v9.h }[2], [x20], #0x2\n"
+ "b 19f\n"
+ "18:" // Oddments: Load input (0, 5): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v9.h }[0], [x20], #0x2\n"
+ "19:" // Oddments: Load input (0, 5): Bit 2: End
+ "ldr q0, [x16, #0x0]\n"
+ "fmla v29.8h, v4.8h, v9.8h\n"
+ "fmla v30.8h, v4.8h, v6.8h\n"
+ "ldr x20, [x15, #0x68]\n"
+ "fmla v31.8h, v4.8h, v10.8h\n"
+ "fmla v28.8h, v0.8h, v7.8h\n"
+ "add x20, x20, x10\n"
+ "fmla v29.8h, v0.8h, v8.8h\n"
+ "fmla v30.8h, v0.8h, v14.8h\n"
+ "add x16, x16, #0x10\n"
+ "tbz %x[n_channels], #2, 21f\n"
+ "ld1 { v11.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 20f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 23f\n"
+ "ld1 { v11.h }[6], [x20], #0x2\n"
+ "b 23f\n"
+ "20:" // Oddments: Load input (2, 1): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 23f\n"
+ "ld1 { v11.h }[4], [x20], #0x2\n"
+ "b 23f\n"
+ "21:" // Oddments: Load input (2, 1): Bit 2: Unset
+ "tbz %x[n_channels], #1, 22f\n"
+ "ld1 { v11.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 23f\n"
+ "ld1 { v11.h }[2], [x20], #0x2\n"
+ "b 23f\n"
+ "22:" // Oddments: Load input (2, 1): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v11.h }[0], [x20], #0x2\n"
+ "23:" // Oddments: Load input (2, 1): Bit 2: End
+ "ldr q1, [x16, #0x0]\n"
+ "ldr x20, [x15, #0x70]\n"
+ "fmla v31.8h, v0.8h, v11.8h\n"
+ "fmla v28.8h, v1.8h, v8.8h\n"
+ "fmla v29.8h, v1.8h, v13.8h\n"
+ "fmla v30.8h, v1.8h, v11.8h\n"
+ "add x20, x20, x10\n"
+ "add x16, x16, #0x10\n"
+ "tbz %x[n_channels], #2, 25f\n"
+ "ld1 { v12.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 24f\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 27f\n"
+ "ld1 { v12.h }[6], [x20], #0x2\n"
+ "b 27f\n"
+ "24:" // Oddments: Load input (2, 2): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 27f\n"
+ "ld1 { v12.h }[4], [x20], #0x2\n"
+ "b 27f\n"
+ "25:" // Oddments: Load input (2, 2): Bit 2: Unset
+ "tbz %x[n_channels], #1, 26f\n"
+ "ld1 { v12.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 27f\n"
+ "ld1 { v12.h }[2], [x20], #0x2\n"
+ "b 27f\n"
+ "26:" // Oddments: Load input (2, 2): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v12.h }[0], [x20], #0x2\n"
+ "27:" // Oddments: Load input (2, 2): Bit 2: End
+ "ldr q2, [x16, #0x0]\n"
+ "ldr x20, [x15, #0x78]\n"
+ "fmla v31.8h, v1.8h, v12.8h\n"
+ "fmla v28.8h, v2.8h, v13.8h\n"
+ "fmla v29.8h, v2.8h, v5.8h\n"
+ "fmla v30.8h, v2.8h, v12.8h\n"
+ "add x20, x20, x10\n"
+ "add x16, x16, #0x10\n"
+ "tbz %x[n_channels], #2, 29f\n"
+ "ld1 { v9.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 28f\n"
+ "ld1 { v9.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 31f\n"
+ "ld1 { v9.h }[6], [x20], #0x2\n"
+ "b 31f\n"
+ "28:" // Oddments: Load input (2, 3): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 31f\n"
+ "ld1 { v9.h }[4], [x20], #0x2\n"
+ "b 31f\n"
+ "29:" // Oddments: Load input (2, 3): Bit 2: Unset
+ "tbz %x[n_channels], #1, 30f\n"
+ "ld1 { v9.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 31f\n"
+ "ld1 { v9.h }[2], [x20], #0x2\n"
+ "b 31f\n"
+ "30:" // Oddments: Load input (2, 3): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v9.h }[0], [x20], #0x2\n"
+ "31:" // Oddments: Load input (2, 3): Bit 2: End
+ "ldr q3, [x16, #0x0]\n"
+ "ldr x20, [x15, #0x80]\n"
+ "fmla v31.8h, v2.8h, v9.8h\n"
+ "fmla v28.8h, v3.8h, v5.8h\n"
+ "fmla v29.8h, v3.8h, v6.8h\n"
+ "fmla v30.8h, v3.8h, v9.8h\n"
+ "add x20, x20, x10\n"
+ "add x16, x16, #0x10\n"
+ "tbz %x[n_channels], #2, 33f\n"
+ "ld1 { v13.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 32f\n"
+ "ld1 { v13.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 35f\n"
+ "ld1 { v13.h }[6], [x20], #0x2\n"
+ "b 35f\n"
+ "32:" // Oddments: Load input (2, 4): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 35f\n"
+ "ld1 { v13.h }[4], [x20], #0x2\n"
+ "b 35f\n"
+ "33:" // Oddments: Load input (2, 4): Bit 2: Unset
+ "tbz %x[n_channels], #1, 34f\n"
+ "ld1 { v13.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 35f\n"
+ "ld1 { v13.h }[2], [x20], #0x2\n"
+ "b 35f\n"
+ "34:" // Oddments: Load input (2, 4): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v13.h }[0], [x20], #0x2\n"
+ "35:" // Oddments: Load input (2, 4): Bit 2: End
+ "ldr q4, [x16, #0x0]\n"
+ "ldr x20, [x15, #0x88]\n"
+ "fmla v31.8h, v3.8h, v13.8h\n"
+ "fmla v28.8h, v4.8h, v6.8h\n"
+ "fmla v29.8h, v4.8h, v10.8h\n"
+ "fmla v30.8h, v4.8h, v13.8h\n"
+ "add x20, x20, x10\n"
+ "add x16, x16, #0x10\n"
+ "tbz %x[n_channels], #2, 37f\n"
+ "ld1 { v8.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 36f\n"
+ "ld1 { v8.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 39f\n"
+ "ld1 { v8.h }[6], [x20], #0x2\n"
+ "b 39f\n"
+ "36:" // Oddments: Load input (2, 5): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 39f\n"
+ "ld1 { v8.h }[4], [x20], #0x2\n"
+ "b 39f\n"
+ "37:" // Oddments: Load input (2, 5): Bit 2: Unset
+ "tbz %x[n_channels], #1, 38f\n"
+ "ld1 { v8.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 39f\n"
+ "ld1 { v8.h }[2], [x20], #0x2\n"
+ "b 39f\n"
+ "38:" // Oddments: Load input (2, 5): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v8.h }[0], [x20], #0x2\n"
+ "39:" // Oddments: Load input (2, 5): Bit 2: End
+ "ldr q0, [x16, #0x0]\n"
+ "ldr x20, [x15, #0x90]\n"
+ "fmla v31.8h, v4.8h, v8.8h\n"
+ "fmla v28.8h, v0.8h, v14.8h\n"
+ "fmla v29.8h, v0.8h, v11.8h\n"
+ "add x20, x20, x10\n"
+ "add x16, x16, #0x10\n"
+ "tbz %x[n_channels], #2, 41f\n"
+ "ld1 { v5.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 40f\n"
+ "ld1 { v5.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 43f\n"
+ "ld1 { v5.h }[6], [x20], #0x2\n"
+ "b 43f\n"
+ "40:" // Oddments: Load input (3, 0): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 43f\n"
+ "ld1 { v5.h }[4], [x20], #0x2\n"
+ "b 43f\n"
+ "41:" // Oddments: Load input (3, 0): Bit 2: Unset
+ "tbz %x[n_channels], #1, 42f\n"
+ "ld1 { v5.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 43f\n"
+ "ld1 { v5.h }[2], [x20], #0x2\n"
+ "b 43f\n"
+ "42:" // Oddments: Load input (3, 0): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v5.h }[0], [x20], #0x2\n"
+ "43:" // Oddments: Load input (3, 0): Bit 2: End
+ "ldr x20, [x15, #0x98]\n"
+ "fmla v30.8h, v0.8h, v5.8h\n"
+ "add x20, x20, x10\n"
+ "tbz %x[n_channels], #2, 45f\n"
+ "ld1 { v6.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 44f\n"
+ "ld1 { v6.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 47f\n"
+ "ld1 { v6.h }[6], [x20], #0x2\n"
+ "b 47f\n"
+ "44:" // Oddments: Load input (3, 1): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 47f\n"
+ "ld1 { v6.h }[4], [x20], #0x2\n"
+ "b 47f\n"
+ "45:" // Oddments: Load input (3, 1): Bit 2: Unset
+ "tbz %x[n_channels], #1, 46f\n"
+ "ld1 { v6.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 47f\n"
+ "ld1 { v6.h }[2], [x20], #0x2\n"
+ "b 47f\n"
+ "46:" // Oddments: Load input (3, 1): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v6.h }[0], [x20], #0x2\n"
+ "47:" // Oddments: Load input (3, 1): Bit 2: End
+ "ldr q1, [x16, #0x0]\n"
+ "ldr x20, [x15, #0xa0]\n"
+ "fmla v31.8h, v0.8h, v6.8h\n"
+ "fmla v28.8h, v1.8h, v11.8h\n"
+ "fmla v29.8h, v1.8h, v12.8h\n"
+ "fmla v30.8h, v1.8h, v6.8h\n"
+ "add x20, x20, x10\n"
+ "add x16, x16, #0x10\n"
+ "tbz %x[n_channels], #2, 49f\n"
+ "ld1 { v10.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 48f\n"
+ "ld1 { v10.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 51f\n"
+ "ld1 { v10.h }[6], [x20], #0x2\n"
+ "b 51f\n"
+ "48:" // Oddments: Load input (3, 2): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 51f\n"
+ "ld1 { v10.h }[4], [x20], #0x2\n"
+ "b 51f\n"
+ "49:" // Oddments: Load input (3, 2): Bit 2: Unset
+ "tbz %x[n_channels], #1, 50f\n"
+ "ld1 { v10.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 51f\n"
+ "ld1 { v10.h }[2], [x20], #0x2\n"
+ "b 51f\n"
+ "50:" // Oddments: Load input (3, 2): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v10.h }[0], [x20], #0x2\n"
+ "51:" // Oddments: Load input (3, 2): Bit 2: End
+ "ldr q2, [x16, #0x0]\n"
+ "ldr x20, [x15, #0xa8]\n"
+ "fmla v31.8h, v1.8h, v10.8h\n"
+ "fmla v28.8h, v2.8h, v12.8h\n"
+ "fmla v29.8h, v2.8h, v9.8h\n"
+ "fmla v30.8h, v2.8h, v10.8h\n"
+ "add x20, x20, x10\n"
+ "add x16, x16, #0x10\n"
+ "tbz %x[n_channels], #2, 53f\n"
+ "ld1 { v11.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 52f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 55f\n"
+ "ld1 { v11.h }[6], [x20], #0x2\n"
+ "b 55f\n"
+ "52:" // Oddments: Load input (3, 3): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 55f\n"
+ "ld1 { v11.h }[4], [x20], #0x2\n"
+ "b 55f\n"
+ "53:" // Oddments: Load input (3, 3): Bit 2: Unset
+ "tbz %x[n_channels], #1, 54f\n"
+ "ld1 { v11.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 55f\n"
+ "ld1 { v11.h }[2], [x20], #0x2\n"
+ "b 55f\n"
+ "54:" // Oddments: Load input (3, 3): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v11.h }[0], [x20], #0x2\n"
+ "55:" // Oddments: Load input (3, 3): Bit 2: End
+ "ldr q3, [x16, #0x0]\n"
+ "ldr x20, [x15, #0xb0]\n"
+ "fmla v31.8h, v2.8h, v11.8h\n"
+ "fmla v28.8h, v3.8h, v9.8h\n"
+ "fmla v29.8h, v3.8h, v13.8h\n"
+ "fmla v30.8h, v3.8h, v11.8h\n"
+ "add x20, x20, x10\n"
+ "add x16, x16, #0x10\n"
+ "tbz %x[n_channels], #2, 57f\n"
+ "ld1 { v12.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 56f\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 59f\n"
+ "ld1 { v12.h }[6], [x20], #0x2\n"
+ "b 59f\n"
+ "56:" // Oddments: Load input (3, 4): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 59f\n"
+ "ld1 { v12.h }[4], [x20], #0x2\n"
+ "b 59f\n"
+ "57:" // Oddments: Load input (3, 4): Bit 2: Unset
+ "tbz %x[n_channels], #1, 58f\n"
+ "ld1 { v12.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 59f\n"
+ "ld1 { v12.h }[2], [x20], #0x2\n"
+ "b 59f\n"
+ "58:" // Oddments: Load input (3, 4): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v12.h }[0], [x20], #0x2\n"
+ "59:" // Oddments: Load input (3, 4): Bit 2: End
+ "ldr q4, [x16, #0x0]\n"
+ "ldr x20, [x15, #0xb8]\n"
+ "fmla v31.8h, v3.8h, v12.8h\n"
+ "fmla v28.8h, v4.8h, v13.8h\n"
+ "fmla v29.8h, v4.8h, v8.8h\n"
+ "fmla v30.8h, v4.8h, v12.8h\n"
+ "add x20, x20, x10\n"
+ "add x16, x16, #0x10\n"
+ "tbz %x[n_channels], #2, 61f\n"
+ "ld1 { v14.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 60f\n"
+ "ld1 { v14.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 63f\n"
+ "ld1 { v14.h }[6], [x20], #0x2\n"
+ "b 63f\n"
+ "60:" // Oddments: Load input (3, 5): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 63f\n"
+ "ld1 { v14.h }[4], [x20], #0x2\n"
+ "b 63f\n"
+ "61:" // Oddments: Load input (3, 5): Bit 2: Unset
+ "tbz %x[n_channels], #1, 62f\n"
+ "ld1 { v14.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 63f\n"
+ "ld1 { v14.h }[2], [x20], #0x2\n"
+ "b 63f\n"
+ "62:" // Oddments: Load input (3, 5): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v14.h }[0], [x20], #0x2\n"
+ "63:" // Oddments: Load input (3, 5): Bit 2: End
+ "ldr q0, [x16, #0x0]\n"
+ "ldr x20, [x15, #0xc0]\n"
+ "fmla v31.8h, v4.8h, v14.8h\n"
+ "fmla v28.8h, v0.8h, v5.8h\n"
+ "fmla v29.8h, v0.8h, v6.8h\n"
+ "add x20, x20, x10\n"
+ "add x16, x16, #0x10\n"
+ "tbz %x[n_channels], #2, 65f\n"
+ "ld1 { v9.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 64f\n"
+ "ld1 { v9.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 67f\n"
+ "ld1 { v9.h }[6], [x20], #0x2\n"
+ "b 67f\n"
+ "64:" // Oddments: Load input (4, 0): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 67f\n"
+ "ld1 { v9.h }[4], [x20], #0x2\n"
+ "b 67f\n"
+ "65:" // Oddments: Load input (4, 0): Bit 2: Unset
+ "tbz %x[n_channels], #1, 66f\n"
+ "ld1 { v9.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 67f\n"
+ "ld1 { v9.h }[2], [x20], #0x2\n"
+ "b 67f\n"
+ "66:" // Oddments: Load input (4, 0): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v9.h }[0], [x20], #0x2\n"
+ "67:" // Oddments: Load input (4, 0): Bit 2: End
+ "ldr x20, [x15, #0xc8]\n"
+ "fmla v30.8h, v0.8h, v9.8h\n"
+ "add x20, x20, x10\n"
+ "tbz %x[n_channels], #2, 69f\n"
+ "ld1 { v13.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 68f\n"
+ "ld1 { v13.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 71f\n"
+ "ld1 { v13.h }[6], [x20], #0x2\n"
+ "b 71f\n"
+ "68:" // Oddments: Load input (4, 1): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 71f\n"
+ "ld1 { v13.h }[4], [x20], #0x2\n"
+ "b 71f\n"
+ "69:" // Oddments: Load input (4, 1): Bit 2: Unset
+ "tbz %x[n_channels], #1, 70f\n"
+ "ld1 { v13.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 71f\n"
+ "ld1 { v13.h }[2], [x20], #0x2\n"
+ "b 71f\n"
+ "70:" // Oddments: Load input (4, 1): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v13.h }[0], [x20], #0x2\n"
+ "71:" // Oddments: Load input (4, 1): Bit 2: End
+ "ldr q1, [x16, #0x0]\n"
+ "ldr x20, [x15, #0xd0]\n"
+ "fmla v31.8h, v0.8h, v13.8h\n"
+ "fmla v28.8h, v1.8h, v6.8h\n"
+ "fmla v29.8h, v1.8h, v10.8h\n"
+ "fmla v30.8h, v1.8h, v13.8h\n"
+ "add x20, x20, x10\n"
+ "add x16, x16, #0x10\n"
+ "tbz %x[n_channels], #2, 73f\n"
+ "ld1 { v5.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 72f\n"
+ "ld1 { v5.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 75f\n"
+ "ld1 { v5.h }[6], [x20], #0x2\n"
+ "b 75f\n"
+ "72:" // Oddments: Load input (4, 2): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 75f\n"
+ "ld1 { v5.h }[4], [x20], #0x2\n"
+ "b 75f\n"
+ "73:" // Oddments: Load input (4, 2): Bit 2: Unset
+ "tbz %x[n_channels], #1, 74f\n"
+ "ld1 { v5.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 75f\n"
+ "ld1 { v5.h }[2], [x20], #0x2\n"
+ "b 75f\n"
+ "74:" // Oddments: Load input (4, 2): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v5.h }[0], [x20], #0x2\n"
+ "75:" // Oddments: Load input (4, 2): Bit 2: End
+ "ldr q2, [x16, #0x0]\n"
+ "ldr x20, [x15, #0xd8]\n"
+ "fmla v31.8h, v1.8h, v5.8h\n"
+ "fmla v28.8h, v2.8h, v10.8h\n"
+ "fmla v29.8h, v2.8h, v11.8h\n"
+ "fmla v30.8h, v2.8h, v5.8h\n"
+ "add x20, x20, x10\n"
+ "add x16, x16, #0x10\n"
+ "tbz %x[n_channels], #2, 77f\n"
+ "ld1 { v6.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 76f\n"
+ "ld1 { v6.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 79f\n"
+ "ld1 { v6.h }[6], [x20], #0x2\n"
+ "b 79f\n"
+ "76:" // Oddments: Load input (4, 3): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 79f\n"
+ "ld1 { v6.h }[4], [x20], #0x2\n"
+ "b 79f\n"
+ "77:" // Oddments: Load input (4, 3): Bit 2: Unset
+ "tbz %x[n_channels], #1, 78f\n"
+ "ld1 { v6.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 79f\n"
+ "ld1 { v6.h }[2], [x20], #0x2\n"
+ "b 79f\n"
+ "78:" // Oddments: Load input (4, 3): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v6.h }[0], [x20], #0x2\n"
+ "79:" // Oddments: Load input (4, 3): Bit 2: End
+ "ldr q3, [x16, #0x0]\n"
+ "ldr x20, [x15, #0xe0]\n"
+ "fmla v31.8h, v2.8h, v6.8h\n"
+ "fmla v28.8h, v3.8h, v11.8h\n"
+ "fmla v29.8h, v3.8h, v12.8h\n"
+ "fmla v30.8h, v3.8h, v6.8h\n"
+ "add x20, x20, x10\n"
+ "add x16, x16, #0x10\n"
+ "tbz %x[n_channels], #2, 81f\n"
+ "ld1 { v8.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 80f\n"
+ "ld1 { v8.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 83f\n"
+ "ld1 { v8.h }[6], [x20], #0x2\n"
+ "b 83f\n"
+ "80:" // Oddments: Load input (4, 4): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 83f\n"
+ "ld1 { v8.h }[4], [x20], #0x2\n"
+ "b 83f\n"
+ "81:" // Oddments: Load input (4, 4): Bit 2: Unset
+ "tbz %x[n_channels], #1, 82f\n"
+ "ld1 { v8.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 83f\n"
+ "ld1 { v8.h }[2], [x20], #0x2\n"
+ "b 83f\n"
+ "82:" // Oddments: Load input (4, 4): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v8.h }[0], [x20], #0x2\n"
+ "83:" // Oddments: Load input (4, 4): Bit 2: End
+ "ldr q4, [x16, #0x0]\n"
+ "ldr x20, [x15, #0xe8]\n"
+ "fmla v31.8h, v3.8h, v8.8h\n"
+ "fmla v28.8h, v4.8h, v12.8h\n"
+ "fmla v29.8h, v4.8h, v14.8h\n"
+ "fmla v30.8h, v4.8h, v8.8h\n"
+ "add x20, x20, x10\n"
+ "add x16, x16, #0x10\n"
+ "tbz %x[n_channels], #2, 85f\n"
+ "ld1 { v10.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 84f\n"
+ "ld1 { v10.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 87f\n"
+ "ld1 { v10.h }[6], [x20], #0x2\n"
+ "b 87f\n"
+ "84:" // Oddments: Load input (4, 5): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 87f\n"
+ "ld1 { v10.h }[4], [x20], #0x2\n"
+ "b 87f\n"
+ "85:" // Oddments: Load input (4, 5): Bit 2: Unset
+ "tbz %x[n_channels], #1, 86f\n"
+ "ld1 { v10.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 87f\n"
+ "ld1 { v10.h }[2], [x20], #0x2\n"
+ "b 87f\n"
+ "86:" // Oddments: Load input (4, 5): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v10.h }[0], [x20], #0x2\n"
+ "87:" // Oddments: Load input (4, 5): Bit 2: End
+ "ldr q0, [x16, #0x0]\n"
+ "ldr x20, [x15, #0xf0]\n"
+ "fmla v31.8h, v4.8h, v10.8h\n"
+ "fmla v28.8h, v0.8h, v9.8h\n"
+ "fmla v29.8h, v0.8h, v13.8h\n"
+ "add x20, x20, x10\n"
+ "add x16, x16, #0x10\n"
+ "tbz %x[n_channels], #2, 89f\n"
+ "ld1 { v11.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 88f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 91f\n"
+ "ld1 { v11.h }[6], [x20], #0x2\n"
+ "b 91f\n"
+ "88:" // Oddments: Load input (5, 0): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 91f\n"
+ "ld1 { v11.h }[4], [x20], #0x2\n"
+ "b 91f\n"
+ "89:" // Oddments: Load input (5, 0): Bit 2: Unset
+ "tbz %x[n_channels], #1, 90f\n"
+ "ld1 { v11.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 91f\n"
+ "ld1 { v11.h }[2], [x20], #0x2\n"
+ "b 91f\n"
+ "90:" // Oddments: Load input (5, 0): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v11.h }[0], [x20], #0x2\n"
+ "91:" // Oddments: Load input (5, 0): Bit 2: End
+ "ldr x20, [x15, #0xf8]\n"
+ "fmla v30.8h, v0.8h, v11.8h\n"
+ "add x20, x20, x10\n"
+ "tbz %x[n_channels], #2, 93f\n"
+ "ld1 { v12.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 92f\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 95f\n"
+ "ld1 { v12.h }[6], [x20], #0x2\n"
+ "b 95f\n"
+ "92:" // Oddments: Load input (5, 1): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 95f\n"
+ "ld1 { v12.h }[4], [x20], #0x2\n"
+ "b 95f\n"
+ "93:" // Oddments: Load input (5, 1): Bit 2: Unset
+ "tbz %x[n_channels], #1, 94f\n"
+ "ld1 { v12.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 95f\n"
+ "ld1 { v12.h }[2], [x20], #0x2\n"
+ "b 95f\n"
+ "94:" // Oddments: Load input (5, 1): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v12.h }[0], [x20], #0x2\n"
+ "95:" // Oddments: Load input (5, 1): Bit 2: End
+ "ldr q1, [x16, #0x0]\n"
+ "ldr x20, [x15, #0x100]\n"
+ "fmla v31.8h, v0.8h, v12.8h\n"
+ "fmla v28.8h, v1.8h, v13.8h\n"
+ "fmla v29.8h, v1.8h, v5.8h\n"
+ "fmla v30.8h, v1.8h, v12.8h\n"
+ "add x20, x20, x10\n"
+ "add x16, x16, #0x10\n"
+ "tbz %x[n_channels], #2, 97f\n"
+ "ld1 { v9.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 96f\n"
+ "ld1 { v9.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 99f\n"
+ "ld1 { v9.h }[6], [x20], #0x2\n"
+ "b 99f\n"
+ "96:" // Oddments: Load input (5, 2): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 99f\n"
+ "ld1 { v9.h }[4], [x20], #0x2\n"
+ "b 99f\n"
+ "97:" // Oddments: Load input (5, 2): Bit 2: Unset
+ "tbz %x[n_channels], #1, 98f\n"
+ "ld1 { v9.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 99f\n"
+ "ld1 { v9.h }[2], [x20], #0x2\n"
+ "b 99f\n"
+ "98:" // Oddments: Load input (5, 2): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v9.h }[0], [x20], #0x2\n"
+ "99:" // Oddments: Load input (5, 2): Bit 2: End
+ "ldr q2, [x16, #0x0]\n"
+ "ldr x20, [x15, #0x108]\n"
+ "fmla v31.8h, v1.8h, v9.8h\n"
+ "fmla v28.8h, v2.8h, v5.8h\n"
+ "fmla v29.8h, v2.8h, v6.8h\n"
+ "fmla v30.8h, v2.8h, v9.8h\n"
+ "add x20, x20, x10\n"
+ "add x16, x16, #0x10\n"
+ "tbz %x[n_channels], #2, 101f\n"
+ "ld1 { v11.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 100f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 103f\n"
+ "ld1 { v11.h }[6], [x20], #0x2\n"
+ "b 103f\n"
+ "100:" // Oddments: Load input (5, 3): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 103f\n"
+ "ld1 { v11.h }[4], [x20], #0x2\n"
+ "b 103f\n"
+ "101:" // Oddments: Load input (5, 3): Bit 2: Unset
+ "tbz %x[n_channels], #1, 102f\n"
+ "ld1 { v11.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 103f\n"
+ "ld1 { v11.h }[2], [x20], #0x2\n"
+ "b 103f\n"
+ "102:" // Oddments: Load input (5, 3): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v11.h }[0], [x20], #0x2\n"
+ "103:" // Oddments: Load input (5, 3): Bit 2: End
+ "ldr q3, [x16, #0x0]\n"
+ "ldr x20, [x15, #0x110]\n"
+ "fmla v31.8h, v2.8h, v11.8h\n"
+ "fmla v28.8h, v3.8h, v6.8h\n"
+ "fmla v29.8h, v3.8h, v8.8h\n"
+ "fmla v30.8h, v3.8h, v11.8h\n"
+ "add x20, x20, x10\n"
+ "add x16, x16, #0x10\n"
+ "tbz %x[n_channels], #2, 105f\n"
+ "ld1 { v12.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 104f\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 107f\n"
+ "ld1 { v12.h }[6], [x20], #0x2\n"
+ "b 107f\n"
+ "104:" // Oddments: Load input (5, 4): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 107f\n"
+ "ld1 { v12.h }[4], [x20], #0x2\n"
+ "b 107f\n"
+ "105:" // Oddments: Load input (5, 4): Bit 2: Unset
+ "tbz %x[n_channels], #1, 106f\n"
+ "ld1 { v12.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 107f\n"
+ "ld1 { v12.h }[2], [x20], #0x2\n"
+ "b 107f\n"
+ "106:" // Oddments: Load input (5, 4): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v12.h }[0], [x20], #0x2\n"
+ "107:" // Oddments: Load input (5, 4): Bit 2: End
+ "ldr q4, [x16, #0x0]\n"
+ "ldr x20, [x15, #0x118]\n"
+ "fmla v31.8h, v3.8h, v12.8h\n"
+ "fmla v28.8h, v4.8h, v8.8h\n"
+ "fmla v29.8h, v4.8h, v10.8h\n"
+ "fmla v30.8h, v4.8h, v12.8h\n"
+ "add x20, x20, x10\n"
+ "tbz %x[n_channels], #2, 109f\n"
+ "ld1 { v9.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 108f\n"
+ "ld1 { v9.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 111f\n"
+ "ld1 { v9.h }[6], [x20], #0x2\n"
+ "b 111f\n"
+ "108:" // Oddments: Load input (5, 5): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 111f\n"
+ "ld1 { v9.h }[4], [x20], #0x2\n"
+ "b 111f\n"
+ "109:" // Oddments: Load input (5, 5): Bit 2: Unset
+ "tbz %x[n_channels], #1, 110f\n"
+ "ld1 { v9.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 111f\n"
+ "ld1 { v9.h }[2], [x20], #0x2\n"
+ "b 111f\n"
+ "110:" // Oddments: Load input (5, 5): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v9.h }[0], [x20], #0x2\n"
+ "111:" // Oddments: Load input (5, 5): Bit 2: End
+ "fmla v31.8h, v4.8h, v9.8h\n"
+ "fmax v28.8h, v28.8h, v27.8h\n"
+ "fmax v29.8h, v29.8h, v27.8h\n"
+ "fmax v30.8h, v30.8h, v27.8h\n"
+ "fmax v31.8h, v31.8h, v27.8h\n"
+ "fmin v28.8h, v28.8h, v15.8h\n"
+ "fmin v29.8h, v29.8h, v15.8h\n"
+ "fmin v30.8h, v30.8h, v15.8h\n"
+ "fmin v31.8h, v31.8h, v15.8h\n"
+ "tbz %x[n_channels], #2, 113f\n"
+ "st1 { v28.d }[0], [x14], #0x8\n"
+ "st1 { v29.d }[0], [x13], #0x8\n"
+ "st1 { v30.d }[0], [x12], #0x8\n"
+ "st1 { v31.d }[0], [x11], #0x8\n"
+ "tbz %x[n_channels], #1, 112f\n"
+ "st1 { v28.s }[2], [x14], #0x4\n"
+ "st1 { v29.s }[2], [x13], #0x4\n"
+ "st1 { v30.s }[2], [x12], #0x4\n"
+ "st1 { v31.s }[2], [x11], #0x4\n"
+ "tbz %x[n_channels], #0, 115f\n"
+ "st1 { v28.h }[6], [x14], #0x2\n"
+ "st1 { v29.h }[6], [x13], #0x2\n"
+ "st1 { v30.h }[6], [x12], #0x2\n"
+ "st1 { v31.h }[6], [x11], #0x2\n"
+ "b 115f\n"
+ "112:" // Oddments: Store: Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 115f\n"
+ "st1 { v28.h }[4], [x14], #0x2\n"
+ "st1 { v29.h }[4], [x13], #0x2\n"
+ "st1 { v30.h }[4], [x12], #0x2\n"
+ "st1 { v31.h }[4], [x11], #0x2\n"
+ "b 115f\n"
+ "113:" // Oddments: Store: Bit 2: Unset
+ "tbz %x[n_channels], #1, 114f\n"
+ "st1 { v28.s }[0], [x14], #0x4\n"
+ "st1 { v29.s }[0], [x13], #0x4\n"
+ "st1 { v30.s }[0], [x12], #0x4\n"
+ "st1 { v31.s }[0], [x11], #0x4\n"
+ "tbz %x[n_channels], #0, 115f\n"
+ "st1 { v28.h }[2], [x14], #0x2\n"
+ "st1 { v29.h }[2], [x13], #0x2\n"
+ "st1 { v30.h }[2], [x12], #0x2\n"
+ "st1 { v31.h }[2], [x11], #0x2\n"
+ "b 115f\n"
+ "114:" // Oddments: Store: Bit 2: Unset: Bit 1: Unset
+ "st1 { v28.h }[0], [x14], #0x2\n"
+ "st1 { v29.h }[0], [x13], #0x2\n"
+ "st1 { v30.h }[0], [x12], #0x2\n"
+ "st1 { v31.h }[0], [x11], #0x2\n"
+ "115:" // Oddments: Store: Bit 2: End
+ "116:" // End
+ :
+ : [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_generic_output9_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_generic_output9_mla_depthfirst.hpp
new file mode 100644
index 0000000000..b7608af721
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_generic_output9_mla_depthfirst.hpp
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#include "utils.hpp"
+
+#include <cstdint>
+
+#if defined(__aarch64__) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_fp16_nhwc_generic_output9_mla_depthfirst_impl(const __fp16 *const *const, __fp16 *const *const, const void *, const void *, const unsigned int, const unsigned int, const __fp16, const __fp16);  // asm kernel entry point (inptrs, outptrs, params, bias, n_points, n_channels, act_min, act_max); defined in generic.cpp
+
+class a64_fp16_nhwc_generic_output9_mla_depthfirst : public GenericDepthfirstKernelStrategy<__fp16, __fp16, __fp16, __fp16>  // strategy wrapper selecting the FP16 NHWC generic 9-output MLA depthfirst kernel
+{
+  KernelType kernel = a64_fp16_nhwc_generic_output9_mla_depthfirst_impl;
+
+  public:
+  a64_fp16_nhwc_generic_output9_mla_depthfirst(const CPUInfo *) : GenericDepthfirstKernelStrategy<__fp16, __fp16, __fp16, __fp16>(9, arm_gemm::VLType::None) {}  // 9 output points per call; VLType::None = fixed-width NEON, not scalable vectors
+
+  KernelType get_kernel() const override { return kernel; }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_generic_output9_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_generic_output9_mla_depthfirst/generic.cpp
new file mode 100644
index 0000000000..08f40b785f
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_generic_output9_mla_depthfirst/generic.cpp
@@ -0,0 +1,520 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__aarch64__) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_fp16_nhwc_generic_output9_mla_depthfirst_impl(
+ const __fp16 *const *const inptrs,
+ __fp16 *const *const outptrs,
+ const void *params,
+ const void *bias,
+ const unsigned int n_points,
+ const unsigned int n_channels,
+ const __fp16 activation_min,
+ const __fp16 activation_max
+)
+{
+ const __fp16 minmax_vals[2] = { activation_min, activation_max };
+
+ __asm__ __volatile__(
+ "ld1r { v2.8h }, [%x[minmax_vals]]\n"
+ "lsr x9, %x[n_channels], #0x3\n"
+ "add x20, %x[minmax_vals], #0x2\n"
+ "ld1r { v1.8h }, [x20]\n"
+ "mov x11, #0x0\n"
+ "cbz x9, 5f\n"
+ "1:" // Channel loop
+ "movi v23.16b, #0x0\n"
+ "cbz %x[bias], 2f\n"
+ "ldr q23, [%x[bias], x11]\n"
+ "2:" // Channel loop: Load bias: Done
+ "ldr q0, [%x[params], #0x0]\n"
+ "mov x26, %x[inptrs]\n"
+ "ldp x21, x20, [x26], #0x10\n"
+ "subs x25, %x[n_points], #0x1\n"
+ "ldr q14, [x21, x11]\n"
+ "ldr q15, [x20, x11]\n"
+ "mov v24.16b, v23.16b\n"
+ "mov v25.16b, v23.16b\n"
+ "ldp x21, x20, [x26], #0x10\n"
+ "ldr q16, [x21, x11]\n"
+ "mov v26.16b, v23.16b\n"
+ "mov v27.16b, v23.16b\n"
+ "ldr q17, [x20, x11]\n"
+ "ldp x21, x20, [x26], #0x10\n"
+ "mov v28.16b, v23.16b\n"
+ "mov v29.16b, v23.16b\n"
+ "ldr q18, [x21, x11]\n"
+ "ldr q19, [x20, x11]\n"
+ "mov v30.16b, v23.16b\n"
+ "mov v31.16b, v23.16b\n"
+ "ldp x21, x20, [x26], #0x10\n"
+ "ldr q20, [x21, x11]\n"
+ "add %x[params], %x[params], #0x10\n"
+ "ldr q21, [x20, x11]\n"
+ "ldr x20, [x26], #0x8\n"
+ "ldr q22, [x20, x11]\n"
+ "ble 4f\n"
+ "3:" // Channel loop: Planar loop
+ "ldp x20, x24, [x26], #0x10\n"
+ "ldp x23, x22, [x26], #0x10\n"
+ "subs x25, x25, #0x1\n"
+ "fmla v23.8h, v14.8h, v0.8h\n"
+ "ldr q14, [x20, x11]\n"
+ "ldp x21, x20, [x26], #0x10\n"
+ "fmla v24.8h, v15.8h, v0.8h\n"
+ "fmla v25.8h, v16.8h, v0.8h\n"
+ "ldr q15, [x24, x11]\n"
+ "ldr q16, [x23, x11]\n"
+ "fmla v26.8h, v17.8h, v0.8h\n"
+ "fmla v27.8h, v18.8h, v0.8h\n"
+ "ldr q17, [x22, x11]\n"
+ "ldr q18, [x21, x11]\n"
+ "fmla v28.8h, v19.8h, v0.8h\n"
+ "fmla v29.8h, v20.8h, v0.8h\n"
+ "ldr q19, [x20, x11]\n"
+ "ldp x21, x20, [x26], #0x10\n"
+ "fmla v30.8h, v21.8h, v0.8h\n"
+ "fmla v31.8h, v22.8h, v0.8h\n"
+ "ldr q0, [%x[params], #0x0]\n"
+ "ldr q20, [x21, x11]\n"
+ "add %x[params], %x[params], #0x10\n"
+ "ldr q21, [x20, x11]\n"
+ "ldr x20, [x26], #0x8\n"
+ "ldr q22, [x20, x11]\n"
+ "bgt 3b\n"
+ "4:" // Channel loop: Planar tail
+ "fmla v23.8h, v14.8h, v0.8h\n"
+ "fmla v24.8h, v15.8h, v0.8h\n"
+ "fmax v23.8h, v23.8h, v2.8h\n"
+ "ldp x28, x27, [%x[outptrs], #0x0]\n"
+ "fmla v25.8h, v16.8h, v0.8h\n"
+ "fmla v26.8h, v17.8h, v0.8h\n"
+ "fmax v24.8h, v24.8h, v2.8h\n"
+ "ldp x26, x25, [%x[outptrs], #0x10]\n"
+ "fmla v27.8h, v18.8h, v0.8h\n"
+ "fmla v28.8h, v19.8h, v0.8h\n"
+ "fmax v25.8h, v25.8h, v2.8h\n"
+ "ldp x24, x23, [%x[outptrs], #0x20]\n"
+ "fmla v29.8h, v20.8h, v0.8h\n"
+ "fmla v30.8h, v21.8h, v0.8h\n"
+ "fmax v26.8h, v26.8h, v2.8h\n"
+ "ldp x22, x21, [%x[outptrs], #0x30]\n"
+ "fmla v31.8h, v22.8h, v0.8h\n"
+ "fmax v27.8h, v27.8h, v2.8h\n"
+ "ldr x20, [%x[outptrs], #0x40]\n"
+ "fmax v28.8h, v28.8h, v2.8h\n"
+ "fmax v29.8h, v29.8h, v2.8h\n"
+ "fmax v30.8h, v30.8h, v2.8h\n"
+ "fmax v31.8h, v31.8h, v2.8h\n"
+ "fmin v23.8h, v23.8h, v1.8h\n"
+ "fmin v24.8h, v24.8h, v1.8h\n"
+ "str q23, [x28, x11]\n"
+ "fmin v25.8h, v25.8h, v1.8h\n"
+ "fmin v26.8h, v26.8h, v1.8h\n"
+ "str q24, [x27, x11]\n"
+ "fmin v27.8h, v27.8h, v1.8h\n"
+ "fmin v28.8h, v28.8h, v1.8h\n"
+ "str q25, [x26, x11]\n"
+ "fmin v29.8h, v29.8h, v1.8h\n"
+ "fmin v30.8h, v30.8h, v1.8h\n"
+ "str q26, [x25, x11]\n"
+ "fmin v31.8h, v31.8h, v1.8h\n"
+ "str q27, [x24, x11]\n"
+ "str q28, [x23, x11]\n"
+ "str q29, [x22, x11]\n"
+ "str q30, [x21, x11]\n"
+ "str q31, [x20, x11]\n"
+ "add x11, x11, #0x10\n"
+ "cmp x11, x9, LSL #4\n"
+ "blt 1b\n"
+ "5:" // Oddments
+ "tst %x[n_channels], #0x7\n"
+ "beq 25f\n"
+ "movi v23.16b, #0x0\n"
+ "cbz %x[bias], 10f\n"
+ "add x20, %x[bias], x11\n"
+ "tbz %x[n_channels], #2, 7f\n"
+ "ld1 { v23.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 6f\n"
+ "ld1 { v23.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 9f\n"
+ "ld1 { v23.h }[6], [x20], #0x2\n"
+ "b 9f\n"
+ "6:" // Oddments: Load bias: Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 9f\n"
+ "ld1 { v23.h }[4], [x20], #0x2\n"
+ "b 9f\n"
+ "7:" // Oddments: Load bias: Bit 2: Unset
+ "tbz %x[n_channels], #1, 8f\n"
+ "ld1 { v23.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 9f\n"
+ "ld1 { v23.h }[2], [x20], #0x2\n"
+ "b 9f\n"
+ "8:" // Oddments: Load bias: Bit 2: Unset: Bit 1: Unset
+ "ld1 { v23.h }[0], [x20], #0x2\n"
+ "9:" // Oddments: Load bias: Bit 2: End
+ "10:" // Oddments: Load bias: Done
+ "ldr q0, [%x[params], #0x0]\n"
+ "mov x10, %x[inptrs]\n"
+ "ldp x9, x28, [x10], #0x10\n"
+ "mov v24.16b, v23.16b\n"
+ "ldp x27, x26, [x10], #0x10\n"
+ "ldp x25, x24, [x10], #0x10\n"
+ "mov v25.16b, v23.16b\n"
+ "mov v26.16b, v23.16b\n"
+ "ldp x23, x22, [x10], #0x10\n"
+ "ldr x21, [x10], #0x8\n"
+ "mov v27.16b, v23.16b\n"
+ "mov v28.16b, v23.16b\n"
+ "mov v29.16b, v23.16b\n"
+ "mov v30.16b, v23.16b\n"
+ "add x9, x9, x11\n"
+ "add x28, x28, x11\n"
+ "mov v31.16b, v23.16b\n"
+ "add x27, x27, x11\n"
+ "add x26, x26, x11\n"
+ "add x25, x25, x11\n"
+ "add x24, x24, x11\n"
+ "add x23, x23, x11\n"
+ "add x22, x22, x11\n"
+ "add x21, x21, x11\n"
+ "add %x[params], %x[params], #0x10\n"
+ "tbz %x[n_channels], #2, 12f\n"
+ "ldr d14, [x9], #0x8\n"
+ "ldr d15, [x28], #0x8\n"
+ "ldr d16, [x27], #0x8\n"
+ "ldr d17, [x26], #0x8\n"
+ "ldr d18, [x25], #0x8\n"
+ "ldr d19, [x24], #0x8\n"
+ "ldr d20, [x23], #0x8\n"
+ "ldr d21, [x22], #0x8\n"
+ "ldr d22, [x21], #0x8\n"
+ "tbz %x[n_channels], #1, 11f\n"
+ "ld1 { v14.s }[2], [x9], #0x4\n"
+ "ld1 { v15.s }[2], [x28], #0x4\n"
+ "ld1 { v16.s }[2], [x27], #0x4\n"
+ "ld1 { v17.s }[2], [x26], #0x4\n"
+ "ld1 { v18.s }[2], [x25], #0x4\n"
+ "ld1 { v19.s }[2], [x24], #0x4\n"
+ "ld1 { v20.s }[2], [x23], #0x4\n"
+ "ld1 { v21.s }[2], [x22], #0x4\n"
+ "ld1 { v22.s }[2], [x21], #0x4\n"
+ "tbz %x[n_channels], #0, 14f\n"
+ "ld1 { v14.h }[6], [x9], #0x2\n"
+ "ld1 { v15.h }[6], [x28], #0x2\n"
+ "ld1 { v16.h }[6], [x27], #0x2\n"
+ "ld1 { v17.h }[6], [x26], #0x2\n"
+ "ld1 { v18.h }[6], [x25], #0x2\n"
+ "ld1 { v19.h }[6], [x24], #0x2\n"
+ "ld1 { v20.h }[6], [x23], #0x2\n"
+ "ld1 { v21.h }[6], [x22], #0x2\n"
+ "ld1 { v22.h }[6], [x21], #0x2\n"
+ "b 14f\n"
+ "11:" // Oddments: Load: Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 14f\n"
+ "ld1 { v14.h }[4], [x9], #0x2\n"
+ "ld1 { v15.h }[4], [x28], #0x2\n"
+ "ld1 { v16.h }[4], [x27], #0x2\n"
+ "ld1 { v17.h }[4], [x26], #0x2\n"
+ "ld1 { v18.h }[4], [x25], #0x2\n"
+ "ld1 { v19.h }[4], [x24], #0x2\n"
+ "ld1 { v20.h }[4], [x23], #0x2\n"
+ "ld1 { v21.h }[4], [x22], #0x2\n"
+ "ld1 { v22.h }[4], [x21], #0x2\n"
+ "b 14f\n"
+ "12:" // Oddments: Load: Bit 2: Unset
+ "tbz %x[n_channels], #1, 13f\n"
+ "ldr s14, [x9], #0x4\n"
+ "ldr s15, [x28], #0x4\n"
+ "ldr s16, [x27], #0x4\n"
+ "ldr s17, [x26], #0x4\n"
+ "ldr s18, [x25], #0x4\n"
+ "ldr s19, [x24], #0x4\n"
+ "ldr s20, [x23], #0x4\n"
+ "ldr s21, [x22], #0x4\n"
+ "ldr s22, [x21], #0x4\n"
+ "tbz %x[n_channels], #0, 14f\n"
+ "ld1 { v14.h }[2], [x9], #0x2\n"
+ "ld1 { v15.h }[2], [x28], #0x2\n"
+ "ld1 { v16.h }[2], [x27], #0x2\n"
+ "ld1 { v17.h }[2], [x26], #0x2\n"
+ "ld1 { v18.h }[2], [x25], #0x2\n"
+ "ld1 { v19.h }[2], [x24], #0x2\n"
+ "ld1 { v20.h }[2], [x23], #0x2\n"
+ "ld1 { v21.h }[2], [x22], #0x2\n"
+ "ld1 { v22.h }[2], [x21], #0x2\n"
+ "b 14f\n"
+ "13:" // Oddments: Load: Bit 2: Unset: Bit 1: Unset
+ "ldr h14, [x9], #0x2\n"
+ "ldr h15, [x28], #0x2\n"
+ "ldr h16, [x27], #0x2\n"
+ "ldr h17, [x26], #0x2\n"
+ "ldr h18, [x25], #0x2\n"
+ "ldr h19, [x24], #0x2\n"
+ "ldr h20, [x23], #0x2\n"
+ "ldr h21, [x22], #0x2\n"
+ "ldr h22, [x21], #0x2\n"
+ "14:" // Oddments: Load: Bit 2: End
+ "subs x20, %x[n_points], #0x1\n"
+ "ble 20f\n"
+ "15:" // Oddments: Planar loop
+ "ldp x9, x28, [x10], #0x10\n"
+ "ldp x27, x26, [x10], #0x10\n"
+ "fmla v23.8h, v14.8h, v0.8h\n"
+ "fmla v24.8h, v15.8h, v0.8h\n"
+ "ldp x25, x24, [x10], #0x10\n"
+ "ldp x23, x22, [x10], #0x10\n"
+ "fmla v25.8h, v16.8h, v0.8h\n"
+ "fmla v26.8h, v17.8h, v0.8h\n"
+ "ldr x21, [x10], #0x8\n"
+ "fmla v27.8h, v18.8h, v0.8h\n"
+ "fmla v28.8h, v19.8h, v0.8h\n"
+ "add x9, x9, x11\n"
+ "fmla v29.8h, v20.8h, v0.8h\n"
+ "fmla v30.8h, v21.8h, v0.8h\n"
+ "add x28, x28, x11\n"
+ "add x27, x27, x11\n"
+ "fmla v31.8h, v22.8h, v0.8h\n"
+ "ldr q0, [%x[params], #0x0]\n"
+ "add x26, x26, x11\n"
+ "add x25, x25, x11\n"
+ "add x24, x24, x11\n"
+ "add x23, x23, x11\n"
+ "add x22, x22, x11\n"
+ "add x21, x21, x11\n"
+ "add %x[params], %x[params], #0x10\n"
+ "tbz %x[n_channels], #2, 17f\n"
+ "ldr d14, [x9], #0x8\n"
+ "ldr d15, [x28], #0x8\n"
+ "ldr d16, [x27], #0x8\n"
+ "ldr d17, [x26], #0x8\n"
+ "ldr d18, [x25], #0x8\n"
+ "ldr d19, [x24], #0x8\n"
+ "ldr d20, [x23], #0x8\n"
+ "ldr d21, [x22], #0x8\n"
+ "ldr d22, [x21], #0x8\n"
+ "tbz %x[n_channels], #1, 16f\n"
+ "ld1 { v14.s }[2], [x9], #0x4\n"
+ "ld1 { v15.s }[2], [x28], #0x4\n"
+ "ld1 { v16.s }[2], [x27], #0x4\n"
+ "ld1 { v17.s }[2], [x26], #0x4\n"
+ "ld1 { v18.s }[2], [x25], #0x4\n"
+ "ld1 { v19.s }[2], [x24], #0x4\n"
+ "ld1 { v20.s }[2], [x23], #0x4\n"
+ "ld1 { v21.s }[2], [x22], #0x4\n"
+ "ld1 { v22.s }[2], [x21], #0x4\n"
+ "tbz %x[n_channels], #0, 19f\n"
+ "ld1 { v14.h }[6], [x9], #0x2\n"
+ "ld1 { v15.h }[6], [x28], #0x2\n"
+ "ld1 { v16.h }[6], [x27], #0x2\n"
+ "ld1 { v17.h }[6], [x26], #0x2\n"
+ "ld1 { v18.h }[6], [x25], #0x2\n"
+ "ld1 { v19.h }[6], [x24], #0x2\n"
+ "ld1 { v20.h }[6], [x23], #0x2\n"
+ "ld1 { v21.h }[6], [x22], #0x2\n"
+ "ld1 { v22.h }[6], [x21], #0x2\n"
+ "b 19f\n"
+ "16:" // Oddments: Planar loop: Load: Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 19f\n"
+ "ld1 { v14.h }[4], [x9], #0x2\n"
+ "ld1 { v15.h }[4], [x28], #0x2\n"
+ "ld1 { v16.h }[4], [x27], #0x2\n"
+ "ld1 { v17.h }[4], [x26], #0x2\n"
+ "ld1 { v18.h }[4], [x25], #0x2\n"
+ "ld1 { v19.h }[4], [x24], #0x2\n"
+ "ld1 { v20.h }[4], [x23], #0x2\n"
+ "ld1 { v21.h }[4], [x22], #0x2\n"
+ "ld1 { v22.h }[4], [x21], #0x2\n"
+ "b 19f\n"
+ "17:" // Oddments: Planar loop: Load: Bit 2: Unset
+ "tbz %x[n_channels], #1, 18f\n"
+ "ldr s14, [x9], #0x4\n"
+ "ldr s15, [x28], #0x4\n"
+ "ldr s16, [x27], #0x4\n"
+ "ldr s17, [x26], #0x4\n"
+ "ldr s18, [x25], #0x4\n"
+ "ldr s19, [x24], #0x4\n"
+ "ldr s20, [x23], #0x4\n"
+ "ldr s21, [x22], #0x4\n"
+ "ldr s22, [x21], #0x4\n"
+ "tbz %x[n_channels], #0, 19f\n"
+ "ld1 { v14.h }[2], [x9], #0x2\n"
+ "ld1 { v15.h }[2], [x28], #0x2\n"
+ "ld1 { v16.h }[2], [x27], #0x2\n"
+ "ld1 { v17.h }[2], [x26], #0x2\n"
+ "ld1 { v18.h }[2], [x25], #0x2\n"
+ "ld1 { v19.h }[2], [x24], #0x2\n"
+ "ld1 { v20.h }[2], [x23], #0x2\n"
+ "ld1 { v21.h }[2], [x22], #0x2\n"
+ "ld1 { v22.h }[2], [x21], #0x2\n"
+ "b 19f\n"
+ "18:" // Oddments: Planar loop: Load: Bit 2: Unset: Bit 1: Unset
+ "ldr h14, [x9], #0x2\n"
+ "ldr h15, [x28], #0x2\n"
+ "ldr h16, [x27], #0x2\n"
+ "ldr h17, [x26], #0x2\n"
+ "ldr h18, [x25], #0x2\n"
+ "ldr h19, [x24], #0x2\n"
+ "ldr h20, [x23], #0x2\n"
+ "ldr h21, [x22], #0x2\n"
+ "ldr h22, [x21], #0x2\n"
+ "19:" // Oddments: Planar loop: Load: Bit 2: End
+ "subs x20, x20, #0x1\n"
+ "bgt 15b\n"
+ "20:" // Oddments: Planar tail
+ "fmla v23.8h, v14.8h, v0.8h\n"
+ "fmla v24.8h, v15.8h, v0.8h\n"
+ "fmax v23.8h, v23.8h, v2.8h\n"
+ "ldp x28, x27, [%x[outptrs], #0x0]\n"
+ "fmla v25.8h, v16.8h, v0.8h\n"
+ "fmla v26.8h, v17.8h, v0.8h\n"
+ "fmax v24.8h, v24.8h, v2.8h\n"
+ "ldp x26, x25, [%x[outptrs], #0x10]\n"
+ "fmla v27.8h, v18.8h, v0.8h\n"
+ "fmla v28.8h, v19.8h, v0.8h\n"
+ "fmax v25.8h, v25.8h, v2.8h\n"
+ "ldp x24, x23, [%x[outptrs], #0x20]\n"
+ "fmla v29.8h, v20.8h, v0.8h\n"
+ "fmla v30.8h, v21.8h, v0.8h\n"
+ "fmax v26.8h, v26.8h, v2.8h\n"
+ "ldp x22, x21, [%x[outptrs], #0x30]\n"
+ "fmla v31.8h, v22.8h, v0.8h\n"
+ "fmax v27.8h, v27.8h, v2.8h\n"
+ "ldr x20, [%x[outptrs], #0x40]\n"
+ "add x28, x28, x11\n"
+ "fmax v28.8h, v28.8h, v2.8h\n"
+ "fmax v29.8h, v29.8h, v2.8h\n"
+ "add x27, x27, x11\n"
+ "add x26, x26, x11\n"
+ "fmax v30.8h, v30.8h, v2.8h\n"
+ "fmax v31.8h, v31.8h, v2.8h\n"
+ "add x25, x25, x11\n"
+ "add x24, x24, x11\n"
+ "fmin v23.8h, v23.8h, v1.8h\n"
+ "fmin v24.8h, v24.8h, v1.8h\n"
+ "add x23, x23, x11\n"
+ "add x22, x22, x11\n"
+ "fmin v25.8h, v25.8h, v1.8h\n"
+ "fmin v26.8h, v26.8h, v1.8h\n"
+ "add x21, x21, x11\n"
+ "add x20, x20, x11\n"
+ "fmin v27.8h, v27.8h, v1.8h\n"
+ "fmin v28.8h, v28.8h, v1.8h\n"
+ "fmin v29.8h, v29.8h, v1.8h\n"
+ "fmin v30.8h, v30.8h, v1.8h\n"
+ "fmin v31.8h, v31.8h, v1.8h\n"
+ "tbz %x[n_channels], #2, 22f\n"
+ "st1 { v23.d }[0], [x28], #0x8\n"
+ "st1 { v24.d }[0], [x27], #0x8\n"
+ "st1 { v25.d }[0], [x26], #0x8\n"
+ "st1 { v26.d }[0], [x25], #0x8\n"
+ "st1 { v27.d }[0], [x24], #0x8\n"
+ "st1 { v28.d }[0], [x23], #0x8\n"
+ "st1 { v29.d }[0], [x22], #0x8\n"
+ "st1 { v30.d }[0], [x21], #0x8\n"
+ "st1 { v31.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 21f\n"
+ "st1 { v23.s }[2], [x28], #0x4\n"
+ "st1 { v24.s }[2], [x27], #0x4\n"
+ "st1 { v25.s }[2], [x26], #0x4\n"
+ "st1 { v26.s }[2], [x25], #0x4\n"
+ "st1 { v27.s }[2], [x24], #0x4\n"
+ "st1 { v28.s }[2], [x23], #0x4\n"
+ "st1 { v29.s }[2], [x22], #0x4\n"
+ "st1 { v30.s }[2], [x21], #0x4\n"
+ "st1 { v31.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 24f\n"
+ "st1 { v23.h }[6], [x28], #0x2\n"
+ "st1 { v24.h }[6], [x27], #0x2\n"
+ "st1 { v25.h }[6], [x26], #0x2\n"
+ "st1 { v26.h }[6], [x25], #0x2\n"
+ "st1 { v27.h }[6], [x24], #0x2\n"
+ "st1 { v28.h }[6], [x23], #0x2\n"
+ "st1 { v29.h }[6], [x22], #0x2\n"
+ "st1 { v30.h }[6], [x21], #0x2\n"
+ "st1 { v31.h }[6], [x20], #0x2\n"
+ "b 24f\n"
+ "21:" // Oddments: Store: Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 24f\n"
+ "st1 { v23.h }[4], [x28], #0x2\n"
+ "st1 { v24.h }[4], [x27], #0x2\n"
+ "st1 { v25.h }[4], [x26], #0x2\n"
+ "st1 { v26.h }[4], [x25], #0x2\n"
+ "st1 { v27.h }[4], [x24], #0x2\n"
+ "st1 { v28.h }[4], [x23], #0x2\n"
+ "st1 { v29.h }[4], [x22], #0x2\n"
+ "st1 { v30.h }[4], [x21], #0x2\n"
+ "st1 { v31.h }[4], [x20], #0x2\n"
+ "b 24f\n"
+ "22:" // Oddments: Store: Bit 2: Unset
+ "tbz %x[n_channels], #1, 23f\n"
+ "st1 { v23.s }[0], [x28], #0x4\n"
+ "st1 { v24.s }[0], [x27], #0x4\n"
+ "st1 { v25.s }[0], [x26], #0x4\n"
+ "st1 { v26.s }[0], [x25], #0x4\n"
+ "st1 { v27.s }[0], [x24], #0x4\n"
+ "st1 { v28.s }[0], [x23], #0x4\n"
+ "st1 { v29.s }[0], [x22], #0x4\n"
+ "st1 { v30.s }[0], [x21], #0x4\n"
+ "st1 { v31.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 24f\n"
+ "st1 { v23.h }[2], [x28], #0x2\n"
+ "st1 { v24.h }[2], [x27], #0x2\n"
+ "st1 { v25.h }[2], [x26], #0x2\n"
+ "st1 { v26.h }[2], [x25], #0x2\n"
+ "st1 { v27.h }[2], [x24], #0x2\n"
+ "st1 { v28.h }[2], [x23], #0x2\n"
+ "st1 { v29.h }[2], [x22], #0x2\n"
+ "st1 { v30.h }[2], [x21], #0x2\n"
+ "st1 { v31.h }[2], [x20], #0x2\n"
+ "b 24f\n"
+ "23:" // Oddments: Store: Bit 2: Unset: Bit 1: Unset
+ "st1 { v23.h }[0], [x28], #0x2\n"
+ "st1 { v24.h }[0], [x27], #0x2\n"
+ "st1 { v25.h }[0], [x26], #0x2\n"
+ "st1 { v26.h }[0], [x25], #0x2\n"
+ "st1 { v27.h }[0], [x24], #0x2\n"
+ "st1 { v28.h }[0], [x23], #0x2\n"
+ "st1 { v29.h }[0], [x22], #0x2\n"
+ "st1 { v30.h }[0], [x21], #0x2\n"
+ "st1 { v31.h }[0], [x20], #0x2\n"
+ "24:" // Oddments: Store: Bit 2: End
+ "25:" // End
+ : [params] "+&r" (params)
+ : [bias] "r" (bias), [inptrs] "r" (inptrs), [minmax_vals] "r" (minmax_vals), [n_channels] "r" ((uint64_t) n_channels), [n_points] "r" ((uint64_t) n_points), [outptrs] "r" (outptrs)
+ : "cc", "memory", "v0", "v1", "v2", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp
new file mode 100644
index 0000000000..3646c18b04
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp
@@ -0,0 +1,52 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__aarch64__) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_fp16_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_impl(const __fp16 *const *const, __fp16 *const *const, const __fp16 *, const __fp16 *, const unsigned int, const unsigned int, const __fp16, const __fp16);
+
+struct a64_fp16_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst : GenericDepthfirstMultiplierKernelStrategy<__fp16, __fp16, __fp16, __fp16>  // Strategy glue: fp16 in/out, fp16 weights and accumulators.
+{
+ using Parent = GenericDepthfirstMultiplierKernelStrategy<__fp16, __fp16, __fp16, __fp16>;  // Shorthand for the base strategy type used below.
+ a64_fp16_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst(const CPUInfo *)
+ : Parent(2, 8, arm_gemm::VLType::None)  // 2x8 output tile (per the kernel name); VLType::None => fixed-width NEON, no SVE vector-length dependence.
+ {
+ }
+ Parent::KernelType kernel = a64_fp16_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_impl;  // Bound to the hand-written asm implementation in generic.cpp.
+ Parent::KernelType get_kernel(void) const override { return kernel; }  // Framework hook: expose the kernel function pointer.
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp
new file mode 100644
index 0000000000..cee3fb59c5
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp
@@ -0,0 +1,1044 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__aarch64__) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_fp16_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_impl(
+ const __fp16 *const *const inptrs,
+ __fp16 *const *const outptrs,
+ const __fp16 *weights,
+ const __fp16 *bias,
+ const unsigned int kernel_points,
+ const unsigned int n_output_channels,
+ const __fp16 activation_min,
+ const __fp16 activation_max
+)
+{
+ const __fp16 minmax_vals[2] = { activation_min, activation_max };
+
+ __asm__ __volatile__(
+ "ld1r { v8.8h }, [%x[minmax_vals]]\n"
+ "lsr x11, %x[n_output_channels], #0x3\n"
+ "add x20, %x[minmax_vals], #0x2\n"
+ "ld1r { v7.8h }, [x20]\n"
+ "mov x10, #0x0\n"
+ "cbz x11, 8f\n"
+ "1:" // Output channel loop
+ "movi v31.16b, #0x0\n"
+ "cbz %x[bias], 2f\n"
+ "lsl x20, x10, #0x1\n"
+ "ldr q31, [%x[bias], x20]\n"
+ "2:" // Output channel loop: Load bias: Done
+ "ldr q6, [%x[weights], #0x0]\n"
+ "mov x22, %x[inptrs]\n"
+ "ldp x21, x20, [x22], #0x10\n"
+ "lsr x23, %x[kernel_points], #0x1\n"
+ "ldr q1, [x21, #0x0]\n"
+ "ldr q0, [x20, #0x0]\n"
+ "mov v16.16b, v31.16b\n"
+ "mov v17.16b, v31.16b\n"
+ "mov v18.16b, v31.16b\n"
+ "mov v19.16b, v31.16b\n"
+ "add %x[weights], %x[weights], #0x10\n"
+ "mov v20.16b, v31.16b\n"
+ "mov v21.16b, v31.16b\n"
+ "mov v22.16b, v31.16b\n"
+ "mov v23.16b, v31.16b\n"
+ "mov v24.16b, v31.16b\n"
+ "mov v25.16b, v31.16b\n"
+ "mov v26.16b, v31.16b\n"
+ "mov v27.16b, v31.16b\n"
+ "mov v28.16b, v31.16b\n"
+ "mov v29.16b, v31.16b\n"
+ "mov v30.16b, v31.16b\n"
+ "mov v31.16b, v31.16b\n"
+ "cbz x23, 6f\n"
+ "ldr q5, [%x[weights], #0x0]\n"
+ "ldp x21, x20, [x22], #0x10\n"
+ "subs x23, x23, #0x1\n"
+ "add %x[weights], %x[weights], #0x10\n"
+ "ldr q4, [x21, #0x0]\n"
+ "ldr q3, [x20, #0x0]\n"
+ "beq 4f\n"
+ "3:" // Output channel loop: Kernel loop
+ "ldp x21, x20, [x22], #0x10\n"
+ "fmla v16.8h, v6.8h, v1.h[0]\n"
+ "fmla v17.8h, v6.8h, v1.h[1]\n"
+ "subs x23, x23, #0x1\n"
+ "fmla v18.8h, v6.8h, v1.h[2]\n"
+ "fmla v19.8h, v6.8h, v1.h[3]\n"
+ "fmla v20.8h, v6.8h, v1.h[4]\n"
+ "fmla v21.8h, v6.8h, v1.h[5]\n"
+ "fmla v22.8h, v6.8h, v1.h[6]\n"
+ "fmla v23.8h, v6.8h, v1.h[7]\n"
+ "ldr q1, [x21, #0x0]\n"
+ "fmla v24.8h, v6.8h, v0.h[0]\n"
+ "fmla v25.8h, v6.8h, v0.h[1]\n"
+ "fmla v26.8h, v6.8h, v0.h[2]\n"
+ "fmla v27.8h, v6.8h, v0.h[3]\n"
+ "fmla v28.8h, v6.8h, v0.h[4]\n"
+ "fmla v29.8h, v6.8h, v0.h[5]\n"
+ "fmla v30.8h, v6.8h, v0.h[6]\n"
+ "fmla v31.8h, v6.8h, v0.h[7]\n"
+ "ldr q0, [x20, #0x0]\n"
+ "ldr q6, [%x[weights], #0x0]\n"
+ "ldp x21, x20, [x22], #0x10\n"
+ "fmla v16.8h, v5.8h, v4.h[0]\n"
+ "fmla v17.8h, v5.8h, v4.h[1]\n"
+ "fmla v18.8h, v5.8h, v4.h[2]\n"
+ "fmla v19.8h, v5.8h, v4.h[3]\n"
+ "fmla v20.8h, v5.8h, v4.h[4]\n"
+ "fmla v21.8h, v5.8h, v4.h[5]\n"
+ "fmla v22.8h, v5.8h, v4.h[6]\n"
+ "fmla v23.8h, v5.8h, v4.h[7]\n"
+ "ldr q4, [x21, #0x0]\n"
+ "fmla v24.8h, v5.8h, v3.h[0]\n"
+ "fmla v25.8h, v5.8h, v3.h[1]\n"
+ "fmla v26.8h, v5.8h, v3.h[2]\n"
+ "fmla v27.8h, v5.8h, v3.h[3]\n"
+ "fmla v28.8h, v5.8h, v3.h[4]\n"
+ "fmla v29.8h, v5.8h, v3.h[5]\n"
+ "fmla v30.8h, v5.8h, v3.h[6]\n"
+ "fmla v31.8h, v5.8h, v3.h[7]\n"
+ "ldr q3, [x20, #0x0]\n"
+ "ldr q5, [%x[weights], #0x10]\n"
+ "add %x[weights], %x[weights], #0x20\n"
+ "bgt 3b\n"
+ "4:" // Output channel loop: Kernel loop tail
+ "tbnz %x[kernel_points], #0, 5f\n"
+ "fmla v16.8h, v6.8h, v1.h[0]\n"
+ "fmla v17.8h, v6.8h, v1.h[1]\n"
+ "lsl x28, x10, #0x1\n"
+ "ldr x27, [%x[outptrs], #0x0]\n"
+ "fmla v18.8h, v6.8h, v1.h[2]\n"
+ "fmla v19.8h, v6.8h, v1.h[3]\n"
+ "ldr x26, [%x[outptrs], #0x8]\n"
+ "ldr x25, [%x[outptrs], #0x10]\n"
+ "fmla v20.8h, v6.8h, v1.h[4]\n"
+ "fmla v21.8h, v6.8h, v1.h[5]\n"
+ "ldr x24, [%x[outptrs], #0x18]\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "fmla v22.8h, v6.8h, v1.h[6]\n"
+ "fmla v23.8h, v6.8h, v1.h[7]\n"
+ "ldr x22, [%x[outptrs], #0x28]\n"
+ "ldr x21, [%x[outptrs], #0x30]\n"
+ "fmla v24.8h, v6.8h, v0.h[0]\n"
+ "fmla v25.8h, v6.8h, v0.h[1]\n"
+ "ldr x20, [%x[outptrs], #0x38]\n"
+ "fmla v26.8h, v6.8h, v0.h[2]\n"
+ "fmla v27.8h, v6.8h, v0.h[3]\n"
+ "fmla v28.8h, v6.8h, v0.h[4]\n"
+ "fmla v29.8h, v6.8h, v0.h[5]\n"
+ "fmla v30.8h, v6.8h, v0.h[6]\n"
+ "fmla v31.8h, v6.8h, v0.h[7]\n"
+ "fmla v16.8h, v5.8h, v4.h[0]\n"
+ "fmla v17.8h, v5.8h, v4.h[1]\n"
+ "fmin v16.8h, v16.8h, v7.8h\n"
+ "fmla v18.8h, v5.8h, v4.h[2]\n"
+ "fmla v19.8h, v5.8h, v4.h[3]\n"
+ "fmin v17.8h, v17.8h, v7.8h\n"
+ "fmla v20.8h, v5.8h, v4.h[4]\n"
+ "fmla v21.8h, v5.8h, v4.h[5]\n"
+ "fmin v18.8h, v18.8h, v7.8h\n"
+ "fmla v22.8h, v5.8h, v4.h[6]\n"
+ "fmla v23.8h, v5.8h, v4.h[7]\n"
+ "fmin v19.8h, v19.8h, v7.8h\n"
+ "fmla v24.8h, v5.8h, v3.h[0]\n"
+ "fmla v25.8h, v5.8h, v3.h[1]\n"
+ "fmin v20.8h, v20.8h, v7.8h\n"
+ "fmla v26.8h, v5.8h, v3.h[2]\n"
+ "fmla v27.8h, v5.8h, v3.h[3]\n"
+ "fmin v21.8h, v21.8h, v7.8h\n"
+ "fmla v28.8h, v5.8h, v3.h[4]\n"
+ "fmla v29.8h, v5.8h, v3.h[5]\n"
+ "fmin v22.8h, v22.8h, v7.8h\n"
+ "fmla v30.8h, v5.8h, v3.h[6]\n"
+ "fmla v31.8h, v5.8h, v3.h[7]\n"
+ "fmin v23.8h, v23.8h, v7.8h\n"
+ "fmax v16.8h, v16.8h, v8.8h\n"
+ "fmax v17.8h, v17.8h, v8.8h\n"
+ "str q16, [x27, x28]\n"
+ "ldr x27, [%x[outptrs], #0x40]\n"
+ "fmax v18.8h, v18.8h, v8.8h\n"
+ "fmax v19.8h, v19.8h, v8.8h\n"
+ "str q17, [x26, x28]\n"
+ "ldr x26, [%x[outptrs], #0x48]\n"
+ "fmax v20.8h, v20.8h, v8.8h\n"
+ "fmax v21.8h, v21.8h, v8.8h\n"
+ "str q18, [x25, x28]\n"
+ "ldr x25, [%x[outptrs], #0x50]\n"
+ "fmax v22.8h, v22.8h, v8.8h\n"
+ "fmax v23.8h, v23.8h, v8.8h\n"
+ "str q19, [x24, x28]\n"
+ "ldr x24, [%x[outptrs], #0x58]\n"
+ "fmin v24.8h, v24.8h, v7.8h\n"
+ "fmin v25.8h, v25.8h, v7.8h\n"
+ "str q20, [x23, x28]\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
+ "fmin v26.8h, v26.8h, v7.8h\n"
+ "fmin v27.8h, v27.8h, v7.8h\n"
+ "str q21, [x22, x28]\n"
+ "ldr x22, [%x[outptrs], #0x68]\n"
+ "fmin v28.8h, v28.8h, v7.8h\n"
+ "fmin v29.8h, v29.8h, v7.8h\n"
+ "str q22, [x21, x28]\n"
+ "ldr x21, [%x[outptrs], #0x70]\n"
+ "fmin v30.8h, v30.8h, v7.8h\n"
+ "fmin v31.8h, v31.8h, v7.8h\n"
+ "str q23, [x20, x28]\n"
+ "ldr x20, [%x[outptrs], #0x78]\n"
+ "fmax v24.8h, v24.8h, v8.8h\n"
+ "fmax v25.8h, v25.8h, v8.8h\n"
+ "str q24, [x27, x28]\n"
+ "fmax v26.8h, v26.8h, v8.8h\n"
+ "fmax v27.8h, v27.8h, v8.8h\n"
+ "str q25, [x26, x28]\n"
+ "fmax v28.8h, v28.8h, v8.8h\n"
+ "fmax v29.8h, v29.8h, v8.8h\n"
+ "str q26, [x25, x28]\n"
+ "fmax v30.8h, v30.8h, v8.8h\n"
+ "fmax v31.8h, v31.8h, v8.8h\n"
+ "str q27, [x24, x28]\n"
+ "str q28, [x23, x28]\n"
+ "str q29, [x22, x28]\n"
+ "str q30, [x21, x28]\n"
+ "str q31, [x20, x28]\n"
+ "b 7f\n"
+ "5:" // Output channel loop: Odd tail
+ "fmla v16.8h, v6.8h, v1.h[0]\n"
+ "fmla v17.8h, v6.8h, v1.h[1]\n"
+ "ldp x20, x9, [x22], #0x10\n"
+ "lsl x28, x10, #0x1\n"
+ "fmla v18.8h, v6.8h, v1.h[2]\n"
+ "fmla v19.8h, v6.8h, v1.h[3]\n"
+ "ldr x27, [%x[outptrs], #0x0]\n"
+ "ldr x26, [%x[outptrs], #0x8]\n"
+ "fmla v20.8h, v6.8h, v1.h[4]\n"
+ "fmla v21.8h, v6.8h, v1.h[5]\n"
+ "ldr x25, [%x[outptrs], #0x10]\n"
+ "ldr x24, [%x[outptrs], #0x18]\n"
+ "fmla v22.8h, v6.8h, v1.h[6]\n"
+ "fmla v23.8h, v6.8h, v1.h[7]\n"
+ "ldr q2, [x20, #0x0]\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "fmla v24.8h, v6.8h, v0.h[0]\n"
+ "fmla v25.8h, v6.8h, v0.h[1]\n"
+ "ldr x22, [%x[outptrs], #0x28]\n"
+ "ldr x21, [%x[outptrs], #0x30]\n"
+ "fmla v26.8h, v6.8h, v0.h[2]\n"
+ "fmla v27.8h, v6.8h, v0.h[3]\n"
+ "ldr x20, [%x[outptrs], #0x38]\n"
+ "fmla v28.8h, v6.8h, v0.h[4]\n"
+ "fmla v29.8h, v6.8h, v0.h[5]\n"
+ "fmla v30.8h, v6.8h, v0.h[6]\n"
+ "fmla v31.8h, v6.8h, v0.h[7]\n"
+ "ldr q1, [%x[weights], #0x0]\n"
+ "ldr q0, [x9, #0x0]\n"
+ "fmla v16.8h, v5.8h, v4.h[0]\n"
+ "fmla v17.8h, v5.8h, v4.h[1]\n"
+ "add %x[weights], %x[weights], #0x10\n"
+ "fmla v18.8h, v5.8h, v4.h[2]\n"
+ "fmla v19.8h, v5.8h, v4.h[3]\n"
+ "fmla v20.8h, v5.8h, v4.h[4]\n"
+ "fmla v21.8h, v5.8h, v4.h[5]\n"
+ "fmla v22.8h, v5.8h, v4.h[6]\n"
+ "fmla v23.8h, v5.8h, v4.h[7]\n"
+ "fmla v24.8h, v5.8h, v3.h[0]\n"
+ "fmla v25.8h, v5.8h, v3.h[1]\n"
+ "fmla v26.8h, v5.8h, v3.h[2]\n"
+ "fmla v27.8h, v5.8h, v3.h[3]\n"
+ "fmla v28.8h, v5.8h, v3.h[4]\n"
+ "fmla v29.8h, v5.8h, v3.h[5]\n"
+ "fmla v30.8h, v5.8h, v3.h[6]\n"
+ "fmla v31.8h, v5.8h, v3.h[7]\n"
+ "fmla v16.8h, v1.8h, v2.h[0]\n"
+ "fmla v17.8h, v1.8h, v2.h[1]\n"
+ "fmin v16.8h, v16.8h, v7.8h\n"
+ "fmla v18.8h, v1.8h, v2.h[2]\n"
+ "fmla v19.8h, v1.8h, v2.h[3]\n"
+ "fmin v17.8h, v17.8h, v7.8h\n"
+ "fmla v20.8h, v1.8h, v2.h[4]\n"
+ "fmla v21.8h, v1.8h, v2.h[5]\n"
+ "fmin v18.8h, v18.8h, v7.8h\n"
+ "fmla v22.8h, v1.8h, v2.h[6]\n"
+ "fmla v23.8h, v1.8h, v2.h[7]\n"
+ "fmin v19.8h, v19.8h, v7.8h\n"
+ "fmla v24.8h, v1.8h, v0.h[0]\n"
+ "fmla v25.8h, v1.8h, v0.h[1]\n"
+ "fmin v20.8h, v20.8h, v7.8h\n"
+ "fmla v26.8h, v1.8h, v0.h[2]\n"
+ "fmla v27.8h, v1.8h, v0.h[3]\n"
+ "fmin v21.8h, v21.8h, v7.8h\n"
+ "fmla v28.8h, v1.8h, v0.h[4]\n"
+ "fmla v29.8h, v1.8h, v0.h[5]\n"
+ "fmin v22.8h, v22.8h, v7.8h\n"
+ "fmla v30.8h, v1.8h, v0.h[6]\n"
+ "fmla v31.8h, v1.8h, v0.h[7]\n"
+ "fmin v23.8h, v23.8h, v7.8h\n"
+ "fmax v16.8h, v16.8h, v8.8h\n"
+ "fmax v17.8h, v17.8h, v8.8h\n"
+ "str q16, [x27, x28]\n"
+ "ldr x27, [%x[outptrs], #0x40]\n"
+ "fmax v18.8h, v18.8h, v8.8h\n"
+ "fmax v19.8h, v19.8h, v8.8h\n"
+ "str q17, [x26, x28]\n"
+ "ldr x26, [%x[outptrs], #0x48]\n"
+ "fmax v20.8h, v20.8h, v8.8h\n"
+ "fmax v21.8h, v21.8h, v8.8h\n"
+ "str q18, [x25, x28]\n"
+ "ldr x25, [%x[outptrs], #0x50]\n"
+ "fmax v22.8h, v22.8h, v8.8h\n"
+ "fmax v23.8h, v23.8h, v8.8h\n"
+ "str q19, [x24, x28]\n"
+ "ldr x24, [%x[outptrs], #0x58]\n"
+ "fmin v24.8h, v24.8h, v7.8h\n"
+ "fmin v25.8h, v25.8h, v7.8h\n"
+ "str q20, [x23, x28]\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
+ "fmin v26.8h, v26.8h, v7.8h\n"
+ "fmin v27.8h, v27.8h, v7.8h\n"
+ "str q21, [x22, x28]\n"
+ "ldr x22, [%x[outptrs], #0x68]\n"
+ "fmin v28.8h, v28.8h, v7.8h\n"
+ "fmin v29.8h, v29.8h, v7.8h\n"
+ "str q22, [x21, x28]\n"
+ "ldr x21, [%x[outptrs], #0x70]\n"
+ "fmin v30.8h, v30.8h, v7.8h\n"
+ "fmin v31.8h, v31.8h, v7.8h\n"
+ "str q23, [x20, x28]\n"
+ "ldr x20, [%x[outptrs], #0x78]\n"
+ "fmax v24.8h, v24.8h, v8.8h\n"
+ "fmax v25.8h, v25.8h, v8.8h\n"
+ "str q24, [x27, x28]\n"
+ "fmax v26.8h, v26.8h, v8.8h\n"
+ "fmax v27.8h, v27.8h, v8.8h\n"
+ "str q25, [x26, x28]\n"
+ "fmax v28.8h, v28.8h, v8.8h\n"
+ "fmax v29.8h, v29.8h, v8.8h\n"
+ "str q26, [x25, x28]\n"
+ "fmax v30.8h, v30.8h, v8.8h\n"
+ "fmax v31.8h, v31.8h, v8.8h\n"
+ "str q27, [x24, x28]\n"
+ "str q28, [x23, x28]\n"
+ "str q29, [x22, x28]\n"
+ "str q30, [x21, x28]\n"
+ "str q31, [x20, x28]\n"
+ "b 7f\n"
+ "6:" // Output channel loop: Single kernel point
+ "fmla v16.8h, v6.8h, v1.h[0]\n"
+ "fmla v17.8h, v6.8h, v1.h[1]\n"
+ "fmin v16.8h, v16.8h, v7.8h\n"
+ "lsl x28, x10, #0x1\n"
+ "fmla v18.8h, v6.8h, v1.h[2]\n"
+ "fmla v19.8h, v6.8h, v1.h[3]\n"
+ "fmin v17.8h, v17.8h, v7.8h\n"
+ "ldr x27, [%x[outptrs], #0x0]\n"
+ "fmla v20.8h, v6.8h, v1.h[4]\n"
+ "fmla v21.8h, v6.8h, v1.h[5]\n"
+ "fmin v18.8h, v18.8h, v7.8h\n"
+ "ldr x26, [%x[outptrs], #0x8]\n"
+ "fmla v22.8h, v6.8h, v1.h[6]\n"
+ "fmla v23.8h, v6.8h, v1.h[7]\n"
+ "fmin v19.8h, v19.8h, v7.8h\n"
+ "ldr x25, [%x[outptrs], #0x10]\n"
+ "fmla v24.8h, v6.8h, v0.h[0]\n"
+ "fmla v25.8h, v6.8h, v0.h[1]\n"
+ "fmin v20.8h, v20.8h, v7.8h\n"
+ "ldr x24, [%x[outptrs], #0x18]\n"
+ "fmla v26.8h, v6.8h, v0.h[2]\n"
+ "fmla v27.8h, v6.8h, v0.h[3]\n"
+ "fmin v21.8h, v21.8h, v7.8h\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "fmla v28.8h, v6.8h, v0.h[4]\n"
+ "fmla v29.8h, v6.8h, v0.h[5]\n"
+ "fmin v22.8h, v22.8h, v7.8h\n"
+ "ldr x22, [%x[outptrs], #0x28]\n"
+ "fmla v30.8h, v6.8h, v0.h[6]\n"
+ "fmla v31.8h, v6.8h, v0.h[7]\n"
+ "fmin v23.8h, v23.8h, v7.8h\n"
+ "ldr x21, [%x[outptrs], #0x30]\n"
+ "ldr x20, [%x[outptrs], #0x38]\n"
+ "fmax v16.8h, v16.8h, v8.8h\n"
+ "fmax v17.8h, v17.8h, v8.8h\n"
+ "str q16, [x27, x28]\n"
+ "fmax v18.8h, v18.8h, v8.8h\n"
+ "fmax v19.8h, v19.8h, v8.8h\n"
+ "str q17, [x26, x28]\n"
+ "ldr x27, [%x[outptrs], #0x40]\n"
+ "fmax v20.8h, v20.8h, v8.8h\n"
+ "fmax v21.8h, v21.8h, v8.8h\n"
+ "str q18, [x25, x28]\n"
+ "ldr x26, [%x[outptrs], #0x48]\n"
+ "fmax v22.8h, v22.8h, v8.8h\n"
+ "fmax v23.8h, v23.8h, v8.8h\n"
+ "str q19, [x24, x28]\n"
+ "ldr x25, [%x[outptrs], #0x50]\n"
+ "fmin v24.8h, v24.8h, v7.8h\n"
+ "fmin v25.8h, v25.8h, v7.8h\n"
+ "str q20, [x23, x28]\n"
+ "ldr x24, [%x[outptrs], #0x58]\n"
+ "fmin v26.8h, v26.8h, v7.8h\n"
+ "fmin v27.8h, v27.8h, v7.8h\n"
+ "str q21, [x22, x28]\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
+ "fmin v28.8h, v28.8h, v7.8h\n"
+ "fmin v29.8h, v29.8h, v7.8h\n"
+ "str q22, [x21, x28]\n"
+ "ldr x22, [%x[outptrs], #0x68]\n"
+ "fmin v30.8h, v30.8h, v7.8h\n"
+ "fmin v31.8h, v31.8h, v7.8h\n"
+ "str q23, [x20, x28]\n"
+ "ldr x21, [%x[outptrs], #0x70]\n"
+ "ldr x20, [%x[outptrs], #0x78]\n"
+ "fmax v24.8h, v24.8h, v8.8h\n"
+ "fmax v25.8h, v25.8h, v8.8h\n"
+ "str q24, [x27, x28]\n"
+ "fmax v26.8h, v26.8h, v8.8h\n"
+ "fmax v27.8h, v27.8h, v8.8h\n"
+ "str q25, [x26, x28]\n"
+ "fmax v28.8h, v28.8h, v8.8h\n"
+ "fmax v29.8h, v29.8h, v8.8h\n"
+ "str q26, [x25, x28]\n"
+ "fmax v30.8h, v30.8h, v8.8h\n"
+ "fmax v31.8h, v31.8h, v8.8h\n"
+ "str q27, [x24, x28]\n"
+ "str q28, [x23, x28]\n"
+ "str q29, [x22, x28]\n"
+ "str q30, [x21, x28]\n"
+ "str q31, [x20, x28]\n"
+ "7:" // Output channel loop: Done
+ "add x10, x10, #0x8\n"
+ "cmp x10, x11, LSL #3\n"
+ "blt 1b\n"
+ "tst %x[n_output_channels], #0x7\n"
+ "beq 23f\n"
+ "8:" // Output channel oddments
+ "movi v31.16b, #0x0\n"
+ "cbz %x[bias], 13f\n"
+ "add x20, %x[bias], x10, LSL #1\n"
+ "tbz %x[n_output_channels], #2, 10f\n"
+ "ld1 { v31.d }[0], [x20], #0x8\n"
+ "tbz %x[n_output_channels], #1, 9f\n"
+ "ld1 { v31.s }[2], [x20], #0x4\n"
+ "tbz %x[n_output_channels], #0, 12f\n"
+ "ld1 { v31.h }[6], [x20]\n"
+ "b 12f\n"
+ "9:" // Output channel oddments: Load bias: Bit 2: Bit 1: Unset
+ "tbz %x[n_output_channels], #0, 12f\n"
+ "ld1 { v31.h }[4], [x20]\n"
+ "b 12f\n"
+ "10:" // Output channel oddments: Load bias: Bit 2: Unset
+ "tbz %x[n_output_channels], #1, 11f\n"
+ "ld1 { v31.s }[0], [x20], #0x4\n"
+ "tbz %x[n_output_channels], #0, 12f\n"
+ "ld1 { v31.h }[2], [x20]\n"
+ "b 12f\n"
+ "11:" // Output channel oddments: Load bias: Bit 2: Unset: Bit 1: Unset
+ "ld1 { v31.h }[0], [x20]\n"
+ "12:" // Output channel oddments: Load bias: Bit 2: End
+ "13:" // Output channel oddments: Load bias: Done
+ "ldr q6, [%x[weights], #0x0]\n"
+ "mov x22, %x[inptrs]\n"
+ "ldp x21, x20, [x22], #0x10\n"
+ "lsr x23, %x[kernel_points], #0x1\n"
+ "ldr q1, [x21, #0x0]\n"
+ "ldr q0, [x20, #0x0]\n"
+ "mov v16.16b, v31.16b\n"
+ "mov v17.16b, v31.16b\n"
+ "mov v18.16b, v31.16b\n"
+ "mov v19.16b, v31.16b\n"
+ "add %x[weights], %x[weights], #0x10\n"
+ "mov v20.16b, v31.16b\n"
+ "mov v21.16b, v31.16b\n"
+ "mov v22.16b, v31.16b\n"
+ "mov v23.16b, v31.16b\n"
+ "mov v24.16b, v31.16b\n"
+ "mov v25.16b, v31.16b\n"
+ "mov v26.16b, v31.16b\n"
+ "mov v27.16b, v31.16b\n"
+ "mov v28.16b, v31.16b\n"
+ "mov v29.16b, v31.16b\n"
+ "mov v30.16b, v31.16b\n"
+ "mov v31.16b, v31.16b\n"
+ "cbz x23, 17f\n"
+ "ldr q5, [%x[weights], #0x0]\n"
+ "ldp x21, x20, [x22], #0x10\n"
+ "subs x23, x23, #0x1\n"
+ "add %x[weights], %x[weights], #0x10\n"
+ "ldr q4, [x21, #0x0]\n"
+ "ldr q3, [x20, #0x0]\n"
+ "beq 15f\n"
+ "14:" // Output channel oddments: Kernel loop
+ "ldp x21, x20, [x22], #0x10\n"
+ "fmla v16.8h, v6.8h, v1.h[0]\n"
+ "fmla v17.8h, v6.8h, v1.h[1]\n"
+ "subs x23, x23, #0x1\n"
+ "fmla v18.8h, v6.8h, v1.h[2]\n"
+ "fmla v19.8h, v6.8h, v1.h[3]\n"
+ "fmla v20.8h, v6.8h, v1.h[4]\n"
+ "fmla v21.8h, v6.8h, v1.h[5]\n"
+ "fmla v22.8h, v6.8h, v1.h[6]\n"
+ "fmla v23.8h, v6.8h, v1.h[7]\n"
+ "ldr q1, [x21, #0x0]\n"
+ "fmla v24.8h, v6.8h, v0.h[0]\n"
+ "fmla v25.8h, v6.8h, v0.h[1]\n"
+ "fmla v26.8h, v6.8h, v0.h[2]\n"
+ "fmla v27.8h, v6.8h, v0.h[3]\n"
+ "fmla v28.8h, v6.8h, v0.h[4]\n"
+ "fmla v29.8h, v6.8h, v0.h[5]\n"
+ "fmla v30.8h, v6.8h, v0.h[6]\n"
+ "fmla v31.8h, v6.8h, v0.h[7]\n"
+ "ldr q0, [x20, #0x0]\n"
+ "ldr q6, [%x[weights], #0x0]\n"
+ "ldp x21, x20, [x22], #0x10\n"
+ "fmla v16.8h, v5.8h, v4.h[0]\n"
+ "fmla v17.8h, v5.8h, v4.h[1]\n"
+ "fmla v18.8h, v5.8h, v4.h[2]\n"
+ "fmla v19.8h, v5.8h, v4.h[3]\n"
+ "fmla v20.8h, v5.8h, v4.h[4]\n"
+ "fmla v21.8h, v5.8h, v4.h[5]\n"
+ "fmla v22.8h, v5.8h, v4.h[6]\n"
+ "fmla v23.8h, v5.8h, v4.h[7]\n"
+ "ldr q4, [x21, #0x0]\n"
+ "fmla v24.8h, v5.8h, v3.h[0]\n"
+ "fmla v25.8h, v5.8h, v3.h[1]\n"
+ "fmla v26.8h, v5.8h, v3.h[2]\n"
+ "fmla v27.8h, v5.8h, v3.h[3]\n"
+ "fmla v28.8h, v5.8h, v3.h[4]\n"
+ "fmla v29.8h, v5.8h, v3.h[5]\n"
+ "fmla v30.8h, v5.8h, v3.h[6]\n"
+ "fmla v31.8h, v5.8h, v3.h[7]\n"
+ "ldr q3, [x20, #0x0]\n"
+ "ldr q5, [%x[weights], #0x10]\n"
+ "add %x[weights], %x[weights], #0x20\n"
+ "bgt 14b\n"
+ "15:" // Output channel oddments: Kernel loop tail
+ "tbnz %x[kernel_points], #0, 16f\n"
+ "fmla v16.8h, v6.8h, v1.h[0]\n"
+ "fmla v17.8h, v6.8h, v1.h[1]\n"
+ "fmla v18.8h, v6.8h, v1.h[2]\n"
+ "fmla v19.8h, v6.8h, v1.h[3]\n"
+ "fmla v20.8h, v6.8h, v1.h[4]\n"
+ "fmla v21.8h, v6.8h, v1.h[5]\n"
+ "fmla v22.8h, v6.8h, v1.h[6]\n"
+ "fmla v23.8h, v6.8h, v1.h[7]\n"
+ "fmla v24.8h, v6.8h, v0.h[0]\n"
+ "fmla v25.8h, v6.8h, v0.h[1]\n"
+ "fmla v26.8h, v6.8h, v0.h[2]\n"
+ "fmla v27.8h, v6.8h, v0.h[3]\n"
+ "fmla v28.8h, v6.8h, v0.h[4]\n"
+ "fmla v29.8h, v6.8h, v0.h[5]\n"
+ "fmla v30.8h, v6.8h, v0.h[6]\n"
+ "fmla v31.8h, v6.8h, v0.h[7]\n"
+ "fmla v16.8h, v5.8h, v4.h[0]\n"
+ "fmla v17.8h, v5.8h, v4.h[1]\n"
+ "fmla v18.8h, v5.8h, v4.h[2]\n"
+ "fmla v19.8h, v5.8h, v4.h[3]\n"
+ "fmla v20.8h, v5.8h, v4.h[4]\n"
+ "fmla v21.8h, v5.8h, v4.h[5]\n"
+ "fmla v22.8h, v5.8h, v4.h[6]\n"
+ "fmla v23.8h, v5.8h, v4.h[7]\n"
+ "fmla v24.8h, v5.8h, v3.h[0]\n"
+ "fmla v25.8h, v5.8h, v3.h[1]\n"
+ "fmla v26.8h, v5.8h, v3.h[2]\n"
+ "fmla v27.8h, v5.8h, v3.h[3]\n"
+ "fmla v28.8h, v5.8h, v3.h[4]\n"
+ "fmla v29.8h, v5.8h, v3.h[5]\n"
+ "fmla v30.8h, v5.8h, v3.h[6]\n"
+ "fmla v31.8h, v5.8h, v3.h[7]\n"
+ "b 18f\n"
+ "16:" // Output channel oddments: Odd tail
+ "fmla v16.8h, v6.8h, v1.h[0]\n"
+ "fmla v17.8h, v6.8h, v1.h[1]\n"
+ "ldp x21, x20, [x22], #0x10\n"
+ "fmla v18.8h, v6.8h, v1.h[2]\n"
+ "fmla v19.8h, v6.8h, v1.h[3]\n"
+ "fmla v20.8h, v6.8h, v1.h[4]\n"
+ "fmla v21.8h, v6.8h, v1.h[5]\n"
+ "fmla v22.8h, v6.8h, v1.h[6]\n"
+ "fmla v23.8h, v6.8h, v1.h[7]\n"
+ "ldr q2, [x21, #0x0]\n"
+ "fmla v24.8h, v6.8h, v0.h[0]\n"
+ "fmla v25.8h, v6.8h, v0.h[1]\n"
+ "fmla v26.8h, v6.8h, v0.h[2]\n"
+ "fmla v27.8h, v6.8h, v0.h[3]\n"
+ "fmla v28.8h, v6.8h, v0.h[4]\n"
+ "fmla v29.8h, v6.8h, v0.h[5]\n"
+ "fmla v30.8h, v6.8h, v0.h[6]\n"
+ "fmla v31.8h, v6.8h, v0.h[7]\n"
+ "ldr q1, [x20, #0x0]\n"
+ "ldr q0, [%x[weights], #0x0]\n"
+ "fmla v16.8h, v5.8h, v4.h[0]\n"
+ "fmla v17.8h, v5.8h, v4.h[1]\n"
+ "add %x[weights], %x[weights], #0x10\n"
+ "fmla v18.8h, v5.8h, v4.h[2]\n"
+ "fmla v19.8h, v5.8h, v4.h[3]\n"
+ "fmla v20.8h, v5.8h, v4.h[4]\n"
+ "fmla v21.8h, v5.8h, v4.h[5]\n"
+ "fmla v22.8h, v5.8h, v4.h[6]\n"
+ "fmla v23.8h, v5.8h, v4.h[7]\n"
+ "fmla v24.8h, v5.8h, v3.h[0]\n"
+ "fmla v25.8h, v5.8h, v3.h[1]\n"
+ "fmla v26.8h, v5.8h, v3.h[2]\n"
+ "fmla v27.8h, v5.8h, v3.h[3]\n"
+ "fmla v28.8h, v5.8h, v3.h[4]\n"
+ "fmla v29.8h, v5.8h, v3.h[5]\n"
+ "fmla v30.8h, v5.8h, v3.h[6]\n"
+ "fmla v31.8h, v5.8h, v3.h[7]\n"
+ "fmla v16.8h, v0.8h, v2.h[0]\n"
+ "fmla v17.8h, v0.8h, v2.h[1]\n"
+ "fmla v18.8h, v0.8h, v2.h[2]\n"
+ "fmla v19.8h, v0.8h, v2.h[3]\n"
+ "fmla v20.8h, v0.8h, v2.h[4]\n"
+ "fmla v21.8h, v0.8h, v2.h[5]\n"
+ "fmla v22.8h, v0.8h, v2.h[6]\n"
+ "fmla v23.8h, v0.8h, v2.h[7]\n"
+ "fmla v24.8h, v0.8h, v1.h[0]\n"
+ "fmla v25.8h, v0.8h, v1.h[1]\n"
+ "fmla v26.8h, v0.8h, v1.h[2]\n"
+ "fmla v27.8h, v0.8h, v1.h[3]\n"
+ "fmla v28.8h, v0.8h, v1.h[4]\n"
+ "fmla v29.8h, v0.8h, v1.h[5]\n"
+ "fmla v30.8h, v0.8h, v1.h[6]\n"
+ "fmla v31.8h, v0.8h, v1.h[7]\n"
+ "b 18f\n"
+ "17:" // Output channel oddments: Single kernel point
+ "fmla v16.8h, v6.8h, v1.h[0]\n"
+ "fmla v17.8h, v6.8h, v1.h[1]\n"
+ "fmla v18.8h, v6.8h, v1.h[2]\n"
+ "fmla v19.8h, v6.8h, v1.h[3]\n"
+ "fmla v20.8h, v6.8h, v1.h[4]\n"
+ "fmla v21.8h, v6.8h, v1.h[5]\n"
+ "fmla v22.8h, v6.8h, v1.h[6]\n"
+ "fmla v23.8h, v6.8h, v1.h[7]\n"
+ "fmla v24.8h, v6.8h, v0.h[0]\n"
+ "fmla v25.8h, v6.8h, v0.h[1]\n"
+ "fmla v26.8h, v6.8h, v0.h[2]\n"
+ "fmla v27.8h, v6.8h, v0.h[3]\n"
+ "fmla v28.8h, v6.8h, v0.h[4]\n"
+ "fmla v29.8h, v6.8h, v0.h[5]\n"
+ "fmla v30.8h, v6.8h, v0.h[6]\n"
+ "fmla v31.8h, v6.8h, v0.h[7]\n"
+ "18:" // Output channel oddments: Done
+ "fmin v16.8h, v16.8h, v7.8h\n"
+ "fmin v17.8h, v17.8h, v7.8h\n"
+ "fmin v18.8h, v18.8h, v7.8h\n"
+ "fmin v19.8h, v19.8h, v7.8h\n"
+ "fmin v20.8h, v20.8h, v7.8h\n"
+ "fmin v21.8h, v21.8h, v7.8h\n"
+ "fmin v22.8h, v22.8h, v7.8h\n"
+ "fmin v23.8h, v23.8h, v7.8h\n"
+ "fmin v24.8h, v24.8h, v7.8h\n"
+ "fmin v25.8h, v25.8h, v7.8h\n"
+ "fmin v26.8h, v26.8h, v7.8h\n"
+ "fmin v27.8h, v27.8h, v7.8h\n"
+ "fmin v28.8h, v28.8h, v7.8h\n"
+ "fmin v29.8h, v29.8h, v7.8h\n"
+ "fmin v30.8h, v30.8h, v7.8h\n"
+ "fmin v31.8h, v31.8h, v7.8h\n"
+ "fmax v16.8h, v16.8h, v8.8h\n"
+ "fmax v17.8h, v17.8h, v8.8h\n"
+ "fmax v18.8h, v18.8h, v8.8h\n"
+ "fmax v19.8h, v19.8h, v8.8h\n"
+ "fmax v20.8h, v20.8h, v8.8h\n"
+ "fmax v21.8h, v21.8h, v8.8h\n"
+ "fmax v22.8h, v22.8h, v8.8h\n"
+ "fmax v23.8h, v23.8h, v8.8h\n"
+ "fmax v24.8h, v24.8h, v8.8h\n"
+ "fmax v25.8h, v25.8h, v8.8h\n"
+ "fmax v26.8h, v26.8h, v8.8h\n"
+ "fmax v27.8h, v27.8h, v8.8h\n"
+ "fmax v28.8h, v28.8h, v8.8h\n"
+ "fmax v29.8h, v29.8h, v8.8h\n"
+ "fmax v30.8h, v30.8h, v8.8h\n"
+ "fmax v31.8h, v31.8h, v8.8h\n"
+ "tbz %x[n_output_channels], #2, 20f\n"
+ "ldr x27, [%x[outptrs], #0x0]\n"
+ "ldr x26, [%x[outptrs], #0x8]\n"
+ "add x27, x27, x10, LSL #1\n"
+ "add x26, x26, x10, LSL #1\n"
+ "ldr x25, [%x[outptrs], #0x10]\n"
+ "ldr x24, [%x[outptrs], #0x18]\n"
+ "add x25, x25, x10, LSL #1\n"
+ "add x24, x24, x10, LSL #1\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "ldr x22, [%x[outptrs], #0x28]\n"
+ "add x23, x23, x10, LSL #1\n"
+ "add x22, x22, x10, LSL #1\n"
+ "ldr x21, [%x[outptrs], #0x30]\n"
+ "ldr x20, [%x[outptrs], #0x38]\n"
+ "add x21, x21, x10, LSL #1\n"
+ "add x20, x20, x10, LSL #1\n"
+ "st1 { v16.d }[0], [x27]\n"
+ "ldr x27, [%x[outptrs], #0x40]\n"
+ "add x27, x27, x10, LSL #1\n"
+ "st1 { v17.d }[0], [x26]\n"
+ "ldr x26, [%x[outptrs], #0x48]\n"
+ "add x26, x26, x10, LSL #1\n"
+ "st1 { v18.d }[0], [x25]\n"
+ "ldr x25, [%x[outptrs], #0x50]\n"
+ "add x25, x25, x10, LSL #1\n"
+ "st1 { v19.d }[0], [x24]\n"
+ "ldr x24, [%x[outptrs], #0x58]\n"
+ "add x24, x24, x10, LSL #1\n"
+ "st1 { v20.d }[0], [x23]\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
+ "add x23, x23, x10, LSL #1\n"
+ "st1 { v21.d }[0], [x22]\n"
+ "ldr x22, [%x[outptrs], #0x68]\n"
+ "add x22, x22, x10, LSL #1\n"
+ "st1 { v22.d }[0], [x21]\n"
+ "ldr x21, [%x[outptrs], #0x70]\n"
+ "add x21, x21, x10, LSL #1\n"
+ "st1 { v23.d }[0], [x20]\n"
+ "ldr x20, [%x[outptrs], #0x78]\n"
+ "add x20, x20, x10, LSL #1\n"
+ "add x10, x10, #0x4\n"
+ "st1 { v24.d }[0], [x27]\n"
+ "st1 { v25.d }[0], [x26]\n"
+ "st1 { v26.d }[0], [x25]\n"
+ "st1 { v27.d }[0], [x24]\n"
+ "st1 { v28.d }[0], [x23]\n"
+ "st1 { v29.d }[0], [x22]\n"
+ "st1 { v30.d }[0], [x21]\n"
+ "st1 { v31.d }[0], [x20]\n"
+ "tbz %x[n_output_channels], #1, 19f\n"
+ "ldr x27, [%x[outptrs], #0x0]\n"
+ "ldr x26, [%x[outptrs], #0x8]\n"
+ "add x27, x27, x10, LSL #1\n"
+ "add x26, x26, x10, LSL #1\n"
+ "ldr x25, [%x[outptrs], #0x10]\n"
+ "ldr x24, [%x[outptrs], #0x18]\n"
+ "add x25, x25, x10, LSL #1\n"
+ "add x24, x24, x10, LSL #1\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "ldr x22, [%x[outptrs], #0x28]\n"
+ "add x23, x23, x10, LSL #1\n"
+ "add x22, x22, x10, LSL #1\n"
+ "ldr x21, [%x[outptrs], #0x30]\n"
+ "ldr x20, [%x[outptrs], #0x38]\n"
+ "add x21, x21, x10, LSL #1\n"
+ "add x20, x20, x10, LSL #1\n"
+ "st1 { v16.s }[2], [x27]\n"
+ "ldr x27, [%x[outptrs], #0x40]\n"
+ "add x27, x27, x10, LSL #1\n"
+ "st1 { v17.s }[2], [x26]\n"
+ "ldr x26, [%x[outptrs], #0x48]\n"
+ "add x26, x26, x10, LSL #1\n"
+ "st1 { v18.s }[2], [x25]\n"
+ "ldr x25, [%x[outptrs], #0x50]\n"
+ "add x25, x25, x10, LSL #1\n"
+ "st1 { v19.s }[2], [x24]\n"
+ "ldr x24, [%x[outptrs], #0x58]\n"
+ "add x24, x24, x10, LSL #1\n"
+ "st1 { v20.s }[2], [x23]\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
+ "add x23, x23, x10, LSL #1\n"
+ "st1 { v21.s }[2], [x22]\n"
+ "ldr x22, [%x[outptrs], #0x68]\n"
+ "add x22, x22, x10, LSL #1\n"
+ "st1 { v22.s }[2], [x21]\n"
+ "ldr x21, [%x[outptrs], #0x70]\n"
+ "add x21, x21, x10, LSL #1\n"
+ "st1 { v23.s }[2], [x20]\n"
+ "ldr x20, [%x[outptrs], #0x78]\n"
+ "add x20, x20, x10, LSL #1\n"
+ "add x10, x10, #0x2\n"
+ "st1 { v24.s }[2], [x27]\n"
+ "st1 { v25.s }[2], [x26]\n"
+ "st1 { v26.s }[2], [x25]\n"
+ "st1 { v27.s }[2], [x24]\n"
+ "st1 { v28.s }[2], [x23]\n"
+ "st1 { v29.s }[2], [x22]\n"
+ "st1 { v30.s }[2], [x21]\n"
+ "st1 { v31.s }[2], [x20]\n"
+ "tbz %x[n_output_channels], #0, 22f\n"
+ "ldr x27, [%x[outptrs], #0x0]\n"
+ "ldr x26, [%x[outptrs], #0x8]\n"
+ "add x27, x27, x10, LSL #1\n"
+ "add x26, x26, x10, LSL #1\n"
+ "ldr x25, [%x[outptrs], #0x10]\n"
+ "ldr x24, [%x[outptrs], #0x18]\n"
+ "add x25, x25, x10, LSL #1\n"
+ "add x24, x24, x10, LSL #1\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "ldr x22, [%x[outptrs], #0x28]\n"
+ "add x23, x23, x10, LSL #1\n"
+ "add x22, x22, x10, LSL #1\n"
+ "ldr x21, [%x[outptrs], #0x30]\n"
+ "ldr x20, [%x[outptrs], #0x38]\n"
+ "add x21, x21, x10, LSL #1\n"
+ "add x20, x20, x10, LSL #1\n"
+ "st1 { v16.h }[6], [x27]\n"
+ "ldr x27, [%x[outptrs], #0x40]\n"
+ "add x27, x27, x10, LSL #1\n"
+ "st1 { v17.h }[6], [x26]\n"
+ "ldr x26, [%x[outptrs], #0x48]\n"
+ "add x26, x26, x10, LSL #1\n"
+ "st1 { v18.h }[6], [x25]\n"
+ "ldr x25, [%x[outptrs], #0x50]\n"
+ "add x25, x25, x10, LSL #1\n"
+ "st1 { v19.h }[6], [x24]\n"
+ "ldr x24, [%x[outptrs], #0x58]\n"
+ "add x24, x24, x10, LSL #1\n"
+ "st1 { v20.h }[6], [x23]\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
+ "add x23, x23, x10, LSL #1\n"
+ "st1 { v21.h }[6], [x22]\n"
+ "ldr x22, [%x[outptrs], #0x68]\n"
+ "add x22, x22, x10, LSL #1\n"
+ "st1 { v22.h }[6], [x21]\n"
+ "ldr x21, [%x[outptrs], #0x70]\n"
+ "add x21, x21, x10, LSL #1\n"
+ "st1 { v23.h }[6], [x20]\n"
+ "ldr x20, [%x[outptrs], #0x78]\n"
+ "add x20, x20, x10, LSL #1\n"
+ "st1 { v24.h }[6], [x27]\n"
+ "st1 { v25.h }[6], [x26]\n"
+ "st1 { v26.h }[6], [x25]\n"
+ "st1 { v27.h }[6], [x24]\n"
+ "st1 { v28.h }[6], [x23]\n"
+ "st1 { v29.h }[6], [x22]\n"
+ "st1 { v30.h }[6], [x21]\n"
+ "st1 { v31.h }[6], [x20]\n"
+ "b 22f\n"
+ "19:" // Output channel oddments: Done: Store: Bit 2: Bit 1: Unset
+ "tbz %x[n_output_channels], #0, 22f\n"
+ "ldr x27, [%x[outptrs], #0x0]\n"
+ "ldr x26, [%x[outptrs], #0x8]\n"
+ "add x27, x27, x10, LSL #1\n"
+ "add x26, x26, x10, LSL #1\n"
+ "ldr x25, [%x[outptrs], #0x10]\n"
+ "ldr x24, [%x[outptrs], #0x18]\n"
+ "add x25, x25, x10, LSL #1\n"
+ "add x24, x24, x10, LSL #1\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "ldr x22, [%x[outptrs], #0x28]\n"
+ "add x23, x23, x10, LSL #1\n"
+ "add x22, x22, x10, LSL #1\n"
+ "ldr x21, [%x[outptrs], #0x30]\n"
+ "ldr x20, [%x[outptrs], #0x38]\n"
+ "add x21, x21, x10, LSL #1\n"
+ "add x20, x20, x10, LSL #1\n"
+ "st1 { v16.h }[4], [x27]\n"
+ "ldr x27, [%x[outptrs], #0x40]\n"
+ "add x27, x27, x10, LSL #1\n"
+ "st1 { v17.h }[4], [x26]\n"
+ "ldr x26, [%x[outptrs], #0x48]\n"
+ "add x26, x26, x10, LSL #1\n"
+ "st1 { v18.h }[4], [x25]\n"
+ "ldr x25, [%x[outptrs], #0x50]\n"
+ "add x25, x25, x10, LSL #1\n"
+ "st1 { v19.h }[4], [x24]\n"
+ "ldr x24, [%x[outptrs], #0x58]\n"
+ "add x24, x24, x10, LSL #1\n"
+ "st1 { v20.h }[4], [x23]\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
+ "add x23, x23, x10, LSL #1\n"
+ "st1 { v21.h }[4], [x22]\n"
+ "ldr x22, [%x[outptrs], #0x68]\n"
+ "add x22, x22, x10, LSL #1\n"
+ "st1 { v22.h }[4], [x21]\n"
+ "ldr x21, [%x[outptrs], #0x70]\n"
+ "add x21, x21, x10, LSL #1\n"
+ "st1 { v23.h }[4], [x20]\n"
+ "ldr x20, [%x[outptrs], #0x78]\n"
+ "add x20, x20, x10, LSL #1\n"
+ "st1 { v24.h }[4], [x27]\n"
+ "st1 { v25.h }[4], [x26]\n"
+ "st1 { v26.h }[4], [x25]\n"
+ "st1 { v27.h }[4], [x24]\n"
+ "st1 { v28.h }[4], [x23]\n"
+ "st1 { v29.h }[4], [x22]\n"
+ "st1 { v30.h }[4], [x21]\n"
+ "st1 { v31.h }[4], [x20]\n"
+ "b 22f\n"
+ "20:" // Output channel oddments: Done: Store: Bit 2: Unset
+ "tbz %x[n_output_channels], #1, 21f\n"
+ "ldr x27, [%x[outptrs], #0x0]\n"
+ "ldr x26, [%x[outptrs], #0x8]\n"
+ "add x27, x27, x10, LSL #1\n"
+ "add x26, x26, x10, LSL #1\n"
+ "ldr x25, [%x[outptrs], #0x10]\n"
+ "ldr x24, [%x[outptrs], #0x18]\n"
+ "add x25, x25, x10, LSL #1\n"
+ "add x24, x24, x10, LSL #1\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "ldr x22, [%x[outptrs], #0x28]\n"
+ "add x23, x23, x10, LSL #1\n"
+ "add x22, x22, x10, LSL #1\n"
+ "ldr x21, [%x[outptrs], #0x30]\n"
+ "ldr x20, [%x[outptrs], #0x38]\n"
+ "add x21, x21, x10, LSL #1\n"
+ "add x20, x20, x10, LSL #1\n"
+ "st1 { v16.s }[0], [x27]\n"
+ "ldr x27, [%x[outptrs], #0x40]\n"
+ "add x27, x27, x10, LSL #1\n"
+ "st1 { v17.s }[0], [x26]\n"
+ "ldr x26, [%x[outptrs], #0x48]\n"
+ "add x26, x26, x10, LSL #1\n"
+ "st1 { v18.s }[0], [x25]\n"
+ "ldr x25, [%x[outptrs], #0x50]\n"
+ "add x25, x25, x10, LSL #1\n"
+ "st1 { v19.s }[0], [x24]\n"
+ "ldr x24, [%x[outptrs], #0x58]\n"
+ "add x24, x24, x10, LSL #1\n"
+ "st1 { v20.s }[0], [x23]\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
+ "add x23, x23, x10, LSL #1\n"
+ "st1 { v21.s }[0], [x22]\n"
+ "ldr x22, [%x[outptrs], #0x68]\n"
+ "add x22, x22, x10, LSL #1\n"
+ "st1 { v22.s }[0], [x21]\n"
+ "ldr x21, [%x[outptrs], #0x70]\n"
+ "add x21, x21, x10, LSL #1\n"
+ "st1 { v23.s }[0], [x20]\n"
+ "ldr x20, [%x[outptrs], #0x78]\n"
+ "add x20, x20, x10, LSL #1\n"
+ "add x10, x10, #0x2\n"
+ "st1 { v24.s }[0], [x27]\n"
+ "st1 { v25.s }[0], [x26]\n"
+ "st1 { v26.s }[0], [x25]\n"
+ "st1 { v27.s }[0], [x24]\n"
+ "st1 { v28.s }[0], [x23]\n"
+ "st1 { v29.s }[0], [x22]\n"
+ "st1 { v30.s }[0], [x21]\n"
+ "st1 { v31.s }[0], [x20]\n"
+ "tbz %x[n_output_channels], #0, 22f\n"
+ "ldr x27, [%x[outptrs], #0x0]\n"
+ "ldr x26, [%x[outptrs], #0x8]\n"
+ "add x27, x27, x10, LSL #1\n"
+ "add x26, x26, x10, LSL #1\n"
+ "ldr x25, [%x[outptrs], #0x10]\n"
+ "ldr x24, [%x[outptrs], #0x18]\n"
+ "add x25, x25, x10, LSL #1\n"
+ "add x24, x24, x10, LSL #1\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "ldr x22, [%x[outptrs], #0x28]\n"
+ "add x23, x23, x10, LSL #1\n"
+ "add x22, x22, x10, LSL #1\n"
+ "ldr x21, [%x[outptrs], #0x30]\n"
+ "ldr x20, [%x[outptrs], #0x38]\n"
+ "add x21, x21, x10, LSL #1\n"
+ "add x20, x20, x10, LSL #1\n"
+ "st1 { v16.h }[2], [x27]\n"
+ "ldr x27, [%x[outptrs], #0x40]\n"
+ "add x27, x27, x10, LSL #1\n"
+ "st1 { v17.h }[2], [x26]\n"
+ "ldr x26, [%x[outptrs], #0x48]\n"
+ "add x26, x26, x10, LSL #1\n"
+ "st1 { v18.h }[2], [x25]\n"
+ "ldr x25, [%x[outptrs], #0x50]\n"
+ "add x25, x25, x10, LSL #1\n"
+ "st1 { v19.h }[2], [x24]\n"
+ "ldr x24, [%x[outptrs], #0x58]\n"
+ "add x24, x24, x10, LSL #1\n"
+ "st1 { v20.h }[2], [x23]\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
+ "add x23, x23, x10, LSL #1\n"
+ "st1 { v21.h }[2], [x22]\n"
+ "ldr x22, [%x[outptrs], #0x68]\n"
+ "add x22, x22, x10, LSL #1\n"
+ "st1 { v22.h }[2], [x21]\n"
+ "ldr x21, [%x[outptrs], #0x70]\n"
+ "add x21, x21, x10, LSL #1\n"
+ "st1 { v23.h }[2], [x20]\n"
+ "ldr x20, [%x[outptrs], #0x78]\n"
+ "add x20, x20, x10, LSL #1\n"
+ "st1 { v24.h }[2], [x27]\n"
+ "st1 { v25.h }[2], [x26]\n"
+ "st1 { v26.h }[2], [x25]\n"
+ "st1 { v27.h }[2], [x24]\n"
+ "st1 { v28.h }[2], [x23]\n"
+ "st1 { v29.h }[2], [x22]\n"
+ "st1 { v30.h }[2], [x21]\n"
+ "st1 { v31.h }[2], [x20]\n"
+ "b 22f\n"
+ "21:" // Output channel oddments: Done: Store: Bit 2: Unset: Bit 1: Unset
+ "ldr x27, [%x[outptrs], #0x0]\n"
+ "ldr x26, [%x[outptrs], #0x8]\n"
+ "add x27, x27, x10, LSL #1\n"
+ "add x26, x26, x10, LSL #1\n"
+ "ldr x25, [%x[outptrs], #0x10]\n"
+ "ldr x24, [%x[outptrs], #0x18]\n"
+ "add x25, x25, x10, LSL #1\n"
+ "add x24, x24, x10, LSL #1\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "ldr x22, [%x[outptrs], #0x28]\n"
+ "add x23, x23, x10, LSL #1\n"
+ "add x22, x22, x10, LSL #1\n"
+ "ldr x21, [%x[outptrs], #0x30]\n"
+ "ldr x20, [%x[outptrs], #0x38]\n"
+ "add x21, x21, x10, LSL #1\n"
+ "add x20, x20, x10, LSL #1\n"
+ "st1 { v16.h }[0], [x27]\n"
+ "ldr x27, [%x[outptrs], #0x40]\n"
+ "add x27, x27, x10, LSL #1\n"
+ "st1 { v17.h }[0], [x26]\n"
+ "ldr x26, [%x[outptrs], #0x48]\n"
+ "add x26, x26, x10, LSL #1\n"
+ "st1 { v18.h }[0], [x25]\n"
+ "ldr x25, [%x[outptrs], #0x50]\n"
+ "add x25, x25, x10, LSL #1\n"
+ "st1 { v19.h }[0], [x24]\n"
+ "ldr x24, [%x[outptrs], #0x58]\n"
+ "add x24, x24, x10, LSL #1\n"
+ "st1 { v20.h }[0], [x23]\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
+ "add x23, x23, x10, LSL #1\n"
+ "st1 { v21.h }[0], [x22]\n"
+ "ldr x22, [%x[outptrs], #0x68]\n"
+ "add x22, x22, x10, LSL #1\n"
+ "st1 { v22.h }[0], [x21]\n"
+ "ldr x21, [%x[outptrs], #0x70]\n"
+ "add x21, x21, x10, LSL #1\n"
+ "st1 { v23.h }[0], [x20]\n"
+ "ldr x20, [%x[outptrs], #0x78]\n"
+ "add x20, x20, x10, LSL #1\n"
+ "st1 { v24.h }[0], [x27]\n"
+ "st1 { v25.h }[0], [x26]\n"
+ "st1 { v26.h }[0], [x25]\n"
+ "st1 { v27.h }[0], [x24]\n"
+ "st1 { v28.h }[0], [x23]\n"
+ "st1 { v29.h }[0], [x22]\n"
+ "st1 { v30.h }[0], [x21]\n"
+ "st1 { v31.h }[0], [x20]\n"
+ "22:" // Output channel oddments: Done: Store: Bit 2: End
+ "23:" // Done
+ : [weights] "+&r" (weights)
+ : [bias] "r" (bias), [inptrs] "r" (inptrs), [kernel_points] "r" ((uint64_t) kernel_points), [minmax_vals] "r" (minmax_vals), [n_output_channels] "r" ((uint64_t) n_output_channels), [outptrs] "r" (outptrs)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
new file mode 100644
index 0000000000..5d3db974f0
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_indirect_impl(const float *const *const input_ptrs, float *const *const outptrs, const void *params, unsigned int n_channels, const float activation_min, const float activation_max);
+void a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl(const unsigned int n_tile_rows, const unsigned int n_tile_cols, const float *inptr, int64_t ld_input_row, int64_t ld_input_col, float *outptr, int64_t ld_output_row, int64_t ld_output_col, const void *params, unsigned int n_channels, const float activation_min, const float activation_max);
+
+class a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy<float, float, float, float>
+{
+ private:
+ using Parent = DepthwiseDepthfirstStrategy<float, float, float, float>;
+ Parent::IndirectKernelType m_indirect_kernel = a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_indirect_impl;
+ Parent::DirectKernelType m_direct_kernel = a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl;
+
+ public:
+ using return_type = float;
+ constexpr static auto vl_type = arm_gemm::VLType::None;
+
+ constexpr static unsigned int kernel_rows = 3;
+ constexpr static unsigned int kernel_cols = 3;
+
+ constexpr static unsigned int stride_rows = 1;
+ constexpr static unsigned int stride_cols = 1;
+
+ constexpr static unsigned int output_rows = 2;
+ constexpr static unsigned int output_cols = 2;
+
+ a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst(const CPUInfo *)
+ : Parent(output_rows, output_cols, kernel_rows, kernel_cols, stride_rows, stride_cols) {}
+
+ arm_gemm::VLType get_vl_type(void) const override { return vl_type; }
+
+ Parent::IndirectKernelType get_indirect_kernel() const override { return m_indirect_kernel; }
+ Parent::DirectKernelType get_direct_kernel() const override { return m_direct_kernel; }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp
new file mode 100644
index 0000000000..fd8686c15e
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp
@@ -0,0 +1,527 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl(
+ const unsigned int n_tile_rows,
+ const unsigned int n_tile_cols,
+ const float *inptr,
+ int64_t ld_input_row,
+ int64_t ld_input_col,
+ float *outptr,
+ int64_t ld_output_row,
+ int64_t ld_output_col,
+ const void *params,
+ unsigned int n_channels,
+ const float activation_min,
+ const float activation_max
+)
+{
+ struct Args
+ {
+ const uint64_t n_tile_rows, n_tile_cols;
+ const float *inptr;
+ const uint64_t ld_input_row;
+ const uint64_t ld_input_col;
+ float *outptr;
+ const uint64_t ld_output_row;
+ const uint64_t ld_output_col;
+ const void *params;
+ const float min, max;
+
+ uint64_t tile_i = 0, tile_j = 0;
+
+ Args(
+ const unsigned int n_tile_rows,
+ const unsigned int n_tile_cols,
+ const float *inptr,
+ int64_t ld_input_row,
+ int64_t ld_input_col,
+ float *outptr,
+ int64_t ld_output_row,
+ int64_t ld_output_col,
+ const void *params,
+ const float activation_min,
+ const float activation_max
+ ) : n_tile_rows(n_tile_rows), n_tile_cols(n_tile_cols), inptr(inptr),
+ ld_input_row(ld_input_row), ld_input_col(ld_input_col), outptr(outptr),
+ ld_output_row(ld_output_row), ld_output_col(ld_output_col),
+ params(params), min(activation_min), max(activation_max)
+ {
+ }
+ };
+
+ Args params_struct(
+ n_tile_rows, n_tile_cols,
+ inptr, ld_input_row, ld_input_col,
+ outptr, ld_output_row, ld_output_col,
+ params, activation_min, activation_max
+ );
+
+ __asm__ __volatile__(
+ "mov x23, #0x0\n"
+ "mov x22, #0x0\n"
+ "1:" // Tile loop
+ "str x23, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "mov x27, #0x2\n"
+ "mov x26, #0x2\n"
+ "str x22, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "ldr x25, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
+ "ldr x24, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
+ "mul x21, x23, x25\n" // offset = tile_i * ld_input_row
+ "ldr x15, [%x[params_struct], %[offsetof_args_ld_input_col]]\n"
+ "ldr x14, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
+ "mul x20, x23, x24\n" // offset = tile_i * ld_output_row
+ "mov x23, #0x10\n" // cntb _, ALL, #1
+ "madd x21, x22, x15, x21\n" // offset += tile_j * ld_input_col
+ "ldr x13, [%x[params_struct], %[offsetof_args_inptr]]\n"
+ "lsl x15, x15, #0x2\n"
+ "ldr x12, [%x[params_struct], %[offsetof_args_outptr]]\n"
+ "madd x20, x22, x14, x20\n" // offset += tile_j * ld_output_col
+ "lsr x22, %x[n_channels], #0x2\n"
+ "add x11, x15, x15\n"
+ "ldr x10, [%x[params_struct], %[offsetof_args_params]]\n"
+ "mul x21, x21, x27\n" // offset *= kernel_stride * output_size
+ "add x13, x13, x21, LSL #2\n" // inptr[0] += offset * sizeof(float)
+ "add x9, x13, x25, LSL #2\n"
+ "mul x20, x20, x26\n" // offset *= output_tile_size
+ "add x28, x9, x25, LSL #2\n"
+ "add x12, x12, x20, LSL #2\n" // outptrs[0] += offset * sizeof(float)
+ "add x20, %x[params_struct], %[offsetof_args_min]\n"
+ "ld1r { v27.4s }, [x20]\n"
+ "add x20, %x[params_struct], %[offsetof_args_max]\n"
+ "ld1r { v26.4s }, [x20]\n"
+ "add x27, x28, x25, LSL #2\n"
+ "add x26, x11, x15\n"
+ "add x25, x12, x24, LSL #2\n"
+ "lsl x14, x14, #0x2\n"
+ "mov x21, #0x0\n"
+ "sub x20, XZR, x23\n"
+ "cbz x22, 4f\n"
+ "ldr q25, [x10, #0x0]\n"
+ "ldr q0, [x10, #0x10]\n"
+ "cmp x23, x22, LSL #4\n"
+ "ldr q1, [x10, #0x20]\n"
+ "ldr q2, [x10, #0x30]\n"
+ "ldr q3, [x10, #0x40]\n"
+ "ldr q4, [x10, #0x50]\n"
+ "ldr q5, [x10, #0x60]\n"
+ "ldr q6, [x10, #0x70]\n"
+ "ldr q7, [x10, #0x80]\n"
+ "ldr q8, [x10, #0x90]\n"
+ "add x10, x10, #0xa0\n"
+ "ldr q9, [x9, x15]\n"
+ "ld1 { v10.4s }, [x13]\n"
+ "ldr q11, [x13, x26]\n"
+ "ldr q12, [x9, x11]\n"
+ "ldr q13, [x28, x15]\n"
+ "bge 3f\n"
+ "2:" // Tile loop: Channel loop
+ "mov v24.16b, v25.16b\n fmla v24.4s, v4.4s, v9.4s\n"
+ "mov v23.16b, v25.16b\n fmla v23.4s, v3.4s, v9.4s\n"
+ "add x23, x23, #0x10\n"
+ "cmp x23, x22, LSL #4\n"
+ "mov v22.16b, v25.16b\n fmla v22.4s, v1.4s, v9.4s\n"
+ "mov v21.16b, v25.16b\n fmla v21.4s, v0.4s, v9.4s\n"
+ "ld1 { v18.4s }, [x27]\n"
+ "ldr q25, [x10, #0x0]\n"
+ "fmla v24.4s, v0.4s, v10.4s\n"
+ "ldr q20, [x28, x11]\n"
+ "fmla v23.4s, v2.4s, v11.4s\n"
+ "ldr q17, [x27, x26]\n"
+ "fmla v22.4s, v2.4s, v12.4s\n"
+ "fmla v21.4s, v1.4s, v12.4s\n"
+ "add x20, x20, #0x10\n"
+ "add x21, x21, #0x10\n"
+ "fmla v24.4s, v5.4s, v12.4s\n"
+ "fmla v23.4s, v4.4s, v12.4s\n"
+ "ldr q16, [x13, x15]\n"
+ "fmla v22.4s, v6.4s, v18.4s\n"
+ "ldr q18, [x13, x11]\n"
+ "fmla v21.4s, v3.4s, v13.4s\n"
+ "add x13, x13, #0x10\n"
+ "fmla v24.4s, v7.4s, v13.4s\n"
+ "fmla v23.4s, v6.4s, v13.4s\n"
+ "fmla v22.4s, v4.4s, v13.4s\n"
+ "fmla v21.4s, v8.4s, v17.4s\n"
+ "ld1 { v17.4s }, [x9]\n"
+ "fmla v24.4s, v1.4s, v16.4s\n"
+ "fmla v23.4s, v0.4s, v16.4s\n"
+ "ldr q16, [x9, x26]\n"
+ "add x9, x9, #0x10\n"
+ "fmla v22.4s, v5.4s, v20.4s\n"
+ "fmla v21.4s, v4.4s, v20.4s\n"
+ "ldr q4, [x10, #0x50]\n"
+ "fmla v24.4s, v2.4s, v18.4s\n"
+ "fmla v23.4s, v1.4s, v18.4s\n"
+ "ld1 { v19.4s }, [x28]\n"
+ "ldr q1, [x10, #0x20]\n"
+ "fmla v22.4s, v0.4s, v17.4s\n"
+ "ldr q0, [x10, #0x10]\n"
+ "fmla v21.4s, v2.4s, v16.4s\n"
+ "ldr q2, [x10, #0x30]\n"
+ "fmla v24.4s, v8.4s, v20.4s\n"
+ "fmla v23.4s, v7.4s, v20.4s\n"
+ "ldr q18, [x28, x26]\n"
+ "add x28, x28, #0x10\n"
+ "ldr q13, [x28, x15]\n"
+ "fmla v22.4s, v3.4s, v19.4s\n"
+ "fmla v21.4s, v5.4s, v18.4s\n"
+ "fmla v24.4s, v3.4s, v17.4s\n"
+ "ldr q17, [x27, x15]\n"
+ "ldr q3, [x10, #0x40]\n"
+ "fmla v23.4s, v5.4s, v16.4s\n"
+ "ldr q16, [x27, x11]\n"
+ "ldr q5, [x10, #0x60]\n"
+ "fmla v22.4s, v7.4s, v17.4s\n"
+ "fmla v21.4s, v6.4s, v17.4s\n"
+ "ldr q11, [x13, x26]\n"
+ "fmla v24.4s, v6.4s, v19.4s\n"
+ "ldr q9, [x9, x15]\n"
+ "fmla v23.4s, v8.4s, v18.4s\n"
+ "ld1 { v10.4s }, [x13]\n"
+ "ldr q6, [x10, #0x70]\n"
+ "fmla v22.4s, v8.4s, v16.4s\n"
+ "fmla v21.4s, v7.4s, v16.4s\n"
+ "ldr q12, [x9, x11]\n"
+ "ldr q7, [x10, #0x80]\n"
+ "fmax v24.4s, v24.4s, v27.4s\n"
+ "fmax v23.4s, v23.4s, v27.4s\n"
+ "ldr q8, [x10, #0x90]\n"
+ "fmax v22.4s, v22.4s, v27.4s\n"
+ "fmax v21.4s, v21.4s, v27.4s\n"
+ "add x27, x27, #0x10\n"
+ "fmin v24.4s, v24.4s, v26.4s\n"
+ "fmin v23.4s, v23.4s, v26.4s\n"
+ "st1 { v24.4s }, [x12]\n"
+ "add x10, x10, #0xa0\n"
+ "fmin v22.4s, v22.4s, v26.4s\n"
+ "fmin v21.4s, v21.4s, v26.4s\n"
+ "str q23, [x12, x14]\n"
+ "add x12, x12, #0x10\n"
+ "st1 { v22.4s }, [x25]\n"
+ "str q21, [x25, x14]\n"
+ "add x25, x25, #0x10\n"
+ "blt 2b\n"
+ "3:" // Tile loop: Channel tail
+ "mov v24.16b, v25.16b\n fmla v24.4s, v4.4s, v9.4s\n"
+ "mov v23.16b, v25.16b\n fmla v23.4s, v3.4s, v9.4s\n"
+ "mov v22.16b, v25.16b\n fmla v22.4s, v1.4s, v9.4s\n"
+ "mov v21.16b, v25.16b\n fmla v21.4s, v0.4s, v9.4s\n"
+ "ld1 { v18.4s }, [x27]\n"
+ "fmla v24.4s, v0.4s, v10.4s\n"
+ "ldr q20, [x28, x11]\n"
+ "fmla v23.4s, v2.4s, v11.4s\n"
+ "ldr q17, [x27, x26]\n"
+ "fmla v22.4s, v2.4s, v12.4s\n"
+ "fmla v21.4s, v1.4s, v12.4s\n"
+ "fmla v24.4s, v5.4s, v12.4s\n"
+ "fmla v23.4s, v4.4s, v12.4s\n"
+ "ldr q16, [x13, x15]\n"
+ "fmla v22.4s, v6.4s, v18.4s\n"
+ "ldr q18, [x13, x11]\n"
+ "fmla v21.4s, v3.4s, v13.4s\n"
+ "add x13, x13, #0x10\n"
+ "fmla v24.4s, v7.4s, v13.4s\n"
+ "fmla v23.4s, v6.4s, v13.4s\n"
+ "fmla v22.4s, v4.4s, v13.4s\n"
+ "fmla v21.4s, v8.4s, v17.4s\n"
+ "ld1 { v17.4s }, [x9]\n"
+ "fmla v24.4s, v1.4s, v16.4s\n"
+ "fmla v23.4s, v0.4s, v16.4s\n"
+ "ldr q16, [x9, x26]\n"
+ "add x9, x9, #0x10\n"
+ "fmla v22.4s, v5.4s, v20.4s\n"
+ "fmla v21.4s, v4.4s, v20.4s\n"
+ "fmla v24.4s, v2.4s, v18.4s\n"
+ "fmla v23.4s, v1.4s, v18.4s\n"
+ "ld1 { v19.4s }, [x28]\n"
+ "fmla v22.4s, v0.4s, v17.4s\n"
+ "fmla v21.4s, v2.4s, v16.4s\n"
+ "fmla v24.4s, v8.4s, v20.4s\n"
+ "fmla v23.4s, v7.4s, v20.4s\n"
+ "ldr q18, [x28, x26]\n"
+ "add x28, x28, #0x10\n"
+ "fmla v22.4s, v3.4s, v19.4s\n"
+ "fmla v21.4s, v5.4s, v18.4s\n"
+ "fmla v24.4s, v3.4s, v17.4s\n"
+ "ldr q17, [x27, x15]\n"
+ "fmla v23.4s, v5.4s, v16.4s\n"
+ "ldr q16, [x27, x11]\n"
+ "fmla v22.4s, v7.4s, v17.4s\n"
+ "fmla v21.4s, v6.4s, v17.4s\n"
+ "add x27, x27, #0x10\n"
+ "fmla v24.4s, v6.4s, v19.4s\n"
+ "fmla v23.4s, v8.4s, v18.4s\n"
+ "fmax v24.4s, v24.4s, v27.4s\n"
+ "fmla v22.4s, v8.4s, v16.4s\n"
+ "fmla v21.4s, v7.4s, v16.4s\n"
+ "fmax v23.4s, v23.4s, v27.4s\n"
+ "fmax v22.4s, v22.4s, v27.4s\n"
+ "fmax v21.4s, v21.4s, v27.4s\n"
+ "fmin v24.4s, v24.4s, v26.4s\n"
+ "fmin v23.4s, v23.4s, v26.4s\n"
+ "st1 { v24.4s }, [x12]\n"
+ "fmin v22.4s, v22.4s, v26.4s\n"
+ "fmin v21.4s, v21.4s, v26.4s\n"
+ "str q23, [x12, x14]\n"
+ "add x12, x12, #0x10\n"
+ "st1 { v22.4s }, [x25]\n"
+ "str q21, [x25, x14]\n"
+ "add x25, x25, #0x10\n"
+ "4:" // Tile loop: Oddments
+ "tst %x[n_channels], #0x3\n"
+ "beq 31f\n"
+ "ldr q25, [x10, #0x0]\n"
+ "ldr q0, [x10, #0x10]\n"
+ "add x24, x9, x15\n"
+ "add x23, x13, XZR\n"
+ "ldr q1, [x10, #0x20]\n"
+ "ldr q2, [x10, #0x30]\n"
+ "add x22, x13, x26\n"
+ "add x21, x9, x11\n"
+ "ldr q3, [x10, #0x40]\n"
+ "ldr q4, [x10, #0x50]\n"
+ "add x20, x28, x15\n"
+ "ldr q5, [x10, #0x60]\n"
+ "ldr q6, [x10, #0x70]\n"
+ "ldr q7, [x10, #0x80]\n"
+ "ldr q8, [x10, #0x90]\n"
+ "tbz %x[n_channels], #1, 5f\n"
+ "ldr d9, [x24], #0x8\n"
+ "ldr d10, [x23], #0x8\n"
+ "ldr d11, [x22], #0x8\n"
+ "ldr d12, [x21], #0x8\n"
+ "ldr d13, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 6f\n"
+ "ld1 { v9.s }[2], [x24]\n"
+ "ld1 { v10.s }[2], [x23]\n"
+ "ld1 { v11.s }[2], [x22]\n"
+ "ld1 { v12.s }[2], [x21]\n"
+ "ld1 { v13.s }[2], [x20]\n"
+ "b 6f\n"
+ "5:" // Tile loop: Oddments: Load inputs: (1, 1), (0, 0), (0, 3), (1, 2), (2, 1): Bit 1: Unset
+ "ldr s9, [x24, #0x0]\n"
+ "ldr s10, [x23, #0x0]\n"
+ "ldr s11, [x22, #0x0]\n"
+ "ldr s12, [x21, #0x0]\n"
+ "ldr s13, [x20, #0x0]\n"
+ "6:" // Tile loop: Oddments: Load inputs: (1, 1), (0, 0), (0, 3), (1, 2), (2, 1): Bit 1: End
+ "mov v28.16b, v25.16b\n fmla v28.4s, v4.4s, v9.4s\n"
+ "mov v29.16b, v25.16b\n fmla v29.4s, v3.4s, v9.4s\n"
+ "add x20, x27, XZR\n"
+ "mov v30.16b, v25.16b\n fmla v30.4s, v1.4s, v9.4s\n"
+ "mov v31.16b, v25.16b\n fmla v31.4s, v0.4s, v9.4s\n"
+ "fmla v28.4s, v0.4s, v10.4s\n"
+ "fmla v29.4s, v2.4s, v11.4s\n"
+ "fmla v28.4s, v5.4s, v12.4s\n"
+ "fmla v29.4s, v4.4s, v12.4s\n"
+ "fmla v30.4s, v2.4s, v12.4s\n"
+ "fmla v31.4s, v1.4s, v12.4s\n"
+ "tbz %x[n_channels], #1, 7f\n"
+ "ldr d9, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 8f\n"
+ "ld1 { v9.s }[2], [x20]\n"
+ "b 8f\n"
+ "7:" // Tile loop: Oddments: Load inputs: (3, 0): Bit 1: Unset
+ "ldr s9, [x20, #0x0]\n"
+ "8:" // Tile loop: Oddments: Load inputs: (3, 0): Bit 1: End
+ "fmla v30.4s, v6.4s, v9.4s\n"
+ "fmla v28.4s, v7.4s, v13.4s\n"
+ "add x20, x27, x26\n"
+ "fmla v29.4s, v6.4s, v13.4s\n"
+ "fmla v30.4s, v4.4s, v13.4s\n"
+ "fmla v31.4s, v3.4s, v13.4s\n"
+ "tbz %x[n_channels], #1, 9f\n"
+ "ldr d11, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 10f\n"
+ "ld1 { v11.s }[2], [x20]\n"
+ "b 10f\n"
+ "9:" // Tile loop: Oddments: Load inputs: (3, 3): Bit 1: Unset
+ "ldr s11, [x20, #0x0]\n"
+ "10:" // Tile loop: Oddments: Load inputs: (3, 3): Bit 1: End
+ "fmla v31.4s, v8.4s, v11.4s\n"
+ "add x20, x13, x15\n"
+ "tbz %x[n_channels], #1, 11f\n"
+ "ldr d12, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 12f\n"
+ "ld1 { v12.s }[2], [x20]\n"
+ "b 12f\n"
+ "11:" // Tile loop: Oddments: Load inputs: (0, 1): Bit 1: Unset
+ "ldr s12, [x20, #0x0]\n"
+ "12:" // Tile loop: Oddments: Load inputs: (0, 1): Bit 1: End
+ "fmla v28.4s, v1.4s, v12.4s\n"
+ "fmla v29.4s, v0.4s, v12.4s\n"
+ "add x20, x13, x11\n"
+ "tbz %x[n_channels], #1, 13f\n"
+ "ldr d9, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 14f\n"
+ "ld1 { v9.s }[2], [x20]\n"
+ "b 14f\n"
+ "13:" // Tile loop: Oddments: Load inputs: (0, 2): Bit 1: Unset
+ "ldr s9, [x20, #0x0]\n"
+ "14:" // Tile loop: Oddments: Load inputs: (0, 2): Bit 1: End
+ "fmla v28.4s, v2.4s, v9.4s\n"
+ "fmla v29.4s, v1.4s, v9.4s\n"
+ "add x20, x28, x11\n"
+ "tbz %x[n_channels], #1, 15f\n"
+ "ldr d10, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 16f\n"
+ "ld1 { v10.s }[2], [x20]\n"
+ "b 16f\n"
+ "15:" // Tile loop: Oddments: Load inputs: (2, 2): Bit 1: Unset
+ "ldr s10, [x20, #0x0]\n"
+ "16:" // Tile loop: Oddments: Load inputs: (2, 2): Bit 1: End
+ "fmla v28.4s, v8.4s, v10.4s\n"
+ "fmla v29.4s, v7.4s, v10.4s\n"
+ "add x20, x9, XZR\n"
+ "fmla v30.4s, v5.4s, v10.4s\n"
+ "fmla v31.4s, v4.4s, v10.4s\n"
+ "tbz %x[n_channels], #1, 17f\n"
+ "ldr d11, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 18f\n"
+ "ld1 { v11.s }[2], [x20]\n"
+ "b 18f\n"
+ "17:" // Tile loop: Oddments: Load inputs: (1, 0): Bit 1: Unset
+ "ldr s11, [x20, #0x0]\n"
+ "18:" // Tile loop: Oddments: Load inputs: (1, 0): Bit 1: End
+ "fmla v28.4s, v3.4s, v11.4s\n"
+ "fmla v30.4s, v0.4s, v11.4s\n"
+ "add x20, x9, x26\n"
+ "tbz %x[n_channels], #1, 19f\n"
+ "ldr d12, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 20f\n"
+ "ld1 { v12.s }[2], [x20]\n"
+ "b 20f\n"
+ "19:" // Tile loop: Oddments: Load inputs: (1, 3): Bit 1: Unset
+ "ldr s12, [x20, #0x0]\n"
+ "20:" // Tile loop: Oddments: Load inputs: (1, 3): Bit 1: End
+ "fmla v29.4s, v5.4s, v12.4s\n"
+ "fmla v31.4s, v2.4s, v12.4s\n"
+ "add x20, x28, XZR\n"
+ "tbz %x[n_channels], #1, 21f\n"
+ "ldr d9, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 22f\n"
+ "ld1 { v9.s }[2], [x20]\n"
+ "b 22f\n"
+ "21:" // Tile loop: Oddments: Load inputs: (2, 0): Bit 1: Unset
+ "ldr s9, [x20, #0x0]\n"
+ "22:" // Tile loop: Oddments: Load inputs: (2, 0): Bit 1: End
+ "fmla v28.4s, v6.4s, v9.4s\n"
+ "fmla v30.4s, v3.4s, v9.4s\n"
+ "add x20, x28, x26\n"
+ "tbz %x[n_channels], #1, 23f\n"
+ "ldr d10, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 24f\n"
+ "ld1 { v10.s }[2], [x20]\n"
+ "b 24f\n"
+ "23:" // Tile loop: Oddments: Load inputs: (2, 3): Bit 1: Unset
+ "ldr s10, [x20, #0x0]\n"
+ "24:" // Tile loop: Oddments: Load inputs: (2, 3): Bit 1: End
+ "fmla v29.4s, v8.4s, v10.4s\n"
+ "fmla v31.4s, v5.4s, v10.4s\n"
+ "add x20, x27, x15\n"
+ "tbz %x[n_channels], #1, 25f\n"
+ "ldr d11, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 26f\n"
+ "ld1 { v11.s }[2], [x20]\n"
+ "b 26f\n"
+ "25:" // Tile loop: Oddments: Load inputs: (3, 1): Bit 1: Unset
+ "ldr s11, [x20, #0x0]\n"
+ "26:" // Tile loop: Oddments: Load inputs: (3, 1): Bit 1: End
+ "fmla v30.4s, v7.4s, v11.4s\n"
+ "fmla v31.4s, v6.4s, v11.4s\n"
+ "add x20, x27, x11\n"
+ "tbz %x[n_channels], #1, 27f\n"
+ "ldr d12, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 28f\n"
+ "ld1 { v12.s }[2], [x20]\n"
+ "b 28f\n"
+ "27:" // Tile loop: Oddments: Load inputs: (3, 2): Bit 1: Unset
+ "ldr s12, [x20, #0x0]\n"
+ "28:" // Tile loop: Oddments: Load inputs: (3, 2): Bit 1: End
+ "fmla v30.4s, v8.4s, v12.4s\n"
+ "fmla v31.4s, v7.4s, v12.4s\n"
+ "fmax v28.4s, v28.4s, v27.4s\n"
+ "fmax v29.4s, v29.4s, v27.4s\n"
+ "fmax v30.4s, v30.4s, v27.4s\n"
+ "fmax v31.4s, v31.4s, v27.4s\n"
+ "fmin v28.4s, v28.4s, v26.4s\n"
+ "fmin v29.4s, v29.4s, v26.4s\n"
+ "fmin v30.4s, v30.4s, v26.4s\n"
+ "fmin v31.4s, v31.4s, v26.4s\n"
+ "tbz %x[n_channels], #1, 29f\n"
+ "mov x21, x12\n"
+ "mov x20, x25\n"
+ "st1 { v28.d }[0], [x21], x14\n"
+ "st1 { v30.d }[0], [x20], x14\n"
+ "add x12, x12, #0x8\n"
+ "add x25, x25, #0x8\n"
+ "st1 { v29.d }[0], [x21]\n"
+ "st1 { v31.d }[0], [x20]\n"
+ "tbz %x[n_channels], #0, 30f\n"
+ "mov x21, x12\n"
+ "mov x20, x25\n"
+ "st1 { v28.s }[2], [x21], x14\n"
+ "st1 { v30.s }[2], [x20], x14\n"
+ "st1 { v29.s }[2], [x21]\n"
+ "st1 { v31.s }[2], [x20]\n"
+ "b 30f\n"
+ "29:" // Tile loop: Oddments: Store: Bit 1: Unset
+ "mov x21, x12\n"
+ "mov x20, x25\n"
+ "st1 { v28.s }[0], [x21], x14\n"
+ "st1 { v30.s }[0], [x20], x14\n"
+ "st1 { v29.s }[0], [x21]\n"
+ "st1 { v31.s }[0], [x20]\n"
+ "30:" // Tile loop: Oddments: Store: Bit 1: End
+ "31:" // Tile loop: End
+ "ldr x22, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "ldr x23, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "add x22, x22, #0x1\n"
+ "add x21, x23, #0x1\n"
+ "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+ "cmp x22, x20\n"
+ "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
+ "csel x23, x23, x21, LT\n"
+ "csel x22, x22, XZR, LT\n"
+ "cmp x23, x20\n"
+ "blt 1b\n"
+ :
+ : [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp
new file mode 100644
index 0000000000..7dedfd972a
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp
@@ -0,0 +1,513 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_indirect_impl(
+  const float *const *const input_ptrs,
+  float *const *const outptrs,
+  const void *params,
+  unsigned int n_channels,
+  const float activation_min,
+  const float activation_max
+)  // fp32 NHWC 3x3 stride-1 depthwise kernel, 2x2 output tile, inputs supplied via an indirection-pointer array
+{
+  struct Args  // argument block read by the asm below via compile-time offsetof() operands
+  {
+    float *const *outptrs;
+    const void *params;
+    const float min, max;
+    const float *inptrs[16];  // input pointers copied from input_ptrs in the order the asm consumes them (see ctor)
+
+    Args(
+      const float *const *const input_ptrs,
+      float *const *const outptrs,
+      const void *const params,
+      const float min,
+      const float max
+    ) : outptrs(outptrs), params(params), min(min), max(max)
+    {
+      inptrs[0] = input_ptrs[5];  // fixed permutation of the 16 input pointers — presumably matches the kernel's load schedule; TODO confirm against the asm
+      inptrs[1] = input_ptrs[0];
+      inptrs[2] = input_ptrs[3];
+      inptrs[3] = input_ptrs[6];
+      inptrs[4] = input_ptrs[9];
+      inptrs[5] = input_ptrs[12];
+      inptrs[6] = input_ptrs[15];
+      inptrs[7] = input_ptrs[1];
+      inptrs[8] = input_ptrs[2];
+      inptrs[9] = input_ptrs[10];
+      inptrs[10] = input_ptrs[4];
+      inptrs[11] = input_ptrs[7];
+      inptrs[12] = input_ptrs[8];
+      inptrs[13] = input_ptrs[11];
+      inptrs[14] = input_ptrs[13];
+      inptrs[15] = input_ptrs[14];
+
+    }
+  };
+
+  Args params_struct(input_ptrs, outptrs, params,
+                     activation_min, activation_max);
+
+  __asm__ __volatile__(
+    "ldr x21, [%x[params_struct], %[offsetof_args_outptrs]]\n"
+    "mov x16, #0x10\n"  // cntb _, ALL, #1
+    "lsr x15, %x[n_channels], #0x2\n"  // x15 = n_channels / 4 = number of full 4-float vector iterations
+    "ldr x14, [%x[params_struct], %[offsetof_args_params]]\n"
+    "add x20, %x[params_struct], %[offsetof_args_min]\n"
+    "ld1r { v27.4s }, [x20]\n"  // broadcast activation_min into all four lanes
+    "add x20, %x[params_struct], %[offsetof_args_max]\n"
+    "ld1r { v26.4s }, [x20]\n"  // broadcast activation_max
+    "add x13, %x[params_struct], %[offsetof_Args_inptrs]\n"
+    "ldp x12, x11, [x21, #0x0]\n"  // x12,x11,x10,x9 = the four output pointers (2x2 tile)
+    "ldp x10, x9, [x21, #0x10]\n"
+    "mov x28, #0x0\n"
+    "sub x27, XZR, x16\n"
+    "cbz x15, 3f\n"  // no full 4-channel vectors: go straight to the oddments tail
+    "ldr q25, [x14, #0x0]\n"  // accumulator initial value (first entry of the packed params block)
+    "ldr q0, [x14, #0x10]\n"  // q0-q8: the nine filter values of the 3x3 kernel
+    "cmp x16, x15, LSL #4\n"
+    "ldr q1, [x14, #0x20]\n"
+    "ldr q2, [x14, #0x30]\n"
+    "ldr q3, [x14, #0x40]\n"
+    "ldr q4, [x14, #0x50]\n"
+    "ldr q5, [x14, #0x60]\n"
+    "ldr q6, [x14, #0x70]\n"
+    "ldr q7, [x14, #0x80]\n"
+    "ldr q8, [x14, #0x90]\n"
+    "add x14, x14, #0xa0\n"
+    "ldp x21, x20, [x13, #0x0]\n"
+    "ldr q9, [x21, x28]\n"
+    "ldr q10, [x20, x28]\n"
+    "ldp x21, x20, [x13, #0x10]\n"
+    "ldr q11, [x21, x28]\n"
+    "ldr q12, [x20, x28]\n"
+    "ldr x20, [x13, #0x20]\n"
+    "ldr q13, [x20, x28]\n"
+    "bge 2f\n"
+    "1:"  // Channel loop
+    "mov v24.16b, v25.16b\n fmla v24.4s, v4.4s, v9.4s\n"
+    "mov v23.16b, v25.16b\n fmla v23.4s, v3.4s, v9.4s\n"
+    "ldr x21, [x13, #0x28]\n"
+    "ldr x20, [x13, #0x30]\n"
+    "mov v22.16b, v25.16b\n fmla v22.4s, v1.4s, v9.4s\n"
+    "mov v21.16b, v25.16b\n fmla v21.4s, v0.4s, v9.4s\n"
+    "ldr q18, [x21, x28]\n"
+    "ldr q25, [x14, #0x0]\n"
+    "fmla v24.4s, v0.4s, v10.4s\n"
+    "fmla v23.4s, v2.4s, v11.4s\n"
+    "ldr q17, [x20, x28]\n"
+    "ldr x21, [x13, #0x38]\n"
+    "fmla v22.4s, v2.4s, v12.4s\n"
+    "fmla v21.4s, v1.4s, v12.4s\n"
+    "ldr x20, [x13, #0x48]\n"
+    "ldr q20, [x20, x28]\n"
+    "fmla v24.4s, v5.4s, v12.4s\n"
+    "fmla v23.4s, v4.4s, v12.4s\n"
+    "ldr q16, [x21, x28]\n"
+    "ldr x20, [x13, #0x40]\n"
+    "fmla v22.4s, v6.4s, v18.4s\n"
+    "ldr q18, [x20, x28]\n"
+    "fmla v21.4s, v3.4s, v13.4s\n"
+    "ldr x20, [x13, #0x50]\n"
+    "fmla v24.4s, v7.4s, v13.4s\n"
+    "fmla v23.4s, v6.4s, v13.4s\n"
+    "ldr x22, [x13, #0x58]\n"
+    "ldr x21, [x13, #0x60]\n"
+    "fmla v22.4s, v4.4s, v13.4s\n"
+    "fmla v21.4s, v8.4s, v17.4s\n"
+    "ldr q17, [x20, x28]\n"
+    "ldr x20, [x13, #0x68]\n"
+    "fmla v24.4s, v1.4s, v16.4s\n"
+    "fmla v23.4s, v0.4s, v16.4s\n"
+    "ldr q16, [x22, x28]\n"
+    "ldr x26, [x13, #0x70]\n"
+    "fmla v22.4s, v5.4s, v20.4s\n"
+    "fmla v21.4s, v4.4s, v20.4s\n"
+    "ldr q4, [x14, #0x50]\n"
+    "ldr x25, [x13, #0x78]\n"
+    "fmla v24.4s, v2.4s, v18.4s\n"
+    "fmla v23.4s, v1.4s, v18.4s\n"
+    "ldr q19, [x21, x28]\n"
+    "ldr q1, [x14, #0x20]\n"
+    "fmla v22.4s, v0.4s, v17.4s\n"
+    "ldr q0, [x14, #0x10]\n"
+    "fmla v21.4s, v2.4s, v16.4s\n"
+    "ldr q2, [x14, #0x30]\n"
+    "fmla v24.4s, v8.4s, v20.4s\n"
+    "fmla v23.4s, v7.4s, v20.4s\n"
+    "ldr q18, [x20, x28]\n"
+    "ldp x24, x23, [x13, #0x0]\n"
+    "fmla v22.4s, v3.4s, v19.4s\n"
+    "fmla v21.4s, v5.4s, v18.4s\n"
+    "ldp x22, x21, [x13, #0x10]\n"
+    "ldr x20, [x13, #0x20]\n"
+    "ldr q13, [x20, x16]\n"
+    "fmla v24.4s, v3.4s, v17.4s\n"
+    "ldr q17, [x26, x28]\n"
+    "fmla v23.4s, v5.4s, v16.4s\n"
+    "ldr q16, [x25, x28]\n"
+    "ldr q3, [x14, #0x40]\n"
+    "fmla v22.4s, v7.4s, v17.4s\n"
+    "fmla v21.4s, v6.4s, v17.4s\n"
+    "ldr q11, [x22, x16]\n"
+    "ldr q5, [x14, #0x60]\n"
+    "fmla v24.4s, v6.4s, v19.4s\n"
+    "fmla v23.4s, v8.4s, v18.4s\n"
+    "ldr q9, [x24, x16]\n"
+    "ldr q10, [x23, x16]\n"
+    "fmla v22.4s, v8.4s, v16.4s\n"
+    "fmla v21.4s, v7.4s, v16.4s\n"
+    "ldr q12, [x21, x16]\n"
+    "ldr q6, [x14, #0x70]\n"
+    "fmax v24.4s, v24.4s, v27.4s\n"  // clamp the four accumulators to [activation_min, activation_max]
+    "fmax v23.4s, v23.4s, v27.4s\n"
+    "ldr q7, [x14, #0x80]\n"
+    "ldr q8, [x14, #0x90]\n"
+    "fmax v22.4s, v22.4s, v27.4s\n"
+    "fmax v21.4s, v21.4s, v27.4s\n"
+    "add x16, x16, #0x10\n"
+    "add x27, x27, #0x10\n"
+    "fmin v24.4s, v24.4s, v26.4s\n"
+    "fmin v23.4s, v23.4s, v26.4s\n"
+    "cmp x16, x15, LSL #4\n"
+    "fmin v22.4s, v22.4s, v26.4s\n"
+    "fmin v21.4s, v21.4s, v26.4s\n"
+    "add x28, x28, #0x10\n"
+    "str q24, [x12, x27]\n"  // store one 4-channel vector to each of the 2x2 output positions
+    "add x14, x14, #0xa0\n"
+    "str q23, [x11, x27]\n"
+    "str q22, [x10, x27]\n"
+    "str q21, [x9, x27]\n"
+    "blt 1b\n"
+    "2:"  // Channel tail
+    "mov v24.16b, v25.16b\n fmla v24.4s, v4.4s, v9.4s\n"
+    "mov v23.16b, v25.16b\n fmla v23.4s, v3.4s, v9.4s\n"
+    "ldr x21, [x13, #0x28]\n"
+    "ldr x20, [x13, #0x30]\n"
+    "mov v22.16b, v25.16b\n fmla v22.4s, v1.4s, v9.4s\n"
+    "mov v21.16b, v25.16b\n fmla v21.4s, v0.4s, v9.4s\n"
+    "ldr q18, [x21, x28]\n"
+    "ldr x21, [x13, #0x38]\n"
+    "fmla v24.4s, v0.4s, v10.4s\n"
+    "fmla v23.4s, v2.4s, v11.4s\n"
+    "ldr q17, [x20, x28]\n"
+    "ldr x20, [x13, #0x48]\n"
+    "ldr q20, [x20, x28]\n"
+    "fmla v22.4s, v2.4s, v12.4s\n"
+    "fmla v21.4s, v1.4s, v12.4s\n"
+    "ldr x20, [x13, #0x40]\n"
+    "fmla v24.4s, v5.4s, v12.4s\n"
+    "fmla v23.4s, v4.4s, v12.4s\n"
+    "ldr q16, [x21, x28]\n"
+    "ldr x21, [x13, #0x50]\n"
+    "fmla v22.4s, v6.4s, v18.4s\n"
+    "ldr q18, [x20, x28]\n"
+    "fmla v21.4s, v3.4s, v13.4s\n"
+    "ldr x20, [x13, #0x58]\n"
+    "fmla v24.4s, v7.4s, v13.4s\n"
+    "fmla v23.4s, v6.4s, v13.4s\n"
+    "ldr x23, [x13, #0x60]\n"
+    "ldr x22, [x13, #0x68]\n"
+    "fmla v22.4s, v4.4s, v13.4s\n"
+    "fmla v21.4s, v8.4s, v17.4s\n"
+    "ldr q17, [x21, x28]\n"
+    "ldr x21, [x13, #0x70]\n"
+    "fmla v24.4s, v1.4s, v16.4s\n"
+    "fmla v23.4s, v0.4s, v16.4s\n"
+    "ldr q16, [x20, x28]\n"
+    "ldr x20, [x13, #0x78]\n"
+    "fmla v22.4s, v5.4s, v20.4s\n"
+    "fmla v21.4s, v4.4s, v20.4s\n"
+    "add x27, x27, #0x10\n"
+    "fmla v24.4s, v2.4s, v18.4s\n"
+    "fmla v23.4s, v1.4s, v18.4s\n"
+    "ldr q19, [x23, x28]\n"
+    "fmla v22.4s, v0.4s, v17.4s\n"
+    "fmla v21.4s, v2.4s, v16.4s\n"
+    "fmla v24.4s, v8.4s, v20.4s\n"
+    "fmla v23.4s, v7.4s, v20.4s\n"
+    "ldr q18, [x22, x28]\n"
+    "fmla v22.4s, v3.4s, v19.4s\n"
+    "fmla v21.4s, v5.4s, v18.4s\n"
+    "fmla v24.4s, v3.4s, v17.4s\n"
+    "ldr q17, [x21, x28]\n"
+    "fmla v23.4s, v5.4s, v16.4s\n"
+    "ldr q16, [x20, x28]\n"
+    "fmla v22.4s, v7.4s, v17.4s\n"
+    "fmla v21.4s, v6.4s, v17.4s\n"
+    "add x28, x28, #0x10\n"
+    "fmla v24.4s, v6.4s, v19.4s\n"
+    "fmla v23.4s, v8.4s, v18.4s\n"
+    "fmax v24.4s, v24.4s, v27.4s\n"
+    "fmla v22.4s, v8.4s, v16.4s\n"
+    "fmla v21.4s, v7.4s, v16.4s\n"
+    "fmax v23.4s, v23.4s, v27.4s\n"
+    "fmax v22.4s, v22.4s, v27.4s\n"
+    "fmax v21.4s, v21.4s, v27.4s\n"
+    "fmin v24.4s, v24.4s, v26.4s\n"
+    "fmin v23.4s, v23.4s, v26.4s\n"
+    "str q24, [x12, x27]\n"
+    "fmin v22.4s, v22.4s, v26.4s\n"
+    "fmin v21.4s, v21.4s, v26.4s\n"
+    "str q23, [x11, x27]\n"
+    "str q22, [x10, x27]\n"
+    "str q21, [x9, x27]\n"
+    "3:"  // Oddments
+    "tst %x[n_channels], #0x3\n"  // any remainder channels (n_channels % 4)?
+    "beq 30f\n"
+    "ldr q25, [x14, #0x0]\n"
+    "ldr q0, [x14, #0x10]\n"
+    "mov x20, x28\n"
+    "add x12, x12, x20\n"
+    "ldr q1, [x14, #0x20]\n"
+    "ldr q2, [x14, #0x30]\n"
+    "add x11, x11, x20\n"
+    "add x10, x10, x20\n"
+    "ldr q3, [x14, #0x40]\n"
+    "ldr q4, [x14, #0x50]\n"
+    "add x9, x9, x20\n"
+    "ldr q5, [x14, #0x60]\n"
+    "ldr q6, [x14, #0x70]\n"
+    "ldr q7, [x14, #0x80]\n"
+    "ldr q8, [x14, #0x90]\n"
+    "ldr x24, [x13, #0x0]\n"
+    "ldr x23, [x13, #0x8]\n"
+    "add x24, x24, x28\n"
+    "add x23, x23, x28\n"
+    "ldr x22, [x13, #0x10]\n"
+    "ldr x21, [x13, #0x18]\n"
+    "add x22, x22, x28\n"
+    "add x21, x21, x28\n"
+    "ldr x20, [x13, #0x20]\n"
+    "add x20, x20, x28\n"
+    "tbz %x[n_channels], #1, 4f\n"  // bit tests on n_channels select 1-, 2- or 3-lane partial loads/stores below
+    "ld1 { v9.d }[0], [x24], #0x8\n"
+    "ld1 { v10.d }[0], [x23], #0x8\n"
+    "ld1 { v11.d }[0], [x22], #0x8\n"
+    "ld1 { v12.d }[0], [x21], #0x8\n"
+    "ld1 { v13.d }[0], [x20], #0x8\n"
+    "tbz %x[n_channels], #0, 5f\n"
+    "ld1 { v9.s }[2], [x24], #0x4\n"
+    "ld1 { v10.s }[2], [x23], #0x4\n"
+    "ld1 { v11.s }[2], [x22], #0x4\n"
+    "ld1 { v12.s }[2], [x21], #0x4\n"
+    "ld1 { v13.s }[2], [x20], #0x4\n"
+    "b 5f\n"
+    "4:"  // Oddments: Load inputs (1, 1), (0, 0), (0, 3), (1, 2), (2, 1): Bit 1: Unset
+    "ld1 { v9.s }[0], [x24], #0x4\n"
+    "ld1 { v10.s }[0], [x23], #0x4\n"
+    "ld1 { v11.s }[0], [x22], #0x4\n"
+    "ld1 { v12.s }[0], [x21], #0x4\n"
+    "ld1 { v13.s }[0], [x20], #0x4\n"
+    "5:"  // Oddments: Load inputs (1, 1), (0, 0), (0, 3), (1, 2), (2, 1): Bit 1: End
+    "mov v28.16b, v25.16b\n fmla v28.4s, v4.4s, v9.4s\n"
+    "mov v29.16b, v25.16b\n fmla v29.4s, v3.4s, v9.4s\n"
+    "ldr x20, [x13, #0x28]\n"
+    "add x20, x20, x28\n"
+    "mov v30.16b, v25.16b\n fmla v30.4s, v1.4s, v9.4s\n"
+    "mov v31.16b, v25.16b\n fmla v31.4s, v0.4s, v9.4s\n"
+    "fmla v28.4s, v0.4s, v10.4s\n"
+    "fmla v29.4s, v2.4s, v11.4s\n"
+    "fmla v28.4s, v5.4s, v12.4s\n"
+    "fmla v29.4s, v4.4s, v12.4s\n"
+    "fmla v30.4s, v2.4s, v12.4s\n"
+    "fmla v31.4s, v1.4s, v12.4s\n"
+    "tbz %x[n_channels], #1, 6f\n"
+    "ld1 { v9.d }[0], [x20], #0x8\n"
+    "tbz %x[n_channels], #0, 7f\n"
+    "ld1 { v9.s }[2], [x20], #0x4\n"
+    "b 7f\n"
+    "6:"  // Oddments: Load input (3, 0): Bit 1: Unset
+    "ld1 { v9.s }[0], [x20], #0x4\n"
+    "7:"  // Oddments: Load input (3, 0): Bit 1: End
+    "fmla v30.4s, v6.4s, v9.4s\n"
+    "ldr x20, [x13, #0x30]\n"
+    "fmla v28.4s, v7.4s, v13.4s\n"
+    "add x20, x20, x28\n"
+    "fmla v29.4s, v6.4s, v13.4s\n"
+    "fmla v30.4s, v4.4s, v13.4s\n"
+    "fmla v31.4s, v3.4s, v13.4s\n"
+    "tbz %x[n_channels], #1, 8f\n"
+    "ld1 { v11.d }[0], [x20], #0x8\n"
+    "tbz %x[n_channels], #0, 9f\n"
+    "ld1 { v11.s }[2], [x20], #0x4\n"
+    "b 9f\n"
+    "8:"  // Oddments: Load input (3, 3): Bit 1: Unset
+    "ld1 { v11.s }[0], [x20], #0x4\n"
+    "9:"  // Oddments: Load input (3, 3): Bit 1: End
+    "ldr x20, [x13, #0x38]\n"
+    "fmla v31.4s, v8.4s, v11.4s\n"
+    "add x20, x20, x28\n"
+    "tbz %x[n_channels], #1, 10f\n"
+    "ld1 { v12.d }[0], [x20], #0x8\n"
+    "tbz %x[n_channels], #0, 11f\n"
+    "ld1 { v12.s }[2], [x20], #0x4\n"
+    "b 11f\n"
+    "10:"  // Oddments: Load input (0, 1): Bit 1: Unset
+    "ld1 { v12.s }[0], [x20], #0x4\n"
+    "11:"  // Oddments: Load input (0, 1): Bit 1: End
+    "ldr x20, [x13, #0x40]\n"
+    "fmla v28.4s, v1.4s, v12.4s\n"
+    "fmla v29.4s, v0.4s, v12.4s\n"
+    "add x20, x20, x28\n"
+    "tbz %x[n_channels], #1, 12f\n"
+    "ld1 { v9.d }[0], [x20], #0x8\n"
+    "tbz %x[n_channels], #0, 13f\n"
+    "ld1 { v9.s }[2], [x20], #0x4\n"
+    "b 13f\n"
+    "12:"  // Oddments: Load input (0, 2): Bit 1: Unset
+    "ld1 { v9.s }[0], [x20], #0x4\n"
+    "13:"  // Oddments: Load input (0, 2): Bit 1: End
+    "ldr x20, [x13, #0x48]\n"
+    "fmla v28.4s, v2.4s, v9.4s\n"
+    "fmla v29.4s, v1.4s, v9.4s\n"
+    "add x20, x20, x28\n"
+    "tbz %x[n_channels], #1, 14f\n"
+    "ld1 { v10.d }[0], [x20], #0x8\n"
+    "tbz %x[n_channels], #0, 15f\n"
+    "ld1 { v10.s }[2], [x20], #0x4\n"
+    "b 15f\n"
+    "14:"  // Oddments: Load input (2, 2): Bit 1: Unset
+    "ld1 { v10.s }[0], [x20], #0x4\n"
+    "15:"  // Oddments: Load input (2, 2): Bit 1: End
+    "ldr x20, [x13, #0x50]\n"
+    "fmla v28.4s, v8.4s, v10.4s\n"
+    "fmla v29.4s, v7.4s, v10.4s\n"
+    "add x20, x20, x28\n"
+    "fmla v30.4s, v5.4s, v10.4s\n"
+    "fmla v31.4s, v4.4s, v10.4s\n"
+    "tbz %x[n_channels], #1, 16f\n"
+    "ld1 { v11.d }[0], [x20], #0x8\n"
+    "tbz %x[n_channels], #0, 17f\n"
+    "ld1 { v11.s }[2], [x20], #0x4\n"
+    "b 17f\n"
+    "16:"  // Oddments: Load input (1, 0): Bit 1: Unset
+    "ld1 { v11.s }[0], [x20], #0x4\n"
+    "17:"  // Oddments: Load input (1, 0): Bit 1: End
+    "ldr x20, [x13, #0x58]\n"
+    "fmla v28.4s, v3.4s, v11.4s\n"
+    "fmla v30.4s, v0.4s, v11.4s\n"
+    "add x20, x20, x28\n"
+    "tbz %x[n_channels], #1, 18f\n"
+    "ld1 { v12.d }[0], [x20], #0x8\n"
+    "tbz %x[n_channels], #0, 19f\n"
+    "ld1 { v12.s }[2], [x20], #0x4\n"
+    "b 19f\n"
+    "18:"  // Oddments: Load input (1, 3): Bit 1: Unset
+    "ld1 { v12.s }[0], [x20], #0x4\n"
+    "19:"  // Oddments: Load input (1, 3): Bit 1: End
+    "ldr x20, [x13, #0x60]\n"
+    "fmla v29.4s, v5.4s, v12.4s\n"
+    "fmla v31.4s, v2.4s, v12.4s\n"
+    "add x20, x20, x28\n"
+    "tbz %x[n_channels], #1, 20f\n"
+    "ld1 { v9.d }[0], [x20], #0x8\n"
+    "tbz %x[n_channels], #0, 21f\n"
+    "ld1 { v9.s }[2], [x20], #0x4\n"
+    "b 21f\n"
+    "20:"  // Oddments: Load input (2, 0): Bit 1: Unset
+    "ld1 { v9.s }[0], [x20], #0x4\n"
+    "21:"  // Oddments: Load input (2, 0): Bit 1: End
+    "ldr x20, [x13, #0x68]\n"
+    "fmla v28.4s, v6.4s, v9.4s\n"
+    "fmla v30.4s, v3.4s, v9.4s\n"
+    "add x20, x20, x28\n"
+    "tbz %x[n_channels], #1, 22f\n"
+    "ld1 { v10.d }[0], [x20], #0x8\n"
+    "tbz %x[n_channels], #0, 23f\n"
+    "ld1 { v10.s }[2], [x20], #0x4\n"
+    "b 23f\n"
+    "22:"  // Oddments: Load input (2, 3): Bit 1: Unset
+    "ld1 { v10.s }[0], [x20], #0x4\n"
+    "23:"  // Oddments: Load input (2, 3): Bit 1: End
+    "ldr x20, [x13, #0x70]\n"
+    "fmla v29.4s, v8.4s, v10.4s\n"
+    "fmla v31.4s, v5.4s, v10.4s\n"
+    "add x20, x20, x28\n"
+    "tbz %x[n_channels], #1, 24f\n"
+    "ld1 { v11.d }[0], [x20], #0x8\n"
+    "tbz %x[n_channels], #0, 25f\n"
+    "ld1 { v11.s }[2], [x20], #0x4\n"
+    "b 25f\n"
+    "24:"  // Oddments: Load input (3, 1): Bit 1: Unset
+    "ld1 { v11.s }[0], [x20], #0x4\n"
+    "25:"  // Oddments: Load input (3, 1): Bit 1: End
+    "ldr x20, [x13, #0x78]\n"
+    "fmla v30.4s, v7.4s, v11.4s\n"
+    "fmla v31.4s, v6.4s, v11.4s\n"
+    "add x20, x20, x28\n"
+    "tbz %x[n_channels], #1, 26f\n"
+    "ld1 { v12.d }[0], [x20], #0x8\n"
+    "tbz %x[n_channels], #0, 27f\n"
+    "ld1 { v12.s }[2], [x20], #0x4\n"
+    "b 27f\n"
+    "26:"  // Oddments: Load input (3, 2): Bit 1: Unset
+    "ld1 { v12.s }[0], [x20], #0x4\n"
+    "27:"  // Oddments: Load input (3, 2): Bit 1: End
+    "fmla v30.4s, v8.4s, v12.4s\n"
+    "fmla v31.4s, v7.4s, v12.4s\n"
+    "fmax v28.4s, v28.4s, v27.4s\n"  // apply the activation clamp to the remainder lanes too
+    "fmax v29.4s, v29.4s, v27.4s\n"
+    "fmax v30.4s, v30.4s, v27.4s\n"
+    "fmax v31.4s, v31.4s, v27.4s\n"
+    "fmin v28.4s, v28.4s, v26.4s\n"
+    "fmin v29.4s, v29.4s, v26.4s\n"
+    "fmin v30.4s, v30.4s, v26.4s\n"
+    "fmin v31.4s, v31.4s, v26.4s\n"
+    "tbz %x[n_channels], #1, 28f\n"
+    "st1 { v28.d }[0], [x12], #0x8\n"
+    "st1 { v29.d }[0], [x11], #0x8\n"
+    "st1 { v30.d }[0], [x10], #0x8\n"
+    "st1 { v31.d }[0], [x9], #0x8\n"
+    "tbz %x[n_channels], #0, 29f\n"
+    "st1 { v28.s }[2], [x12], #0x4\n"
+    "st1 { v29.s }[2], [x11], #0x4\n"
+    "st1 { v30.s }[2], [x10], #0x4\n"
+    "st1 { v31.s }[2], [x9], #0x4\n"
+    "b 29f\n"
+    "28:"  // Oddments: Store: Bit 1: Unset
+    "st1 { v28.s }[0], [x12], #0x4\n"
+    "st1 { v29.s }[0], [x11], #0x4\n"
+    "st1 { v30.s }[0], [x10], #0x4\n"
+    "st1 { v31.s }[0], [x9], #0x4\n"
+    "29:"  // Oddments: Store: Bit 1: End
+    "30:"  // End
+    :
+    : [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
+    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+    );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp
new file mode 100644
index 0000000000..c2d86615e3
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(const float *const *const input_ptrs, float *const *const outptrs, const void *params, unsigned int n_channels, const float activation_min, const float activation_max);
+void a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl(const unsigned int n_tile_rows, const unsigned int n_tile_cols, const float *inptr, int64_t ld_input_row, int64_t ld_input_col, float *outptr, int64_t ld_output_row, int64_t ld_output_col, const void *params, unsigned int n_channels, const float activation_min, const float activation_max);
+
+class a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst : public DepthwiseDepthfirstStrategy<float, float, float, float>  // strategy descriptor binding the two kernel entry points declared above
+{
+  private:
+  using Parent = DepthwiseDepthfirstStrategy<float, float, float, float>;
+  Parent::IndirectKernelType m_indirect_kernel = a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl;  // variant taking an array of input pointers
+  Parent::DirectKernelType m_direct_kernel = a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl;  // variant taking a base pointer plus row/col strides
+
+  public:
+  using return_type = float;
+  constexpr static auto vl_type = arm_gemm::VLType::None;  // VLType::None: fixed-width NEON (not a scalable-vector kernel)
+
+  constexpr static unsigned int kernel_rows = 3;  // 3x3 filter
+  constexpr static unsigned int kernel_cols = 3;
+
+  constexpr static unsigned int stride_rows = 1;  // unit stride in both dimensions
+  constexpr static unsigned int stride_cols = 1;
+
+  constexpr static unsigned int output_rows = 3;  // each kernel invocation produces a 3x3 output tile
+  constexpr static unsigned int output_cols = 3;
+
+  a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst(const CPUInfo *)  // CPUInfo is unused; the ctor just forwards the static geometry to the parent
+  : Parent(output_rows, output_cols, kernel_rows, kernel_cols, stride_rows, stride_cols) {}
+
+  arm_gemm::VLType get_vl_type(void) const override { return vl_type; }
+
+  Parent::IndirectKernelType get_indirect_kernel() const override { return m_indirect_kernel; }
+  Parent::DirectKernelType get_direct_kernel() const override { return m_direct_kernel; }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp
new file mode 100644
index 0000000000..9bfcd9cd3c
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp
@@ -0,0 +1,828 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl(
+ const unsigned int n_tile_rows,
+ const unsigned int n_tile_cols,
+ const float *inptr,
+ int64_t ld_input_row,
+ int64_t ld_input_col,
+ float *outptr,
+ int64_t ld_output_row,
+ int64_t ld_output_col,
+ const void *params,
+ unsigned int n_channels,
+ const float activation_min,
+ const float activation_max
+)
+{
+ struct Args
+ {
+ const uint64_t n_tile_rows, n_tile_cols;
+ const float *inptr;
+ const uint64_t ld_input_row;
+ const uint64_t ld_input_col;
+ float *outptr;
+ const uint64_t ld_output_row;
+ const uint64_t ld_output_col;
+ const void *params;
+ const float min, max;
+
+ uint64_t tile_i = 0, tile_j = 0;
+
+ Args(
+ const unsigned int n_tile_rows,
+ const unsigned int n_tile_cols,
+ const float *inptr,
+ int64_t ld_input_row,
+ int64_t ld_input_col,
+ float *outptr,
+ int64_t ld_output_row,
+ int64_t ld_output_col,
+ const void *params,
+ const float activation_min,
+ const float activation_max
+ ) : n_tile_rows(n_tile_rows), n_tile_cols(n_tile_cols), inptr(inptr),
+ ld_input_row(ld_input_row), ld_input_col(ld_input_col), outptr(outptr),
+ ld_output_row(ld_output_row), ld_output_col(ld_output_col),
+ params(params), min(activation_min), max(activation_max)
+ {
+ }
+ };
+
+ Args params_struct(
+ n_tile_rows, n_tile_cols,
+ inptr, ld_input_row, ld_input_col,
+ outptr, ld_output_row, ld_output_col,
+ params, activation_min, activation_max
+ );
+
+ __asm__ __volatile__(
+ "mov x24, #0x0\n"
+ "mov x23, #0x0\n"
+ "1:" // Tile loop
+ "str x24, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "mov x27, #0x3\n"
+ "mov x26, #0x3\n"
+ "str x23, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "ldr x25, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
+ "ldr x22, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
+ "mul x21, x24, x25\n" // offset = tile_i * ld_input_row
+ "ldr x8, [%x[params_struct], %[offsetof_args_ld_input_col]]\n"
+ "ldr x17, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
+ "mul x20, x24, x22\n" // offset = tile_i * ld_output_row
+ "mov x24, #0x10\n" // cntb _, ALL, #1
+ "madd x21, x23, x8, x21\n" // offset += tile_j * ld_input_col
+ "ldr x16, [%x[params_struct], %[offsetof_args_inptr]]\n"
+ "lsl x8, x8, #0x2\n"
+ "ldr x15, [%x[params_struct], %[offsetof_args_outptr]]\n"
+ "madd x20, x23, x17, x20\n" // offset += tile_j * ld_output_col
+ "lsl x17, x17, #0x2\n"
+ "lsr x23, %x[n_channels], #0x2\n"
+ "ldr x14, [%x[params_struct], %[offsetof_args_params]]\n"
+ "mul x21, x21, x27\n" // offset *= kernel_stride * output_size
+ "add x16, x16, x21, LSL #2\n" // inptr[0] += offset * sizeof(float)
+ "add x13, x16, x25, LSL #2\n"
+ "mul x20, x20, x26\n" // offset *= output_tile_size
+ "add x12, x13, x25, LSL #2\n"
+ "add x11, x8, x8\n"
+ "add x15, x15, x20, LSL #2\n" // outptrs[0] += offset * sizeof(float)
+ "add x10, x12, x25, LSL #2\n"
+ "add x9, x11, x8\n"
+ "add x28, x15, x22, LSL #2\n"
+ "add x20, %x[params_struct], %[offsetof_args_min]\n"
+ "ld1r { v15.4s }, [x20]\n"
+ "add x20, %x[params_struct], %[offsetof_args_max]\n"
+ "ld1r { v14.4s }, [x20]\n"
+ "add x27, x10, x25, LSL #2\n"
+ "add x26, x9, x8\n"
+ "add x25, x28, x22, LSL #2\n"
+ "add x22, x17, x17\n"
+ "mov x21, #0x0\n"
+ "sub x20, XZR, x24\n"
+ "cbz x23, 4f\n"
+ "ldr q31, [x14, #0x0]\n"
+ "ldr q0, [x14, #0x10]\n"
+ "cmp x24, x23, LSL #4\n"
+ "ldr q1, [x14, #0x20]\n"
+ "ldr q2, [x14, #0x30]\n"
+ "ldr q3, [x14, #0x40]\n"
+ "ldr q4, [x14, #0x50]\n"
+ "ldr q5, [x14, #0x60]\n"
+ "ldr q6, [x14, #0x70]\n"
+ "ldr q7, [x14, #0x80]\n"
+ "ldr q8, [x14, #0x90]\n"
+ "add x14, x14, #0xa0\n"
+ "ldr q9, [x12, x11]\n"
+ "ld1 { v10.4s }, [x16]\n"
+ "ldr q11, [x16, x26]\n"
+ "ld1 { v12.4s }, [x27]\n"
+ "ldr q13, [x13, x11]\n"
+ "bge 3f\n"
+ "2:" // Tile loop: Channel loop
+ "mov v29.16b, v31.16b\n fmla v29.4s, v7.4s, v9.4s\n"
+ "mov v28.16b, v31.16b\n fmla v28.4s, v8.4s, v9.4s\n"
+ "add x24, x24, #0x10\n"
+ "cmp x24, x23, LSL #4\n"
+ "mov v27.16b, v31.16b\n fmla v27.4s, v6.4s, v9.4s\n"
+ "fmla v29.4s, v4.4s, v13.4s\n"
+ "add x20, x20, #0x10\n"
+ "add x21, x21, #0x10\n"
+ "mov v26.16b, v31.16b\n fmla v26.4s, v5.4s, v9.4s\n"
+ "mov v25.16b, v31.16b\n fmla v25.4s, v4.4s, v9.4s\n"
+ "mov v24.16b, v31.16b\n fmla v24.4s, v3.4s, v9.4s\n"
+ "fmla v28.4s, v0.4s, v10.4s\n"
+ "ldr q23, [x12, x9]\n"
+ "fmla v27.4s, v2.4s, v11.4s\n"
+ "ldr q18, [x12, x8]\n"
+ "mov v22.16b, v31.16b\n fmla v22.4s, v2.4s, v9.4s\n"
+ "fmla v29.4s, v6.4s, v18.4s\n"
+ "mov v21.16b, v31.16b\n fmla v21.4s, v0.4s, v9.4s\n"
+ "fmla v28.4s, v5.4s, v13.4s\n"
+ "fmla v27.4s, v3.4s, v13.4s\n"
+ "fmla v26.4s, v2.4s, v13.4s\n"
+ "fmla v25.4s, v1.4s, v13.4s\n"
+ "fmla v24.4s, v0.4s, v13.4s\n"
+ "ldr q17, [x16, x8]\n"
+ "fmla v22.4s, v6.4s, v12.4s\n"
+ "ldr q16, [x27, x26]\n"
+ "mov v20.16b, v31.16b\n fmla v20.4s, v1.4s, v9.4s\n"
+ "ldr q31, [x14, #0x0]\n"
+ "fmla v29.4s, v0.4s, v17.4s\n"
+ "fmla v21.4s, v8.4s, v16.4s\n"
+ "ldr q16, [x16, x9]\n"
+ "fmla v28.4s, v7.4s, v18.4s\n"
+ "fmla v20.4s, v0.4s, v18.4s\n"
+ "fmla v26.4s, v4.4s, v18.4s\n"
+ "fmla v25.4s, v3.4s, v18.4s\n"
+ "fmla v22.4s, v1.4s, v18.4s\n"
+ "ld1 { v19.4s }, [x13]\n"
+ "fmla v29.4s, v2.4s, v16.4s\n"
+ "fmla v27.4s, v1.4s, v16.4s\n"
+ "ld1 { v18.4s }, [x10]\n"
+ "fmla v24.4s, v4.4s, v23.4s\n"
+ "fmla v28.4s, v1.4s, v17.4s\n"
+ "ldr q16, [x13, x26]\n"
+ "fmla v20.4s, v2.4s, v23.4s\n"
+ "fmla v21.4s, v1.4s, v23.4s\n"
+ "fmla v29.4s, v8.4s, v23.4s\n"
+ "fmla v27.4s, v7.4s, v23.4s\n"
+ "fmla v25.4s, v5.4s, v23.4s\n"
+ "ldr q17, [x10, x11]\n"
+ "fmla v26.4s, v0.4s, v19.4s\n"
+ "fmla v22.4s, v3.4s, v18.4s\n"
+ "fmla v24.4s, v2.4s, v16.4s\n"
+ "fmla v20.4s, v4.4s, v17.4s\n"
+ "fmla v21.4s, v3.4s, v17.4s\n"
+ "fmla v28.4s, v3.4s, v19.4s\n"
+ "ldr q19, [x10, x26]\n"
+ "fmla v27.4s, v5.4s, v16.4s\n"
+ "ldr q16, [x27, x8]\n"
+ "fmla v26.4s, v6.4s, v18.4s\n"
+ "ldr q18, [x13, x8]\n"
+ "fmla v25.4s, v7.4s, v17.4s\n"
+ "fmla v22.4s, v5.4s, v17.4s\n"
+ "fmla v24.4s, v6.4s, v17.4s\n"
+ "fmla v21.4s, v5.4s, v19.4s\n"
+ "fmla v20.4s, v6.4s, v16.4s\n"
+ "fmla v26.4s, v8.4s, v17.4s\n"
+ "fmla v22.4s, v7.4s, v16.4s\n"
+ "ldr q17, [x27, x9]\n"
+ "fmla v29.4s, v3.4s, v18.4s\n"
+ "fmla v25.4s, v0.4s, v18.4s\n"
+ "fmla v24.4s, v8.4s, v19.4s\n"
+ "ldr q16, [x13, x9]\n"
+ "fmla v20.4s, v8.4s, v17.4s\n"
+ "add x13, x13, #0x10\n"
+ "fmla v21.4s, v7.4s, v17.4s\n"
+ "ldr q19, [x10, x9]\n"
+ "fmla v28.4s, v4.4s, v18.4s\n"
+ "fmla v26.4s, v1.4s, v18.4s\n"
+ "ldr q17, [x10, x8]\n"
+ "fmla v29.4s, v5.4s, v16.4s\n"
+ "add x10, x10, #0x10\n"
+ "fmla v27.4s, v4.4s, v16.4s\n"
+ "fmla v25.4s, v2.4s, v16.4s\n"
+ "fmla v24.4s, v1.4s, v16.4s\n"
+ "ldr q16, [x16, x11]\n"
+ "fmla v22.4s, v4.4s, v17.4s\n"
+ "add x16, x16, #0x10\n"
+ "ld1 { v10.4s }, [x16]\n"
+ "fmla v20.4s, v3.4s, v17.4s\n"
+ "fmla v21.4s, v4.4s, v19.4s\n"
+ "ldr q4, [x14, #0x50]\n"
+ "fmla v26.4s, v7.4s, v17.4s\n"
+ "fmla v25.4s, v6.4s, v17.4s\n"
+ "ld1 { v18.4s }, [x12]\n"
+ "fmla v28.4s, v2.4s, v16.4s\n"
+ "fmla v29.4s, v1.4s, v16.4s\n"
+ "ldr q1, [x14, #0x20]\n"
+ "fmax v29.4s, v29.4s, v15.4s\n"
+ "fmla v27.4s, v0.4s, v16.4s\n"
+ "ldr q17, [x12, x26]\n"
+ "fmla v24.4s, v7.4s, v19.4s\n"
+ "add x12, x12, #0x10\n"
+ "ldr q9, [x12, x11]\n"
+ "fmla v20.4s, v5.4s, v19.4s\n"
+ "fmla v22.4s, v0.4s, v18.4s\n"
+ "ldr q0, [x14, #0x10]\n"
+ "fmla v21.4s, v2.4s, v17.4s\n"
+ "ldr q2, [x14, #0x30]\n"
+ "fmla v25.4s, v8.4s, v19.4s\n"
+ "ldr q16, [x27, x11]\n"
+ "fmla v28.4s, v6.4s, v18.4s\n"
+ "fmla v26.4s, v3.4s, v18.4s\n"
+ "ldr q3, [x14, #0x40]\n"
+ "fmax v28.4s, v28.4s, v15.4s\n"
+ "fmla v27.4s, v8.4s, v17.4s\n"
+ "fmla v24.4s, v5.4s, v17.4s\n"
+ "ldr q11, [x16, x26]\n"
+ "ldr q5, [x14, #0x60]\n"
+ "fmla v22.4s, v8.4s, v16.4s\n"
+ "ldr q8, [x14, #0x90]\n"
+ "fmla v20.4s, v7.4s, v16.4s\n"
+ "ldr q7, [x14, #0x80]\n"
+ "fmla v21.4s, v6.4s, v16.4s\n"
+ "ldr q13, [x13, x11]\n"
+ "ldr q6, [x14, #0x70]\n"
+ "fmax v27.4s, v27.4s, v15.4s\n"
+ "fmax v26.4s, v26.4s, v15.4s\n"
+ "fmax v25.4s, v25.4s, v15.4s\n"
+ "add x27, x27, #0x10\n"
+ "ld1 { v12.4s }, [x27]\n"
+ "fmax v24.4s, v24.4s, v15.4s\n"
+ "fmax v22.4s, v22.4s, v15.4s\n"
+ "add x14, x14, #0xa0\n"
+ "fmax v20.4s, v20.4s, v15.4s\n"
+ "fmax v21.4s, v21.4s, v15.4s\n"
+ "fmin v28.4s, v28.4s, v14.4s\n"
+ "fmin v29.4s, v29.4s, v14.4s\n"
+ "st1 { v28.4s }, [x15]\n"
+ "fmin v27.4s, v27.4s, v14.4s\n"
+ "fmin v26.4s, v26.4s, v14.4s\n"
+ "str q29, [x15, x17]\n"
+ "fmin v25.4s, v25.4s, v14.4s\n"
+ "fmin v24.4s, v24.4s, v14.4s\n"
+ "str q27, [x15, x22]\n"
+ "add x15, x15, #0x10\n"
+ "fmin v22.4s, v22.4s, v14.4s\n"
+ "fmin v20.4s, v20.4s, v14.4s\n"
+ "st1 { v26.4s }, [x28]\n"
+ "fmin v21.4s, v21.4s, v14.4s\n"
+ "str q25, [x28, x17]\n"
+ "str q24, [x28, x22]\n"
+ "add x28, x28, #0x10\n"
+ "st1 { v22.4s }, [x25]\n"
+ "str q20, [x25, x17]\n"
+ "str q21, [x25, x22]\n"
+ "add x25, x25, #0x10\n"
+ "blt 2b\n"
+ "3:" // Tile loop: Channel tail
+ "mov v29.16b, v31.16b\n fmla v29.4s, v7.4s, v9.4s\n"
+ "mov v28.16b, v31.16b\n fmla v28.4s, v8.4s, v9.4s\n"
+ "mov v27.16b, v31.16b\n fmla v27.4s, v6.4s, v9.4s\n"
+ "fmla v29.4s, v4.4s, v13.4s\n"
+ "mov v26.16b, v31.16b\n fmla v26.4s, v5.4s, v9.4s\n"
+ "mov v25.16b, v31.16b\n fmla v25.4s, v4.4s, v9.4s\n"
+ "mov v24.16b, v31.16b\n fmla v24.4s, v3.4s, v9.4s\n"
+ "fmla v28.4s, v0.4s, v10.4s\n"
+ "ldr q23, [x12, x9]\n"
+ "fmla v27.4s, v2.4s, v11.4s\n"
+ "ldr q18, [x12, x8]\n"
+ "mov v22.16b, v31.16b\n fmla v22.4s, v2.4s, v9.4s\n"
+ "fmla v29.4s, v6.4s, v18.4s\n"
+ "mov v21.16b, v31.16b\n fmla v21.4s, v0.4s, v9.4s\n"
+ "fmla v28.4s, v5.4s, v13.4s\n"
+ "fmla v27.4s, v3.4s, v13.4s\n"
+ "fmla v26.4s, v2.4s, v13.4s\n"
+ "fmla v25.4s, v1.4s, v13.4s\n"
+ "fmla v24.4s, v0.4s, v13.4s\n"
+ "ldr q17, [x16, x8]\n"
+ "fmla v22.4s, v6.4s, v12.4s\n"
+ "ldr q16, [x27, x26]\n"
+ "mov v20.16b, v31.16b\n fmla v20.4s, v1.4s, v9.4s\n"
+ "fmla v29.4s, v0.4s, v17.4s\n"
+ "fmla v21.4s, v8.4s, v16.4s\n"
+ "ldr q16, [x16, x9]\n"
+ "fmla v28.4s, v7.4s, v18.4s\n"
+ "fmla v20.4s, v0.4s, v18.4s\n"
+ "fmla v26.4s, v4.4s, v18.4s\n"
+ "fmla v25.4s, v3.4s, v18.4s\n"
+ "fmla v22.4s, v1.4s, v18.4s\n"
+ "ld1 { v19.4s }, [x13]\n"
+ "fmla v29.4s, v2.4s, v16.4s\n"
+ "fmla v27.4s, v1.4s, v16.4s\n"
+ "ld1 { v18.4s }, [x10]\n"
+ "fmla v24.4s, v4.4s, v23.4s\n"
+ "fmla v28.4s, v1.4s, v17.4s\n"
+ "ldr q16, [x13, x26]\n"
+ "fmla v20.4s, v2.4s, v23.4s\n"
+ "fmla v21.4s, v1.4s, v23.4s\n"
+ "fmla v29.4s, v8.4s, v23.4s\n"
+ "fmla v27.4s, v7.4s, v23.4s\n"
+ "fmla v25.4s, v5.4s, v23.4s\n"
+ "ldr q17, [x10, x11]\n"
+ "fmla v26.4s, v0.4s, v19.4s\n"
+ "fmla v22.4s, v3.4s, v18.4s\n"
+ "fmla v24.4s, v2.4s, v16.4s\n"
+ "fmla v20.4s, v4.4s, v17.4s\n"
+ "fmla v21.4s, v3.4s, v17.4s\n"
+ "fmla v28.4s, v3.4s, v19.4s\n"
+ "ldr q19, [x10, x26]\n"
+ "fmla v27.4s, v5.4s, v16.4s\n"
+ "ldr q16, [x27, x8]\n"
+ "fmla v26.4s, v6.4s, v18.4s\n"
+ "ldr q18, [x13, x8]\n"
+ "fmla v25.4s, v7.4s, v17.4s\n"
+ "fmla v22.4s, v5.4s, v17.4s\n"
+ "fmla v24.4s, v6.4s, v17.4s\n"
+ "fmla v21.4s, v5.4s, v19.4s\n"
+ "fmla v20.4s, v6.4s, v16.4s\n"
+ "fmla v26.4s, v8.4s, v17.4s\n"
+ "fmla v22.4s, v7.4s, v16.4s\n"
+ "ldr q17, [x27, x9]\n"
+ "fmla v29.4s, v3.4s, v18.4s\n"
+ "fmla v25.4s, v0.4s, v18.4s\n"
+ "fmla v24.4s, v8.4s, v19.4s\n"
+ "ldr q16, [x13, x9]\n"
+ "fmla v20.4s, v8.4s, v17.4s\n"
+ "add x13, x13, #0x10\n"
+ "fmla v21.4s, v7.4s, v17.4s\n"
+ "ldr q19, [x10, x9]\n"
+ "fmla v28.4s, v4.4s, v18.4s\n"
+ "fmla v26.4s, v1.4s, v18.4s\n"
+ "ldr q17, [x10, x8]\n"
+ "fmla v29.4s, v5.4s, v16.4s\n"
+ "add x10, x10, #0x10\n"
+ "fmla v27.4s, v4.4s, v16.4s\n"
+ "fmla v25.4s, v2.4s, v16.4s\n"
+ "fmla v24.4s, v1.4s, v16.4s\n"
+ "ldr q16, [x16, x11]\n"
+ "fmla v22.4s, v4.4s, v17.4s\n"
+ "add x16, x16, #0x10\n"
+ "fmla v20.4s, v3.4s, v17.4s\n"
+ "fmla v21.4s, v4.4s, v19.4s\n"
+ "fmla v26.4s, v7.4s, v17.4s\n"
+ "fmla v25.4s, v6.4s, v17.4s\n"
+ "ld1 { v18.4s }, [x12]\n"
+ "fmla v28.4s, v2.4s, v16.4s\n"
+ "fmla v29.4s, v1.4s, v16.4s\n"
+ "fmax v29.4s, v29.4s, v15.4s\n"
+ "fmla v27.4s, v0.4s, v16.4s\n"
+ "ldr q17, [x12, x26]\n"
+ "fmla v24.4s, v7.4s, v19.4s\n"
+ "fmin v29.4s, v29.4s, v14.4s\n"
+ "fmla v20.4s, v5.4s, v19.4s\n"
+ "fmla v22.4s, v0.4s, v18.4s\n"
+ "add x12, x12, #0x10\n"
+ "fmla v21.4s, v2.4s, v17.4s\n"
+ "fmla v25.4s, v8.4s, v19.4s\n"
+ "ldr q16, [x27, x11]\n"
+ "fmax v25.4s, v25.4s, v15.4s\n"
+ "fmla v28.4s, v6.4s, v18.4s\n"
+ "fmla v26.4s, v3.4s, v18.4s\n"
+ "fmax v28.4s, v28.4s, v15.4s\n"
+ "add x27, x27, #0x10\n"
+ "fmla v27.4s, v8.4s, v17.4s\n"
+ "fmla v24.4s, v5.4s, v17.4s\n"
+ "fmax v27.4s, v27.4s, v15.4s\n"
+ "fmla v22.4s, v8.4s, v16.4s\n"
+ "fmla v20.4s, v7.4s, v16.4s\n"
+ "fmax v26.4s, v26.4s, v15.4s\n"
+ "fmla v21.4s, v6.4s, v16.4s\n"
+ "fmax v24.4s, v24.4s, v15.4s\n"
+ "fmax v22.4s, v22.4s, v15.4s\n"
+ "fmax v20.4s, v20.4s, v15.4s\n"
+ "fmax v21.4s, v21.4s, v15.4s\n"
+ "fmin v28.4s, v28.4s, v14.4s\n"
+ "st1 { v28.4s }, [x15]\n"
+ "fmin v27.4s, v27.4s, v14.4s\n"
+ "fmin v26.4s, v26.4s, v14.4s\n"
+ "str q29, [x15, x17]\n"
+ "fmin v25.4s, v25.4s, v14.4s\n"
+ "fmin v24.4s, v24.4s, v14.4s\n"
+ "str q27, [x15, x22]\n"
+ "add x15, x15, #0x10\n"
+ "fmin v22.4s, v22.4s, v14.4s\n"
+ "fmin v20.4s, v20.4s, v14.4s\n"
+ "st1 { v26.4s }, [x28]\n"
+ "fmin v21.4s, v21.4s, v14.4s\n"
+ "str q25, [x28, x17]\n"
+ "str q24, [x28, x22]\n"
+ "add x28, x28, #0x10\n"
+ "st1 { v22.4s }, [x25]\n"
+ "str q20, [x25, x17]\n"
+ "str q21, [x25, x22]\n"
+ "add x25, x25, #0x10\n"
+ "4:" // Tile loop: Oddments
+ "tst %x[n_channels], #0x3\n"
+ "beq 49f\n"
+ "ldr q31, [x14, #0x0]\n"
+ "ldr q0, [x14, #0x10]\n"
+ "add x24, x12, x11\n"
+ "add x23, x16, XZR\n"
+ "ldr q1, [x14, #0x20]\n"
+ "ldr q2, [x14, #0x30]\n"
+ "add x22, x16, x26\n"
+ "add x21, x27, XZR\n"
+ "ldr q3, [x14, #0x40]\n"
+ "ldr q4, [x14, #0x50]\n"
+ "add x20, x13, x11\n"
+ "ldr q5, [x14, #0x60]\n"
+ "ldr q6, [x14, #0x70]\n"
+ "ldr q7, [x14, #0x80]\n"
+ "ldr q8, [x14, #0x90]\n"
+ "tbz %x[n_channels], #1, 5f\n"
+ "ldr d9, [x24], #0x8\n"
+ "ldr d10, [x23], #0x8\n"
+ "ldr d11, [x22], #0x8\n"
+ "ldr d12, [x21], #0x8\n"
+ "ldr d13, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 6f\n"
+ "ld1 { v9.s }[2], [x24]\n"
+ "ld1 { v10.s }[2], [x23]\n"
+ "ld1 { v11.s }[2], [x22]\n"
+ "ld1 { v12.s }[2], [x21]\n"
+ "ld1 { v13.s }[2], [x20]\n"
+ "b 6f\n"
+ "5:" // Tile loop: Oddments: Load inputs: (2, 2), (0, 0), (0, 4), (4, 0), (1, 2): Bit 1: Unset
+ "ldr s9, [x24, #0x0]\n"
+ "ldr s10, [x23, #0x0]\n"
+ "ldr s11, [x22, #0x0]\n"
+ "ldr s12, [x21, #0x0]\n"
+ "ldr s13, [x20, #0x0]\n"
+ "6:" // Tile loop: Oddments: Load inputs: (2, 2), (0, 0), (0, 4), (4, 0), (1, 2): Bit 1: End
+ "mov v23.16b, v31.16b\n fmla v23.4s, v8.4s, v9.4s\n"
+ "mov v25.16b, v31.16b\n fmla v25.4s, v6.4s, v9.4s\n"
+ "add x20, x27, x26\n"
+ "mov v24.16b, v31.16b\n fmla v24.4s, v7.4s, v9.4s\n"
+ "mov v26.16b, v31.16b\n fmla v26.4s, v5.4s, v9.4s\n"
+ "mov v27.16b, v31.16b\n fmla v27.4s, v4.4s, v9.4s\n"
+ "mov v28.16b, v31.16b\n fmla v28.4s, v3.4s, v9.4s\n"
+ "mov v29.16b, v31.16b\n fmla v29.4s, v2.4s, v9.4s\n"
+ "fmla v23.4s, v0.4s, v10.4s\n"
+ "fmla v25.4s, v2.4s, v11.4s\n"
+ "mov v30.16b, v31.16b\n fmla v30.4s, v1.4s, v9.4s\n"
+ "fmla v31.4s, v0.4s, v9.4s\n"
+ "fmla v29.4s, v6.4s, v12.4s\n"
+ "fmla v23.4s, v5.4s, v13.4s\n"
+ "fmla v24.4s, v4.4s, v13.4s\n"
+ "fmla v25.4s, v3.4s, v13.4s\n"
+ "fmla v26.4s, v2.4s, v13.4s\n"
+ "fmla v27.4s, v1.4s, v13.4s\n"
+ "fmla v28.4s, v0.4s, v13.4s\n"
+ "tbz %x[n_channels], #1, 7f\n"
+ "ldr d12, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 8f\n"
+ "ld1 { v12.s }[2], [x20]\n"
+ "b 8f\n"
+ "7:" // Tile loop: Oddments: Load inputs: (4, 4): Bit 1: Unset
+ "ldr s12, [x20, #0x0]\n"
+ "8:" // Tile loop: Oddments: Load inputs: (4, 4): Bit 1: End
+ "fmla v31.4s, v8.4s, v12.4s\n"
+ "add x20, x12, x8\n"
+ "tbz %x[n_channels], #1, 9f\n"
+ "ldr d11, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 10f\n"
+ "ld1 { v11.s }[2], [x20]\n"
+ "b 10f\n"
+ "9:" // Tile loop: Oddments: Load inputs: (2, 1): Bit 1: Unset
+ "ldr s11, [x20, #0x0]\n"
+ "10:" // Tile loop: Oddments: Load inputs: (2, 1): Bit 1: End
+ "fmla v23.4s, v7.4s, v11.4s\n"
+ "fmla v24.4s, v6.4s, v11.4s\n"
+ "add x20, x16, x8\n"
+ "fmla v26.4s, v4.4s, v11.4s\n"
+ "fmla v27.4s, v3.4s, v11.4s\n"
+ "fmla v29.4s, v1.4s, v11.4s\n"
+ "fmla v30.4s, v0.4s, v11.4s\n"
+ "tbz %x[n_channels], #1, 11f\n"
+ "ldr d13, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 12f\n"
+ "ld1 { v13.s }[2], [x20]\n"
+ "b 12f\n"
+ "11:" // Tile loop: Oddments: Load inputs: (0, 1): Bit 1: Unset
+ "ldr s13, [x20, #0x0]\n"
+ "12:" // Tile loop: Oddments: Load inputs: (0, 1): Bit 1: End
+ "fmla v23.4s, v1.4s, v13.4s\n"
+ "fmla v24.4s, v0.4s, v13.4s\n"
+ "add x20, x16, x9\n"
+ "tbz %x[n_channels], #1, 13f\n"
+ "ldr d12, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 14f\n"
+ "ld1 { v12.s }[2], [x20]\n"
+ "b 14f\n"
+ "13:" // Tile loop: Oddments: Load inputs: (0, 3): Bit 1: Unset
+ "ldr s12, [x20, #0x0]\n"
+ "14:" // Tile loop: Oddments: Load inputs: (0, 3): Bit 1: End
+ "fmla v24.4s, v2.4s, v12.4s\n"
+ "fmla v25.4s, v1.4s, v12.4s\n"
+ "add x20, x12, x9\n"
+ "tbz %x[n_channels], #1, 15f\n"
+ "ldr d10, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 16f\n"
+ "ld1 { v10.s }[2], [x20]\n"
+ "b 16f\n"
+ "15:" // Tile loop: Oddments: Load inputs: (2, 3): Bit 1: Unset
+ "ldr s10, [x20, #0x0]\n"
+ "16:" // Tile loop: Oddments: Load inputs: (2, 3): Bit 1: End
+ "fmla v24.4s, v8.4s, v10.4s\n"
+ "fmla v25.4s, v7.4s, v10.4s\n"
+ "add x20, x13, XZR\n"
+ "fmla v27.4s, v5.4s, v10.4s\n"
+ "fmla v28.4s, v4.4s, v10.4s\n"
+ "fmla v30.4s, v2.4s, v10.4s\n"
+ "fmla v31.4s, v1.4s, v10.4s\n"
+ "tbz %x[n_channels], #1, 17f\n"
+ "ldr d11, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 18f\n"
+ "ld1 { v11.s }[2], [x20]\n"
+ "b 18f\n"
+ "17:" // Tile loop: Oddments: Load inputs: (1, 0): Bit 1: Unset
+ "ldr s11, [x20, #0x0]\n"
+ "18:" // Tile loop: Oddments: Load inputs: (1, 0): Bit 1: End
+ "fmla v23.4s, v3.4s, v11.4s\n"
+ "fmla v26.4s, v0.4s, v11.4s\n"
+ "add x20, x13, x26\n"
+ "tbz %x[n_channels], #1, 19f\n"
+ "ldr d13, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 20f\n"
+ "ld1 { v13.s }[2], [x20]\n"
+ "b 20f\n"
+ "19:" // Tile loop: Oddments: Load inputs: (1, 4): Bit 1: Unset
+ "ldr s13, [x20, #0x0]\n"
+ "20:" // Tile loop: Oddments: Load inputs: (1, 4): Bit 1: End
+ "fmla v25.4s, v5.4s, v13.4s\n"
+ "fmla v28.4s, v2.4s, v13.4s\n"
+ "add x20, x10, XZR\n"
+ "tbz %x[n_channels], #1, 21f\n"
+ "ldr d12, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 22f\n"
+ "ld1 { v12.s }[2], [x20]\n"
+ "b 22f\n"
+ "21:" // Tile loop: Oddments: Load inputs: (3, 0): Bit 1: Unset
+ "ldr s12, [x20, #0x0]\n"
+ "22:" // Tile loop: Oddments: Load inputs: (3, 0): Bit 1: End
+ "fmla v26.4s, v6.4s, v12.4s\n"
+ "fmla v29.4s, v3.4s, v12.4s\n"
+ "add x20, x10, x11\n"
+ "tbz %x[n_channels], #1, 23f\n"
+ "ldr d10, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 24f\n"
+ "ld1 { v10.s }[2], [x20]\n"
+ "b 24f\n"
+ "23:" // Tile loop: Oddments: Load inputs: (3, 2): Bit 1: Unset
+ "ldr s10, [x20, #0x0]\n"
+ "24:" // Tile loop: Oddments: Load inputs: (3, 2): Bit 1: End
+ "fmla v26.4s, v8.4s, v10.4s\n"
+ "fmla v27.4s, v7.4s, v10.4s\n"
+ "add x20, x10, x26\n"
+ "fmla v28.4s, v6.4s, v10.4s\n"
+ "fmla v29.4s, v5.4s, v10.4s\n"
+ "fmla v30.4s, v4.4s, v10.4s\n"
+ "fmla v31.4s, v3.4s, v10.4s\n"
+ "tbz %x[n_channels], #1, 25f\n"
+ "ldr d11, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 26f\n"
+ "ld1 { v11.s }[2], [x20]\n"
+ "b 26f\n"
+ "25:" // Tile loop: Oddments: Load inputs: (3, 4): Bit 1: Unset
+ "ldr s11, [x20, #0x0]\n"
+ "26:" // Tile loop: Oddments: Load inputs: (3, 4): Bit 1: End
+ "fmla v28.4s, v8.4s, v11.4s\n"
+ "fmla v31.4s, v5.4s, v11.4s\n"
+ "add x20, x27, x8\n"
+ "tbz %x[n_channels], #1, 27f\n"
+ "ldr d13, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 28f\n"
+ "ld1 { v13.s }[2], [x20]\n"
+ "b 28f\n"
+ "27:" // Tile loop: Oddments: Load inputs: (4, 1): Bit 1: Unset
+ "ldr s13, [x20, #0x0]\n"
+ "28:" // Tile loop: Oddments: Load inputs: (4, 1): Bit 1: End
+ "fmla v29.4s, v7.4s, v13.4s\n"
+ "fmla v30.4s, v6.4s, v13.4s\n"
+ "add x20, x13, x8\n"
+ "tbz %x[n_channels], #1, 29f\n"
+ "ldr d12, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 30f\n"
+ "ld1 { v12.s }[2], [x20]\n"
+ "b 30f\n"
+ "29:" // Tile loop: Oddments: Load inputs: (1, 1): Bit 1: Unset
+ "ldr s12, [x20, #0x0]\n"
+ "30:" // Tile loop: Oddments: Load inputs: (1, 1): Bit 1: End
+ "fmla v23.4s, v4.4s, v12.4s\n"
+ "fmla v24.4s, v3.4s, v12.4s\n"
+ "add x20, x13, x9\n"
+ "fmla v26.4s, v1.4s, v12.4s\n"
+ "fmla v27.4s, v0.4s, v12.4s\n"
+ "tbz %x[n_channels], #1, 31f\n"
+ "ldr d11, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 32f\n"
+ "ld1 { v11.s }[2], [x20]\n"
+ "b 32f\n"
+ "31:" // Tile loop: Oddments: Load inputs: (1, 3): Bit 1: Unset
+ "ldr s11, [x20, #0x0]\n"
+ "32:" // Tile loop: Oddments: Load inputs: (1, 3): Bit 1: End
+ "fmla v24.4s, v5.4s, v11.4s\n"
+ "fmla v25.4s, v4.4s, v11.4s\n"
+ "add x20, x27, x9\n"
+ "fmla v27.4s, v2.4s, v11.4s\n"
+ "fmla v28.4s, v1.4s, v11.4s\n"
+ "tbz %x[n_channels], #1, 33f\n"
+ "ldr d13, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 34f\n"
+ "ld1 { v13.s }[2], [x20]\n"
+ "b 34f\n"
+ "33:" // Tile loop: Oddments: Load inputs: (4, 3): Bit 1: Unset
+ "ldr s13, [x20, #0x0]\n"
+ "34:" // Tile loop: Oddments: Load inputs: (4, 3): Bit 1: End
+ "fmla v30.4s, v8.4s, v13.4s\n"
+ "fmla v31.4s, v7.4s, v13.4s\n"
+ "add x20, x10, x8\n"
+ "tbz %x[n_channels], #1, 35f\n"
+ "ldr d12, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 36f\n"
+ "ld1 { v12.s }[2], [x20]\n"
+ "b 36f\n"
+ "35:" // Tile loop: Oddments: Load inputs: (3, 1): Bit 1: Unset
+ "ldr s12, [x20, #0x0]\n"
+ "36:" // Tile loop: Oddments: Load inputs: (3, 1): Bit 1: End
+ "fmla v26.4s, v7.4s, v12.4s\n"
+ "fmla v27.4s, v6.4s, v12.4s\n"
+ "add x20, x16, x11\n"
+ "fmla v29.4s, v4.4s, v12.4s\n"
+ "fmla v30.4s, v3.4s, v12.4s\n"
+ "tbz %x[n_channels], #1, 37f\n"
+ "ldr d11, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 38f\n"
+ "ld1 { v11.s }[2], [x20]\n"
+ "b 38f\n"
+ "37:" // Tile loop: Oddments: Load inputs: (0, 2): Bit 1: Unset
+ "ldr s11, [x20, #0x0]\n"
+ "38:" // Tile loop: Oddments: Load inputs: (0, 2): Bit 1: End
+ "fmla v23.4s, v2.4s, v11.4s\n"
+ "fmla v24.4s, v1.4s, v11.4s\n"
+ "add x20, x10, x9\n"
+ "fmla v25.4s, v0.4s, v11.4s\n"
+ "tbz %x[n_channels], #1, 39f\n"
+ "ldr d13, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 40f\n"
+ "ld1 { v13.s }[2], [x20]\n"
+ "b 40f\n"
+ "39:" // Tile loop: Oddments: Load inputs: (3, 3): Bit 1: Unset
+ "ldr s13, [x20, #0x0]\n"
+ "40:" // Tile loop: Oddments: Load inputs: (3, 3): Bit 1: End
+ "fmla v27.4s, v8.4s, v13.4s\n"
+ "fmla v28.4s, v7.4s, v13.4s\n"
+ "add x20, x12, XZR\n"
+ "fmla v30.4s, v5.4s, v13.4s\n"
+ "fmla v31.4s, v4.4s, v13.4s\n"
+ "tbz %x[n_channels], #1, 41f\n"
+ "ldr d12, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 42f\n"
+ "ld1 { v12.s }[2], [x20]\n"
+ "b 42f\n"
+ "41:" // Tile loop: Oddments: Load inputs: (2, 0): Bit 1: Unset
+ "ldr s12, [x20, #0x0]\n"
+ "42:" // Tile loop: Oddments: Load inputs: (2, 0): Bit 1: End
+ "fmla v23.4s, v6.4s, v12.4s\n"
+ "fmla v26.4s, v3.4s, v12.4s\n"
+ "add x20, x12, x26\n"
+ "fmla v29.4s, v0.4s, v12.4s\n"
+ "tbz %x[n_channels], #1, 43f\n"
+ "ldr d11, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 44f\n"
+ "ld1 { v11.s }[2], [x20]\n"
+ "b 44f\n"
+ "43:" // Tile loop: Oddments: Load inputs: (2, 4): Bit 1: Unset
+ "ldr s11, [x20, #0x0]\n"
+ "44:" // Tile loop: Oddments: Load inputs: (2, 4): Bit 1: End
+ "fmla v25.4s, v8.4s, v11.4s\n"
+ "fmla v28.4s, v5.4s, v11.4s\n"
+ "add x20, x27, x11\n"
+ "fmla v31.4s, v2.4s, v11.4s\n"
+ "tbz %x[n_channels], #1, 45f\n"
+ "ldr d13, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 46f\n"
+ "ld1 { v13.s }[2], [x20]\n"
+ "b 46f\n"
+ "45:" // Tile loop: Oddments: Load inputs: (4, 2): Bit 1: Unset
+ "ldr s13, [x20, #0x0]\n"
+ "46:" // Tile loop: Oddments: Load inputs: (4, 2): Bit 1: End
+ "fmla v29.4s, v8.4s, v13.4s\n"
+ "fmla v30.4s, v7.4s, v13.4s\n"
+ "fmax v23.4s, v23.4s, v15.4s\n"
+ "fmla v31.4s, v6.4s, v13.4s\n"
+ "fmax v24.4s, v24.4s, v15.4s\n"
+ "fmax v25.4s, v25.4s, v15.4s\n"
+ "fmax v26.4s, v26.4s, v15.4s\n"
+ "fmax v27.4s, v27.4s, v15.4s\n"
+ "fmax v28.4s, v28.4s, v15.4s\n"
+ "fmax v29.4s, v29.4s, v15.4s\n"
+ "fmax v30.4s, v30.4s, v15.4s\n"
+ "fmax v31.4s, v31.4s, v15.4s\n"
+ "fmin v23.4s, v23.4s, v14.4s\n"
+ "fmin v24.4s, v24.4s, v14.4s\n"
+ "fmin v25.4s, v25.4s, v14.4s\n"
+ "fmin v26.4s, v26.4s, v14.4s\n"
+ "fmin v27.4s, v27.4s, v14.4s\n"
+ "fmin v28.4s, v28.4s, v14.4s\n"
+ "fmin v29.4s, v29.4s, v14.4s\n"
+ "fmin v30.4s, v30.4s, v14.4s\n"
+ "fmin v31.4s, v31.4s, v14.4s\n"
+ "tbz %x[n_channels], #1, 47f\n"
+ "mov x22, x15\n"
+ "mov x21, x28\n"
+ "st1 { v23.d }[0], [x22], x17\n"
+ "mov x20, x25\n"
+ "st1 { v26.d }[0], [x21], x17\n"
+ "add x15, x15, #0x8\n"
+ "st1 { v29.d }[0], [x20], x17\n"
+ "add x28, x28, #0x8\n"
+ "add x25, x25, #0x8\n"
+ "st1 { v24.d }[0], [x22], x17\n"
+ "st1 { v27.d }[0], [x21], x17\n"
+ "st1 { v30.d }[0], [x20], x17\n"
+ "st1 { v25.d }[0], [x22]\n"
+ "st1 { v28.d }[0], [x21]\n"
+ "st1 { v31.d }[0], [x20]\n"
+ "tbz %x[n_channels], #0, 48f\n"
+ "mov x22, x15\n"
+ "mov x21, x28\n"
+ "st1 { v23.s }[2], [x22], x17\n"
+ "mov x20, x25\n"
+ "st1 { v26.s }[2], [x21], x17\n"
+ "st1 { v29.s }[2], [x20], x17\n"
+ "st1 { v24.s }[2], [x22], x17\n"
+ "st1 { v27.s }[2], [x21], x17\n"
+ "st1 { v30.s }[2], [x20], x17\n"
+ "st1 { v25.s }[2], [x22]\n"
+ "st1 { v28.s }[2], [x21]\n"
+ "st1 { v31.s }[2], [x20]\n"
+ "b 48f\n"
+ "47:" // Tile loop: Oddments: Store: Bit 1: Unset
+ "mov x22, x15\n"
+ "mov x21, x28\n"
+ "st1 { v23.s }[0], [x22], x17\n"
+ "mov x20, x25\n"
+ "st1 { v26.s }[0], [x21], x17\n"
+ "st1 { v29.s }[0], [x20], x17\n"
+ "st1 { v24.s }[0], [x22], x17\n"
+ "st1 { v27.s }[0], [x21], x17\n"
+ "st1 { v30.s }[0], [x20], x17\n"
+ "st1 { v25.s }[0], [x22]\n"
+ "st1 { v28.s }[0], [x21]\n"
+ "st1 { v31.s }[0], [x20]\n"
+ "48:" // Tile loop: Oddments: Store: Bit 1: End
+ "49:" // Tile loop: End
+ "ldr x23, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "ldr x24, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "add x23, x23, #0x1\n"
+ "add x21, x24, #0x1\n"
+ "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+ "cmp x23, x20\n"
+ "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
+ "csel x24, x24, x21, LT\n"
+ "csel x23, x23, XZR, LT\n"
+ "cmp x24, x20\n"
+ "blt 1b\n"
+ :
+ : [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp
new file mode 100644
index 0000000000..972f7eb535
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp
@@ -0,0 +1,905 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(
+ const float *const *const input_ptrs,
+ float *const *const outptrs,
+ const void *params,
+ unsigned int n_channels,
+ const float activation_min,
+ const float activation_max
+)
+{
+ struct Args
+ {
+ float *const *outptrs;
+ const void *params;
+ const float min, max;
+ const float *inptrs[25];
+
+ Args(
+ const float *const *const input_ptrs,
+ float *const *const outptrs,
+ const void *const params,
+ const float min,
+ const float max
+ ) : outptrs(outptrs), params(params), min(min), max(max)
+ {
+ inptrs[0] = input_ptrs[12];
+ inptrs[1] = input_ptrs[0];
+ inptrs[2] = input_ptrs[4];
+ inptrs[3] = input_ptrs[20];
+ inptrs[4] = input_ptrs[7];
+ inptrs[5] = input_ptrs[24];
+ inptrs[6] = input_ptrs[11];
+ inptrs[7] = input_ptrs[1];
+ inptrs[8] = input_ptrs[3];
+ inptrs[9] = input_ptrs[13];
+ inptrs[10] = input_ptrs[5];
+ inptrs[11] = input_ptrs[9];
+ inptrs[12] = input_ptrs[15];
+ inptrs[13] = input_ptrs[17];
+ inptrs[14] = input_ptrs[19];
+ inptrs[15] = input_ptrs[21];
+ inptrs[16] = input_ptrs[6];
+ inptrs[17] = input_ptrs[8];
+ inptrs[18] = input_ptrs[23];
+ inptrs[19] = input_ptrs[16];
+ inptrs[20] = input_ptrs[2];
+ inptrs[21] = input_ptrs[18];
+ inptrs[22] = input_ptrs[10];
+ inptrs[23] = input_ptrs[14];
+ inptrs[24] = input_ptrs[22];
+
+ }
+ };
+
+ Args params_struct(input_ptrs, outptrs, params,
+ activation_min, activation_max);
+
+ __asm__ __volatile__(
+ "mov x7, #0x10\n" // cntb _, ALL, #1
+ "lsr x8, %x[n_channels], #0x2\n"
+ "ldr x17, [%x[params_struct], %[offsetof_args_outptrs]]\n"
+ "ldr x16, [%x[params_struct], %[offsetof_args_params]]\n"
+ "add x20, %x[params_struct], %[offsetof_args_min]\n"
+ "ld1r { v15.4s }, [x20]\n"
+ "add x20, %x[params_struct], %[offsetof_args_max]\n"
+ "ld1r { v14.4s }, [x20]\n"
+ "add x15, %x[params_struct], %[offsetof_Args_inptrs]\n"
+ "mov x14, #0x0\n"
+ "sub x13, XZR, x7\n"
+ "cbz x8, 3f\n"
+ "ldr q31, [x16, #0x0]\n"
+ "ldr q0, [x16, #0x10]\n"
+ "cmp x7, x8, LSL #4\n"
+ "ldr q1, [x16, #0x20]\n"
+ "ldr q2, [x16, #0x30]\n"
+ "ldr q3, [x16, #0x40]\n"
+ "ldr q4, [x16, #0x50]\n"
+ "ldr q5, [x16, #0x60]\n"
+ "ldr q6, [x16, #0x70]\n"
+ "ldr q7, [x16, #0x80]\n"
+ "ldr q8, [x16, #0x90]\n"
+ "add x16, x16, #0xa0\n"
+ "ldp x21, x20, [x15, #0x0]\n"
+ "ldr q9, [x21, x14]\n"
+ "ldr q10, [x20, x14]\n"
+ "ldp x21, x20, [x15, #0x10]\n"
+ "ldr q11, [x21, x14]\n"
+ "ldr q12, [x20, x14]\n"
+ "ldr x20, [x15, #0x20]\n"
+ "ldr q13, [x20, x14]\n"
+ "bge 2f\n"
+ "1:" // Channel loop
+ "mov v29.16b, v31.16b\n fmla v29.4s, v8.4s, v9.4s\n"
+ "mov v28.16b, v31.16b\n fmla v28.4s, v7.4s, v9.4s\n"
+ "ldr x26, [x15, #0x30]\n"
+ "ldr x23, [x15, #0x38]\n"
+ "mov v27.16b, v31.16b\n fmla v27.4s, v6.4s, v9.4s\n"
+ "fmla v29.4s, v0.4s, v10.4s\n"
+ "ldr x22, [x15, #0x28]\n"
+ "ldr x20, [x15, #0x48]\n"
+ "ldr q19, [x20, x14]\n"
+ "fmla v28.4s, v4.4s, v13.4s\n"
+ "mov v26.16b, v31.16b\n fmla v26.4s, v5.4s, v9.4s\n"
+ "ldr x21, [x15, #0x40]\n"
+ "mov v25.16b, v31.16b\n fmla v25.4s, v4.4s, v9.4s\n"
+ "mov v24.16b, v31.16b\n fmla v24.4s, v3.4s, v9.4s\n"
+ "ldr x25, [x15, #0x50]\n"
+ "ldr x24, [x15, #0x58]\n"
+ "fmla v27.4s, v2.4s, v11.4s\n"
+ "ldr q17, [x26, x14]\n"
+ "mov v23.16b, v31.16b\n fmla v23.4s, v2.4s, v9.4s\n"
+ "ldr x20, [x15, #0x60]\n"
+ "fmla v29.4s, v5.4s, v13.4s\n"
+ "fmla v28.4s, v6.4s, v17.4s\n"
+ "ldr x12, [x15, #0x70]\n"
+ "ldr x11, [x15, #0x88]\n"
+ "mov v22.16b, v31.16b\n fmla v22.4s, v0.4s, v9.4s\n"
+ "fmla v27.4s, v3.4s, v13.4s\n"
+ "ldr x10, [x17, #0x0]\n"
+ "add x13, x13, #0x10\n"
+ "fmla v26.4s, v2.4s, v13.4s\n"
+ "fmla v25.4s, v1.4s, v13.4s\n"
+ "ldr x9, [x17, #0x8]\n"
+ "ldr x28, [x17, #0x10]\n"
+ "fmla v24.4s, v0.4s, v13.4s\n"
+ "ldr q18, [x23, x14]\n"
+ "fmla v23.4s, v6.4s, v12.4s\n"
+ "ldr q16, [x22, x14]\n"
+ "mov v21.16b, v31.16b\n fmla v21.4s, v1.4s, v9.4s\n"
+ "ldr q31, [x16, #0x0]\n"
+ "fmla v29.4s, v7.4s, v17.4s\n"
+ "ldr x23, [x15, #0x68]\n"
+ "fmla v28.4s, v0.4s, v18.4s\n"
+ "fmla v22.4s, v8.4s, v16.4s\n"
+ "ldr q16, [x21, x14]\n"
+ "ldr x22, [x15, #0x78]\n"
+ "fmla v26.4s, v4.4s, v17.4s\n"
+ "fmla v25.4s, v3.4s, v17.4s\n"
+ "ldr x21, [x15, #0x80]\n"
+ "ldr x27, [x17, #0x18]\n"
+ "fmla v21.4s, v0.4s, v17.4s\n"
+ "fmla v24.4s, v4.4s, v19.4s\n"
+ "fmla v23.4s, v1.4s, v17.4s\n"
+ "ldr q17, [x25, x14]\n"
+ "fmla v29.4s, v1.4s, v18.4s\n"
+ "ldr q20, [x24, x14]\n"
+ "fmla v28.4s, v2.4s, v16.4s\n"
+ "fmla v27.4s, v1.4s, v16.4s\n"
+ "ldr q16, [x20, x14]\n"
+ "ldr x26, [x15, #0x90]\n"
+ "fmla v25.4s, v5.4s, v19.4s\n"
+ "fmla v21.4s, v2.4s, v19.4s\n"
+ "ldr x25, [x15, #0xa0]\n"
+ "ldr x20, [x15, #0x98]\n"
+ "fmla v26.4s, v0.4s, v17.4s\n"
+ "fmla v24.4s, v2.4s, v20.4s\n"
+ "fmla v28.4s, v8.4s, v19.4s\n"
+ "fmla v27.4s, v7.4s, v19.4s\n"
+ "fmla v22.4s, v1.4s, v19.4s\n"
+ "ldr q19, [x23, x14]\n"
+ "fmla v23.4s, v3.4s, v16.4s\n"
+ "ldr x24, [x15, #0xa8]\n"
+ "fmla v26.4s, v6.4s, v16.4s\n"
+ "ldr q18, [x21, x14]\n"
+ "fmla v25.4s, v7.4s, v19.4s\n"
+ "ldr x23, [x15, #0xc0]\n"
+ "fmla v24.4s, v6.4s, v19.4s\n"
+ "fmla v21.4s, v4.4s, v19.4s\n"
+ "fmla v29.4s, v3.4s, v17.4s\n"
+ "ldr q17, [x12, x14]\n"
+ "fmla v27.4s, v5.4s, v20.4s\n"
+ "ldr q16, [x22, x14]\n"
+ "fmla v23.4s, v5.4s, v19.4s\n"
+ "fmla v22.4s, v3.4s, v19.4s\n"
+ "ldr x22, [x15, #0xb0]\n"
+ "ldr x21, [x15, #0xb8]\n"
+ "fmla v26.4s, v8.4s, v19.4s\n"
+ "fmla v24.4s, v8.4s, v17.4s\n"
+ "fmla v21.4s, v6.4s, v16.4s\n"
+ "fmla v28.4s, v3.4s, v18.4s\n"
+ "fmla v25.4s, v0.4s, v18.4s\n"
+ "fmla v22.4s, v5.4s, v17.4s\n"
+ "ldr q17, [x11, x14]\n"
+ "fmla v23.4s, v7.4s, v16.4s\n"
+ "ldr q16, [x26, x14]\n"
+ "fmla v29.4s, v4.4s, v18.4s\n"
+ "fmla v26.4s, v1.4s, v18.4s\n"
+ "ldr q18, [x20, x14]\n"
+ "fmla v28.4s, v5.4s, v17.4s\n"
+ "fmla v27.4s, v4.4s, v17.4s\n"
+ "fmla v25.4s, v2.4s, v17.4s\n"
+ "fmla v24.4s, v1.4s, v17.4s\n"
+ "ldr q17, [x25, x14]\n"
+ "fmla v21.4s, v8.4s, v16.4s\n"
+ "ldr x20, [x15, #0x20]\n"
+ "fmla v22.4s, v7.4s, v16.4s\n"
+ "ldr q16, [x24, x14]\n"
+ "fmla v29.4s, v2.4s, v17.4s\n"
+ "fmla v26.4s, v7.4s, v18.4s\n"
+ "fmla v25.4s, v6.4s, v18.4s\n"
+ "fmla v23.4s, v4.4s, v18.4s\n"
+ "fmla v21.4s, v3.4s, v18.4s\n"
+ "ldr q18, [x22, x14]\n"
+ "fmla v22.4s, v4.4s, v16.4s\n"
+ "ldr q4, [x16, #0x50]\n"
+ "fmla v28.4s, v1.4s, v17.4s\n"
+ "ldr q1, [x16, #0x20]\n"
+ "fmla v27.4s, v0.4s, v17.4s\n"
+ "ldr q17, [x21, x14]\n"
+ "fmla v29.4s, v6.4s, v18.4s\n"
+ "fmax v29.4s, v29.4s, v15.4s\n"
+ "fmla v24.4s, v7.4s, v16.4s\n"
+ "fmla v21.4s, v5.4s, v16.4s\n"
+ "fmin v29.4s, v29.4s, v14.4s\n"
+ "str q29, [x10, x13]\n"
+ "fmla v23.4s, v0.4s, v18.4s\n"
+ "ldr q0, [x16, #0x10]\n"
+ "fmla v22.4s, v2.4s, v17.4s\n"
+ "ldr q2, [x16, #0x30]\n"
+ "fmla v25.4s, v8.4s, v16.4s\n"
+ "ldr q16, [x23, x14]\n"
+ "fmla v26.4s, v3.4s, v18.4s\n"
+ "ldr q3, [x16, #0x40]\n"
+ "fmla v27.4s, v8.4s, v17.4s\n"
+ "fmla v24.4s, v5.4s, v17.4s\n"
+ "ldr q5, [x16, #0x60]\n"
+ "fmax v28.4s, v28.4s, v15.4s\n"
+ "fmla v23.4s, v8.4s, v16.4s\n"
+ "ldr q8, [x16, #0x90]\n"
+ "fmla v21.4s, v7.4s, v16.4s\n"
+ "ldr q7, [x16, #0x80]\n"
+ "fmla v22.4s, v6.4s, v16.4s\n"
+ "ldr q13, [x20, x7]\n"
+ "ldr q6, [x16, #0x70]\n"
+ "fmax v27.4s, v27.4s, v15.4s\n"
+ "fmax v26.4s, v26.4s, v15.4s\n"
+ "fmax v25.4s, v25.4s, v15.4s\n"
+ "ldr x24, [x17, #0x20]\n"
+ "ldp x21, x20, [x15, #0x0]\n"
+ "ldr q9, [x21, x7]\n"
+ "ldr q10, [x20, x7]\n"
+ "fmin v28.4s, v28.4s, v14.4s\n"
+ "fmin v27.4s, v27.4s, v14.4s\n"
+ "ldp x21, x20, [x15, #0x10]\n"
+ "ldr q11, [x21, x7]\n"
+ "fmin v26.4s, v26.4s, v14.4s\n"
+ "fmin v25.4s, v25.4s, v14.4s\n"
+ "ldr q12, [x20, x7]\n"
+ "fmax v24.4s, v24.4s, v15.4s\n"
+ "fmax v23.4s, v23.4s, v15.4s\n"
+ "str q28, [x9, x13]\n"
+ "fmax v21.4s, v21.4s, v15.4s\n"
+ "fmax v22.4s, v22.4s, v15.4s\n"
+ "str q27, [x28, x13]\n"
+ "ldr x23, [x17, #0x28]\n"
+ "str q26, [x27, x13]\n"
+ "ldr x22, [x17, #0x30]\n"
+ "ldr x21, [x17, #0x38]\n"
+ "add x7, x7, #0x10\n"
+ "str q25, [x24, x13]\n"
+ "ldr x20, [x17, #0x40]\n"
+ "cmp x7, x8, LSL #4\n"
+ "fmin v24.4s, v24.4s, v14.4s\n"
+ "fmin v23.4s, v23.4s, v14.4s\n"
+ "fmin v21.4s, v21.4s, v14.4s\n"
+ "add x14, x14, #0x10\n"
+ "str q24, [x23, x13]\n"
+ "fmin v22.4s, v22.4s, v14.4s\n"
+ "str q23, [x22, x13]\n"
+ "add x16, x16, #0xa0\n"
+ "str q21, [x21, x13]\n"
+ "str q22, [x20, x13]\n"
+ "blt 1b\n"
+ "2:" // Channel tail
+ "mov v29.16b, v31.16b\n fmla v29.4s, v8.4s, v9.4s\n"
+ "mov v28.16b, v31.16b\n fmla v28.4s, v7.4s, v9.4s\n"
+ "ldr x23, [x15, #0x30]\n"
+ "ldr x22, [x15, #0x38]\n"
+ "mov v27.16b, v31.16b\n fmla v27.4s, v6.4s, v9.4s\n"
+ "fmla v29.4s, v0.4s, v10.4s\n"
+ "ldr x21, [x15, #0x28]\n"
+ "ldr x20, [x15, #0x48]\n"
+ "ldr q19, [x20, x14]\n"
+ "fmla v28.4s, v4.4s, v13.4s\n"
+ "mov v26.16b, v31.16b\n fmla v26.4s, v5.4s, v9.4s\n"
+ "ldr x20, [x15, #0x40]\n"
+ "mov v25.16b, v31.16b\n fmla v25.4s, v4.4s, v9.4s\n"
+ "mov v24.16b, v31.16b\n fmla v24.4s, v3.4s, v9.4s\n"
+ "ldr x25, [x15, #0x50]\n"
+ "ldr x24, [x15, #0x58]\n"
+ "fmla v27.4s, v2.4s, v11.4s\n"
+ "ldr q17, [x23, x14]\n"
+ "mov v23.16b, v31.16b\n fmla v23.4s, v2.4s, v9.4s\n"
+ "ldr x23, [x15, #0x60]\n"
+ "fmla v29.4s, v5.4s, v13.4s\n"
+ "fmla v28.4s, v6.4s, v17.4s\n"
+ "ldr x12, [x15, #0x70]\n"
+ "ldr x11, [x15, #0x88]\n"
+ "mov v22.16b, v31.16b\n fmla v22.4s, v0.4s, v9.4s\n"
+ "fmla v27.4s, v3.4s, v13.4s\n"
+ "ldr x10, [x17, #0x0]\n"
+ "add x13, x13, #0x10\n"
+ "fmla v26.4s, v2.4s, v13.4s\n"
+ "fmla v25.4s, v1.4s, v13.4s\n"
+ "ldr x9, [x17, #0x8]\n"
+ "ldr x28, [x17, #0x10]\n"
+ "fmla v24.4s, v0.4s, v13.4s\n"
+ "ldr q18, [x22, x14]\n"
+ "fmla v23.4s, v6.4s, v12.4s\n"
+ "ldr q16, [x21, x14]\n"
+ "mov v21.16b, v31.16b\n fmla v21.4s, v1.4s, v9.4s\n"
+ "fmla v29.4s, v7.4s, v17.4s\n"
+ "ldr x22, [x15, #0x68]\n"
+ "ldr x21, [x15, #0x78]\n"
+ "fmla v28.4s, v0.4s, v18.4s\n"
+ "fmla v22.4s, v8.4s, v16.4s\n"
+ "ldr q16, [x20, x14]\n"
+ "ldr x20, [x15, #0x80]\n"
+ "fmla v26.4s, v4.4s, v17.4s\n"
+ "fmla v25.4s, v3.4s, v17.4s\n"
+ "ldr x27, [x17, #0x18]\n"
+ "fmla v21.4s, v0.4s, v17.4s\n"
+ "fmla v24.4s, v4.4s, v19.4s\n"
+ "fmla v23.4s, v1.4s, v17.4s\n"
+ "ldr q17, [x25, x14]\n"
+ "fmla v29.4s, v1.4s, v18.4s\n"
+ "ldr q20, [x24, x14]\n"
+ "fmla v28.4s, v2.4s, v16.4s\n"
+ "fmla v27.4s, v1.4s, v16.4s\n"
+ "ldr q16, [x23, x14]\n"
+ "ldr x26, [x15, #0x90]\n"
+ "fmla v25.4s, v5.4s, v19.4s\n"
+ "fmla v21.4s, v2.4s, v19.4s\n"
+ "ldr x25, [x15, #0xa0]\n"
+ "ldr x24, [x15, #0x98]\n"
+ "fmla v26.4s, v0.4s, v17.4s\n"
+ "fmla v24.4s, v2.4s, v20.4s\n"
+ "fmla v28.4s, v8.4s, v19.4s\n"
+ "fmla v27.4s, v7.4s, v19.4s\n"
+ "fmla v22.4s, v1.4s, v19.4s\n"
+ "ldr q19, [x22, x14]\n"
+ "fmla v23.4s, v3.4s, v16.4s\n"
+ "ldr x23, [x15, #0xa8]\n"
+ "fmla v26.4s, v6.4s, v16.4s\n"
+ "ldr q18, [x20, x14]\n"
+ "fmla v25.4s, v7.4s, v19.4s\n"
+ "ldr x22, [x15, #0xc0]\n"
+ "fmla v24.4s, v6.4s, v19.4s\n"
+ "fmla v21.4s, v4.4s, v19.4s\n"
+ "fmla v29.4s, v3.4s, v17.4s\n"
+ "ldr q17, [x12, x14]\n"
+ "fmla v27.4s, v5.4s, v20.4s\n"
+ "ldr q16, [x21, x14]\n"
+ "fmla v23.4s, v5.4s, v19.4s\n"
+ "fmla v22.4s, v3.4s, v19.4s\n"
+ "ldr x21, [x15, #0xb0]\n"
+ "ldr x20, [x15, #0xb8]\n"
+ "fmla v26.4s, v8.4s, v19.4s\n"
+ "fmla v24.4s, v8.4s, v17.4s\n"
+ "fmla v21.4s, v6.4s, v16.4s\n"
+ "fmla v28.4s, v3.4s, v18.4s\n"
+ "fmla v25.4s, v0.4s, v18.4s\n"
+ "fmla v22.4s, v5.4s, v17.4s\n"
+ "ldr q17, [x11, x14]\n"
+ "fmla v23.4s, v7.4s, v16.4s\n"
+ "ldr q16, [x26, x14]\n"
+ "fmla v29.4s, v4.4s, v18.4s\n"
+ "fmla v26.4s, v1.4s, v18.4s\n"
+ "ldr q18, [x24, x14]\n"
+ "fmla v28.4s, v5.4s, v17.4s\n"
+ "fmla v27.4s, v4.4s, v17.4s\n"
+ "fmla v25.4s, v2.4s, v17.4s\n"
+ "fmla v24.4s, v1.4s, v17.4s\n"
+ "ldr q17, [x25, x14]\n"
+ "fmla v21.4s, v8.4s, v16.4s\n"
+ "fmla v22.4s, v7.4s, v16.4s\n"
+ "ldr q16, [x23, x14]\n"
+ "fmla v29.4s, v2.4s, v17.4s\n"
+ "fmla v26.4s, v7.4s, v18.4s\n"
+ "fmla v25.4s, v6.4s, v18.4s\n"
+ "fmla v23.4s, v4.4s, v18.4s\n"
+ "fmla v21.4s, v3.4s, v18.4s\n"
+ "ldr q18, [x21, x14]\n"
+ "fmla v22.4s, v4.4s, v16.4s\n"
+ "fmla v28.4s, v1.4s, v17.4s\n"
+ "fmax v28.4s, v28.4s, v15.4s\n"
+ "fmla v27.4s, v0.4s, v17.4s\n"
+ "ldr q17, [x20, x14]\n"
+ "fmla v29.4s, v6.4s, v18.4s\n"
+ "fmax v29.4s, v29.4s, v15.4s\n"
+ "fmla v24.4s, v7.4s, v16.4s\n"
+ "fmla v21.4s, v5.4s, v16.4s\n"
+ "fmin v29.4s, v29.4s, v14.4s\n"
+ "str q29, [x10, x13]\n"
+ "fmla v23.4s, v0.4s, v18.4s\n"
+ "fmla v22.4s, v2.4s, v17.4s\n"
+ "ldr x20, [x17, #0x20]\n"
+ "fmin v28.4s, v28.4s, v14.4s\n"
+ "fmla v25.4s, v8.4s, v16.4s\n"
+ "ldr q16, [x22, x14]\n"
+ "fmla v26.4s, v3.4s, v18.4s\n"
+ "fmax v26.4s, v26.4s, v15.4s\n"
+ "fmla v27.4s, v8.4s, v17.4s\n"
+ "fmla v24.4s, v5.4s, v17.4s\n"
+ "fmax v27.4s, v27.4s, v15.4s\n"
+ "str q28, [x9, x13]\n"
+ "fmla v23.4s, v8.4s, v16.4s\n"
+ "fmla v21.4s, v7.4s, v16.4s\n"
+ "fmax v25.4s, v25.4s, v15.4s\n"
+ "ldr x23, [x17, #0x28]\n"
+ "fmla v22.4s, v6.4s, v16.4s\n"
+ "fmin v27.4s, v27.4s, v14.4s\n"
+ "str q27, [x28, x13]\n"
+ "ldr x22, [x17, #0x30]\n"
+ "fmin v26.4s, v26.4s, v14.4s\n"
+ "fmin v25.4s, v25.4s, v14.4s\n"
+ "str q26, [x27, x13]\n"
+ "ldr x21, [x17, #0x38]\n"
+ "fmax v24.4s, v24.4s, v15.4s\n"
+ "fmax v23.4s, v23.4s, v15.4s\n"
+ "str q25, [x20, x13]\n"
+ "ldr x20, [x17, #0x40]\n"
+ "fmax v21.4s, v21.4s, v15.4s\n"
+ "fmax v22.4s, v22.4s, v15.4s\n"
+ "add x14, x14, #0x10\n"
+ "fmin v24.4s, v24.4s, v14.4s\n"
+ "fmin v23.4s, v23.4s, v14.4s\n"
+ "str q24, [x23, x13]\n"
+ "fmin v21.4s, v21.4s, v14.4s\n"
+ "fmin v22.4s, v22.4s, v14.4s\n"
+ "str q23, [x22, x13]\n"
+ "str q21, [x21, x13]\n"
+ "str q22, [x20, x13]\n"
+ "3:" // Oddments
+ "tst %x[n_channels], #0x3\n"
+ "beq 48f\n"
+ "ldr q31, [x16, #0x0]\n"
+ "ldr q0, [x16, #0x10]\n"
+ "mov x13, x14\n"
+ "ldr q1, [x16, #0x20]\n"
+ "ldr q2, [x16, #0x30]\n"
+ "ldr q3, [x16, #0x40]\n"
+ "ldr q4, [x16, #0x50]\n"
+ "ldr q5, [x16, #0x60]\n"
+ "ldr q6, [x16, #0x70]\n"
+ "ldr q7, [x16, #0x80]\n"
+ "ldr q8, [x16, #0x90]\n"
+ "ldr x24, [x15, #0x0]\n"
+ "ldr x23, [x15, #0x8]\n"
+ "add x24, x24, x14\n"
+ "add x23, x23, x14\n"
+ "ldr x22, [x15, #0x10]\n"
+ "ldr x21, [x15, #0x18]\n"
+ "add x22, x22, x14\n"
+ "add x21, x21, x14\n"
+ "ldr x20, [x15, #0x20]\n"
+ "add x20, x20, x14\n"
+ "tbz %x[n_channels], #1, 4f\n"
+ "ld1 { v9.d }[0], [x24], #0x8\n"
+ "ld1 { v10.d }[0], [x23], #0x8\n"
+ "ld1 { v11.d }[0], [x22], #0x8\n"
+ "ld1 { v12.d }[0], [x21], #0x8\n"
+ "ld1 { v13.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 5f\n"
+ "ld1 { v9.s }[2], [x24], #0x4\n"
+ "ld1 { v10.s }[2], [x23], #0x4\n"
+ "ld1 { v11.s }[2], [x22], #0x4\n"
+ "ld1 { v12.s }[2], [x21], #0x4\n"
+ "ld1 { v13.s }[2], [x20], #0x4\n"
+ "b 5f\n"
+ "4:" // Oddments: Load inputs (2, 2), (0, 0), (0, 4), (4, 0), (1, 2): Bit 1: Unset
+ "ld1 { v9.s }[0], [x24], #0x4\n"
+ "ld1 { v10.s }[0], [x23], #0x4\n"
+ "ld1 { v11.s }[0], [x22], #0x4\n"
+ "ld1 { v12.s }[0], [x21], #0x4\n"
+ "ld1 { v13.s }[0], [x20], #0x4\n"
+ "5:" // Oddments: Load inputs (2, 2), (0, 0), (0, 4), (4, 0), (1, 2): Bit 1: End
+ "mov v23.16b, v31.16b\n fmla v23.4s, v8.4s, v9.4s\n"
+ "mov v25.16b, v31.16b\n fmla v25.4s, v6.4s, v9.4s\n"
+ "ldr x20, [x15, #0x28]\n"
+ "add x20, x20, x14\n"
+ "mov v24.16b, v31.16b\n fmla v24.4s, v7.4s, v9.4s\n"
+ "mov v26.16b, v31.16b\n fmla v26.4s, v5.4s, v9.4s\n"
+ "mov v27.16b, v31.16b\n fmla v27.4s, v4.4s, v9.4s\n"
+ "mov v28.16b, v31.16b\n fmla v28.4s, v3.4s, v9.4s\n"
+ "mov v29.16b, v31.16b\n fmla v29.4s, v2.4s, v9.4s\n"
+ "fmla v23.4s, v0.4s, v10.4s\n"
+ "fmla v25.4s, v2.4s, v11.4s\n"
+ "mov v30.16b, v31.16b\n fmla v30.4s, v1.4s, v9.4s\n"
+ "fmla v31.4s, v0.4s, v9.4s\n"
+ "fmla v29.4s, v6.4s, v12.4s\n"
+ "fmla v23.4s, v5.4s, v13.4s\n"
+ "fmla v24.4s, v4.4s, v13.4s\n"
+ "fmla v25.4s, v3.4s, v13.4s\n"
+ "fmla v26.4s, v2.4s, v13.4s\n"
+ "fmla v27.4s, v1.4s, v13.4s\n"
+ "fmla v28.4s, v0.4s, v13.4s\n"
+ "tbz %x[n_channels], #1, 6f\n"
+ "ld1 { v12.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 7f\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
+ "b 7f\n"
+ "6:" // Oddments: Load input (4, 4): Bit 1: Unset
+ "ld1 { v12.s }[0], [x20], #0x4\n"
+ "7:" // Oddments: Load input (4, 4): Bit 1: End
+ "ldr x20, [x15, #0x30]\n"
+ "fmla v31.4s, v8.4s, v12.4s\n"
+ "add x20, x20, x14\n"
+ "tbz %x[n_channels], #1, 8f\n"
+ "ld1 { v11.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 9f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "b 9f\n"
+ "8:" // Oddments: Load input (2, 1): Bit 1: Unset
+ "ld1 { v11.s }[0], [x20], #0x4\n"
+ "9:" // Oddments: Load input (2, 1): Bit 1: End
+ "ldr x20, [x15, #0x38]\n"
+ "fmla v23.4s, v7.4s, v11.4s\n"
+ "fmla v24.4s, v6.4s, v11.4s\n"
+ "add x20, x20, x14\n"
+ "fmla v26.4s, v4.4s, v11.4s\n"
+ "fmla v27.4s, v3.4s, v11.4s\n"
+ "fmla v29.4s, v1.4s, v11.4s\n"
+ "fmla v30.4s, v0.4s, v11.4s\n"
+ "tbz %x[n_channels], #1, 10f\n"
+ "ld1 { v13.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 11f\n"
+ "ld1 { v13.s }[2], [x20], #0x4\n"
+ "b 11f\n"
+ "10:" // Oddments: Load input (0, 1): Bit 1: Unset
+ "ld1 { v13.s }[0], [x20], #0x4\n"
+ "11:" // Oddments: Load input (0, 1): Bit 1: End
+ "ldr x20, [x15, #0x40]\n"
+ "fmla v23.4s, v1.4s, v13.4s\n"
+ "fmla v24.4s, v0.4s, v13.4s\n"
+ "add x20, x20, x14\n"
+ "tbz %x[n_channels], #1, 12f\n"
+ "ld1 { v12.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 13f\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
+ "b 13f\n"
+ "12:" // Oddments: Load input (0, 3): Bit 1: Unset
+ "ld1 { v12.s }[0], [x20], #0x4\n"
+ "13:" // Oddments: Load input (0, 3): Bit 1: End
+ "ldr x20, [x15, #0x48]\n"
+ "fmla v24.4s, v2.4s, v12.4s\n"
+ "fmla v25.4s, v1.4s, v12.4s\n"
+ "add x20, x20, x14\n"
+ "tbz %x[n_channels], #1, 14f\n"
+ "ld1 { v10.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 15f\n"
+ "ld1 { v10.s }[2], [x20], #0x4\n"
+ "b 15f\n"
+ "14:" // Oddments: Load input (2, 3): Bit 1: Unset
+ "ld1 { v10.s }[0], [x20], #0x4\n"
+ "15:" // Oddments: Load input (2, 3): Bit 1: End
+ "ldr x20, [x15, #0x50]\n"
+ "fmla v24.4s, v8.4s, v10.4s\n"
+ "fmla v25.4s, v7.4s, v10.4s\n"
+ "add x20, x20, x14\n"
+ "fmla v27.4s, v5.4s, v10.4s\n"
+ "fmla v28.4s, v4.4s, v10.4s\n"
+ "fmla v30.4s, v2.4s, v10.4s\n"
+ "fmla v31.4s, v1.4s, v10.4s\n"
+ "tbz %x[n_channels], #1, 16f\n"
+ "ld1 { v11.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 17f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "b 17f\n"
+ "16:" // Oddments: Load input (1, 0): Bit 1: Unset
+ "ld1 { v11.s }[0], [x20], #0x4\n"
+ "17:" // Oddments: Load input (1, 0): Bit 1: End
+ "ldr x20, [x15, #0x58]\n"
+ "fmla v23.4s, v3.4s, v11.4s\n"
+ "fmla v26.4s, v0.4s, v11.4s\n"
+ "add x20, x20, x14\n"
+ "tbz %x[n_channels], #1, 18f\n"
+ "ld1 { v13.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 19f\n"
+ "ld1 { v13.s }[2], [x20], #0x4\n"
+ "b 19f\n"
+ "18:" // Oddments: Load input (1, 4): Bit 1: Unset
+ "ld1 { v13.s }[0], [x20], #0x4\n"
+ "19:" // Oddments: Load input (1, 4): Bit 1: End
+ "ldr x20, [x15, #0x60]\n"
+ "fmla v25.4s, v5.4s, v13.4s\n"
+ "fmla v28.4s, v2.4s, v13.4s\n"
+ "add x20, x20, x14\n"
+ "tbz %x[n_channels], #1, 20f\n"
+ "ld1 { v12.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 21f\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
+ "b 21f\n"
+ "20:" // Oddments: Load input (3, 0): Bit 1: Unset
+ "ld1 { v12.s }[0], [x20], #0x4\n"
+ "21:" // Oddments: Load input (3, 0): Bit 1: End
+ "ldr x20, [x15, #0x68]\n"
+ "fmla v26.4s, v6.4s, v12.4s\n"
+ "fmla v29.4s, v3.4s, v12.4s\n"
+ "add x20, x20, x14\n"
+ "tbz %x[n_channels], #1, 22f\n"
+ "ld1 { v10.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 23f\n"
+ "ld1 { v10.s }[2], [x20], #0x4\n"
+ "b 23f\n"
+ "22:" // Oddments: Load input (3, 2): Bit 1: Unset
+ "ld1 { v10.s }[0], [x20], #0x4\n"
+ "23:" // Oddments: Load input (3, 2): Bit 1: End
+ "ldr x20, [x15, #0x70]\n"
+ "fmla v26.4s, v8.4s, v10.4s\n"
+ "fmla v27.4s, v7.4s, v10.4s\n"
+ "add x20, x20, x14\n"
+ "fmla v28.4s, v6.4s, v10.4s\n"
+ "fmla v29.4s, v5.4s, v10.4s\n"
+ "fmla v30.4s, v4.4s, v10.4s\n"
+ "fmla v31.4s, v3.4s, v10.4s\n"
+ "tbz %x[n_channels], #1, 24f\n"
+ "ld1 { v11.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 25f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "b 25f\n"
+ "24:" // Oddments: Load input (3, 4): Bit 1: Unset
+ "ld1 { v11.s }[0], [x20], #0x4\n"
+ "25:" // Oddments: Load input (3, 4): Bit 1: End
+ "ldr x20, [x15, #0x78]\n"
+ "fmla v28.4s, v8.4s, v11.4s\n"
+ "fmla v31.4s, v5.4s, v11.4s\n"
+ "add x20, x20, x14\n"
+ "tbz %x[n_channels], #1, 26f\n"
+ "ld1 { v13.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 27f\n"
+ "ld1 { v13.s }[2], [x20], #0x4\n"
+ "b 27f\n"
+ "26:" // Oddments: Load input (4, 1): Bit 1: Unset
+ "ld1 { v13.s }[0], [x20], #0x4\n"
+ "27:" // Oddments: Load input (4, 1): Bit 1: End
+ "ldr x20, [x15, #0x80]\n"
+ "fmla v29.4s, v7.4s, v13.4s\n"
+ "fmla v30.4s, v6.4s, v13.4s\n"
+ "add x20, x20, x14\n"
+ "tbz %x[n_channels], #1, 28f\n"
+ "ld1 { v12.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 29f\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
+ "b 29f\n"
+ "28:" // Oddments: Load input (1, 1): Bit 1: Unset
+ "ld1 { v12.s }[0], [x20], #0x4\n"
+ "29:" // Oddments: Load input (1, 1): Bit 1: End
+ "ldr x20, [x15, #0x88]\n"
+ "fmla v23.4s, v4.4s, v12.4s\n"
+ "fmla v24.4s, v3.4s, v12.4s\n"
+ "add x20, x20, x14\n"
+ "fmla v26.4s, v1.4s, v12.4s\n"
+ "fmla v27.4s, v0.4s, v12.4s\n"
+ "tbz %x[n_channels], #1, 30f\n"
+ "ld1 { v11.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 31f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "b 31f\n"
+ "30:" // Oddments: Load input (1, 3): Bit 1: Unset
+ "ld1 { v11.s }[0], [x20], #0x4\n"
+ "31:" // Oddments: Load input (1, 3): Bit 1: End
+ "ldr x20, [x15, #0x90]\n"
+ "fmla v24.4s, v5.4s, v11.4s\n"
+ "fmla v25.4s, v4.4s, v11.4s\n"
+ "add x20, x20, x14\n"
+ "fmla v27.4s, v2.4s, v11.4s\n"
+ "fmla v28.4s, v1.4s, v11.4s\n"
+ "tbz %x[n_channels], #1, 32f\n"
+ "ld1 { v13.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 33f\n"
+ "ld1 { v13.s }[2], [x20], #0x4\n"
+ "b 33f\n"
+ "32:" // Oddments: Load input (4, 3): Bit 1: Unset
+ "ld1 { v13.s }[0], [x20], #0x4\n"
+ "33:" // Oddments: Load input (4, 3): Bit 1: End
+ "ldr x20, [x15, #0x98]\n"
+ "fmla v30.4s, v8.4s, v13.4s\n"
+ "fmla v31.4s, v7.4s, v13.4s\n"
+ "add x20, x20, x14\n"
+ "tbz %x[n_channels], #1, 34f\n"
+ "ld1 { v12.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 35f\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
+ "b 35f\n"
+ "34:" // Oddments: Load input (3, 1): Bit 1: Unset
+ "ld1 { v12.s }[0], [x20], #0x4\n"
+ "35:" // Oddments: Load input (3, 1): Bit 1: End
+ "ldr x20, [x15, #0xa0]\n"
+ "fmla v26.4s, v7.4s, v12.4s\n"
+ "fmla v27.4s, v6.4s, v12.4s\n"
+ "add x20, x20, x14\n"
+ "fmla v29.4s, v4.4s, v12.4s\n"
+ "fmla v30.4s, v3.4s, v12.4s\n"
+ "tbz %x[n_channels], #1, 36f\n"
+ "ld1 { v11.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 37f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "b 37f\n"
+ "36:" // Oddments: Load input (0, 2): Bit 1: Unset
+ "ld1 { v11.s }[0], [x20], #0x4\n"
+ "37:" // Oddments: Load input (0, 2): Bit 1: End
+ "ldr x20, [x15, #0xa8]\n"
+ "fmla v23.4s, v2.4s, v11.4s\n"
+ "fmla v24.4s, v1.4s, v11.4s\n"
+ "add x20, x20, x14\n"
+ "fmla v25.4s, v0.4s, v11.4s\n"
+ "tbz %x[n_channels], #1, 38f\n"
+ "ld1 { v13.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 39f\n"
+ "ld1 { v13.s }[2], [x20], #0x4\n"
+ "b 39f\n"
+ "38:" // Oddments: Load input (3, 3): Bit 1: Unset
+ "ld1 { v13.s }[0], [x20], #0x4\n"
+ "39:" // Oddments: Load input (3, 3): Bit 1: End
+ "ldr x20, [x15, #0xb0]\n"
+ "fmla v27.4s, v8.4s, v13.4s\n"
+ "fmla v28.4s, v7.4s, v13.4s\n"
+ "add x20, x20, x14\n"
+ "fmla v30.4s, v5.4s, v13.4s\n"
+ "fmla v31.4s, v4.4s, v13.4s\n"
+ "tbz %x[n_channels], #1, 40f\n"
+ "ld1 { v12.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 41f\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
+ "b 41f\n"
+ "40:" // Oddments: Load input (2, 0): Bit 1: Unset
+ "ld1 { v12.s }[0], [x20], #0x4\n"
+ "41:" // Oddments: Load input (2, 0): Bit 1: End
+ "ldr x20, [x15, #0xb8]\n"
+ "fmla v23.4s, v6.4s, v12.4s\n"
+ "fmla v26.4s, v3.4s, v12.4s\n"
+ "add x20, x20, x14\n"
+ "fmla v29.4s, v0.4s, v12.4s\n"
+ "tbz %x[n_channels], #1, 42f\n"
+ "ld1 { v11.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 43f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "b 43f\n"
+ "42:" // Oddments: Load input (2, 4): Bit 1: Unset
+ "ld1 { v11.s }[0], [x20], #0x4\n"
+ "43:" // Oddments: Load input (2, 4): Bit 1: End
+ "ldr x20, [x15, #0xc0]\n"
+ "fmla v25.4s, v8.4s, v11.4s\n"
+ "fmla v28.4s, v5.4s, v11.4s\n"
+ "add x20, x20, x14\n"
+ "fmla v31.4s, v2.4s, v11.4s\n"
+ "tbz %x[n_channels], #1, 44f\n"
+ "ld1 { v13.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 45f\n"
+ "ld1 { v13.s }[2], [x20], #0x4\n"
+ "b 45f\n"
+ "44:" // Oddments: Load input (4, 2): Bit 1: Unset
+ "ld1 { v13.s }[0], [x20], #0x4\n"
+ "45:" // Oddments: Load input (4, 2): Bit 1: End
+ "fmla v29.4s, v8.4s, v13.4s\n"
+ "fmla v30.4s, v7.4s, v13.4s\n"
+ "fmax v23.4s, v23.4s, v15.4s\n"
+ "fmla v31.4s, v6.4s, v13.4s\n"
+ "fmax v24.4s, v24.4s, v15.4s\n"
+ "fmax v25.4s, v25.4s, v15.4s\n"
+ "fmax v26.4s, v26.4s, v15.4s\n"
+ "fmax v27.4s, v27.4s, v15.4s\n"
+ "fmax v28.4s, v28.4s, v15.4s\n"
+ "fmax v29.4s, v29.4s, v15.4s\n"
+ "fmax v30.4s, v30.4s, v15.4s\n"
+ "fmax v31.4s, v31.4s, v15.4s\n"
+ "fmin v23.4s, v23.4s, v14.4s\n"
+ "fmin v24.4s, v24.4s, v14.4s\n"
+ "fmin v25.4s, v25.4s, v14.4s\n"
+ "fmin v26.4s, v26.4s, v14.4s\n"
+ "fmin v27.4s, v27.4s, v14.4s\n"
+ "fmin v28.4s, v28.4s, v14.4s\n"
+ "fmin v29.4s, v29.4s, v14.4s\n"
+ "fmin v30.4s, v30.4s, v14.4s\n"
+ "fmin v31.4s, v31.4s, v14.4s\n"
+ "tbz %x[n_channels], #1, 46f\n"
+ "ldr x20, [x17, #0x0]\n"
+ "add x20, x20, x13\n"
+ "st1 { v23.d }[0], [x20]\n"
+ "ldr x23, [x17, #0x8]\n"
+ "ldr x22, [x17, #0x10]\n"
+ "ldr x21, [x17, #0x18]\n"
+ "add x23, x23, x13\n"
+ "add x22, x22, x13\n"
+ "ldr x20, [x17, #0x20]\n"
+ "add x21, x21, x13\n"
+ "add x20, x20, x13\n"
+ "st1 { v24.d }[0], [x23]\n"
+ "st1 { v25.d }[0], [x22]\n"
+ "ldr x23, [x17, #0x28]\n"
+ "ldr x22, [x17, #0x30]\n"
+ "add x23, x23, x13\n"
+ "st1 { v26.d }[0], [x21]\n"
+ "ldr x21, [x17, #0x38]\n"
+ "add x22, x22, x13\n"
+ "add x21, x21, x13\n"
+ "st1 { v27.d }[0], [x20]\n"
+ "ldr x20, [x17, #0x40]\n"
+ "add x20, x20, x13\n"
+ "add x13, x13, #0x8\n"
+ "st1 { v28.d }[0], [x23]\n"
+ "st1 { v29.d }[0], [x22]\n"
+ "st1 { v30.d }[0], [x21]\n"
+ "st1 { v31.d }[0], [x20]\n"
+ "tbz %x[n_channels], #0, 47f\n"
+ "ldr x20, [x17, #0x0]\n"
+ "add x20, x20, x13\n"
+ "st1 { v23.s }[2], [x20]\n"
+ "ldr x23, [x17, #0x8]\n"
+ "ldr x22, [x17, #0x10]\n"
+ "ldr x21, [x17, #0x18]\n"
+ "add x23, x23, x13\n"
+ "add x22, x22, x13\n"
+ "ldr x20, [x17, #0x20]\n"
+ "add x21, x21, x13\n"
+ "add x20, x20, x13\n"
+ "st1 { v24.s }[2], [x23]\n"
+ "st1 { v25.s }[2], [x22]\n"
+ "ldr x23, [x17, #0x28]\n"
+ "ldr x22, [x17, #0x30]\n"
+ "add x23, x23, x13\n"
+ "st1 { v26.s }[2], [x21]\n"
+ "ldr x21, [x17, #0x38]\n"
+ "add x22, x22, x13\n"
+ "add x21, x21, x13\n"
+ "st1 { v27.s }[2], [x20]\n"
+ "ldr x20, [x17, #0x40]\n"
+ "add x20, x20, x13\n"
+ "st1 { v28.s }[2], [x23]\n"
+ "st1 { v29.s }[2], [x22]\n"
+ "st1 { v30.s }[2], [x21]\n"
+ "st1 { v31.s }[2], [x20]\n"
+ "b 47f\n"
+ "46:" // Oddments: Store: Bit 1: Unset
+ "ldr x20, [x17, #0x0]\n"
+ "add x20, x20, x13\n"
+ "st1 { v23.s }[0], [x20]\n"
+ "ldr x23, [x17, #0x8]\n"
+ "ldr x22, [x17, #0x10]\n"
+ "ldr x21, [x17, #0x18]\n"
+ "add x23, x23, x13\n"
+ "add x22, x22, x13\n"
+ "ldr x20, [x17, #0x20]\n"
+ "add x21, x21, x13\n"
+ "add x20, x20, x13\n"
+ "st1 { v24.s }[0], [x23]\n"
+ "st1 { v25.s }[0], [x22]\n"
+ "ldr x23, [x17, #0x28]\n"
+ "ldr x22, [x17, #0x30]\n"
+ "add x23, x23, x13\n"
+ "st1 { v26.s }[0], [x21]\n"
+ "ldr x21, [x17, #0x38]\n"
+ "add x22, x22, x13\n"
+ "add x21, x21, x13\n"
+ "st1 { v27.s }[0], [x20]\n"
+ "ldr x20, [x17, #0x40]\n"
+ "add x20, x20, x13\n"
+ "st1 { v28.s }[0], [x23]\n"
+ "st1 { v29.s }[0], [x22]\n"
+ "st1 { v30.s }[0], [x21]\n"
+ "st1 { v31.s }[0], [x20]\n"
+ "47:" // Oddments: Store: Bit 1: End
+ "48:" // End
+ :
+ : [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp
new file mode 100644
index 0000000000..8a198c1818
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(const float *const *const input_ptrs, float *const *const outptrs, const void *params, unsigned int n_channels, const float activation_min, const float activation_max);
+void a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(const unsigned int n_tile_rows, const unsigned int n_tile_cols, const float *inptr, int64_t ld_input_row, int64_t ld_input_col, float *outptr, int64_t ld_output_row, int64_t ld_output_col, const void *params, unsigned int n_channels, const float activation_min, const float activation_max);
+
+// Strategy descriptor for an A64 (NEON) FP32 NHWC depthwise convolution:
+// 3x3 kernel, stride 1, producing a 4x4 output tile per invocation.
+// It only exposes the tile geometry and the two kernel entry points
+// (declared above); the actual compute lives in the generated assembly
+// implementations.
+class a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst : public DepthwiseDepthfirstStrategy<float, float, float, float>
+{
+  private:
+  using Parent = DepthwiseDepthfirstStrategy<float, float, float, float>;
+  // Entry point taking an array of input pointers (indirect addressing).
+  Parent::IndirectKernelType m_indirect_kernel = a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl;
+  // Entry point taking a base pointer plus row/column strides (direct, tiled addressing).
+  Parent::DirectKernelType m_direct_kernel = a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl;
+
+  public:
+  using return_type = float;
+  // VLType::None: fixed-width 128-bit vectors (no scalable-vector variant).
+  constexpr static auto vl_type = arm_gemm::VLType::None;
+
+  // Filter geometry: 3x3 window.
+  constexpr static unsigned int kernel_rows = 3;
+  constexpr static unsigned int kernel_cols = 3;
+
+  // Unit stride in both spatial dimensions.
+  constexpr static unsigned int stride_rows = 1;
+  constexpr static unsigned int stride_cols = 1;
+
+  // Each kernel call computes a 4x4 block of output points.
+  constexpr static unsigned int output_rows = 4;
+  constexpr static unsigned int output_cols = 4;
+
+  // CPUInfo is unused: this strategy has no CPU-specific specialization to select.
+  a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst(const CPUInfo *)
+  : Parent(output_rows, output_cols, kernel_rows, kernel_cols, stride_rows, stride_cols) {}
+
+  arm_gemm::VLType get_vl_type(void) const override { return vl_type; }
+
+  // Accessors used by the depthfirst driver to dispatch to the generated kernels.
+  Parent::IndirectKernelType get_indirect_kernel() const override { return m_indirect_kernel; }
+  Parent::DirectKernelType get_direct_kernel() const override { return m_direct_kernel; }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp
new file mode 100644
index 0000000000..3adf8b0d9f
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp
@@ -0,0 +1,1232 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(
+ const unsigned int n_tile_rows,
+ const unsigned int n_tile_cols,
+ const float *inptr,
+ int64_t ld_input_row,
+ int64_t ld_input_col,
+ float *outptr,
+ int64_t ld_output_row,
+ int64_t ld_output_col,
+ const void *params,
+ unsigned int n_channels,
+ const float activation_min,
+ const float activation_max
+)
+{
+ struct Args
+ {
+ const uint64_t n_tile_rows, n_tile_cols;
+ const float *inptr;
+ const uint64_t ld_input_row;
+ const uint64_t ld_input_col;
+ float *outptr;
+ const uint64_t ld_output_row;
+ const uint64_t ld_output_col;
+ const void *params;
+ const float min, max;
+
+ uint64_t tile_i = 0, tile_j = 0;
+
+ Args(
+ const unsigned int n_tile_rows,
+ const unsigned int n_tile_cols,
+ const float *inptr,
+ int64_t ld_input_row,
+ int64_t ld_input_col,
+ float *outptr,
+ int64_t ld_output_row,
+ int64_t ld_output_col,
+ const void *params,
+ const float activation_min,
+ const float activation_max
+ ) : n_tile_rows(n_tile_rows), n_tile_cols(n_tile_cols), inptr(inptr),
+ ld_input_row(ld_input_row), ld_input_col(ld_input_col), outptr(outptr),
+ ld_output_row(ld_output_row), ld_output_col(ld_output_col),
+ params(params), min(activation_min), max(activation_max)
+ {
+ }
+ };
+
+ Args params_struct(
+ n_tile_rows, n_tile_cols,
+ inptr, ld_input_row, ld_input_col,
+ outptr, ld_output_row, ld_output_col,
+ params, activation_min, activation_max
+ );
+
+ __asm__ __volatile__(
+ "mov x27, #0x0\n"
+ "mov x26, #0x0\n"
+ "1:" // Tile loop
+ "str x27, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "mov x25, #0x4\n"
+ "mov x23, #0x4\n"
+ "str x26, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "ldr x24, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
+ "ldr x22, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
+ "mul x21, x27, x24\n" // offset = tile_i * ld_input_row
+ "ldr x4, [%x[params_struct], %[offsetof_args_ld_input_col]]\n"
+ "ldr x5, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
+ "mul x20, x27, x22\n" // offset = tile_i * ld_output_row
+ "mov x6, #0x10\n" // cntb _, ALL, #1
+ "madd x21, x26, x4, x21\n" // offset += tile_j * ld_input_col
+ "ldr x7, [%x[params_struct], %[offsetof_args_inptr]]\n"
+ "lsl x4, x4, #0x2\n"
+ "ldr x8, [%x[params_struct], %[offsetof_args_outptr]]\n"
+ "madd x20, x26, x5, x20\n" // offset += tile_j * ld_output_col
+ "lsl x5, x5, #0x2\n"
+ "add x17, x4, x4\n"
+ "ldr x16, [%x[params_struct], %[offsetof_args_params]]\n"
+ "mul x21, x21, x25\n" // offset *= kernel_stride * output_size
+ "add x7, x7, x21, LSL #2\n" // inptr[0] += offset * sizeof(float)
+ "add x15, x7, x24, LSL #2\n"
+ "mul x20, x20, x23\n" // offset *= output_tile_size
+ "add x14, x15, x24, LSL #2\n"
+ "add x8, x8, x20, LSL #2\n" // outptrs[0] += offset * sizeof(float)
+ "lsr x13, %x[n_channels], #0x2\n"
+ "add x12, x14, x24, LSL #2\n"
+ "add x11, x17, x4\n"
+ "add x10, x8, x22, LSL #2\n"
+ "add x9, x12, x24, LSL #2\n"
+ "add x28, x11, x4\n"
+ "add x27, x10, x22, LSL #2\n"
+ "add x23, x5, x5\n"
+ "add x20, %x[params_struct], %[offsetof_args_min]\n"
+ "ld1r { v13.4s }, [x20]\n"
+ "add x20, %x[params_struct], %[offsetof_args_max]\n"
+ "ld1r { v15.4s }, [x20]\n"
+ "add x26, x9, x24, LSL #2\n"
+ "add x25, x28, x4\n"
+ "add x24, x27, x22, LSL #2\n"
+ "add x22, x23, x5\n"
+ "mov x21, #0x0\n"
+ "sub x20, XZR, x6\n"
+ "cbz x13, 4f\n"
+ "ldr q14, [x16, #0x0]\n"
+ "ldr q0, [x16, #0x10]\n"
+ "cmp x6, x13, LSL #4\n"
+ "ldr q1, [x16, #0x20]\n"
+ "ldr q2, [x16, #0x30]\n"
+ "ldr q3, [x16, #0x40]\n"
+ "ldr q4, [x16, #0x50]\n"
+ "ldr q5, [x16, #0x60]\n"
+ "ldr q6, [x16, #0x70]\n"
+ "ldr q7, [x16, #0x80]\n"
+ "ldr q8, [x16, #0x90]\n"
+ "add x16, x16, #0xa0\n"
+ "ldr q9, [x14, x17]\n"
+ "ld1 { v10.4s }, [x7]\n"
+ "ldr q11, [x7, x25]\n"
+ "ldr q12, [x14, x11]\n"
+ "bge 3f\n"
+ "2:" // Tile loop: Channel loop
+ "mov v26.16b, v14.16b\n fmla v26.4s, v4.4s, v9.4s\n"
+ "mov v28.16b, v14.16b\n fmla v28.4s, v8.4s, v9.4s\n"
+ "add x6, x6, #0x10\n"
+ "cmp x6, x13, LSL #4\n"
+ "mov v16.16b, v14.16b\n fmla v16.4s, v3.4s, v9.4s\n"
+ "mov v22.16b, v14.16b\n fmla v22.4s, v1.4s, v9.4s\n"
+ "add x20, x20, #0x10\n"
+ "add x21, x21, #0x10\n"
+ "mov v23.16b, v14.16b\n fmla v23.4s, v0.4s, v9.4s\n"
+ "fmla v26.4s, v5.4s, v12.4s\n"
+ "mov v25.16b, v14.16b\n fmla v25.4s, v7.4s, v9.4s\n"
+ "mov v17.16b, v14.16b\n fmla v17.4s, v6.4s, v9.4s\n"
+ "mov v31.16b, v14.16b\n fmla v31.4s, v5.4s, v9.4s\n"
+ "mov v20.16b, v14.16b\n fmla v20.4s, v2.4s, v9.4s\n"
+ "ldr q9, [x12, x17]\n"
+ "fmla v28.4s, v0.4s, v10.4s\n"
+ "ld1 { v30.4s }, [x26]\n"
+ "mov v29.16b, v14.16b\n fmla v29.4s, v2.4s, v11.4s\n"
+ "ldr q27, [x26, x25]\n"
+ "fmla v16.4s, v4.4s, v12.4s\n"
+ "fmla v22.4s, v2.4s, v12.4s\n"
+ "fmla v23.4s, v1.4s, v12.4s\n"
+ "mov v21.16b, v14.16b\n fmla v21.4s, v6.4s, v30.4s\n"
+ "ldr q10, [x12, x11]\n"
+ "fmla v26.4s, v7.4s, v9.4s\n"
+ "fmla v25.4s, v8.4s, v12.4s\n"
+ "fmla v17.4s, v7.4s, v12.4s\n"
+ "fmla v29.4s, v6.4s, v12.4s\n"
+ "mov v24.16b, v14.16b\n fmla v24.4s, v3.4s, v12.4s\n"
+ "mov v19.16b, v14.16b\n fmla v19.4s, v0.4s, v12.4s\n"
+ "ldr q11, [x7, x4]\n"
+ "mov v30.16b, v14.16b\n fmla v30.4s, v8.4s, v27.4s\n"
+ "ldr q12, [x7, x28]\n"
+ "fmla v16.4s, v6.4s, v9.4s\n"
+ "fmla v22.4s, v4.4s, v9.4s\n"
+ "fmla v23.4s, v3.4s, v9.4s\n"
+ "mov v27.16b, v14.16b\n fmla v27.4s, v1.4s, v9.4s\n"
+ "mov v18.16b, v14.16b\n fmla v18.4s, v0.4s, v9.4s\n"
+ "ldr q14, [x16, #0x0]\n"
+ "fmla v31.4s, v8.4s, v9.4s\n"
+ "fmla v20.4s, v5.4s, v9.4s\n"
+ "fmla v21.4s, v2.4s, v9.4s\n"
+ "ld1 { v9.4s }, [x15]\n"
+ "fmla v26.4s, v8.4s, v10.4s\n"
+ "fmla v28.4s, v1.4s, v11.4s\n"
+ "fmla v25.4s, v0.4s, v11.4s\n"
+ "ldr q11, [x15, x25]\n"
+ "fmla v17.4s, v2.4s, v12.4s\n"
+ "fmla v29.4s, v1.4s, v12.4s\n"
+ "ld1 { v12.4s }, [x9]\n"
+ "fmla v16.4s, v7.4s, v10.4s\n"
+ "fmla v24.4s, v6.4s, v10.4s\n"
+ "fmla v22.4s, v5.4s, v10.4s\n"
+ "fmla v23.4s, v4.4s, v10.4s\n"
+ "fmla v19.4s, v3.4s, v10.4s\n"
+ "fmla v27.4s, v2.4s, v10.4s\n"
+ "fmla v18.4s, v1.4s, v10.4s\n"
+ "fmla v30.4s, v0.4s, v10.4s\n"
+ "ldr q10, [x15, x17]\n"
+ "fmla v31.4s, v0.4s, v9.4s\n"
+ "fmla v20.4s, v6.4s, v12.4s\n"
+ "fmla v21.4s, v3.4s, v12.4s\n"
+ "ldr q12, [x9, x25]\n"
+ "fmla v26.4s, v1.4s, v10.4s\n"
+ "fmla v28.4s, v3.4s, v9.4s\n"
+ "fmla v29.4s, v5.4s, v11.4s\n"
+ "fmla v24.4s, v2.4s, v11.4s\n"
+ "ldr q11, [x15, x11]\n"
+ "fmla v25.4s, v4.4s, v10.4s\n"
+ "fmla v17.4s, v3.4s, v10.4s\n"
+ "fmla v16.4s, v0.4s, v10.4s\n"
+ "fmla v19.4s, v8.4s, v12.4s\n"
+ "fmla v30.4s, v5.4s, v12.4s\n"
+ "ldr q9, [x26, x4]\n"
+ "fmla v31.4s, v2.4s, v10.4s\n"
+ "fmla v26.4s, v2.4s, v11.4s\n"
+ "fmla v28.4s, v5.4s, v10.4s\n"
+ "ldr q10, [x14, x4]\n"
+ "fmla v25.4s, v5.4s, v11.4s\n"
+ "fmla v17.4s, v4.4s, v11.4s\n"
+ "fmla v29.4s, v3.4s, v11.4s\n"
+ "fmla v16.4s, v1.4s, v11.4s\n"
+ "fmla v24.4s, v0.4s, v11.4s\n"
+ "ldr q11, [x14, x28]\n"
+ "fmla v21.4s, v7.4s, v9.4s\n"
+ "fmla v27.4s, v6.4s, v9.4s\n"
+ "ldr q12, [x26, x28]\n"
+ "fmla v31.4s, v4.4s, v10.4s\n"
+ "fmla v26.4s, v3.4s, v10.4s\n"
+ "fmla v20.4s, v1.4s, v10.4s\n"
+ "fmla v22.4s, v0.4s, v10.4s\n"
+ "fmla v28.4s, v7.4s, v10.4s\n"
+ "fmla v25.4s, v6.4s, v10.4s\n"
+ "ldr q10, [x7, x17]\n"
+ "fmla v18.4s, v8.4s, v12.4s\n"
+ "fmla v30.4s, v7.4s, v12.4s\n"
+ "ldr q9, [x12, x4]\n"
+ "fmla v17.4s, v8.4s, v11.4s\n"
+ "fmla v29.4s, v7.4s, v11.4s\n"
+ "fmla v16.4s, v5.4s, v11.4s\n"
+ "fmla v24.4s, v4.4s, v11.4s\n"
+ "fmla v23.4s, v2.4s, v11.4s\n"
+ "fmla v19.4s, v1.4s, v11.4s\n"
+ "ldr q12, [x7, x11]\n"
+ "add x7, x7, #0x10\n"
+ "fmla v31.4s, v7.4s, v9.4s\n"
+ "fmla v26.4s, v6.4s, v9.4s\n"
+ "fmla v20.4s, v4.4s, v9.4s\n"
+ "fmla v22.4s, v3.4s, v9.4s\n"
+ "fmla v21.4s, v1.4s, v9.4s\n"
+ "fmla v27.4s, v0.4s, v9.4s\n"
+ "ldr q9, [x12, x28]\n"
+ "fmla v28.4s, v2.4s, v10.4s\n"
+ "fmla v25.4s, v1.4s, v10.4s\n"
+ "fmla v17.4s, v0.4s, v10.4s\n"
+ "ld1 { v10.4s }, [x14]\n"
+ "fmla v18.4s, v2.4s, v9.4s\n"
+ "fmla v29.4s, v0.4s, v12.4s\n"
+ "fmla v31.4s, v3.4s, v10.4s\n"
+ "fmla v20.4s, v0.4s, v10.4s\n"
+ "fmla v16.4s, v8.4s, v9.4s\n"
+ "fmla v24.4s, v7.4s, v9.4s\n"
+ "fmla v23.4s, v5.4s, v9.4s\n"
+ "fmla v19.4s, v4.4s, v9.4s\n"
+ "fmla v30.4s, v1.4s, v9.4s\n"
+ "ldr q11, [x9, x17]\n"
+ "fmla v25.4s, v2.4s, v12.4s\n"
+ "fmla v17.4s, v1.4s, v12.4s\n"
+ "ldr q12, [x14, x25]\n"
+ "add x14, x14, #0x10\n"
+ "ldr q9, [x14, x17]\n"
+ "fmla v28.4s, v6.4s, v10.4s\n"
+ "ld1 { v10.4s }, [x12]\n"
+ "fmla v27.4s, v4.4s, v11.4s\n"
+ "fmla v18.4s, v3.4s, v11.4s\n"
+ "fmla v29.4s, v8.4s, v12.4s\n"
+ "fmla v24.4s, v5.4s, v12.4s\n"
+ "fmla v19.4s, v2.4s, v12.4s\n"
+ "ldr q12, [x12, x25]\n"
+ "add x12, x12, #0x10\n"
+ "fmla v31.4s, v6.4s, v10.4s\n"
+ "fmla v20.4s, v3.4s, v10.4s\n"
+ "fmla v21.4s, v0.4s, v10.4s\n"
+ "ldr q10, [x26, x17]\n"
+ "fmla v30.4s, v2.4s, v12.4s\n"
+ "fmla v27.4s, v7.4s, v10.4s\n"
+ "fmla v18.4s, v6.4s, v10.4s\n"
+ "fmla v20.4s, v8.4s, v11.4s\n"
+ "fmla v22.4s, v7.4s, v11.4s\n"
+ "fmla v23.4s, v6.4s, v11.4s\n"
+ "fmla v21.4s, v5.4s, v11.4s\n"
+ "ldr q11, [x9, x11]\n"
+ "fmla v19.4s, v5.4s, v12.4s\n"
+ "fmla v27.4s, v5.4s, v11.4s\n"
+ "fmla v18.4s, v4.4s, v11.4s\n"
+ "fmla v30.4s, v3.4s, v11.4s\n"
+ "fmla v24.4s, v8.4s, v12.4s\n"
+ "ldr q12, [x26, x11]\n"
+ "fmla v21.4s, v8.4s, v10.4s\n"
+ "ldr q10, [x15, x4]\n"
+ "fmla v22.4s, v8.4s, v11.4s\n"
+ "fmla v23.4s, v7.4s, v11.4s\n"
+ "add x26, x26, #0x10\n"
+ "fmla v19.4s, v6.4s, v11.4s\n"
+ "ldr q11, [x15, x28]\n"
+ "fmla v27.4s, v8.4s, v12.4s\n"
+ "add x15, x15, #0x10\n"
+ "fmla v18.4s, v7.4s, v12.4s\n"
+ "fmla v30.4s, v6.4s, v12.4s\n"
+ "ldr q12, [x9, x4]\n"
+ "fmla v28.4s, v4.4s, v10.4s\n"
+ "fmla v25.4s, v3.4s, v10.4s\n"
+ "fmax v28.4s, v28.4s, v13.4s\n"
+ "fmla v31.4s, v1.4s, v10.4s\n"
+ "fmla v26.4s, v0.4s, v10.4s\n"
+ "ldr q10, [x9, x28]\n"
+ "ldr q0, [x16, #0x10]\n"
+ "fmla v17.4s, v5.4s, v11.4s\n"
+ "fmla v29.4s, v4.4s, v11.4s\n"
+ "fmax v25.4s, v25.4s, v13.4s\n"
+ "add x9, x9, #0x10\n"
+ "fmla v16.4s, v2.4s, v11.4s\n"
+ "ldr q2, [x16, #0x30]\n"
+ "fmla v24.4s, v1.4s, v11.4s\n"
+ "ldr q11, [x7, x25]\n"
+ "ldr q1, [x16, #0x20]\n"
+ "fmla v20.4s, v7.4s, v12.4s\n"
+ "fmla v22.4s, v6.4s, v12.4s\n"
+ "ldr q6, [x16, #0x70]\n"
+ "fmla v21.4s, v4.4s, v12.4s\n"
+ "fmla v27.4s, v3.4s, v12.4s\n"
+ "ldr q12, [x14, x11]\n"
+ "ldr q3, [x16, #0x40]\n"
+ "fmla v23.4s, v8.4s, v10.4s\n"
+ "ldr q8, [x16, #0x90]\n"
+ "fmla v19.4s, v7.4s, v10.4s\n"
+ "ldr q7, [x16, #0x80]\n"
+ "fmla v18.4s, v5.4s, v10.4s\n"
+ "ldr q5, [x16, #0x60]\n"
+ "fmla v30.4s, v4.4s, v10.4s\n"
+ "ld1 { v10.4s }, [x7]\n"
+ "ldr q4, [x16, #0x50]\n"
+ "fmax v17.4s, v17.4s, v13.4s\n"
+ "fmax v29.4s, v29.4s, v13.4s\n"
+ "add x16, x16, #0xa0\n"
+ "fmax v31.4s, v31.4s, v13.4s\n"
+ "fmax v26.4s, v26.4s, v13.4s\n"
+ "fmax v16.4s, v16.4s, v13.4s\n"
+ "fmax v24.4s, v24.4s, v13.4s\n"
+ "fmax v20.4s, v20.4s, v13.4s\n"
+ "fmax v22.4s, v22.4s, v13.4s\n"
+ "fmax v23.4s, v23.4s, v13.4s\n"
+ "fmax v19.4s, v19.4s, v13.4s\n"
+ "fmax v21.4s, v21.4s, v13.4s\n"
+ "fmax v27.4s, v27.4s, v13.4s\n"
+ "fmax v18.4s, v18.4s, v13.4s\n"
+ "fmax v30.4s, v30.4s, v13.4s\n"
+ "fmin v28.4s, v28.4s, v15.4s\n"
+ "fmin v25.4s, v25.4s, v15.4s\n"
+ "st1 { v28.4s }, [x8]\n"
+ "fmin v17.4s, v17.4s, v15.4s\n"
+ "fmin v29.4s, v29.4s, v15.4s\n"
+ "str q25, [x8, x5]\n"
+ "fmin v31.4s, v31.4s, v15.4s\n"
+ "fmin v26.4s, v26.4s, v15.4s\n"
+ "str q17, [x8, x23]\n"
+ "fmin v16.4s, v16.4s, v15.4s\n"
+ "fmin v24.4s, v24.4s, v15.4s\n"
+ "str q29, [x8, x22]\n"
+ "add x8, x8, #0x10\n"
+ "fmin v20.4s, v20.4s, v15.4s\n"
+ "fmin v22.4s, v22.4s, v15.4s\n"
+ "st1 { v31.4s }, [x10]\n"
+ "fmin v23.4s, v23.4s, v15.4s\n"
+ "fmin v19.4s, v19.4s, v15.4s\n"
+ "str q26, [x10, x5]\n"
+ "fmin v21.4s, v21.4s, v15.4s\n"
+ "fmin v27.4s, v27.4s, v15.4s\n"
+ "str q16, [x10, x23]\n"
+ "fmin v18.4s, v18.4s, v15.4s\n"
+ "fmin v30.4s, v30.4s, v15.4s\n"
+ "str q24, [x10, x22]\n"
+ "add x10, x10, #0x10\n"
+ "st1 { v20.4s }, [x27]\n"
+ "str q22, [x27, x5]\n"
+ "str q23, [x27, x23]\n"
+ "str q19, [x27, x22]\n"
+ "add x27, x27, #0x10\n"
+ "st1 { v21.4s }, [x24]\n"
+ "str q27, [x24, x5]\n"
+ "str q18, [x24, x23]\n"
+ "str q30, [x24, x22]\n"
+ "add x24, x24, #0x10\n"
+ "blt 2b\n"
+ "3:" // Tile loop: Channel tail
+ "mov v16.16b, v14.16b\n fmla v16.4s, v4.4s, v9.4s\n"
+ "mov v23.16b, v14.16b\n fmla v23.4s, v8.4s, v9.4s\n"
+ "mov v31.16b, v14.16b\n fmla v31.4s, v3.4s, v9.4s\n"
+ "mov v30.16b, v14.16b\n fmla v30.4s, v1.4s, v9.4s\n"
+ "mov v18.16b, v14.16b\n fmla v18.4s, v0.4s, v9.4s\n"
+ "fmla v16.4s, v5.4s, v12.4s\n"
+ "mov v17.16b, v14.16b\n fmla v17.4s, v7.4s, v9.4s\n"
+ "mov v19.16b, v14.16b\n fmla v19.4s, v6.4s, v9.4s\n"
+ "mov v28.16b, v14.16b\n fmla v28.4s, v5.4s, v9.4s\n"
+ "mov v27.16b, v14.16b\n fmla v27.4s, v2.4s, v9.4s\n"
+ "ldr q24, [x12, x17]\n"
+ "fmla v23.4s, v0.4s, v10.4s\n"
+ "ld1 { v21.4s }, [x26]\n"
+ "mov v29.16b, v14.16b\n fmla v29.4s, v2.4s, v11.4s\n"
+ "ldr q20, [x26, x25]\n"
+ "fmla v31.4s, v4.4s, v12.4s\n"
+ "fmla v30.4s, v2.4s, v12.4s\n"
+ "fmla v18.4s, v1.4s, v12.4s\n"
+ "mov v26.16b, v14.16b\n fmla v26.4s, v6.4s, v21.4s\n"
+ "ldr q9, [x12, x11]\n"
+ "fmla v16.4s, v7.4s, v24.4s\n"
+ "fmla v17.4s, v8.4s, v12.4s\n"
+ "fmla v19.4s, v7.4s, v12.4s\n"
+ "fmla v29.4s, v6.4s, v12.4s\n"
+ "mov v11.16b, v14.16b\n fmla v11.4s, v3.4s, v12.4s\n"
+ "mov v10.16b, v14.16b\n fmla v10.4s, v0.4s, v12.4s\n"
+ "ldr q22, [x7, x4]\n"
+ "mov v25.16b, v14.16b\n fmla v25.4s, v8.4s, v20.4s\n"
+ "ldr q21, [x7, x28]\n"
+ "fmla v31.4s, v6.4s, v24.4s\n"
+ "fmla v30.4s, v4.4s, v24.4s\n"
+ "fmla v18.4s, v3.4s, v24.4s\n"
+ "mov v12.16b, v14.16b\n fmla v12.4s, v1.4s, v24.4s\n"
+ "fmla v14.4s, v0.4s, v24.4s\n"
+ "fmla v28.4s, v8.4s, v24.4s\n"
+ "fmla v27.4s, v5.4s, v24.4s\n"
+ "fmla v26.4s, v2.4s, v24.4s\n"
+ "ld1 { v24.4s }, [x15]\n"
+ "fmla v16.4s, v8.4s, v9.4s\n"
+ "fmla v23.4s, v1.4s, v22.4s\n"
+ "fmla v17.4s, v0.4s, v22.4s\n"
+ "ldr q22, [x15, x25]\n"
+ "fmla v19.4s, v2.4s, v21.4s\n"
+ "fmla v29.4s, v1.4s, v21.4s\n"
+ "ld1 { v20.4s }, [x9]\n"
+ "fmla v31.4s, v7.4s, v9.4s\n"
+ "fmla v11.4s, v6.4s, v9.4s\n"
+ "fmla v30.4s, v5.4s, v9.4s\n"
+ "fmla v18.4s, v4.4s, v9.4s\n"
+ "fmla v10.4s, v3.4s, v9.4s\n"
+ "fmla v12.4s, v2.4s, v9.4s\n"
+ "fmla v14.4s, v1.4s, v9.4s\n"
+ "fmla v25.4s, v0.4s, v9.4s\n"
+ "ldr q21, [x15, x17]\n"
+ "fmla v28.4s, v0.4s, v24.4s\n"
+ "fmla v27.4s, v6.4s, v20.4s\n"
+ "fmla v26.4s, v3.4s, v20.4s\n"
+ "ldr q20, [x9, x25]\n"
+ "fmla v16.4s, v1.4s, v21.4s\n"
+ "fmla v23.4s, v3.4s, v24.4s\n"
+ "fmla v29.4s, v5.4s, v22.4s\n"
+ "fmla v11.4s, v2.4s, v22.4s\n"
+ "ldr q22, [x15, x11]\n"
+ "fmla v17.4s, v4.4s, v21.4s\n"
+ "fmla v19.4s, v3.4s, v21.4s\n"
+ "fmla v31.4s, v0.4s, v21.4s\n"
+ "fmla v10.4s, v8.4s, v20.4s\n"
+ "fmla v25.4s, v5.4s, v20.4s\n"
+ "ldr q20, [x26, x4]\n"
+ "fmla v28.4s, v2.4s, v21.4s\n"
+ "fmla v16.4s, v2.4s, v22.4s\n"
+ "fmla v23.4s, v5.4s, v21.4s\n"
+ "ldr q21, [x14, x4]\n"
+ "fmla v17.4s, v5.4s, v22.4s\n"
+ "fmla v19.4s, v4.4s, v22.4s\n"
+ "fmla v29.4s, v3.4s, v22.4s\n"
+ "fmla v31.4s, v1.4s, v22.4s\n"
+ "fmla v11.4s, v0.4s, v22.4s\n"
+ "ldr q22, [x14, x28]\n"
+ "fmla v26.4s, v7.4s, v20.4s\n"
+ "fmla v12.4s, v6.4s, v20.4s\n"
+ "ldr q20, [x26, x28]\n"
+ "fmla v28.4s, v4.4s, v21.4s\n"
+ "fmla v16.4s, v3.4s, v21.4s\n"
+ "fmla v27.4s, v1.4s, v21.4s\n"
+ "fmla v30.4s, v0.4s, v21.4s\n"
+ "fmla v23.4s, v7.4s, v21.4s\n"
+ "fmla v17.4s, v6.4s, v21.4s\n"
+ "ldr q21, [x7, x17]\n"
+ "fmla v14.4s, v8.4s, v20.4s\n"
+ "fmla v25.4s, v7.4s, v20.4s\n"
+ "ldr q20, [x12, x4]\n"
+ "fmla v19.4s, v8.4s, v22.4s\n"
+ "fmla v29.4s, v7.4s, v22.4s\n"
+ "fmla v31.4s, v5.4s, v22.4s\n"
+ "fmla v11.4s, v4.4s, v22.4s\n"
+ "fmla v18.4s, v2.4s, v22.4s\n"
+ "fmla v10.4s, v1.4s, v22.4s\n"
+ "ldr q22, [x7, x11]\n"
+ "add x7, x7, #0x10\n"
+ "fmla v28.4s, v7.4s, v20.4s\n"
+ "fmla v16.4s, v6.4s, v20.4s\n"
+ "fmla v27.4s, v4.4s, v20.4s\n"
+ "fmla v30.4s, v3.4s, v20.4s\n"
+ "fmla v26.4s, v1.4s, v20.4s\n"
+ "fmla v12.4s, v0.4s, v20.4s\n"
+ "ldr q20, [x12, x28]\n"
+ "fmla v23.4s, v2.4s, v21.4s\n"
+ "fmla v17.4s, v1.4s, v21.4s\n"
+ "fmla v19.4s, v0.4s, v21.4s\n"
+ "ld1 { v21.4s }, [x14]\n"
+ "fmla v14.4s, v2.4s, v20.4s\n"
+ "fmla v29.4s, v0.4s, v22.4s\n"
+ "fmla v28.4s, v3.4s, v21.4s\n"
+ "fmla v27.4s, v0.4s, v21.4s\n"
+ "fmla v31.4s, v8.4s, v20.4s\n"
+ "fmla v11.4s, v7.4s, v20.4s\n"
+ "fmla v18.4s, v5.4s, v20.4s\n"
+ "fmla v10.4s, v4.4s, v20.4s\n"
+ "fmla v25.4s, v1.4s, v20.4s\n"
+ "ldr q24, [x9, x17]\n"
+ "fmla v17.4s, v2.4s, v22.4s\n"
+ "fmla v19.4s, v1.4s, v22.4s\n"
+ "ldr q20, [x14, x25]\n"
+ "add x14, x14, #0x10\n"
+ "fmla v23.4s, v6.4s, v21.4s\n"
+ "ld1 { v21.4s }, [x12]\n"
+ "fmla v12.4s, v4.4s, v24.4s\n"
+ "fmla v14.4s, v3.4s, v24.4s\n"
+ "fmla v29.4s, v8.4s, v20.4s\n"
+ "fmla v11.4s, v5.4s, v20.4s\n"
+ "fmla v10.4s, v2.4s, v20.4s\n"
+ "ldr q20, [x12, x25]\n"
+ "add x12, x12, #0x10\n"
+ "fmla v28.4s, v6.4s, v21.4s\n"
+ "fmla v27.4s, v3.4s, v21.4s\n"
+ "fmla v26.4s, v0.4s, v21.4s\n"
+ "ldr q22, [x26, x17]\n"
+ "fmla v25.4s, v2.4s, v20.4s\n"
+ "fmla v12.4s, v7.4s, v22.4s\n"
+ "fmla v14.4s, v6.4s, v22.4s\n"
+ "fmla v27.4s, v8.4s, v24.4s\n"
+ "fmla v30.4s, v7.4s, v24.4s\n"
+ "fmla v18.4s, v6.4s, v24.4s\n"
+ "fmla v26.4s, v5.4s, v24.4s\n"
+ "ldr q21, [x9, x11]\n"
+ "fmla v10.4s, v5.4s, v20.4s\n"
+ "fmla v12.4s, v5.4s, v21.4s\n"
+ "fmla v14.4s, v4.4s, v21.4s\n"
+ "fmla v25.4s, v3.4s, v21.4s\n"
+ "fmla v11.4s, v8.4s, v20.4s\n"
+ "ldr q20, [x26, x11]\n"
+ "fmla v26.4s, v8.4s, v22.4s\n"
+ "ldr q9, [x15, x4]\n"
+ "fmla v30.4s, v8.4s, v21.4s\n"
+ "fmla v18.4s, v7.4s, v21.4s\n"
+ "add x26, x26, #0x10\n"
+ "fmla v10.4s, v6.4s, v21.4s\n"
+ "ldr q21, [x15, x28]\n"
+ "fmla v12.4s, v8.4s, v20.4s\n"
+ "add x15, x15, #0x10\n"
+ "fmla v14.4s, v7.4s, v20.4s\n"
+ "fmla v25.4s, v6.4s, v20.4s\n"
+ "ldr q24, [x9, x4]\n"
+ "fmla v23.4s, v4.4s, v9.4s\n"
+ "fmla v17.4s, v3.4s, v9.4s\n"
+ "fmax v23.4s, v23.4s, v13.4s\n"
+ "fmla v28.4s, v1.4s, v9.4s\n"
+ "fmla v16.4s, v0.4s, v9.4s\n"
+ "ldr q0, [x9, x28]\n"
+ "fmax v17.4s, v17.4s, v13.4s\n"
+ "fmla v19.4s, v5.4s, v21.4s\n"
+ "fmla v29.4s, v4.4s, v21.4s\n"
+ "fmax v19.4s, v19.4s, v13.4s\n"
+ "add x9, x9, #0x10\n"
+ "fmla v31.4s, v2.4s, v21.4s\n"
+ "fmla v11.4s, v1.4s, v21.4s\n"
+ "fmax v29.4s, v29.4s, v13.4s\n"
+ "fmla v27.4s, v7.4s, v24.4s\n"
+ "fmla v30.4s, v6.4s, v24.4s\n"
+ "fmax v28.4s, v28.4s, v13.4s\n"
+ "fmla v26.4s, v4.4s, v24.4s\n"
+ "fmla v12.4s, v3.4s, v24.4s\n"
+ "fmax v16.4s, v16.4s, v13.4s\n"
+ "fmla v18.4s, v8.4s, v0.4s\n"
+ "fmla v10.4s, v7.4s, v0.4s\n"
+ "fmax v31.4s, v31.4s, v13.4s\n"
+ "fmla v14.4s, v5.4s, v0.4s\n"
+ "fmla v25.4s, v4.4s, v0.4s\n"
+ "fmax v11.4s, v11.4s, v13.4s\n"
+ "fmax v27.4s, v27.4s, v13.4s\n"
+ "fmax v30.4s, v30.4s, v13.4s\n"
+ "fmax v18.4s, v18.4s, v13.4s\n"
+ "fmax v10.4s, v10.4s, v13.4s\n"
+ "fmax v26.4s, v26.4s, v13.4s\n"
+ "fmax v12.4s, v12.4s, v13.4s\n"
+ "fmax v14.4s, v14.4s, v13.4s\n"
+ "fmax v25.4s, v25.4s, v13.4s\n"
+ "fmin v23.4s, v23.4s, v15.4s\n"
+ "fmin v17.4s, v17.4s, v15.4s\n"
+ "st1 { v23.4s }, [x8]\n"
+ "fmin v19.4s, v19.4s, v15.4s\n"
+ "fmin v29.4s, v29.4s, v15.4s\n"
+ "str q17, [x8, x5]\n"
+ "fmin v28.4s, v28.4s, v15.4s\n"
+ "fmin v16.4s, v16.4s, v15.4s\n"
+ "str q19, [x8, x23]\n"
+ "fmin v31.4s, v31.4s, v15.4s\n"
+ "fmin v11.4s, v11.4s, v15.4s\n"
+ "str q29, [x8, x22]\n"
+ "add x8, x8, #0x10\n"
+ "fmin v27.4s, v27.4s, v15.4s\n"
+ "fmin v30.4s, v30.4s, v15.4s\n"
+ "st1 { v28.4s }, [x10]\n"
+ "fmin v18.4s, v18.4s, v15.4s\n"
+ "fmin v10.4s, v10.4s, v15.4s\n"
+ "str q16, [x10, x5]\n"
+ "fmin v26.4s, v26.4s, v15.4s\n"
+ "fmin v12.4s, v12.4s, v15.4s\n"
+ "str q31, [x10, x23]\n"
+ "fmin v14.4s, v14.4s, v15.4s\n"
+ "fmin v25.4s, v25.4s, v15.4s\n"
+ "str q11, [x10, x22]\n"
+ "add x10, x10, #0x10\n"
+ "st1 { v27.4s }, [x27]\n"
+ "str q30, [x27, x5]\n"
+ "str q18, [x27, x23]\n"
+ "str q10, [x27, x22]\n"
+ "add x27, x27, #0x10\n"
+ "st1 { v26.4s }, [x24]\n"
+ "str q12, [x24, x5]\n"
+ "str q14, [x24, x23]\n"
+ "str q25, [x24, x22]\n"
+ "add x24, x24, #0x10\n"
+ "4:" // Tile loop: Oddments
+ "tst %x[n_channels], #0x3\n"
+ "beq 73f\n"
+ "ldr q14, [x16, #0x0]\n"
+ "ldr q0, [x16, #0x10]\n"
+ "add x23, x14, x17\n"
+ "add x22, x7, XZR\n"
+ "ldr q1, [x16, #0x20]\n"
+ "ldr q2, [x16, #0x30]\n"
+ "add x21, x7, x25\n"
+ "add x20, x14, x11\n"
+ "ldr q3, [x16, #0x40]\n"
+ "ldr q4, [x16, #0x50]\n"
+ "ldr q5, [x16, #0x60]\n"
+ "ldr q6, [x16, #0x70]\n"
+ "ldr q7, [x16, #0x80]\n"
+ "ldr q8, [x16, #0x90]\n"
+ "tbz %x[n_channels], #1, 5f\n"
+ "ldr d9, [x23], #0x8\n"
+ "ldr d10, [x22], #0x8\n"
+ "ldr d11, [x21], #0x8\n"
+ "ldr d12, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 6f\n"
+ "ld1 { v9.s }[2], [x23]\n"
+ "ld1 { v10.s }[2], [x22]\n"
+ "ld1 { v11.s }[2], [x21]\n"
+ "ld1 { v12.s }[2], [x20]\n"
+ "b 6f\n"
+ "5:" // Tile loop: Oddments: Load inputs: (2, 2), (0, 0), (0, 5), (2, 3): Bit 1: Unset
+ "ldr s9, [x23, #0x0]\n"
+ "ldr s10, [x22, #0x0]\n"
+ "ldr s11, [x21, #0x0]\n"
+ "ldr s12, [x20, #0x0]\n"
+ "6:" // Tile loop: Oddments: Load inputs: (2, 2), (0, 0), (0, 5), (2, 3): Bit 1: End
+ "mov v16.16b, v14.16b\n fmla v16.4s, v8.4s, v9.4s\n"
+ "mov v17.16b, v14.16b\n fmla v17.4s, v7.4s, v9.4s\n"
+ "add x20, x26, XZR\n"
+ "mov v18.16b, v14.16b\n fmla v18.4s, v6.4s, v9.4s\n"
+ "mov v21.16b, v14.16b\n fmla v21.4s, v4.4s, v9.4s\n"
+ "mov v22.16b, v14.16b\n fmla v22.4s, v3.4s, v9.4s\n"
+ "mov v25.16b, v14.16b\n fmla v25.4s, v1.4s, v9.4s\n"
+ "mov v26.16b, v14.16b\n fmla v26.4s, v0.4s, v9.4s\n"
+ "mov v19.16b, v14.16b\n fmla v19.4s, v2.4s, v11.4s\n"
+ "mov v20.16b, v14.16b\n fmla v20.4s, v5.4s, v9.4s\n"
+ "mov v24.16b, v14.16b\n fmla v24.4s, v2.4s, v9.4s\n"
+ "fmla v16.4s, v0.4s, v10.4s\n"
+ "fmla v17.4s, v8.4s, v12.4s\n"
+ "fmla v18.4s, v7.4s, v12.4s\n"
+ "fmla v19.4s, v6.4s, v12.4s\n"
+ "fmla v21.4s, v5.4s, v12.4s\n"
+ "fmla v22.4s, v4.4s, v12.4s\n"
+ "mov v23.16b, v14.16b\n fmla v23.4s, v3.4s, v12.4s\n"
+ "fmla v25.4s, v2.4s, v12.4s\n"
+ "fmla v26.4s, v1.4s, v12.4s\n"
+ "mov v27.16b, v14.16b\n fmla v27.4s, v0.4s, v12.4s\n"
+ "tbz %x[n_channels], #1, 7f\n"
+ "ldr d10, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 8f\n"
+ "ld1 { v10.s }[2], [x20]\n"
+ "b 8f\n"
+ "7:" // Tile loop: Oddments: Load inputs: (5, 0): Bit 1: Unset
+ "ldr s10, [x20, #0x0]\n"
+ "8:" // Tile loop: Oddments: Load inputs: (5, 0): Bit 1: End
+ "mov v28.16b, v14.16b\n fmla v28.4s, v6.4s, v10.4s\n"
+ "add x20, x26, x25\n"
+ "tbz %x[n_channels], #1, 9f\n"
+ "ldr d11, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 10f\n"
+ "ld1 { v11.s }[2], [x20]\n"
+ "b 10f\n"
+ "9:" // Tile loop: Oddments: Load inputs: (5, 5): Bit 1: Unset
+ "ldr s11, [x20, #0x0]\n"
+ "10:" // Tile loop: Oddments: Load inputs: (5, 5): Bit 1: End
+ "mov v31.16b, v14.16b\n fmla v31.4s, v8.4s, v11.4s\n"
+ "add x20, x12, x17\n"
+ "tbz %x[n_channels], #1, 11f\n"
+ "ldr d9, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 12f\n"
+ "ld1 { v9.s }[2], [x20]\n"
+ "b 12f\n"
+ "11:" // Tile loop: Oddments: Load inputs: (3, 2): Bit 1: Unset
+ "ldr s9, [x20, #0x0]\n"
+ "12:" // Tile loop: Oddments: Load inputs: (3, 2): Bit 1: End
+ "fmla v20.4s, v8.4s, v9.4s\n"
+ "fmla v21.4s, v7.4s, v9.4s\n"
+ "add x20, x7, x4\n"
+ "fmla v22.4s, v6.4s, v9.4s\n"
+ "fmla v24.4s, v5.4s, v9.4s\n"
+ "fmla v25.4s, v4.4s, v9.4s\n"
+ "fmla v26.4s, v3.4s, v9.4s\n"
+ "fmla v28.4s, v2.4s, v9.4s\n"
+ "mov v29.16b, v14.16b\n fmla v29.4s, v1.4s, v9.4s\n"
+ "mov v30.16b, v14.16b\n fmla v30.4s, v0.4s, v9.4s\n"
+ "tbz %x[n_channels], #1, 13f\n"
+ "ldr d12, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 14f\n"
+ "ld1 { v12.s }[2], [x20]\n"
+ "b 14f\n"
+ "13:" // Tile loop: Oddments: Load inputs: (0, 1): Bit 1: Unset
+ "ldr s12, [x20, #0x0]\n"
+ "14:" // Tile loop: Oddments: Load inputs: (0, 1): Bit 1: End
+ "fmla v16.4s, v1.4s, v12.4s\n"
+ "fmla v17.4s, v0.4s, v12.4s\n"
+ "add x20, x7, x28\n"
+ "tbz %x[n_channels], #1, 15f\n"
+ "ldr d11, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 16f\n"
+ "ld1 { v11.s }[2], [x20]\n"
+ "b 16f\n"
+ "15:" // Tile loop: Oddments: Load inputs: (0, 4): Bit 1: Unset
+ "ldr s11, [x20, #0x0]\n"
+ "16:" // Tile loop: Oddments: Load inputs: (0, 4): Bit 1: End
+ "fmla v18.4s, v2.4s, v11.4s\n"
+ "fmla v19.4s, v1.4s, v11.4s\n"
+ "add x20, x12, x11\n"
+ "tbz %x[n_channels], #1, 17f\n"
+ "ldr d10, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 18f\n"
+ "ld1 { v10.s }[2], [x20]\n"
+ "b 18f\n"
+ "17:" // Tile loop: Oddments: Load inputs: (3, 3): Bit 1: Unset
+ "ldr s10, [x20, #0x0]\n"
+ "18:" // Tile loop: Oddments: Load inputs: (3, 3): Bit 1: End
+ "fmla v21.4s, v8.4s, v10.4s\n"
+ "fmla v22.4s, v7.4s, v10.4s\n"
+ "add x20, x15, XZR\n"
+ "fmla v23.4s, v6.4s, v10.4s\n"
+ "fmla v25.4s, v5.4s, v10.4s\n"
+ "fmla v26.4s, v4.4s, v10.4s\n"
+ "fmla v27.4s, v3.4s, v10.4s\n"
+ "fmla v29.4s, v2.4s, v10.4s\n"
+ "fmla v30.4s, v1.4s, v10.4s\n"
+ "fmla v31.4s, v0.4s, v10.4s\n"
+ "tbz %x[n_channels], #1, 19f\n"
+ "ldr d9, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 20f\n"
+ "ld1 { v9.s }[2], [x20]\n"
+ "b 20f\n"
+ "19:" // Tile loop: Oddments: Load inputs: (1, 0): Bit 1: Unset
+ "ldr s9, [x20, #0x0]\n"
+ "20:" // Tile loop: Oddments: Load inputs: (1, 0): Bit 1: End
+ "fmla v16.4s, v3.4s, v9.4s\n"
+ "fmla v20.4s, v0.4s, v9.4s\n"
+ "add x20, x15, x25\n"
+ "tbz %x[n_channels], #1, 21f\n"
+ "ldr d12, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 22f\n"
+ "ld1 { v12.s }[2], [x20]\n"
+ "b 22f\n"
+ "21:" // Tile loop: Oddments: Load inputs: (1, 5): Bit 1: Unset
+ "ldr s12, [x20, #0x0]\n"
+ "22:" // Tile loop: Oddments: Load inputs: (1, 5): Bit 1: End
+ "fmla v19.4s, v5.4s, v12.4s\n"
+ "fmla v23.4s, v2.4s, v12.4s\n"
+ "add x20, x9, XZR\n"
+ "tbz %x[n_channels], #1, 23f\n"
+ "ldr d11, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 24f\n"
+ "ld1 { v11.s }[2], [x20]\n"
+ "b 24f\n"
+ "23:" // Tile loop: Oddments: Load inputs: (4, 0): Bit 1: Unset
+ "ldr s11, [x20, #0x0]\n"
+ "24:" // Tile loop: Oddments: Load inputs: (4, 0): Bit 1: End
+ "fmla v24.4s, v6.4s, v11.4s\n"
+ "fmla v28.4s, v3.4s, v11.4s\n"
+ "add x20, x15, x17\n"
+ "tbz %x[n_channels], #1, 25f\n"
+ "ldr d10, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 26f\n"
+ "ld1 { v10.s }[2], [x20]\n"
+ "b 26f\n"
+ "25:" // Tile loop: Oddments: Load inputs: (1, 2): Bit 1: Unset
+ "ldr s10, [x20, #0x0]\n"
+ "26:" // Tile loop: Oddments: Load inputs: (1, 2): Bit 1: End
+ "fmla v16.4s, v5.4s, v10.4s\n"
+ "fmla v17.4s, v4.4s, v10.4s\n"
+ "add x20, x9, x25\n"
+ "fmla v18.4s, v3.4s, v10.4s\n"
+ "fmla v20.4s, v2.4s, v10.4s\n"
+ "fmla v21.4s, v1.4s, v10.4s\n"
+ "fmla v22.4s, v0.4s, v10.4s\n"
+ "tbz %x[n_channels], #1, 27f\n"
+ "ldr d11, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 28f\n"
+ "ld1 { v11.s }[2], [x20]\n"
+ "b 28f\n"
+ "27:" // Tile loop: Oddments: Load inputs: (4, 5): Bit 1: Unset
+ "ldr s11, [x20, #0x0]\n"
+ "28:" // Tile loop: Oddments: Load inputs: (4, 5): Bit 1: End
+ "fmla v27.4s, v8.4s, v11.4s\n"
+ "fmla v31.4s, v5.4s, v11.4s\n"
+ "add x20, x15, x11\n"
+ "tbz %x[n_channels], #1, 29f\n"
+ "ldr d12, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 30f\n"
+ "ld1 { v12.s }[2], [x20]\n"
+ "b 30f\n"
+ "29:" // Tile loop: Oddments: Load inputs: (1, 3): Bit 1: Unset
+ "ldr s12, [x20, #0x0]\n"
+ "30:" // Tile loop: Oddments: Load inputs: (1, 3): Bit 1: End
+ "fmla v17.4s, v5.4s, v12.4s\n"
+ "fmla v18.4s, v4.4s, v12.4s\n"
+ "add x20, x26, x4\n"
+ "fmla v19.4s, v3.4s, v12.4s\n"
+ "fmla v21.4s, v2.4s, v12.4s\n"
+ "fmla v22.4s, v1.4s, v12.4s\n"
+ "fmla v23.4s, v0.4s, v12.4s\n"
+ "tbz %x[n_channels], #1, 31f\n"
+ "ldr d11, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 32f\n"
+ "ld1 { v11.s }[2], [x20]\n"
+ "b 32f\n"
+ "31:" // Tile loop: Oddments: Load inputs: (5, 1): Bit 1: Unset
+ "ldr s11, [x20, #0x0]\n"
+ "32:" // Tile loop: Oddments: Load inputs: (5, 1): Bit 1: End
+ "fmla v28.4s, v7.4s, v11.4s\n"
+ "fmla v29.4s, v6.4s, v11.4s\n"
+ "add x20, x14, x4\n"
+ "tbz %x[n_channels], #1, 33f\n"
+ "ldr d10, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 34f\n"
+ "ld1 { v10.s }[2], [x20]\n"
+ "b 34f\n"
+ "33:" // Tile loop: Oddments: Load inputs: (2, 1): Bit 1: Unset
+ "ldr s10, [x20, #0x0]\n"
+ "34:" // Tile loop: Oddments: Load inputs: (2, 1): Bit 1: End
+ "fmla v16.4s, v7.4s, v10.4s\n"
+ "fmla v17.4s, v6.4s, v10.4s\n"
+ "add x20, x26, x28\n"
+ "fmla v20.4s, v4.4s, v10.4s\n"
+ "fmla v21.4s, v3.4s, v10.4s\n"
+ "fmla v24.4s, v1.4s, v10.4s\n"
+ "fmla v25.4s, v0.4s, v10.4s\n"
+ "tbz %x[n_channels], #1, 35f\n"
+ "ldr d11, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 36f\n"
+ "ld1 { v11.s }[2], [x20]\n"
+ "b 36f\n"
+ "35:" // Tile loop: Oddments: Load inputs: (5, 4): Bit 1: Unset
+ "ldr s11, [x20, #0x0]\n"
+ "36:" // Tile loop: Oddments: Load inputs: (5, 4): Bit 1: End
+ "fmla v30.4s, v8.4s, v11.4s\n"
+ "fmla v31.4s, v7.4s, v11.4s\n"
+ "add x20, x14, x28\n"
+ "tbz %x[n_channels], #1, 37f\n"
+ "ldr d12, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 38f\n"
+ "ld1 { v12.s }[2], [x20]\n"
+ "b 38f\n"
+ "37:" // Tile loop: Oddments: Load inputs: (2, 4): Bit 1: Unset
+ "ldr s12, [x20, #0x0]\n"
+ "38:" // Tile loop: Oddments: Load inputs: (2, 4): Bit 1: End
+ "fmla v18.4s, v8.4s, v12.4s\n"
+ "fmla v19.4s, v7.4s, v12.4s\n"
+ "add x20, x7, x17\n"
+ "fmla v22.4s, v5.4s, v12.4s\n"
+ "fmla v23.4s, v4.4s, v12.4s\n"
+ "fmla v26.4s, v2.4s, v12.4s\n"
+ "fmla v27.4s, v1.4s, v12.4s\n"
+ "tbz %x[n_channels], #1, 39f\n"
+ "ldr d10, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 40f\n"
+ "ld1 { v10.s }[2], [x20]\n"
+ "b 40f\n"
+ "39:" // Tile loop: Oddments: Load inputs: (0, 2): Bit 1: Unset
+ "ldr s10, [x20, #0x0]\n"
+ "40:" // Tile loop: Oddments: Load inputs: (0, 2): Bit 1: End
+ "fmla v16.4s, v2.4s, v10.4s\n"
+ "fmla v17.4s, v1.4s, v10.4s\n"
+ "add x20, x12, x4\n"
+ "fmla v18.4s, v0.4s, v10.4s\n"
+ "tbz %x[n_channels], #1, 41f\n"
+ "ldr d11, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 42f\n"
+ "ld1 { v11.s }[2], [x20]\n"
+ "b 42f\n"
+ "41:" // Tile loop: Oddments: Load inputs: (3, 1): Bit 1: Unset
+ "ldr s11, [x20, #0x0]\n"
+ "42:" // Tile loop: Oddments: Load inputs: (3, 1): Bit 1: End
+ "fmla v20.4s, v7.4s, v11.4s\n"
+ "fmla v21.4s, v6.4s, v11.4s\n"
+ "add x20, x7, x11\n"
+ "fmla v24.4s, v4.4s, v11.4s\n"
+ "fmla v25.4s, v3.4s, v11.4s\n"
+ "fmla v28.4s, v1.4s, v11.4s\n"
+ "fmla v29.4s, v0.4s, v11.4s\n"
+ "tbz %x[n_channels], #1, 43f\n"
+ "ldr d12, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 44f\n"
+ "ld1 { v12.s }[2], [x20]\n"
+ "b 44f\n"
+ "43:" // Tile loop: Oddments: Load inputs: (0, 3): Bit 1: Unset
+ "ldr s12, [x20, #0x0]\n"
+ "44:" // Tile loop: Oddments: Load inputs: (0, 3): Bit 1: End
+ "fmla v17.4s, v2.4s, v12.4s\n"
+ "fmla v18.4s, v1.4s, v12.4s\n"
+ "add x20, x14, XZR\n"
+ "fmla v19.4s, v0.4s, v12.4s\n"
+ "tbz %x[n_channels], #1, 45f\n"
+ "ldr d10, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 46f\n"
+ "ld1 { v10.s }[2], [x20]\n"
+ "b 46f\n"
+ "45:" // Tile loop: Oddments: Load inputs: (2, 0): Bit 1: Unset
+ "ldr s10, [x20, #0x0]\n"
+ "46:" // Tile loop: Oddments: Load inputs: (2, 0): Bit 1: End
+ "fmla v16.4s, v6.4s, v10.4s\n"
+ "fmla v20.4s, v3.4s, v10.4s\n"
+ "add x20, x12, x28\n"
+ "fmla v24.4s, v0.4s, v10.4s\n"
+ "tbz %x[n_channels], #1, 47f\n"
+ "ldr d11, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 48f\n"
+ "ld1 { v11.s }[2], [x20]\n"
+ "b 48f\n"
+ "47:" // Tile loop: Oddments: Load inputs: (3, 4): Bit 1: Unset
+ "ldr s11, [x20, #0x0]\n"
+ "48:" // Tile loop: Oddments: Load inputs: (3, 4): Bit 1: End
+ "fmla v22.4s, v8.4s, v11.4s\n"
+ "fmla v23.4s, v7.4s, v11.4s\n"
+ "add x20, x14, x25\n"
+ "fmla v26.4s, v5.4s, v11.4s\n"
+ "fmla v27.4s, v4.4s, v11.4s\n"
+ "fmla v30.4s, v2.4s, v11.4s\n"
+ "fmla v31.4s, v1.4s, v11.4s\n"
+ "tbz %x[n_channels], #1, 49f\n"
+ "ldr d12, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 50f\n"
+ "ld1 { v12.s }[2], [x20]\n"
+ "b 50f\n"
+ "49:" // Tile loop: Oddments: Load inputs: (2, 5): Bit 1: Unset
+ "ldr s12, [x20, #0x0]\n"
+ "50:" // Tile loop: Oddments: Load inputs: (2, 5): Bit 1: End
+ "fmla v19.4s, v8.4s, v12.4s\n"
+ "fmla v23.4s, v5.4s, v12.4s\n"
+ "add x20, x12, XZR\n"
+ "fmla v27.4s, v2.4s, v12.4s\n"
+ "tbz %x[n_channels], #1, 51f\n"
+ "ldr d10, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 52f\n"
+ "ld1 { v10.s }[2], [x20]\n"
+ "b 52f\n"
+ "51:" // Tile loop: Oddments: Load inputs: (3, 0): Bit 1: Unset
+ "ldr s10, [x20, #0x0]\n"
+ "52:" // Tile loop: Oddments: Load inputs: (3, 0): Bit 1: End
+ "fmla v20.4s, v6.4s, v10.4s\n"
+ "fmla v24.4s, v3.4s, v10.4s\n"
+ "add x20, x9, x17\n"
+ "fmla v28.4s, v0.4s, v10.4s\n"
+ "tbz %x[n_channels], #1, 53f\n"
+ "ldr d11, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 54f\n"
+ "ld1 { v11.s }[2], [x20]\n"
+ "b 54f\n"
+ "53:" // Tile loop: Oddments: Load inputs: (4, 2): Bit 1: Unset
+ "ldr s11, [x20, #0x0]\n"
+ "54:" // Tile loop: Oddments: Load inputs: (4, 2): Bit 1: End
+ "fmla v24.4s, v8.4s, v11.4s\n"
+ "fmla v25.4s, v7.4s, v11.4s\n"
+ "add x20, x12, x25\n"
+ "fmla v26.4s, v6.4s, v11.4s\n"
+ "fmla v28.4s, v5.4s, v11.4s\n"
+ "fmla v29.4s, v4.4s, v11.4s\n"
+ "fmla v30.4s, v3.4s, v11.4s\n"
+ "tbz %x[n_channels], #1, 55f\n"
+ "ldr d12, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 56f\n"
+ "ld1 { v12.s }[2], [x20]\n"
+ "b 56f\n"
+ "55:" // Tile loop: Oddments: Load inputs: (3, 5): Bit 1: Unset
+ "ldr s12, [x20, #0x0]\n"
+ "56:" // Tile loop: Oddments: Load inputs: (3, 5): Bit 1: End
+ "fmla v23.4s, v8.4s, v12.4s\n"
+ "fmla v27.4s, v5.4s, v12.4s\n"
+ "add x20, x26, x17\n"
+ "fmla v31.4s, v2.4s, v12.4s\n"
+ "tbz %x[n_channels], #1, 57f\n"
+ "ldr d10, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 58f\n"
+ "ld1 { v10.s }[2], [x20]\n"
+ "b 58f\n"
+ "57:" // Tile loop: Oddments: Load inputs: (5, 2): Bit 1: Unset
+ "ldr s10, [x20, #0x0]\n"
+ "58:" // Tile loop: Oddments: Load inputs: (5, 2): Bit 1: End
+ "fmla v28.4s, v8.4s, v10.4s\n"
+ "fmla v29.4s, v7.4s, v10.4s\n"
+ "add x20, x9, x11\n"
+ "fmla v30.4s, v6.4s, v10.4s\n"
+ "tbz %x[n_channels], #1, 59f\n"
+ "ldr d11, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 60f\n"
+ "ld1 { v11.s }[2], [x20]\n"
+ "b 60f\n"
+ "59:" // Tile loop: Oddments: Load inputs: (4, 3): Bit 1: Unset
+ "ldr s11, [x20, #0x0]\n"
+ "60:" // Tile loop: Oddments: Load inputs: (4, 3): Bit 1: End
+ "fmla v25.4s, v8.4s, v11.4s\n"
+ "fmla v26.4s, v7.4s, v11.4s\n"
+ "add x20, x26, x11\n"
+ "fmla v27.4s, v6.4s, v11.4s\n"
+ "fmla v29.4s, v5.4s, v11.4s\n"
+ "fmla v30.4s, v4.4s, v11.4s\n"
+ "fmla v31.4s, v3.4s, v11.4s\n"
+ "tbz %x[n_channels], #1, 61f\n"
+ "ldr d12, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 62f\n"
+ "ld1 { v12.s }[2], [x20]\n"
+ "b 62f\n"
+ "61:" // Tile loop: Oddments: Load inputs: (5, 3): Bit 1: Unset
+ "ldr s12, [x20, #0x0]\n"
+ "62:" // Tile loop: Oddments: Load inputs: (5, 3): Bit 1: End
+ "fmla v29.4s, v8.4s, v12.4s\n"
+ "fmla v30.4s, v7.4s, v12.4s\n"
+ "add x20, x15, x4\n"
+ "fmla v31.4s, v6.4s, v12.4s\n"
+ "tbz %x[n_channels], #1, 63f\n"
+ "ldr d10, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 64f\n"
+ "ld1 { v10.s }[2], [x20]\n"
+ "b 64f\n"
+ "63:" // Tile loop: Oddments: Load inputs: (1, 1): Bit 1: Unset
+ "ldr s10, [x20, #0x0]\n"
+ "64:" // Tile loop: Oddments: Load inputs: (1, 1): Bit 1: End
+ "fmla v16.4s, v4.4s, v10.4s\n"
+ "fmla v17.4s, v3.4s, v10.4s\n"
+ "add x20, x15, x28\n"
+ "fmla v20.4s, v1.4s, v10.4s\n"
+ "fmla v21.4s, v0.4s, v10.4s\n"
+ "tbz %x[n_channels], #1, 65f\n"
+ "ldr d11, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 66f\n"
+ "ld1 { v11.s }[2], [x20]\n"
+ "b 66f\n"
+ "65:" // Tile loop: Oddments: Load inputs: (1, 4): Bit 1: Unset
+ "ldr s11, [x20, #0x0]\n"
+ "66:" // Tile loop: Oddments: Load inputs: (1, 4): Bit 1: End
+ "fmla v18.4s, v5.4s, v11.4s\n"
+ "fmla v19.4s, v4.4s, v11.4s\n"
+ "add x20, x9, x4\n"
+ "fmla v22.4s, v2.4s, v11.4s\n"
+ "fmla v23.4s, v1.4s, v11.4s\n"
+ "tbz %x[n_channels], #1, 67f\n"
+ "ldr d12, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 68f\n"
+ "ld1 { v12.s }[2], [x20]\n"
+ "b 68f\n"
+ "67:" // Tile loop: Oddments: Load inputs: (4, 1): Bit 1: Unset
+ "ldr s12, [x20, #0x0]\n"
+ "68:" // Tile loop: Oddments: Load inputs: (4, 1): Bit 1: End
+ "fmla v24.4s, v7.4s, v12.4s\n"
+ "fmla v25.4s, v6.4s, v12.4s\n"
+ "add x20, x9, x28\n"
+ "fmla v28.4s, v4.4s, v12.4s\n"
+ "fmla v29.4s, v3.4s, v12.4s\n"
+ "tbz %x[n_channels], #1, 69f\n"
+ "ldr d10, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 70f\n"
+ "ld1 { v10.s }[2], [x20]\n"
+ "b 70f\n"
+ "69:" // Tile loop: Oddments: Load inputs: (4, 4): Bit 1: Unset
+ "ldr s10, [x20, #0x0]\n"
+ "70:" // Tile loop: Oddments: Load inputs: (4, 4): Bit 1: End
+ "fmla v26.4s, v8.4s, v10.4s\n"
+ "fmla v27.4s, v7.4s, v10.4s\n"
+ "fmax v16.4s, v16.4s, v13.4s\n"
+ "fmla v30.4s, v5.4s, v10.4s\n"
+ "fmla v31.4s, v4.4s, v10.4s\n"
+ "fmax v17.4s, v17.4s, v13.4s\n"
+ "fmax v18.4s, v18.4s, v13.4s\n"
+ "fmax v19.4s, v19.4s, v13.4s\n"
+ "fmax v20.4s, v20.4s, v13.4s\n"
+ "fmax v21.4s, v21.4s, v13.4s\n"
+ "fmax v22.4s, v22.4s, v13.4s\n"
+ "fmax v23.4s, v23.4s, v13.4s\n"
+ "fmax v24.4s, v24.4s, v13.4s\n"
+ "fmax v25.4s, v25.4s, v13.4s\n"
+ "fmax v26.4s, v26.4s, v13.4s\n"
+ "fmax v27.4s, v27.4s, v13.4s\n"
+ "fmax v28.4s, v28.4s, v13.4s\n"
+ "fmax v29.4s, v29.4s, v13.4s\n"
+ "fmax v30.4s, v30.4s, v13.4s\n"
+ "fmax v31.4s, v31.4s, v13.4s\n"
+ "fmin v16.4s, v16.4s, v15.4s\n"
+ "fmin v17.4s, v17.4s, v15.4s\n"
+ "fmin v18.4s, v18.4s, v15.4s\n"
+ "fmin v19.4s, v19.4s, v15.4s\n"
+ "fmin v20.4s, v20.4s, v15.4s\n"
+ "fmin v21.4s, v21.4s, v15.4s\n"
+ "fmin v22.4s, v22.4s, v15.4s\n"
+ "fmin v23.4s, v23.4s, v15.4s\n"
+ "fmin v24.4s, v24.4s, v15.4s\n"
+ "fmin v25.4s, v25.4s, v15.4s\n"
+ "fmin v26.4s, v26.4s, v15.4s\n"
+ "fmin v27.4s, v27.4s, v15.4s\n"
+ "fmin v28.4s, v28.4s, v15.4s\n"
+ "fmin v29.4s, v29.4s, v15.4s\n"
+ "fmin v30.4s, v30.4s, v15.4s\n"
+ "fmin v31.4s, v31.4s, v15.4s\n"
+ "tbz %x[n_channels], #1, 71f\n"
+ "mov x23, x8\n"
+ "mov x22, x10\n"
+ "st1 { v16.d }[0], [x23], x5\n"
+ "mov x21, x27\n"
+ "mov x20, x24\n"
+ "st1 { v20.d }[0], [x22], x5\n"
+ "st1 { v24.d }[0], [x21], x5\n"
+ "add x8, x8, #0x8\n"
+ "add x10, x10, #0x8\n"
+ "st1 { v28.d }[0], [x20], x5\n"
+ "add x27, x27, #0x8\n"
+ "add x24, x24, #0x8\n"
+ "st1 { v17.d }[0], [x23], x5\n"
+ "st1 { v21.d }[0], [x22], x5\n"
+ "st1 { v25.d }[0], [x21], x5\n"
+ "st1 { v29.d }[0], [x20], x5\n"
+ "st1 { v18.d }[0], [x23], x5\n"
+ "st1 { v22.d }[0], [x22], x5\n"
+ "st1 { v26.d }[0], [x21], x5\n"
+ "st1 { v30.d }[0], [x20], x5\n"
+ "st1 { v19.d }[0], [x23]\n"
+ "st1 { v23.d }[0], [x22]\n"
+ "st1 { v27.d }[0], [x21]\n"
+ "st1 { v31.d }[0], [x20]\n"
+ "tbz %x[n_channels], #0, 72f\n"
+ "mov x23, x8\n"
+ "mov x22, x10\n"
+ "st1 { v16.s }[2], [x23], x5\n"
+ "mov x21, x27\n"
+ "mov x20, x24\n"
+ "st1 { v20.s }[2], [x22], x5\n"
+ "st1 { v24.s }[2], [x21], x5\n"
+ "st1 { v28.s }[2], [x20], x5\n"
+ "st1 { v17.s }[2], [x23], x5\n"
+ "st1 { v21.s }[2], [x22], x5\n"
+ "st1 { v25.s }[2], [x21], x5\n"
+ "st1 { v29.s }[2], [x20], x5\n"
+ "st1 { v18.s }[2], [x23], x5\n"
+ "st1 { v22.s }[2], [x22], x5\n"
+ "st1 { v26.s }[2], [x21], x5\n"
+ "st1 { v30.s }[2], [x20], x5\n"
+ "st1 { v19.s }[2], [x23]\n"
+ "st1 { v23.s }[2], [x22]\n"
+ "st1 { v27.s }[2], [x21]\n"
+ "st1 { v31.s }[2], [x20]\n"
+ "b 72f\n"
+ "71:" // Tile loop: Oddments: Store: Bit 1: Unset
+ "mov x23, x8\n"
+ "mov x22, x10\n"
+ "st1 { v16.s }[0], [x23], x5\n"
+ "mov x21, x27\n"
+ "mov x20, x24\n"
+ "st1 { v20.s }[0], [x22], x5\n"
+ "st1 { v24.s }[0], [x21], x5\n"
+ "st1 { v28.s }[0], [x20], x5\n"
+ "st1 { v17.s }[0], [x23], x5\n"
+ "st1 { v21.s }[0], [x22], x5\n"
+ "st1 { v25.s }[0], [x21], x5\n"
+ "st1 { v29.s }[0], [x20], x5\n"
+ "st1 { v18.s }[0], [x23], x5\n"
+ "st1 { v22.s }[0], [x22], x5\n"
+ "st1 { v26.s }[0], [x21], x5\n"
+ "st1 { v30.s }[0], [x20], x5\n"
+ "st1 { v19.s }[0], [x23]\n"
+ "st1 { v23.s }[0], [x22]\n"
+ "st1 { v27.s }[0], [x21]\n"
+ "st1 { v31.s }[0], [x20]\n"
+ "72:" // Tile loop: Oddments: Store: Bit 1: End
+ "73:" // Tile loop: End
+ "ldr x26, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "ldr x27, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "add x26, x26, #0x1\n"
+ "add x21, x27, #0x1\n"
+ "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+ "cmp x26, x20\n"
+ "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
+ "csel x27, x27, x21, LT\n"
+ "csel x26, x26, XZR, LT\n"
+ "cmp x27, x20\n"
+ "blt 1b\n"
+ :
+ : [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp
new file mode 100644
index 0000000000..76045f30d6
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp
@@ -0,0 +1,1397 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
+ const float *const *const input_ptrs,
+ float *const *const outptrs,
+ const void *params,
+ unsigned int n_channels,
+ const float activation_min,
+ const float activation_max
+)
+{
+ struct Args
+ {
+ float *const *outptrs;
+ const void *params;
+ const float min, max;
+ const float *inptrs[36];
+
+ Args(
+ const float *const *const input_ptrs,
+ float *const *const outptrs,
+ const void *const params,
+ const float min,
+ const float max
+ ) : outptrs(outptrs), params(params), min(min), max(max)
+ {
+ inptrs[0] = input_ptrs[14];
+ inptrs[1] = input_ptrs[0];
+ inptrs[2] = input_ptrs[5];
+ inptrs[3] = input_ptrs[15];
+ inptrs[4] = input_ptrs[30];
+ inptrs[5] = input_ptrs[35];
+ inptrs[6] = input_ptrs[20];
+ inptrs[7] = input_ptrs[1];
+ inptrs[8] = input_ptrs[4];
+ inptrs[9] = input_ptrs[21];
+ inptrs[10] = input_ptrs[6];
+ inptrs[11] = input_ptrs[11];
+ inptrs[12] = input_ptrs[24];
+ inptrs[13] = input_ptrs[8];
+ inptrs[14] = input_ptrs[29];
+ inptrs[15] = input_ptrs[9];
+ inptrs[16] = input_ptrs[31];
+ inptrs[17] = input_ptrs[13];
+ inptrs[18] = input_ptrs[34];
+ inptrs[19] = input_ptrs[16];
+ inptrs[20] = input_ptrs[2];
+ inptrs[21] = input_ptrs[19];
+ inptrs[22] = input_ptrs[3];
+ inptrs[23] = input_ptrs[12];
+ inptrs[24] = input_ptrs[22];
+ inptrs[25] = input_ptrs[17];
+ inptrs[26] = input_ptrs[18];
+ inptrs[27] = input_ptrs[26];
+ inptrs[28] = input_ptrs[23];
+ inptrs[29] = input_ptrs[32];
+ inptrs[30] = input_ptrs[27];
+ inptrs[31] = input_ptrs[33];
+ inptrs[32] = input_ptrs[7];
+ inptrs[33] = input_ptrs[10];
+ inptrs[34] = input_ptrs[25];
+ inptrs[35] = input_ptrs[28];
+
+ }
+ };
+
+ Args params_struct(input_ptrs, outptrs, params,
+ activation_min, activation_max);
+
+ __asm__ __volatile__(
+ "mov x6, #0x10\n" // cntb _, ALL, #1
+ "lsr x7, %x[n_channels], #0x2\n"
+ "ldr x8, [%x[params_struct], %[offsetof_args_outptrs]]\n"
+ "ldr x17, [%x[params_struct], %[offsetof_args_params]]\n"
+ "add x20, %x[params_struct], %[offsetof_args_min]\n"
+ "ld1r { v13.4s }, [x20]\n"
+ "add x20, %x[params_struct], %[offsetof_args_max]\n"
+ "ld1r { v14.4s }, [x20]\n"
+ "add x16, %x[params_struct], %[offsetof_Args_inptrs]\n"
+ "mov x15, #0x0\n"
+ "sub x14, XZR, x6\n"
+ "cbz x7, 3f\n"
+ "ldr q30, [x17, #0x0]\n"
+ "ldr q0, [x17, #0x10]\n"
+ "cmp x6, x7, LSL #4\n"
+ "ldr q1, [x17, #0x20]\n"
+ "ldr q2, [x17, #0x30]\n"
+ "ldr q3, [x17, #0x40]\n"
+ "ldr q4, [x17, #0x50]\n"
+ "ldr q5, [x17, #0x60]\n"
+ "ldr q6, [x17, #0x70]\n"
+ "ldr q7, [x17, #0x80]\n"
+ "ldr q8, [x17, #0x90]\n"
+ "add x17, x17, #0xa0\n"
+ "ldp x21, x20, [x16, #0x0]\n"
+ "ldr q9, [x21, x15]\n"
+ "ldr q10, [x20, x15]\n"
+ "ldp x21, x20, [x16, #0x10]\n"
+ "ldr q11, [x21, x15]\n"
+ "ldr q12, [x20, x15]\n"
+ "bge 2f\n"
+ "1:" // Channel loop
+ "mov v23.16b, v30.16b\n fmla v23.4s, v4.4s, v9.4s\n"
+ "mov v17.16b, v30.16b\n fmla v17.4s, v8.4s, v9.4s\n"
+ "ldr x27, [x16, #0x20]\n"
+ "ldr x24, [x16, #0x30]\n"
+ "mov v25.16b, v30.16b\n fmla v25.4s, v3.4s, v9.4s\n"
+ "mov v28.16b, v30.16b\n fmla v28.4s, v1.4s, v9.4s\n"
+ "ldr x23, [x16, #0x28]\n"
+ "ldr x22, [x16, #0x38]\n"
+ "mov v20.16b, v30.16b\n fmla v20.4s, v0.4s, v9.4s\n"
+ "mov v16.16b, v30.16b\n fmla v16.4s, v7.4s, v9.4s\n"
+ "ldr x26, [x16, #0x40]\n"
+ "ldr x20, [x16, #0x48]\n"
+ "mov v15.16b, v30.16b\n fmla v15.4s, v6.4s, v9.4s\n"
+ "fmla v23.4s, v5.4s, v12.4s\n"
+ "ldr x25, [x16, #0x50]\n"
+ "ldr x21, [x16, #0x58]\n"
+ "mov v27.16b, v30.16b\n fmla v27.4s, v5.4s, v9.4s\n"
+ "mov v31.16b, v30.16b\n fmla v31.4s, v2.4s, v9.4s\n"
+ "ldr q9, [x24, x15]\n"
+ "ldr x13, [x16, #0x70]\n"
+ "fmla v17.4s, v0.4s, v10.4s\n"
+ "ldr q22, [x27, x15]\n"
+ "mov v10.16b, v30.16b\n fmla v10.4s, v2.4s, v11.4s\n"
+ "ldr q18, [x23, x15]\n"
+ "fmla v25.4s, v4.4s, v12.4s\n"
+ "fmla v28.4s, v2.4s, v12.4s\n"
+ "ldr x24, [x16, #0x60]\n"
+ "ldr x23, [x16, #0x68]\n"
+ "fmla v20.4s, v1.4s, v12.4s\n"
+ "fmla v16.4s, v8.4s, v12.4s\n"
+ "ldr x12, [x8, #0x0]\n"
+ "ldr x11, [x8, #0x8]\n"
+ "fmla v15.4s, v7.4s, v12.4s\n"
+ "mov v29.16b, v30.16b\n fmla v29.4s, v6.4s, v22.4s\n"
+ "ldr q22, [x20, x15]\n"
+ "ldr x28, [x16, #0x88]\n"
+ "fmla v23.4s, v7.4s, v9.4s\n"
+ "fmla v10.4s, v6.4s, v12.4s\n"
+ "ldr x10, [x8, #0x10]\n"
+ "ldr x9, [x8, #0x18]\n"
+ "mov v21.16b, v30.16b\n fmla v21.4s, v3.4s, v12.4s\n"
+ "mov v19.16b, v30.16b\n fmla v19.4s, v0.4s, v12.4s\n"
+ "ldr q11, [x22, x15]\n"
+ "ldr x22, [x16, #0x78]\n"
+ "mov v24.16b, v30.16b\n fmla v24.4s, v8.4s, v18.4s\n"
+ "ldr q12, [x26, x15]\n"
+ "fmla v25.4s, v6.4s, v9.4s\n"
+ "ldr x20, [x16, #0x80]\n"
+ "fmla v28.4s, v4.4s, v9.4s\n"
+ "fmla v20.4s, v3.4s, v9.4s\n"
+ "add x14, x14, #0x10\n"
+ "mov v26.16b, v30.16b\n fmla v26.4s, v1.4s, v9.4s\n"
+ "mov v18.16b, v30.16b\n fmla v18.4s, v0.4s, v9.4s\n"
+ "ldr q30, [x17, #0x0]\n"
+ "fmla v27.4s, v8.4s, v9.4s\n"
+ "fmla v31.4s, v5.4s, v9.4s\n"
+ "fmla v29.4s, v2.4s, v9.4s\n"
+ "ldr q9, [x25, x15]\n"
+ "fmla v17.4s, v1.4s, v11.4s\n"
+ "ldr x27, [x16, #0x90]\n"
+ "fmla v16.4s, v0.4s, v11.4s\n"
+ "ldr q11, [x21, x15]\n"
+ "fmla v15.4s, v2.4s, v12.4s\n"
+ "ldr x21, [x16, #0x98]\n"
+ "fmla v23.4s, v8.4s, v22.4s\n"
+ "fmla v10.4s, v1.4s, v12.4s\n"
+ "ldr q12, [x24, x15]\n"
+ "ldr x26, [x16, #0xa0]\n"
+ "fmla v25.4s, v7.4s, v22.4s\n"
+ "fmla v21.4s, v6.4s, v22.4s\n"
+ "fmla v28.4s, v5.4s, v22.4s\n"
+ "fmla v20.4s, v4.4s, v22.4s\n"
+ "fmla v19.4s, v3.4s, v22.4s\n"
+ "fmla v26.4s, v2.4s, v22.4s\n"
+ "fmla v18.4s, v1.4s, v22.4s\n"
+ "fmla v24.4s, v0.4s, v22.4s\n"
+ "ldr q22, [x23, x15]\n"
+ "ldr x25, [x16, #0xa8]\n"
+ "fmla v17.4s, v3.4s, v9.4s\n"
+ "fmla v27.4s, v0.4s, v9.4s\n"
+ "fmla v31.4s, v6.4s, v12.4s\n"
+ "fmla v29.4s, v3.4s, v12.4s\n"
+ "ldr q9, [x13, x15]\n"
+ "ldr x24, [x16, #0xb0]\n"
+ "fmla v16.4s, v4.4s, v22.4s\n"
+ "fmla v15.4s, v3.4s, v22.4s\n"
+ "fmla v23.4s, v1.4s, v22.4s\n"
+ "fmla v10.4s, v5.4s, v11.4s\n"
+ "fmla v21.4s, v2.4s, v11.4s\n"
+ "ldr q12, [x22, x15]\n"
+ "fmla v25.4s, v0.4s, v22.4s\n"
+ "ldr x23, [x16, #0xb8]\n"
+ "fmla v19.4s, v8.4s, v9.4s\n"
+ "fmla v24.4s, v5.4s, v9.4s\n"
+ "ldr q11, [x20, x15]\n"
+ "ldr x22, [x16, #0xc0]\n"
+ "fmla v17.4s, v5.4s, v22.4s\n"
+ "fmla v27.4s, v2.4s, v22.4s\n"
+ "ldr q22, [x28, x15]\n"
+ "ldr x20, [x16, #0xc8]\n"
+ "fmla v16.4s, v5.4s, v12.4s\n"
+ "fmla v15.4s, v4.4s, v12.4s\n"
+ "fmla v23.4s, v2.4s, v12.4s\n"
+ "fmla v10.4s, v3.4s, v12.4s\n"
+ "fmla v25.4s, v1.4s, v12.4s\n"
+ "fmla v21.4s, v0.4s, v12.4s\n"
+ "ldr q9, [x21, x15]\n"
+ "ldr x28, [x16, #0xd8]\n"
+ "fmla v29.4s, v7.4s, v11.4s\n"
+ "fmla v26.4s, v6.4s, v11.4s\n"
+ "ldr q12, [x27, x15]\n"
+ "ldr x21, [x16, #0xd0]\n"
+ "fmla v17.4s, v7.4s, v22.4s\n"
+ "fmla v16.4s, v6.4s, v22.4s\n"
+ "fmla v27.4s, v4.4s, v22.4s\n"
+ "fmla v23.4s, v3.4s, v22.4s\n"
+ "fmla v31.4s, v1.4s, v22.4s\n"
+ "fmla v28.4s, v0.4s, v22.4s\n"
+ "ldr q11, [x26, x15]\n"
+ "ldr x27, [x16, #0xe0]\n"
+ "fmla v15.4s, v8.4s, v9.4s\n"
+ "fmla v18.4s, v8.4s, v12.4s\n"
+ "fmla v24.4s, v7.4s, v12.4s\n"
+ "ldr q12, [x25, x15]\n"
+ "fmla v19.4s, v1.4s, v9.4s\n"
+ "ldr x26, [x16, #0xe8]\n"
+ "fmla v10.4s, v7.4s, v9.4s\n"
+ "fmla v25.4s, v5.4s, v9.4s\n"
+ "fmla v21.4s, v4.4s, v9.4s\n"
+ "fmla v20.4s, v2.4s, v9.4s\n"
+ "ldr q9, [x24, x15]\n"
+ "ldr x24, [x16, #0xf0]\n"
+ "fmla v17.4s, v2.4s, v11.4s\n"
+ "fmla v16.4s, v1.4s, v11.4s\n"
+ "fmla v15.4s, v0.4s, v11.4s\n"
+ "ldr q22, [x23, x15]\n"
+ "fmla v27.4s, v7.4s, v12.4s\n"
+ "ldr x25, [x16, #0xf8]\n"
+ "fmla v23.4s, v6.4s, v12.4s\n"
+ "fmla v31.4s, v4.4s, v12.4s\n"
+ "fmla v28.4s, v3.4s, v12.4s\n"
+ "fmla v29.4s, v1.4s, v12.4s\n"
+ "fmla v26.4s, v0.4s, v12.4s\n"
+ "ldr q11, [x22, x15]\n"
+ "fmla v19.4s, v4.4s, v11.4s\n"
+ "ldr x23, [x16, #0x100]\n"
+ "fmla v18.4s, v2.4s, v11.4s\n"
+ "fmla v16.4s, v2.4s, v9.4s\n"
+ "fmla v15.4s, v1.4s, v9.4s\n"
+ "fmla v10.4s, v0.4s, v9.4s\n"
+ "ldr q9, [x20, x15]\n"
+ "ldr x20, [x16, #0x108]\n"
+ "fmla v17.4s, v6.4s, v22.4s\n"
+ "fmla v27.4s, v3.4s, v22.4s\n"
+ "fmla v31.4s, v0.4s, v22.4s\n"
+ "ldr q22, [x21, x15]\n"
+ "fmla v25.4s, v8.4s, v11.4s\n"
+ "ldr x22, [x16, #0x110]\n"
+ "fmla v21.4s, v7.4s, v11.4s\n"
+ "fmla v20.4s, v5.4s, v11.4s\n"
+ "fmla v24.4s, v1.4s, v11.4s\n"
+ "ldr q12, [x28, x15]\n"
+ "fmla v19.4s, v2.4s, v9.4s\n"
+ "ldr x21, [x16, #0x118]\n"
+ "fmla v29.4s, v0.4s, v22.4s\n"
+ "fmla v26.4s, v4.4s, v12.4s\n"
+ "fmla v18.4s, v3.4s, v12.4s\n"
+ "fmla v10.4s, v8.4s, v9.4s\n"
+ "fmla v21.4s, v5.4s, v9.4s\n"
+ "ldr q11, [x27, x15]\n"
+ "fmla v27.4s, v6.4s, v22.4s\n"
+ "fmla v31.4s, v3.4s, v22.4s\n"
+ "ldr q22, [x26, x15]\n"
+ "fmla v28.4s, v7.4s, v12.4s\n"
+ "fmla v20.4s, v6.4s, v12.4s\n"
+ "fmla v29.4s, v5.4s, v12.4s\n"
+ "fmla v19.4s, v5.4s, v11.4s\n"
+ "fmla v24.4s, v2.4s, v11.4s\n"
+ "fmla v26.4s, v7.4s, v22.4s\n"
+ "fmla v18.4s, v6.4s, v22.4s\n"
+ "fmla v31.4s, v8.4s, v12.4s\n"
+ "ldr q12, [x24, x15]\n"
+ "fmla v29.4s, v8.4s, v22.4s\n"
+ "ldr q22, [x23, x15]\n"
+ "fmla v28.4s, v8.4s, v12.4s\n"
+ "fmla v20.4s, v7.4s, v12.4s\n"
+ "fmla v19.4s, v6.4s, v12.4s\n"
+ "fmla v26.4s, v5.4s, v12.4s\n"
+ "fmla v18.4s, v4.4s, v12.4s\n"
+ "fmla v24.4s, v3.4s, v12.4s\n"
+ "ldr q12, [x20, x15]\n"
+ "ldp x20, x24, [x16, #0x0]\n"
+ "ldr q9, [x20, x6]\n"
+ "fmla v21.4s, v8.4s, v11.4s\n"
+ "ldr q11, [x25, x15]\n"
+ "fmla v17.4s, v4.4s, v22.4s\n"
+ "fmla v16.4s, v3.4s, v22.4s\n"
+ "fmla v15.4s, v5.4s, v12.4s\n"
+ "fmax v17.4s, v17.4s, v13.4s\n"
+ "fmla v10.4s, v4.4s, v12.4s\n"
+ "fmla v26.4s, v8.4s, v11.4s\n"
+ "fmax v16.4s, v16.4s, v13.4s\n"
+ "fmla v18.4s, v7.4s, v11.4s\n"
+ "fmla v24.4s, v6.4s, v11.4s\n"
+ "ldr q11, [x22, x15]\n"
+ "fmax v15.4s, v15.4s, v13.4s\n"
+ "fmla v27.4s, v1.4s, v22.4s\n"
+ "fmla v23.4s, v0.4s, v22.4s\n"
+ "ldr q22, [x21, x15]\n"
+ "ldr q0, [x17, #0x10]\n"
+ "fmla v25.4s, v2.4s, v12.4s\n"
+ "ldr q2, [x17, #0x30]\n"
+ "fmla v21.4s, v1.4s, v12.4s\n"
+ "ldr q1, [x17, #0x20]\n"
+ "fmax v10.4s, v10.4s, v13.4s\n"
+ "fmla v31.4s, v7.4s, v11.4s\n"
+ "fmla v28.4s, v6.4s, v11.4s\n"
+ "ldr q6, [x17, #0x70]\n"
+ "fmla v20.4s, v8.4s, v22.4s\n"
+ "ldr q8, [x17, #0x90]\n"
+ "fmla v19.4s, v7.4s, v22.4s\n"
+ "ldr q7, [x17, #0x80]\n"
+ "fmin v17.4s, v17.4s, v14.4s\n"
+ "fmin v16.4s, v16.4s, v14.4s\n"
+ "str q17, [x12, x14]\n"
+ "ldr x23, [x8, #0x20]\n"
+ "fmin v15.4s, v15.4s, v14.4s\n"
+ "fmin v10.4s, v10.4s, v14.4s\n"
+ "str q16, [x11, x14]\n"
+ "ldr x22, [x8, #0x28]\n"
+ "fmax v27.4s, v27.4s, v13.4s\n"
+ "fmax v23.4s, v23.4s, v13.4s\n"
+ "str q15, [x10, x14]\n"
+ "ldr x21, [x8, #0x30]\n"
+ "fmax v25.4s, v25.4s, v13.4s\n"
+ "fmax v21.4s, v21.4s, v13.4s\n"
+ "str q10, [x9, x14]\n"
+ "ldr x20, [x8, #0x38]\n"
+ "fmla v29.4s, v4.4s, v11.4s\n"
+ "fmla v26.4s, v3.4s, v11.4s\n"
+ "ldr q3, [x17, #0x40]\n"
+ "fmin v27.4s, v27.4s, v14.4s\n"
+ "fmla v18.4s, v5.4s, v22.4s\n"
+ "ldr q5, [x17, #0x60]\n"
+ "fmla v24.4s, v4.4s, v22.4s\n"
+ "ldr q10, [x24, x6]\n"
+ "ldr q4, [x17, #0x50]\n"
+ "fmin v23.4s, v23.4s, v14.4s\n"
+ "fmin v25.4s, v25.4s, v14.4s\n"
+ "str q27, [x23, x14]\n"
+ "fmin v21.4s, v21.4s, v14.4s\n"
+ "fmax v31.4s, v31.4s, v13.4s\n"
+ "str q23, [x22, x14]\n"
+ "ldr x25, [x8, #0x40]\n"
+ "fmax v28.4s, v28.4s, v13.4s\n"
+ "fmax v20.4s, v20.4s, v13.4s\n"
+ "str q25, [x21, x14]\n"
+ "ldr x23, [x8, #0x48]\n"
+ "fmax v19.4s, v19.4s, v13.4s\n"
+ "str q21, [x20, x14]\n"
+ "ldr x22, [x8, #0x50]\n"
+ "ldr x24, [x8, #0x58]\n"
+ "ldp x21, x20, [x16, #0x10]\n"
+ "ldr q11, [x21, x6]\n"
+ "fmin v31.4s, v31.4s, v14.4s\n"
+ "fmin v28.4s, v28.4s, v14.4s\n"
+ "ldr q12, [x20, x6]\n"
+ "fmin v20.4s, v20.4s, v14.4s\n"
+ "fmin v19.4s, v19.4s, v14.4s\n"
+ "str q31, [x25, x14]\n"
+ "fmax v29.4s, v29.4s, v13.4s\n"
+ "fmax v26.4s, v26.4s, v13.4s\n"
+ "str q28, [x23, x14]\n"
+ "ldr x23, [x8, #0x60]\n"
+ "fmax v18.4s, v18.4s, v13.4s\n"
+ "fmax v24.4s, v24.4s, v13.4s\n"
+ "str q20, [x22, x14]\n"
+ "ldr x22, [x8, #0x68]\n"
+ "str q19, [x24, x14]\n"
+ "ldr x21, [x8, #0x70]\n"
+ "ldr x20, [x8, #0x78]\n"
+ "add x6, x6, #0x10\n"
+ "cmp x6, x7, LSL #4\n"
+ "fmin v29.4s, v29.4s, v14.4s\n"
+ "fmin v26.4s, v26.4s, v14.4s\n"
+ "add x15, x15, #0x10\n"
+ "fmin v18.4s, v18.4s, v14.4s\n"
+ "fmin v24.4s, v24.4s, v14.4s\n"
+ "str q29, [x23, x14]\n"
+ "add x17, x17, #0xa0\n"
+ "str q26, [x22, x14]\n"
+ "str q18, [x21, x14]\n"
+ "str q24, [x20, x14]\n"
+ "blt 1b\n"
+ "2:" // Channel tail
+ "mov v31.16b, v30.16b\n fmla v31.4s, v4.4s, v9.4s\n"
+ "mov v17.16b, v30.16b\n fmla v17.4s, v8.4s, v9.4s\n"
+ "ldr x27, [x16, #0x20]\n"
+ "ldr x24, [x16, #0x30]\n"
+ "mov v15.16b, v30.16b\n fmla v15.4s, v3.4s, v9.4s\n"
+ "mov v29.16b, v30.16b\n fmla v29.4s, v1.4s, v9.4s\n"
+ "ldr x23, [x16, #0x28]\n"
+ "ldr x22, [x16, #0x38]\n"
+ "mov v19.16b, v30.16b\n fmla v19.4s, v0.4s, v9.4s\n"
+ "mov v20.16b, v30.16b\n fmla v20.4s, v7.4s, v9.4s\n"
+ "ldr x26, [x16, #0x40]\n"
+ "ldr x21, [x16, #0x48]\n"
+ "mov v21.16b, v30.16b\n fmla v21.4s, v6.4s, v9.4s\n"
+ "fmla v31.4s, v5.4s, v12.4s\n"
+ "ldr x25, [x16, #0x50]\n"
+ "ldr x20, [x16, #0x58]\n"
+ "mov v18.16b, v30.16b\n fmla v18.4s, v5.4s, v9.4s\n"
+ "mov v27.16b, v30.16b\n fmla v27.4s, v2.4s, v9.4s\n"
+ "ldr q24, [x24, x15]\n"
+ "ldr x13, [x16, #0x70]\n"
+ "fmla v17.4s, v0.4s, v10.4s\n"
+ "ldr q22, [x27, x15]\n"
+ "mov v28.16b, v30.16b\n fmla v28.4s, v2.4s, v11.4s\n"
+ "ldr q16, [x23, x15]\n"
+ "fmla v15.4s, v4.4s, v12.4s\n"
+ "fmla v29.4s, v2.4s, v12.4s\n"
+ "ldr x24, [x16, #0x60]\n"
+ "ldr x23, [x16, #0x68]\n"
+ "fmla v19.4s, v1.4s, v12.4s\n"
+ "fmla v20.4s, v8.4s, v12.4s\n"
+ "ldr x12, [x8, #0x0]\n"
+ "ldr x11, [x8, #0x8]\n"
+ "fmla v21.4s, v7.4s, v12.4s\n"
+ "mov v10.16b, v30.16b\n fmla v10.4s, v6.4s, v22.4s\n"
+ "ldr q22, [x21, x15]\n"
+ "ldr x28, [x16, #0x88]\n"
+ "fmla v31.4s, v7.4s, v24.4s\n"
+ "fmla v28.4s, v6.4s, v12.4s\n"
+ "ldr x10, [x8, #0x10]\n"
+ "ldr x9, [x8, #0x18]\n"
+ "mov v9.16b, v30.16b\n fmla v9.4s, v3.4s, v12.4s\n"
+ "mov v11.16b, v30.16b\n fmla v11.4s, v0.4s, v12.4s\n"
+ "ldr q23, [x22, x15]\n"
+ "ldr x22, [x16, #0x78]\n"
+ "mov v12.16b, v30.16b\n fmla v12.4s, v8.4s, v16.4s\n"
+ "ldr q16, [x26, x15]\n"
+ "fmla v15.4s, v6.4s, v24.4s\n"
+ "ldr x21, [x16, #0x80]\n"
+ "fmla v29.4s, v4.4s, v24.4s\n"
+ "fmla v19.4s, v3.4s, v24.4s\n"
+ "add x14, x14, #0x10\n"
+ "mov v26.16b, v30.16b\n fmla v26.4s, v1.4s, v24.4s\n"
+ "mov v25.16b, v30.16b\n fmla v25.4s, v0.4s, v24.4s\n"
+ "fmla v18.4s, v8.4s, v24.4s\n"
+ "fmla v27.4s, v5.4s, v24.4s\n"
+ "fmla v10.4s, v2.4s, v24.4s\n"
+ "ldr q24, [x25, x15]\n"
+ "fmla v17.4s, v1.4s, v23.4s\n"
+ "ldr x27, [x16, #0x90]\n"
+ "fmla v20.4s, v0.4s, v23.4s\n"
+ "ldr q23, [x20, x15]\n"
+ "fmla v21.4s, v2.4s, v16.4s\n"
+ "ldr x20, [x16, #0x98]\n"
+ "fmla v31.4s, v8.4s, v22.4s\n"
+ "fmla v28.4s, v1.4s, v16.4s\n"
+ "ldr q16, [x24, x15]\n"
+ "ldr x26, [x16, #0xa0]\n"
+ "fmla v15.4s, v7.4s, v22.4s\n"
+ "fmla v9.4s, v6.4s, v22.4s\n"
+ "fmla v29.4s, v5.4s, v22.4s\n"
+ "fmla v19.4s, v4.4s, v22.4s\n"
+ "fmla v11.4s, v3.4s, v22.4s\n"
+ "fmla v26.4s, v2.4s, v22.4s\n"
+ "fmla v25.4s, v1.4s, v22.4s\n"
+ "fmla v12.4s, v0.4s, v22.4s\n"
+ "ldr q22, [x23, x15]\n"
+ "ldr x25, [x16, #0xa8]\n"
+ "fmla v17.4s, v3.4s, v24.4s\n"
+ "fmla v18.4s, v0.4s, v24.4s\n"
+ "fmla v27.4s, v6.4s, v16.4s\n"
+ "fmla v10.4s, v3.4s, v16.4s\n"
+ "ldr q16, [x13, x15]\n"
+ "ldr x24, [x16, #0xb0]\n"
+ "fmla v20.4s, v4.4s, v22.4s\n"
+ "fmla v21.4s, v3.4s, v22.4s\n"
+ "fmla v31.4s, v1.4s, v22.4s\n"
+ "fmla v28.4s, v5.4s, v23.4s\n"
+ "fmla v9.4s, v2.4s, v23.4s\n"
+ "ldr q23, [x22, x15]\n"
+ "fmla v15.4s, v0.4s, v22.4s\n"
+ "ldr x23, [x16, #0xb8]\n"
+ "fmla v11.4s, v8.4s, v16.4s\n"
+ "fmla v12.4s, v5.4s, v16.4s\n"
+ "ldr q16, [x21, x15]\n"
+ "ldr x22, [x16, #0xc0]\n"
+ "fmla v17.4s, v5.4s, v22.4s\n"
+ "fmla v18.4s, v2.4s, v22.4s\n"
+ "ldr q22, [x28, x15]\n"
+ "ldr x21, [x16, #0xc8]\n"
+ "fmla v20.4s, v5.4s, v23.4s\n"
+ "fmla v21.4s, v4.4s, v23.4s\n"
+ "fmla v31.4s, v2.4s, v23.4s\n"
+ "fmla v28.4s, v3.4s, v23.4s\n"
+ "fmla v15.4s, v1.4s, v23.4s\n"
+ "fmla v9.4s, v0.4s, v23.4s\n"
+ "ldr q23, [x20, x15]\n"
+ "ldr x28, [x16, #0xd8]\n"
+ "fmla v10.4s, v7.4s, v16.4s\n"
+ "fmla v26.4s, v6.4s, v16.4s\n"
+ "ldr q16, [x27, x15]\n"
+ "ldr x20, [x16, #0xd0]\n"
+ "fmla v17.4s, v7.4s, v22.4s\n"
+ "fmla v20.4s, v6.4s, v22.4s\n"
+ "fmla v18.4s, v4.4s, v22.4s\n"
+ "fmla v31.4s, v3.4s, v22.4s\n"
+ "fmla v27.4s, v1.4s, v22.4s\n"
+ "fmla v29.4s, v0.4s, v22.4s\n"
+ "ldr q22, [x26, x15]\n"
+ "ldr x27, [x16, #0xe0]\n"
+ "fmla v21.4s, v8.4s, v23.4s\n"
+ "fmla v25.4s, v8.4s, v16.4s\n"
+ "fmla v12.4s, v7.4s, v16.4s\n"
+ "ldr q16, [x25, x15]\n"
+ "fmla v11.4s, v1.4s, v23.4s\n"
+ "ldr x26, [x16, #0xe8]\n"
+ "fmla v28.4s, v7.4s, v23.4s\n"
+ "fmla v15.4s, v5.4s, v23.4s\n"
+ "fmla v9.4s, v4.4s, v23.4s\n"
+ "fmla v19.4s, v2.4s, v23.4s\n"
+ "ldr q23, [x24, x15]\n"
+ "ldr x25, [x16, #0xf0]\n"
+ "fmla v17.4s, v2.4s, v22.4s\n"
+ "fmla v20.4s, v1.4s, v22.4s\n"
+ "fmla v21.4s, v0.4s, v22.4s\n"
+ "ldr q22, [x23, x15]\n"
+ "fmla v18.4s, v7.4s, v16.4s\n"
+ "ldr x24, [x16, #0xf8]\n"
+ "fmla v31.4s, v6.4s, v16.4s\n"
+ "fmla v27.4s, v4.4s, v16.4s\n"
+ "fmla v29.4s, v3.4s, v16.4s\n"
+ "fmla v10.4s, v1.4s, v16.4s\n"
+ "fmla v26.4s, v0.4s, v16.4s\n"
+ "ldr q16, [x22, x15]\n"
+ "fmla v11.4s, v4.4s, v16.4s\n"
+ "ldr x23, [x16, #0x100]\n"
+ "fmla v25.4s, v2.4s, v16.4s\n"
+ "fmla v20.4s, v2.4s, v23.4s\n"
+ "fmla v21.4s, v1.4s, v23.4s\n"
+ "fmla v28.4s, v0.4s, v23.4s\n"
+ "ldr q23, [x21, x15]\n"
+ "ldr x22, [x16, #0x108]\n"
+ "fmla v17.4s, v6.4s, v22.4s\n"
+ "fmla v18.4s, v3.4s, v22.4s\n"
+ "fmla v27.4s, v0.4s, v22.4s\n"
+ "ldr q22, [x20, x15]\n"
+ "fmla v15.4s, v8.4s, v16.4s\n"
+ "ldr x21, [x16, #0x110]\n"
+ "fmla v9.4s, v7.4s, v16.4s\n"
+ "fmla v19.4s, v5.4s, v16.4s\n"
+ "fmla v12.4s, v1.4s, v16.4s\n"
+ "ldr q16, [x28, x15]\n"
+ "fmla v11.4s, v2.4s, v23.4s\n"
+ "ldr x20, [x16, #0x118]\n"
+ "fmla v10.4s, v0.4s, v22.4s\n"
+ "fmla v26.4s, v4.4s, v16.4s\n"
+ "fmla v25.4s, v3.4s, v16.4s\n"
+ "fmla v28.4s, v8.4s, v23.4s\n"
+ "fmla v9.4s, v5.4s, v23.4s\n"
+ "ldr q23, [x27, x15]\n"
+ "fmla v18.4s, v6.4s, v22.4s\n"
+ "fmla v27.4s, v3.4s, v22.4s\n"
+ "ldr q22, [x26, x15]\n"
+ "fmla v29.4s, v7.4s, v16.4s\n"
+ "fmla v19.4s, v6.4s, v16.4s\n"
+ "fmla v10.4s, v5.4s, v16.4s\n"
+ "fmla v11.4s, v5.4s, v23.4s\n"
+ "fmla v12.4s, v2.4s, v23.4s\n"
+ "fmla v26.4s, v7.4s, v22.4s\n"
+ "fmla v25.4s, v6.4s, v22.4s\n"
+ "fmla v27.4s, v8.4s, v16.4s\n"
+ "ldr q16, [x25, x15]\n"
+ "fmla v10.4s, v8.4s, v22.4s\n"
+ "ldr q30, [x23, x15]\n"
+ "fmla v29.4s, v8.4s, v16.4s\n"
+ "fmla v19.4s, v7.4s, v16.4s\n"
+ "fmla v11.4s, v6.4s, v16.4s\n"
+ "fmla v26.4s, v5.4s, v16.4s\n"
+ "fmla v25.4s, v4.4s, v16.4s\n"
+ "fmla v12.4s, v3.4s, v16.4s\n"
+ "ldr q24, [x22, x15]\n"
+ "fmla v9.4s, v8.4s, v23.4s\n"
+ "ldr q16, [x24, x15]\n"
+ "fmla v17.4s, v4.4s, v30.4s\n"
+ "fmax v17.4s, v17.4s, v13.4s\n"
+ "fmla v20.4s, v3.4s, v30.4s\n"
+ "fmla v21.4s, v5.4s, v24.4s\n"
+ "fmax v20.4s, v20.4s, v13.4s\n"
+ "fmla v28.4s, v4.4s, v24.4s\n"
+ "fmla v26.4s, v8.4s, v16.4s\n"
+ "fmax v21.4s, v21.4s, v13.4s\n"
+ "fmla v25.4s, v7.4s, v16.4s\n"
+ "fmla v12.4s, v6.4s, v16.4s\n"
+ "ldr q23, [x21, x15]\n"
+ "fmax v28.4s, v28.4s, v13.4s\n"
+ "fmla v18.4s, v1.4s, v30.4s\n"
+ "fmla v31.4s, v0.4s, v30.4s\n"
+ "ldr q16, [x20, x15]\n"
+ "fmin v17.4s, v17.4s, v14.4s\n"
+ "fmla v15.4s, v2.4s, v24.4s\n"
+ "fmla v9.4s, v1.4s, v24.4s\n"
+ "fmin v20.4s, v20.4s, v14.4s\n"
+ "str q17, [x12, x14]\n"
+ "fmla v27.4s, v7.4s, v23.4s\n"
+ "fmla v29.4s, v6.4s, v23.4s\n"
+ "fmin v21.4s, v21.4s, v14.4s\n"
+ "str q20, [x11, x14]\n"
+ "fmla v19.4s, v8.4s, v16.4s\n"
+ "fmla v11.4s, v7.4s, v16.4s\n"
+ "fmin v28.4s, v28.4s, v14.4s\n"
+ "str q21, [x10, x14]\n"
+ "fmax v18.4s, v18.4s, v13.4s\n"
+ "fmax v31.4s, v31.4s, v13.4s\n"
+ "str q28, [x9, x14]\n"
+ "ldr x23, [x8, #0x20]\n"
+ "fmax v15.4s, v15.4s, v13.4s\n"
+ "fmax v9.4s, v9.4s, v13.4s\n"
+ "ldr x22, [x8, #0x28]\n"
+ "ldr x21, [x8, #0x30]\n"
+ "ldr x20, [x8, #0x38]\n"
+ "fmla v10.4s, v4.4s, v23.4s\n"
+ "fmla v26.4s, v3.4s, v23.4s\n"
+ "fmin v18.4s, v18.4s, v14.4s\n"
+ "fmla v25.4s, v5.4s, v16.4s\n"
+ "fmla v12.4s, v4.4s, v16.4s\n"
+ "fmin v31.4s, v31.4s, v14.4s\n"
+ "str q18, [x23, x14]\n"
+ "fmin v15.4s, v15.4s, v14.4s\n"
+ "fmin v9.4s, v9.4s, v14.4s\n"
+ "str q31, [x22, x14]\n"
+ "ldr x23, [x8, #0x40]\n"
+ "fmax v27.4s, v27.4s, v13.4s\n"
+ "fmax v29.4s, v29.4s, v13.4s\n"
+ "str q15, [x21, x14]\n"
+ "ldr x22, [x8, #0x48]\n"
+ "fmax v19.4s, v19.4s, v13.4s\n"
+ "fmax v11.4s, v11.4s, v13.4s\n"
+ "str q9, [x20, x14]\n"
+ "ldr x21, [x8, #0x50]\n"
+ "ldr x20, [x8, #0x58]\n"
+ "fmin v27.4s, v27.4s, v14.4s\n"
+ "fmin v29.4s, v29.4s, v14.4s\n"
+ "str q27, [x23, x14]\n"
+ "fmin v19.4s, v19.4s, v14.4s\n"
+ "fmin v11.4s, v11.4s, v14.4s\n"
+ "str q29, [x22, x14]\n"
+ "ldr x23, [x8, #0x60]\n"
+ "fmax v10.4s, v10.4s, v13.4s\n"
+ "fmax v26.4s, v26.4s, v13.4s\n"
+ "str q19, [x21, x14]\n"
+ "ldr x22, [x8, #0x68]\n"
+ "fmax v25.4s, v25.4s, v13.4s\n"
+ "fmax v12.4s, v12.4s, v13.4s\n"
+ "str q11, [x20, x14]\n"
+ "ldr x21, [x8, #0x70]\n"
+ "ldr x20, [x8, #0x78]\n"
+ "fmin v10.4s, v10.4s, v14.4s\n"
+ "fmin v26.4s, v26.4s, v14.4s\n"
+ "str q10, [x23, x14]\n"
+ "fmin v25.4s, v25.4s, v14.4s\n"
+ "fmin v12.4s, v12.4s, v14.4s\n"
+ "str q26, [x22, x14]\n"
+ "add x15, x15, #0x10\n"
+ "str q25, [x21, x14]\n"
+ "str q12, [x20, x14]\n"
+ "3:" // Oddments
+ "tst %x[n_channels], #0x3\n"
+ "beq 72f\n"
+ "ldr q30, [x17, #0x0]\n"
+ "ldr q0, [x17, #0x10]\n"
+ "mov x14, x15\n"
+ "ldr q1, [x17, #0x20]\n"
+ "ldr q2, [x17, #0x30]\n"
+ "ldr q3, [x17, #0x40]\n"
+ "ldr q4, [x17, #0x50]\n"
+ "ldr q5, [x17, #0x60]\n"
+ "ldr q6, [x17, #0x70]\n"
+ "ldr q7, [x17, #0x80]\n"
+ "ldr q8, [x17, #0x90]\n"
+ "ldr x23, [x16, #0x0]\n"
+ "ldr x22, [x16, #0x8]\n"
+ "add x23, x23, x15\n"
+ "add x22, x22, x15\n"
+ "ldr x21, [x16, #0x10]\n"
+ "ldr x20, [x16, #0x18]\n"
+ "add x21, x21, x15\n"
+ "add x20, x20, x15\n"
+ "tbz %x[n_channels], #1, 4f\n"
+ "ld1 { v9.d }[0], [x23], #0x8\n"
+ "ld1 { v10.d }[0], [x22], #0x8\n"
+ "ld1 { v11.d }[0], [x21], #0x8\n"
+ "ld1 { v12.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 5f\n"
+ "ld1 { v9.s }[2], [x23], #0x4\n"
+ "ld1 { v10.s }[2], [x22], #0x4\n"
+ "ld1 { v11.s }[2], [x21], #0x4\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
+ "b 5f\n"
+ "4:" // Oddments: Load inputs (2, 2), (0, 0), (0, 5), (2, 3): Bit 1: Unset
+ "ld1 { v9.s }[0], [x23], #0x4\n"
+ "ld1 { v10.s }[0], [x22], #0x4\n"
+ "ld1 { v11.s }[0], [x21], #0x4\n"
+ "ld1 { v12.s }[0], [x20], #0x4\n"
+ "5:" // Oddments: Load inputs (2, 2), (0, 0), (0, 5), (2, 3): Bit 1: End
+ "mov v16.16b, v30.16b\n fmla v16.4s, v8.4s, v9.4s\n"
+ "mov v17.16b, v30.16b\n fmla v17.4s, v7.4s, v9.4s\n"
+ "ldr x20, [x16, #0x20]\n"
+ "add x20, x20, x15\n"
+ "mov v18.16b, v30.16b\n fmla v18.4s, v6.4s, v9.4s\n"
+ "mov v21.16b, v30.16b\n fmla v21.4s, v4.4s, v9.4s\n"
+ "mov v22.16b, v30.16b\n fmla v22.4s, v3.4s, v9.4s\n"
+ "mov v25.16b, v30.16b\n fmla v25.4s, v1.4s, v9.4s\n"
+ "mov v26.16b, v30.16b\n fmla v26.4s, v0.4s, v9.4s\n"
+ "mov v19.16b, v30.16b\n fmla v19.4s, v2.4s, v11.4s\n"
+ "mov v20.16b, v30.16b\n fmla v20.4s, v5.4s, v9.4s\n"
+ "mov v24.16b, v30.16b\n fmla v24.4s, v2.4s, v9.4s\n"
+ "fmla v16.4s, v0.4s, v10.4s\n"
+ "fmla v17.4s, v8.4s, v12.4s\n"
+ "fmla v18.4s, v7.4s, v12.4s\n"
+ "fmla v19.4s, v6.4s, v12.4s\n"
+ "fmla v21.4s, v5.4s, v12.4s\n"
+ "fmla v22.4s, v4.4s, v12.4s\n"
+ "mov v23.16b, v30.16b\n fmla v23.4s, v3.4s, v12.4s\n"
+ "fmla v25.4s, v2.4s, v12.4s\n"
+ "fmla v26.4s, v1.4s, v12.4s\n"
+ "mov v27.16b, v30.16b\n fmla v27.4s, v0.4s, v12.4s\n"
+ "tbz %x[n_channels], #1, 6f\n"
+ "ld1 { v10.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 7f\n"
+ "ld1 { v10.s }[2], [x20], #0x4\n"
+ "b 7f\n"
+ "6:" // Oddments: Load input (5, 0): Bit 1: Unset
+ "ld1 { v10.s }[0], [x20], #0x4\n"
+ "7:" // Oddments: Load input (5, 0): Bit 1: End
+ "ldr x20, [x16, #0x28]\n"
+ "mov v28.16b, v30.16b\n fmla v28.4s, v6.4s, v10.4s\n"
+ "add x20, x20, x15\n"
+ "tbz %x[n_channels], #1, 8f\n"
+ "ld1 { v11.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 9f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "b 9f\n"
+ "8:" // Oddments: Load input (5, 5): Bit 1: Unset
+ "ld1 { v11.s }[0], [x20], #0x4\n"
+ "9:" // Oddments: Load input (5, 5): Bit 1: End
+ "ldr x20, [x16, #0x30]\n"
+ "mov v31.16b, v30.16b\n fmla v31.4s, v8.4s, v11.4s\n"
+ "add x20, x20, x15\n"
+ "tbz %x[n_channels], #1, 10f\n"
+ "ld1 { v9.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 11f\n"
+ "ld1 { v9.s }[2], [x20], #0x4\n"
+ "b 11f\n"
+ "10:" // Oddments: Load input (3, 2): Bit 1: Unset
+ "ld1 { v9.s }[0], [x20], #0x4\n"
+ "11:" // Oddments: Load input (3, 2): Bit 1: End
+ "ldr x20, [x16, #0x38]\n"
+ "fmla v20.4s, v8.4s, v9.4s\n"
+ "fmla v21.4s, v7.4s, v9.4s\n"
+ "add x20, x20, x15\n"
+ "fmla v22.4s, v6.4s, v9.4s\n"
+ "fmla v24.4s, v5.4s, v9.4s\n"
+ "fmla v25.4s, v4.4s, v9.4s\n"
+ "fmla v26.4s, v3.4s, v9.4s\n"
+ "fmla v28.4s, v2.4s, v9.4s\n"
+ "mov v29.16b, v30.16b\n fmla v29.4s, v1.4s, v9.4s\n"
+ "fmla v30.4s, v0.4s, v9.4s\n"
+ "tbz %x[n_channels], #1, 12f\n"
+ "ld1 { v12.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 13f\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
+ "b 13f\n"
+ "12:" // Oddments: Load input (0, 1): Bit 1: Unset
+ "ld1 { v12.s }[0], [x20], #0x4\n"
+ "13:" // Oddments: Load input (0, 1): Bit 1: End
+ "ldr x20, [x16, #0x40]\n"
+ "fmla v16.4s, v1.4s, v12.4s\n"
+ "fmla v17.4s, v0.4s, v12.4s\n"
+ "add x20, x20, x15\n"
+ "tbz %x[n_channels], #1, 14f\n"
+ "ld1 { v11.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 15f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "b 15f\n"
+ "14:" // Oddments: Load input (0, 4): Bit 1: Unset
+ "ld1 { v11.s }[0], [x20], #0x4\n"
+ "15:" // Oddments: Load input (0, 4): Bit 1: End
+ "ldr x20, [x16, #0x48]\n"
+ "fmla v18.4s, v2.4s, v11.4s\n"
+ "fmla v19.4s, v1.4s, v11.4s\n"
+ "add x20, x20, x15\n"
+ "tbz %x[n_channels], #1, 16f\n"
+ "ld1 { v10.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 17f\n"
+ "ld1 { v10.s }[2], [x20], #0x4\n"
+ "b 17f\n"
+ "16:" // Oddments: Load input (3, 3): Bit 1: Unset
+ "ld1 { v10.s }[0], [x20], #0x4\n"
+ "17:" // Oddments: Load input (3, 3): Bit 1: End
+ "ldr x20, [x16, #0x50]\n"
+ "fmla v21.4s, v8.4s, v10.4s\n"
+ "fmla v22.4s, v7.4s, v10.4s\n"
+ "add x20, x20, x15\n"
+ "fmla v23.4s, v6.4s, v10.4s\n"
+ "fmla v25.4s, v5.4s, v10.4s\n"
+ "fmla v26.4s, v4.4s, v10.4s\n"
+ "fmla v27.4s, v3.4s, v10.4s\n"
+ "fmla v29.4s, v2.4s, v10.4s\n"
+ "fmla v30.4s, v1.4s, v10.4s\n"
+ "fmla v31.4s, v0.4s, v10.4s\n"
+ "tbz %x[n_channels], #1, 18f\n"
+ "ld1 { v9.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 19f\n"
+ "ld1 { v9.s }[2], [x20], #0x4\n"
+ "b 19f\n"
+ "18:" // Oddments: Load input (1, 0): Bit 1: Unset
+ "ld1 { v9.s }[0], [x20], #0x4\n"
+ "19:" // Oddments: Load input (1, 0): Bit 1: End
+ "ldr x20, [x16, #0x58]\n"
+ "fmla v16.4s, v3.4s, v9.4s\n"
+ "fmla v20.4s, v0.4s, v9.4s\n"
+ "add x20, x20, x15\n"
+ "tbz %x[n_channels], #1, 20f\n"
+ "ld1 { v12.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 21f\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
+ "b 21f\n"
+ "20:" // Oddments: Load input (1, 5): Bit 1: Unset
+ "ld1 { v12.s }[0], [x20], #0x4\n"
+ "21:" // Oddments: Load input (1, 5): Bit 1: End
+ "ldr x20, [x16, #0x60]\n"
+ "fmla v19.4s, v5.4s, v12.4s\n"
+ "fmla v23.4s, v2.4s, v12.4s\n"
+ "add x20, x20, x15\n"
+ "tbz %x[n_channels], #1, 22f\n"
+ "ld1 { v11.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 23f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "b 23f\n"
+ "22:" // Oddments: Load input (4, 0): Bit 1: Unset
+ "ld1 { v11.s }[0], [x20], #0x4\n"
+ "23:" // Oddments: Load input (4, 0): Bit 1: End
+ "ldr x20, [x16, #0x68]\n"
+ "fmla v24.4s, v6.4s, v11.4s\n"
+ "fmla v28.4s, v3.4s, v11.4s\n"
+ "add x20, x20, x15\n"
+ "tbz %x[n_channels], #1, 24f\n"
+ "ld1 { v10.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 25f\n"
+ "ld1 { v10.s }[2], [x20], #0x4\n"
+ "b 25f\n"
+ "24:" // Oddments: Load input (1, 2): Bit 1: Unset
+ "ld1 { v10.s }[0], [x20], #0x4\n"
+ "25:" // Oddments: Load input (1, 2): Bit 1: End
+ "ldr x20, [x16, #0x70]\n"
+ "fmla v16.4s, v5.4s, v10.4s\n"
+ "fmla v17.4s, v4.4s, v10.4s\n"
+ "add x20, x20, x15\n"
+ "fmla v18.4s, v3.4s, v10.4s\n"
+ "fmla v20.4s, v2.4s, v10.4s\n"
+ "fmla v21.4s, v1.4s, v10.4s\n"
+ "fmla v22.4s, v0.4s, v10.4s\n"
+ "tbz %x[n_channels], #1, 26f\n"
+ "ld1 { v11.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 27f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "b 27f\n"
+ "26:" // Oddments: Load input (4, 5): Bit 1: Unset
+ "ld1 { v11.s }[0], [x20], #0x4\n"
+ "27:" // Oddments: Load input (4, 5): Bit 1: End
+ "ldr x20, [x16, #0x78]\n"
+ "fmla v27.4s, v8.4s, v11.4s\n"
+ "fmla v31.4s, v5.4s, v11.4s\n"
+ "add x20, x20, x15\n"
+ "tbz %x[n_channels], #1, 28f\n"
+ "ld1 { v12.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 29f\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
+ "b 29f\n"
+ "28:" // Oddments: Load input (1, 3): Bit 1: Unset
+ "ld1 { v12.s }[0], [x20], #0x4\n"
+ "29:" // Oddments: Load input (1, 3): Bit 1: End
+ "ldr x20, [x16, #0x80]\n"
+ "fmla v17.4s, v5.4s, v12.4s\n"
+ "fmla v18.4s, v4.4s, v12.4s\n"
+ "add x20, x20, x15\n"
+ "fmla v19.4s, v3.4s, v12.4s\n"
+ "fmla v21.4s, v2.4s, v12.4s\n"
+ "fmla v22.4s, v1.4s, v12.4s\n"
+ "fmla v23.4s, v0.4s, v12.4s\n"
+ "tbz %x[n_channels], #1, 30f\n"
+ "ld1 { v11.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 31f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "b 31f\n"
+ "30:" // Oddments: Load input (5, 1): Bit 1: Unset
+ "ld1 { v11.s }[0], [x20], #0x4\n"
+ "31:" // Oddments: Load input (5, 1): Bit 1: End
+ "ldr x20, [x16, #0x88]\n"
+ "fmla v28.4s, v7.4s, v11.4s\n"
+ "fmla v29.4s, v6.4s, v11.4s\n"
+ "add x20, x20, x15\n"
+ "tbz %x[n_channels], #1, 32f\n"
+ "ld1 { v10.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 33f\n"
+ "ld1 { v10.s }[2], [x20], #0x4\n"
+ "b 33f\n"
+ "32:" // Oddments: Load input (2, 1): Bit 1: Unset
+ "ld1 { v10.s }[0], [x20], #0x4\n"
+ "33:" // Oddments: Load input (2, 1): Bit 1: End
+ "ldr x20, [x16, #0x90]\n"
+ "fmla v16.4s, v7.4s, v10.4s\n"
+ "fmla v17.4s, v6.4s, v10.4s\n"
+ "add x20, x20, x15\n"
+ "fmla v20.4s, v4.4s, v10.4s\n"
+ "fmla v21.4s, v3.4s, v10.4s\n"
+ "fmla v24.4s, v1.4s, v10.4s\n"
+ "fmla v25.4s, v0.4s, v10.4s\n"
+ "tbz %x[n_channels], #1, 34f\n"
+ "ld1 { v11.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 35f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "b 35f\n"
+ "34:" // Oddments: Load input (5, 4): Bit 1: Unset
+ "ld1 { v11.s }[0], [x20], #0x4\n"
+ "35:" // Oddments: Load input (5, 4): Bit 1: End
+ "ldr x20, [x16, #0x98]\n"
+ "fmla v30.4s, v8.4s, v11.4s\n"
+ "fmla v31.4s, v7.4s, v11.4s\n"
+ "add x20, x20, x15\n"
+ "tbz %x[n_channels], #1, 36f\n"
+ "ld1 { v12.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 37f\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
+ "b 37f\n"
+ "36:" // Oddments: Load input (2, 4): Bit 1: Unset
+ "ld1 { v12.s }[0], [x20], #0x4\n"
+ "37:" // Oddments: Load input (2, 4): Bit 1: End
+ "ldr x20, [x16, #0xa0]\n"
+ "fmla v18.4s, v8.4s, v12.4s\n"
+ "fmla v19.4s, v7.4s, v12.4s\n"
+ "add x20, x20, x15\n"
+ "fmla v22.4s, v5.4s, v12.4s\n"
+ "fmla v23.4s, v4.4s, v12.4s\n"
+ "fmla v26.4s, v2.4s, v12.4s\n"
+ "fmla v27.4s, v1.4s, v12.4s\n"
+ "tbz %x[n_channels], #1, 38f\n"
+ "ld1 { v10.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 39f\n"
+ "ld1 { v10.s }[2], [x20], #0x4\n"
+ "b 39f\n"
+ "38:" // Oddments: Load input (0, 2): Bit 1: Unset
+ "ld1 { v10.s }[0], [x20], #0x4\n"
+ "39:" // Oddments: Load input (0, 2): Bit 1: End
+ "ldr x20, [x16, #0xa8]\n"
+ "fmla v16.4s, v2.4s, v10.4s\n"
+ "fmla v17.4s, v1.4s, v10.4s\n"
+ "add x20, x20, x15\n"
+ "fmla v18.4s, v0.4s, v10.4s\n"
+ "tbz %x[n_channels], #1, 40f\n"
+ "ld1 { v11.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 41f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "b 41f\n"
+ "40:" // Oddments: Load input (3, 1): Bit 1: Unset
+ "ld1 { v11.s }[0], [x20], #0x4\n"
+ "41:" // Oddments: Load input (3, 1): Bit 1: End
+ "ldr x20, [x16, #0xb0]\n"
+ "fmla v20.4s, v7.4s, v11.4s\n"
+ "fmla v21.4s, v6.4s, v11.4s\n"
+ "add x20, x20, x15\n"
+ "fmla v24.4s, v4.4s, v11.4s\n"
+ "fmla v25.4s, v3.4s, v11.4s\n"
+ "fmla v28.4s, v1.4s, v11.4s\n"
+ "fmla v29.4s, v0.4s, v11.4s\n"
+ "tbz %x[n_channels], #1, 42f\n"
+ "ld1 { v12.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 43f\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
+ "b 43f\n"
+ "42:" // Oddments: Load input (0, 3): Bit 1: Unset
+ "ld1 { v12.s }[0], [x20], #0x4\n"
+ "43:" // Oddments: Load input (0, 3): Bit 1: End
+ "ldr x20, [x16, #0xb8]\n"
+ "fmla v17.4s, v2.4s, v12.4s\n"
+ "fmla v18.4s, v1.4s, v12.4s\n"
+ "add x20, x20, x15\n"
+ "fmla v19.4s, v0.4s, v12.4s\n"
+ "tbz %x[n_channels], #1, 44f\n"
+ "ld1 { v10.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 45f\n"
+ "ld1 { v10.s }[2], [x20], #0x4\n"
+ "b 45f\n"
+ "44:" // Oddments: Load input (2, 0): Bit 1: Unset
+ "ld1 { v10.s }[0], [x20], #0x4\n"
+ "45:" // Oddments: Load input (2, 0): Bit 1: End
+ "ldr x20, [x16, #0xc0]\n"
+ "fmla v16.4s, v6.4s, v10.4s\n"
+ "fmla v20.4s, v3.4s, v10.4s\n"
+ "add x20, x20, x15\n"
+ "fmla v24.4s, v0.4s, v10.4s\n"
+ "tbz %x[n_channels], #1, 46f\n"
+ "ld1 { v11.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 47f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "b 47f\n"
+ "46:" // Oddments: Load input (3, 4): Bit 1: Unset
+ "ld1 { v11.s }[0], [x20], #0x4\n"
+ "47:" // Oddments: Load input (3, 4): Bit 1: End
+ "ldr x20, [x16, #0xc8]\n"
+ "fmla v22.4s, v8.4s, v11.4s\n"
+ "fmla v23.4s, v7.4s, v11.4s\n"
+ "add x20, x20, x15\n"
+ "fmla v26.4s, v5.4s, v11.4s\n"
+ "fmla v27.4s, v4.4s, v11.4s\n"
+ "fmla v30.4s, v2.4s, v11.4s\n"
+ "fmla v31.4s, v1.4s, v11.4s\n"
+ "tbz %x[n_channels], #1, 48f\n"
+ "ld1 { v12.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 49f\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
+ "b 49f\n"
+ "48:" // Oddments: Load input (2, 5): Bit 1: Unset
+ "ld1 { v12.s }[0], [x20], #0x4\n"
+ "49:" // Oddments: Load input (2, 5): Bit 1: End
+ "ldr x20, [x16, #0xd0]\n"
+ "fmla v19.4s, v8.4s, v12.4s\n"
+ "fmla v23.4s, v5.4s, v12.4s\n"
+ "add x20, x20, x15\n"
+ "fmla v27.4s, v2.4s, v12.4s\n"
+ "tbz %x[n_channels], #1, 50f\n"
+ "ld1 { v10.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 51f\n"
+ "ld1 { v10.s }[2], [x20], #0x4\n"
+ "b 51f\n"
+ "50:" // Oddments: Load input (3, 0): Bit 1: Unset
+ "ld1 { v10.s }[0], [x20], #0x4\n"
+ "51:" // Oddments: Load input (3, 0): Bit 1: End
+ "ldr x20, [x16, #0xd8]\n"
+ "fmla v20.4s, v6.4s, v10.4s\n"
+ "fmla v24.4s, v3.4s, v10.4s\n"
+ "add x20, x20, x15\n"
+ "fmla v28.4s, v0.4s, v10.4s\n"
+ "tbz %x[n_channels], #1, 52f\n"
+ "ld1 { v11.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 53f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "b 53f\n"
+ "52:" // Oddments: Load input (4, 2): Bit 1: Unset
+ "ld1 { v11.s }[0], [x20], #0x4\n"
+ "53:" // Oddments: Load input (4, 2): Bit 1: End
+ "ldr x20, [x16, #0xe0]\n"
+ "fmla v24.4s, v8.4s, v11.4s\n"
+ "fmla v25.4s, v7.4s, v11.4s\n"
+ "add x20, x20, x15\n"
+ "fmla v26.4s, v6.4s, v11.4s\n"
+ "fmla v28.4s, v5.4s, v11.4s\n"
+ "fmla v29.4s, v4.4s, v11.4s\n"
+ "fmla v30.4s, v3.4s, v11.4s\n"
+ "tbz %x[n_channels], #1, 54f\n"
+ "ld1 { v12.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 55f\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
+ "b 55f\n"
+ "54:" // Oddments: Load input (3, 5): Bit 1: Unset
+ "ld1 { v12.s }[0], [x20], #0x4\n"
+ "55:" // Oddments: Load input (3, 5): Bit 1: End
+ "ldr x20, [x16, #0xe8]\n"
+ "fmla v23.4s, v8.4s, v12.4s\n"
+ "fmla v27.4s, v5.4s, v12.4s\n"
+ "add x20, x20, x15\n"
+ "fmla v31.4s, v2.4s, v12.4s\n"
+ "tbz %x[n_channels], #1, 56f\n"
+ "ld1 { v10.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 57f\n"
+ "ld1 { v10.s }[2], [x20], #0x4\n"
+ "b 57f\n"
+ "56:" // Oddments: Load input (5, 2): Bit 1: Unset
+ "ld1 { v10.s }[0], [x20], #0x4\n"
+ "57:" // Oddments: Load input (5, 2): Bit 1: End
+ "ldr x20, [x16, #0xf0]\n"
+ "fmla v28.4s, v8.4s, v10.4s\n"
+ "fmla v29.4s, v7.4s, v10.4s\n"
+ "add x20, x20, x15\n"
+ "fmla v30.4s, v6.4s, v10.4s\n"
+ "tbz %x[n_channels], #1, 58f\n"
+ "ld1 { v11.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 59f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "b 59f\n"
+ "58:" // Oddments: Load input (4, 3): Bit 1: Unset
+ "ld1 { v11.s }[0], [x20], #0x4\n"
+ "59:" // Oddments: Load input (4, 3): Bit 1: End
+ "ldr x20, [x16, #0xf8]\n"
+ "fmla v25.4s, v8.4s, v11.4s\n"
+ "fmla v26.4s, v7.4s, v11.4s\n"
+ "add x20, x20, x15\n"
+ "fmla v27.4s, v6.4s, v11.4s\n"
+ "fmla v29.4s, v5.4s, v11.4s\n"
+ "fmla v30.4s, v4.4s, v11.4s\n"
+ "fmla v31.4s, v3.4s, v11.4s\n"
+ "tbz %x[n_channels], #1, 60f\n"
+ "ld1 { v12.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 61f\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
+ "b 61f\n"
+ "60:" // Oddments: Load input (5, 3): Bit 1: Unset
+ "ld1 { v12.s }[0], [x20], #0x4\n"
+ "61:" // Oddments: Load input (5, 3): Bit 1: End
+ "ldr x20, [x16, #0x100]\n"
+ "fmla v29.4s, v8.4s, v12.4s\n"
+ "fmla v30.4s, v7.4s, v12.4s\n"
+ "add x20, x20, x15\n"
+ "fmla v31.4s, v6.4s, v12.4s\n"
+ "tbz %x[n_channels], #1, 62f\n"
+ "ld1 { v10.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 63f\n"
+ "ld1 { v10.s }[2], [x20], #0x4\n"
+ "b 63f\n"
+ "62:" // Oddments: Load input (1, 1): Bit 1: Unset
+ "ld1 { v10.s }[0], [x20], #0x4\n"
+ "63:" // Oddments: Load input (1, 1): Bit 1: End
+ "ldr x20, [x16, #0x108]\n"
+ "fmla v16.4s, v4.4s, v10.4s\n"
+ "fmla v17.4s, v3.4s, v10.4s\n"
+ "add x20, x20, x15\n"
+ "fmla v20.4s, v1.4s, v10.4s\n"
+ "fmla v21.4s, v0.4s, v10.4s\n"
+ "tbz %x[n_channels], #1, 64f\n"
+ "ld1 { v11.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 65f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "b 65f\n"
+ "64:" // Oddments: Load input (1, 4): Bit 1: Unset
+ "ld1 { v11.s }[0], [x20], #0x4\n"
+ "65:" // Oddments: Load input (1, 4): Bit 1: End
+ "ldr x20, [x16, #0x110]\n"
+ "fmla v18.4s, v5.4s, v11.4s\n"
+ "fmla v19.4s, v4.4s, v11.4s\n"
+ "add x20, x20, x15\n"
+ "fmla v22.4s, v2.4s, v11.4s\n"
+ "fmla v23.4s, v1.4s, v11.4s\n"
+ "tbz %x[n_channels], #1, 66f\n"
+ "ld1 { v12.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 67f\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
+ "b 67f\n"
+ "66:" // Oddments: Load input (4, 1): Bit 1: Unset
+ "ld1 { v12.s }[0], [x20], #0x4\n"
+ "67:" // Oddments: Load input (4, 1): Bit 1: End
+ "ldr x20, [x16, #0x118]\n"
+ "fmla v24.4s, v7.4s, v12.4s\n"
+ "fmla v25.4s, v6.4s, v12.4s\n"
+ "add x20, x20, x15\n"
+ "fmla v28.4s, v4.4s, v12.4s\n"
+ "fmla v29.4s, v3.4s, v12.4s\n"
+ "tbz %x[n_channels], #1, 68f\n"
+ "ld1 { v10.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 69f\n"
+ "ld1 { v10.s }[2], [x20], #0x4\n"
+ "b 69f\n"
+ "68:" // Oddments: Load input (4, 4): Bit 1: Unset
+ "ld1 { v10.s }[0], [x20], #0x4\n"
+ "69:" // Oddments: Load input (4, 4): Bit 1: End
+ "fmla v26.4s, v8.4s, v10.4s\n"
+ "fmla v27.4s, v7.4s, v10.4s\n"
+ "fmax v16.4s, v16.4s, v13.4s\n"
+ "fmla v30.4s, v5.4s, v10.4s\n"
+ "fmla v31.4s, v4.4s, v10.4s\n"
+ "fmax v17.4s, v17.4s, v13.4s\n"
+ "fmax v18.4s, v18.4s, v13.4s\n"
+ "fmax v19.4s, v19.4s, v13.4s\n"
+ "fmax v20.4s, v20.4s, v13.4s\n"
+ "fmax v21.4s, v21.4s, v13.4s\n"
+ "fmax v22.4s, v22.4s, v13.4s\n"
+ "fmax v23.4s, v23.4s, v13.4s\n"
+ "fmax v24.4s, v24.4s, v13.4s\n"
+ "fmax v25.4s, v25.4s, v13.4s\n"
+ "fmax v26.4s, v26.4s, v13.4s\n"
+ "fmax v27.4s, v27.4s, v13.4s\n"
+ "fmax v28.4s, v28.4s, v13.4s\n"
+ "fmax v29.4s, v29.4s, v13.4s\n"
+ "fmax v30.4s, v30.4s, v13.4s\n"
+ "fmax v31.4s, v31.4s, v13.4s\n"
+ "fmin v16.4s, v16.4s, v14.4s\n"
+ "fmin v17.4s, v17.4s, v14.4s\n"
+ "fmin v18.4s, v18.4s, v14.4s\n"
+ "fmin v19.4s, v19.4s, v14.4s\n"
+ "fmin v20.4s, v20.4s, v14.4s\n"
+ "fmin v21.4s, v21.4s, v14.4s\n"
+ "fmin v22.4s, v22.4s, v14.4s\n"
+ "fmin v23.4s, v23.4s, v14.4s\n"
+ "fmin v24.4s, v24.4s, v14.4s\n"
+ "fmin v25.4s, v25.4s, v14.4s\n"
+ "fmin v26.4s, v26.4s, v14.4s\n"
+ "fmin v27.4s, v27.4s, v14.4s\n"
+ "fmin v28.4s, v28.4s, v14.4s\n"
+ "fmin v29.4s, v29.4s, v14.4s\n"
+ "fmin v30.4s, v30.4s, v14.4s\n"
+ "fmin v31.4s, v31.4s, v14.4s\n"
+ "tbz %x[n_channels], #1, 70f\n"
+ "ldr x23, [x8, #0x0]\n"
+ "ldr x22, [x8, #0x8]\n"
+ "add x23, x23, x14\n"
+ "add x22, x22, x14\n"
+ "ldr x21, [x8, #0x10]\n"
+ "ldr x20, [x8, #0x18]\n"
+ "add x21, x21, x14\n"
+ "add x20, x20, x14\n"
+ "st1 { v16.d }[0], [x23]\n"
+ "ldr x23, [x8, #0x20]\n"
+ "add x23, x23, x14\n"
+ "st1 { v17.d }[0], [x22]\n"
+ "ldr x22, [x8, #0x28]\n"
+ "add x22, x22, x14\n"
+ "st1 { v18.d }[0], [x21]\n"
+ "ldr x21, [x8, #0x30]\n"
+ "add x21, x21, x14\n"
+ "st1 { v19.d }[0], [x20]\n"
+ "ldr x20, [x8, #0x38]\n"
+ "add x20, x20, x14\n"
+ "st1 { v20.d }[0], [x23]\n"
+ "ldr x23, [x8, #0x40]\n"
+ "add x23, x23, x14\n"
+ "st1 { v21.d }[0], [x22]\n"
+ "ldr x22, [x8, #0x48]\n"
+ "add x22, x22, x14\n"
+ "st1 { v22.d }[0], [x21]\n"
+ "ldr x21, [x8, #0x50]\n"
+ "add x21, x21, x14\n"
+ "st1 { v23.d }[0], [x20]\n"
+ "ldr x20, [x8, #0x58]\n"
+ "add x20, x20, x14\n"
+ "st1 { v24.d }[0], [x23]\n"
+ "ldr x23, [x8, #0x60]\n"
+ "add x23, x23, x14\n"
+ "st1 { v25.d }[0], [x22]\n"
+ "ldr x22, [x8, #0x68]\n"
+ "add x22, x22, x14\n"
+ "st1 { v26.d }[0], [x21]\n"
+ "ldr x21, [x8, #0x70]\n"
+ "add x21, x21, x14\n"
+ "st1 { v27.d }[0], [x20]\n"
+ "ldr x20, [x8, #0x78]\n"
+ "add x20, x20, x14\n"
+ "add x14, x14, #0x8\n"
+ "st1 { v28.d }[0], [x23]\n"
+ "st1 { v29.d }[0], [x22]\n"
+ "st1 { v30.d }[0], [x21]\n"
+ "st1 { v31.d }[0], [x20]\n"
+ "tbz %x[n_channels], #0, 71f\n"
+ "ldr x23, [x8, #0x0]\n"
+ "ldr x22, [x8, #0x8]\n"
+ "add x23, x23, x14\n"
+ "add x22, x22, x14\n"
+ "ldr x21, [x8, #0x10]\n"
+ "ldr x20, [x8, #0x18]\n"
+ "add x21, x21, x14\n"
+ "add x20, x20, x14\n"
+ "st1 { v16.s }[2], [x23]\n"
+ "ldr x23, [x8, #0x20]\n"
+ "add x23, x23, x14\n"
+ "st1 { v17.s }[2], [x22]\n"
+ "ldr x22, [x8, #0x28]\n"
+ "add x22, x22, x14\n"
+ "st1 { v18.s }[2], [x21]\n"
+ "ldr x21, [x8, #0x30]\n"
+ "add x21, x21, x14\n"
+ "st1 { v19.s }[2], [x20]\n"
+ "ldr x20, [x8, #0x38]\n"
+ "add x20, x20, x14\n"
+ "st1 { v20.s }[2], [x23]\n"
+ "ldr x23, [x8, #0x40]\n"
+ "add x23, x23, x14\n"
+ "st1 { v21.s }[2], [x22]\n"
+ "ldr x22, [x8, #0x48]\n"
+ "add x22, x22, x14\n"
+ "st1 { v22.s }[2], [x21]\n"
+ "ldr x21, [x8, #0x50]\n"
+ "add x21, x21, x14\n"
+ "st1 { v23.s }[2], [x20]\n"
+ "ldr x20, [x8, #0x58]\n"
+ "add x20, x20, x14\n"
+ "st1 { v24.s }[2], [x23]\n"
+ "ldr x23, [x8, #0x60]\n"
+ "add x23, x23, x14\n"
+ "st1 { v25.s }[2], [x22]\n"
+ "ldr x22, [x8, #0x68]\n"
+ "add x22, x22, x14\n"
+ "st1 { v26.s }[2], [x21]\n"
+ "ldr x21, [x8, #0x70]\n"
+ "add x21, x21, x14\n"
+ "st1 { v27.s }[2], [x20]\n"
+ "ldr x20, [x8, #0x78]\n"
+ "add x20, x20, x14\n"
+ "st1 { v28.s }[2], [x23]\n"
+ "st1 { v29.s }[2], [x22]\n"
+ "st1 { v30.s }[2], [x21]\n"
+ "st1 { v31.s }[2], [x20]\n"
+ "b 71f\n"
+ "70:" // Oddments: Store: Bit 1: Unset
+ "ldr x23, [x8, #0x0]\n"
+ "ldr x22, [x8, #0x8]\n"
+ "add x23, x23, x14\n"
+ "add x22, x22, x14\n"
+ "ldr x21, [x8, #0x10]\n"
+ "ldr x20, [x8, #0x18]\n"
+ "add x21, x21, x14\n"
+ "add x20, x20, x14\n"
+ "st1 { v16.s }[0], [x23]\n"
+ "ldr x23, [x8, #0x20]\n"
+ "add x23, x23, x14\n"
+ "st1 { v17.s }[0], [x22]\n"
+ "ldr x22, [x8, #0x28]\n"
+ "add x22, x22, x14\n"
+ "st1 { v18.s }[0], [x21]\n"
+ "ldr x21, [x8, #0x30]\n"
+ "add x21, x21, x14\n"
+ "st1 { v19.s }[0], [x20]\n"
+ "ldr x20, [x8, #0x38]\n"
+ "add x20, x20, x14\n"
+ "st1 { v20.s }[0], [x23]\n"
+ "ldr x23, [x8, #0x40]\n"
+ "add x23, x23, x14\n"
+ "st1 { v21.s }[0], [x22]\n"
+ "ldr x22, [x8, #0x48]\n"
+ "add x22, x22, x14\n"
+ "st1 { v22.s }[0], [x21]\n"
+ "ldr x21, [x8, #0x50]\n"
+ "add x21, x21, x14\n"
+ "st1 { v23.s }[0], [x20]\n"
+ "ldr x20, [x8, #0x58]\n"
+ "add x20, x20, x14\n"
+ "st1 { v24.s }[0], [x23]\n"
+ "ldr x23, [x8, #0x60]\n"
+ "add x23, x23, x14\n"
+ "st1 { v25.s }[0], [x22]\n"
+ "ldr x22, [x8, #0x68]\n"
+ "add x22, x22, x14\n"
+ "st1 { v26.s }[0], [x21]\n"
+ "ldr x21, [x8, #0x70]\n"
+ "add x21, x21, x14\n"
+ "st1 { v27.s }[0], [x20]\n"
+ "ldr x20, [x8, #0x78]\n"
+ "add x20, x20, x14\n"
+ "st1 { v28.s }[0], [x23]\n"
+ "st1 { v29.s }[0], [x22]\n"
+ "st1 { v30.s }[0], [x21]\n"
+ "st1 { v31.s }[0], [x20]\n"
+ "71:" // Oddments: Store: Bit 1: End
+ "72:" // End
+ :
+ : [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
new file mode 100644
index 0000000000..f727efea80
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl(const float *const *const input_ptrs, float *const *const outptrs, const void *params, unsigned int n_channels, const float activation_min, const float activation_max);
+void a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl(const unsigned int n_tile_rows, const unsigned int n_tile_cols, const float *inptr, int64_t ld_input_row, int64_t ld_input_col, float *outptr, int64_t ld_output_row, int64_t ld_output_col, const void *params, unsigned int n_channels, const float activation_min, const float activation_max);
+
+class a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy<float, float, float, float>  // Strategy for the FP32 NHWC 3x3 stride-2 depthwise kernel that emits a 2x2 output tile per call.
+{
+  private:
+  using Parent = DepthwiseDepthfirstStrategy<float, float, float, float>;
+  Parent::IndirectKernelType m_indirect_kernel = a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl;  // Variant driven by arrays of input/output row pointers.
+  Parent::DirectKernelType m_direct_kernel = a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl;  // Variant that walks a tiled tensor via row/column strides.
+
+  public:
+  using return_type = float;
+  constexpr static auto vl_type = arm_gemm::VLType::None;  // Fixed-length NEON; presumably no SVE dependence -- confirm against VLType semantics.
+
+  constexpr static unsigned int kernel_rows = 3;  // 3x3 depthwise filter.
+  constexpr static unsigned int kernel_cols = 3;
+
+  constexpr static unsigned int stride_rows = 2;  // Stride 2 in both spatial dimensions.
+  constexpr static unsigned int stride_cols = 2;
+
+  constexpr static unsigned int output_rows = 2;  // Each invocation computes a 2x2 block of output points.
+  constexpr static unsigned int output_cols = 2;
+
+  a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst(const CPUInfo *)
+  : Parent(output_rows, output_cols, kernel_rows, kernel_cols, stride_rows, stride_cols) {}  // CPUInfo is accepted for interface uniformity but unused here.
+
+  arm_gemm::VLType get_vl_type(void) const override { return vl_type; }
+
+  Parent::IndirectKernelType get_indirect_kernel() const override { return m_indirect_kernel; }
+  Parent::DirectKernelType get_direct_kernel() const override { return m_direct_kernel; }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp
new file mode 100644
index 0000000000..5ab61fad4c
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp
@@ -0,0 +1,615 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl(
+ const unsigned int n_tile_rows,
+ const unsigned int n_tile_cols,
+ const float *inptr,
+ int64_t ld_input_row,
+ int64_t ld_input_col,
+ float *outptr,
+ int64_t ld_output_row,
+ int64_t ld_output_col,
+ const void *params,
+ unsigned int n_channels,
+ const float activation_min,
+ const float activation_max
+)
+{
+ struct Args
+ {
+ const uint64_t n_tile_rows, n_tile_cols;
+ const float *inptr;
+ const uint64_t ld_input_row;
+ const uint64_t ld_input_col;
+ float *outptr;
+ const uint64_t ld_output_row;
+ const uint64_t ld_output_col;
+ const void *params;
+ const float min, max;
+
+ uint64_t tile_i = 0, tile_j = 0;
+
+ Args(
+ const unsigned int n_tile_rows,
+ const unsigned int n_tile_cols,
+ const float *inptr,
+ int64_t ld_input_row,
+ int64_t ld_input_col,
+ float *outptr,
+ int64_t ld_output_row,
+ int64_t ld_output_col,
+ const void *params,
+ const float activation_min,
+ const float activation_max
+ ) : n_tile_rows(n_tile_rows), n_tile_cols(n_tile_cols), inptr(inptr),
+ ld_input_row(ld_input_row), ld_input_col(ld_input_col), outptr(outptr),
+ ld_output_row(ld_output_row), ld_output_col(ld_output_col),
+ params(params), min(activation_min), max(activation_max)
+ {
+ }
+ };
+
+ Args params_struct(
+ n_tile_rows, n_tile_cols,
+ inptr, ld_input_row, ld_input_col,
+ outptr, ld_output_row, ld_output_col,
+ params, activation_min, activation_max
+ );
+
+ __asm__ __volatile__(
+ "mov x23, #0x0\n"
+ "mov x27, #0x0\n"
+ "1:" // Tile loop
+ "str x23, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "mov x26, #0x4\n"
+ "mov x25, #0x2\n"
+ "str x27, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "ldr x24, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
+ "ldr x6, [%x[params_struct], %[offsetof_args_ld_input_col]]\n"
+ "mul x22, x23, x24\n" // offset = tile_i * ld_input_row
+ "ldr x21, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
+ "madd x22, x27, x6, x22\n" // offset += tile_j * ld_input_col
+ "ldr x7, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
+ "lsl x6, x6, #0x2\n"
+ "mul x20, x23, x21\n" // offset = tile_i * ld_output_row
+ "ldr x8, [%x[params_struct], %[offsetof_args_inptr]]\n"
+ "ldr x17, [%x[params_struct], %[offsetof_args_outptr]]\n"
+ "mov x23, #0x10\n" // cntb _, ALL, #1
+ "mul x22, x22, x26\n" // offset *= kernel_stride * output_size
+ "add x8, x8, x22, LSL #2\n" // inptr[0] += offset * sizeof(float)
+ "add x16, x8, x24, LSL #2\n"
+ "ldr x15, [%x[params_struct], %[offsetof_args_params]]\n"
+ "madd x20, x27, x7, x20\n" // offset += tile_j * ld_output_col
+ "lsr x22, %x[n_channels], #0x2\n"
+ "add x14, x16, x24, LSL #2\n"
+ "mul x20, x20, x25\n" // offset *= output_tile_size
+ "add x13, x6, x6\n"
+ "add x12, x14, x24, LSL #2\n"
+ "add x11, x13, x6\n"
+ "add x17, x17, x20, LSL #2\n" // outptrs[0] += offset * sizeof(float)
+ "add x20, %x[params_struct], %[offsetof_args_min]\n"
+ "ld1r { v26.4s }, [x20]\n"
+ "add x20, %x[params_struct], %[offsetof_args_max]\n"
+ "ld1r { v27.4s }, [x20]\n"
+ "add x10, x12, x24, LSL #2\n"
+ "add x9, x11, x6\n"
+ "add x28, x17, x21, LSL #2\n"
+ "lsl x7, x7, #0x2\n"
+ "mov x21, #0x0\n"
+ "sub x20, XZR, x23\n"
+ "cbz x22, 4f\n"
+ "ldr q31, [x15, #0x0]\n"
+ "ldr q0, [x15, #0x10]\n"
+ "cmp x23, x22, LSL #4\n"
+ "ldr q1, [x15, #0x20]\n"
+ "ldr q2, [x15, #0x30]\n"
+ "ldr q3, [x15, #0x40]\n"
+ "ldr q4, [x15, #0x50]\n"
+ "ldr q5, [x15, #0x60]\n"
+ "ldr q6, [x15, #0x70]\n"
+ "ldr q7, [x15, #0x80]\n"
+ "ldr q8, [x15, #0x90]\n"
+ "add x15, x15, #0xa0\n"
+ "ldr q9, [x14, x13]\n"
+ "ld1 { v10.4s }, [x8]\n"
+ "ldr q11, [x8, x6]\n"
+ "ldr q12, [x8, x11]\n"
+ "ldr q13, [x8, x9]\n"
+ "ld1 { v14.4s }, [x16]\n"
+ "ldr q15, [x16, x6]\n"
+ "ldr q16, [x8, x13]\n"
+ "bge 3f\n"
+ "2:" // Tile loop: Channel loop
+ "mov v29.16b, v31.16b\n fmla v29.4s, v8.4s, v9.4s\n"
+ "mov v28.16b, v31.16b\n fmla v28.4s, v6.4s, v9.4s\n"
+ "add x23, x23, #0x10\n"
+ "add x8, x8, #0x10\n"
+ "fmla v29.4s, v0.4s, v10.4s\n"
+ "ld1 { v10.4s }, [x8]\n"
+ "fmla v28.4s, v1.4s, v12.4s\n"
+ "ldr q21, [x16, x9]\n"
+ "fmla v29.4s, v1.4s, v11.4s\n"
+ "ldr q18, [x16, x11]\n"
+ "fmla v28.4s, v2.4s, v13.4s\n"
+ "ldr q17, [x16, x13]\n"
+ "fmla v29.4s, v3.4s, v14.4s\n"
+ "ld1 { v20.4s }, [x12]\n"
+ "fmla v28.4s, v0.4s, v16.4s\n"
+ "add x16, x16, #0x10\n"
+ "fmla v29.4s, v4.4s, v15.4s\n"
+ "ld1 { v25.4s }, [x14]\n"
+ "fmla v28.4s, v4.4s, v18.4s\n"
+ "ldr q19, [x12, x6]\n"
+ "fmla v29.4s, v2.4s, v16.4s\n"
+ "ldr q18, [x14, x6]\n"
+ "fmla v28.4s, v5.4s, v21.4s\n"
+ "ldr q24, [x14, x11]\n"
+ "mov v23.16b, v31.16b\n fmla v23.4s, v2.4s, v9.4s\n"
+ "mov v22.16b, v31.16b\n fmla v22.4s, v0.4s, v9.4s\n"
+ "ldr q31, [x15, #0x0]\n"
+ "cmp x23, x22, LSL #4\n"
+ "fmla v29.4s, v5.4s, v17.4s\n"
+ "fmla v28.4s, v3.4s, v17.4s\n"
+ "ldr q17, [x12, x11]\n"
+ "add x20, x20, #0x10\n"
+ "fmla v23.4s, v3.4s, v20.4s\n"
+ "ldr q16, [x12, x9]\n"
+ "fmla v22.4s, v4.4s, v17.4s\n"
+ "ldr q21, [x10, x6]\n"
+ "fmla v23.4s, v0.4s, v25.4s\n"
+ "ldr q0, [x15, #0x10]\n"
+ "fmla v22.4s, v1.4s, v24.4s\n"
+ "add x21, x21, #0x10\n"
+ "fmla v23.4s, v4.4s, v19.4s\n"
+ "ldr q20, [x14, x9]\n"
+ "ldr q4, [x15, #0x50]\n"
+ "fmla v22.4s, v5.4s, v16.4s\n"
+ "ldr q19, [x10, x11]\n"
+ "fmla v29.4s, v6.4s, v25.4s\n"
+ "ld1 { v17.4s }, [x10]\n"
+ "fmla v23.4s, v1.4s, v18.4s\n"
+ "ldr q1, [x15, #0x20]\n"
+ "fmla v22.4s, v2.4s, v20.4s\n"
+ "ldr q2, [x15, #0x30]\n"
+ "fmla v29.4s, v7.4s, v18.4s\n"
+ "ldr q16, [x12, x13]\n"
+ "fmla v23.4s, v6.4s, v17.4s\n"
+ "ldr q18, [x10, x13]\n"
+ "fmla v22.4s, v3.4s, v16.4s\n"
+ "ldr q3, [x15, #0x40]\n"
+ "fmla v23.4s, v7.4s, v21.4s\n"
+ "ldr q13, [x8, x9]\n"
+ "fmla v22.4s, v7.4s, v19.4s\n"
+ "ld1 { v14.4s }, [x16]\n"
+ "fmla v28.4s, v7.4s, v24.4s\n"
+ "ldr q12, [x8, x11]\n"
+ "fmla v23.4s, v5.4s, v16.4s\n"
+ "ldr q16, [x8, x13]\n"
+ "ldr q5, [x15, #0x60]\n"
+ "fmla v22.4s, v6.4s, v18.4s\n"
+ "fmla v28.4s, v8.4s, v20.4s\n"
+ "ldr q17, [x10, x9]\n"
+ "ldr q6, [x15, #0x70]\n"
+ "fmla v23.4s, v8.4s, v18.4s\n"
+ "fmla v22.4s, v8.4s, v17.4s\n"
+ "ldr q11, [x8, x6]\n"
+ "ldr q15, [x16, x6]\n"
+ "fmax v29.4s, v29.4s, v26.4s\n"
+ "fmax v28.4s, v28.4s, v26.4s\n"
+ "ldr q7, [x15, #0x80]\n"
+ "ldr q8, [x15, #0x90]\n"
+ "fmax v23.4s, v23.4s, v26.4s\n"
+ "fmax v22.4s, v22.4s, v26.4s\n"
+ "add x14, x14, #0x10\n"
+ "ldr q9, [x14, x13]\n"
+ "fmin v29.4s, v29.4s, v27.4s\n"
+ "fmin v28.4s, v28.4s, v27.4s\n"
+ "fmin v23.4s, v23.4s, v27.4s\n"
+ "fmin v22.4s, v22.4s, v27.4s\n"
+ "add x12, x12, #0x10\n"
+ "add x10, x10, #0x10\n"
+ "st1 { v29.4s }, [x17]\n"
+ "add x15, x15, #0xa0\n"
+ "str q28, [x17, x7]\n"
+ "add x17, x17, #0x10\n"
+ "st1 { v23.4s }, [x28]\n"
+ "str q22, [x28, x7]\n"
+ "add x28, x28, #0x10\n"
+ "blt 2b\n"
+ "3:" // Tile loop: Channel tail
+ "mov v29.16b, v31.16b\n fmla v29.4s, v8.4s, v9.4s\n"
+ "mov v28.16b, v31.16b\n fmla v28.4s, v6.4s, v9.4s\n"
+ "add x8, x8, #0x10\n"
+ "fmla v29.4s, v0.4s, v10.4s\n"
+ "fmla v28.4s, v1.4s, v12.4s\n"
+ "ldr q20, [x16, x9]\n"
+ "fmla v29.4s, v1.4s, v11.4s\n"
+ "ldr q18, [x16, x11]\n"
+ "fmla v28.4s, v2.4s, v13.4s\n"
+ "ldr q17, [x16, x13]\n"
+ "fmla v29.4s, v3.4s, v14.4s\n"
+ "ld1 { v19.4s }, [x12]\n"
+ "fmla v28.4s, v0.4s, v16.4s\n"
+ "add x16, x16, #0x10\n"
+ "fmla v29.4s, v4.4s, v15.4s\n"
+ "ld1 { v25.4s }, [x14]\n"
+ "fmla v28.4s, v4.4s, v18.4s\n"
+ "ldr q18, [x12, x6]\n"
+ "fmla v29.4s, v2.4s, v16.4s\n"
+ "ldr q24, [x14, x6]\n"
+ "fmla v28.4s, v5.4s, v20.4s\n"
+ "ldr q23, [x14, x11]\n"
+ "mov v22.16b, v31.16b\n fmla v22.4s, v2.4s, v9.4s\n"
+ "mov v21.16b, v31.16b\n fmla v21.4s, v0.4s, v9.4s\n"
+ "fmla v29.4s, v5.4s, v17.4s\n"
+ "fmla v28.4s, v3.4s, v17.4s\n"
+ "ldr q17, [x12, x11]\n"
+ "fmla v22.4s, v3.4s, v19.4s\n"
+ "ldr q16, [x12, x9]\n"
+ "fmla v21.4s, v4.4s, v17.4s\n"
+ "ldr q20, [x10, x6]\n"
+ "fmla v22.4s, v0.4s, v25.4s\n"
+ "fmla v21.4s, v1.4s, v23.4s\n"
+ "fmla v22.4s, v4.4s, v18.4s\n"
+ "ldr q19, [x14, x9]\n"
+ "fmla v21.4s, v5.4s, v16.4s\n"
+ "ldr q18, [x10, x11]\n"
+ "fmla v29.4s, v6.4s, v25.4s\n"
+ "ld1 { v17.4s }, [x10]\n"
+ "fmla v22.4s, v1.4s, v24.4s\n"
+ "add x14, x14, #0x10\n"
+ "fmla v21.4s, v2.4s, v19.4s\n"
+ "fmla v29.4s, v7.4s, v24.4s\n"
+ "ldr q16, [x12, x13]\n"
+ "fmax v29.4s, v29.4s, v26.4s\n"
+ "fmla v22.4s, v6.4s, v17.4s\n"
+ "ldr q17, [x10, x13]\n"
+ "fmla v21.4s, v3.4s, v16.4s\n"
+ "fmin v29.4s, v29.4s, v27.4s\n"
+ "fmla v22.4s, v7.4s, v20.4s\n"
+ "fmla v21.4s, v7.4s, v18.4s\n"
+ "st1 { v29.4s }, [x17]\n"
+ "add x12, x12, #0x10\n"
+ "fmla v28.4s, v7.4s, v23.4s\n"
+ "fmla v22.4s, v5.4s, v16.4s\n"
+ "fmla v21.4s, v6.4s, v17.4s\n"
+ "fmla v28.4s, v8.4s, v19.4s\n"
+ "ldr q16, [x10, x9]\n"
+ "fmax v28.4s, v28.4s, v26.4s\n"
+ "fmla v22.4s, v8.4s, v17.4s\n"
+ "fmla v21.4s, v8.4s, v16.4s\n"
+ "fmax v22.4s, v22.4s, v26.4s\n"
+ "add x10, x10, #0x10\n"
+ "fmax v21.4s, v21.4s, v26.4s\n"
+ "fmin v28.4s, v28.4s, v27.4s\n"
+ "str q28, [x17, x7]\n"
+ "add x17, x17, #0x10\n"
+ "fmin v22.4s, v22.4s, v27.4s\n"
+ "fmin v21.4s, v21.4s, v27.4s\n"
+ "st1 { v22.4s }, [x28]\n"
+ "str q21, [x28, x7]\n"
+ "add x28, x28, #0x10\n"
+ "4:" // Tile loop: Oddments
+ "tst %x[n_channels], #0x3\n"
+ "beq 43f\n"
+ "ldr q31, [x15, #0x0]\n"
+ "ldr q0, [x15, #0x10]\n"
+ "add x27, x14, x13\n"
+ "add x26, x8, XZR\n"
+ "ldr q1, [x15, #0x20]\n"
+ "ldr q2, [x15, #0x30]\n"
+ "add x25, x8, x6\n"
+ "add x24, x8, x11\n"
+ "ldr q3, [x15, #0x40]\n"
+ "ldr q4, [x15, #0x50]\n"
+ "add x23, x8, x9\n"
+ "add x22, x16, XZR\n"
+ "ldr q5, [x15, #0x60]\n"
+ "ldr q6, [x15, #0x70]\n"
+ "add x21, x16, x6\n"
+ "add x20, x8, x13\n"
+ "ldr q7, [x15, #0x80]\n"
+ "ldr q8, [x15, #0x90]\n"
+ "tbz %x[n_channels], #1, 5f\n"
+ "ldr d9, [x27], #0x8\n"
+ "ldr d10, [x26], #0x8\n"
+ "ldr d11, [x25], #0x8\n"
+ "ldr d12, [x24], #0x8\n"
+ "ldr d13, [x23], #0x8\n"
+ "ldr d14, [x22], #0x8\n"
+ "ldr d15, [x21], #0x8\n"
+ "ldr d16, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 6f\n"
+ "ld1 { v9.s }[2], [x27]\n"
+ "ld1 { v10.s }[2], [x26]\n"
+ "ld1 { v11.s }[2], [x25]\n"
+ "ld1 { v12.s }[2], [x24]\n"
+ "ld1 { v13.s }[2], [x23]\n"
+ "ld1 { v14.s }[2], [x22]\n"
+ "ld1 { v15.s }[2], [x21]\n"
+ "ld1 { v16.s }[2], [x20]\n"
+ "b 6f\n"
+ "5:" // Tile loop: Oddments: Load inputs: (2, 2), (0, 0), (0, 1), (0, 3), (0, 4), (1, 0), (1, 1), (0, 2): Bit 1: Unset
+ "ldr s9, [x27, #0x0]\n"
+ "ldr s10, [x26, #0x0]\n"
+ "ldr s11, [x25, #0x0]\n"
+ "ldr s12, [x24, #0x0]\n"
+ "ldr s13, [x23, #0x0]\n"
+ "ldr s14, [x22, #0x0]\n"
+ "ldr s15, [x21, #0x0]\n"
+ "ldr s16, [x20, #0x0]\n"
+ "6:" // Tile loop: Oddments: Load inputs: (2, 2), (0, 0), (0, 1), (0, 3), (0, 4), (1, 0), (1, 1), (0, 2): Bit 1: End
+ "mov v28.16b, v31.16b\n fmla v28.4s, v8.4s, v9.4s\n"
+ "fmla v28.4s, v0.4s, v10.4s\n"
+ "add x20, x16, x11\n"
+ "mov v29.16b, v31.16b\n fmla v29.4s, v6.4s, v9.4s\n"
+ "fmla v28.4s, v1.4s, v11.4s\n"
+ "fmla v29.4s, v1.4s, v12.4s\n"
+ "fmla v28.4s, v3.4s, v14.4s\n"
+ "fmla v29.4s, v2.4s, v13.4s\n"
+ "fmla v28.4s, v4.4s, v15.4s\n"
+ "mov v30.16b, v31.16b\n fmla v30.4s, v2.4s, v9.4s\n"
+ "fmla v31.4s, v0.4s, v9.4s\n"
+ "fmla v28.4s, v2.4s, v16.4s\n"
+ "fmla v29.4s, v0.4s, v16.4s\n"
+ "tbz %x[n_channels], #1, 7f\n"
+ "ldr d11, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 8f\n"
+ "ld1 { v11.s }[2], [x20]\n"
+ "b 8f\n"
+ "7:" // Tile loop: Oddments: Load inputs: (1, 3): Bit 1: Unset
+ "ldr s11, [x20, #0x0]\n"
+ "8:" // Tile loop: Oddments: Load inputs: (1, 3): Bit 1: End
+ "fmla v29.4s, v4.4s, v11.4s\n"
+ "add x20, x16, x9\n"
+ "tbz %x[n_channels], #1, 9f\n"
+ "ldr d12, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 10f\n"
+ "ld1 { v12.s }[2], [x20]\n"
+ "b 10f\n"
+ "9:" // Tile loop: Oddments: Load inputs: (1, 4): Bit 1: Unset
+ "ldr s12, [x20, #0x0]\n"
+ "10:" // Tile loop: Oddments: Load inputs: (1, 4): Bit 1: End
+ "fmla v29.4s, v5.4s, v12.4s\n"
+ "add x20, x16, x13\n"
+ "tbz %x[n_channels], #1, 11f\n"
+ "ldr d13, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 12f\n"
+ "ld1 { v13.s }[2], [x20]\n"
+ "b 12f\n"
+ "11:" // Tile loop: Oddments: Load inputs: (1, 2): Bit 1: Unset
+ "ldr s13, [x20, #0x0]\n"
+ "12:" // Tile loop: Oddments: Load inputs: (1, 2): Bit 1: End
+ "fmla v28.4s, v5.4s, v13.4s\n"
+ "fmla v29.4s, v3.4s, v13.4s\n"
+ "add x20, x12, XZR\n"
+ "tbz %x[n_channels], #1, 13f\n"
+ "ldr d14, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 14f\n"
+ "ld1 { v14.s }[2], [x20]\n"
+ "b 14f\n"
+ "13:" // Tile loop: Oddments: Load inputs: (3, 0): Bit 1: Unset
+ "ldr s14, [x20, #0x0]\n"
+ "14:" // Tile loop: Oddments: Load inputs: (3, 0): Bit 1: End
+ "fmla v30.4s, v3.4s, v14.4s\n"
+ "add x20, x14, XZR\n"
+ "tbz %x[n_channels], #1, 15f\n"
+ "ldr d15, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 16f\n"
+ "ld1 { v15.s }[2], [x20]\n"
+ "b 16f\n"
+ "15:" // Tile loop: Oddments: Load inputs: (2, 0): Bit 1: Unset
+ "ldr s15, [x20, #0x0]\n"
+ "16:" // Tile loop: Oddments: Load inputs: (2, 0): Bit 1: End
+ "fmla v28.4s, v6.4s, v15.4s\n"
+ "fmla v30.4s, v0.4s, v15.4s\n"
+ "add x20, x12, x6\n"
+ "tbz %x[n_channels], #1, 17f\n"
+ "ldr d11, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 18f\n"
+ "ld1 { v11.s }[2], [x20]\n"
+ "b 18f\n"
+ "17:" // Tile loop: Oddments: Load inputs: (3, 1): Bit 1: Unset
+ "ldr s11, [x20, #0x0]\n"
+ "18:" // Tile loop: Oddments: Load inputs: (3, 1): Bit 1: End
+ "fmla v30.4s, v4.4s, v11.4s\n"
+ "add x20, x14, x6\n"
+ "tbz %x[n_channels], #1, 19f\n"
+ "ldr d16, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 20f\n"
+ "ld1 { v16.s }[2], [x20]\n"
+ "b 20f\n"
+ "19:" // Tile loop: Oddments: Load inputs: (2, 1): Bit 1: Unset
+ "ldr s16, [x20, #0x0]\n"
+ "20:" // Tile loop: Oddments: Load inputs: (2, 1): Bit 1: End
+ "fmla v28.4s, v7.4s, v16.4s\n"
+ "fmla v30.4s, v1.4s, v16.4s\n"
+ "add x20, x12, x11\n"
+ "tbz %x[n_channels], #1, 21f\n"
+ "ldr d13, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 22f\n"
+ "ld1 { v13.s }[2], [x20]\n"
+ "b 22f\n"
+ "21:" // Tile loop: Oddments: Load inputs: (3, 3): Bit 1: Unset
+ "ldr s13, [x20, #0x0]\n"
+ "22:" // Tile loop: Oddments: Load inputs: (3, 3): Bit 1: End
+ "fmla v31.4s, v4.4s, v13.4s\n"
+ "add x20, x14, x11\n"
+ "tbz %x[n_channels], #1, 23f\n"
+ "ldr d12, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 24f\n"
+ "ld1 { v12.s }[2], [x20]\n"
+ "b 24f\n"
+ "23:" // Tile loop: Oddments: Load inputs: (2, 3): Bit 1: Unset
+ "ldr s12, [x20, #0x0]\n"
+ "24:" // Tile loop: Oddments: Load inputs: (2, 3): Bit 1: End
+ "fmla v29.4s, v7.4s, v12.4s\n"
+ "fmla v31.4s, v1.4s, v12.4s\n"
+ "add x20, x12, x9\n"
+ "tbz %x[n_channels], #1, 25f\n"
+ "ldr d14, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 26f\n"
+ "ld1 { v14.s }[2], [x20]\n"
+ "b 26f\n"
+ "25:" // Tile loop: Oddments: Load inputs: (3, 4): Bit 1: Unset
+ "ldr s14, [x20, #0x0]\n"
+ "26:" // Tile loop: Oddments: Load inputs: (3, 4): Bit 1: End
+ "fmla v31.4s, v5.4s, v14.4s\n"
+ "add x20, x10, XZR\n"
+ "tbz %x[n_channels], #1, 27f\n"
+ "ldr d15, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 28f\n"
+ "ld1 { v15.s }[2], [x20]\n"
+ "b 28f\n"
+ "27:" // Tile loop: Oddments: Load inputs: (4, 0): Bit 1: Unset
+ "ldr s15, [x20, #0x0]\n"
+ "28:" // Tile loop: Oddments: Load inputs: (4, 0): Bit 1: End
+ "fmla v30.4s, v6.4s, v15.4s\n"
+ "add x20, x14, x9\n"
+ "tbz %x[n_channels], #1, 29f\n"
+ "ldr d11, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 30f\n"
+ "ld1 { v11.s }[2], [x20]\n"
+ "b 30f\n"
+ "29:" // Tile loop: Oddments: Load inputs: (2, 4): Bit 1: Unset
+ "ldr s11, [x20, #0x0]\n"
+ "30:" // Tile loop: Oddments: Load inputs: (2, 4): Bit 1: End
+ "fmla v29.4s, v8.4s, v11.4s\n"
+ "fmla v31.4s, v2.4s, v11.4s\n"
+ "add x20, x10, x6\n"
+ "tbz %x[n_channels], #1, 31f\n"
+ "ldr d13, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 32f\n"
+ "ld1 { v13.s }[2], [x20]\n"
+ "b 32f\n"
+ "31:" // Tile loop: Oddments: Load inputs: (4, 1): Bit 1: Unset
+ "ldr s13, [x20, #0x0]\n"
+ "32:" // Tile loop: Oddments: Load inputs: (4, 1): Bit 1: End
+ "fmla v30.4s, v7.4s, v13.4s\n"
+ "add x20, x12, x13\n"
+ "tbz %x[n_channels], #1, 33f\n"
+ "ldr d16, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 34f\n"
+ "ld1 { v16.s }[2], [x20]\n"
+ "b 34f\n"
+ "33:" // Tile loop: Oddments: Load inputs: (3, 2): Bit 1: Unset
+ "ldr s16, [x20, #0x0]\n"
+ "34:" // Tile loop: Oddments: Load inputs: (3, 2): Bit 1: End
+ "fmla v30.4s, v5.4s, v16.4s\n"
+ "fmla v31.4s, v3.4s, v16.4s\n"
+ "add x20, x10, x11\n"
+ "tbz %x[n_channels], #1, 35f\n"
+ "ldr d14, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 36f\n"
+ "ld1 { v14.s }[2], [x20]\n"
+ "b 36f\n"
+ "35:" // Tile loop: Oddments: Load inputs: (4, 3): Bit 1: Unset
+ "ldr s14, [x20, #0x0]\n"
+ "36:" // Tile loop: Oddments: Load inputs: (4, 3): Bit 1: End
+ "fmla v31.4s, v7.4s, v14.4s\n"
+ "add x20, x10, x13\n"
+ "tbz %x[n_channels], #1, 37f\n"
+ "ldr d15, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 38f\n"
+ "ld1 { v15.s }[2], [x20]\n"
+ "b 38f\n"
+ "37:" // Tile loop: Oddments: Load inputs: (4, 2): Bit 1: Unset
+ "ldr s15, [x20, #0x0]\n"
+ "38:" // Tile loop: Oddments: Load inputs: (4, 2): Bit 1: End
+ "fmla v30.4s, v8.4s, v15.4s\n"
+ "fmla v31.4s, v6.4s, v15.4s\n"
+ "add x20, x10, x9\n"
+ "tbz %x[n_channels], #1, 39f\n"
+ "ldr d11, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 40f\n"
+ "ld1 { v11.s }[2], [x20]\n"
+ "b 40f\n"
+ "39:" // Tile loop: Oddments: Load inputs: (4, 4): Bit 1: Unset
+ "ldr s11, [x20, #0x0]\n"
+ "40:" // Tile loop: Oddments: Load inputs: (4, 4): Bit 1: End
+ "fmla v31.4s, v8.4s, v11.4s\n"
+ "fmax v28.4s, v28.4s, v26.4s\n"
+ "fmax v29.4s, v29.4s, v26.4s\n"
+ "fmax v30.4s, v30.4s, v26.4s\n"
+ "fmax v31.4s, v31.4s, v26.4s\n"
+ "fmin v28.4s, v28.4s, v27.4s\n"
+ "fmin v29.4s, v29.4s, v27.4s\n"
+ "fmin v30.4s, v30.4s, v27.4s\n"
+ "fmin v31.4s, v31.4s, v27.4s\n"
+ "tbz %x[n_channels], #1, 41f\n"
+ "mov x21, x17\n"
+ "mov x20, x28\n"
+ "st1 { v28.d }[0], [x21], x7\n"
+ "st1 { v30.d }[0], [x20], x7\n"
+ "add x17, x17, #0x8\n"
+ "add x28, x28, #0x8\n"
+ "st1 { v29.d }[0], [x21]\n"
+ "st1 { v31.d }[0], [x20]\n"
+ "tbz %x[n_channels], #0, 42f\n"
+ "mov x21, x17\n"
+ "mov x20, x28\n"
+ "st1 { v28.s }[2], [x21], x7\n"
+ "st1 { v30.s }[2], [x20], x7\n"
+ "st1 { v29.s }[2], [x21]\n"
+ "st1 { v31.s }[2], [x20]\n"
+ "b 42f\n"
+ "41:" // Tile loop: Oddments: Store: Bit 1: Unset
+ "mov x21, x17\n"
+ "mov x20, x28\n"
+ "st1 { v28.s }[0], [x21], x7\n"
+ "st1 { v30.s }[0], [x20], x7\n"
+ "st1 { v29.s }[0], [x21]\n"
+ "st1 { v31.s }[0], [x20]\n"
+ "42:" // Tile loop: Oddments: Store: Bit 1: End
+ "43:" // Tile loop: End
+ "ldr x27, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "ldr x23, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "add x27, x27, #0x1\n"
+ "add x21, x23, #0x1\n"
+ "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+ "cmp x27, x20\n"
+ "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
+ "csel x23, x23, x21, LT\n"
+ "csel x27, x27, XZR, LT\n"
+ "cmp x23, x20\n"
+ "blt 1b\n"
+ :
+ : [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp
new file mode 100644
index 0000000000..24fe255dfb
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp
@@ -0,0 +1,629 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl(
+ const float *const *const input_ptrs,
+ float *const *const outptrs,
+ const void *params,
+ unsigned int n_channels,
+ const float activation_min,
+ const float activation_max
+)
+{
+ struct Args
+ {
+ float *const *outptrs;
+ const void *params;
+ const float min, max;
+ const float *inptrs[25];
+
+ Args(
+ const float *const *const input_ptrs,
+ float *const *const outptrs,
+ const void *const params,
+ const float min,
+ const float max
+ ) : outptrs(outptrs), params(params), min(min), max(max)
+ {
+ inptrs[0] = input_ptrs[12];
+ inptrs[1] = input_ptrs[0];
+ inptrs[2] = input_ptrs[1];
+ inptrs[3] = input_ptrs[3];
+ inptrs[4] = input_ptrs[4];
+ inptrs[5] = input_ptrs[5];
+ inptrs[6] = input_ptrs[6];
+ inptrs[7] = input_ptrs[2];
+ inptrs[8] = input_ptrs[8];
+ inptrs[9] = input_ptrs[9];
+ inptrs[10] = input_ptrs[7];
+ inptrs[11] = input_ptrs[15];
+ inptrs[12] = input_ptrs[10];
+ inptrs[13] = input_ptrs[16];
+ inptrs[14] = input_ptrs[11];
+ inptrs[15] = input_ptrs[18];
+ inptrs[16] = input_ptrs[13];
+ inptrs[17] = input_ptrs[19];
+ inptrs[18] = input_ptrs[20];
+ inptrs[19] = input_ptrs[14];
+ inptrs[20] = input_ptrs[21];
+ inptrs[21] = input_ptrs[17];
+ inptrs[22] = input_ptrs[23];
+ inptrs[23] = input_ptrs[22];
+ inptrs[24] = input_ptrs[24];
+
+ }
+ };
+
+ Args params_struct(input_ptrs, outptrs, params,
+ activation_min, activation_max);
+
+ __asm__ __volatile__(
+ "ldr x21, [%x[params_struct], %[offsetof_args_outptrs]]\n"
+ "mov x25, #0x10\n" // cntb _, ALL, #1
+ "lsr x24, %x[n_channels], #0x2\n"
+ "ldr x23, [%x[params_struct], %[offsetof_args_params]]\n"
+ "add x20, %x[params_struct], %[offsetof_args_min]\n"
+ "ld1r { v26.4s }, [x20]\n"
+ "add x20, %x[params_struct], %[offsetof_args_max]\n"
+ "ld1r { v27.4s }, [x20]\n"
+ "add x13, %x[params_struct], %[offsetof_Args_inptrs]\n"
+ "ldp x12, x11, [x21, #0x0]\n"
+ "ldp x10, x9, [x21, #0x10]\n"
+ "mov x28, #0x0\n"
+ "sub x22, XZR, x25\n"
+ "cbz x24, 3f\n"
+ "ldr q31, [x23, #0x0]\n"
+ "ldr q0, [x23, #0x10]\n"
+ "cmp x25, x24, LSL #4\n"
+ "ldr q1, [x23, #0x20]\n"
+ "ldr q2, [x23, #0x30]\n"
+ "ldr q3, [x23, #0x40]\n"
+ "ldr q4, [x23, #0x50]\n"
+ "ldr q5, [x23, #0x60]\n"
+ "ldr q6, [x23, #0x70]\n"
+ "ldr q7, [x23, #0x80]\n"
+ "ldr q8, [x23, #0x90]\n"
+ "add x23, x23, #0xa0\n"
+ "ldp x21, x20, [x13, #0x0]\n"
+ "ldr q9, [x21, x28]\n"
+ "ldr q10, [x20, x28]\n"
+ "ldp x21, x20, [x13, #0x10]\n"
+ "ldr q11, [x21, x28]\n"
+ "ldr q12, [x20, x28]\n"
+ "ldp x21, x20, [x13, #0x20]\n"
+ "ldr q13, [x21, x28]\n"
+ "ldr q14, [x20, x28]\n"
+ "ldp x21, x20, [x13, #0x30]\n"
+ "ldr q15, [x21, x28]\n"
+ "ldr q16, [x20, x28]\n"
+ "bge 2f\n"
+ "1:" // Channel loop
+ "mov v24.16b, v31.16b\n fmla v24.4s, v8.4s, v9.4s\n"
+ "mov v23.16b, v31.16b\n fmla v23.4s, v6.4s, v9.4s\n"
+ "ldr x21, [x13, #0x40]\n"
+ "ldr x20, [x13, #0x48]\n"
+ "fmla v24.4s, v0.4s, v10.4s\n"
+ "fmla v23.4s, v1.4s, v12.4s\n"
+ "ldr q20, [x20, x28]\n"
+ "ldr x20, [x13, #0x50]\n"
+ "fmla v24.4s, v1.4s, v11.4s\n"
+ "ldr q19, [x21, x28]\n"
+ "fmla v23.4s, v2.4s, v13.4s\n"
+ "ldr q18, [x20, x28]\n"
+ "fmla v24.4s, v3.4s, v14.4s\n"
+ "fmla v23.4s, v0.4s, v16.4s\n"
+ "ldr x20, [x13, #0x58]\n"
+ "ldr q17, [x20, x28]\n"
+ "fmla v24.4s, v4.4s, v15.4s\n"
+ "fmla v23.4s, v4.4s, v19.4s\n"
+ "ldr x21, [x13, #0x78]\n"
+ "ldr x20, [x13, #0x60]\n"
+ "ldr q22, [x20, x28]\n"
+ "fmla v24.4s, v2.4s, v16.4s\n"
+ "fmla v23.4s, v5.4s, v20.4s\n"
+ "ldr x20, [x13, #0x80]\n"
+ "ldr q21, [x20, x28]\n"
+ "mov v20.16b, v31.16b\n fmla v20.4s, v2.4s, v9.4s\n"
+ "mov v19.16b, v31.16b\n fmla v19.4s, v0.4s, v9.4s\n"
+ "ldr q31, [x23, #0x0]\n"
+ "fmla v24.4s, v5.4s, v18.4s\n"
+ "fmla v23.4s, v3.4s, v18.4s\n"
+ "ldr q16, [x21, x28]\n"
+ "ldr x20, [x13, #0x68]\n"
+ "ldr q18, [x20, x28]\n"
+ "fmla v20.4s, v3.4s, v17.4s\n"
+ "fmla v19.4s, v4.4s, v16.4s\n"
+ "ldr x20, [x13, #0x88]\n"
+ "ldr q16, [x20, x28]\n"
+ "fmla v20.4s, v0.4s, v22.4s\n"
+ "ldr q0, [x23, #0x10]\n"
+ "fmla v19.4s, v1.4s, v21.4s\n"
+ "ldr x20, [x13, #0x70]\n"
+ "ldr q17, [x20, x28]\n"
+ "fmla v20.4s, v4.4s, v18.4s\n"
+ "fmla v19.4s, v5.4s, v16.4s\n"
+ "ldr q4, [x23, #0x50]\n"
+ "ldr x20, [x13, #0x98]\n"
+ "fmla v24.4s, v6.4s, v22.4s\n"
+ "fmla v20.4s, v1.4s, v17.4s\n"
+ "ldr q16, [x20, x28]\n"
+ "ldr q1, [x23, #0x20]\n"
+ "fmla v19.4s, v2.4s, v16.4s\n"
+ "fmla v24.4s, v7.4s, v17.4s\n"
+ "ldr q2, [x23, #0x30]\n"
+ "ldr x20, [x13, #0x90]\n"
+ "fmla v23.4s, v7.4s, v21.4s\n"
+ "fmla v23.4s, v8.4s, v16.4s\n"
+ "ldr q16, [x20, x28]\n"
+ "ldr x20, [x13, #0xa8]\n"
+ "fmla v20.4s, v6.4s, v16.4s\n"
+ "fmax v24.4s, v24.4s, v26.4s\n"
+ "ldr q17, [x20, x28]\n"
+ "ldr x20, [x13, #0xa0]\n"
+ "fmla v19.4s, v3.4s, v17.4s\n"
+ "fmax v23.4s, v23.4s, v26.4s\n"
+ "ldr q16, [x20, x28]\n"
+ "ldr q3, [x23, #0x40]\n"
+ "fmla v20.4s, v7.4s, v16.4s\n"
+ "fmla v20.4s, v5.4s, v17.4s\n"
+ "ldr q5, [x23, #0x60]\n"
+ "ldr x20, [x13, #0xb0]\n"
+ "add x22, x22, #0x10\n"
+ "fmin v24.4s, v24.4s, v27.4s\n"
+ "ldr q16, [x20, x28]\n"
+ "ldr x20, [x13, #0xb8]\n"
+ "fmla v19.4s, v7.4s, v16.4s\n"
+ "fmin v23.4s, v23.4s, v27.4s\n"
+ "ldr q16, [x20, x28]\n"
+ "ldr q7, [x23, #0x80]\n"
+ "fmla v19.4s, v6.4s, v16.4s\n"
+ "fmla v20.4s, v8.4s, v16.4s\n"
+ "ldr q6, [x23, #0x70]\n"
+ "ldr x20, [x13, #0xc0]\n"
+ "fmax v20.4s, v20.4s, v26.4s\n"
+ "fmin v20.4s, v20.4s, v27.4s\n"
+ "ldr q16, [x20, x28]\n"
+ "fmla v19.4s, v8.4s, v16.4s\n"
+ "ldr q8, [x23, #0x90]\n"
+ "fmax v19.4s, v19.4s, v26.4s\n"
+ "ldp x21, x20, [x13, #0x0]\n"
+ "ldr q9, [x21, x25]\n"
+ "fmin v19.4s, v19.4s, v27.4s\n"
+ "add x28, x28, #0x10\n"
+ "ldr q10, [x20, x25]\n"
+ "ldp x21, x20, [x13, #0x10]\n"
+ "str q24, [x12, x22]\n"
+ "add x23, x23, #0xa0\n"
+ "ldr q11, [x21, x25]\n"
+ "ldr q12, [x20, x25]\n"
+ "str q23, [x11, x22]\n"
+ "ldp x21, x20, [x13, #0x20]\n"
+ "ldr q13, [x21, x25]\n"
+ "str q20, [x10, x22]\n"
+ "ldr q14, [x20, x25]\n"
+ "ldp x21, x20, [x13, #0x30]\n"
+ "str q19, [x9, x22]\n"
+ "ldr q15, [x21, x25]\n"
+ "ldr q16, [x20, x25]\n"
+ "add x25, x25, #0x10\n"
+ "cmp x25, x24, LSL #4\n"
+ "blt 1b\n"
+ "2:" // Channel tail
+ "mov v25.16b, v31.16b\n fmla v25.4s, v8.4s, v9.4s\n"
+ "mov v24.16b, v31.16b\n fmla v24.4s, v6.4s, v9.4s\n"
+ "ldr x21, [x13, #0x40]\n"
+ "ldr x20, [x13, #0x48]\n"
+ "fmla v25.4s, v0.4s, v10.4s\n"
+ "fmla v24.4s, v1.4s, v12.4s\n"
+ "ldr q20, [x20, x28]\n"
+ "ldr x20, [x13, #0x50]\n"
+ "fmla v25.4s, v1.4s, v11.4s\n"
+ "ldr q18, [x21, x28]\n"
+ "fmla v24.4s, v2.4s, v13.4s\n"
+ "ldr q19, [x20, x28]\n"
+ "fmla v25.4s, v3.4s, v14.4s\n"
+ "fmla v24.4s, v0.4s, v16.4s\n"
+ "ldr x20, [x13, #0x58]\n"
+ "ldr q17, [x20, x28]\n"
+ "fmla v25.4s, v4.4s, v15.4s\n"
+ "fmla v24.4s, v4.4s, v18.4s\n"
+ "ldr x21, [x13, #0x78]\n"
+ "ldr x20, [x13, #0x60]\n"
+ "ldr q23, [x20, x28]\n"
+ "fmla v25.4s, v2.4s, v16.4s\n"
+ "fmla v24.4s, v5.4s, v20.4s\n"
+ "ldr x20, [x13, #0x80]\n"
+ "ldr q22, [x20, x28]\n"
+ "mov v21.16b, v31.16b\n fmla v21.4s, v2.4s, v9.4s\n"
+ "mov v20.16b, v31.16b\n fmla v20.4s, v0.4s, v9.4s\n"
+ "ldr x20, [x13, #0x68]\n"
+ "ldr q18, [x20, x28]\n"
+ "fmla v25.4s, v5.4s, v19.4s\n"
+ "fmla v24.4s, v3.4s, v19.4s\n"
+ "ldr q16, [x21, x28]\n"
+ "fmla v21.4s, v3.4s, v17.4s\n"
+ "fmla v20.4s, v4.4s, v16.4s\n"
+ "ldr x20, [x13, #0x88]\n"
+ "ldr q16, [x20, x28]\n"
+ "fmla v21.4s, v0.4s, v23.4s\n"
+ "fmla v20.4s, v1.4s, v22.4s\n"
+ "ldr x20, [x13, #0x70]\n"
+ "ldr q17, [x20, x28]\n"
+ "ldr x20, [x13, #0x98]\n"
+ "fmla v21.4s, v4.4s, v18.4s\n"
+ "ldr q19, [x20, x28]\n"
+ "fmla v20.4s, v5.4s, v16.4s\n"
+ "fmla v25.4s, v6.4s, v23.4s\n"
+ "ldr x20, [x13, #0x90]\n"
+ "ldr q16, [x20, x28]\n"
+ "fmla v21.4s, v1.4s, v17.4s\n"
+ "ldr x20, [x13, #0xa8]\n"
+ "fmla v20.4s, v2.4s, v19.4s\n"
+ "fmla v25.4s, v7.4s, v17.4s\n"
+ "ldr q18, [x20, x28]\n"
+ "ldr x20, [x13, #0xa0]\n"
+ "ldr q17, [x20, x28]\n"
+ "fmla v21.4s, v6.4s, v16.4s\n"
+ "fmla v20.4s, v3.4s, v18.4s\n"
+ "ldr x20, [x13, #0xb0]\n"
+ "ldr q16, [x20, x28]\n"
+ "fmla v21.4s, v7.4s, v17.4s\n"
+ "fmla v20.4s, v7.4s, v16.4s\n"
+ "ldr x20, [x13, #0xb8]\n"
+ "ldr q17, [x20, x28]\n"
+ "fmla v24.4s, v7.4s, v22.4s\n"
+ "fmla v21.4s, v5.4s, v18.4s\n"
+ "ldr x20, [x13, #0xc0]\n"
+ "fmla v20.4s, v6.4s, v17.4s\n"
+ "fmla v24.4s, v8.4s, v19.4s\n"
+ "ldr q16, [x20, x28]\n"
+ "fmla v21.4s, v8.4s, v17.4s\n"
+ "fmla v20.4s, v8.4s, v16.4s\n"
+ "fmax v25.4s, v25.4s, v26.4s\n"
+ "add x22, x22, #0x10\n"
+ "fmax v24.4s, v24.4s, v26.4s\n"
+ "fmax v21.4s, v21.4s, v26.4s\n"
+ "add x28, x28, #0x10\n"
+ "fmax v20.4s, v20.4s, v26.4s\n"
+ "fmin v25.4s, v25.4s, v27.4s\n"
+ "str q25, [x12, x22]\n"
+ "fmin v24.4s, v24.4s, v27.4s\n"
+ "fmin v21.4s, v21.4s, v27.4s\n"
+ "str q24, [x11, x22]\n"
+ "fmin v20.4s, v20.4s, v27.4s\n"
+ "str q21, [x10, x22]\n"
+ "str q20, [x9, x22]\n"
+ "3:" // Oddments
+ "tst %x[n_channels], #0x3\n"
+ "beq 42f\n"
+ "ldr q31, [x23, #0x0]\n"
+ "ldr q0, [x23, #0x10]\n"
+ "mov x20, x28\n"
+ "add x12, x12, x20\n"
+ "ldr q1, [x23, #0x20]\n"
+ "ldr q2, [x23, #0x30]\n"
+ "add x11, x11, x20\n"
+ "add x10, x10, x20\n"
+ "ldr q3, [x23, #0x40]\n"
+ "ldr q4, [x23, #0x50]\n"
+ "add x9, x9, x20\n"
+ "ldr q5, [x23, #0x60]\n"
+ "ldr q6, [x23, #0x70]\n"
+ "ldr q7, [x23, #0x80]\n"
+ "ldr q8, [x23, #0x90]\n"
+ "ldr x27, [x13, #0x0]\n"
+ "ldr x26, [x13, #0x8]\n"
+ "add x27, x27, x28\n"
+ "add x26, x26, x28\n"
+ "ldr x25, [x13, #0x10]\n"
+ "ldr x24, [x13, #0x18]\n"
+ "add x25, x25, x28\n"
+ "add x24, x24, x28\n"
+ "ldr x23, [x13, #0x20]\n"
+ "ldr x22, [x13, #0x28]\n"
+ "add x23, x23, x28\n"
+ "add x22, x22, x28\n"
+ "ldr x21, [x13, #0x30]\n"
+ "ldr x20, [x13, #0x38]\n"
+ "add x21, x21, x28\n"
+ "add x20, x20, x28\n"
+ "tbz %x[n_channels], #1, 4f\n"
+ "ld1 { v9.d }[0], [x27], #0x8\n"
+ "ld1 { v10.d }[0], [x26], #0x8\n"
+ "ld1 { v11.d }[0], [x25], #0x8\n"
+ "ld1 { v12.d }[0], [x24], #0x8\n"
+ "ld1 { v13.d }[0], [x23], #0x8\n"
+ "ld1 { v14.d }[0], [x22], #0x8\n"
+ "ld1 { v15.d }[0], [x21], #0x8\n"
+ "ld1 { v16.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 5f\n"
+ "ld1 { v9.s }[2], [x27], #0x4\n"
+ "ld1 { v10.s }[2], [x26], #0x4\n"
+ "ld1 { v11.s }[2], [x25], #0x4\n"
+ "ld1 { v12.s }[2], [x24], #0x4\n"
+ "ld1 { v13.s }[2], [x23], #0x4\n"
+ "ld1 { v14.s }[2], [x22], #0x4\n"
+ "ld1 { v15.s }[2], [x21], #0x4\n"
+ "ld1 { v16.s }[2], [x20], #0x4\n"
+ "b 5f\n"
+ "4:" // Oddments: Load inputs (2, 2), (0, 0), (0, 1), (0, 3), (0, 4), (1, 0), (1, 1), (0, 2): Bit 1: Unset
+ "ld1 { v9.s }[0], [x27], #0x4\n"
+ "ld1 { v10.s }[0], [x26], #0x4\n"
+ "ld1 { v11.s }[0], [x25], #0x4\n"
+ "ld1 { v12.s }[0], [x24], #0x4\n"
+ "ld1 { v13.s }[0], [x23], #0x4\n"
+ "ld1 { v14.s }[0], [x22], #0x4\n"
+ "ld1 { v15.s }[0], [x21], #0x4\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
+ "5:" // Oddments: Load inputs (2, 2), (0, 0), (0, 1), (0, 3), (0, 4), (1, 0), (1, 1), (0, 2): Bit 1: End
+ "mov v28.16b, v31.16b\n fmla v28.4s, v8.4s, v9.4s\n"
+ "fmla v28.4s, v0.4s, v10.4s\n"
+ "ldr x20, [x13, #0x40]\n"
+ "add x20, x20, x28\n"
+ "mov v29.16b, v31.16b\n fmla v29.4s, v6.4s, v9.4s\n"
+ "fmla v28.4s, v1.4s, v11.4s\n"
+ "fmla v29.4s, v1.4s, v12.4s\n"
+ "fmla v28.4s, v3.4s, v14.4s\n"
+ "fmla v29.4s, v2.4s, v13.4s\n"
+ "fmla v28.4s, v4.4s, v15.4s\n"
+ "mov v30.16b, v31.16b\n fmla v30.4s, v2.4s, v9.4s\n"
+ "fmla v31.4s, v0.4s, v9.4s\n"
+ "fmla v28.4s, v2.4s, v16.4s\n"
+ "fmla v29.4s, v0.4s, v16.4s\n"
+ "tbz %x[n_channels], #1, 6f\n"
+ "ld1 { v11.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 7f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "b 7f\n"
+ "6:" // Oddments: Load input (1, 3): Bit 1: Unset
+ "ld1 { v11.s }[0], [x20], #0x4\n"
+ "7:" // Oddments: Load input (1, 3): Bit 1: End
+ "ldr x20, [x13, #0x48]\n"
+ "fmla v29.4s, v4.4s, v11.4s\n"
+ "add x20, x20, x28\n"
+ "tbz %x[n_channels], #1, 8f\n"
+ "ld1 { v12.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 9f\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
+ "b 9f\n"
+ "8:" // Oddments: Load input (1, 4): Bit 1: Unset
+ "ld1 { v12.s }[0], [x20], #0x4\n"
+ "9:" // Oddments: Load input (1, 4): Bit 1: End
+ "ldr x20, [x13, #0x50]\n"
+ "fmla v29.4s, v5.4s, v12.4s\n"
+ "add x20, x20, x28\n"
+ "tbz %x[n_channels], #1, 10f\n"
+ "ld1 { v13.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 11f\n"
+ "ld1 { v13.s }[2], [x20], #0x4\n"
+ "b 11f\n"
+ "10:" // Oddments: Load input (1, 2): Bit 1: Unset
+ "ld1 { v13.s }[0], [x20], #0x4\n"
+ "11:" // Oddments: Load input (1, 2): Bit 1: End
+ "ldr x20, [x13, #0x58]\n"
+ "fmla v28.4s, v5.4s, v13.4s\n"
+ "fmla v29.4s, v3.4s, v13.4s\n"
+ "add x20, x20, x28\n"
+ "tbz %x[n_channels], #1, 12f\n"
+ "ld1 { v14.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 13f\n"
+ "ld1 { v14.s }[2], [x20], #0x4\n"
+ "b 13f\n"
+ "12:" // Oddments: Load input (3, 0): Bit 1: Unset
+ "ld1 { v14.s }[0], [x20], #0x4\n"
+ "13:" // Oddments: Load input (3, 0): Bit 1: End
+ "ldr x20, [x13, #0x60]\n"
+ "fmla v30.4s, v3.4s, v14.4s\n"
+ "add x20, x20, x28\n"
+ "tbz %x[n_channels], #1, 14f\n"
+ "ld1 { v15.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 15f\n"
+ "ld1 { v15.s }[2], [x20], #0x4\n"
+ "b 15f\n"
+ "14:" // Oddments: Load input (2, 0): Bit 1: Unset
+ "ld1 { v15.s }[0], [x20], #0x4\n"
+ "15:" // Oddments: Load input (2, 0): Bit 1: End
+ "ldr x20, [x13, #0x68]\n"
+ "fmla v28.4s, v6.4s, v15.4s\n"
+ "fmla v30.4s, v0.4s, v15.4s\n"
+ "add x20, x20, x28\n"
+ "tbz %x[n_channels], #1, 16f\n"
+ "ld1 { v11.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 17f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "b 17f\n"
+ "16:" // Oddments: Load input (3, 1): Bit 1: Unset
+ "ld1 { v11.s }[0], [x20], #0x4\n"
+ "17:" // Oddments: Load input (3, 1): Bit 1: End
+ "ldr x20, [x13, #0x70]\n"
+ "fmla v30.4s, v4.4s, v11.4s\n"
+ "add x20, x20, x28\n"
+ "tbz %x[n_channels], #1, 18f\n"
+ "ld1 { v16.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 19f\n"
+ "ld1 { v16.s }[2], [x20], #0x4\n"
+ "b 19f\n"
+ "18:" // Oddments: Load input (2, 1): Bit 1: Unset
+ "ld1 { v16.s }[0], [x20], #0x4\n"
+ "19:" // Oddments: Load input (2, 1): Bit 1: End
+ "ldr x20, [x13, #0x78]\n"
+ "fmla v28.4s, v7.4s, v16.4s\n"
+ "fmla v30.4s, v1.4s, v16.4s\n"
+ "add x20, x20, x28\n"
+ "tbz %x[n_channels], #1, 20f\n"
+ "ld1 { v13.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 21f\n"
+ "ld1 { v13.s }[2], [x20], #0x4\n"
+ "b 21f\n"
+ "20:" // Oddments: Load input (3, 3): Bit 1: Unset
+ "ld1 { v13.s }[0], [x20], #0x4\n"
+ "21:" // Oddments: Load input (3, 3): Bit 1: End
+ "ldr x20, [x13, #0x80]\n"
+ "fmla v31.4s, v4.4s, v13.4s\n"
+ "add x20, x20, x28\n"
+ "tbz %x[n_channels], #1, 22f\n"
+ "ld1 { v12.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 23f\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
+ "b 23f\n"
+ "22:" // Oddments: Load input (2, 3): Bit 1: Unset
+ "ld1 { v12.s }[0], [x20], #0x4\n"
+ "23:" // Oddments: Load input (2, 3): Bit 1: End
+ "ldr x20, [x13, #0x88]\n"
+ "fmla v29.4s, v7.4s, v12.4s\n"
+ "fmla v31.4s, v1.4s, v12.4s\n"
+ "add x20, x20, x28\n"
+ "tbz %x[n_channels], #1, 24f\n"
+ "ld1 { v14.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 25f\n"
+ "ld1 { v14.s }[2], [x20], #0x4\n"
+ "b 25f\n"
+ "24:" // Oddments: Load input (3, 4): Bit 1: Unset
+ "ld1 { v14.s }[0], [x20], #0x4\n"
+ "25:" // Oddments: Load input (3, 4): Bit 1: End
+ "ldr x20, [x13, #0x90]\n"
+ "fmla v31.4s, v5.4s, v14.4s\n"
+ "add x20, x20, x28\n"
+ "tbz %x[n_channels], #1, 26f\n"
+ "ld1 { v15.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 27f\n"
+ "ld1 { v15.s }[2], [x20], #0x4\n"
+ "b 27f\n"
+ "26:" // Oddments: Load input (4, 0): Bit 1: Unset
+ "ld1 { v15.s }[0], [x20], #0x4\n"
+ "27:" // Oddments: Load input (4, 0): Bit 1: End
+ "ldr x20, [x13, #0x98]\n"
+ "fmla v30.4s, v6.4s, v15.4s\n"
+ "add x20, x20, x28\n"
+ "tbz %x[n_channels], #1, 28f\n"
+ "ld1 { v11.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 29f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "b 29f\n"
+ "28:" // Oddments: Load input (2, 4): Bit 1: Unset
+ "ld1 { v11.s }[0], [x20], #0x4\n"
+ "29:" // Oddments: Load input (2, 4): Bit 1: End
+ "ldr x20, [x13, #0xa0]\n"
+ "fmla v29.4s, v8.4s, v11.4s\n"
+ "fmla v31.4s, v2.4s, v11.4s\n"
+ "add x20, x20, x28\n"
+ "tbz %x[n_channels], #1, 30f\n"
+ "ld1 { v13.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 31f\n"
+ "ld1 { v13.s }[2], [x20], #0x4\n"
+ "b 31f\n"
+ "30:" // Oddments: Load input (4, 1): Bit 1: Unset
+ "ld1 { v13.s }[0], [x20], #0x4\n"
+ "31:" // Oddments: Load input (4, 1): Bit 1: End
+ "ldr x20, [x13, #0xa8]\n"
+ "fmla v30.4s, v7.4s, v13.4s\n"
+ "add x20, x20, x28\n"
+ "tbz %x[n_channels], #1, 32f\n"
+ "ld1 { v16.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 33f\n"
+ "ld1 { v16.s }[2], [x20], #0x4\n"
+ "b 33f\n"
+ "32:" // Oddments: Load input (3, 2): Bit 1: Unset
+ "ld1 { v16.s }[0], [x20], #0x4\n"
+ "33:" // Oddments: Load input (3, 2): Bit 1: End
+ "ldr x20, [x13, #0xb0]\n"
+ "fmla v30.4s, v5.4s, v16.4s\n"
+ "fmla v31.4s, v3.4s, v16.4s\n"
+ "add x20, x20, x28\n"
+ "tbz %x[n_channels], #1, 34f\n"
+ "ld1 { v14.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 35f\n"
+ "ld1 { v14.s }[2], [x20], #0x4\n"
+ "b 35f\n"
+ "34:" // Oddments: Load input (4, 3): Bit 1: Unset
+ "ld1 { v14.s }[0], [x20], #0x4\n"
+ "35:" // Oddments: Load input (4, 3): Bit 1: End
+ "ldr x20, [x13, #0xb8]\n"
+ "fmla v31.4s, v7.4s, v14.4s\n"
+ "add x20, x20, x28\n"
+ "tbz %x[n_channels], #1, 36f\n"
+ "ld1 { v15.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 37f\n"
+ "ld1 { v15.s }[2], [x20], #0x4\n"
+ "b 37f\n"
+ "36:" // Oddments: Load input (4, 2): Bit 1: Unset
+ "ld1 { v15.s }[0], [x20], #0x4\n"
+ "37:" // Oddments: Load input (4, 2): Bit 1: End
+ "ldr x20, [x13, #0xc0]\n"
+ "fmla v30.4s, v8.4s, v15.4s\n"
+ "fmla v31.4s, v6.4s, v15.4s\n"
+ "add x20, x20, x28\n"
+ "tbz %x[n_channels], #1, 38f\n"
+ "ld1 { v11.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 39f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "b 39f\n"
+ "38:" // Oddments: Load input (4, 4): Bit 1: Unset
+ "ld1 { v11.s }[0], [x20], #0x4\n"
+ "39:" // Oddments: Load input (4, 4): Bit 1: End
+ "fmla v31.4s, v8.4s, v11.4s\n"
+ "fmax v28.4s, v28.4s, v26.4s\n"
+ "fmax v29.4s, v29.4s, v26.4s\n"
+ "fmax v30.4s, v30.4s, v26.4s\n"
+ "fmax v31.4s, v31.4s, v26.4s\n"
+ "fmin v28.4s, v28.4s, v27.4s\n"
+ "fmin v29.4s, v29.4s, v27.4s\n"
+ "fmin v30.4s, v30.4s, v27.4s\n"
+ "fmin v31.4s, v31.4s, v27.4s\n"
+ "tbz %x[n_channels], #1, 40f\n"
+ "st1 { v28.d }[0], [x12], #0x8\n"
+ "st1 { v29.d }[0], [x11], #0x8\n"
+ "st1 { v30.d }[0], [x10], #0x8\n"
+ "st1 { v31.d }[0], [x9], #0x8\n"
+ "tbz %x[n_channels], #0, 41f\n"
+ "st1 { v28.s }[2], [x12], #0x4\n"
+ "st1 { v29.s }[2], [x11], #0x4\n"
+ "st1 { v30.s }[2], [x10], #0x4\n"
+ "st1 { v31.s }[2], [x9], #0x4\n"
+ "b 41f\n"
+ "40:" // Oddments: Store: Bit 1: Unset
+ "st1 { v28.s }[0], [x12], #0x4\n"
+ "st1 { v29.s }[0], [x11], #0x4\n"
+ "st1 { v30.s }[0], [x10], #0x4\n"
+ "st1 { v31.s }[0], [x9], #0x4\n"
+ "41:" // Oddments: Store: Bit 1: End
+ "42:" // End
+ :
+ : [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
new file mode 100644
index 0000000000..de8a1e4514
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst_indirect_impl(const float *const *const input_ptrs, float *const *const outptrs, const void *params, unsigned int n_channels, const float activation_min, const float activation_max);
+void a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl(const unsigned int n_tile_rows, const unsigned int n_tile_cols, const float *inptr, int64_t ld_input_row, int64_t ld_input_col, float *outptr, int64_t ld_output_row, int64_t ld_output_col, const void *params, unsigned int n_channels, const float activation_min, const float activation_max);
+
+class a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy<float, float, float, float>
+{
+ private:
+ using Parent = DepthwiseDepthfirstStrategy<float, float, float, float>;
+ Parent::IndirectKernelType m_indirect_kernel = a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst_indirect_impl;
+ Parent::DirectKernelType m_direct_kernel = a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl;
+
+ public:
+ using return_type = float;
+ constexpr static auto vl_type = arm_gemm::VLType::None;
+
+ constexpr static unsigned int kernel_rows = 5;
+ constexpr static unsigned int kernel_cols = 5;
+
+ constexpr static unsigned int stride_rows = 1;
+ constexpr static unsigned int stride_cols = 1;
+
+ constexpr static unsigned int output_rows = 2;
+ constexpr static unsigned int output_cols = 2;
+
+ a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst(const CPUInfo *)
+ : Parent(output_rows, output_cols, kernel_rows, kernel_cols, stride_rows, stride_cols) {}
+
+ arm_gemm::VLType get_vl_type(void) const override { return vl_type; }
+
+ Parent::IndirectKernelType get_indirect_kernel() const override { return m_indirect_kernel; }
+ Parent::DirectKernelType get_direct_kernel() const override { return m_direct_kernel; }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_direct.cpp
new file mode 100644
index 0000000000..3426fbc3f9
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_direct.cpp
@@ -0,0 +1,991 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+// Direct (dense-tile) driver for the fp32 NHWC 5x5/stride-1 depthwise kernel.
+// Iterates an n_tile_rows x n_tile_cols grid of 2x2 output tiles; for each
+// tile the inline assembly accumulates the 25 filter taps into four output
+// accumulators per 4-channel group, then clamps the results to
+// [activation_min, activation_max].  `params` points at the packed
+// bias + weights stream that the interleaving code produced.
+void a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl(
+ const unsigned int n_tile_rows,
+ const unsigned int n_tile_cols,
+ const float *inptr,
+ int64_t ld_input_row,
+ int64_t ld_input_col,
+ float *outptr,
+ int64_t ld_output_row,
+ int64_t ld_output_col,
+ const void *params,
+ unsigned int n_channels,
+ const float activation_min,
+ const float activation_max
+)
+{
+ // Aggregates every operand the assembly needs so it can all be addressed
+ // off a single base register via compile-time offsetof() constants; the
+ // asm also spills the current tile indices (tile_i/tile_j) back here
+ // between iterations of the outer tile loop.
+ struct Args
+ {
+ const uint64_t n_tile_rows, n_tile_cols;
+ const float *inptr;
+ const uint64_t ld_input_row;
+ const uint64_t ld_input_col;
+ float *outptr;
+ const uint64_t ld_output_row;
+ const uint64_t ld_output_col;
+ const void *params;
+ const float min, max;
+
+ uint64_t tile_i = 0, tile_j = 0;
+
+ Args(
+ const unsigned int n_tile_rows,
+ const unsigned int n_tile_cols,
+ const float *inptr,
+ int64_t ld_input_row,
+ int64_t ld_input_col,
+ float *outptr,
+ int64_t ld_output_row,
+ int64_t ld_output_col,
+ const void *params,
+ const float activation_min,
+ const float activation_max
+ ) : n_tile_rows(n_tile_rows), n_tile_cols(n_tile_cols), inptr(inptr),
+ ld_input_row(ld_input_row), ld_input_col(ld_input_col), outptr(outptr),
+ ld_output_row(ld_output_row), ld_output_col(ld_output_col),
+ params(params), min(activation_min), max(activation_max)
+ {
+ }
+ };
+
+ Args params_struct(
+ n_tile_rows, n_tile_cols,
+ inptr, ld_input_row, ld_input_col,
+ outptr, ld_output_row, ld_output_col,
+ params, activation_min, activation_max
+ );
+
+ // Layout of the assembly below: label 1 is the outer tile loop (computes
+ // per-tile input/output base pointers), label 2 the unrolled 4-channel
+ // main loop, label 3 the final 4-channel group, label 4 onwards the
+ // oddment (n_channels % 4) handling, and label 61 the tile-index update.
+ __asm__ __volatile__(
+ "mov x27, #0x0\n"
+ "mov x26, #0x0\n"
+ "1:" // Tile loop
+ "str x27, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "mov x23, #0x2\n"
+ "mov x25, #0x2\n"
+ "str x26, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "ldr x24, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
+ "ldr x2, [%x[params_struct], %[offsetof_args_ld_input_col]]\n"
+ "mul x22, x27, x24\n" // offset = tile_i * ld_input_row
+ "ldr x21, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
+ "madd x22, x26, x2, x22\n" // offset += tile_j * ld_input_col
+ "ldr x3, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
+ "lsl x2, x2, #0x2\n"
+ "mul x20, x27, x21\n" // offset = tile_i * ld_output_row
+ "ldr x4, [%x[params_struct], %[offsetof_args_inptr]]\n"
+ "ldr x5, [%x[params_struct], %[offsetof_args_outptr]]\n"
+ "add x6, x2, x2\n"
+ "mul x22, x22, x23\n" // offset *= kernel_stride * output_size
+ "add x4, x4, x22, LSL #2\n" // inptr[0] += offset * sizeof(float)
+ "add x7, x4, x24, LSL #2\n"
+ "ldr x8, [%x[params_struct], %[offsetof_args_params]]\n"
+ "madd x20, x26, x3, x20\n" // offset += tile_j * ld_output_col
+ "add x17, x7, x24, LSL #2\n"
+ "mov x23, #0x10\n" // cntb _, ALL, #1
+ "mul x20, x20, x25\n" // offset *= output_tile_size
+ "lsr x22, %x[n_channels], #0x2\n"
+ "add x16, x17, x24, LSL #2\n"
+ "add x15, x6, x2\n"
+ "add x14, x16, x24, LSL #2\n"
+ "add x13, x15, x2\n"
+ "add x5, x5, x20, LSL #2\n" // outptrs[0] += offset * sizeof(float)
+ "add x20, %x[params_struct], %[offsetof_args_min]\n"
+ "ld1r { v27.4s }, [x20]\n"
+ "add x20, %x[params_struct], %[offsetof_args_max]\n"
+ "ld1r { v15.4s }, [x20]\n"
+ "add x12, x14, x24, LSL #2\n"
+ "add x11, x13, x2\n"
+ "add x10, x5, x21, LSL #2\n"
+ "lsl x3, x3, #0x2\n"
+ "mov x21, #0x0\n"
+ "sub x20, XZR, x23\n"
+ "cbz x22, 4f\n"
+ "ldr q25, [x8, #0x0]\n"
+ "ldr q0, [x8, #0x10]\n"
+ "cmp x23, x22, LSL #4\n"
+ "ldr q1, [x8, #0x20]\n"
+ "ldr q2, [x8, #0x30]\n"
+ "ldr q3, [x8, #0x40]\n"
+ "ldr q4, [x8, #0x50]\n"
+ "add x8, x8, #0x60\n"
+ "ld1 { v5.4s }, [x4]\n"
+ "ldr q6, [x4, x2]\n"
+ "ld1 { v7.4s }, [x7]\n"
+ "ldr q8, [x7, x2]\n"
+ "ldr q9, [x4, x6]\n"
+ "ldr q13, [x7, x6]\n"
+ "ldr q11, [x4, x15]\n"
+ "ldr q12, [x4, x13]\n"
+ "ldr q10, [x7, x11]\n"
+ "ld1 { v14.4s }, [x17]\n"
+ "bge 3f\n"
+ // Main channel loop: v30/v31/v29/v28 accumulate the four output
+ // elements; weight and input loads are interleaved with the FMLAs.
+ "2:" // Tile loop: Channel loop
+ "mov v30.16b, v25.16b\n fmla v30.4s, v0.4s, v5.4s\n"
+ "ldr q23, [x7, x15]\n"
+ "mov v31.16b, v25.16b\n fmla v31.4s, v0.4s, v6.4s\n"
+ "add x23, x23, #0x10\n"
+ "mov v29.16b, v25.16b\n fmla v29.4s, v0.4s, v7.4s\n"
+ "mov v28.16b, v25.16b\n fmla v28.4s, v0.4s, v8.4s\n"
+ "ldr q19, [x8, #0x0]\n"
+ "ldr q25, [x8, #0x140]\n"
+ "fmla v30.4s, v1.4s, v6.4s\n"
+ "ldr q21, [x7, x13]\n"
+ "fmla v31.4s, v1.4s, v9.4s\n"
+ "add x7, x7, #0x10\n"
+ "fmla v29.4s, v1.4s, v8.4s\n"
+ "fmla v28.4s, v1.4s, v13.4s\n"
+ "ldr q1, [x8, #0x10]\n"
+ "cmp x23, x22, LSL #4\n"
+ "fmla v30.4s, v2.4s, v9.4s\n"
+ "ldr q18, [x4, x11]\n"
+ "fmla v31.4s, v2.4s, v11.4s\n"
+ "add x4, x4, #0x10\n"
+ "fmla v29.4s, v2.4s, v13.4s\n"
+ "fmla v28.4s, v2.4s, v23.4s\n"
+ "ldr q17, [x8, #0x20]\n"
+ "add x20, x20, #0x10\n"
+ "fmla v30.4s, v3.4s, v11.4s\n"
+ "ldr q6, [x17, x2]\n"
+ "fmla v31.4s, v3.4s, v12.4s\n"
+ "add x21, x21, #0x10\n"
+ "fmla v29.4s, v3.4s, v23.4s\n"
+ "fmla v28.4s, v3.4s, v21.4s\n"
+ "ldr q16, [x8, #0x30]\n"
+ "fmla v30.4s, v4.4s, v12.4s\n"
+ "ldr q2, [x17, x6]\n"
+ "fmla v31.4s, v4.4s, v18.4s\n"
+ "ldr q0, [x17, x15]\n"
+ "fmla v29.4s, v4.4s, v21.4s\n"
+ "fmla v28.4s, v4.4s, v10.4s\n"
+ "ldr q20, [x8, #0x40]\n"
+ "fmla v30.4s, v19.4s, v7.4s\n"
+ "ld1 { v7.4s }, [x7]\n"
+ "fmla v31.4s, v19.4s, v8.4s\n"
+ "fmla v29.4s, v19.4s, v14.4s\n"
+ "fmla v28.4s, v19.4s, v6.4s\n"
+ "ldr q19, [x8, #0x50]\n"
+ "fmla v30.4s, v1.4s, v8.4s\n"
+ "ldr q26, [x17, x11]\n"
+ "fmla v31.4s, v1.4s, v13.4s\n"
+ "fmla v29.4s, v1.4s, v6.4s\n"
+ "fmla v28.4s, v1.4s, v2.4s\n"
+ "ldr q18, [x8, #0x60]\n"
+ "fmla v30.4s, v17.4s, v13.4s\n"
+ "ldr q1, [x17, x13]\n"
+ "fmla v31.4s, v17.4s, v23.4s\n"
+ "add x17, x17, #0x10\n"
+ "fmla v29.4s, v17.4s, v2.4s\n"
+ "fmla v28.4s, v17.4s, v0.4s\n"
+ "ldr q17, [x8, #0x70]\n"
+ "fmla v30.4s, v16.4s, v23.4s\n"
+ "ld1 { v24.4s }, [x16]\n"
+ "fmla v31.4s, v16.4s, v21.4s\n"
+ "fmla v29.4s, v16.4s, v0.4s\n"
+ "fmla v28.4s, v16.4s, v1.4s\n"
+ "ldr q16, [x8, #0x80]\n"
+ "fmla v30.4s, v20.4s, v21.4s\n"
+ "ldr q23, [x16, x2]\n"
+ "fmla v31.4s, v20.4s, v10.4s\n"
+ "ldr q22, [x16, x6]\n"
+ "fmla v29.4s, v20.4s, v1.4s\n"
+ "fmla v28.4s, v20.4s, v26.4s\n"
+ "ldr q21, [x8, #0x90]\n"
+ "fmla v30.4s, v19.4s, v14.4s\n"
+ "ldr q5, [x16, x11]\n"
+ "fmla v31.4s, v19.4s, v6.4s\n"
+ "fmla v29.4s, v19.4s, v24.4s\n"
+ "fmla v28.4s, v19.4s, v23.4s\n"
+ "ldr q11, [x8, #0xa0]\n"
+ "fmla v30.4s, v18.4s, v6.4s\n"
+ "ldr q20, [x16, x15]\n"
+ "fmla v31.4s, v18.4s, v2.4s\n"
+ "fmla v29.4s, v18.4s, v23.4s\n"
+ "fmla v28.4s, v18.4s, v22.4s\n"
+ "ldr q18, [x8, #0xb0]\n"
+ "fmla v30.4s, v17.4s, v2.4s\n"
+ "ldr q19, [x16, x13]\n"
+ "fmla v31.4s, v17.4s, v0.4s\n"
+ "add x16, x16, #0x10\n"
+ "fmla v29.4s, v17.4s, v22.4s\n"
+ "fmla v28.4s, v17.4s, v20.4s\n"
+ "ldr q17, [x8, #0xc0]\n"
+ "fmla v30.4s, v16.4s, v0.4s\n"
+ "ld1 { v0.4s }, [x14]\n"
+ "fmla v31.4s, v16.4s, v1.4s\n"
+ "fmla v29.4s, v16.4s, v20.4s\n"
+ "fmla v28.4s, v16.4s, v19.4s\n"
+ "ldr q16, [x8, #0xd0]\n"
+ "fmla v30.4s, v21.4s, v1.4s\n"
+ "ldr q4, [x14, x2]\n"
+ "fmla v31.4s, v21.4s, v26.4s\n"
+ "ldr q12, [x14, x13]\n"
+ "fmla v29.4s, v21.4s, v19.4s\n"
+ "fmla v28.4s, v21.4s, v5.4s\n"
+ "ldr q13, [x8, #0xe0]\n"
+ "fmla v30.4s, v11.4s, v24.4s\n"
+ "ldr q6, [x14, x6]\n"
+ "fmla v31.4s, v11.4s, v23.4s\n"
+ "fmla v29.4s, v11.4s, v0.4s\n"
+ "fmla v28.4s, v11.4s, v4.4s\n"
+ "ldr q24, [x8, #0xf0]\n"
+ "fmla v30.4s, v18.4s, v23.4s\n"
+ "ldr q26, [x14, x15]\n"
+ "fmla v31.4s, v18.4s, v22.4s\n"
+ "fmla v29.4s, v18.4s, v4.4s\n"
+ "fmla v28.4s, v18.4s, v6.4s\n"
+ "ldr q23, [x8, #0x100]\n"
+ "fmla v30.4s, v17.4s, v22.4s\n"
+ "ldr q22, [x14, x11]\n"
+ "fmla v31.4s, v17.4s, v20.4s\n"
+ "add x14, x14, #0x10\n"
+ "fmla v29.4s, v17.4s, v6.4s\n"
+ "fmla v28.4s, v17.4s, v26.4s\n"
+ "ldr q21, [x8, #0x110]\n"
+ "fmla v30.4s, v16.4s, v20.4s\n"
+ "ld1 { v18.4s }, [x12]\n"
+ "fmla v31.4s, v16.4s, v19.4s\n"
+ "fmla v29.4s, v16.4s, v26.4s\n"
+ "fmla v28.4s, v16.4s, v12.4s\n"
+ "ldr q20, [x8, #0x120]\n"
+ "fmla v30.4s, v13.4s, v19.4s\n"
+ "ldr q17, [x12, x2]\n"
+ "fmla v31.4s, v13.4s, v5.4s\n"
+ "ld1 { v14.4s }, [x17]\n"
+ "fmla v29.4s, v13.4s, v12.4s\n"
+ "fmla v28.4s, v13.4s, v22.4s\n"
+ "ldr q19, [x8, #0x130]\n"
+ "fmla v30.4s, v24.4s, v0.4s\n"
+ "ldr q16, [x12, x6]\n"
+ "fmla v31.4s, v24.4s, v4.4s\n"
+ "fmla v29.4s, v24.4s, v18.4s\n"
+ "ldr q18, [x12, x15]\n"
+ "fmla v28.4s, v24.4s, v17.4s\n"
+ "ldr q0, [x8, #0x150]\n"
+ "fmla v30.4s, v23.4s, v4.4s\n"
+ "ldr q13, [x7, x6]\n"
+ "fmla v31.4s, v23.4s, v6.4s\n"
+ "fmla v29.4s, v23.4s, v17.4s\n"
+ "ldr q17, [x12, x13]\n"
+ "fmla v28.4s, v23.4s, v16.4s\n"
+ "ldr q1, [x8, #0x160]\n"
+ "fmla v30.4s, v21.4s, v6.4s\n"
+ "ld1 { v5.4s }, [x4]\n"
+ "fmla v31.4s, v21.4s, v26.4s\n"
+ "fmla v29.4s, v21.4s, v16.4s\n"
+ "ldr q16, [x12, x11]\n"
+ "fmla v28.4s, v21.4s, v18.4s\n"
+ "ldr q2, [x8, #0x170]\n"
+ "fmla v30.4s, v20.4s, v26.4s\n"
+ "ldr q6, [x4, x2]\n"
+ "fmla v31.4s, v20.4s, v12.4s\n"
+ "add x12, x12, #0x10\n"
+ "fmla v29.4s, v20.4s, v18.4s\n"
+ "ldr q11, [x4, x15]\n"
+ "fmla v28.4s, v20.4s, v17.4s\n"
+ "ldr q3, [x8, #0x180]\n"
+ "fmla v30.4s, v19.4s, v12.4s\n"
+ "ldr q8, [x7, x2]\n"
+ "fmla v31.4s, v19.4s, v22.4s\n"
+ "ldr q10, [x7, x11]\n"
+ "fmla v29.4s, v19.4s, v17.4s\n"
+ "ldr q12, [x4, x13]\n"
+ "fmla v28.4s, v19.4s, v16.4s\n"
+ "ldr q9, [x4, x6]\n"
+ "ldr q4, [x8, #0x190]\n"
+ "fmax v30.4s, v30.4s, v27.4s\n"
+ "fmax v31.4s, v31.4s, v27.4s\n"
+ "add x8, x8, #0x1a0\n"
+ "fmax v29.4s, v29.4s, v27.4s\n"
+ "fmax v28.4s, v28.4s, v27.4s\n"
+ "fmin v30.4s, v30.4s, v15.4s\n"
+ "fmin v31.4s, v31.4s, v15.4s\n"
+ "st1 { v30.4s }, [x5]\n"
+ "fmin v29.4s, v29.4s, v15.4s\n"
+ "fmin v28.4s, v28.4s, v15.4s\n"
+ "str q31, [x5, x3]\n"
+ "add x5, x5, #0x10\n"
+ "st1 { v29.4s }, [x10]\n"
+ "str q28, [x10, x3]\n"
+ "add x10, x10, #0x10\n"
+ "blt 2b\n"
+ // Channel tail: last full 4-channel group, with no preloading for a
+ // following iteration.
+ "3:" // Tile loop: Channel tail
+ "mov v31.16b, v25.16b\n fmla v31.4s, v0.4s, v5.4s\n"
+ "ldr q22, [x7, x15]\n"
+ "mov v5.16b, v25.16b\n fmla v5.4s, v0.4s, v6.4s\n"
+ "mov v30.16b, v25.16b\n fmla v30.4s, v0.4s, v7.4s\n"
+ "mov v29.16b, v25.16b\n fmla v29.4s, v0.4s, v8.4s\n"
+ "ldr q19, [x8, #0x0]\n"
+ "fmla v31.4s, v1.4s, v6.4s\n"
+ "ldr q21, [x7, x13]\n"
+ "fmla v5.4s, v1.4s, v9.4s\n"
+ "add x7, x7, #0x10\n"
+ "fmla v30.4s, v1.4s, v8.4s\n"
+ "fmla v29.4s, v1.4s, v13.4s\n"
+ "ldr q18, [x8, #0x10]\n"
+ "fmla v31.4s, v2.4s, v9.4s\n"
+ "ldr q16, [x4, x11]\n"
+ "fmla v5.4s, v2.4s, v11.4s\n"
+ "add x4, x4, #0x10\n"
+ "fmla v30.4s, v2.4s, v13.4s\n"
+ "fmla v29.4s, v2.4s, v22.4s\n"
+ "ldr q17, [x8, #0x20]\n"
+ "fmla v31.4s, v3.4s, v11.4s\n"
+ "ldr q6, [x17, x2]\n"
+ "fmla v5.4s, v3.4s, v12.4s\n"
+ "fmla v30.4s, v3.4s, v22.4s\n"
+ "fmla v29.4s, v3.4s, v21.4s\n"
+ "ldr q20, [x8, #0x30]\n"
+ "fmla v31.4s, v4.4s, v12.4s\n"
+ "ldr q2, [x17, x6]\n"
+ "fmla v5.4s, v4.4s, v16.4s\n"
+ "ldr q28, [x17, x15]\n"
+ "fmla v30.4s, v4.4s, v21.4s\n"
+ "fmla v29.4s, v4.4s, v10.4s\n"
+ "ldr q16, [x8, #0x40]\n"
+ "fmla v31.4s, v19.4s, v7.4s\n"
+ "fmla v5.4s, v19.4s, v8.4s\n"
+ "fmla v30.4s, v19.4s, v14.4s\n"
+ "fmla v29.4s, v19.4s, v6.4s\n"
+ "ldr q19, [x8, #0x50]\n"
+ "fmla v31.4s, v18.4s, v8.4s\n"
+ "ldr q1, [x17, x11]\n"
+ "fmla v5.4s, v18.4s, v13.4s\n"
+ "fmla v30.4s, v18.4s, v6.4s\n"
+ "fmla v29.4s, v18.4s, v2.4s\n"
+ "ldr q18, [x8, #0x60]\n"
+ "fmla v31.4s, v17.4s, v13.4s\n"
+ "ldr q26, [x17, x13]\n"
+ "fmla v5.4s, v17.4s, v22.4s\n"
+ "add x17, x17, #0x10\n"
+ "fmla v30.4s, v17.4s, v2.4s\n"
+ "fmla v29.4s, v17.4s, v28.4s\n"
+ "ldr q17, [x8, #0x70]\n"
+ "fmla v31.4s, v20.4s, v22.4s\n"
+ "ld1 { v25.4s }, [x16]\n"
+ "fmla v5.4s, v20.4s, v21.4s\n"
+ "fmla v30.4s, v20.4s, v28.4s\n"
+ "fmla v29.4s, v20.4s, v26.4s\n"
+ "ldr q24, [x8, #0x80]\n"
+ "fmla v31.4s, v16.4s, v21.4s\n"
+ "ldr q23, [x16, x2]\n"
+ "fmla v5.4s, v16.4s, v10.4s\n"
+ "ldr q0, [x16, x6]\n"
+ "fmla v30.4s, v16.4s, v26.4s\n"
+ "fmla v29.4s, v16.4s, v1.4s\n"
+ "ldr q22, [x8, #0x90]\n"
+ "fmla v31.4s, v19.4s, v14.4s\n"
+ "ldr q16, [x16, x11]\n"
+ "fmla v5.4s, v19.4s, v6.4s\n"
+ "fmla v30.4s, v19.4s, v25.4s\n"
+ "fmla v29.4s, v19.4s, v23.4s\n"
+ "ldr q21, [x8, #0xa0]\n"
+ "fmla v31.4s, v18.4s, v6.4s\n"
+ "ldr q20, [x16, x15]\n"
+ "fmla v5.4s, v18.4s, v2.4s\n"
+ "fmla v30.4s, v18.4s, v23.4s\n"
+ "fmla v29.4s, v18.4s, v0.4s\n"
+ "ldr q18, [x8, #0xb0]\n"
+ "fmla v31.4s, v17.4s, v2.4s\n"
+ "ldr q19, [x16, x13]\n"
+ "fmla v5.4s, v17.4s, v28.4s\n"
+ "add x16, x16, #0x10\n"
+ "fmla v30.4s, v17.4s, v0.4s\n"
+ "fmla v29.4s, v17.4s, v20.4s\n"
+ "ldr q17, [x8, #0xc0]\n"
+ "fmla v31.4s, v24.4s, v28.4s\n"
+ "ld1 { v7.4s }, [x14]\n"
+ "fmla v5.4s, v24.4s, v26.4s\n"
+ "fmla v30.4s, v24.4s, v20.4s\n"
+ "fmla v29.4s, v24.4s, v19.4s\n"
+ "ldr q2, [x8, #0xd0]\n"
+ "fmla v31.4s, v22.4s, v26.4s\n"
+ "ldr q28, [x14, x2]\n"
+ "fmla v5.4s, v22.4s, v1.4s\n"
+ "ldr q13, [x14, x13]\n"
+ "fmla v30.4s, v22.4s, v19.4s\n"
+ "fmla v29.4s, v22.4s, v16.4s\n"
+ "ldr q14, [x8, #0xe0]\n"
+ "fmla v31.4s, v21.4s, v25.4s\n"
+ "ldr q26, [x14, x6]\n"
+ "fmla v5.4s, v21.4s, v23.4s\n"
+ "fmla v30.4s, v21.4s, v7.4s\n"
+ "fmla v29.4s, v21.4s, v28.4s\n"
+ "ldr q25, [x8, #0xf0]\n"
+ "fmla v31.4s, v18.4s, v23.4s\n"
+ "ldr q24, [x14, x15]\n"
+ "fmla v5.4s, v18.4s, v0.4s\n"
+ "fmla v30.4s, v18.4s, v28.4s\n"
+ "fmla v29.4s, v18.4s, v26.4s\n"
+ "ldr q23, [x8, #0x100]\n"
+ "fmla v31.4s, v17.4s, v0.4s\n"
+ "ldr q22, [x14, x11]\n"
+ "fmla v5.4s, v17.4s, v20.4s\n"
+ "add x14, x14, #0x10\n"
+ "fmla v30.4s, v17.4s, v26.4s\n"
+ "fmla v29.4s, v17.4s, v24.4s\n"
+ "ldr q21, [x8, #0x110]\n"
+ "fmla v31.4s, v2.4s, v20.4s\n"
+ "ld1 { v18.4s }, [x12]\n"
+ "fmla v5.4s, v2.4s, v19.4s\n"
+ "fmla v30.4s, v2.4s, v24.4s\n"
+ "fmla v29.4s, v2.4s, v13.4s\n"
+ "ldr q20, [x8, #0x120]\n"
+ "fmla v31.4s, v14.4s, v19.4s\n"
+ "ldr q17, [x12, x2]\n"
+ "fmla v5.4s, v14.4s, v16.4s\n"
+ "fmla v30.4s, v14.4s, v13.4s\n"
+ "fmla v29.4s, v14.4s, v22.4s\n"
+ "ldr q19, [x8, #0x130]\n"
+ "add x8, x8, #0x140\n"
+ "fmla v31.4s, v25.4s, v7.4s\n"
+ "ldr q16, [x12, x6]\n"
+ "fmla v5.4s, v25.4s, v28.4s\n"
+ "fmla v30.4s, v25.4s, v18.4s\n"
+ "ldr q18, [x12, x15]\n"
+ "fmla v29.4s, v25.4s, v17.4s\n"
+ "fmla v31.4s, v23.4s, v28.4s\n"
+ "fmla v5.4s, v23.4s, v26.4s\n"
+ "fmla v30.4s, v23.4s, v17.4s\n"
+ "ldr q17, [x12, x13]\n"
+ "fmla v29.4s, v23.4s, v16.4s\n"
+ "fmla v31.4s, v21.4s, v26.4s\n"
+ "fmla v5.4s, v21.4s, v24.4s\n"
+ "fmla v30.4s, v21.4s, v16.4s\n"
+ "ldr q16, [x12, x11]\n"
+ "fmla v29.4s, v21.4s, v18.4s\n"
+ "add x12, x12, #0x10\n"
+ "fmla v31.4s, v20.4s, v24.4s\n"
+ "fmla v5.4s, v20.4s, v13.4s\n"
+ "fmla v30.4s, v20.4s, v18.4s\n"
+ "fmla v29.4s, v20.4s, v17.4s\n"
+ "fmla v31.4s, v19.4s, v13.4s\n"
+ "fmla v5.4s, v19.4s, v22.4s\n"
+ "fmax v31.4s, v31.4s, v27.4s\n"
+ "fmla v30.4s, v19.4s, v17.4s\n"
+ "fmla v29.4s, v19.4s, v16.4s\n"
+ "fmax v5.4s, v5.4s, v27.4s\n"
+ "fmax v30.4s, v30.4s, v27.4s\n"
+ "fmax v29.4s, v29.4s, v27.4s\n"
+ "fmin v31.4s, v31.4s, v15.4s\n"
+ "fmin v5.4s, v5.4s, v15.4s\n"
+ "st1 { v31.4s }, [x5]\n"
+ "fmin v30.4s, v30.4s, v15.4s\n"
+ "fmin v29.4s, v29.4s, v15.4s\n"
+ "str q5, [x5, x3]\n"
+ "add x5, x5, #0x10\n"
+ "st1 { v30.4s }, [x10]\n"
+ "str q29, [x10, x3]\n"
+ "add x10, x10, #0x10\n"
+ // Oddments: handle the trailing n_channels % 4 lanes with per-lane
+ // (tbz-guarded) loads and stores.
+ "4:" // Tile loop: Oddments
+ "tst %x[n_channels], #0x3\n"
+ "beq 61f\n"
+ "ldr q25, [x8, #0x0]\n"
+ "ldr q0, [x8, #0x10]\n"
+ "add x9, x4, XZR\n"
+ "add x28, x4, x2\n"
+ "ldr q1, [x8, #0x20]\n"
+ "ldr q2, [x8, #0x30]\n"
+ "add x27, x7, XZR\n"
+ "add x26, x7, x2\n"
+ "ldr q3, [x8, #0x40]\n"
+ "ldr q4, [x8, #0x50]\n"
+ "add x25, x4, x6\n"
+ "add x24, x7, x6\n"
+ "add x23, x4, x15\n"
+ "add x22, x4, x13\n"
+ "add x21, x7, x11\n"
+ "add x20, x17, XZR\n"
+ "add x8, x8, #0x60\n"
+ "tbz %x[n_channels], #1, 5f\n"
+ "ldr d5, [x9], #0x8\n"
+ "ldr d6, [x28], #0x8\n"
+ "ldr d7, [x27], #0x8\n"
+ "ldr d8, [x26], #0x8\n"
+ "ldr d9, [x25], #0x8\n"
+ "ldr d13, [x24], #0x8\n"
+ "ldr d11, [x23], #0x8\n"
+ "ldr d12, [x22], #0x8\n"
+ "ldr d10, [x21], #0x8\n"
+ "ldr d14, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 6f\n"
+ "ld1 { v5.s }[2], [x9]\n"
+ "ld1 { v6.s }[2], [x28]\n"
+ "ld1 { v7.s }[2], [x27]\n"
+ "ld1 { v8.s }[2], [x26]\n"
+ "ld1 { v9.s }[2], [x25]\n"
+ "ld1 { v13.s }[2], [x24]\n"
+ "ld1 { v11.s }[2], [x23]\n"
+ "ld1 { v12.s }[2], [x22]\n"
+ "ld1 { v10.s }[2], [x21]\n"
+ "ld1 { v14.s }[2], [x20]\n"
+ "b 6f\n"
+ "5:" // Tile loop: Oddments: Load inputs: (0, 0), (0, 1), (1, 0), (1, 1), (0, 2), (1, 2), (0, 3), (0, 4), (1, 5), (2, 0): Bit 1: Unset
+ "ldr s5, [x9, #0x0]\n"
+ "ldr s6, [x28, #0x0]\n"
+ "ldr s7, [x27, #0x0]\n"
+ "ldr s8, [x26, #0x0]\n"
+ "ldr s9, [x25, #0x0]\n"
+ "ldr s13, [x24, #0x0]\n"
+ "ldr s11, [x23, #0x0]\n"
+ "ldr s12, [x22, #0x0]\n"
+ "ldr s10, [x21, #0x0]\n"
+ "ldr s14, [x20, #0x0]\n"
+ "6:" // Tile loop: Oddments: Load inputs: (0, 0), (0, 1), (1, 0), (1, 1), (0, 2), (1, 2), (0, 3), (0, 4), (1, 5), (2, 0): Bit 1: End
+ "mov v28.16b, v25.16b\n fmla v28.4s, v0.4s, v5.4s\n"
+ "mov v29.16b, v25.16b\n fmla v29.4s, v0.4s, v6.4s\n"
+ "add x20, x7, x15\n"
+ "mov v30.16b, v25.16b\n fmla v30.4s, v0.4s, v7.4s\n"
+ "mov v31.16b, v25.16b\n fmla v31.4s, v0.4s, v8.4s\n"
+ "fmla v28.4s, v1.4s, v6.4s\n"
+ "fmla v29.4s, v1.4s, v9.4s\n"
+ "fmla v30.4s, v1.4s, v8.4s\n"
+ "fmla v31.4s, v1.4s, v13.4s\n"
+ "fmla v28.4s, v2.4s, v9.4s\n"
+ "fmla v29.4s, v2.4s, v11.4s\n"
+ "fmla v30.4s, v2.4s, v13.4s\n"
+ "tbz %x[n_channels], #1, 7f\n"
+ "ldr d5, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 8f\n"
+ "ld1 { v5.s }[2], [x20]\n"
+ "b 8f\n"
+ "7:" // Tile loop: Oddments: Load inputs: (1, 3): Bit 1: Unset
+ "ldr s5, [x20, #0x0]\n"
+ "8:" // Tile loop: Oddments: Load inputs: (1, 3): Bit 1: End
+ "fmla v31.4s, v2.4s, v5.4s\n"
+ "fmla v28.4s, v3.4s, v11.4s\n"
+ "add x20, x7, x13\n"
+ "fmla v29.4s, v3.4s, v12.4s\n"
+ "fmla v30.4s, v3.4s, v5.4s\n"
+ "tbz %x[n_channels], #1, 9f\n"
+ "ldr d6, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 10f\n"
+ "ld1 { v6.s }[2], [x20]\n"
+ "b 10f\n"
+ "9:" // Tile loop: Oddments: Load inputs: (1, 4): Bit 1: Unset
+ "ldr s6, [x20, #0x0]\n"
+ "10:" // Tile loop: Oddments: Load inputs: (1, 4): Bit 1: End
+ "fmla v31.4s, v3.4s, v6.4s\n"
+ "fmla v28.4s, v4.4s, v12.4s\n"
+ "add x20, x4, x11\n"
+ "tbz %x[n_channels], #1, 11f\n"
+ "ldr d9, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 12f\n"
+ "ld1 { v9.s }[2], [x20]\n"
+ "b 12f\n"
+ "11:" // Tile loop: Oddments: Load inputs: (0, 5): Bit 1: Unset
+ "ldr s9, [x20, #0x0]\n"
+ "12:" // Tile loop: Oddments: Load inputs: (0, 5): Bit 1: End
+ "ldr q0, [x8, #0x0]\n"
+ "fmla v29.4s, v4.4s, v9.4s\n"
+ "fmla v30.4s, v4.4s, v6.4s\n"
+ "add x20, x17, x2\n"
+ "fmla v31.4s, v4.4s, v10.4s\n"
+ "fmla v28.4s, v0.4s, v7.4s\n"
+ "add x8, x8, #0x10\n"
+ "fmla v29.4s, v0.4s, v8.4s\n"
+ "fmla v30.4s, v0.4s, v14.4s\n"
+ "tbz %x[n_channels], #1, 13f\n"
+ "ldr d11, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 14f\n"
+ "ld1 { v11.s }[2], [x20]\n"
+ "b 14f\n"
+ "13:" // Tile loop: Oddments: Load inputs: (2, 1): Bit 1: Unset
+ "ldr s11, [x20, #0x0]\n"
+ "14:" // Tile loop: Oddments: Load inputs: (2, 1): Bit 1: End
+ "ldr q1, [x8, #0x0]\n"
+ "fmla v31.4s, v0.4s, v11.4s\n"
+ "fmla v28.4s, v1.4s, v8.4s\n"
+ "add x20, x17, x6\n"
+ "fmla v29.4s, v1.4s, v13.4s\n"
+ "fmla v30.4s, v1.4s, v11.4s\n"
+ "add x8, x8, #0x10\n"
+ "tbz %x[n_channels], #1, 15f\n"
+ "ldr d12, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 16f\n"
+ "ld1 { v12.s }[2], [x20]\n"
+ "b 16f\n"
+ "15:" // Tile loop: Oddments: Load inputs: (2, 2): Bit 1: Unset
+ "ldr s12, [x20, #0x0]\n"
+ "16:" // Tile loop: Oddments: Load inputs: (2, 2): Bit 1: End
+ "ldr q2, [x8, #0x0]\n"
+ "fmla v31.4s, v1.4s, v12.4s\n"
+ "fmla v28.4s, v2.4s, v13.4s\n"
+ "add x20, x17, x15\n"
+ "fmla v29.4s, v2.4s, v5.4s\n"
+ "fmla v30.4s, v2.4s, v12.4s\n"
+ "add x8, x8, #0x10\n"
+ "tbz %x[n_channels], #1, 17f\n"
+ "ldr d9, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 18f\n"
+ "ld1 { v9.s }[2], [x20]\n"
+ "b 18f\n"
+ "17:" // Tile loop: Oddments: Load inputs: (2, 3): Bit 1: Unset
+ "ldr s9, [x20, #0x0]\n"
+ "18:" // Tile loop: Oddments: Load inputs: (2, 3): Bit 1: End
+ "ldr q3, [x8, #0x0]\n"
+ "fmla v31.4s, v2.4s, v9.4s\n"
+ "fmla v28.4s, v3.4s, v5.4s\n"
+ "add x20, x17, x13\n"
+ "fmla v29.4s, v3.4s, v6.4s\n"
+ "fmla v30.4s, v3.4s, v9.4s\n"
+ "add x8, x8, #0x10\n"
+ "tbz %x[n_channels], #1, 19f\n"
+ "ldr d13, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 20f\n"
+ "ld1 { v13.s }[2], [x20]\n"
+ "b 20f\n"
+ "19:" // Tile loop: Oddments: Load inputs: (2, 4): Bit 1: Unset
+ "ldr s13, [x20, #0x0]\n"
+ "20:" // Tile loop: Oddments: Load inputs: (2, 4): Bit 1: End
+ "ldr q4, [x8, #0x0]\n"
+ "fmla v31.4s, v3.4s, v13.4s\n"
+ "fmla v28.4s, v4.4s, v6.4s\n"
+ "add x20, x17, x11\n"
+ "fmla v29.4s, v4.4s, v10.4s\n"
+ "fmla v30.4s, v4.4s, v13.4s\n"
+ "add x8, x8, #0x10\n"
+ "tbz %x[n_channels], #1, 21f\n"
+ "ldr d8, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 22f\n"
+ "ld1 { v8.s }[2], [x20]\n"
+ "b 22f\n"
+ "21:" // Tile loop: Oddments: Load inputs: (2, 5): Bit 1: Unset
+ "ldr s8, [x20, #0x0]\n"
+ "22:" // Tile loop: Oddments: Load inputs: (2, 5): Bit 1: End
+ "ldr q0, [x8, #0x0]\n"
+ "fmla v31.4s, v4.4s, v8.4s\n"
+ "fmla v28.4s, v0.4s, v14.4s\n"
+ "add x20, x16, XZR\n"
+ "fmla v29.4s, v0.4s, v11.4s\n"
+ "add x8, x8, #0x10\n"
+ "tbz %x[n_channels], #1, 23f\n"
+ "ldr d5, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 24f\n"
+ "ld1 { v5.s }[2], [x20]\n"
+ "b 24f\n"
+ "23:" // Tile loop: Oddments: Load inputs: (3, 0): Bit 1: Unset
+ "ldr s5, [x20, #0x0]\n"
+ "24:" // Tile loop: Oddments: Load inputs: (3, 0): Bit 1: End
+ "fmla v30.4s, v0.4s, v5.4s\n"
+ "add x20, x16, x2\n"
+ "tbz %x[n_channels], #1, 25f\n"
+ "ldr d6, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 26f\n"
+ "ld1 { v6.s }[2], [x20]\n"
+ "b 26f\n"
+ "25:" // Tile loop: Oddments: Load inputs: (3, 1): Bit 1: Unset
+ "ldr s6, [x20, #0x0]\n"
+ "26:" // Tile loop: Oddments: Load inputs: (3, 1): Bit 1: End
+ "ldr q1, [x8, #0x0]\n"
+ "fmla v31.4s, v0.4s, v6.4s\n"
+ "fmla v28.4s, v1.4s, v11.4s\n"
+ "add x20, x16, x6\n"
+ "fmla v29.4s, v1.4s, v12.4s\n"
+ "fmla v30.4s, v1.4s, v6.4s\n"
+ "add x8, x8, #0x10\n"
+ "tbz %x[n_channels], #1, 27f\n"
+ "ldr d10, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 28f\n"
+ "ld1 { v10.s }[2], [x20]\n"
+ "b 28f\n"
+ "27:" // Tile loop: Oddments: Load inputs: (3, 2): Bit 1: Unset
+ "ldr s10, [x20, #0x0]\n"
+ "28:" // Tile loop: Oddments: Load inputs: (3, 2): Bit 1: End
+ "ldr q2, [x8, #0x0]\n"
+ "fmla v31.4s, v1.4s, v10.4s\n"
+ "fmla v28.4s, v2.4s, v12.4s\n"
+ "add x20, x16, x15\n"
+ "fmla v29.4s, v2.4s, v9.4s\n"
+ "fmla v30.4s, v2.4s, v10.4s\n"
+ "add x8, x8, #0x10\n"
+ "tbz %x[n_channels], #1, 29f\n"
+ "ldr d11, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 30f\n"
+ "ld1 { v11.s }[2], [x20]\n"
+ "b 30f\n"
+ "29:" // Tile loop: Oddments: Load inputs: (3, 3): Bit 1: Unset
+ "ldr s11, [x20, #0x0]\n"
+ "30:" // Tile loop: Oddments: Load inputs: (3, 3): Bit 1: End
+ "ldr q3, [x8, #0x0]\n"
+ "fmla v31.4s, v2.4s, v11.4s\n"
+ "fmla v28.4s, v3.4s, v9.4s\n"
+ "add x20, x16, x13\n"
+ "fmla v29.4s, v3.4s, v13.4s\n"
+ "fmla v30.4s, v3.4s, v11.4s\n"
+ "add x8, x8, #0x10\n"
+ "tbz %x[n_channels], #1, 31f\n"
+ "ldr d12, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 32f\n"
+ "ld1 { v12.s }[2], [x20]\n"
+ "b 32f\n"
+ "31:" // Tile loop: Oddments: Load inputs: (3, 4): Bit 1: Unset
+ "ldr s12, [x20, #0x0]\n"
+ "32:" // Tile loop: Oddments: Load inputs: (3, 4): Bit 1: End
+ "ldr q4, [x8, #0x0]\n"
+ "fmla v31.4s, v3.4s, v12.4s\n"
+ "fmla v28.4s, v4.4s, v13.4s\n"
+ "add x20, x16, x11\n"
+ "fmla v29.4s, v4.4s, v8.4s\n"
+ "fmla v30.4s, v4.4s, v12.4s\n"
+ "add x8, x8, #0x10\n"
+ "tbz %x[n_channels], #1, 33f\n"
+ "ldr d14, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 34f\n"
+ "ld1 { v14.s }[2], [x20]\n"
+ "b 34f\n"
+ "33:" // Tile loop: Oddments: Load inputs: (3, 5): Bit 1: Unset
+ "ldr s14, [x20, #0x0]\n"
+ "34:" // Tile loop: Oddments: Load inputs: (3, 5): Bit 1: End
+ "ldr q0, [x8, #0x0]\n"
+ "fmla v31.4s, v4.4s, v14.4s\n"
+ "fmla v28.4s, v0.4s, v5.4s\n"
+ "add x20, x14, XZR\n"
+ "fmla v29.4s, v0.4s, v6.4s\n"
+ "add x8, x8, #0x10\n"
+ "tbz %x[n_channels], #1, 35f\n"
+ "ldr d9, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 36f\n"
+ "ld1 { v9.s }[2], [x20]\n"
+ "b 36f\n"
+ "35:" // Tile loop: Oddments: Load inputs: (4, 0): Bit 1: Unset
+ "ldr s9, [x20, #0x0]\n"
+ "36:" // Tile loop: Oddments: Load inputs: (4, 0): Bit 1: End
+ "fmla v30.4s, v0.4s, v9.4s\n"
+ "add x20, x14, x2\n"
+ "tbz %x[n_channels], #1, 37f\n"
+ "ldr d13, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 38f\n"
+ "ld1 { v13.s }[2], [x20]\n"
+ "b 38f\n"
+ "37:" // Tile loop: Oddments: Load inputs: (4, 1): Bit 1: Unset
+ "ldr s13, [x20, #0x0]\n"
+ "38:" // Tile loop: Oddments: Load inputs: (4, 1): Bit 1: End
+ "ldr q1, [x8, #0x0]\n"
+ "fmla v31.4s, v0.4s, v13.4s\n"
+ "fmla v28.4s, v1.4s, v6.4s\n"
+ "add x20, x14, x6\n"
+ "fmla v29.4s, v1.4s, v10.4s\n"
+ "fmla v30.4s, v1.4s, v13.4s\n"
+ "add x8, x8, #0x10\n"
+ "tbz %x[n_channels], #1, 39f\n"
+ "ldr d5, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 40f\n"
+ "ld1 { v5.s }[2], [x20]\n"
+ "b 40f\n"
+ "39:" // Tile loop: Oddments: Load inputs: (4, 2): Bit 1: Unset
+ "ldr s5, [x20, #0x0]\n"
+ "40:" // Tile loop: Oddments: Load inputs: (4, 2): Bit 1: End
+ "ldr q2, [x8, #0x0]\n"
+ "fmla v31.4s, v1.4s, v5.4s\n"
+ "fmla v28.4s, v2.4s, v10.4s\n"
+ "add x20, x14, x15\n"
+ "fmla v29.4s, v2.4s, v11.4s\n"
+ "fmla v30.4s, v2.4s, v5.4s\n"
+ "add x8, x8, #0x10\n"
+ "tbz %x[n_channels], #1, 41f\n"
+ "ldr d6, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 42f\n"
+ "ld1 { v6.s }[2], [x20]\n"
+ "b 42f\n"
+ "41:" // Tile loop: Oddments: Load inputs: (4, 3): Bit 1: Unset
+ "ldr s6, [x20, #0x0]\n"
+ "42:" // Tile loop: Oddments: Load inputs: (4, 3): Bit 1: End
+ "ldr q3, [x8, #0x0]\n"
+ "fmla v31.4s, v2.4s, v6.4s\n"
+ "fmla v28.4s, v3.4s, v11.4s\n"
+ "add x20, x14, x13\n"
+ "fmla v29.4s, v3.4s, v12.4s\n"
+ "fmla v30.4s, v3.4s, v6.4s\n"
+ "add x8, x8, #0x10\n"
+ "tbz %x[n_channels], #1, 43f\n"
+ "ldr d8, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 44f\n"
+ "ld1 { v8.s }[2], [x20]\n"
+ "b 44f\n"
+ "43:" // Tile loop: Oddments: Load inputs: (4, 4): Bit 1: Unset
+ "ldr s8, [x20, #0x0]\n"
+ "44:" // Tile loop: Oddments: Load inputs: (4, 4): Bit 1: End
+ "ldr q4, [x8, #0x0]\n"
+ "fmla v31.4s, v3.4s, v8.4s\n"
+ "fmla v28.4s, v4.4s, v12.4s\n"
+ "add x20, x14, x11\n"
+ "fmla v29.4s, v4.4s, v14.4s\n"
+ "fmla v30.4s, v4.4s, v8.4s\n"
+ "add x8, x8, #0x10\n"
+ "tbz %x[n_channels], #1, 45f\n"
+ "ldr d10, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 46f\n"
+ "ld1 { v10.s }[2], [x20]\n"
+ "b 46f\n"
+ "45:" // Tile loop: Oddments: Load inputs: (4, 5): Bit 1: Unset
+ "ldr s10, [x20, #0x0]\n"
+ "46:" // Tile loop: Oddments: Load inputs: (4, 5): Bit 1: End
+ "ldr q0, [x8, #0x0]\n"
+ "fmla v31.4s, v4.4s, v10.4s\n"
+ "fmla v28.4s, v0.4s, v9.4s\n"
+ "add x20, x12, XZR\n"
+ "fmla v29.4s, v0.4s, v13.4s\n"
+ "add x8, x8, #0x10\n"
+ "tbz %x[n_channels], #1, 47f\n"
+ "ldr d11, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 48f\n"
+ "ld1 { v11.s }[2], [x20]\n"
+ "b 48f\n"
+ "47:" // Tile loop: Oddments: Load inputs: (5, 0): Bit 1: Unset
+ "ldr s11, [x20, #0x0]\n"
+ "48:" // Tile loop: Oddments: Load inputs: (5, 0): Bit 1: End
+ "fmla v30.4s, v0.4s, v11.4s\n"
+ "add x20, x12, x2\n"
+ "tbz %x[n_channels], #1, 49f\n"
+ "ldr d12, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 50f\n"
+ "ld1 { v12.s }[2], [x20]\n"
+ "b 50f\n"
+ "49:" // Tile loop: Oddments: Load inputs: (5, 1): Bit 1: Unset
+ "ldr s12, [x20, #0x0]\n"
+ "50:" // Tile loop: Oddments: Load inputs: (5, 1): Bit 1: End
+ "ldr q1, [x8, #0x0]\n"
+ "fmla v31.4s, v0.4s, v12.4s\n"
+ "fmla v28.4s, v1.4s, v13.4s\n"
+ "add x20, x12, x6\n"
+ "fmla v29.4s, v1.4s, v5.4s\n"
+ "fmla v30.4s, v1.4s, v12.4s\n"
+ "add x8, x8, #0x10\n"
+ "tbz %x[n_channels], #1, 51f\n"
+ "ldr d9, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 52f\n"
+ "ld1 { v9.s }[2], [x20]\n"
+ "b 52f\n"
+ "51:" // Tile loop: Oddments: Load inputs: (5, 2): Bit 1: Unset
+ "ldr s9, [x20, #0x0]\n"
+ "52:" // Tile loop: Oddments: Load inputs: (5, 2): Bit 1: End
+ "ldr q2, [x8, #0x0]\n"
+ "fmla v31.4s, v1.4s, v9.4s\n"
+ "fmla v28.4s, v2.4s, v5.4s\n"
+ "add x20, x12, x15\n"
+ "fmla v29.4s, v2.4s, v6.4s\n"
+ "fmla v30.4s, v2.4s, v9.4s\n"
+ "add x8, x8, #0x10\n"
+ "tbz %x[n_channels], #1, 53f\n"
+ "ldr d11, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 54f\n"
+ "ld1 { v11.s }[2], [x20]\n"
+ "b 54f\n"
+ "53:" // Tile loop: Oddments: Load inputs: (5, 3): Bit 1: Unset
+ "ldr s11, [x20, #0x0]\n"
+ "54:" // Tile loop: Oddments: Load inputs: (5, 3): Bit 1: End
+ "ldr q3, [x8, #0x0]\n"
+ "fmla v31.4s, v2.4s, v11.4s\n"
+ "fmla v28.4s, v3.4s, v6.4s\n"
+ "add x20, x12, x13\n"
+ "fmla v29.4s, v3.4s, v8.4s\n"
+ "fmla v30.4s, v3.4s, v11.4s\n"
+ "add x8, x8, #0x10\n"
+ "tbz %x[n_channels], #1, 55f\n"
+ "ldr d12, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 56f\n"
+ "ld1 { v12.s }[2], [x20]\n"
+ "b 56f\n"
+ "55:" // Tile loop: Oddments: Load inputs: (5, 4): Bit 1: Unset
+ "ldr s12, [x20, #0x0]\n"
+ "56:" // Tile loop: Oddments: Load inputs: (5, 4): Bit 1: End
+ "ldr q4, [x8, #0x0]\n"
+ "fmla v31.4s, v3.4s, v12.4s\n"
+ "fmla v28.4s, v4.4s, v8.4s\n"
+ "add x20, x12, x11\n"
+ "fmla v29.4s, v4.4s, v10.4s\n"
+ "fmla v30.4s, v4.4s, v12.4s\n"
+ "tbz %x[n_channels], #1, 57f\n"
+ "ldr d9, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 58f\n"
+ "ld1 { v9.s }[2], [x20]\n"
+ "b 58f\n"
+ "57:" // Tile loop: Oddments: Load inputs: (5, 5): Bit 1: Unset
+ "ldr s9, [x20, #0x0]\n"
+ "58:" // Tile loop: Oddments: Load inputs: (5, 5): Bit 1: End
+ "fmla v31.4s, v4.4s, v9.4s\n"
+ "fmax v28.4s, v28.4s, v27.4s\n"
+ "fmax v29.4s, v29.4s, v27.4s\n"
+ "fmax v30.4s, v30.4s, v27.4s\n"
+ "fmax v31.4s, v31.4s, v27.4s\n"
+ "fmin v28.4s, v28.4s, v15.4s\n"
+ "fmin v29.4s, v29.4s, v15.4s\n"
+ "fmin v30.4s, v30.4s, v15.4s\n"
+ "fmin v31.4s, v31.4s, v15.4s\n"
+ "tbz %x[n_channels], #1, 59f\n"
+ "mov x21, x5\n"
+ "mov x20, x10\n"
+ "st1 { v28.d }[0], [x21], x3\n"
+ "st1 { v30.d }[0], [x20], x3\n"
+ "add x5, x5, #0x8\n"
+ "add x10, x10, #0x8\n"
+ "st1 { v29.d }[0], [x21]\n"
+ "st1 { v31.d }[0], [x20]\n"
+ "tbz %x[n_channels], #0, 60f\n"
+ "mov x21, x5\n"
+ "mov x20, x10\n"
+ "st1 { v28.s }[2], [x21], x3\n"
+ "st1 { v30.s }[2], [x20], x3\n"
+ "st1 { v29.s }[2], [x21]\n"
+ "st1 { v31.s }[2], [x20]\n"
+ "b 60f\n"
+ "59:" // Tile loop: Oddments: Store: Bit 1: Unset
+ "mov x21, x5\n"
+ "mov x20, x10\n"
+ "st1 { v28.s }[0], [x21], x3\n"
+ "st1 { v30.s }[0], [x20], x3\n"
+ "st1 { v29.s }[0], [x21]\n"
+ "st1 { v31.s }[0], [x20]\n"
+ "60:" // Tile loop: Oddments: Store: Bit 1: End
+ // Advance to the next tile (row-major over the tile grid); loop back to
+ // label 1 until every tile row has been emitted.
+ "61:" // Tile loop: End
+ "ldr x26, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "ldr x27, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "add x26, x26, #0x1\n"
+ "add x21, x27, #0x1\n"
+ "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+ "cmp x26, x20\n"
+ "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
+ "csel x27, x27, x21, LT\n"
+ "csel x26, x26, XZR, LT\n"
+ "cmp x27, x20\n"
+ "blt 1b\n"
+ // No outputs; all state is reached through params_struct + offsetof
+ // immediates, hence the "memory" clobber.
+ :
+ : [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_indirect.cpp
new file mode 100644
index 0000000000..32939eb6dc
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_indirect.cpp
@@ -0,0 +1,1043 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst_indirect_impl(
+ const float *const *const input_ptrs,
+ float *const *const outptrs,
+ const void *params,
+ unsigned int n_channels,
+ const float activation_min,
+ const float activation_max
+)
+{
+ struct Args
+ {
+ float *const *outptrs;
+ const void *params;
+ const float min, max;
+ const float *inptrs[36];
+
+ Args(
+ const float *const *const input_ptrs,
+ float *const *const outptrs,
+ const void *const params,
+ const float min,
+ const float max
+ ) : outptrs(outptrs), params(params), min(min), max(max)
+ {
+ inptrs[0] = input_ptrs[0];
+ inptrs[1] = input_ptrs[1];
+ inptrs[2] = input_ptrs[6];
+ inptrs[3] = input_ptrs[7];
+ inptrs[4] = input_ptrs[2];
+ inptrs[5] = input_ptrs[8];
+ inptrs[6] = input_ptrs[3];
+ inptrs[7] = input_ptrs[4];
+ inptrs[8] = input_ptrs[11];
+ inptrs[9] = input_ptrs[12];
+ inptrs[10] = input_ptrs[9];
+ inptrs[11] = input_ptrs[10];
+ inptrs[12] = input_ptrs[5];
+ inptrs[13] = input_ptrs[13];
+ inptrs[14] = input_ptrs[14];
+ inptrs[15] = input_ptrs[15];
+ inptrs[16] = input_ptrs[16];
+ inptrs[17] = input_ptrs[17];
+ inptrs[18] = input_ptrs[18];
+ inptrs[19] = input_ptrs[19];
+ inptrs[20] = input_ptrs[20];
+ inptrs[21] = input_ptrs[21];
+ inptrs[22] = input_ptrs[22];
+ inptrs[23] = input_ptrs[23];
+ inptrs[24] = input_ptrs[24];
+ inptrs[25] = input_ptrs[25];
+ inptrs[26] = input_ptrs[26];
+ inptrs[27] = input_ptrs[27];
+ inptrs[28] = input_ptrs[28];
+ inptrs[29] = input_ptrs[29];
+ inptrs[30] = input_ptrs[30];
+ inptrs[31] = input_ptrs[31];
+ inptrs[32] = input_ptrs[32];
+ inptrs[33] = input_ptrs[33];
+ inptrs[34] = input_ptrs[34];
+ inptrs[35] = input_ptrs[35];
+
+ }
+ };
+
+ Args params_struct(input_ptrs, outptrs, params,
+ activation_min, activation_max);
+
+ __asm__ __volatile__(
+ "ldr x21, [%x[params_struct], %[offsetof_args_outptrs]]\n"
+ "mov x17, #0x10\n" // cntb _, ALL, #1
+ "lsr x9, %x[n_channels], #0x2\n"
+ "ldr x16, [%x[params_struct], %[offsetof_args_params]]\n"
+ "add x20, %x[params_struct], %[offsetof_args_min]\n"
+ "ld1r { v27.4s }, [x20]\n"
+ "add x20, %x[params_struct], %[offsetof_args_max]\n"
+ "ld1r { v15.4s }, [x20]\n"
+ "add x15, %x[params_struct], %[offsetof_Args_inptrs]\n"
+ "ldp x14, x13, [x21, #0x0]\n"
+ "ldp x12, x11, [x21, #0x10]\n"
+ "mov x10, #0x0\n"
+ "sub x28, XZR, x17\n"
+ "cbz x9, 3f\n"
+ "ldr q26, [x16, #0x0]\n"
+ "ldr q0, [x16, #0x10]\n"
+ "cmp x17, x9, LSL #4\n"
+ "ldr q1, [x16, #0x20]\n"
+ "ldr q2, [x16, #0x30]\n"
+ "ldr q3, [x16, #0x40]\n"
+ "ldr q4, [x16, #0x50]\n"
+ "add x16, x16, #0x60\n"
+ "ldp x21, x20, [x15, #0x0]\n"
+ "ldr q5, [x21, x10]\n"
+ "ldr q6, [x20, x10]\n"
+ "ldp x21, x20, [x15, #0x10]\n"
+ "ldr q7, [x21, x10]\n"
+ "ldr q8, [x20, x10]\n"
+ "ldp x21, x20, [x15, #0x20]\n"
+ "ldr q9, [x21, x10]\n"
+ "ldr q13, [x20, x10]\n"
+ "ldp x21, x20, [x15, #0x30]\n"
+ "ldr q11, [x21, x10]\n"
+ "ldr q12, [x20, x10]\n"
+ "ldp x21, x20, [x15, #0x40]\n"
+ "ldr q10, [x21, x10]\n"
+ "ldr q14, [x20, x10]\n"
+ "bge 2f\n"
+ "1:" // Channel loop
+ "mov v30.16b, v26.16b\n fmla v30.4s, v0.4s, v5.4s\n"
+ "mov v31.16b, v26.16b\n fmla v31.4s, v0.4s, v6.4s\n"
+ "ldr x20, [x15, #0x50]\n"
+ "ldr q24, [x20, x10]\n"
+ "mov v28.16b, v26.16b\n fmla v28.4s, v0.4s, v7.4s\n"
+ "mov v29.16b, v26.16b\n fmla v29.4s, v0.4s, v8.4s\n"
+ "ldr q23, [x16, #0x0]\n"
+ "ldr q26, [x16, #0x140]\n"
+ "fmla v30.4s, v1.4s, v6.4s\n"
+ "fmla v31.4s, v1.4s, v9.4s\n"
+ "ldr x20, [x15, #0x58]\n"
+ "ldr q22, [x20, x10]\n"
+ "fmla v28.4s, v1.4s, v8.4s\n"
+ "fmla v29.4s, v1.4s, v13.4s\n"
+ "ldr q21, [x16, #0x10]\n"
+ "ldr x20, [x15, #0x60]\n"
+ "fmla v30.4s, v2.4s, v9.4s\n"
+ "ldr q17, [x20, x10]\n"
+ "fmla v31.4s, v2.4s, v11.4s\n"
+ "ldr x20, [x15, #0x68]\n"
+ "fmla v28.4s, v2.4s, v13.4s\n"
+ "fmla v29.4s, v2.4s, v24.4s\n"
+ "ldr q16, [x16, #0x20]\n"
+ "ldr x22, [x15, #0x70]\n"
+ "fmla v30.4s, v3.4s, v11.4s\n"
+ "ldr q5, [x20, x10]\n"
+ "fmla v31.4s, v3.4s, v12.4s\n"
+ "ldr x20, [x15, #0x78]\n"
+ "fmla v28.4s, v3.4s, v24.4s\n"
+ "fmla v29.4s, v3.4s, v22.4s\n"
+ "ldr q20, [x16, #0x30]\n"
+ "ldr x21, [x15, #0x80]\n"
+ "fmla v30.4s, v4.4s, v12.4s\n"
+ "ldr q19, [x22, x10]\n"
+ "fmla v31.4s, v4.4s, v17.4s\n"
+ "ldr q2, [x20, x10]\n"
+ "fmla v28.4s, v4.4s, v22.4s\n"
+ "fmla v29.4s, v4.4s, v10.4s\n"
+ "ldr q18, [x16, #0x40]\n"
+ "ldr x20, [x15, #0x88]\n"
+ "fmla v30.4s, v23.4s, v7.4s\n"
+ "fmla v31.4s, v23.4s, v8.4s\n"
+ "ldr x23, [x15, #0x90]\n"
+ "ldr x26, [x15, #0x98]\n"
+ "fmla v28.4s, v23.4s, v14.4s\n"
+ "fmla v29.4s, v23.4s, v5.4s\n"
+ "ldr q1, [x16, #0x50]\n"
+ "ldr x22, [x15, #0xa0]\n"
+ "fmla v30.4s, v21.4s, v8.4s\n"
+ "ldr q25, [x20, x10]\n"
+ "fmla v31.4s, v21.4s, v13.4s\n"
+ "ldr x25, [x15, #0xa8]\n"
+ "fmla v28.4s, v21.4s, v5.4s\n"
+ "fmla v29.4s, v21.4s, v19.4s\n"
+ "ldr q17, [x16, #0x60]\n"
+ "ldr x24, [x15, #0xb0]\n"
+ "fmla v30.4s, v16.4s, v13.4s\n"
+ "ldr q8, [x21, x10]\n"
+ "fmla v31.4s, v16.4s, v24.4s\n"
+ "ldr x20, [x15, #0xb8]\n"
+ "fmla v28.4s, v16.4s, v19.4s\n"
+ "fmla v29.4s, v16.4s, v2.4s\n"
+ "ldr q16, [x16, #0x70]\n"
+ "ldr x21, [x15, #0xc0]\n"
+ "fmla v30.4s, v20.4s, v24.4s\n"
+ "ldr q24, [x23, x10]\n"
+ "fmla v31.4s, v20.4s, v22.4s\n"
+ "ldr x27, [x15, #0xc8]\n"
+ "fmla v28.4s, v20.4s, v2.4s\n"
+ "fmla v29.4s, v20.4s, v8.4s\n"
+ "ldr q23, [x16, #0x80]\n"
+ "ldr x23, [x15, #0xd0]\n"
+ "fmla v30.4s, v18.4s, v22.4s\n"
+ "ldr q22, [x26, x10]\n"
+ "fmla v31.4s, v18.4s, v10.4s\n"
+ "ldr q21, [x22, x10]\n"
+ "fmla v28.4s, v18.4s, v8.4s\n"
+ "fmla v29.4s, v18.4s, v25.4s\n"
+ "ldr q20, [x16, #0x90]\n"
+ "ldr x22, [x15, #0xd8]\n"
+ "fmla v30.4s, v1.4s, v14.4s\n"
+ "ldr q0, [x20, x10]\n"
+ "fmla v31.4s, v1.4s, v5.4s\n"
+ "ldr x20, [x15, #0xe0]\n"
+ "fmla v28.4s, v1.4s, v24.4s\n"
+ "fmla v29.4s, v1.4s, v22.4s\n"
+ "ldr q6, [x16, #0xa0]\n"
+ "ldr x26, [x15, #0xf8]\n"
+ "fmla v30.4s, v17.4s, v5.4s\n"
+ "ldr q1, [x25, x10]\n"
+ "fmla v31.4s, v17.4s, v19.4s\n"
+ "ldr x25, [x15, #0xe8]\n"
+ "fmla v28.4s, v17.4s, v22.4s\n"
+ "fmla v29.4s, v17.4s, v21.4s\n"
+ "ldr q18, [x16, #0xb0]\n"
+ "add x28, x28, #0x10\n"
+ "fmla v30.4s, v16.4s, v19.4s\n"
+ "ldr q19, [x24, x10]\n"
+ "fmla v31.4s, v16.4s, v2.4s\n"
+ "ldr x24, [x15, #0xf0]\n"
+ "fmla v28.4s, v16.4s, v21.4s\n"
+ "fmla v29.4s, v16.4s, v1.4s\n"
+ "ldr q17, [x16, #0xc0]\n"
+ "fmla v30.4s, v23.4s, v2.4s\n"
+ "ldr q16, [x21, x10]\n"
+ "fmla v31.4s, v23.4s, v8.4s\n"
+ "ldr x21, [x15, #0x100]\n"
+ "fmla v28.4s, v23.4s, v1.4s\n"
+ "fmla v29.4s, v23.4s, v19.4s\n"
+ "ldr q13, [x16, #0xd0]\n"
+ "fmla v30.4s, v20.4s, v8.4s\n"
+ "ldr q2, [x27, x10]\n"
+ "fmla v31.4s, v20.4s, v25.4s\n"
+ "ldr q10, [x20, x10]\n"
+ "fmla v28.4s, v20.4s, v19.4s\n"
+ "fmla v29.4s, v20.4s, v0.4s\n"
+ "ldr q9, [x16, #0xe0]\n"
+ "ldr x20, [x15, #0x108]\n"
+ "fmla v30.4s, v6.4s, v24.4s\n"
+ "ldr q5, [x23, x10]\n"
+ "fmla v31.4s, v6.4s, v22.4s\n"
+ "ldr x23, [x15, #0x110]\n"
+ "fmla v28.4s, v6.4s, v16.4s\n"
+ "fmla v29.4s, v6.4s, v2.4s\n"
+ "ldr q24, [x16, #0xf0]\n"
+ "fmla v30.4s, v18.4s, v22.4s\n"
+ "ldr q25, [x22, x10]\n"
+ "fmla v31.4s, v18.4s, v21.4s\n"
+ "ldr x22, [x15, #0x118]\n"
+ "fmla v28.4s, v18.4s, v2.4s\n"
+ "fmla v29.4s, v18.4s, v5.4s\n"
+ "ldr q23, [x16, #0x100]\n"
+ "fmla v30.4s, v17.4s, v21.4s\n"
+ "ldr q22, [x25, x10]\n"
+ "fmla v31.4s, v17.4s, v1.4s\n"
+ "fmla v28.4s, v17.4s, v5.4s\n"
+ "fmla v29.4s, v17.4s, v25.4s\n"
+ "ldr q21, [x16, #0x110]\n"
+ "fmla v30.4s, v13.4s, v1.4s\n"
+ "ldr q18, [x24, x10]\n"
+ "fmla v31.4s, v13.4s, v19.4s\n"
+ "fmla v28.4s, v13.4s, v25.4s\n"
+ "fmla v29.4s, v13.4s, v10.4s\n"
+ "ldr q20, [x16, #0x120]\n"
+ "fmla v30.4s, v9.4s, v19.4s\n"
+ "ldr q17, [x26, x10]\n"
+ "fmla v31.4s, v9.4s, v0.4s\n"
+ "fmla v28.4s, v9.4s, v10.4s\n"
+ "fmla v29.4s, v9.4s, v22.4s\n"
+ "ldr q19, [x16, #0x130]\n"
+ "fmla v30.4s, v24.4s, v16.4s\n"
+ "ldr q16, [x21, x10]\n"
+ "fmla v31.4s, v24.4s, v2.4s\n"
+ "fmla v28.4s, v24.4s, v18.4s\n"
+ "ldr q18, [x20, x10]\n"
+ "fmla v29.4s, v24.4s, v17.4s\n"
+ "ldr q0, [x16, #0x150]\n"
+ "fmla v30.4s, v23.4s, v2.4s\n"
+ "fmla v31.4s, v23.4s, v5.4s\n"
+ "ldp x21, x20, [x15, #0x0]\n"
+ "fmla v28.4s, v23.4s, v17.4s\n"
+ "ldr q17, [x23, x10]\n"
+ "fmla v29.4s, v23.4s, v16.4s\n"
+ "ldr q1, [x16, #0x160]\n"
+ "fmla v30.4s, v21.4s, v5.4s\n"
+ "ldr q5, [x21, x17]\n"
+ "fmla v31.4s, v21.4s, v25.4s\n"
+ "fmla v28.4s, v21.4s, v16.4s\n"
+ "ldr q16, [x22, x10]\n"
+ "fmla v29.4s, v21.4s, v18.4s\n"
+ "ldr q2, [x16, #0x170]\n"
+ "fmla v30.4s, v20.4s, v25.4s\n"
+ "ldr q6, [x20, x17]\n"
+ "fmla v31.4s, v20.4s, v10.4s\n"
+ "ldp x21, x20, [x15, #0x10]\n"
+ "ldr q7, [x21, x17]\n"
+ "fmla v28.4s, v20.4s, v18.4s\n"
+ "fmla v29.4s, v20.4s, v17.4s\n"
+ "ldr q3, [x16, #0x180]\n"
+ "fmla v30.4s, v19.4s, v10.4s\n"
+ "ldr q8, [x20, x17]\n"
+ "fmla v31.4s, v19.4s, v22.4s\n"
+ "ldp x21, x20, [x15, #0x20]\n"
+ "ldr q13, [x20, x17]\n"
+ "fmla v28.4s, v19.4s, v17.4s\n"
+ "fmla v29.4s, v19.4s, v16.4s\n"
+ "ldr q9, [x21, x17]\n"
+ "ldr q4, [x16, #0x190]\n"
+ "ldp x21, x20, [x15, #0x30]\n"
+ "fmax v30.4s, v30.4s, v27.4s\n"
+ "fmax v31.4s, v31.4s, v27.4s\n"
+ "ldr q11, [x21, x17]\n"
+ "ldr q12, [x20, x17]\n"
+ "fmax v28.4s, v28.4s, v27.4s\n"
+ "fmax v29.4s, v29.4s, v27.4s\n"
+ "ldp x21, x20, [x15, #0x40]\n"
+ "ldr q10, [x21, x17]\n"
+ "fmin v30.4s, v30.4s, v15.4s\n"
+ "fmin v31.4s, v31.4s, v15.4s\n"
+ "ldr q14, [x20, x17]\n"
+ "add x17, x17, #0x10\n"
+ "cmp x17, x9, LSL #4\n"
+ "fmin v28.4s, v28.4s, v15.4s\n"
+ "fmin v29.4s, v29.4s, v15.4s\n"
+ "add x10, x10, #0x10\n"
+ "str q30, [x14, x28]\n"
+ "add x16, x16, #0x1a0\n"
+ "str q31, [x13, x28]\n"
+ "str q28, [x12, x28]\n"
+ "str q29, [x11, x28]\n"
+ "blt 1b\n"
+ "2:" // Channel tail
+ "mov v31.16b, v26.16b\n fmla v31.4s, v0.4s, v5.4s\n"
+ "mov v5.16b, v26.16b\n fmla v5.4s, v0.4s, v6.4s\n"
+ "ldr x20, [x15, #0x50]\n"
+ "ldr q22, [x20, x10]\n"
+ "mov v30.16b, v26.16b\n fmla v30.4s, v0.4s, v7.4s\n"
+ "mov v29.16b, v26.16b\n fmla v29.4s, v0.4s, v8.4s\n"
+ "ldr q19, [x16, #0x0]\n"
+ "ldr x20, [x15, #0x58]\n"
+ "fmla v31.4s, v1.4s, v6.4s\n"
+ "ldr q21, [x20, x10]\n"
+ "fmla v5.4s, v1.4s, v9.4s\n"
+ "ldr x21, [x15, #0x60]\n"
+ "fmla v30.4s, v1.4s, v8.4s\n"
+ "fmla v29.4s, v1.4s, v13.4s\n"
+ "ldr q18, [x16, #0x10]\n"
+ "ldr x20, [x15, #0x68]\n"
+ "fmla v31.4s, v2.4s, v9.4s\n"
+ "ldr q16, [x21, x10]\n"
+ "fmla v5.4s, v2.4s, v11.4s\n"
+ "ldr x23, [x15, #0x70]\n"
+ "fmla v30.4s, v2.4s, v13.4s\n"
+ "fmla v29.4s, v2.4s, v22.4s\n"
+ "ldr q17, [x16, #0x20]\n"
+ "ldr x21, [x15, #0x78]\n"
+ "fmla v31.4s, v3.4s, v11.4s\n"
+ "ldr q6, [x20, x10]\n"
+ "fmla v5.4s, v3.4s, v12.4s\n"
+ "ldr x22, [x15, #0x80]\n"
+ "fmla v30.4s, v3.4s, v22.4s\n"
+ "fmla v29.4s, v3.4s, v21.4s\n"
+ "ldr q20, [x16, #0x30]\n"
+ "ldr x20, [x15, #0x88]\n"
+ "fmla v31.4s, v4.4s, v12.4s\n"
+ "ldr q2, [x23, x10]\n"
+ "fmla v5.4s, v4.4s, v16.4s\n"
+ "ldr q28, [x21, x10]\n"
+ "fmla v30.4s, v4.4s, v21.4s\n"
+ "fmla v29.4s, v4.4s, v10.4s\n"
+ "ldr q16, [x16, #0x40]\n"
+ "ldr x21, [x15, #0x90]\n"
+ "fmla v31.4s, v19.4s, v7.4s\n"
+ "fmla v5.4s, v19.4s, v8.4s\n"
+ "ldr x27, [x15, #0x98]\n"
+ "ldr x26, [x15, #0xa0]\n"
+ "fmla v30.4s, v19.4s, v14.4s\n"
+ "fmla v29.4s, v19.4s, v6.4s\n"
+ "ldr q19, [x16, #0x50]\n"
+ "ldr x25, [x15, #0xa8]\n"
+ "fmla v31.4s, v18.4s, v8.4s\n"
+ "ldr q1, [x20, x10]\n"
+ "fmla v5.4s, v18.4s, v13.4s\n"
+ "ldr x24, [x15, #0xb0]\n"
+ "fmla v30.4s, v18.4s, v6.4s\n"
+ "fmla v29.4s, v18.4s, v2.4s\n"
+ "ldr q18, [x16, #0x60]\n"
+ "ldr x20, [x15, #0xb8]\n"
+ "fmla v31.4s, v17.4s, v13.4s\n"
+ "ldr q26, [x22, x10]\n"
+ "fmla v5.4s, v17.4s, v22.4s\n"
+ "ldr x23, [x15, #0xc0]\n"
+ "fmla v30.4s, v17.4s, v2.4s\n"
+ "fmla v29.4s, v17.4s, v28.4s\n"
+ "ldr q17, [x16, #0x70]\n"
+ "ldr x22, [x15, #0xc8]\n"
+ "fmla v31.4s, v20.4s, v22.4s\n"
+ "ldr q25, [x21, x10]\n"
+ "fmla v5.4s, v20.4s, v21.4s\n"
+ "ldr x21, [x15, #0xd0]\n"
+ "fmla v30.4s, v20.4s, v28.4s\n"
+ "fmla v29.4s, v20.4s, v26.4s\n"
+ "ldr q24, [x16, #0x80]\n"
+ "add x28, x28, #0x10\n"
+ "fmla v31.4s, v16.4s, v21.4s\n"
+ "ldr q23, [x27, x10]\n"
+ "fmla v5.4s, v16.4s, v10.4s\n"
+ "ldr q0, [x26, x10]\n"
+ "fmla v30.4s, v16.4s, v26.4s\n"
+ "fmla v29.4s, v16.4s, v1.4s\n"
+ "ldr q22, [x16, #0x90]\n"
+ "ldr x27, [x15, #0xd8]\n"
+ "fmla v31.4s, v19.4s, v14.4s\n"
+ "ldr q16, [x20, x10]\n"
+ "fmla v5.4s, v19.4s, v6.4s\n"
+ "ldr x20, [x15, #0xe0]\n"
+ "fmla v30.4s, v19.4s, v25.4s\n"
+ "fmla v29.4s, v19.4s, v23.4s\n"
+ "ldr q21, [x16, #0xa0]\n"
+ "ldr x26, [x15, #0xf8]\n"
+ "fmla v31.4s, v18.4s, v6.4s\n"
+ "ldr q20, [x25, x10]\n"
+ "fmla v5.4s, v18.4s, v2.4s\n"
+ "ldr x25, [x15, #0xe8]\n"
+ "fmla v30.4s, v18.4s, v23.4s\n"
+ "fmla v29.4s, v18.4s, v0.4s\n"
+ "ldr q18, [x16, #0xb0]\n"
+ "fmla v31.4s, v17.4s, v2.4s\n"
+ "ldr q19, [x24, x10]\n"
+ "fmla v5.4s, v17.4s, v28.4s\n"
+ "ldr x24, [x15, #0xf0]\n"
+ "fmla v30.4s, v17.4s, v0.4s\n"
+ "fmla v29.4s, v17.4s, v20.4s\n"
+ "ldr q17, [x16, #0xc0]\n"
+ "fmla v31.4s, v24.4s, v28.4s\n"
+ "ldr q7, [x23, x10]\n"
+ "fmla v5.4s, v24.4s, v26.4s\n"
+ "ldr x23, [x15, #0x100]\n"
+ "fmla v30.4s, v24.4s, v20.4s\n"
+ "fmla v29.4s, v24.4s, v19.4s\n"
+ "ldr q3, [x16, #0xd0]\n"
+ "fmla v31.4s, v22.4s, v26.4s\n"
+ "ldr q28, [x22, x10]\n"
+ "fmla v5.4s, v22.4s, v1.4s\n"
+ "ldr q13, [x20, x10]\n"
+ "fmla v30.4s, v22.4s, v19.4s\n"
+ "fmla v29.4s, v22.4s, v16.4s\n"
+ "ldr q11, [x16, #0xe0]\n"
+ "ldr x22, [x15, #0x108]\n"
+ "fmla v31.4s, v21.4s, v25.4s\n"
+ "ldr q26, [x21, x10]\n"
+ "fmla v5.4s, v21.4s, v23.4s\n"
+ "ldr x21, [x15, #0x110]\n"
+ "fmla v30.4s, v21.4s, v7.4s\n"
+ "fmla v29.4s, v21.4s, v28.4s\n"
+ "ldr q25, [x16, #0xf0]\n"
+ "fmla v31.4s, v18.4s, v23.4s\n"
+ "ldr q24, [x27, x10]\n"
+ "fmla v5.4s, v18.4s, v0.4s\n"
+ "ldr x20, [x15, #0x118]\n"
+ "fmla v30.4s, v18.4s, v28.4s\n"
+ "fmla v29.4s, v18.4s, v26.4s\n"
+ "ldr q23, [x16, #0x100]\n"
+ "fmla v31.4s, v17.4s, v0.4s\n"
+ "ldr q22, [x25, x10]\n"
+ "fmla v5.4s, v17.4s, v20.4s\n"
+ "fmla v30.4s, v17.4s, v26.4s\n"
+ "fmla v29.4s, v17.4s, v24.4s\n"
+ "ldr q21, [x16, #0x110]\n"
+ "fmla v31.4s, v3.4s, v20.4s\n"
+ "ldr q18, [x24, x10]\n"
+ "fmla v5.4s, v3.4s, v19.4s\n"
+ "fmla v30.4s, v3.4s, v24.4s\n"
+ "fmla v29.4s, v3.4s, v13.4s\n"
+ "ldr q20, [x16, #0x120]\n"
+ "fmla v31.4s, v11.4s, v19.4s\n"
+ "ldr q17, [x26, x10]\n"
+ "fmla v5.4s, v11.4s, v16.4s\n"
+ "fmla v30.4s, v11.4s, v13.4s\n"
+ "fmla v29.4s, v11.4s, v22.4s\n"
+ "ldr q19, [x16, #0x130]\n"
+ "add x16, x16, #0x140\n"
+ "fmla v31.4s, v25.4s, v7.4s\n"
+ "ldr q16, [x23, x10]\n"
+ "fmla v5.4s, v25.4s, v28.4s\n"
+ "fmla v30.4s, v25.4s, v18.4s\n"
+ "ldr q18, [x22, x10]\n"
+ "fmla v29.4s, v25.4s, v17.4s\n"
+ "fmla v31.4s, v23.4s, v28.4s\n"
+ "fmla v5.4s, v23.4s, v26.4s\n"
+ "fmla v30.4s, v23.4s, v17.4s\n"
+ "ldr q17, [x21, x10]\n"
+ "fmla v29.4s, v23.4s, v16.4s\n"
+ "fmla v31.4s, v21.4s, v26.4s\n"
+ "fmla v5.4s, v21.4s, v24.4s\n"
+ "fmla v30.4s, v21.4s, v16.4s\n"
+ "ldr q16, [x20, x10]\n"
+ "fmla v29.4s, v21.4s, v18.4s\n"
+ "add x10, x10, #0x10\n"
+ "fmla v31.4s, v20.4s, v24.4s\n"
+ "fmla v5.4s, v20.4s, v13.4s\n"
+ "fmla v30.4s, v20.4s, v18.4s\n"
+ "fmla v29.4s, v20.4s, v17.4s\n"
+ "fmla v31.4s, v19.4s, v13.4s\n"
+ "fmla v5.4s, v19.4s, v22.4s\n"
+ "fmax v31.4s, v31.4s, v27.4s\n"
+ "fmla v30.4s, v19.4s, v17.4s\n"
+ "fmla v29.4s, v19.4s, v16.4s\n"
+ "fmax v5.4s, v5.4s, v27.4s\n"
+ "fmax v30.4s, v30.4s, v27.4s\n"
+ "fmax v29.4s, v29.4s, v27.4s\n"
+ "fmin v31.4s, v31.4s, v15.4s\n"
+ "fmin v5.4s, v5.4s, v15.4s\n"
+ "str q31, [x14, x28]\n"
+ "fmin v30.4s, v30.4s, v15.4s\n"
+ "fmin v29.4s, v29.4s, v15.4s\n"
+ "str q5, [x13, x28]\n"
+ "str q30, [x12, x28]\n"
+ "str q29, [x11, x28]\n"
+ "3:" // Oddments
+ "tst %x[n_channels], #0x3\n"
+ "beq 60f\n"
+ "ldr q26, [x16, #0x0]\n"
+ "ldr q0, [x16, #0x10]\n"
+ "mov x20, x10\n"
+ "add x14, x14, x20\n"
+ "ldr q1, [x16, #0x20]\n"
+ "ldr q2, [x16, #0x30]\n"
+ "add x13, x13, x20\n"
+ "add x12, x12, x20\n"
+ "ldr q3, [x16, #0x40]\n"
+ "ldr q4, [x16, #0x50]\n"
+ "add x11, x11, x20\n"
+ "ldr x9, [x15, #0x0]\n"
+ "ldr x28, [x15, #0x8]\n"
+ "add x9, x9, x10\n"
+ "add x28, x28, x10\n"
+ "ldr x27, [x15, #0x10]\n"
+ "ldr x26, [x15, #0x18]\n"
+ "add x27, x27, x10\n"
+ "add x26, x26, x10\n"
+ "ldr x25, [x15, #0x20]\n"
+ "ldr x24, [x15, #0x28]\n"
+ "add x25, x25, x10\n"
+ "add x24, x24, x10\n"
+ "ldr x23, [x15, #0x30]\n"
+ "ldr x22, [x15, #0x38]\n"
+ "add x23, x23, x10\n"
+ "add x22, x22, x10\n"
+ "ldr x21, [x15, #0x40]\n"
+ "ldr x20, [x15, #0x48]\n"
+ "add x21, x21, x10\n"
+ "add x20, x20, x10\n"
+ "add x16, x16, #0x60\n"
+ "tbz %x[n_channels], #1, 4f\n"
+ "ld1 { v5.d }[0], [x9], #0x8\n"
+ "ld1 { v6.d }[0], [x28], #0x8\n"
+ "ld1 { v7.d }[0], [x27], #0x8\n"
+ "ld1 { v8.d }[0], [x26], #0x8\n"
+ "ld1 { v9.d }[0], [x25], #0x8\n"
+ "ld1 { v13.d }[0], [x24], #0x8\n"
+ "ld1 { v11.d }[0], [x23], #0x8\n"
+ "ld1 { v12.d }[0], [x22], #0x8\n"
+ "ld1 { v10.d }[0], [x21], #0x8\n"
+ "ld1 { v14.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 5f\n"
+ "ld1 { v5.s }[2], [x9], #0x4\n"
+ "ld1 { v6.s }[2], [x28], #0x4\n"
+ "ld1 { v7.s }[2], [x27], #0x4\n"
+ "ld1 { v8.s }[2], [x26], #0x4\n"
+ "ld1 { v9.s }[2], [x25], #0x4\n"
+ "ld1 { v13.s }[2], [x24], #0x4\n"
+ "ld1 { v11.s }[2], [x23], #0x4\n"
+ "ld1 { v12.s }[2], [x22], #0x4\n"
+ "ld1 { v10.s }[2], [x21], #0x4\n"
+ "ld1 { v14.s }[2], [x20], #0x4\n"
+ "b 5f\n"
+ "4:" // Oddments: Load inputs (0, 0), (0, 1), (1, 0), (1, 1), (0, 2), (1, 2), (0, 3), (0, 4), (1, 5), (2, 0): Bit 1: Unset
+ "ld1 { v5.s }[0], [x9], #0x4\n"
+ "ld1 { v6.s }[0], [x28], #0x4\n"
+ "ld1 { v7.s }[0], [x27], #0x4\n"
+ "ld1 { v8.s }[0], [x26], #0x4\n"
+ "ld1 { v9.s }[0], [x25], #0x4\n"
+ "ld1 { v13.s }[0], [x24], #0x4\n"
+ "ld1 { v11.s }[0], [x23], #0x4\n"
+ "ld1 { v12.s }[0], [x22], #0x4\n"
+ "ld1 { v10.s }[0], [x21], #0x4\n"
+ "ld1 { v14.s }[0], [x20], #0x4\n"
+ "5:" // Oddments: Load inputs (0, 0), (0, 1), (1, 0), (1, 1), (0, 2), (1, 2), (0, 3), (0, 4), (1, 5), (2, 0): Bit 1: End
+ "mov v28.16b, v26.16b\n fmla v28.4s, v0.4s, v5.4s\n"
+ "mov v29.16b, v26.16b\n fmla v29.4s, v0.4s, v6.4s\n"
+ "ldr x20, [x15, #0x50]\n"
+ "add x20, x20, x10\n"
+ "mov v30.16b, v26.16b\n fmla v30.4s, v0.4s, v7.4s\n"
+ "mov v31.16b, v26.16b\n fmla v31.4s, v0.4s, v8.4s\n"
+ "fmla v28.4s, v1.4s, v6.4s\n"
+ "fmla v29.4s, v1.4s, v9.4s\n"
+ "fmla v30.4s, v1.4s, v8.4s\n"
+ "fmla v31.4s, v1.4s, v13.4s\n"
+ "fmla v28.4s, v2.4s, v9.4s\n"
+ "fmla v29.4s, v2.4s, v11.4s\n"
+ "fmla v30.4s, v2.4s, v13.4s\n"
+ "tbz %x[n_channels], #1, 6f\n"
+ "ld1 { v5.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 7f\n"
+ "ld1 { v5.s }[2], [x20], #0x4\n"
+ "b 7f\n"
+ "6:" // Oddments: Load input (1, 3): Bit 1: Unset
+ "ld1 { v5.s }[0], [x20], #0x4\n"
+ "7:" // Oddments: Load input (1, 3): Bit 1: End
+ "ldr x20, [x15, #0x58]\n"
+ "fmla v31.4s, v2.4s, v5.4s\n"
+ "fmla v28.4s, v3.4s, v11.4s\n"
+ "add x20, x20, x10\n"
+ "fmla v29.4s, v3.4s, v12.4s\n"
+ "fmla v30.4s, v3.4s, v5.4s\n"
+ "tbz %x[n_channels], #1, 8f\n"
+ "ld1 { v6.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 9f\n"
+ "ld1 { v6.s }[2], [x20], #0x4\n"
+ "b 9f\n"
+ "8:" // Oddments: Load input (1, 4): Bit 1: Unset
+ "ld1 { v6.s }[0], [x20], #0x4\n"
+ "9:" // Oddments: Load input (1, 4): Bit 1: End
+ "ldr x20, [x15, #0x60]\n"
+ "fmla v31.4s, v3.4s, v6.4s\n"
+ "fmla v28.4s, v4.4s, v12.4s\n"
+ "add x20, x20, x10\n"
+ "tbz %x[n_channels], #1, 10f\n"
+ "ld1 { v9.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 11f\n"
+ "ld1 { v9.s }[2], [x20], #0x4\n"
+ "b 11f\n"
+ "10:" // Oddments: Load input (0, 5): Bit 1: Unset
+ "ld1 { v9.s }[0], [x20], #0x4\n"
+ "11:" // Oddments: Load input (0, 5): Bit 1: End
+ "ldr q0, [x16, #0x0]\n"
+ "fmla v29.4s, v4.4s, v9.4s\n"
+ "fmla v30.4s, v4.4s, v6.4s\n"
+ "ldr x20, [x15, #0x68]\n"
+ "fmla v31.4s, v4.4s, v10.4s\n"
+ "fmla v28.4s, v0.4s, v7.4s\n"
+ "add x20, x20, x10\n"
+ "fmla v29.4s, v0.4s, v8.4s\n"
+ "fmla v30.4s, v0.4s, v14.4s\n"
+ "add x16, x16, #0x10\n"
+ "tbz %x[n_channels], #1, 12f\n"
+ "ld1 { v11.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 13f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "b 13f\n"
+ "12:" // Oddments: Load input (2, 1): Bit 1: Unset
+ "ld1 { v11.s }[0], [x20], #0x4\n"
+ "13:" // Oddments: Load input (2, 1): Bit 1: End
+ "ldr q1, [x16, #0x0]\n"
+ "ldr x20, [x15, #0x70]\n"
+ "fmla v31.4s, v0.4s, v11.4s\n"
+ "fmla v28.4s, v1.4s, v8.4s\n"
+ "fmla v29.4s, v1.4s, v13.4s\n"
+ "fmla v30.4s, v1.4s, v11.4s\n"
+ "add x20, x20, x10\n"
+ "add x16, x16, #0x10\n"
+ "tbz %x[n_channels], #1, 14f\n"
+ "ld1 { v12.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 15f\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
+ "b 15f\n"
+ "14:" // Oddments: Load input (2, 2): Bit 1: Unset
+ "ld1 { v12.s }[0], [x20], #0x4\n"
+ "15:" // Oddments: Load input (2, 2): Bit 1: End
+ "ldr q2, [x16, #0x0]\n"
+ "ldr x20, [x15, #0x78]\n"
+ "fmla v31.4s, v1.4s, v12.4s\n"
+ "fmla v28.4s, v2.4s, v13.4s\n"
+ "fmla v29.4s, v2.4s, v5.4s\n"
+ "fmla v30.4s, v2.4s, v12.4s\n"
+ "add x20, x20, x10\n"
+ "add x16, x16, #0x10\n"
+ "tbz %x[n_channels], #1, 16f\n"
+ "ld1 { v9.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 17f\n"
+ "ld1 { v9.s }[2], [x20], #0x4\n"
+ "b 17f\n"
+ "16:" // Oddments: Load input (2, 3): Bit 1: Unset
+ "ld1 { v9.s }[0], [x20], #0x4\n"
+ "17:" // Oddments: Load input (2, 3): Bit 1: End
+ "ldr q3, [x16, #0x0]\n"
+ "ldr x20, [x15, #0x80]\n"
+ "fmla v31.4s, v2.4s, v9.4s\n"
+ "fmla v28.4s, v3.4s, v5.4s\n"
+ "fmla v29.4s, v3.4s, v6.4s\n"
+ "fmla v30.4s, v3.4s, v9.4s\n"
+ "add x20, x20, x10\n"
+ "add x16, x16, #0x10\n"
+ "tbz %x[n_channels], #1, 18f\n"
+ "ld1 { v13.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 19f\n"
+ "ld1 { v13.s }[2], [x20], #0x4\n"
+ "b 19f\n"
+ "18:" // Oddments: Load input (2, 4): Bit 1: Unset
+ "ld1 { v13.s }[0], [x20], #0x4\n"
+ "19:" // Oddments: Load input (2, 4): Bit 1: End
+ "ldr q4, [x16, #0x0]\n"
+ "ldr x20, [x15, #0x88]\n"
+ "fmla v31.4s, v3.4s, v13.4s\n"
+ "fmla v28.4s, v4.4s, v6.4s\n"
+ "fmla v29.4s, v4.4s, v10.4s\n"
+ "fmla v30.4s, v4.4s, v13.4s\n"
+ "add x20, x20, x10\n"
+ "add x16, x16, #0x10\n"
+ "tbz %x[n_channels], #1, 20f\n"
+ "ld1 { v8.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 21f\n"
+ "ld1 { v8.s }[2], [x20], #0x4\n"
+ "b 21f\n"
+ "20:" // Oddments: Load input (2, 5): Bit 1: Unset
+ "ld1 { v8.s }[0], [x20], #0x4\n"
+ "21:" // Oddments: Load input (2, 5): Bit 1: End
+ "ldr q0, [x16, #0x0]\n"
+ "ldr x20, [x15, #0x90]\n"
+ "fmla v31.4s, v4.4s, v8.4s\n"
+ "fmla v28.4s, v0.4s, v14.4s\n"
+ "fmla v29.4s, v0.4s, v11.4s\n"
+ "add x20, x20, x10\n"
+ "add x16, x16, #0x10\n"
+ "tbz %x[n_channels], #1, 22f\n"
+ "ld1 { v5.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 23f\n"
+ "ld1 { v5.s }[2], [x20], #0x4\n"
+ "b 23f\n"
+ "22:" // Oddments: Load input (3, 0): Bit 1: Unset
+ "ld1 { v5.s }[0], [x20], #0x4\n"
+ "23:" // Oddments: Load input (3, 0): Bit 1: End
+ "ldr x20, [x15, #0x98]\n"
+ "fmla v30.4s, v0.4s, v5.4s\n"
+ "add x20, x20, x10\n"
+ "tbz %x[n_channels], #1, 24f\n"
+ "ld1 { v6.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 25f\n"
+ "ld1 { v6.s }[2], [x20], #0x4\n"
+ "b 25f\n"
+ "24:" // Oddments: Load input (3, 1): Bit 1: Unset
+ "ld1 { v6.s }[0], [x20], #0x4\n"
+ "25:" // Oddments: Load input (3, 1): Bit 1: End
+ "ldr q1, [x16, #0x0]\n"
+ "ldr x20, [x15, #0xa0]\n"
+ "fmla v31.4s, v0.4s, v6.4s\n"
+ "fmla v28.4s, v1.4s, v11.4s\n"
+ "fmla v29.4s, v1.4s, v12.4s\n"
+ "fmla v30.4s, v1.4s, v6.4s\n"
+ "add x20, x20, x10\n"
+ "add x16, x16, #0x10\n"
+ "tbz %x[n_channels], #1, 26f\n"
+ "ld1 { v10.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 27f\n"
+ "ld1 { v10.s }[2], [x20], #0x4\n"
+ "b 27f\n"
+ "26:" // Oddments: Load input (3, 2): Bit 1: Unset
+ "ld1 { v10.s }[0], [x20], #0x4\n"
+ "27:" // Oddments: Load input (3, 2): Bit 1: End
+ "ldr q2, [x16, #0x0]\n"
+ "ldr x20, [x15, #0xa8]\n"
+ "fmla v31.4s, v1.4s, v10.4s\n"
+ "fmla v28.4s, v2.4s, v12.4s\n"
+ "fmla v29.4s, v2.4s, v9.4s\n"
+ "fmla v30.4s, v2.4s, v10.4s\n"
+ "add x20, x20, x10\n"
+ "add x16, x16, #0x10\n"
+ "tbz %x[n_channels], #1, 28f\n"
+ "ld1 { v11.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 29f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "b 29f\n"
+ "28:" // Oddments: Load input (3, 3): Bit 1: Unset
+ "ld1 { v11.s }[0], [x20], #0x4\n"
+ "29:" // Oddments: Load input (3, 3): Bit 1: End
+ "ldr q3, [x16, #0x0]\n"
+ "ldr x20, [x15, #0xb0]\n"
+ "fmla v31.4s, v2.4s, v11.4s\n"
+ "fmla v28.4s, v3.4s, v9.4s\n"
+ "fmla v29.4s, v3.4s, v13.4s\n"
+ "fmla v30.4s, v3.4s, v11.4s\n"
+ "add x20, x20, x10\n"
+ "add x16, x16, #0x10\n"
+ "tbz %x[n_channels], #1, 30f\n"
+ "ld1 { v12.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 31f\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
+ "b 31f\n"
+ "30:" // Oddments: Load input (3, 4): Bit 1: Unset
+ "ld1 { v12.s }[0], [x20], #0x4\n"
+ "31:" // Oddments: Load input (3, 4): Bit 1: End
+ "ldr q4, [x16, #0x0]\n"
+ "ldr x20, [x15, #0xb8]\n"
+ "fmla v31.4s, v3.4s, v12.4s\n"
+ "fmla v28.4s, v4.4s, v13.4s\n"
+ "fmla v29.4s, v4.4s, v8.4s\n"
+ "fmla v30.4s, v4.4s, v12.4s\n"
+ "add x20, x20, x10\n"
+ "add x16, x16, #0x10\n"
+ "tbz %x[n_channels], #1, 32f\n"
+ "ld1 { v14.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 33f\n"
+ "ld1 { v14.s }[2], [x20], #0x4\n"
+ "b 33f\n"
+ "32:" // Oddments: Load input (3, 5): Bit 1: Unset
+ "ld1 { v14.s }[0], [x20], #0x4\n"
+ "33:" // Oddments: Load input (3, 5): Bit 1: End
+ "ldr q0, [x16, #0x0]\n"
+ "ldr x20, [x15, #0xc0]\n"
+ "fmla v31.4s, v4.4s, v14.4s\n"
+ "fmla v28.4s, v0.4s, v5.4s\n"
+ "fmla v29.4s, v0.4s, v6.4s\n"
+ "add x20, x20, x10\n"
+ "add x16, x16, #0x10\n"
+ "tbz %x[n_channels], #1, 34f\n"
+ "ld1 { v9.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 35f\n"
+ "ld1 { v9.s }[2], [x20], #0x4\n"
+ "b 35f\n"
+ "34:" // Oddments: Load input (4, 0): Bit 1: Unset
+ "ld1 { v9.s }[0], [x20], #0x4\n"
+ "35:" // Oddments: Load input (4, 0): Bit 1: End
+ "ldr x20, [x15, #0xc8]\n"
+ "fmla v30.4s, v0.4s, v9.4s\n"
+ "add x20, x20, x10\n"
+ "tbz %x[n_channels], #1, 36f\n"
+ "ld1 { v13.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 37f\n"
+ "ld1 { v13.s }[2], [x20], #0x4\n"
+ "b 37f\n"
+ "36:" // Oddments: Load input (4, 1): Bit 1: Unset
+ "ld1 { v13.s }[0], [x20], #0x4\n"
+ "37:" // Oddments: Load input (4, 1): Bit 1: End
+ "ldr q1, [x16, #0x0]\n"
+ "ldr x20, [x15, #0xd0]\n"
+ "fmla v31.4s, v0.4s, v13.4s\n"
+ "fmla v28.4s, v1.4s, v6.4s\n"
+ "fmla v29.4s, v1.4s, v10.4s\n"
+ "fmla v30.4s, v1.4s, v13.4s\n"
+ "add x20, x20, x10\n"
+ "add x16, x16, #0x10\n"
+ "tbz %x[n_channels], #1, 38f\n"
+ "ld1 { v5.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 39f\n"
+ "ld1 { v5.s }[2], [x20], #0x4\n"
+ "b 39f\n"
+ "38:" // Oddments: Load input (4, 2): Bit 1: Unset
+ "ld1 { v5.s }[0], [x20], #0x4\n"
+ "39:" // Oddments: Load input (4, 2): Bit 1: End
+ "ldr q2, [x16, #0x0]\n"
+ "ldr x20, [x15, #0xd8]\n"
+ "fmla v31.4s, v1.4s, v5.4s\n"
+ "fmla v28.4s, v2.4s, v10.4s\n"
+ "fmla v29.4s, v2.4s, v11.4s\n"
+ "fmla v30.4s, v2.4s, v5.4s\n"
+ "add x20, x20, x10\n"
+ "add x16, x16, #0x10\n"
+ "tbz %x[n_channels], #1, 40f\n"
+ "ld1 { v6.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 41f\n"
+ "ld1 { v6.s }[2], [x20], #0x4\n"
+ "b 41f\n"
+ "40:" // Oddments: Load input (4, 3): Bit 1: Unset
+ "ld1 { v6.s }[0], [x20], #0x4\n"
+ "41:" // Oddments: Load input (4, 3): Bit 1: End
+ "ldr q3, [x16, #0x0]\n"
+ "ldr x20, [x15, #0xe0]\n"
+ "fmla v31.4s, v2.4s, v6.4s\n"
+ "fmla v28.4s, v3.4s, v11.4s\n"
+ "fmla v29.4s, v3.4s, v12.4s\n"
+ "fmla v30.4s, v3.4s, v6.4s\n"
+ "add x20, x20, x10\n"
+ "add x16, x16, #0x10\n"
+ "tbz %x[n_channels], #1, 42f\n"
+ "ld1 { v8.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 43f\n"
+ "ld1 { v8.s }[2], [x20], #0x4\n"
+ "b 43f\n"
+ "42:" // Oddments: Load input (4, 4): Bit 1: Unset
+ "ld1 { v8.s }[0], [x20], #0x4\n"
+ "43:" // Oddments: Load input (4, 4): Bit 1: End
+ "ldr q4, [x16, #0x0]\n"
+ "ldr x20, [x15, #0xe8]\n"
+ "fmla v31.4s, v3.4s, v8.4s\n"
+ "fmla v28.4s, v4.4s, v12.4s\n"
+ "fmla v29.4s, v4.4s, v14.4s\n"
+ "fmla v30.4s, v4.4s, v8.4s\n"
+ "add x20, x20, x10\n"
+ "add x16, x16, #0x10\n"
+ "tbz %x[n_channels], #1, 44f\n"
+ "ld1 { v10.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 45f\n"
+ "ld1 { v10.s }[2], [x20], #0x4\n"
+ "b 45f\n"
+ "44:" // Oddments: Load input (4, 5): Bit 1: Unset
+ "ld1 { v10.s }[0], [x20], #0x4\n"
+ "45:" // Oddments: Load input (4, 5): Bit 1: End
+ "ldr q0, [x16, #0x0]\n"
+ "ldr x20, [x15, #0xf0]\n"
+ "fmla v31.4s, v4.4s, v10.4s\n"
+ "fmla v28.4s, v0.4s, v9.4s\n"
+ "fmla v29.4s, v0.4s, v13.4s\n"
+ "add x20, x20, x10\n"
+ "add x16, x16, #0x10\n"
+ "tbz %x[n_channels], #1, 46f\n"
+ "ld1 { v11.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 47f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "b 47f\n"
+ "46:" // Oddments: Load input (5, 0): Bit 1: Unset
+ "ld1 { v11.s }[0], [x20], #0x4\n"
+ "47:" // Oddments: Load input (5, 0): Bit 1: End
+ "ldr x20, [x15, #0xf8]\n"
+ "fmla v30.4s, v0.4s, v11.4s\n"
+ "add x20, x20, x10\n"
+ "tbz %x[n_channels], #1, 48f\n"
+ "ld1 { v12.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 49f\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
+ "b 49f\n"
+ "48:" // Oddments: Load input (5, 1): Bit 1: Unset
+ "ld1 { v12.s }[0], [x20], #0x4\n"
+ "49:" // Oddments: Load input (5, 1): Bit 1: End
+ "ldr q1, [x16, #0x0]\n"
+ "ldr x20, [x15, #0x100]\n"
+ "fmla v31.4s, v0.4s, v12.4s\n"
+ "fmla v28.4s, v1.4s, v13.4s\n"
+ "fmla v29.4s, v1.4s, v5.4s\n"
+ "fmla v30.4s, v1.4s, v12.4s\n"
+ "add x20, x20, x10\n"
+ "add x16, x16, #0x10\n"
+ "tbz %x[n_channels], #1, 50f\n"
+ "ld1 { v9.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 51f\n"
+ "ld1 { v9.s }[2], [x20], #0x4\n"
+ "b 51f\n"
+ "50:" // Oddments: Load input (5, 2): Bit 1: Unset
+ "ld1 { v9.s }[0], [x20], #0x4\n"
+ "51:" // Oddments: Load input (5, 2): Bit 1: End
+ "ldr q2, [x16, #0x0]\n"
+ "ldr x20, [x15, #0x108]\n"
+ "fmla v31.4s, v1.4s, v9.4s\n"
+ "fmla v28.4s, v2.4s, v5.4s\n"
+ "fmla v29.4s, v2.4s, v6.4s\n"
+ "fmla v30.4s, v2.4s, v9.4s\n"
+ "add x20, x20, x10\n"
+ "add x16, x16, #0x10\n"
+ "tbz %x[n_channels], #1, 52f\n"
+ "ld1 { v11.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 53f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "b 53f\n"
+ "52:" // Oddments: Load input (5, 3): Bit 1: Unset
+ "ld1 { v11.s }[0], [x20], #0x4\n"
+ "53:" // Oddments: Load input (5, 3): Bit 1: End
+ "ldr q3, [x16, #0x0]\n"
+ "ldr x20, [x15, #0x110]\n"
+ "fmla v31.4s, v2.4s, v11.4s\n"
+ "fmla v28.4s, v3.4s, v6.4s\n"
+ "fmla v29.4s, v3.4s, v8.4s\n"
+ "fmla v30.4s, v3.4s, v11.4s\n"
+ "add x20, x20, x10\n"
+ "add x16, x16, #0x10\n"
+ "tbz %x[n_channels], #1, 54f\n"
+ "ld1 { v12.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 55f\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
+ "b 55f\n"
+ "54:" // Oddments: Load input (5, 4): Bit 1: Unset
+ "ld1 { v12.s }[0], [x20], #0x4\n"
+ "55:" // Oddments: Load input (5, 4): Bit 1: End
+ "ldr q4, [x16, #0x0]\n"
+ "ldr x20, [x15, #0x118]\n"
+ "fmla v31.4s, v3.4s, v12.4s\n"
+ "fmla v28.4s, v4.4s, v8.4s\n"
+ "fmla v29.4s, v4.4s, v10.4s\n"
+ "fmla v30.4s, v4.4s, v12.4s\n"
+ "add x20, x20, x10\n"
+ "tbz %x[n_channels], #1, 56f\n"
+ "ld1 { v9.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 57f\n"
+ "ld1 { v9.s }[2], [x20], #0x4\n"
+ "b 57f\n"
+ "56:" // Oddments: Load input (5, 5): Bit 1: Unset
+ "ld1 { v9.s }[0], [x20], #0x4\n"
+ "57:" // Oddments: Load input (5, 5): Bit 1: End
+ "fmla v31.4s, v4.4s, v9.4s\n"
+ "fmax v28.4s, v28.4s, v27.4s\n"
+ "fmax v29.4s, v29.4s, v27.4s\n"
+ "fmax v30.4s, v30.4s, v27.4s\n"
+ "fmax v31.4s, v31.4s, v27.4s\n"
+ "fmin v28.4s, v28.4s, v15.4s\n"
+ "fmin v29.4s, v29.4s, v15.4s\n"
+ "fmin v30.4s, v30.4s, v15.4s\n"
+ "fmin v31.4s, v31.4s, v15.4s\n"
+ "tbz %x[n_channels], #1, 58f\n"
+ "st1 { v28.d }[0], [x14], #0x8\n"
+ "st1 { v29.d }[0], [x13], #0x8\n"
+ "st1 { v30.d }[0], [x12], #0x8\n"
+ "st1 { v31.d }[0], [x11], #0x8\n"
+ "tbz %x[n_channels], #0, 59f\n"
+ "st1 { v28.s }[2], [x14], #0x4\n"
+ "st1 { v29.s }[2], [x13], #0x4\n"
+ "st1 { v30.s }[2], [x12], #0x4\n"
+ "st1 { v31.s }[2], [x11], #0x4\n"
+ "b 59f\n"
+ "58:" // Oddments: Store: Bit 1: Unset
+ "st1 { v28.s }[0], [x14], #0x4\n"
+ "st1 { v29.s }[0], [x13], #0x4\n"
+ "st1 { v30.s }[0], [x12], #0x4\n"
+ "st1 { v31.s }[0], [x11], #0x4\n"
+ "59:" // Oddments: Store: Bit 1: End
+ "60:" // End
+ :
+ : [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_generic_output9_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_generic_output9_mla_depthfirst.hpp
new file mode 100644
index 0000000000..8a8060770c
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_generic_output9_mla_depthfirst.hpp
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_fp32_nhwc_generic_output9_mla_depthfirst_impl(const float *const *const, float *const *const, const void *, const void *, const unsigned int, const unsigned int, const float, const float);
+
+// Strategy wrapper exposing the generic FP32 NHWC MLA depthwise kernel
+// (nine output points per invocation) to the depthfirst driver.  The "9"
+// passed to the parent constructor is the number of output points; VLType::None
+// indicates no scalable-vector (SVE) length dependence.
+class a64_fp32_nhwc_generic_output9_mla_depthfirst : public GenericDepthfirstKernelStrategy<float, float, float, float>
+{
+  // Pointer to the assembly implementation (defined in the matching generic.cpp).
+  KernelType kernel = a64_fp32_nhwc_generic_output9_mla_depthfirst_impl;
+
+  public:
+  a64_fp32_nhwc_generic_output9_mla_depthfirst(const CPUInfo *) : GenericDepthfirstKernelStrategy<float, float, float, float>(9, arm_gemm::VLType::None) {}
+
+  KernelType get_kernel() const override { return kernel; }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_generic_output9_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_generic_output9_mla_depthfirst/generic.cpp
new file mode 100644
index 0000000000..a2f577784f
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_generic_output9_mla_depthfirst/generic.cpp
@@ -0,0 +1,376 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+// Generic FP32 NHWC depthwise kernel producing nine output points per call.
+// For each of the n_points kernel positions the assembly loads one value from
+// each of nine input pointers (so inptrs holds 9 * n_points entries, consumed
+// sequentially), multiplies them by that point's per-channel weights from
+// `params` (16 bytes, i.e. four floats, consumed per point) and accumulates
+// into nine running sums.  The sums are clamped to
+// [activation_min, activation_max] and stored through the nine `outptrs`.
+// Channels are processed four at a time (128-bit vectors); the "Oddments"
+// section handles the remaining 1-3 channels with element-wise loads/stores
+// selected by the low bits of n_channels.
+void a64_fp32_nhwc_generic_output9_mla_depthfirst_impl(
+  const float *const *const inptrs,  // 9 * n_points input element pointers
+  float *const *const outptrs,  // 9 output element pointers
+  const void *params,  // packed per-point, per-channel weights
+  const void *bias,  // optional per-channel bias; may be null (see cbz %x[bias])
+  const unsigned int n_points,  // number of kernel points to accumulate
+  const unsigned int n_channels,  // number of channels to process
+  const float activation_min,
+  const float activation_max
+)
+{
+  // Pack the clamp bounds contiguously so the assembly can splat each one
+  // with a single ld1r (min at offset 0, max at offset 4).
+  const float minmax_vals[2] = { activation_min, activation_max };
+
+  __asm__ __volatile__(
+    "ld1r { v2.4s }, [%x[minmax_vals]]\n"
+    "lsr x9, %x[n_channels], #0x2\n"
+    "add x20, %x[minmax_vals], #0x4\n"
+    "ld1r { v1.4s }, [x20]\n"
+    "mov x11, #0x0\n"
+    "cbz x9, 5f\n"
+    "1:" // Channel loop
+    "movi v23.16b, #0x0\n"
+    "cbz %x[bias], 2f\n"
+    "ldr q23, [%x[bias], x11]\n"
+    "2:" // Channel loop: Load bias: Done
+    "ldr q0, [%x[params], #0x0]\n"
+    "mov x26, %x[inptrs]\n"
+    "ldp x21, x20, [x26], #0x10\n"
+    "subs x25, %x[n_points], #0x1\n"
+    "ldr q14, [x21, x11]\n"
+    "ldr q15, [x20, x11]\n"
+    "mov v24.16b, v23.16b\n"
+    "mov v25.16b, v23.16b\n"
+    "ldp x21, x20, [x26], #0x10\n"
+    "ldr q16, [x21, x11]\n"
+    "mov v26.16b, v23.16b\n"
+    "mov v27.16b, v23.16b\n"
+    "ldr q17, [x20, x11]\n"
+    "ldp x21, x20, [x26], #0x10\n"
+    "mov v28.16b, v23.16b\n"
+    "mov v29.16b, v23.16b\n"
+    "ldr q18, [x21, x11]\n"
+    "ldr q19, [x20, x11]\n"
+    "mov v30.16b, v23.16b\n"
+    "mov v31.16b, v23.16b\n"
+    "ldp x21, x20, [x26], #0x10\n"
+    "ldr q20, [x21, x11]\n"
+    "add %x[params], %x[params], #0x10\n"
+    "ldr q21, [x20, x11]\n"
+    "ldr x20, [x26], #0x8\n"
+    "ldr q22, [x20, x11]\n"
+    "ble 4f\n"
+    "3:" // Channel loop: Planar loop
+    "ldp x20, x24, [x26], #0x10\n"
+    "ldp x23, x22, [x26], #0x10\n"
+    "subs x25, x25, #0x1\n"
+    "fmla v23.4s, v14.4s, v0.4s\n"
+    "ldr q14, [x20, x11]\n"
+    "ldp x21, x20, [x26], #0x10\n"
+    "fmla v24.4s, v15.4s, v0.4s\n"
+    "fmla v25.4s, v16.4s, v0.4s\n"
+    "ldr q15, [x24, x11]\n"
+    "ldr q16, [x23, x11]\n"
+    "fmla v26.4s, v17.4s, v0.4s\n"
+    "fmla v27.4s, v18.4s, v0.4s\n"
+    "ldr q17, [x22, x11]\n"
+    "ldr q18, [x21, x11]\n"
+    "fmla v28.4s, v19.4s, v0.4s\n"
+    "fmla v29.4s, v20.4s, v0.4s\n"
+    "ldr q19, [x20, x11]\n"
+    "ldp x21, x20, [x26], #0x10\n"
+    "fmla v30.4s, v21.4s, v0.4s\n"
+    "fmla v31.4s, v22.4s, v0.4s\n"
+    "ldr q0, [%x[params], #0x0]\n"
+    "ldr q20, [x21, x11]\n"
+    "add %x[params], %x[params], #0x10\n"
+    "ldr q21, [x20, x11]\n"
+    "ldr x20, [x26], #0x8\n"
+    "ldr q22, [x20, x11]\n"
+    "bgt 3b\n"
+    "4:" // Channel loop: Planar tail
+    "fmla v23.4s, v14.4s, v0.4s\n"
+    "fmla v24.4s, v15.4s, v0.4s\n"
+    "fmax v23.4s, v23.4s, v2.4s\n"
+    "ldp x28, x27, [%x[outptrs], #0x0]\n"
+    "fmla v25.4s, v16.4s, v0.4s\n"
+    "fmla v26.4s, v17.4s, v0.4s\n"
+    "fmax v24.4s, v24.4s, v2.4s\n"
+    "ldp x26, x25, [%x[outptrs], #0x10]\n"
+    "fmla v27.4s, v18.4s, v0.4s\n"
+    "fmla v28.4s, v19.4s, v0.4s\n"
+    "fmax v25.4s, v25.4s, v2.4s\n"
+    "ldp x24, x23, [%x[outptrs], #0x20]\n"
+    "fmla v29.4s, v20.4s, v0.4s\n"
+    "fmla v30.4s, v21.4s, v0.4s\n"
+    "fmax v26.4s, v26.4s, v2.4s\n"
+    "ldp x22, x21, [%x[outptrs], #0x30]\n"
+    "fmla v31.4s, v22.4s, v0.4s\n"
+    "fmax v27.4s, v27.4s, v2.4s\n"
+    "ldr x20, [%x[outptrs], #0x40]\n"
+    "fmax v28.4s, v28.4s, v2.4s\n"
+    "fmax v29.4s, v29.4s, v2.4s\n"
+    "fmax v30.4s, v30.4s, v2.4s\n"
+    "fmax v31.4s, v31.4s, v2.4s\n"
+    "fmin v23.4s, v23.4s, v1.4s\n"
+    "fmin v24.4s, v24.4s, v1.4s\n"
+    "str q23, [x28, x11]\n"
+    "fmin v25.4s, v25.4s, v1.4s\n"
+    "fmin v26.4s, v26.4s, v1.4s\n"
+    "str q24, [x27, x11]\n"
+    "fmin v27.4s, v27.4s, v1.4s\n"
+    "fmin v28.4s, v28.4s, v1.4s\n"
+    "str q25, [x26, x11]\n"
+    "fmin v29.4s, v29.4s, v1.4s\n"
+    "fmin v30.4s, v30.4s, v1.4s\n"
+    "str q26, [x25, x11]\n"
+    "fmin v31.4s, v31.4s, v1.4s\n"
+    "str q27, [x24, x11]\n"
+    "str q28, [x23, x11]\n"
+    "str q29, [x22, x11]\n"
+    "str q30, [x21, x11]\n"
+    "str q31, [x20, x11]\n"
+    "add x11, x11, #0x10\n"
+    "cmp x11, x9, LSL #4\n"
+    "blt 1b\n"
+    "5:" // Oddments
+    "tst %x[n_channels], #0x3\n"
+    "beq 17f\n"
+    "movi v23.16b, #0x0\n"
+    "cbz %x[bias], 8f\n"
+    "add x20, %x[bias], x11\n"
+    "tbz %x[n_channels], #1, 6f\n"
+    "ld1 { v23.d }[0], [x20], #0x8\n"
+    "tbz %x[n_channels], #0, 7f\n"
+    "ld1 { v23.s }[2], [x20], #0x4\n"
+    "b 7f\n"
+    "6:" // Oddments: Load bias: Bit 1: Unset
+    "ld1 { v23.s }[0], [x20], #0x4\n"
+    "7:" // Oddments: Load bias: Bit 1: End
+    "8:" // Oddments: Load bias: Done
+    "ldr q0, [%x[params], #0x0]\n"
+    "mov x10, %x[inptrs]\n"
+    "ldp x9, x28, [x10], #0x10\n"
+    "mov v24.16b, v23.16b\n"
+    "ldp x27, x26, [x10], #0x10\n"
+    "ldp x25, x24, [x10], #0x10\n"
+    "mov v25.16b, v23.16b\n"
+    "mov v26.16b, v23.16b\n"
+    "ldp x23, x22, [x10], #0x10\n"
+    "ldr x21, [x10], #0x8\n"
+    "mov v27.16b, v23.16b\n"
+    "mov v28.16b, v23.16b\n"
+    "mov v29.16b, v23.16b\n"
+    "mov v30.16b, v23.16b\n"
+    "add x9, x9, x11\n"
+    "add x28, x28, x11\n"
+    "mov v31.16b, v23.16b\n"
+    "add x27, x27, x11\n"
+    "add x26, x26, x11\n"
+    "add x25, x25, x11\n"
+    "add x24, x24, x11\n"
+    "add x23, x23, x11\n"
+    "add x22, x22, x11\n"
+    "add x21, x21, x11\n"
+    "add %x[params], %x[params], #0x10\n"
+    "tbz %x[n_channels], #1, 9f\n"
+    "ldr d14, [x9], #0x8\n"
+    "ldr d15, [x28], #0x8\n"
+    "ldr d16, [x27], #0x8\n"
+    "ldr d17, [x26], #0x8\n"
+    "ldr d18, [x25], #0x8\n"
+    "ldr d19, [x24], #0x8\n"
+    "ldr d20, [x23], #0x8\n"
+    "ldr d21, [x22], #0x8\n"
+    "ldr d22, [x21], #0x8\n"
+    "tbz %x[n_channels], #0, 10f\n"
+    "ld1 { v14.s }[2], [x9], #0x4\n"
+    "ld1 { v15.s }[2], [x28], #0x4\n"
+    "ld1 { v16.s }[2], [x27], #0x4\n"
+    "ld1 { v17.s }[2], [x26], #0x4\n"
+    "ld1 { v18.s }[2], [x25], #0x4\n"
+    "ld1 { v19.s }[2], [x24], #0x4\n"
+    "ld1 { v20.s }[2], [x23], #0x4\n"
+    "ld1 { v21.s }[2], [x22], #0x4\n"
+    "ld1 { v22.s }[2], [x21], #0x4\n"
+    "b 10f\n"
+    "9:" // Oddments: Load: Bit 1: Unset
+    "ldr s14, [x9], #0x4\n"
+    "ldr s15, [x28], #0x4\n"
+    "ldr s16, [x27], #0x4\n"
+    "ldr s17, [x26], #0x4\n"
+    "ldr s18, [x25], #0x4\n"
+    "ldr s19, [x24], #0x4\n"
+    "ldr s20, [x23], #0x4\n"
+    "ldr s21, [x22], #0x4\n"
+    "ldr s22, [x21], #0x4\n"
+    "10:" // Oddments: Load: Bit 1: End
+    "subs x20, %x[n_points], #0x1\n"
+    "ble 14f\n"
+    "11:" // Oddments: Planar loop
+    "ldp x9, x28, [x10], #0x10\n"
+    "ldp x27, x26, [x10], #0x10\n"
+    "fmla v23.4s, v14.4s, v0.4s\n"
+    "fmla v24.4s, v15.4s, v0.4s\n"
+    "ldp x25, x24, [x10], #0x10\n"
+    "ldp x23, x22, [x10], #0x10\n"
+    "fmla v25.4s, v16.4s, v0.4s\n"
+    "fmla v26.4s, v17.4s, v0.4s\n"
+    "ldr x21, [x10], #0x8\n"
+    "fmla v27.4s, v18.4s, v0.4s\n"
+    "fmla v28.4s, v19.4s, v0.4s\n"
+    "add x9, x9, x11\n"
+    "fmla v29.4s, v20.4s, v0.4s\n"
+    "fmla v30.4s, v21.4s, v0.4s\n"
+    "add x28, x28, x11\n"
+    "add x27, x27, x11\n"
+    "fmla v31.4s, v22.4s, v0.4s\n"
+    "ldr q0, [%x[params], #0x0]\n"
+    "add x26, x26, x11\n"
+    "add x25, x25, x11\n"
+    "add x24, x24, x11\n"
+    "add x23, x23, x11\n"
+    "add x22, x22, x11\n"
+    "add x21, x21, x11\n"
+    "add %x[params], %x[params], #0x10\n"
+    "tbz %x[n_channels], #1, 12f\n"
+    "ldr d14, [x9], #0x8\n"
+    "ldr d15, [x28], #0x8\n"
+    "ldr d16, [x27], #0x8\n"
+    "ldr d17, [x26], #0x8\n"
+    "ldr d18, [x25], #0x8\n"
+    "ldr d19, [x24], #0x8\n"
+    "ldr d20, [x23], #0x8\n"
+    "ldr d21, [x22], #0x8\n"
+    "ldr d22, [x21], #0x8\n"
+    "tbz %x[n_channels], #0, 13f\n"
+    "ld1 { v14.s }[2], [x9], #0x4\n"
+    "ld1 { v15.s }[2], [x28], #0x4\n"
+    "ld1 { v16.s }[2], [x27], #0x4\n"
+    "ld1 { v17.s }[2], [x26], #0x4\n"
+    "ld1 { v18.s }[2], [x25], #0x4\n"
+    "ld1 { v19.s }[2], [x24], #0x4\n"
+    "ld1 { v20.s }[2], [x23], #0x4\n"
+    "ld1 { v21.s }[2], [x22], #0x4\n"
+    "ld1 { v22.s }[2], [x21], #0x4\n"
+    "b 13f\n"
+    "12:" // Oddments: Planar loop: Load: Bit 1: Unset
+    "ldr s14, [x9], #0x4\n"
+    "ldr s15, [x28], #0x4\n"
+    "ldr s16, [x27], #0x4\n"
+    "ldr s17, [x26], #0x4\n"
+    "ldr s18, [x25], #0x4\n"
+    "ldr s19, [x24], #0x4\n"
+    "ldr s20, [x23], #0x4\n"
+    "ldr s21, [x22], #0x4\n"
+    "ldr s22, [x21], #0x4\n"
+    "13:" // Oddments: Planar loop: Load: Bit 1: End
+    "subs x20, x20, #0x1\n"
+    "bgt 11b\n"
+    "14:" // Oddments: Planar tail
+    "fmla v23.4s, v14.4s, v0.4s\n"
+    "fmla v24.4s, v15.4s, v0.4s\n"
+    "fmax v23.4s, v23.4s, v2.4s\n"
+    "ldp x28, x27, [%x[outptrs], #0x0]\n"
+    "fmla v25.4s, v16.4s, v0.4s\n"
+    "fmla v26.4s, v17.4s, v0.4s\n"
+    "fmax v24.4s, v24.4s, v2.4s\n"
+    "ldp x26, x25, [%x[outptrs], #0x10]\n"
+    "fmla v27.4s, v18.4s, v0.4s\n"
+    "fmla v28.4s, v19.4s, v0.4s\n"
+    "fmax v25.4s, v25.4s, v2.4s\n"
+    "ldp x24, x23, [%x[outptrs], #0x20]\n"
+    "fmla v29.4s, v20.4s, v0.4s\n"
+    "fmla v30.4s, v21.4s, v0.4s\n"
+    "fmax v26.4s, v26.4s, v2.4s\n"
+    "ldp x22, x21, [%x[outptrs], #0x30]\n"
+    "fmla v31.4s, v22.4s, v0.4s\n"
+    "fmax v27.4s, v27.4s, v2.4s\n"
+    "ldr x20, [%x[outptrs], #0x40]\n"
+    "add x28, x28, x11\n"
+    "fmax v28.4s, v28.4s, v2.4s\n"
+    "fmax v29.4s, v29.4s, v2.4s\n"
+    "add x27, x27, x11\n"
+    "add x26, x26, x11\n"
+    "fmax v30.4s, v30.4s, v2.4s\n"
+    "fmax v31.4s, v31.4s, v2.4s\n"
+    "add x25, x25, x11\n"
+    "add x24, x24, x11\n"
+    "fmin v23.4s, v23.4s, v1.4s\n"
+    "fmin v24.4s, v24.4s, v1.4s\n"
+    "add x23, x23, x11\n"
+    "add x22, x22, x11\n"
+    "fmin v25.4s, v25.4s, v1.4s\n"
+    "fmin v26.4s, v26.4s, v1.4s\n"
+    "add x21, x21, x11\n"
+    "add x20, x20, x11\n"
+    "fmin v27.4s, v27.4s, v1.4s\n"
+    "fmin v28.4s, v28.4s, v1.4s\n"
+    "fmin v29.4s, v29.4s, v1.4s\n"
+    "fmin v30.4s, v30.4s, v1.4s\n"
+    "fmin v31.4s, v31.4s, v1.4s\n"
+    "tbz %x[n_channels], #1, 15f\n"
+    "st1 { v23.d }[0], [x28], #0x8\n"
+    "st1 { v24.d }[0], [x27], #0x8\n"
+    "st1 { v25.d }[0], [x26], #0x8\n"
+    "st1 { v26.d }[0], [x25], #0x8\n"
+    "st1 { v27.d }[0], [x24], #0x8\n"
+    "st1 { v28.d }[0], [x23], #0x8\n"
+    "st1 { v29.d }[0], [x22], #0x8\n"
+    "st1 { v30.d }[0], [x21], #0x8\n"
+    "st1 { v31.d }[0], [x20], #0x8\n"
+    "tbz %x[n_channels], #0, 16f\n"
+    "st1 { v23.s }[2], [x28], #0x4\n"
+    "st1 { v24.s }[2], [x27], #0x4\n"
+    "st1 { v25.s }[2], [x26], #0x4\n"
+    "st1 { v26.s }[2], [x25], #0x4\n"
+    "st1 { v27.s }[2], [x24], #0x4\n"
+    "st1 { v28.s }[2], [x23], #0x4\n"
+    "st1 { v29.s }[2], [x22], #0x4\n"
+    "st1 { v30.s }[2], [x21], #0x4\n"
+    "st1 { v31.s }[2], [x20], #0x4\n"
+    "b 16f\n"
+    "15:" // Oddments: Store: Bit 1: Unset
+    "st1 { v23.s }[0], [x28], #0x4\n"
+    "st1 { v24.s }[0], [x27], #0x4\n"
+    "st1 { v25.s }[0], [x26], #0x4\n"
+    "st1 { v26.s }[0], [x25], #0x4\n"
+    "st1 { v27.s }[0], [x24], #0x4\n"
+    "st1 { v28.s }[0], [x23], #0x4\n"
+    "st1 { v29.s }[0], [x22], #0x4\n"
+    "st1 { v30.s }[0], [x21], #0x4\n"
+    "st1 { v31.s }[0], [x20], #0x4\n"
+    "16:" // Oddments: Store: Bit 1: End
+    "17:" // End
+    : [params] "+&r" (params)
+    : [bias] "r" (bias), [inptrs] "r" (inptrs), [minmax_vals] "r" (minmax_vals), [n_channels] "r" ((uint64_t) n_channels), [n_points] "r" ((uint64_t) n_points), [outptrs] "r" (outptrs)
+    : "cc", "memory", "v0", "v1", "v2", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+  );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst.hpp
new file mode 100644
index 0000000000..6c07fa645c
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst.hpp
@@ -0,0 +1,61 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst_impl(const float *const *const, float *const *const, const void *, const unsigned int, const float, const float);
+
+// Strategy descriptor for the FP32 packed-to-NHWC depthwise kernel with a
+// channel multiplier: a 3x3 kernel window applied at stride 2.  The parent
+// constructor receives the output tile size (3x3, per the kernel's name)
+// followed by the kernel window and stride dimensions.
+struct a64_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst : DepthfirstMultiplierStrategy<float, float, float, float>
+{
+  using Parent = DepthfirstMultiplierStrategy<float, float, float, float>;
+  // Kernel window size.
+  constexpr static unsigned int kernel_rows = 3;
+  constexpr static unsigned int kernel_cols = 3;
+
+  // Step between successive input sampling positions.
+  constexpr static unsigned int stride_rows = 2;
+  constexpr static unsigned int stride_cols = 2;
+
+  a64_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst(const CPUInfo *)
+  : Parent(3, 3, kernel_rows, kernel_cols, stride_rows, stride_cols)
+  {
+  }
+
+  // Fixed-width NEON implementation: no scalable-vector length dependence.
+  arm_gemm::VLType get_vl_type() const override { return arm_gemm::VLType::None; }
+
+  // Pointer to the assembly implementation (defined in the matching generic.cpp).
+  Parent::KernelType kernel = a64_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst_impl;
+  Parent::KernelType get_kernel(void) const override { return kernel; }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst/generic.cpp
new file mode 100644
index 0000000000..9cafd23fb8
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst/generic.cpp
@@ -0,0 +1,533 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst_impl(
+ const float *const *const inptrs,
+ float *const *const outptrs,
+ const void *params,
+ const unsigned int n_output_channels,
+ const float activation_min,
+ const float activation_max
+)
+{
+ const float minmax_vals[2] = { activation_min, activation_max };
+
+ __asm__ __volatile__(
+ "ld1r { v27.4s }, [%x[clamps]]\n"
+ "ldr x21, [%x[inptrs], #0x0]\n"
+ "lsr x22, %x[channel_multiplier], #0x2\n"
+ "add x20, %x[clamps], #0x4\n"
+ "ldr q0, [x21, #0x0]\n"
+ "ldr q1, [x21, #0x10]\n"
+ "mov x21, #0x0\n"
+ "mov x14, #0x0\n"
+ "ld1r { v26.4s }, [x20]\n"
+ "ldr x20, [%x[inptrs], #0x8]\n"
+ "ldr q2, [x20, #0x0]\n"
+ "ldr q3, [x20, #0x10]\n"
+ "ldr x20, [%x[inptrs], #0x10]\n"
+ "ldr q4, [x20, #0x0]\n"
+ "ldr q5, [x20, #0x10]\n"
+ "ldr x20, [%x[inptrs], #0x18]\n"
+ "ldr q6, [x20, #0x0]\n"
+ "ldr q7, [x20, #0x10]\n"
+ "ldr x20, [%x[inptrs], #0x20]\n"
+ "ldr q8, [x20, #0x0]\n"
+ "ldr q9, [x20, #0x10]\n"
+ "ldr x20, [%x[inptrs], #0x28]\n"
+ "ldr q10, [x20, #0x0]\n"
+ "ldr q11, [x20, #0x10]\n"
+ "ldr x20, [%x[inptrs], #0x30]\n"
+ "ldr q12, [x20, #0x0]\n"
+ "ldr q13, [x20, #0x10]\n"
+ "ldp x13, x12, [%x[outptrs], #0x0]\n"
+ "ldp x11, x10, [%x[outptrs], #0x10]\n"
+ "ldp x9, x28, [%x[outptrs], #0x20]\n"
+ "ldp x27, x26, [%x[outptrs], #0x30]\n"
+ "ldr x25, [%x[outptrs], #0x40]\n"
+ "cbz x22, 3f\n"
+ "ldr q14, [%x[params], #0x0]\n"
+ "ldr q31, [%x[params], #0x10]\n"
+ "subs x22, x22, #0x1\n"
+ "mov v15.16b, v14.16b\n"
+ "ldr q30, [%x[params], #0x20]\n"
+ "ldr q29, [%x[params], #0x30]\n"
+ "mov v16.16b, v14.16b\n"
+ "mov v17.16b, v14.16b\n"
+ "mov v18.16b, v14.16b\n"
+ "mov v19.16b, v14.16b\n"
+ "add %x[params], %x[params], #0x40\n"
+ "mov v20.16b, v14.16b\n"
+ "mov v21.16b, v14.16b\n"
+ "mov v22.16b, v14.16b\n"
+ "beq 2f\n"
+ "1:" // Output channel complete vector loop
+ "fmla v14.4s, v31.4s, v0.s[0]\n"
+ "fmla v15.4s, v31.4s, v0.s[2]\n"
+ "subs x22, x22, #0x1\n"
+ "add x21, x21, #0x4\n"
+ "fmla v16.4s, v31.4s, v1.s[0]\n"
+ "fmla v17.4s, v31.4s, v4.s[0]\n"
+ "fmla v18.4s, v31.4s, v4.s[2]\n"
+ "fmla v19.4s, v31.4s, v5.s[0]\n"
+ "fmla v20.4s, v31.4s, v8.s[0]\n"
+ "fmla v21.4s, v31.4s, v8.s[2]\n"
+ "fmla v22.4s, v31.4s, v9.s[0]\n"
+ "ldr q25, [%x[params], #0x0]\n"
+ "fmla v14.4s, v30.4s, v0.s[1]\n"
+ "fmla v15.4s, v30.4s, v0.s[3]\n"
+ "fmla v16.4s, v30.4s, v1.s[1]\n"
+ "fmla v17.4s, v30.4s, v4.s[1]\n"
+ "fmla v18.4s, v30.4s, v4.s[3]\n"
+ "fmla v19.4s, v30.4s, v5.s[1]\n"
+ "fmla v20.4s, v30.4s, v8.s[1]\n"
+ "fmla v21.4s, v30.4s, v8.s[3]\n"
+ "fmla v22.4s, v30.4s, v9.s[1]\n"
+ "ldr q24, [%x[params], #0x10]\n"
+ "fmla v14.4s, v29.4s, v0.s[2]\n"
+ "fmla v15.4s, v29.4s, v1.s[0]\n"
+ "fmla v16.4s, v29.4s, v1.s[2]\n"
+ "fmla v17.4s, v29.4s, v4.s[2]\n"
+ "fmla v18.4s, v29.4s, v5.s[0]\n"
+ "fmla v19.4s, v29.4s, v5.s[2]\n"
+ "fmla v20.4s, v29.4s, v8.s[2]\n"
+ "fmla v21.4s, v29.4s, v9.s[0]\n"
+ "fmla v22.4s, v29.4s, v9.s[2]\n"
+ "ldr q23, [%x[params], #0x20]\n"
+ "fmla v14.4s, v25.4s, v2.s[0]\n"
+ "fmla v15.4s, v25.4s, v2.s[2]\n"
+ "fmla v16.4s, v25.4s, v3.s[0]\n"
+ "fmla v17.4s, v25.4s, v6.s[0]\n"
+ "fmla v18.4s, v25.4s, v6.s[2]\n"
+ "fmla v19.4s, v25.4s, v7.s[0]\n"
+ "fmla v20.4s, v25.4s, v10.s[0]\n"
+ "fmla v21.4s, v25.4s, v10.s[2]\n"
+ "fmla v22.4s, v25.4s, v11.s[0]\n"
+ "ldr q25, [%x[params], #0x30]\n"
+ "fmla v14.4s, v24.4s, v2.s[1]\n"
+ "fmla v15.4s, v24.4s, v2.s[3]\n"
+ "fmla v16.4s, v24.4s, v3.s[1]\n"
+ "fmla v17.4s, v24.4s, v6.s[1]\n"
+ "fmla v18.4s, v24.4s, v6.s[3]\n"
+ "fmla v19.4s, v24.4s, v7.s[1]\n"
+ "fmla v20.4s, v24.4s, v10.s[1]\n"
+ "fmla v21.4s, v24.4s, v10.s[3]\n"
+ "fmla v22.4s, v24.4s, v11.s[1]\n"
+ "ldr q24, [%x[params], #0x40]\n"
+ "fmla v14.4s, v23.4s, v2.s[2]\n"
+ "fmla v15.4s, v23.4s, v3.s[0]\n"
+ "fmla v16.4s, v23.4s, v3.s[2]\n"
+ "fmla v17.4s, v23.4s, v6.s[2]\n"
+ "fmla v18.4s, v23.4s, v7.s[0]\n"
+ "fmla v19.4s, v23.4s, v7.s[2]\n"
+ "fmla v20.4s, v23.4s, v10.s[2]\n"
+ "fmla v21.4s, v23.4s, v11.s[0]\n"
+ "fmla v22.4s, v23.4s, v11.s[2]\n"
+ "ldr q23, [%x[params], #0x50]\n"
+ "fmla v14.4s, v25.4s, v4.s[0]\n"
+ "fmla v15.4s, v25.4s, v4.s[2]\n"
+ "fmla v16.4s, v25.4s, v5.s[0]\n"
+ "fmla v17.4s, v25.4s, v8.s[0]\n"
+ "fmla v18.4s, v25.4s, v8.s[2]\n"
+ "fmla v19.4s, v25.4s, v9.s[0]\n"
+ "fmla v20.4s, v25.4s, v12.s[0]\n"
+ "fmla v21.4s, v25.4s, v12.s[2]\n"
+ "fmla v22.4s, v25.4s, v13.s[0]\n"
+ "ldr q31, [%x[params], #0x70]\n"
+ "fmla v14.4s, v24.4s, v4.s[1]\n"
+ "fmla v15.4s, v24.4s, v4.s[3]\n"
+ "fmla v16.4s, v24.4s, v5.s[1]\n"
+ "fmla v17.4s, v24.4s, v8.s[1]\n"
+ "fmla v18.4s, v24.4s, v8.s[3]\n"
+ "fmla v19.4s, v24.4s, v9.s[1]\n"
+ "fmla v20.4s, v24.4s, v12.s[1]\n"
+ "fmla v21.4s, v24.4s, v12.s[3]\n"
+ "fmla v22.4s, v24.4s, v13.s[1]\n"
+ "ldr q30, [%x[params], #0x80]\n"
+ "fmla v14.4s, v23.4s, v4.s[2]\n"
+ "fmla v15.4s, v23.4s, v5.s[0]\n"
+ "fmin v14.4s, v14.4s, v26.4s\n"
+ "fmla v16.4s, v23.4s, v5.s[2]\n"
+ "fmla v17.4s, v23.4s, v8.s[2]\n"
+ "fmax v14.4s, v14.4s, v27.4s\n"
+ "str q14, [x13, x14]\n"
+ "ldr q14, [%x[params], #0x60]\n"
+ "fmla v18.4s, v23.4s, v9.s[0]\n"
+ "fmla v19.4s, v23.4s, v9.s[2]\n"
+ "fmin v15.4s, v15.4s, v26.4s\n"
+ "fmla v20.4s, v23.4s, v12.s[2]\n"
+ "fmla v21.4s, v23.4s, v13.s[0]\n"
+ "fmin v16.4s, v16.4s, v26.4s\n"
+ "fmla v22.4s, v23.4s, v13.s[2]\n"
+ "ldr q29, [%x[params], #0x90]\n"
+ "fmin v17.4s, v17.4s, v26.4s\n"
+ "add %x[params], %x[params], #0xa0\n"
+ "fmin v18.4s, v18.4s, v26.4s\n"
+ "fmin v19.4s, v19.4s, v26.4s\n"
+ "fmin v20.4s, v20.4s, v26.4s\n"
+ "fmin v21.4s, v21.4s, v26.4s\n"
+ "fmin v22.4s, v22.4s, v26.4s\n"
+ "fmax v15.4s, v15.4s, v27.4s\n"
+ "str q15, [x12, x14]\n"
+ "fmax v16.4s, v16.4s, v27.4s\n"
+ "fmax v17.4s, v17.4s, v27.4s\n"
+ "str q16, [x11, x14]\n"
+ "fmax v18.4s, v18.4s, v27.4s\n"
+ "fmax v19.4s, v19.4s, v27.4s\n"
+ "str q17, [x10, x14]\n"
+ "fmax v20.4s, v20.4s, v27.4s\n"
+ "fmax v21.4s, v21.4s, v27.4s\n"
+ "str q18, [x9, x14]\n"
+ "fmax v22.4s, v22.4s, v27.4s\n"
+ "str q19, [x28, x14]\n"
+ "mov v15.16b, v14.16b\n"
+ "str q20, [x27, x14]\n"
+ "mov v16.16b, v14.16b\n"
+ "mov v17.16b, v14.16b\n"
+ "str q21, [x26, x14]\n"
+ "mov v18.16b, v14.16b\n"
+ "mov v19.16b, v14.16b\n"
+ "str q22, [x25, x14]\n"
+ "mov v20.16b, v14.16b\n"
+ "mov v21.16b, v14.16b\n"
+ "add x14, x14, #0x10\n"
+ "mov v22.16b, v14.16b\n"
+ "bgt 1b\n"
+ "2:" // Output channel complete vector tail
+ "fmla v14.4s, v31.4s, v0.s[0]\n"
+ "fmla v15.4s, v31.4s, v0.s[2]\n"
+ "fmla v16.4s, v31.4s, v1.s[0]\n"
+ "fmla v17.4s, v31.4s, v4.s[0]\n"
+ "fmla v18.4s, v31.4s, v4.s[2]\n"
+ "fmla v19.4s, v31.4s, v5.s[0]\n"
+ "fmla v20.4s, v31.4s, v8.s[0]\n"
+ "fmla v21.4s, v31.4s, v8.s[2]\n"
+ "fmla v22.4s, v31.4s, v9.s[0]\n"
+ "ldr q25, [%x[params], #0x0]\n"
+ "fmla v14.4s, v30.4s, v0.s[1]\n"
+ "fmla v15.4s, v30.4s, v0.s[3]\n"
+ "fmla v16.4s, v30.4s, v1.s[1]\n"
+ "fmla v17.4s, v30.4s, v4.s[1]\n"
+ "fmla v18.4s, v30.4s, v4.s[3]\n"
+ "fmla v19.4s, v30.4s, v5.s[1]\n"
+ "fmla v20.4s, v30.4s, v8.s[1]\n"
+ "fmla v21.4s, v30.4s, v8.s[3]\n"
+ "fmla v22.4s, v30.4s, v9.s[1]\n"
+ "ldr q24, [%x[params], #0x10]\n"
+ "fmla v14.4s, v29.4s, v0.s[2]\n"
+ "fmla v15.4s, v29.4s, v1.s[0]\n"
+ "fmla v16.4s, v29.4s, v1.s[2]\n"
+ "fmla v17.4s, v29.4s, v4.s[2]\n"
+ "fmla v18.4s, v29.4s, v5.s[0]\n"
+ "fmla v19.4s, v29.4s, v5.s[2]\n"
+ "fmla v20.4s, v29.4s, v8.s[2]\n"
+ "fmla v21.4s, v29.4s, v9.s[0]\n"
+ "fmla v22.4s, v29.4s, v9.s[2]\n"
+ "ldr q23, [%x[params], #0x20]\n"
+ "fmla v14.4s, v25.4s, v2.s[0]\n"
+ "fmla v15.4s, v25.4s, v2.s[2]\n"
+ "fmla v16.4s, v25.4s, v3.s[0]\n"
+ "fmla v17.4s, v25.4s, v6.s[0]\n"
+ "fmla v18.4s, v25.4s, v6.s[2]\n"
+ "fmla v19.4s, v25.4s, v7.s[0]\n"
+ "fmla v20.4s, v25.4s, v10.s[0]\n"
+ "fmla v21.4s, v25.4s, v10.s[2]\n"
+ "fmla v22.4s, v25.4s, v11.s[0]\n"
+ "ldr q25, [%x[params], #0x30]\n"
+ "fmla v14.4s, v24.4s, v2.s[1]\n"
+ "fmla v15.4s, v24.4s, v2.s[3]\n"
+ "fmla v16.4s, v24.4s, v3.s[1]\n"
+ "fmla v17.4s, v24.4s, v6.s[1]\n"
+ "fmla v18.4s, v24.4s, v6.s[3]\n"
+ "fmla v19.4s, v24.4s, v7.s[1]\n"
+ "fmla v20.4s, v24.4s, v10.s[1]\n"
+ "fmla v21.4s, v24.4s, v10.s[3]\n"
+ "fmla v22.4s, v24.4s, v11.s[1]\n"
+ "ldr q24, [%x[params], #0x40]\n"
+ "fmla v14.4s, v23.4s, v2.s[2]\n"
+ "fmla v15.4s, v23.4s, v3.s[0]\n"
+ "fmla v16.4s, v23.4s, v3.s[2]\n"
+ "fmla v17.4s, v23.4s, v6.s[2]\n"
+ "fmla v18.4s, v23.4s, v7.s[0]\n"
+ "fmla v19.4s, v23.4s, v7.s[2]\n"
+ "fmla v20.4s, v23.4s, v10.s[2]\n"
+ "fmla v21.4s, v23.4s, v11.s[0]\n"
+ "fmla v22.4s, v23.4s, v11.s[2]\n"
+ "ldr q23, [%x[params], #0x50]\n"
+ "add %x[params], %x[params], #0x60\n"
+ "fmla v14.4s, v25.4s, v4.s[0]\n"
+ "fmla v15.4s, v25.4s, v4.s[2]\n"
+ "fmla v16.4s, v25.4s, v5.s[0]\n"
+ "fmla v17.4s, v25.4s, v8.s[0]\n"
+ "fmla v18.4s, v25.4s, v8.s[2]\n"
+ "fmla v19.4s, v25.4s, v9.s[0]\n"
+ "fmla v20.4s, v25.4s, v12.s[0]\n"
+ "fmla v21.4s, v25.4s, v12.s[2]\n"
+ "fmla v22.4s, v25.4s, v13.s[0]\n"
+ "fmla v14.4s, v24.4s, v4.s[1]\n"
+ "fmla v15.4s, v24.4s, v4.s[3]\n"
+ "fmla v16.4s, v24.4s, v5.s[1]\n"
+ "fmla v17.4s, v24.4s, v8.s[1]\n"
+ "fmla v18.4s, v24.4s, v8.s[3]\n"
+ "fmla v19.4s, v24.4s, v9.s[1]\n"
+ "fmla v20.4s, v24.4s, v12.s[1]\n"
+ "fmla v21.4s, v24.4s, v12.s[3]\n"
+ "fmla v22.4s, v24.4s, v13.s[1]\n"
+ "fmla v14.4s, v23.4s, v4.s[2]\n"
+ "fmla v15.4s, v23.4s, v5.s[0]\n"
+ "fmin v14.4s, v14.4s, v26.4s\n"
+ "fmla v16.4s, v23.4s, v5.s[2]\n"
+ "fmla v17.4s, v23.4s, v8.s[2]\n"
+ "fmin v15.4s, v15.4s, v26.4s\n"
+ "fmla v18.4s, v23.4s, v9.s[0]\n"
+ "fmla v19.4s, v23.4s, v9.s[2]\n"
+ "fmin v16.4s, v16.4s, v26.4s\n"
+ "fmla v20.4s, v23.4s, v12.s[2]\n"
+ "fmla v21.4s, v23.4s, v13.s[0]\n"
+ "fmin v17.4s, v17.4s, v26.4s\n"
+ "fmla v22.4s, v23.4s, v13.s[2]\n"
+ "fmin v18.4s, v18.4s, v26.4s\n"
+ "fmin v19.4s, v19.4s, v26.4s\n"
+ "fmin v20.4s, v20.4s, v26.4s\n"
+ "fmin v21.4s, v21.4s, v26.4s\n"
+ "fmin v22.4s, v22.4s, v26.4s\n"
+ "fmax v14.4s, v14.4s, v27.4s\n"
+ "fmax v15.4s, v15.4s, v27.4s\n"
+ "str q14, [x13, x14]\n"
+ "fmax v16.4s, v16.4s, v27.4s\n"
+ "fmax v17.4s, v17.4s, v27.4s\n"
+ "str q15, [x12, x14]\n"
+ "fmax v18.4s, v18.4s, v27.4s\n"
+ "fmax v19.4s, v19.4s, v27.4s\n"
+ "str q16, [x11, x14]\n"
+ "fmax v20.4s, v20.4s, v27.4s\n"
+ "fmax v21.4s, v21.4s, v27.4s\n"
+ "str q17, [x10, x14]\n"
+ "fmax v22.4s, v22.4s, v27.4s\n"
+ "str q18, [x9, x14]\n"
+ "str q19, [x28, x14]\n"
+ "str q20, [x27, x14]\n"
+ "str q21, [x26, x14]\n"
+ "str q22, [x25, x14]\n"
+ "add x14, x14, #0x10\n"
+ "3:" // Output channel oddments
+ "tst %x[channel_multiplier], #0x3\n"
+ "beq 6f\n"
+ "ldr q14, [%x[params], #0x0]\n"
+ "ldr q25, [%x[params], #0x10]\n"
+ "mov v15.16b, v14.16b\n"
+ "mov v16.16b, v14.16b\n"
+ "ldr q24, [%x[params], #0x20]\n"
+ "ldr q23, [%x[params], #0x30]\n"
+ "mov v17.16b, v14.16b\n"
+ "mov v18.16b, v14.16b\n"
+ "mov v19.16b, v14.16b\n"
+ "mov v20.16b, v14.16b\n"
+ "fmla v15.4s, v25.4s, v0.s[2]\n"
+ "mov v21.16b, v14.16b\n"
+ "mov v22.16b, v14.16b\n"
+ "fmla v14.4s, v25.4s, v0.s[0]\n"
+ "fmla v16.4s, v25.4s, v1.s[0]\n"
+ "fmla v17.4s, v25.4s, v4.s[0]\n"
+ "fmla v18.4s, v25.4s, v4.s[2]\n"
+ "fmla v19.4s, v25.4s, v5.s[0]\n"
+ "fmla v20.4s, v25.4s, v8.s[0]\n"
+ "fmla v21.4s, v25.4s, v8.s[2]\n"
+ "fmla v22.4s, v25.4s, v9.s[0]\n"
+ "ldr q25, [%x[params], #0x40]\n"
+ "fmla v14.4s, v24.4s, v0.s[1]\n"
+ "fmla v15.4s, v24.4s, v0.s[3]\n"
+ "fmla v16.4s, v24.4s, v1.s[1]\n"
+ "fmla v17.4s, v24.4s, v4.s[1]\n"
+ "fmla v18.4s, v24.4s, v4.s[3]\n"
+ "fmla v19.4s, v24.4s, v5.s[1]\n"
+ "fmla v20.4s, v24.4s, v8.s[1]\n"
+ "fmla v21.4s, v24.4s, v8.s[3]\n"
+ "fmla v22.4s, v24.4s, v9.s[1]\n"
+ "ldr q24, [%x[params], #0x50]\n"
+ "fmla v14.4s, v23.4s, v0.s[2]\n"
+ "fmla v15.4s, v23.4s, v1.s[0]\n"
+ "fmla v16.4s, v23.4s, v1.s[2]\n"
+ "fmla v17.4s, v23.4s, v4.s[2]\n"
+ "fmla v18.4s, v23.4s, v5.s[0]\n"
+ "fmla v19.4s, v23.4s, v5.s[2]\n"
+ "fmla v20.4s, v23.4s, v8.s[2]\n"
+ "fmla v21.4s, v23.4s, v9.s[0]\n"
+ "fmla v22.4s, v23.4s, v9.s[2]\n"
+ "ldr q23, [%x[params], #0x60]\n"
+ "fmla v14.4s, v25.4s, v2.s[0]\n"
+ "fmla v15.4s, v25.4s, v2.s[2]\n"
+ "fmla v16.4s, v25.4s, v3.s[0]\n"
+ "fmla v17.4s, v25.4s, v6.s[0]\n"
+ "fmla v18.4s, v25.4s, v6.s[2]\n"
+ "fmla v19.4s, v25.4s, v7.s[0]\n"
+ "fmla v20.4s, v25.4s, v10.s[0]\n"
+ "fmla v21.4s, v25.4s, v10.s[2]\n"
+ "fmla v22.4s, v25.4s, v11.s[0]\n"
+ "ldr q25, [%x[params], #0x70]\n"
+ "fmla v14.4s, v24.4s, v2.s[1]\n"
+ "fmla v15.4s, v24.4s, v2.s[3]\n"
+ "fmla v16.4s, v24.4s, v3.s[1]\n"
+ "fmla v17.4s, v24.4s, v6.s[1]\n"
+ "fmla v18.4s, v24.4s, v6.s[3]\n"
+ "fmla v19.4s, v24.4s, v7.s[1]\n"
+ "fmla v20.4s, v24.4s, v10.s[1]\n"
+ "fmla v21.4s, v24.4s, v10.s[3]\n"
+ "fmla v22.4s, v24.4s, v11.s[1]\n"
+ "ldr q24, [%x[params], #0x80]\n"
+ "fmla v14.4s, v23.4s, v2.s[2]\n"
+ "fmla v15.4s, v23.4s, v3.s[0]\n"
+ "fmla v16.4s, v23.4s, v3.s[2]\n"
+ "fmla v17.4s, v23.4s, v6.s[2]\n"
+ "fmla v18.4s, v23.4s, v7.s[0]\n"
+ "fmla v19.4s, v23.4s, v7.s[2]\n"
+ "fmla v20.4s, v23.4s, v10.s[2]\n"
+ "fmla v21.4s, v23.4s, v11.s[0]\n"
+ "fmla v22.4s, v23.4s, v11.s[2]\n"
+ "ldr q23, [%x[params], #0x90]\n"
+ "add %x[params], %x[params], #0xa0\n"
+ "fmla v14.4s, v25.4s, v4.s[0]\n"
+ "fmla v15.4s, v25.4s, v4.s[2]\n"
+ "fmla v16.4s, v25.4s, v5.s[0]\n"
+ "fmla v17.4s, v25.4s, v8.s[0]\n"
+ "fmla v18.4s, v25.4s, v8.s[2]\n"
+ "fmla v19.4s, v25.4s, v9.s[0]\n"
+ "fmla v20.4s, v25.4s, v12.s[0]\n"
+ "fmla v21.4s, v25.4s, v12.s[2]\n"
+ "fmla v22.4s, v25.4s, v13.s[0]\n"
+ "fmla v14.4s, v24.4s, v4.s[1]\n"
+ "fmla v15.4s, v24.4s, v4.s[3]\n"
+ "fmla v16.4s, v24.4s, v5.s[1]\n"
+ "fmla v17.4s, v24.4s, v8.s[1]\n"
+ "fmla v18.4s, v24.4s, v8.s[3]\n"
+ "fmla v19.4s, v24.4s, v9.s[1]\n"
+ "fmla v20.4s, v24.4s, v12.s[1]\n"
+ "fmla v21.4s, v24.4s, v12.s[3]\n"
+ "fmla v22.4s, v24.4s, v13.s[1]\n"
+ "fmla v14.4s, v23.4s, v4.s[2]\n"
+ "fmla v15.4s, v23.4s, v5.s[0]\n"
+ "fmin v14.4s, v14.4s, v26.4s\n"
+ "fmla v16.4s, v23.4s, v5.s[2]\n"
+ "fmla v17.4s, v23.4s, v8.s[2]\n"
+ "fmin v15.4s, v15.4s, v26.4s\n"
+ "fmla v18.4s, v23.4s, v9.s[0]\n"
+ "fmla v19.4s, v23.4s, v9.s[2]\n"
+ "fmin v16.4s, v16.4s, v26.4s\n"
+ "fmla v20.4s, v23.4s, v12.s[2]\n"
+ "fmla v21.4s, v23.4s, v13.s[0]\n"
+ "fmin v17.4s, v17.4s, v26.4s\n"
+ "fmla v22.4s, v23.4s, v13.s[2]\n"
+ "fmin v18.4s, v18.4s, v26.4s\n"
+ "fmin v19.4s, v19.4s, v26.4s\n"
+ "fmin v20.4s, v20.4s, v26.4s\n"
+ "fmin v21.4s, v21.4s, v26.4s\n"
+ "fmin v22.4s, v22.4s, v26.4s\n"
+ "fmax v14.4s, v14.4s, v27.4s\n"
+ "fmax v15.4s, v15.4s, v27.4s\n"
+ "fmax v16.4s, v16.4s, v27.4s\n"
+ "fmax v17.4s, v17.4s, v27.4s\n"
+ "fmax v18.4s, v18.4s, v27.4s\n"
+ "fmax v19.4s, v19.4s, v27.4s\n"
+ "fmax v20.4s, v20.4s, v27.4s\n"
+ "fmax v21.4s, v21.4s, v27.4s\n"
+ "fmax v22.4s, v22.4s, v27.4s\n"
+ "tbz %x[channel_multiplier], #1, 4f\n"
+ "add x20, x13, x14\n"
+ "add x22, x12, x14\n"
+ "st1 { v14.d }[0], [x20]\n"
+ "add x21, x11, x14\n"
+ "add x20, x10, x14\n"
+ "st1 { v15.d }[0], [x22]\n"
+ "add x24, x9, x14\n"
+ "add x23, x28, x14\n"
+ "st1 { v16.d }[0], [x21]\n"
+ "add x22, x27, x14\n"
+ "add x21, x26, x14\n"
+ "st1 { v17.d }[0], [x20]\n"
+ "add x20, x25, x14\n"
+ "st1 { v18.d }[0], [x24]\n"
+ "add x14, x14, #0x8\n"
+ "st1 { v19.d }[0], [x23]\n"
+ "st1 { v20.d }[0], [x22]\n"
+ "st1 { v21.d }[0], [x21]\n"
+ "st1 { v22.d }[0], [x20]\n"
+ "tbz %x[channel_multiplier], #0, 5f\n"
+ "add x20, x13, x14\n"
+ "add x22, x12, x14\n"
+ "st1 { v14.s }[2], [x20]\n"
+ "add x21, x11, x14\n"
+ "add x20, x10, x14\n"
+ "st1 { v15.s }[2], [x22]\n"
+ "add x24, x9, x14\n"
+ "add x23, x28, x14\n"
+ "st1 { v16.s }[2], [x21]\n"
+ "add x22, x27, x14\n"
+ "add x21, x26, x14\n"
+ "st1 { v17.s }[2], [x20]\n"
+ "add x20, x25, x14\n"
+ "st1 { v18.s }[2], [x24]\n"
+ "st1 { v19.s }[2], [x23]\n"
+ "st1 { v20.s }[2], [x22]\n"
+ "st1 { v21.s }[2], [x21]\n"
+ "st1 { v22.s }[2], [x20]\n"
+ "b 5f\n"
+ "4:" // Output channel oddments: Store: Bit 1: Unset
+ "add x20, x13, x14\n"
+ "add x22, x12, x14\n"
+ "st1 { v14.s }[0], [x20]\n"
+ "add x21, x11, x14\n"
+ "add x20, x10, x14\n"
+ "st1 { v15.s }[0], [x22]\n"
+ "add x24, x9, x14\n"
+ "add x23, x28, x14\n"
+ "st1 { v16.s }[0], [x21]\n"
+ "add x22, x27, x14\n"
+ "add x21, x26, x14\n"
+ "st1 { v17.s }[0], [x20]\n"
+ "add x20, x25, x14\n"
+ "st1 { v18.s }[0], [x24]\n"
+ "st1 { v19.s }[0], [x23]\n"
+ "st1 { v20.s }[0], [x22]\n"
+ "st1 { v21.s }[0], [x21]\n"
+ "st1 { v22.s }[0], [x20]\n"
+ "5:" // Output channel oddments: Store: Bit 1: End
+ "6:" // End
+ : [params] "+&r" (params)
+ : [channel_multiplier] "r" (n_output_channels), [clamps] "r" (minmax_vals), [inptrs] "r" (inptrs), [outptrs] "r" (outptrs)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst.hpp
new file mode 100644
index 0000000000..9f514c78e7
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst.hpp
@@ -0,0 +1,61 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst_impl(const float *const *const, float *const *const, const void *, const unsigned int, const float, const float);
+
+// Strategy descriptor for the AArch64 FP32 depthwise kernel with a channel
+// multiplier: 5x5 kernel, stride 1, packed input to NHWC output, producing a
+// 2x4 output tile per invocation (see the Parent constructor arguments below).
+struct a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst : DepthfirstMultiplierStrategy<float, float, float, float>
+{
+  using Parent = DepthfirstMultiplierStrategy<float, float, float, float>;
+  // Convolution window dimensions (5x5).
+  constexpr static unsigned int kernel_rows = 5;
+  constexpr static unsigned int kernel_cols = 5;
+
+  // Unit stride in both spatial dimensions.
+  constexpr static unsigned int stride_rows = 1;
+  constexpr static unsigned int stride_cols = 1;
+
+  // CPUInfo is unused here; this descriptor is fixed for all AArch64 targets.
+  // Parent(2, 4, ...) declares the 2-row by 4-column output tile.
+  a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst(const CPUInfo *)
+  : Parent(2, 4, kernel_rows, kernel_cols, stride_rows, stride_cols)
+  {
+  }
+
+  // Fixed-width NEON implementation; no scalable-vector (SVE) variant.
+  arm_gemm::VLType get_vl_type() const override { return arm_gemm::VLType::None; }
+
+  // Hand-written assembly implementation declared above in this header.
+  Parent::KernelType kernel = a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst_impl;
+  Parent::KernelType get_kernel(void) const override { return kernel; }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst/generic.cpp
new file mode 100644
index 0000000000..c9bb1f41da
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst/generic.cpp
@@ -0,0 +1,917 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst_impl(
+ const float *const *const inptrs,
+ float *const *const outptrs,
+ const void *params,
+ const unsigned int n_output_channels,
+ const float activation_min,
+ const float activation_max
+)
+{
+ const float minmax_vals[2] = { activation_min, activation_max };
+
+ __asm__ __volatile__(
+ "ld1r { v26.4s }, [%x[clamps]]\n"
+ "ldr x21, [%x[inptrs], #0x0]\n"
+ "lsr x22, %x[channel_multiplier], #0x2\n"
+ "add x20, %x[clamps], #0x4\n"
+ "ldr q0, [x21, #0x0]\n"
+ "ldr q1, [x21, #0x10]\n"
+ "mov x21, #0x0\n"
+ "mov x13, #0x0\n"
+ "ld1r { v25.4s }, [x20]\n"
+ "ldr x20, [%x[inptrs], #0x8]\n"
+ "ldr q2, [x20, #0x0]\n"
+ "ldr q3, [x20, #0x10]\n"
+ "ldr x20, [%x[inptrs], #0x10]\n"
+ "ldr q4, [x20, #0x0]\n"
+ "ldr q5, [x20, #0x10]\n"
+ "ldr x20, [%x[inptrs], #0x18]\n"
+ "ldr q6, [x20, #0x0]\n"
+ "ldr q7, [x20, #0x10]\n"
+ "ldr x20, [%x[inptrs], #0x20]\n"
+ "ldr q8, [x20, #0x0]\n"
+ "ldr q9, [x20, #0x10]\n"
+ "ldr x20, [%x[inptrs], #0x28]\n"
+ "ldr q10, [x20, #0x0]\n"
+ "ldr q11, [x20, #0x10]\n"
+ "ldp x12, x11, [%x[outptrs], #0x0]\n"
+ "ldp x10, x9, [%x[outptrs], #0x10]\n"
+ "ldp x28, x27, [%x[outptrs], #0x20]\n"
+ "ldp x26, x25, [%x[outptrs], #0x30]\n"
+ "cbz x22, 3f\n"
+ "ldr q12, [%x[params], #0x0]\n"
+ "ldr q31, [%x[params], #0x10]\n"
+ "subs x22, x22, #0x1\n"
+ "mov v13.16b, v12.16b\n"
+ "ldr q30, [%x[params], #0x20]\n"
+ "ldr q29, [%x[params], #0x30]\n"
+ "mov v14.16b, v12.16b\n"
+ "mov v15.16b, v12.16b\n"
+ "ldr q28, [%x[params], #0x40]\n"
+ "ldr q27, [%x[params], #0x50]\n"
+ "mov v16.16b, v12.16b\n"
+ "mov v17.16b, v12.16b\n"
+ "mov v18.16b, v12.16b\n"
+ "mov v19.16b, v12.16b\n"
+ "add %x[params], %x[params], #0x60\n"
+ "beq 2f\n"
+ "1:" // Output channel complete vector loop
+ "fmla v12.4s, v31.4s, v0.s[0]\n"
+ "fmla v13.4s, v31.4s, v0.s[1]\n"
+ "subs x22, x22, #0x1\n"
+ "add x21, x21, #0x4\n"
+ "fmla v14.4s, v31.4s, v0.s[2]\n"
+ "fmla v15.4s, v31.4s, v0.s[3]\n"
+ "fmla v16.4s, v31.4s, v2.s[0]\n"
+ "fmla v17.4s, v31.4s, v2.s[1]\n"
+ "fmla v18.4s, v31.4s, v2.s[2]\n"
+ "fmla v19.4s, v31.4s, v2.s[3]\n"
+ "ldr q24, [%x[params], #0x0]\n"
+ "fmla v12.4s, v30.4s, v0.s[1]\n"
+ "fmla v13.4s, v30.4s, v0.s[2]\n"
+ "fmla v14.4s, v30.4s, v0.s[3]\n"
+ "fmla v15.4s, v30.4s, v1.s[0]\n"
+ "fmla v16.4s, v30.4s, v2.s[1]\n"
+ "fmla v17.4s, v30.4s, v2.s[2]\n"
+ "fmla v18.4s, v30.4s, v2.s[3]\n"
+ "fmla v19.4s, v30.4s, v3.s[0]\n"
+ "ldr q23, [%x[params], #0x10]\n"
+ "fmla v12.4s, v29.4s, v0.s[2]\n"
+ "fmla v13.4s, v29.4s, v0.s[3]\n"
+ "fmla v14.4s, v29.4s, v1.s[0]\n"
+ "fmla v15.4s, v29.4s, v1.s[1]\n"
+ "fmla v16.4s, v29.4s, v2.s[2]\n"
+ "fmla v17.4s, v29.4s, v2.s[3]\n"
+ "fmla v18.4s, v29.4s, v3.s[0]\n"
+ "fmla v19.4s, v29.4s, v3.s[1]\n"
+ "ldr q22, [%x[params], #0x20]\n"
+ "fmla v12.4s, v28.4s, v0.s[3]\n"
+ "fmla v13.4s, v28.4s, v1.s[0]\n"
+ "fmla v14.4s, v28.4s, v1.s[1]\n"
+ "fmla v15.4s, v28.4s, v1.s[2]\n"
+ "fmla v16.4s, v28.4s, v2.s[3]\n"
+ "fmla v17.4s, v28.4s, v3.s[0]\n"
+ "fmla v18.4s, v28.4s, v3.s[1]\n"
+ "fmla v19.4s, v28.4s, v3.s[2]\n"
+ "ldr q21, [%x[params], #0x30]\n"
+ "fmla v12.4s, v27.4s, v1.s[0]\n"
+ "fmla v13.4s, v27.4s, v1.s[1]\n"
+ "fmla v14.4s, v27.4s, v1.s[2]\n"
+ "fmla v15.4s, v27.4s, v1.s[3]\n"
+ "fmla v16.4s, v27.4s, v3.s[0]\n"
+ "fmla v17.4s, v27.4s, v3.s[1]\n"
+ "fmla v18.4s, v27.4s, v3.s[2]\n"
+ "fmla v19.4s, v27.4s, v3.s[3]\n"
+ "ldr q20, [%x[params], #0x40]\n"
+ "fmla v12.4s, v24.4s, v2.s[0]\n"
+ "fmla v13.4s, v24.4s, v2.s[1]\n"
+ "fmla v14.4s, v24.4s, v2.s[2]\n"
+ "fmla v15.4s, v24.4s, v2.s[3]\n"
+ "fmla v16.4s, v24.4s, v4.s[0]\n"
+ "fmla v17.4s, v24.4s, v4.s[1]\n"
+ "fmla v18.4s, v24.4s, v4.s[2]\n"
+ "fmla v19.4s, v24.4s, v4.s[3]\n"
+ "ldr q24, [%x[params], #0x50]\n"
+ "fmla v12.4s, v23.4s, v2.s[1]\n"
+ "fmla v13.4s, v23.4s, v2.s[2]\n"
+ "fmla v14.4s, v23.4s, v2.s[3]\n"
+ "fmla v15.4s, v23.4s, v3.s[0]\n"
+ "fmla v16.4s, v23.4s, v4.s[1]\n"
+ "fmla v17.4s, v23.4s, v4.s[2]\n"
+ "fmla v18.4s, v23.4s, v4.s[3]\n"
+ "fmla v19.4s, v23.4s, v5.s[0]\n"
+ "ldr q23, [%x[params], #0x60]\n"
+ "fmla v12.4s, v22.4s, v2.s[2]\n"
+ "fmla v13.4s, v22.4s, v2.s[3]\n"
+ "fmla v14.4s, v22.4s, v3.s[0]\n"
+ "fmla v15.4s, v22.4s, v3.s[1]\n"
+ "fmla v16.4s, v22.4s, v4.s[2]\n"
+ "fmla v17.4s, v22.4s, v4.s[3]\n"
+ "fmla v18.4s, v22.4s, v5.s[0]\n"
+ "fmla v19.4s, v22.4s, v5.s[1]\n"
+ "ldr q22, [%x[params], #0x70]\n"
+ "fmla v12.4s, v21.4s, v2.s[3]\n"
+ "fmla v13.4s, v21.4s, v3.s[0]\n"
+ "fmla v14.4s, v21.4s, v3.s[1]\n"
+ "fmla v15.4s, v21.4s, v3.s[2]\n"
+ "fmla v16.4s, v21.4s, v4.s[3]\n"
+ "fmla v17.4s, v21.4s, v5.s[0]\n"
+ "fmla v18.4s, v21.4s, v5.s[1]\n"
+ "fmla v19.4s, v21.4s, v5.s[2]\n"
+ "ldr q21, [%x[params], #0x80]\n"
+ "fmla v12.4s, v20.4s, v3.s[0]\n"
+ "fmla v13.4s, v20.4s, v3.s[1]\n"
+ "fmla v14.4s, v20.4s, v3.s[2]\n"
+ "fmla v15.4s, v20.4s, v3.s[3]\n"
+ "fmla v16.4s, v20.4s, v5.s[0]\n"
+ "fmla v17.4s, v20.4s, v5.s[1]\n"
+ "fmla v18.4s, v20.4s, v5.s[2]\n"
+ "fmla v19.4s, v20.4s, v5.s[3]\n"
+ "ldr q20, [%x[params], #0x90]\n"
+ "fmla v12.4s, v24.4s, v4.s[0]\n"
+ "fmla v13.4s, v24.4s, v4.s[1]\n"
+ "fmla v14.4s, v24.4s, v4.s[2]\n"
+ "fmla v15.4s, v24.4s, v4.s[3]\n"
+ "fmla v16.4s, v24.4s, v6.s[0]\n"
+ "fmla v17.4s, v24.4s, v6.s[1]\n"
+ "fmla v18.4s, v24.4s, v6.s[2]\n"
+ "fmla v19.4s, v24.4s, v6.s[3]\n"
+ "ldr q24, [%x[params], #0xa0]\n"
+ "fmla v12.4s, v23.4s, v4.s[1]\n"
+ "fmla v13.4s, v23.4s, v4.s[2]\n"
+ "fmla v14.4s, v23.4s, v4.s[3]\n"
+ "fmla v15.4s, v23.4s, v5.s[0]\n"
+ "fmla v16.4s, v23.4s, v6.s[1]\n"
+ "fmla v17.4s, v23.4s, v6.s[2]\n"
+ "fmla v18.4s, v23.4s, v6.s[3]\n"
+ "fmla v19.4s, v23.4s, v7.s[0]\n"
+ "ldr q23, [%x[params], #0xb0]\n"
+ "fmla v12.4s, v22.4s, v4.s[2]\n"
+ "fmla v13.4s, v22.4s, v4.s[3]\n"
+ "fmla v14.4s, v22.4s, v5.s[0]\n"
+ "fmla v15.4s, v22.4s, v5.s[1]\n"
+ "fmla v16.4s, v22.4s, v6.s[2]\n"
+ "fmla v17.4s, v22.4s, v6.s[3]\n"
+ "fmla v18.4s, v22.4s, v7.s[0]\n"
+ "fmla v19.4s, v22.4s, v7.s[1]\n"
+ "ldr q22, [%x[params], #0xc0]\n"
+ "fmla v12.4s, v21.4s, v4.s[3]\n"
+ "fmla v13.4s, v21.4s, v5.s[0]\n"
+ "fmla v14.4s, v21.4s, v5.s[1]\n"
+ "fmla v15.4s, v21.4s, v5.s[2]\n"
+ "fmla v16.4s, v21.4s, v6.s[3]\n"
+ "fmla v17.4s, v21.4s, v7.s[0]\n"
+ "fmla v18.4s, v21.4s, v7.s[1]\n"
+ "fmla v19.4s, v21.4s, v7.s[2]\n"
+ "ldr q21, [%x[params], #0xd0]\n"
+ "fmla v12.4s, v20.4s, v5.s[0]\n"
+ "fmla v13.4s, v20.4s, v5.s[1]\n"
+ "fmla v14.4s, v20.4s, v5.s[2]\n"
+ "fmla v15.4s, v20.4s, v5.s[3]\n"
+ "fmla v16.4s, v20.4s, v7.s[0]\n"
+ "fmla v17.4s, v20.4s, v7.s[1]\n"
+ "fmla v18.4s, v20.4s, v7.s[2]\n"
+ "fmla v19.4s, v20.4s, v7.s[3]\n"
+ "ldr q20, [%x[params], #0xe0]\n"
+ "fmla v12.4s, v24.4s, v6.s[0]\n"
+ "fmla v13.4s, v24.4s, v6.s[1]\n"
+ "fmla v14.4s, v24.4s, v6.s[2]\n"
+ "fmla v15.4s, v24.4s, v6.s[3]\n"
+ "fmla v16.4s, v24.4s, v8.s[0]\n"
+ "fmla v17.4s, v24.4s, v8.s[1]\n"
+ "fmla v18.4s, v24.4s, v8.s[2]\n"
+ "fmla v19.4s, v24.4s, v8.s[3]\n"
+ "ldr q24, [%x[params], #0xf0]\n"
+ "fmla v12.4s, v23.4s, v6.s[1]\n"
+ "fmla v13.4s, v23.4s, v6.s[2]\n"
+ "fmla v14.4s, v23.4s, v6.s[3]\n"
+ "fmla v15.4s, v23.4s, v7.s[0]\n"
+ "fmla v16.4s, v23.4s, v8.s[1]\n"
+ "fmla v17.4s, v23.4s, v8.s[2]\n"
+ "fmla v18.4s, v23.4s, v8.s[3]\n"
+ "fmla v19.4s, v23.4s, v9.s[0]\n"
+ "ldr q23, [%x[params], #0x100]\n"
+ "fmla v12.4s, v22.4s, v6.s[2]\n"
+ "fmla v13.4s, v22.4s, v6.s[3]\n"
+ "fmla v14.4s, v22.4s, v7.s[0]\n"
+ "fmla v15.4s, v22.4s, v7.s[1]\n"
+ "fmla v16.4s, v22.4s, v8.s[2]\n"
+ "fmla v17.4s, v22.4s, v8.s[3]\n"
+ "fmla v18.4s, v22.4s, v9.s[0]\n"
+ "fmla v19.4s, v22.4s, v9.s[1]\n"
+ "ldr q22, [%x[params], #0x110]\n"
+ "fmla v12.4s, v21.4s, v6.s[3]\n"
+ "fmla v13.4s, v21.4s, v7.s[0]\n"
+ "fmla v14.4s, v21.4s, v7.s[1]\n"
+ "fmla v15.4s, v21.4s, v7.s[2]\n"
+ "fmla v16.4s, v21.4s, v8.s[3]\n"
+ "fmla v17.4s, v21.4s, v9.s[0]\n"
+ "fmla v18.4s, v21.4s, v9.s[1]\n"
+ "fmla v19.4s, v21.4s, v9.s[2]\n"
+ "ldr q21, [%x[params], #0x120]\n"
+ "fmla v12.4s, v20.4s, v7.s[0]\n"
+ "fmla v13.4s, v20.4s, v7.s[1]\n"
+ "fmla v14.4s, v20.4s, v7.s[2]\n"
+ "fmla v15.4s, v20.4s, v7.s[3]\n"
+ "fmla v16.4s, v20.4s, v9.s[0]\n"
+ "fmla v17.4s, v20.4s, v9.s[1]\n"
+ "fmla v18.4s, v20.4s, v9.s[2]\n"
+ "fmla v19.4s, v20.4s, v9.s[3]\n"
+ "ldr q20, [%x[params], #0x130]\n"
+ "fmla v12.4s, v24.4s, v8.s[0]\n"
+ "fmla v13.4s, v24.4s, v8.s[1]\n"
+ "fmla v14.4s, v24.4s, v8.s[2]\n"
+ "fmla v15.4s, v24.4s, v8.s[3]\n"
+ "fmla v16.4s, v24.4s, v10.s[0]\n"
+ "fmla v17.4s, v24.4s, v10.s[1]\n"
+ "fmla v18.4s, v24.4s, v10.s[2]\n"
+ "fmla v19.4s, v24.4s, v10.s[3]\n"
+ "ldr q31, [%x[params], #0x150]\n"
+ "fmla v12.4s, v23.4s, v8.s[1]\n"
+ "fmla v13.4s, v23.4s, v8.s[2]\n"
+ "fmla v14.4s, v23.4s, v8.s[3]\n"
+ "fmla v15.4s, v23.4s, v9.s[0]\n"
+ "fmla v16.4s, v23.4s, v10.s[1]\n"
+ "fmla v17.4s, v23.4s, v10.s[2]\n"
+ "fmla v18.4s, v23.4s, v10.s[3]\n"
+ "fmla v19.4s, v23.4s, v11.s[0]\n"
+ "ldr q30, [%x[params], #0x160]\n"
+ "fmla v12.4s, v22.4s, v8.s[2]\n"
+ "fmla v13.4s, v22.4s, v8.s[3]\n"
+ "fmla v14.4s, v22.4s, v9.s[0]\n"
+ "fmla v15.4s, v22.4s, v9.s[1]\n"
+ "fmla v16.4s, v22.4s, v10.s[2]\n"
+ "fmla v17.4s, v22.4s, v10.s[3]\n"
+ "fmla v18.4s, v22.4s, v11.s[0]\n"
+ "fmla v19.4s, v22.4s, v11.s[1]\n"
+ "ldr q29, [%x[params], #0x170]\n"
+ "fmla v12.4s, v21.4s, v8.s[3]\n"
+ "fmla v13.4s, v21.4s, v9.s[0]\n"
+ "fmla v14.4s, v21.4s, v9.s[1]\n"
+ "fmla v15.4s, v21.4s, v9.s[2]\n"
+ "fmla v16.4s, v21.4s, v10.s[3]\n"
+ "fmla v17.4s, v21.4s, v11.s[0]\n"
+ "fmla v18.4s, v21.4s, v11.s[1]\n"
+ "fmla v19.4s, v21.4s, v11.s[2]\n"
+ "ldr q28, [%x[params], #0x180]\n"
+ "fmla v12.4s, v20.4s, v9.s[0]\n"
+ "fmla v13.4s, v20.4s, v9.s[1]\n"
+ "fmin v12.4s, v12.4s, v25.4s\n"
+ "fmla v14.4s, v20.4s, v9.s[2]\n"
+ "fmla v15.4s, v20.4s, v9.s[3]\n"
+ "fmax v12.4s, v12.4s, v26.4s\n"
+ "str q12, [x12, x13]\n"
+ "ldr q12, [%x[params], #0x140]\n"
+ "fmla v16.4s, v20.4s, v11.s[0]\n"
+ "fmla v17.4s, v20.4s, v11.s[1]\n"
+ "fmin v13.4s, v13.4s, v25.4s\n"
+ "fmla v18.4s, v20.4s, v11.s[2]\n"
+ "fmla v19.4s, v20.4s, v11.s[3]\n"
+ "ldr q27, [%x[params], #0x190]\n"
+ "fmin v14.4s, v14.4s, v25.4s\n"
+ "fmin v15.4s, v15.4s, v25.4s\n"
+ "fmin v16.4s, v16.4s, v25.4s\n"
+ "add %x[params], %x[params], #0x1a0\n"
+ "fmin v17.4s, v17.4s, v25.4s\n"
+ "fmin v18.4s, v18.4s, v25.4s\n"
+ "fmin v19.4s, v19.4s, v25.4s\n"
+ "fmax v13.4s, v13.4s, v26.4s\n"
+ "str q13, [x11, x13]\n"
+ "fmax v14.4s, v14.4s, v26.4s\n"
+ "fmax v15.4s, v15.4s, v26.4s\n"
+ "str q14, [x10, x13]\n"
+ "fmax v16.4s, v16.4s, v26.4s\n"
+ "fmax v17.4s, v17.4s, v26.4s\n"
+ "str q15, [x9, x13]\n"
+ "fmax v18.4s, v18.4s, v26.4s\n"
+ "fmax v19.4s, v19.4s, v26.4s\n"
+ "str q16, [x28, x13]\n"
+ "str q17, [x27, x13]\n"
+ "mov v13.16b, v12.16b\n"
+ "mov v14.16b, v12.16b\n"
+ "str q18, [x26, x13]\n"
+ "mov v15.16b, v12.16b\n"
+ "mov v16.16b, v12.16b\n"
+ "str q19, [x25, x13]\n"
+ "mov v17.16b, v12.16b\n"
+ "mov v18.16b, v12.16b\n"
+ "add x13, x13, #0x10\n"
+ "mov v19.16b, v12.16b\n"
+ "bgt 1b\n"
+ "2:" // Output channel complete vector tail
+ "fmla v12.4s, v31.4s, v0.s[0]\n"
+ "fmla v13.4s, v31.4s, v0.s[1]\n"
+ "fmla v14.4s, v31.4s, v0.s[2]\n"
+ "fmla v15.4s, v31.4s, v0.s[3]\n"
+ "fmla v16.4s, v31.4s, v2.s[0]\n"
+ "fmla v17.4s, v31.4s, v2.s[1]\n"
+ "fmla v18.4s, v31.4s, v2.s[2]\n"
+ "fmla v19.4s, v31.4s, v2.s[3]\n"
+ "ldr q24, [%x[params], #0x0]\n"
+ "fmla v12.4s, v30.4s, v0.s[1]\n"
+ "fmla v13.4s, v30.4s, v0.s[2]\n"
+ "fmla v14.4s, v30.4s, v0.s[3]\n"
+ "fmla v15.4s, v30.4s, v1.s[0]\n"
+ "fmla v16.4s, v30.4s, v2.s[1]\n"
+ "fmla v17.4s, v30.4s, v2.s[2]\n"
+ "fmla v18.4s, v30.4s, v2.s[3]\n"
+ "fmla v19.4s, v30.4s, v3.s[0]\n"
+ "ldr q23, [%x[params], #0x10]\n"
+ "fmla v12.4s, v29.4s, v0.s[2]\n"
+ "fmla v13.4s, v29.4s, v0.s[3]\n"
+ "fmla v14.4s, v29.4s, v1.s[0]\n"
+ "fmla v15.4s, v29.4s, v1.s[1]\n"
+ "fmla v16.4s, v29.4s, v2.s[2]\n"
+ "fmla v17.4s, v29.4s, v2.s[3]\n"
+ "fmla v18.4s, v29.4s, v3.s[0]\n"
+ "fmla v19.4s, v29.4s, v3.s[1]\n"
+ "ldr q22, [%x[params], #0x20]\n"
+ "fmla v12.4s, v28.4s, v0.s[3]\n"
+ "fmla v13.4s, v28.4s, v1.s[0]\n"
+ "fmla v14.4s, v28.4s, v1.s[1]\n"
+ "fmla v15.4s, v28.4s, v1.s[2]\n"
+ "fmla v16.4s, v28.4s, v2.s[3]\n"
+ "fmla v17.4s, v28.4s, v3.s[0]\n"
+ "fmla v18.4s, v28.4s, v3.s[1]\n"
+ "fmla v19.4s, v28.4s, v3.s[2]\n"
+ "ldr q21, [%x[params], #0x30]\n"
+ "fmla v12.4s, v27.4s, v1.s[0]\n"
+ "fmla v13.4s, v27.4s, v1.s[1]\n"
+ "fmla v14.4s, v27.4s, v1.s[2]\n"
+ "fmla v15.4s, v27.4s, v1.s[3]\n"
+ "fmla v16.4s, v27.4s, v3.s[0]\n"
+ "fmla v17.4s, v27.4s, v3.s[1]\n"
+ "fmla v18.4s, v27.4s, v3.s[2]\n"
+ "fmla v19.4s, v27.4s, v3.s[3]\n"
+ "ldr q20, [%x[params], #0x40]\n"
+ "fmla v12.4s, v24.4s, v2.s[0]\n"
+ "fmla v13.4s, v24.4s, v2.s[1]\n"
+ "fmla v14.4s, v24.4s, v2.s[2]\n"
+ "fmla v15.4s, v24.4s, v2.s[3]\n"
+ "fmla v16.4s, v24.4s, v4.s[0]\n"
+ "fmla v17.4s, v24.4s, v4.s[1]\n"
+ "fmla v18.4s, v24.4s, v4.s[2]\n"
+ "fmla v19.4s, v24.4s, v4.s[3]\n"
+ "ldr q24, [%x[params], #0x50]\n"
+ "fmla v12.4s, v23.4s, v2.s[1]\n"
+ "fmla v13.4s, v23.4s, v2.s[2]\n"
+ "fmla v14.4s, v23.4s, v2.s[3]\n"
+ "fmla v15.4s, v23.4s, v3.s[0]\n"
+ "fmla v16.4s, v23.4s, v4.s[1]\n"
+ "fmla v17.4s, v23.4s, v4.s[2]\n"
+ "fmla v18.4s, v23.4s, v4.s[3]\n"
+ "fmla v19.4s, v23.4s, v5.s[0]\n"
+ "ldr q23, [%x[params], #0x60]\n"
+ "fmla v12.4s, v22.4s, v2.s[2]\n"
+ "fmla v13.4s, v22.4s, v2.s[3]\n"
+ "fmla v14.4s, v22.4s, v3.s[0]\n"
+ "fmla v15.4s, v22.4s, v3.s[1]\n"
+ "fmla v16.4s, v22.4s, v4.s[2]\n"
+ "fmla v17.4s, v22.4s, v4.s[3]\n"
+ "fmla v18.4s, v22.4s, v5.s[0]\n"
+ "fmla v19.4s, v22.4s, v5.s[1]\n"
+ "ldr q22, [%x[params], #0x70]\n"
+ "fmla v12.4s, v21.4s, v2.s[3]\n"
+ "fmla v13.4s, v21.4s, v3.s[0]\n"
+ "fmla v14.4s, v21.4s, v3.s[1]\n"
+ "fmla v15.4s, v21.4s, v3.s[2]\n"
+ "fmla v16.4s, v21.4s, v4.s[3]\n"
+ "fmla v17.4s, v21.4s, v5.s[0]\n"
+ "fmla v18.4s, v21.4s, v5.s[1]\n"
+ "fmla v19.4s, v21.4s, v5.s[2]\n"
+ "ldr q21, [%x[params], #0x80]\n"
+ "fmla v12.4s, v20.4s, v3.s[0]\n"
+ "fmla v13.4s, v20.4s, v3.s[1]\n"
+ "fmla v14.4s, v20.4s, v3.s[2]\n"
+ "fmla v15.4s, v20.4s, v3.s[3]\n"
+ "fmla v16.4s, v20.4s, v5.s[0]\n"
+ "fmla v17.4s, v20.4s, v5.s[1]\n"
+ "fmla v18.4s, v20.4s, v5.s[2]\n"
+ "fmla v19.4s, v20.4s, v5.s[3]\n"
+ "ldr q20, [%x[params], #0x90]\n"
+ "fmla v12.4s, v24.4s, v4.s[0]\n"
+ "fmla v13.4s, v24.4s, v4.s[1]\n"
+ "fmla v14.4s, v24.4s, v4.s[2]\n"
+ "fmla v15.4s, v24.4s, v4.s[3]\n"
+ "fmla v16.4s, v24.4s, v6.s[0]\n"
+ "fmla v17.4s, v24.4s, v6.s[1]\n"
+ "fmla v18.4s, v24.4s, v6.s[2]\n"
+ "fmla v19.4s, v24.4s, v6.s[3]\n"
+ "ldr q24, [%x[params], #0xa0]\n"
+ "fmla v12.4s, v23.4s, v4.s[1]\n"
+ "fmla v13.4s, v23.4s, v4.s[2]\n"
+ "fmla v14.4s, v23.4s, v4.s[3]\n"
+ "fmla v15.4s, v23.4s, v5.s[0]\n"
+ "fmla v16.4s, v23.4s, v6.s[1]\n"
+ "fmla v17.4s, v23.4s, v6.s[2]\n"
+ "fmla v18.4s, v23.4s, v6.s[3]\n"
+ "fmla v19.4s, v23.4s, v7.s[0]\n"
+ "ldr q23, [%x[params], #0xb0]\n"
+ "fmla v12.4s, v22.4s, v4.s[2]\n"
+ "fmla v13.4s, v22.4s, v4.s[3]\n"
+ "fmla v14.4s, v22.4s, v5.s[0]\n"
+ "fmla v15.4s, v22.4s, v5.s[1]\n"
+ "fmla v16.4s, v22.4s, v6.s[2]\n"
+ "fmla v17.4s, v22.4s, v6.s[3]\n"
+ "fmla v18.4s, v22.4s, v7.s[0]\n"
+ "fmla v19.4s, v22.4s, v7.s[1]\n"
+ "ldr q22, [%x[params], #0xc0]\n"
+ "fmla v12.4s, v21.4s, v4.s[3]\n"
+ "fmla v13.4s, v21.4s, v5.s[0]\n"
+ "fmla v14.4s, v21.4s, v5.s[1]\n"
+ "fmla v15.4s, v21.4s, v5.s[2]\n"
+ "fmla v16.4s, v21.4s, v6.s[3]\n"
+ "fmla v17.4s, v21.4s, v7.s[0]\n"
+ "fmla v18.4s, v21.4s, v7.s[1]\n"
+ "fmla v19.4s, v21.4s, v7.s[2]\n"
+ "ldr q21, [%x[params], #0xd0]\n"
+ "fmla v12.4s, v20.4s, v5.s[0]\n"
+ "fmla v13.4s, v20.4s, v5.s[1]\n"
+ "fmla v14.4s, v20.4s, v5.s[2]\n"
+ "fmla v15.4s, v20.4s, v5.s[3]\n"
+ "fmla v16.4s, v20.4s, v7.s[0]\n"
+ "fmla v17.4s, v20.4s, v7.s[1]\n"
+ "fmla v18.4s, v20.4s, v7.s[2]\n"
+ "fmla v19.4s, v20.4s, v7.s[3]\n"
+ "ldr q20, [%x[params], #0xe0]\n"
+ "fmla v12.4s, v24.4s, v6.s[0]\n"
+ "fmla v13.4s, v24.4s, v6.s[1]\n"
+ "fmla v14.4s, v24.4s, v6.s[2]\n"
+ "fmla v15.4s, v24.4s, v6.s[3]\n"
+ "fmla v16.4s, v24.4s, v8.s[0]\n"
+ "fmla v17.4s, v24.4s, v8.s[1]\n"
+ "fmla v18.4s, v24.4s, v8.s[2]\n"
+ "fmla v19.4s, v24.4s, v8.s[3]\n"
+ "ldr q24, [%x[params], #0xf0]\n"
+ "fmla v12.4s, v23.4s, v6.s[1]\n"
+ "fmla v13.4s, v23.4s, v6.s[2]\n"
+ "fmla v14.4s, v23.4s, v6.s[3]\n"
+ "fmla v15.4s, v23.4s, v7.s[0]\n"
+ "fmla v16.4s, v23.4s, v8.s[1]\n"
+ "fmla v17.4s, v23.4s, v8.s[2]\n"
+ "fmla v18.4s, v23.4s, v8.s[3]\n"
+ "fmla v19.4s, v23.4s, v9.s[0]\n"
+ "ldr q23, [%x[params], #0x100]\n"
+ "fmla v12.4s, v22.4s, v6.s[2]\n"
+ "fmla v13.4s, v22.4s, v6.s[3]\n"
+ "fmla v14.4s, v22.4s, v7.s[0]\n"
+ "fmla v15.4s, v22.4s, v7.s[1]\n"
+ "fmla v16.4s, v22.4s, v8.s[2]\n"
+ "fmla v17.4s, v22.4s, v8.s[3]\n"
+ "fmla v18.4s, v22.4s, v9.s[0]\n"
+ "fmla v19.4s, v22.4s, v9.s[1]\n"
+ "ldr q22, [%x[params], #0x110]\n"
+ "fmla v12.4s, v21.4s, v6.s[3]\n"
+ "fmla v13.4s, v21.4s, v7.s[0]\n"
+ "fmla v14.4s, v21.4s, v7.s[1]\n"
+ "fmla v15.4s, v21.4s, v7.s[2]\n"
+ "fmla v16.4s, v21.4s, v8.s[3]\n"
+ "fmla v17.4s, v21.4s, v9.s[0]\n"
+ "fmla v18.4s, v21.4s, v9.s[1]\n"
+ "fmla v19.4s, v21.4s, v9.s[2]\n"
+ "ldr q21, [%x[params], #0x120]\n"
+ "fmla v12.4s, v20.4s, v7.s[0]\n"
+ "fmla v13.4s, v20.4s, v7.s[1]\n"
+ "fmla v14.4s, v20.4s, v7.s[2]\n"
+ "fmla v15.4s, v20.4s, v7.s[3]\n"
+ "fmla v16.4s, v20.4s, v9.s[0]\n"
+ "fmla v17.4s, v20.4s, v9.s[1]\n"
+ "fmla v18.4s, v20.4s, v9.s[2]\n"
+ "fmla v19.4s, v20.4s, v9.s[3]\n"
+ "ldr q20, [%x[params], #0x130]\n"
+ "add %x[params], %x[params], #0x140\n"
+ "fmla v12.4s, v24.4s, v8.s[0]\n"
+ "fmla v13.4s, v24.4s, v8.s[1]\n"
+ "fmla v14.4s, v24.4s, v8.s[2]\n"
+ "fmla v15.4s, v24.4s, v8.s[3]\n"
+ "fmla v16.4s, v24.4s, v10.s[0]\n"
+ "fmla v17.4s, v24.4s, v10.s[1]\n"
+ "fmla v18.4s, v24.4s, v10.s[2]\n"
+ "fmla v19.4s, v24.4s, v10.s[3]\n"
+ "fmla v12.4s, v23.4s, v8.s[1]\n"
+ "fmla v13.4s, v23.4s, v8.s[2]\n"
+ "fmla v14.4s, v23.4s, v8.s[3]\n"
+ "fmla v15.4s, v23.4s, v9.s[0]\n"
+ "fmla v16.4s, v23.4s, v10.s[1]\n"
+ "fmla v17.4s, v23.4s, v10.s[2]\n"
+ "fmla v18.4s, v23.4s, v10.s[3]\n"
+ "fmla v19.4s, v23.4s, v11.s[0]\n"
+ "fmla v12.4s, v22.4s, v8.s[2]\n"
+ "fmla v13.4s, v22.4s, v8.s[3]\n"
+ "fmla v14.4s, v22.4s, v9.s[0]\n"
+ "fmla v15.4s, v22.4s, v9.s[1]\n"
+ "fmla v16.4s, v22.4s, v10.s[2]\n"
+ "fmla v17.4s, v22.4s, v10.s[3]\n"
+ "fmla v18.4s, v22.4s, v11.s[0]\n"
+ "fmla v19.4s, v22.4s, v11.s[1]\n"
+ "fmla v12.4s, v21.4s, v8.s[3]\n"
+ "fmla v13.4s, v21.4s, v9.s[0]\n"
+ "fmla v14.4s, v21.4s, v9.s[1]\n"
+ "fmla v15.4s, v21.4s, v9.s[2]\n"
+ "fmla v16.4s, v21.4s, v10.s[3]\n"
+ "fmla v17.4s, v21.4s, v11.s[0]\n"
+ "fmla v18.4s, v21.4s, v11.s[1]\n"
+ "fmla v19.4s, v21.4s, v11.s[2]\n"
+ "fmla v12.4s, v20.4s, v9.s[0]\n"
+ "fmla v13.4s, v20.4s, v9.s[1]\n"
+ "fmin v12.4s, v12.4s, v25.4s\n"
+ "fmla v14.4s, v20.4s, v9.s[2]\n"
+ "fmla v15.4s, v20.4s, v9.s[3]\n"
+ "fmin v13.4s, v13.4s, v25.4s\n"
+ "fmla v16.4s, v20.4s, v11.s[0]\n"
+ "fmla v17.4s, v20.4s, v11.s[1]\n"
+ "fmin v14.4s, v14.4s, v25.4s\n"
+ "fmla v18.4s, v20.4s, v11.s[2]\n"
+ "fmla v19.4s, v20.4s, v11.s[3]\n"
+ "fmin v15.4s, v15.4s, v25.4s\n"
+ "fmin v16.4s, v16.4s, v25.4s\n"
+ "fmin v17.4s, v17.4s, v25.4s\n"
+ "fmin v18.4s, v18.4s, v25.4s\n"
+ "fmin v19.4s, v19.4s, v25.4s\n"
+ "fmax v12.4s, v12.4s, v26.4s\n"
+ "fmax v13.4s, v13.4s, v26.4s\n"
+ "str q12, [x12, x13]\n"
+ "fmax v14.4s, v14.4s, v26.4s\n"
+ "fmax v15.4s, v15.4s, v26.4s\n"
+ "str q13, [x11, x13]\n"
+ "fmax v16.4s, v16.4s, v26.4s\n"
+ "fmax v17.4s, v17.4s, v26.4s\n"
+ "str q14, [x10, x13]\n"
+ "fmax v18.4s, v18.4s, v26.4s\n"
+ "fmax v19.4s, v19.4s, v26.4s\n"
+ "str q15, [x9, x13]\n"
+ "str q16, [x28, x13]\n"
+ "str q17, [x27, x13]\n"
+ "str q18, [x26, x13]\n"
+ "str q19, [x25, x13]\n"
+ "add x13, x13, #0x10\n"
+ "3:" // Output channel oddments
+ "tst %x[channel_multiplier], #0x3\n"
+ "beq 6f\n"
+ "ldr q12, [%x[params], #0x0]\n"
+ "ldr q24, [%x[params], #0x10]\n"
+ "mov v13.16b, v12.16b\n"
+ "mov v14.16b, v12.16b\n"
+ "ldr q23, [%x[params], #0x20]\n"
+ "ldr q22, [%x[params], #0x30]\n"
+ "mov v15.16b, v12.16b\n"
+ "mov v16.16b, v12.16b\n"
+ "ldr q21, [%x[params], #0x40]\n"
+ "ldr q20, [%x[params], #0x50]\n"
+ "mov v17.16b, v12.16b\n"
+ "mov v18.16b, v12.16b\n"
+ "mov v19.16b, v12.16b\n"
+ "fmla v12.4s, v24.4s, v0.s[0]\n"
+ "fmla v13.4s, v24.4s, v0.s[1]\n"
+ "fmla v14.4s, v24.4s, v0.s[2]\n"
+ "fmla v15.4s, v24.4s, v0.s[3]\n"
+ "fmla v16.4s, v24.4s, v2.s[0]\n"
+ "fmla v17.4s, v24.4s, v2.s[1]\n"
+ "fmla v18.4s, v24.4s, v2.s[2]\n"
+ "fmla v19.4s, v24.4s, v2.s[3]\n"
+ "ldr q24, [%x[params], #0x60]\n"
+ "fmla v12.4s, v23.4s, v0.s[1]\n"
+ "fmla v13.4s, v23.4s, v0.s[2]\n"
+ "fmla v14.4s, v23.4s, v0.s[3]\n"
+ "fmla v15.4s, v23.4s, v1.s[0]\n"
+ "fmla v16.4s, v23.4s, v2.s[1]\n"
+ "fmla v17.4s, v23.4s, v2.s[2]\n"
+ "fmla v18.4s, v23.4s, v2.s[3]\n"
+ "fmla v19.4s, v23.4s, v3.s[0]\n"
+ "ldr q23, [%x[params], #0x70]\n"
+ "fmla v12.4s, v22.4s, v0.s[2]\n"
+ "fmla v13.4s, v22.4s, v0.s[3]\n"
+ "fmla v14.4s, v22.4s, v1.s[0]\n"
+ "fmla v15.4s, v22.4s, v1.s[1]\n"
+ "fmla v16.4s, v22.4s, v2.s[2]\n"
+ "fmla v17.4s, v22.4s, v2.s[3]\n"
+ "fmla v18.4s, v22.4s, v3.s[0]\n"
+ "fmla v19.4s, v22.4s, v3.s[1]\n"
+ "ldr q22, [%x[params], #0x80]\n"
+ "fmla v12.4s, v21.4s, v0.s[3]\n"
+ "fmla v13.4s, v21.4s, v1.s[0]\n"
+ "fmla v14.4s, v21.4s, v1.s[1]\n"
+ "fmla v15.4s, v21.4s, v1.s[2]\n"
+ "fmla v16.4s, v21.4s, v2.s[3]\n"
+ "fmla v17.4s, v21.4s, v3.s[0]\n"
+ "fmla v18.4s, v21.4s, v3.s[1]\n"
+ "fmla v19.4s, v21.4s, v3.s[2]\n"
+ "ldr q21, [%x[params], #0x90]\n"
+ "fmla v12.4s, v20.4s, v1.s[0]\n"
+ "fmla v13.4s, v20.4s, v1.s[1]\n"
+ "fmla v14.4s, v20.4s, v1.s[2]\n"
+ "fmla v15.4s, v20.4s, v1.s[3]\n"
+ "fmla v16.4s, v20.4s, v3.s[0]\n"
+ "fmla v17.4s, v20.4s, v3.s[1]\n"
+ "fmla v18.4s, v20.4s, v3.s[2]\n"
+ "fmla v19.4s, v20.4s, v3.s[3]\n"
+ "ldr q20, [%x[params], #0xa0]\n"
+ "fmla v12.4s, v24.4s, v2.s[0]\n"
+ "fmla v13.4s, v24.4s, v2.s[1]\n"
+ "fmla v14.4s, v24.4s, v2.s[2]\n"
+ "fmla v15.4s, v24.4s, v2.s[3]\n"
+ "fmla v16.4s, v24.4s, v4.s[0]\n"
+ "fmla v17.4s, v24.4s, v4.s[1]\n"
+ "fmla v18.4s, v24.4s, v4.s[2]\n"
+ "fmla v19.4s, v24.4s, v4.s[3]\n"
+ "ldr q24, [%x[params], #0xb0]\n"
+ "fmla v12.4s, v23.4s, v2.s[1]\n"
+ "fmla v13.4s, v23.4s, v2.s[2]\n"
+ "fmla v14.4s, v23.4s, v2.s[3]\n"
+ "fmla v15.4s, v23.4s, v3.s[0]\n"
+ "fmla v16.4s, v23.4s, v4.s[1]\n"
+ "fmla v17.4s, v23.4s, v4.s[2]\n"
+ "fmla v18.4s, v23.4s, v4.s[3]\n"
+ "fmla v19.4s, v23.4s, v5.s[0]\n"
+ "ldr q23, [%x[params], #0xc0]\n"
+ "fmla v12.4s, v22.4s, v2.s[2]\n"
+ "fmla v13.4s, v22.4s, v2.s[3]\n"
+ "fmla v14.4s, v22.4s, v3.s[0]\n"
+ "fmla v15.4s, v22.4s, v3.s[1]\n"
+ "fmla v16.4s, v22.4s, v4.s[2]\n"
+ "fmla v17.4s, v22.4s, v4.s[3]\n"
+ "fmla v18.4s, v22.4s, v5.s[0]\n"
+ "fmla v19.4s, v22.4s, v5.s[1]\n"
+ "ldr q22, [%x[params], #0xd0]\n"
+ "fmla v12.4s, v21.4s, v2.s[3]\n"
+ "fmla v13.4s, v21.4s, v3.s[0]\n"
+ "fmla v14.4s, v21.4s, v3.s[1]\n"
+ "fmla v15.4s, v21.4s, v3.s[2]\n"
+ "fmla v16.4s, v21.4s, v4.s[3]\n"
+ "fmla v17.4s, v21.4s, v5.s[0]\n"
+ "fmla v18.4s, v21.4s, v5.s[1]\n"
+ "fmla v19.4s, v21.4s, v5.s[2]\n"
+ "ldr q21, [%x[params], #0xe0]\n"
+ "fmla v12.4s, v20.4s, v3.s[0]\n"
+ "fmla v13.4s, v20.4s, v3.s[1]\n"
+ "fmla v14.4s, v20.4s, v3.s[2]\n"
+ "fmla v15.4s, v20.4s, v3.s[3]\n"
+ "fmla v16.4s, v20.4s, v5.s[0]\n"
+ "fmla v17.4s, v20.4s, v5.s[1]\n"
+ "fmla v18.4s, v20.4s, v5.s[2]\n"
+ "fmla v19.4s, v20.4s, v5.s[3]\n"
+ "ldr q20, [%x[params], #0xf0]\n"
+ "fmla v12.4s, v24.4s, v4.s[0]\n"
+ "fmla v13.4s, v24.4s, v4.s[1]\n"
+ "fmla v14.4s, v24.4s, v4.s[2]\n"
+ "fmla v15.4s, v24.4s, v4.s[3]\n"
+ "fmla v16.4s, v24.4s, v6.s[0]\n"
+ "fmla v17.4s, v24.4s, v6.s[1]\n"
+ "fmla v18.4s, v24.4s, v6.s[2]\n"
+ "fmla v19.4s, v24.4s, v6.s[3]\n"
+ "ldr q24, [%x[params], #0x100]\n"
+ "fmla v12.4s, v23.4s, v4.s[1]\n"
+ "fmla v13.4s, v23.4s, v4.s[2]\n"
+ "fmla v14.4s, v23.4s, v4.s[3]\n"
+ "fmla v15.4s, v23.4s, v5.s[0]\n"
+ "fmla v16.4s, v23.4s, v6.s[1]\n"
+ "fmla v17.4s, v23.4s, v6.s[2]\n"
+ "fmla v18.4s, v23.4s, v6.s[3]\n"
+ "fmla v19.4s, v23.4s, v7.s[0]\n"
+ "ldr q23, [%x[params], #0x110]\n"
+ "fmla v12.4s, v22.4s, v4.s[2]\n"
+ "fmla v13.4s, v22.4s, v4.s[3]\n"
+ "fmla v14.4s, v22.4s, v5.s[0]\n"
+ "fmla v15.4s, v22.4s, v5.s[1]\n"
+ "fmla v16.4s, v22.4s, v6.s[2]\n"
+ "fmla v17.4s, v22.4s, v6.s[3]\n"
+ "fmla v18.4s, v22.4s, v7.s[0]\n"
+ "fmla v19.4s, v22.4s, v7.s[1]\n"
+ "ldr q22, [%x[params], #0x120]\n"
+ "fmla v12.4s, v21.4s, v4.s[3]\n"
+ "fmla v13.4s, v21.4s, v5.s[0]\n"
+ "fmla v14.4s, v21.4s, v5.s[1]\n"
+ "fmla v15.4s, v21.4s, v5.s[2]\n"
+ "fmla v16.4s, v21.4s, v6.s[3]\n"
+ "fmla v17.4s, v21.4s, v7.s[0]\n"
+ "fmla v18.4s, v21.4s, v7.s[1]\n"
+ "fmla v19.4s, v21.4s, v7.s[2]\n"
+ "ldr q21, [%x[params], #0x130]\n"
+ "fmla v12.4s, v20.4s, v5.s[0]\n"
+ "fmla v13.4s, v20.4s, v5.s[1]\n"
+ "fmla v14.4s, v20.4s, v5.s[2]\n"
+ "fmla v15.4s, v20.4s, v5.s[3]\n"
+ "fmla v16.4s, v20.4s, v7.s[0]\n"
+ "fmla v17.4s, v20.4s, v7.s[1]\n"
+ "fmla v18.4s, v20.4s, v7.s[2]\n"
+ "fmla v19.4s, v20.4s, v7.s[3]\n"
+ "ldr q20, [%x[params], #0x140]\n"
+ "fmla v12.4s, v24.4s, v6.s[0]\n"
+ "fmla v13.4s, v24.4s, v6.s[1]\n"
+ "fmla v14.4s, v24.4s, v6.s[2]\n"
+ "fmla v15.4s, v24.4s, v6.s[3]\n"
+ "fmla v16.4s, v24.4s, v8.s[0]\n"
+ "fmla v17.4s, v24.4s, v8.s[1]\n"
+ "fmla v18.4s, v24.4s, v8.s[2]\n"
+ "fmla v19.4s, v24.4s, v8.s[3]\n"
+ "ldr q24, [%x[params], #0x150]\n"
+ "fmla v12.4s, v23.4s, v6.s[1]\n"
+ "fmla v13.4s, v23.4s, v6.s[2]\n"
+ "fmla v14.4s, v23.4s, v6.s[3]\n"
+ "fmla v15.4s, v23.4s, v7.s[0]\n"
+ "fmla v16.4s, v23.4s, v8.s[1]\n"
+ "fmla v17.4s, v23.4s, v8.s[2]\n"
+ "fmla v18.4s, v23.4s, v8.s[3]\n"
+ "fmla v19.4s, v23.4s, v9.s[0]\n"
+ "ldr q23, [%x[params], #0x160]\n"
+ "fmla v12.4s, v22.4s, v6.s[2]\n"
+ "fmla v13.4s, v22.4s, v6.s[3]\n"
+ "fmla v14.4s, v22.4s, v7.s[0]\n"
+ "fmla v15.4s, v22.4s, v7.s[1]\n"
+ "fmla v16.4s, v22.4s, v8.s[2]\n"
+ "fmla v17.4s, v22.4s, v8.s[3]\n"
+ "fmla v18.4s, v22.4s, v9.s[0]\n"
+ "fmla v19.4s, v22.4s, v9.s[1]\n"
+ "ldr q22, [%x[params], #0x170]\n"
+ "fmla v12.4s, v21.4s, v6.s[3]\n"
+ "fmla v13.4s, v21.4s, v7.s[0]\n"
+ "fmla v14.4s, v21.4s, v7.s[1]\n"
+ "fmla v15.4s, v21.4s, v7.s[2]\n"
+ "fmla v16.4s, v21.4s, v8.s[3]\n"
+ "fmla v17.4s, v21.4s, v9.s[0]\n"
+ "fmla v18.4s, v21.4s, v9.s[1]\n"
+ "fmla v19.4s, v21.4s, v9.s[2]\n"
+ "ldr q21, [%x[params], #0x180]\n"
+ "fmla v12.4s, v20.4s, v7.s[0]\n"
+ "fmla v13.4s, v20.4s, v7.s[1]\n"
+ "fmla v14.4s, v20.4s, v7.s[2]\n"
+ "fmla v15.4s, v20.4s, v7.s[3]\n"
+ "fmla v16.4s, v20.4s, v9.s[0]\n"
+ "fmla v17.4s, v20.4s, v9.s[1]\n"
+ "fmla v18.4s, v20.4s, v9.s[2]\n"
+ "fmla v19.4s, v20.4s, v9.s[3]\n"
+ "ldr q20, [%x[params], #0x190]\n"
+ "add %x[params], %x[params], #0x1a0\n"
+ "fmla v12.4s, v24.4s, v8.s[0]\n"
+ "fmla v13.4s, v24.4s, v8.s[1]\n"
+ "fmla v14.4s, v24.4s, v8.s[2]\n"
+ "fmla v15.4s, v24.4s, v8.s[3]\n"
+ "fmla v16.4s, v24.4s, v10.s[0]\n"
+ "fmla v17.4s, v24.4s, v10.s[1]\n"
+ "fmla v18.4s, v24.4s, v10.s[2]\n"
+ "fmla v19.4s, v24.4s, v10.s[3]\n"
+ "fmla v12.4s, v23.4s, v8.s[1]\n"
+ "fmla v13.4s, v23.4s, v8.s[2]\n"
+ "fmla v14.4s, v23.4s, v8.s[3]\n"
+ "fmla v15.4s, v23.4s, v9.s[0]\n"
+ "fmla v16.4s, v23.4s, v10.s[1]\n"
+ "fmla v17.4s, v23.4s, v10.s[2]\n"
+ "fmla v18.4s, v23.4s, v10.s[3]\n"
+ "fmla v19.4s, v23.4s, v11.s[0]\n"
+ "fmla v12.4s, v22.4s, v8.s[2]\n"
+ "fmla v13.4s, v22.4s, v8.s[3]\n"
+ "fmla v14.4s, v22.4s, v9.s[0]\n"
+ "fmla v15.4s, v22.4s, v9.s[1]\n"
+ "fmla v16.4s, v22.4s, v10.s[2]\n"
+ "fmla v17.4s, v22.4s, v10.s[3]\n"
+ "fmla v18.4s, v22.4s, v11.s[0]\n"
+ "fmla v19.4s, v22.4s, v11.s[1]\n"
+ "fmla v12.4s, v21.4s, v8.s[3]\n"
+ "fmla v13.4s, v21.4s, v9.s[0]\n"
+ "fmla v14.4s, v21.4s, v9.s[1]\n"
+ "fmla v15.4s, v21.4s, v9.s[2]\n"
+ "fmla v16.4s, v21.4s, v10.s[3]\n"
+ "fmla v17.4s, v21.4s, v11.s[0]\n"
+ "fmla v18.4s, v21.4s, v11.s[1]\n"
+ "fmla v19.4s, v21.4s, v11.s[2]\n"
+ "fmla v12.4s, v20.4s, v9.s[0]\n"
+ "fmla v13.4s, v20.4s, v9.s[1]\n"
+ "fmin v12.4s, v12.4s, v25.4s\n"
+ "fmla v14.4s, v20.4s, v9.s[2]\n"
+ "fmla v15.4s, v20.4s, v9.s[3]\n"
+ "fmin v13.4s, v13.4s, v25.4s\n"
+ "fmla v16.4s, v20.4s, v11.s[0]\n"
+ "fmla v17.4s, v20.4s, v11.s[1]\n"
+ "fmin v14.4s, v14.4s, v25.4s\n"
+ "fmla v18.4s, v20.4s, v11.s[2]\n"
+ "fmla v19.4s, v20.4s, v11.s[3]\n"
+ "fmin v15.4s, v15.4s, v25.4s\n"
+ "fmin v16.4s, v16.4s, v25.4s\n"
+ "fmin v17.4s, v17.4s, v25.4s\n"
+ "fmin v18.4s, v18.4s, v25.4s\n"
+ "fmin v19.4s, v19.4s, v25.4s\n"
+ "fmax v12.4s, v12.4s, v26.4s\n"
+ "fmax v13.4s, v13.4s, v26.4s\n"
+ "fmax v14.4s, v14.4s, v26.4s\n"
+ "fmax v15.4s, v15.4s, v26.4s\n"
+ "fmax v16.4s, v16.4s, v26.4s\n"
+ "fmax v17.4s, v17.4s, v26.4s\n"
+ "fmax v18.4s, v18.4s, v26.4s\n"
+ "fmax v19.4s, v19.4s, v26.4s\n"
+ "tbz %x[channel_multiplier], #1, 4f\n"
+ "add x20, x12, x13\n"
+ "add x21, x11, x13\n"
+ "st1 { v12.d }[0], [x20]\n"
+ "add x20, x10, x13\n"
+ "add x24, x9, x13\n"
+ "st1 { v13.d }[0], [x21]\n"
+ "add x23, x28, x13\n"
+ "add x22, x27, x13\n"
+ "st1 { v14.d }[0], [x20]\n"
+ "add x21, x26, x13\n"
+ "add x20, x25, x13\n"
+ "st1 { v15.d }[0], [x24]\n"
+ "st1 { v16.d }[0], [x23]\n"
+ "add x13, x13, #0x8\n"
+ "st1 { v17.d }[0], [x22]\n"
+ "st1 { v18.d }[0], [x21]\n"
+ "st1 { v19.d }[0], [x20]\n"
+ "tbz %x[channel_multiplier], #0, 5f\n"
+ "add x20, x12, x13\n"
+ "add x21, x11, x13\n"
+ "st1 { v12.s }[2], [x20]\n"
+ "add x20, x10, x13\n"
+ "add x24, x9, x13\n"
+ "st1 { v13.s }[2], [x21]\n"
+ "add x23, x28, x13\n"
+ "add x22, x27, x13\n"
+ "st1 { v14.s }[2], [x20]\n"
+ "add x21, x26, x13\n"
+ "add x20, x25, x13\n"
+ "st1 { v15.s }[2], [x24]\n"
+ "st1 { v16.s }[2], [x23]\n"
+ "st1 { v17.s }[2], [x22]\n"
+ "st1 { v18.s }[2], [x21]\n"
+ "st1 { v19.s }[2], [x20]\n"
+ "b 5f\n"
+ "4:" // Output channel oddments: Store: Bit 1: Unset
+ "add x20, x12, x13\n"
+ "add x21, x11, x13\n"
+ "st1 { v12.s }[0], [x20]\n"
+ "add x20, x10, x13\n"
+ "add x24, x9, x13\n"
+ "st1 { v13.s }[0], [x21]\n"
+ "add x23, x28, x13\n"
+ "add x22, x27, x13\n"
+ "st1 { v14.s }[0], [x20]\n"
+ "add x21, x26, x13\n"
+ "add x20, x25, x13\n"
+ "st1 { v15.s }[0], [x24]\n"
+ "st1 { v16.s }[0], [x23]\n"
+ "st1 { v17.s }[0], [x22]\n"
+ "st1 { v18.s }[0], [x21]\n"
+ "st1 { v19.s }[0], [x20]\n"
+ "5:" // Output channel oddments: Store: Bit 1: End
+ "6:" // End
+ : [params] "+&r" (params)
+ : [channel_multiplier] "r" (n_output_channels), [clamps] "r" (minmax_vals), [inptrs] "r" (inptrs), [outptrs] "r" (outptrs)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp
new file mode 100644
index 0000000000..3bece73973
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp
@@ -0,0 +1,52 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_impl(const float *const *const, float *const *const, const float *, const float *, const unsigned int, const unsigned int, const float, const float);
+
+// Strategy wrapper that binds the hand-written AArch64 FP32 depthwise kernel
+// (the _impl function declared above) to the generic depthfirst-multiplier
+// driver interface.
+struct a64_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst : GenericDepthfirstMultiplierKernelStrategy<float, float, float, float>
+{
+  using Parent = GenericDepthfirstMultiplierKernelStrategy<float, float, float, float>;
+  // (2, 8) presumably describes the kernel's 2x8 output tile, matching the
+  // "output2x8" in its name -- confirm against the Parent constructor.
+  // VLType::None: fixed-width NEON code, no scalable (SVE) vector length.
+  // The CPUInfo argument is accepted for interface uniformity and unused here.
+  a64_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst(const CPUInfo *)
+    : Parent(2, 8, arm_gemm::VLType::None)
+  {
+  }
+  // Function pointer to the assembly implementation; exposed via get_kernel().
+  Parent::KernelType kernel = a64_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_impl;
+  Parent::KernelType get_kernel(void) const override { return kernel; }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp
new file mode 100644
index 0000000000..cc18dd4bb4
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp
@@ -0,0 +1,850 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_impl(
+ const float *const *const inptrs,
+ float *const *const outptrs,
+ const float *weights,
+ const float *bias,
+ const unsigned int kernel_points,
+ const unsigned int n_output_channels,
+ const float activation_min,
+ const float activation_max
+)
+{
+ const float minmax_vals[2] = { activation_min, activation_max };
+
+ __asm__ __volatile__(
+ "ld1r { v12.4s }, [%x[minmax_vals]]\n"
+ "lsr x11, %x[n_output_channels], #0x2\n"
+ "add x20, %x[minmax_vals], #0x4\n"
+ "ld1r { v11.4s }, [x20]\n"
+ "mov x10, #0x0\n"
+ "cbz x11, 8f\n"
+ "1:" // Output channel loop
+ "movi v31.16b, #0x0\n"
+ "cbz %x[bias], 2f\n"
+ "lsl x20, x10, #0x2\n"
+ "ldr q31, [%x[bias], x20]\n"
+ "2:" // Output channel loop: Load bias: Done
+ "ldr q10, [%x[weights], #0x0]\n"
+ "mov x22, %x[inptrs]\n"
+ "ldp x21, x20, [x22], #0x10\n"
+ "lsr x23, %x[kernel_points], #0x1\n"
+ "ldr q3, [x21, #0x0]\n"
+ "ldr q2, [x21, #0x10]\n"
+ "mov v16.16b, v31.16b\n"
+ "mov v17.16b, v31.16b\n"
+ "ldr q1, [x20, #0x0]\n"
+ "ldr q0, [x20, #0x10]\n"
+ "mov v18.16b, v31.16b\n"
+ "mov v19.16b, v31.16b\n"
+ "mov v20.16b, v31.16b\n"
+ "mov v21.16b, v31.16b\n"
+ "add %x[weights], %x[weights], #0x10\n"
+ "mov v22.16b, v31.16b\n"
+ "mov v23.16b, v31.16b\n"
+ "mov v24.16b, v31.16b\n"
+ "mov v25.16b, v31.16b\n"
+ "mov v26.16b, v31.16b\n"
+ "mov v27.16b, v31.16b\n"
+ "mov v28.16b, v31.16b\n"
+ "mov v29.16b, v31.16b\n"
+ "mov v30.16b, v31.16b\n"
+ "mov v31.16b, v31.16b\n"
+ "cbz x23, 6f\n"
+ "ldr q9, [%x[weights], #0x0]\n"
+ "ldp x21, x20, [x22], #0x10\n"
+ "subs x23, x23, #0x1\n"
+ "add %x[weights], %x[weights], #0x10\n"
+ "ldr q8, [x21, #0x0]\n"
+ "ldr q7, [x21, #0x10]\n"
+ "ldr q6, [x20, #0x0]\n"
+ "ldr q5, [x20, #0x10]\n"
+ "beq 4f\n"
+ "3:" // Output channel loop: Kernel loop
+ "ldp x21, x20, [x22], #0x10\n"
+ "fmla v16.4s, v10.4s, v3.s[0]\n"
+ "fmla v17.4s, v10.4s, v3.s[1]\n"
+ "subs x23, x23, #0x1\n"
+ "fmla v18.4s, v10.4s, v3.s[2]\n"
+ "fmla v19.4s, v10.4s, v3.s[3]\n"
+ "ldr q3, [x21, #0x0]\n"
+ "fmla v20.4s, v10.4s, v2.s[0]\n"
+ "fmla v21.4s, v10.4s, v2.s[1]\n"
+ "fmla v22.4s, v10.4s, v2.s[2]\n"
+ "fmla v23.4s, v10.4s, v2.s[3]\n"
+ "ldr q2, [x21, #0x10]\n"
+ "fmla v24.4s, v10.4s, v1.s[0]\n"
+ "fmla v25.4s, v10.4s, v1.s[1]\n"
+ "fmla v26.4s, v10.4s, v1.s[2]\n"
+ "fmla v27.4s, v10.4s, v1.s[3]\n"
+ "ldr q1, [x20, #0x0]\n"
+ "fmla v28.4s, v10.4s, v0.s[0]\n"
+ "fmla v29.4s, v10.4s, v0.s[1]\n"
+ "fmla v30.4s, v10.4s, v0.s[2]\n"
+ "fmla v31.4s, v10.4s, v0.s[3]\n"
+ "ldr q0, [x20, #0x10]\n"
+ "ldr q10, [%x[weights], #0x0]\n"
+ "ldp x21, x20, [x22], #0x10\n"
+ "fmla v16.4s, v9.4s, v8.s[0]\n"
+ "fmla v17.4s, v9.4s, v8.s[1]\n"
+ "fmla v18.4s, v9.4s, v8.s[2]\n"
+ "fmla v19.4s, v9.4s, v8.s[3]\n"
+ "ldr q8, [x21, #0x0]\n"
+ "fmla v20.4s, v9.4s, v7.s[0]\n"
+ "fmla v21.4s, v9.4s, v7.s[1]\n"
+ "fmla v22.4s, v9.4s, v7.s[2]\n"
+ "fmla v23.4s, v9.4s, v7.s[3]\n"
+ "ldr q7, [x21, #0x10]\n"
+ "fmla v24.4s, v9.4s, v6.s[0]\n"
+ "fmla v25.4s, v9.4s, v6.s[1]\n"
+ "fmla v26.4s, v9.4s, v6.s[2]\n"
+ "fmla v27.4s, v9.4s, v6.s[3]\n"
+ "ldr q6, [x20, #0x0]\n"
+ "fmla v28.4s, v9.4s, v5.s[0]\n"
+ "fmla v29.4s, v9.4s, v5.s[1]\n"
+ "fmla v30.4s, v9.4s, v5.s[2]\n"
+ "fmla v31.4s, v9.4s, v5.s[3]\n"
+ "ldr q5, [x20, #0x10]\n"
+ "ldr q9, [%x[weights], #0x10]\n"
+ "add %x[weights], %x[weights], #0x20\n"
+ "bgt 3b\n"
+ "4:" // Output channel loop: Kernel loop tail
+ "tbnz %x[kernel_points], #0, 5f\n"
+ "fmla v16.4s, v10.4s, v3.s[0]\n"
+ "fmla v17.4s, v10.4s, v3.s[1]\n"
+ "lsl x28, x10, #0x2\n"
+ "ldr x27, [%x[outptrs], #0x0]\n"
+ "fmla v18.4s, v10.4s, v3.s[2]\n"
+ "fmla v19.4s, v10.4s, v3.s[3]\n"
+ "ldr x26, [%x[outptrs], #0x8]\n"
+ "ldr x25, [%x[outptrs], #0x10]\n"
+ "fmla v20.4s, v10.4s, v2.s[0]\n"
+ "fmla v21.4s, v10.4s, v2.s[1]\n"
+ "ldr x24, [%x[outptrs], #0x18]\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "fmla v22.4s, v10.4s, v2.s[2]\n"
+ "fmla v23.4s, v10.4s, v2.s[3]\n"
+ "ldr x22, [%x[outptrs], #0x28]\n"
+ "ldr x21, [%x[outptrs], #0x30]\n"
+ "fmla v24.4s, v10.4s, v1.s[0]\n"
+ "fmla v25.4s, v10.4s, v1.s[1]\n"
+ "ldr x20, [%x[outptrs], #0x38]\n"
+ "fmla v26.4s, v10.4s, v1.s[2]\n"
+ "fmla v27.4s, v10.4s, v1.s[3]\n"
+ "fmla v28.4s, v10.4s, v0.s[0]\n"
+ "fmla v29.4s, v10.4s, v0.s[1]\n"
+ "fmla v30.4s, v10.4s, v0.s[2]\n"
+ "fmla v31.4s, v10.4s, v0.s[3]\n"
+ "fmla v16.4s, v9.4s, v8.s[0]\n"
+ "fmla v17.4s, v9.4s, v8.s[1]\n"
+ "fmin v16.4s, v16.4s, v11.4s\n"
+ "fmla v18.4s, v9.4s, v8.s[2]\n"
+ "fmla v19.4s, v9.4s, v8.s[3]\n"
+ "fmin v17.4s, v17.4s, v11.4s\n"
+ "fmla v20.4s, v9.4s, v7.s[0]\n"
+ "fmla v21.4s, v9.4s, v7.s[1]\n"
+ "fmin v18.4s, v18.4s, v11.4s\n"
+ "fmla v22.4s, v9.4s, v7.s[2]\n"
+ "fmla v23.4s, v9.4s, v7.s[3]\n"
+ "fmin v19.4s, v19.4s, v11.4s\n"
+ "fmla v24.4s, v9.4s, v6.s[0]\n"
+ "fmla v25.4s, v9.4s, v6.s[1]\n"
+ "fmin v20.4s, v20.4s, v11.4s\n"
+ "fmla v26.4s, v9.4s, v6.s[2]\n"
+ "fmla v27.4s, v9.4s, v6.s[3]\n"
+ "fmin v21.4s, v21.4s, v11.4s\n"
+ "fmla v28.4s, v9.4s, v5.s[0]\n"
+ "fmla v29.4s, v9.4s, v5.s[1]\n"
+ "fmin v22.4s, v22.4s, v11.4s\n"
+ "fmla v30.4s, v9.4s, v5.s[2]\n"
+ "fmla v31.4s, v9.4s, v5.s[3]\n"
+ "fmin v23.4s, v23.4s, v11.4s\n"
+ "fmax v16.4s, v16.4s, v12.4s\n"
+ "fmax v17.4s, v17.4s, v12.4s\n"
+ "str q16, [x27, x28]\n"
+ "ldr x27, [%x[outptrs], #0x40]\n"
+ "fmax v18.4s, v18.4s, v12.4s\n"
+ "fmax v19.4s, v19.4s, v12.4s\n"
+ "str q17, [x26, x28]\n"
+ "ldr x26, [%x[outptrs], #0x48]\n"
+ "fmax v20.4s, v20.4s, v12.4s\n"
+ "fmax v21.4s, v21.4s, v12.4s\n"
+ "str q18, [x25, x28]\n"
+ "ldr x25, [%x[outptrs], #0x50]\n"
+ "fmax v22.4s, v22.4s, v12.4s\n"
+ "fmax v23.4s, v23.4s, v12.4s\n"
+ "str q19, [x24, x28]\n"
+ "ldr x24, [%x[outptrs], #0x58]\n"
+ "fmin v24.4s, v24.4s, v11.4s\n"
+ "fmin v25.4s, v25.4s, v11.4s\n"
+ "str q20, [x23, x28]\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
+ "fmin v26.4s, v26.4s, v11.4s\n"
+ "fmin v27.4s, v27.4s, v11.4s\n"
+ "str q21, [x22, x28]\n"
+ "ldr x22, [%x[outptrs], #0x68]\n"
+ "fmin v28.4s, v28.4s, v11.4s\n"
+ "fmin v29.4s, v29.4s, v11.4s\n"
+ "str q22, [x21, x28]\n"
+ "ldr x21, [%x[outptrs], #0x70]\n"
+ "fmin v30.4s, v30.4s, v11.4s\n"
+ "fmin v31.4s, v31.4s, v11.4s\n"
+ "str q23, [x20, x28]\n"
+ "ldr x20, [%x[outptrs], #0x78]\n"
+ "fmax v24.4s, v24.4s, v12.4s\n"
+ "fmax v25.4s, v25.4s, v12.4s\n"
+ "str q24, [x27, x28]\n"
+ "fmax v26.4s, v26.4s, v12.4s\n"
+ "fmax v27.4s, v27.4s, v12.4s\n"
+ "str q25, [x26, x28]\n"
+ "fmax v28.4s, v28.4s, v12.4s\n"
+ "fmax v29.4s, v29.4s, v12.4s\n"
+ "str q26, [x25, x28]\n"
+ "fmax v30.4s, v30.4s, v12.4s\n"
+ "fmax v31.4s, v31.4s, v12.4s\n"
+ "str q27, [x24, x28]\n"
+ "str q28, [x23, x28]\n"
+ "str q29, [x22, x28]\n"
+ "str q30, [x21, x28]\n"
+ "str q31, [x20, x28]\n"
+ "b 7f\n"
+ "5:" // Output channel loop: Odd tail
+ "fmla v16.4s, v10.4s, v3.s[0]\n"
+ "fmla v17.4s, v10.4s, v3.s[1]\n"
+ "ldp x20, x9, [x22], #0x10\n"
+ "lsl x28, x10, #0x2\n"
+ "fmla v18.4s, v10.4s, v3.s[2]\n"
+ "fmla v19.4s, v10.4s, v3.s[3]\n"
+ "ldr q4, [x20, #0x0]\n"
+ "ldr x27, [%x[outptrs], #0x0]\n"
+ "fmla v20.4s, v10.4s, v2.s[0]\n"
+ "fmla v21.4s, v10.4s, v2.s[1]\n"
+ "ldr x26, [%x[outptrs], #0x8]\n"
+ "ldr x25, [%x[outptrs], #0x10]\n"
+ "fmla v22.4s, v10.4s, v2.s[2]\n"
+ "fmla v23.4s, v10.4s, v2.s[3]\n"
+ "ldr q3, [x20, #0x10]\n"
+ "ldr x24, [%x[outptrs], #0x18]\n"
+ "fmla v24.4s, v10.4s, v1.s[0]\n"
+ "fmla v25.4s, v10.4s, v1.s[1]\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "ldr x22, [%x[outptrs], #0x28]\n"
+ "fmla v26.4s, v10.4s, v1.s[2]\n"
+ "fmla v27.4s, v10.4s, v1.s[3]\n"
+ "ldr q2, [x9, #0x0]\n"
+ "ldr x21, [%x[outptrs], #0x30]\n"
+ "fmla v28.4s, v10.4s, v0.s[0]\n"
+ "fmla v29.4s, v10.4s, v0.s[1]\n"
+ "ldr x20, [%x[outptrs], #0x38]\n"
+ "fmla v30.4s, v10.4s, v0.s[2]\n"
+ "fmla v31.4s, v10.4s, v0.s[3]\n"
+ "ldr q1, [%x[weights], #0x0]\n"
+ "ldr q0, [x9, #0x10]\n"
+ "fmla v16.4s, v9.4s, v8.s[0]\n"
+ "fmla v17.4s, v9.4s, v8.s[1]\n"
+ "add %x[weights], %x[weights], #0x10\n"
+ "fmla v18.4s, v9.4s, v8.s[2]\n"
+ "fmla v19.4s, v9.4s, v8.s[3]\n"
+ "fmla v20.4s, v9.4s, v7.s[0]\n"
+ "fmla v21.4s, v9.4s, v7.s[1]\n"
+ "fmla v22.4s, v9.4s, v7.s[2]\n"
+ "fmla v23.4s, v9.4s, v7.s[3]\n"
+ "fmla v24.4s, v9.4s, v6.s[0]\n"
+ "fmla v25.4s, v9.4s, v6.s[1]\n"
+ "fmla v26.4s, v9.4s, v6.s[2]\n"
+ "fmla v27.4s, v9.4s, v6.s[3]\n"
+ "fmla v28.4s, v9.4s, v5.s[0]\n"
+ "fmla v29.4s, v9.4s, v5.s[1]\n"
+ "fmla v30.4s, v9.4s, v5.s[2]\n"
+ "fmla v31.4s, v9.4s, v5.s[3]\n"
+ "fmla v16.4s, v1.4s, v4.s[0]\n"
+ "fmla v17.4s, v1.4s, v4.s[1]\n"
+ "fmin v16.4s, v16.4s, v11.4s\n"
+ "fmla v18.4s, v1.4s, v4.s[2]\n"
+ "fmla v19.4s, v1.4s, v4.s[3]\n"
+ "fmin v17.4s, v17.4s, v11.4s\n"
+ "fmla v20.4s, v1.4s, v3.s[0]\n"
+ "fmla v21.4s, v1.4s, v3.s[1]\n"
+ "fmin v18.4s, v18.4s, v11.4s\n"
+ "fmla v22.4s, v1.4s, v3.s[2]\n"
+ "fmla v23.4s, v1.4s, v3.s[3]\n"
+ "fmin v19.4s, v19.4s, v11.4s\n"
+ "fmla v24.4s, v1.4s, v2.s[0]\n"
+ "fmla v25.4s, v1.4s, v2.s[1]\n"
+ "fmin v20.4s, v20.4s, v11.4s\n"
+ "fmla v26.4s, v1.4s, v2.s[2]\n"
+ "fmla v27.4s, v1.4s, v2.s[3]\n"
+ "fmin v21.4s, v21.4s, v11.4s\n"
+ "fmla v28.4s, v1.4s, v0.s[0]\n"
+ "fmla v29.4s, v1.4s, v0.s[1]\n"
+ "fmin v22.4s, v22.4s, v11.4s\n"
+ "fmla v30.4s, v1.4s, v0.s[2]\n"
+ "fmla v31.4s, v1.4s, v0.s[3]\n"
+ "fmin v23.4s, v23.4s, v11.4s\n"
+ "fmax v16.4s, v16.4s, v12.4s\n"
+ "fmax v17.4s, v17.4s, v12.4s\n"
+ "str q16, [x27, x28]\n"
+ "ldr x27, [%x[outptrs], #0x40]\n"
+ "fmax v18.4s, v18.4s, v12.4s\n"
+ "fmax v19.4s, v19.4s, v12.4s\n"
+ "str q17, [x26, x28]\n"
+ "ldr x26, [%x[outptrs], #0x48]\n"
+ "fmax v20.4s, v20.4s, v12.4s\n"
+ "fmax v21.4s, v21.4s, v12.4s\n"
+ "str q18, [x25, x28]\n"
+ "ldr x25, [%x[outptrs], #0x50]\n"
+ "fmax v22.4s, v22.4s, v12.4s\n"
+ "fmax v23.4s, v23.4s, v12.4s\n"
+ "str q19, [x24, x28]\n"
+ "ldr x24, [%x[outptrs], #0x58]\n"
+ "fmin v24.4s, v24.4s, v11.4s\n"
+ "fmin v25.4s, v25.4s, v11.4s\n"
+ "str q20, [x23, x28]\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
+ "fmin v26.4s, v26.4s, v11.4s\n"
+ "fmin v27.4s, v27.4s, v11.4s\n"
+ "str q21, [x22, x28]\n"
+ "ldr x22, [%x[outptrs], #0x68]\n"
+ "fmin v28.4s, v28.4s, v11.4s\n"
+ "fmin v29.4s, v29.4s, v11.4s\n"
+ "str q22, [x21, x28]\n"
+ "ldr x21, [%x[outptrs], #0x70]\n"
+ "fmin v30.4s, v30.4s, v11.4s\n"
+ "fmin v31.4s, v31.4s, v11.4s\n"
+ "str q23, [x20, x28]\n"
+ "ldr x20, [%x[outptrs], #0x78]\n"
+ "fmax v24.4s, v24.4s, v12.4s\n"
+ "fmax v25.4s, v25.4s, v12.4s\n"
+ "str q24, [x27, x28]\n"
+ "fmax v26.4s, v26.4s, v12.4s\n"
+ "fmax v27.4s, v27.4s, v12.4s\n"
+ "str q25, [x26, x28]\n"
+ "fmax v28.4s, v28.4s, v12.4s\n"
+ "fmax v29.4s, v29.4s, v12.4s\n"
+ "str q26, [x25, x28]\n"
+ "fmax v30.4s, v30.4s, v12.4s\n"
+ "fmax v31.4s, v31.4s, v12.4s\n"
+ "str q27, [x24, x28]\n"
+ "str q28, [x23, x28]\n"
+ "str q29, [x22, x28]\n"
+ "str q30, [x21, x28]\n"
+ "str q31, [x20, x28]\n"
+ "b 7f\n"
+ "6:" // Output channel loop: Single kernel point
+ "fmla v16.4s, v10.4s, v3.s[0]\n"
+ "fmla v17.4s, v10.4s, v3.s[1]\n"
+ "fmin v16.4s, v16.4s, v11.4s\n"
+ "lsl x28, x10, #0x2\n"
+ "fmla v18.4s, v10.4s, v3.s[2]\n"
+ "fmla v19.4s, v10.4s, v3.s[3]\n"
+ "fmin v17.4s, v17.4s, v11.4s\n"
+ "ldr x27, [%x[outptrs], #0x0]\n"
+ "fmla v20.4s, v10.4s, v2.s[0]\n"
+ "fmla v21.4s, v10.4s, v2.s[1]\n"
+ "fmin v18.4s, v18.4s, v11.4s\n"
+ "ldr x26, [%x[outptrs], #0x8]\n"
+ "fmla v22.4s, v10.4s, v2.s[2]\n"
+ "fmla v23.4s, v10.4s, v2.s[3]\n"
+ "fmin v19.4s, v19.4s, v11.4s\n"
+ "ldr x25, [%x[outptrs], #0x10]\n"
+ "fmla v24.4s, v10.4s, v1.s[0]\n"
+ "fmla v25.4s, v10.4s, v1.s[1]\n"
+ "fmin v20.4s, v20.4s, v11.4s\n"
+ "ldr x24, [%x[outptrs], #0x18]\n"
+ "fmla v26.4s, v10.4s, v1.s[2]\n"
+ "fmla v27.4s, v10.4s, v1.s[3]\n"
+ "fmin v21.4s, v21.4s, v11.4s\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "fmla v28.4s, v10.4s, v0.s[0]\n"
+ "fmla v29.4s, v10.4s, v0.s[1]\n"
+ "fmin v22.4s, v22.4s, v11.4s\n"
+ "ldr x22, [%x[outptrs], #0x28]\n"
+ "fmla v30.4s, v10.4s, v0.s[2]\n"
+ "fmla v31.4s, v10.4s, v0.s[3]\n"
+ "fmin v23.4s, v23.4s, v11.4s\n"
+ "ldr x21, [%x[outptrs], #0x30]\n"
+ "ldr x20, [%x[outptrs], #0x38]\n"
+ "fmax v16.4s, v16.4s, v12.4s\n"
+ "fmax v17.4s, v17.4s, v12.4s\n"
+ "str q16, [x27, x28]\n"
+ "fmax v18.4s, v18.4s, v12.4s\n"
+ "fmax v19.4s, v19.4s, v12.4s\n"
+ "str q17, [x26, x28]\n"
+ "ldr x27, [%x[outptrs], #0x40]\n"
+ "fmax v20.4s, v20.4s, v12.4s\n"
+ "fmax v21.4s, v21.4s, v12.4s\n"
+ "str q18, [x25, x28]\n"
+ "ldr x26, [%x[outptrs], #0x48]\n"
+ "fmax v22.4s, v22.4s, v12.4s\n"
+ "fmax v23.4s, v23.4s, v12.4s\n"
+ "str q19, [x24, x28]\n"
+ "ldr x25, [%x[outptrs], #0x50]\n"
+ "fmin v24.4s, v24.4s, v11.4s\n"
+ "fmin v25.4s, v25.4s, v11.4s\n"
+ "str q20, [x23, x28]\n"
+ "ldr x24, [%x[outptrs], #0x58]\n"
+ "fmin v26.4s, v26.4s, v11.4s\n"
+ "fmin v27.4s, v27.4s, v11.4s\n"
+ "str q21, [x22, x28]\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
+ "fmin v28.4s, v28.4s, v11.4s\n"
+ "fmin v29.4s, v29.4s, v11.4s\n"
+ "str q22, [x21, x28]\n"
+ "ldr x22, [%x[outptrs], #0x68]\n"
+ "fmin v30.4s, v30.4s, v11.4s\n"
+ "fmin v31.4s, v31.4s, v11.4s\n"
+ "str q23, [x20, x28]\n"
+ "ldr x21, [%x[outptrs], #0x70]\n"
+ "ldr x20, [%x[outptrs], #0x78]\n"
+ "fmax v24.4s, v24.4s, v12.4s\n"
+ "fmax v25.4s, v25.4s, v12.4s\n"
+ "str q24, [x27, x28]\n"
+ "fmax v26.4s, v26.4s, v12.4s\n"
+ "fmax v27.4s, v27.4s, v12.4s\n"
+ "str q25, [x26, x28]\n"
+ "fmax v28.4s, v28.4s, v12.4s\n"
+ "fmax v29.4s, v29.4s, v12.4s\n"
+ "str q26, [x25, x28]\n"
+ "fmax v30.4s, v30.4s, v12.4s\n"
+ "fmax v31.4s, v31.4s, v12.4s\n"
+ "str q27, [x24, x28]\n"
+ "str q28, [x23, x28]\n"
+ "str q29, [x22, x28]\n"
+ "str q30, [x21, x28]\n"
+ "str q31, [x20, x28]\n"
+ "7:" // Output channel loop: Done
+ "add x10, x10, #0x4\n"
+ "cmp x10, x11, LSL #2\n"
+ "blt 1b\n"
+ "tst %x[n_output_channels], #0x3\n"
+ "beq 19f\n"
+ "8:" // Output channel oddments
+ "movi v31.16b, #0x0\n"
+ "cbz %x[bias], 11f\n"
+ "add x20, %x[bias], x10, LSL #2\n"
+ "tbz %x[n_output_channels], #1, 9f\n"
+ "ld1 { v31.d }[0], [x20], #0x8\n"
+ "tbz %x[n_output_channels], #0, 10f\n"
+ "ld1 { v31.s }[2], [x20]\n"
+ "b 10f\n"
+ "9:" // Output channel oddments: Load bias: Bit 1: Unset
+ "ld1 { v31.s }[0], [x20]\n"
+ "10:" // Output channel oddments: Load bias: Bit 1: End
+ "11:" // Output channel oddments: Load bias: Done
+ "ldr q10, [%x[weights], #0x0]\n"
+ "mov x22, %x[inptrs]\n"
+ "ldp x21, x20, [x22], #0x10\n"
+ "lsr x23, %x[kernel_points], #0x1\n"
+ "ldr q3, [x21, #0x0]\n"
+ "ldr q2, [x21, #0x10]\n"
+ "mov v16.16b, v31.16b\n"
+ "mov v17.16b, v31.16b\n"
+ "ldr q1, [x20, #0x0]\n"
+ "ldr q0, [x20, #0x10]\n"
+ "mov v18.16b, v31.16b\n"
+ "mov v19.16b, v31.16b\n"
+ "mov v20.16b, v31.16b\n"
+ "mov v21.16b, v31.16b\n"
+ "add %x[weights], %x[weights], #0x10\n"
+ "mov v22.16b, v31.16b\n"
+ "mov v23.16b, v31.16b\n"
+ "mov v24.16b, v31.16b\n"
+ "mov v25.16b, v31.16b\n"
+ "mov v26.16b, v31.16b\n"
+ "mov v27.16b, v31.16b\n"
+ "mov v28.16b, v31.16b\n"
+ "mov v29.16b, v31.16b\n"
+ "mov v30.16b, v31.16b\n"
+ "mov v31.16b, v31.16b\n"
+ "cbz x23, 15f\n"
+ "ldr q9, [%x[weights], #0x0]\n"
+ "ldp x21, x20, [x22], #0x10\n"
+ "subs x23, x23, #0x1\n"
+ "add %x[weights], %x[weights], #0x10\n"
+ "ldr q8, [x21, #0x0]\n"
+ "ldr q7, [x21, #0x10]\n"
+ "ldr q6, [x20, #0x0]\n"
+ "ldr q5, [x20, #0x10]\n"
+ "beq 13f\n"
+ "12:" // Output channel oddments: Kernel loop
+ "ldp x21, x20, [x22], #0x10\n"
+ "fmla v16.4s, v10.4s, v3.s[0]\n"
+ "fmla v17.4s, v10.4s, v3.s[1]\n"
+ "subs x23, x23, #0x1\n"
+ "fmla v18.4s, v10.4s, v3.s[2]\n"
+ "fmla v19.4s, v10.4s, v3.s[3]\n"
+ "ldr q3, [x21, #0x0]\n"
+ "fmla v20.4s, v10.4s, v2.s[0]\n"
+ "fmla v21.4s, v10.4s, v2.s[1]\n"
+ "fmla v22.4s, v10.4s, v2.s[2]\n"
+ "fmla v23.4s, v10.4s, v2.s[3]\n"
+ "ldr q2, [x21, #0x10]\n"
+ "fmla v24.4s, v10.4s, v1.s[0]\n"
+ "fmla v25.4s, v10.4s, v1.s[1]\n"
+ "fmla v26.4s, v10.4s, v1.s[2]\n"
+ "fmla v27.4s, v10.4s, v1.s[3]\n"
+ "ldr q1, [x20, #0x0]\n"
+ "fmla v28.4s, v10.4s, v0.s[0]\n"
+ "fmla v29.4s, v10.4s, v0.s[1]\n"
+ "fmla v30.4s, v10.4s, v0.s[2]\n"
+ "fmla v31.4s, v10.4s, v0.s[3]\n"
+ "ldr q0, [x20, #0x10]\n"
+ "ldr q10, [%x[weights], #0x0]\n"
+ "ldp x21, x20, [x22], #0x10\n"
+ "fmla v16.4s, v9.4s, v8.s[0]\n"
+ "fmla v17.4s, v9.4s, v8.s[1]\n"
+ "fmla v18.4s, v9.4s, v8.s[2]\n"
+ "fmla v19.4s, v9.4s, v8.s[3]\n"
+ "ldr q8, [x21, #0x0]\n"
+ "fmla v20.4s, v9.4s, v7.s[0]\n"
+ "fmla v21.4s, v9.4s, v7.s[1]\n"
+ "fmla v22.4s, v9.4s, v7.s[2]\n"
+ "fmla v23.4s, v9.4s, v7.s[3]\n"
+ "ldr q7, [x21, #0x10]\n"
+ "fmla v24.4s, v9.4s, v6.s[0]\n"
+ "fmla v25.4s, v9.4s, v6.s[1]\n"
+ "fmla v26.4s, v9.4s, v6.s[2]\n"
+ "fmla v27.4s, v9.4s, v6.s[3]\n"
+ "ldr q6, [x20, #0x0]\n"
+ "fmla v28.4s, v9.4s, v5.s[0]\n"
+ "fmla v29.4s, v9.4s, v5.s[1]\n"
+ "fmla v30.4s, v9.4s, v5.s[2]\n"
+ "fmla v31.4s, v9.4s, v5.s[3]\n"
+ "ldr q5, [x20, #0x10]\n"
+ "ldr q9, [%x[weights], #0x10]\n"
+ "add %x[weights], %x[weights], #0x20\n"
+ "bgt 12b\n"
+ "13:" // Output channel oddments: Kernel loop tail
+ "tbnz %x[kernel_points], #0, 14f\n"
+ "fmla v16.4s, v10.4s, v3.s[0]\n"
+ "fmla v17.4s, v10.4s, v3.s[1]\n"
+ "fmla v18.4s, v10.4s, v3.s[2]\n"
+ "fmla v19.4s, v10.4s, v3.s[3]\n"
+ "fmla v20.4s, v10.4s, v2.s[0]\n"
+ "fmla v21.4s, v10.4s, v2.s[1]\n"
+ "fmla v22.4s, v10.4s, v2.s[2]\n"
+ "fmla v23.4s, v10.4s, v2.s[3]\n"
+ "fmla v24.4s, v10.4s, v1.s[0]\n"
+ "fmla v25.4s, v10.4s, v1.s[1]\n"
+ "fmla v26.4s, v10.4s, v1.s[2]\n"
+ "fmla v27.4s, v10.4s, v1.s[3]\n"
+ "fmla v28.4s, v10.4s, v0.s[0]\n"
+ "fmla v29.4s, v10.4s, v0.s[1]\n"
+ "fmla v30.4s, v10.4s, v0.s[2]\n"
+ "fmla v31.4s, v10.4s, v0.s[3]\n"
+ "fmla v16.4s, v9.4s, v8.s[0]\n"
+ "fmla v17.4s, v9.4s, v8.s[1]\n"
+ "fmla v18.4s, v9.4s, v8.s[2]\n"
+ "fmla v19.4s, v9.4s, v8.s[3]\n"
+ "fmla v20.4s, v9.4s, v7.s[0]\n"
+ "fmla v21.4s, v9.4s, v7.s[1]\n"
+ "fmla v22.4s, v9.4s, v7.s[2]\n"
+ "fmla v23.4s, v9.4s, v7.s[3]\n"
+ "fmla v24.4s, v9.4s, v6.s[0]\n"
+ "fmla v25.4s, v9.4s, v6.s[1]\n"
+ "fmla v26.4s, v9.4s, v6.s[2]\n"
+ "fmla v27.4s, v9.4s, v6.s[3]\n"
+ "fmla v28.4s, v9.4s, v5.s[0]\n"
+ "fmla v29.4s, v9.4s, v5.s[1]\n"
+ "fmla v30.4s, v9.4s, v5.s[2]\n"
+ "fmla v31.4s, v9.4s, v5.s[3]\n"
+ "b 16f\n"
+ "14:" // Output channel oddments: Odd tail
+ "fmla v16.4s, v10.4s, v3.s[0]\n"
+ "fmla v17.4s, v10.4s, v3.s[1]\n"
+ "ldp x21, x20, [x22], #0x10\n"
+ "fmla v18.4s, v10.4s, v3.s[2]\n"
+ "fmla v19.4s, v10.4s, v3.s[3]\n"
+ "ldr q4, [x21, #0x0]\n"
+ "fmla v20.4s, v10.4s, v2.s[0]\n"
+ "fmla v21.4s, v10.4s, v2.s[1]\n"
+ "fmla v22.4s, v10.4s, v2.s[2]\n"
+ "fmla v23.4s, v10.4s, v2.s[3]\n"
+ "ldr q3, [x21, #0x10]\n"
+ "fmla v24.4s, v10.4s, v1.s[0]\n"
+ "fmla v25.4s, v10.4s, v1.s[1]\n"
+ "fmla v26.4s, v10.4s, v1.s[2]\n"
+ "fmla v27.4s, v10.4s, v1.s[3]\n"
+ "ldr q2, [x20, #0x0]\n"
+ "fmla v28.4s, v10.4s, v0.s[0]\n"
+ "fmla v29.4s, v10.4s, v0.s[1]\n"
+ "fmla v30.4s, v10.4s, v0.s[2]\n"
+ "fmla v31.4s, v10.4s, v0.s[3]\n"
+ "ldr q1, [x20, #0x10]\n"
+ "ldr q0, [%x[weights], #0x0]\n"
+ "fmla v16.4s, v9.4s, v8.s[0]\n"
+ "fmla v17.4s, v9.4s, v8.s[1]\n"
+ "add %x[weights], %x[weights], #0x10\n"
+ "fmla v18.4s, v9.4s, v8.s[2]\n"
+ "fmla v19.4s, v9.4s, v8.s[3]\n"
+ "fmla v20.4s, v9.4s, v7.s[0]\n"
+ "fmla v21.4s, v9.4s, v7.s[1]\n"
+ "fmla v22.4s, v9.4s, v7.s[2]\n"
+ "fmla v23.4s, v9.4s, v7.s[3]\n"
+ "fmla v24.4s, v9.4s, v6.s[0]\n"
+ "fmla v25.4s, v9.4s, v6.s[1]\n"
+ "fmla v26.4s, v9.4s, v6.s[2]\n"
+ "fmla v27.4s, v9.4s, v6.s[3]\n"
+ "fmla v28.4s, v9.4s, v5.s[0]\n"
+ "fmla v29.4s, v9.4s, v5.s[1]\n"
+ "fmla v30.4s, v9.4s, v5.s[2]\n"
+ "fmla v31.4s, v9.4s, v5.s[3]\n"
+ "fmla v16.4s, v0.4s, v4.s[0]\n"
+ "fmla v17.4s, v0.4s, v4.s[1]\n"
+ "fmla v18.4s, v0.4s, v4.s[2]\n"
+ "fmla v19.4s, v0.4s, v4.s[3]\n"
+ "fmla v20.4s, v0.4s, v3.s[0]\n"
+ "fmla v21.4s, v0.4s, v3.s[1]\n"
+ "fmla v22.4s, v0.4s, v3.s[2]\n"
+ "fmla v23.4s, v0.4s, v3.s[3]\n"
+ "fmla v24.4s, v0.4s, v2.s[0]\n"
+ "fmla v25.4s, v0.4s, v2.s[1]\n"
+ "fmla v26.4s, v0.4s, v2.s[2]\n"
+ "fmla v27.4s, v0.4s, v2.s[3]\n"
+ "fmla v28.4s, v0.4s, v1.s[0]\n"
+ "fmla v29.4s, v0.4s, v1.s[1]\n"
+ "fmla v30.4s, v0.4s, v1.s[2]\n"
+ "fmla v31.4s, v0.4s, v1.s[3]\n"
+ "b 16f\n"
+ "15:" // Output channel oddments: Single kernel point
+ "fmla v16.4s, v10.4s, v3.s[0]\n"
+ "fmla v17.4s, v10.4s, v3.s[1]\n"
+ "fmla v18.4s, v10.4s, v3.s[2]\n"
+ "fmla v19.4s, v10.4s, v3.s[3]\n"
+ "fmla v20.4s, v10.4s, v2.s[0]\n"
+ "fmla v21.4s, v10.4s, v2.s[1]\n"
+ "fmla v22.4s, v10.4s, v2.s[2]\n"
+ "fmla v23.4s, v10.4s, v2.s[3]\n"
+ "fmla v24.4s, v10.4s, v1.s[0]\n"
+ "fmla v25.4s, v10.4s, v1.s[1]\n"
+ "fmla v26.4s, v10.4s, v1.s[2]\n"
+ "fmla v27.4s, v10.4s, v1.s[3]\n"
+ "fmla v28.4s, v10.4s, v0.s[0]\n"
+ "fmla v29.4s, v10.4s, v0.s[1]\n"
+ "fmla v30.4s, v10.4s, v0.s[2]\n"
+ "fmla v31.4s, v10.4s, v0.s[3]\n"
+ "16:" // Output channel oddments: Done
+ "fmin v16.4s, v16.4s, v11.4s\n"
+ "fmin v17.4s, v17.4s, v11.4s\n"
+ "fmin v18.4s, v18.4s, v11.4s\n"
+ "fmin v19.4s, v19.4s, v11.4s\n"
+ "fmin v20.4s, v20.4s, v11.4s\n"
+ "fmin v21.4s, v21.4s, v11.4s\n"
+ "fmin v22.4s, v22.4s, v11.4s\n"
+ "fmin v23.4s, v23.4s, v11.4s\n"
+ "fmin v24.4s, v24.4s, v11.4s\n"
+ "fmin v25.4s, v25.4s, v11.4s\n"
+ "fmin v26.4s, v26.4s, v11.4s\n"
+ "fmin v27.4s, v27.4s, v11.4s\n"
+ "fmin v28.4s, v28.4s, v11.4s\n"
+ "fmin v29.4s, v29.4s, v11.4s\n"
+ "fmin v30.4s, v30.4s, v11.4s\n"
+ "fmin v31.4s, v31.4s, v11.4s\n"
+ "fmax v16.4s, v16.4s, v12.4s\n"
+ "fmax v17.4s, v17.4s, v12.4s\n"
+ "fmax v18.4s, v18.4s, v12.4s\n"
+ "fmax v19.4s, v19.4s, v12.4s\n"
+ "fmax v20.4s, v20.4s, v12.4s\n"
+ "fmax v21.4s, v21.4s, v12.4s\n"
+ "fmax v22.4s, v22.4s, v12.4s\n"
+ "fmax v23.4s, v23.4s, v12.4s\n"
+ "fmax v24.4s, v24.4s, v12.4s\n"
+ "fmax v25.4s, v25.4s, v12.4s\n"
+ "fmax v26.4s, v26.4s, v12.4s\n"
+ "fmax v27.4s, v27.4s, v12.4s\n"
+ "fmax v28.4s, v28.4s, v12.4s\n"
+ "fmax v29.4s, v29.4s, v12.4s\n"
+ "fmax v30.4s, v30.4s, v12.4s\n"
+ "fmax v31.4s, v31.4s, v12.4s\n"
+ "tbz %x[n_output_channels], #1, 17f\n"
+ "ldr x27, [%x[outptrs], #0x0]\n"
+ "ldr x26, [%x[outptrs], #0x8]\n"
+ "add x27, x27, x10, LSL #2\n"
+ "add x26, x26, x10, LSL #2\n"
+ "ldr x25, [%x[outptrs], #0x10]\n"
+ "ldr x24, [%x[outptrs], #0x18]\n"
+ "add x25, x25, x10, LSL #2\n"
+ "add x24, x24, x10, LSL #2\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "ldr x22, [%x[outptrs], #0x28]\n"
+ "add x23, x23, x10, LSL #2\n"
+ "add x22, x22, x10, LSL #2\n"
+ "ldr x21, [%x[outptrs], #0x30]\n"
+ "ldr x20, [%x[outptrs], #0x38]\n"
+ "add x21, x21, x10, LSL #2\n"
+ "add x20, x20, x10, LSL #2\n"
+ "st1 { v16.d }[0], [x27]\n"
+ "ldr x27, [%x[outptrs], #0x40]\n"
+ "add x27, x27, x10, LSL #2\n"
+ "st1 { v17.d }[0], [x26]\n"
+ "ldr x26, [%x[outptrs], #0x48]\n"
+ "add x26, x26, x10, LSL #2\n"
+ "st1 { v18.d }[0], [x25]\n"
+ "ldr x25, [%x[outptrs], #0x50]\n"
+ "add x25, x25, x10, LSL #2\n"
+ "st1 { v19.d }[0], [x24]\n"
+ "ldr x24, [%x[outptrs], #0x58]\n"
+ "add x24, x24, x10, LSL #2\n"
+ "st1 { v20.d }[0], [x23]\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
+ "add x23, x23, x10, LSL #2\n"
+ "st1 { v21.d }[0], [x22]\n"
+ "ldr x22, [%x[outptrs], #0x68]\n"
+ "add x22, x22, x10, LSL #2\n"
+ "st1 { v22.d }[0], [x21]\n"
+ "ldr x21, [%x[outptrs], #0x70]\n"
+ "add x21, x21, x10, LSL #2\n"
+ "st1 { v23.d }[0], [x20]\n"
+ "ldr x20, [%x[outptrs], #0x78]\n"
+ "add x20, x20, x10, LSL #2\n"
+ "add x10, x10, #0x2\n"
+ "st1 { v24.d }[0], [x27]\n"
+ "st1 { v25.d }[0], [x26]\n"
+ "st1 { v26.d }[0], [x25]\n"
+ "st1 { v27.d }[0], [x24]\n"
+ "st1 { v28.d }[0], [x23]\n"
+ "st1 { v29.d }[0], [x22]\n"
+ "st1 { v30.d }[0], [x21]\n"
+ "st1 { v31.d }[0], [x20]\n"
+ "tbz %x[n_output_channels], #0, 18f\n"
+ "ldr x27, [%x[outptrs], #0x0]\n"
+ "ldr x26, [%x[outptrs], #0x8]\n"
+ "add x27, x27, x10, LSL #2\n"
+ "add x26, x26, x10, LSL #2\n"
+ "ldr x25, [%x[outptrs], #0x10]\n"
+ "ldr x24, [%x[outptrs], #0x18]\n"
+ "add x25, x25, x10, LSL #2\n"
+ "add x24, x24, x10, LSL #2\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "ldr x22, [%x[outptrs], #0x28]\n"
+ "add x23, x23, x10, LSL #2\n"
+ "add x22, x22, x10, LSL #2\n"
+ "ldr x21, [%x[outptrs], #0x30]\n"
+ "ldr x20, [%x[outptrs], #0x38]\n"
+ "add x21, x21, x10, LSL #2\n"
+ "add x20, x20, x10, LSL #2\n"
+ "st1 { v16.s }[2], [x27]\n"
+ "ldr x27, [%x[outptrs], #0x40]\n"
+ "add x27, x27, x10, LSL #2\n"
+ "st1 { v17.s }[2], [x26]\n"
+ "ldr x26, [%x[outptrs], #0x48]\n"
+ "add x26, x26, x10, LSL #2\n"
+ "st1 { v18.s }[2], [x25]\n"
+ "ldr x25, [%x[outptrs], #0x50]\n"
+ "add x25, x25, x10, LSL #2\n"
+ "st1 { v19.s }[2], [x24]\n"
+ "ldr x24, [%x[outptrs], #0x58]\n"
+ "add x24, x24, x10, LSL #2\n"
+ "st1 { v20.s }[2], [x23]\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
+ "add x23, x23, x10, LSL #2\n"
+ "st1 { v21.s }[2], [x22]\n"
+ "ldr x22, [%x[outptrs], #0x68]\n"
+ "add x22, x22, x10, LSL #2\n"
+ "st1 { v22.s }[2], [x21]\n"
+ "ldr x21, [%x[outptrs], #0x70]\n"
+ "add x21, x21, x10, LSL #2\n"
+ "st1 { v23.s }[2], [x20]\n"
+ "ldr x20, [%x[outptrs], #0x78]\n"
+ "add x20, x20, x10, LSL #2\n"
+ "st1 { v24.s }[2], [x27]\n"
+ "st1 { v25.s }[2], [x26]\n"
+ "st1 { v26.s }[2], [x25]\n"
+ "st1 { v27.s }[2], [x24]\n"
+ "st1 { v28.s }[2], [x23]\n"
+ "st1 { v29.s }[2], [x22]\n"
+ "st1 { v30.s }[2], [x21]\n"
+ "st1 { v31.s }[2], [x20]\n"
+ "b 18f\n"
+ "17:" // Output channel oddments: Done: Store: Bit 1: Unset
+ "ldr x27, [%x[outptrs], #0x0]\n"
+ "ldr x26, [%x[outptrs], #0x8]\n"
+ "add x27, x27, x10, LSL #2\n"
+ "add x26, x26, x10, LSL #2\n"
+ "ldr x25, [%x[outptrs], #0x10]\n"
+ "ldr x24, [%x[outptrs], #0x18]\n"
+ "add x25, x25, x10, LSL #2\n"
+ "add x24, x24, x10, LSL #2\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "ldr x22, [%x[outptrs], #0x28]\n"
+ "add x23, x23, x10, LSL #2\n"
+ "add x22, x22, x10, LSL #2\n"
+ "ldr x21, [%x[outptrs], #0x30]\n"
+ "ldr x20, [%x[outptrs], #0x38]\n"
+ "add x21, x21, x10, LSL #2\n"
+ "add x20, x20, x10, LSL #2\n"
+ "st1 { v16.s }[0], [x27]\n"
+ "ldr x27, [%x[outptrs], #0x40]\n"
+ "add x27, x27, x10, LSL #2\n"
+ "st1 { v17.s }[0], [x26]\n"
+ "ldr x26, [%x[outptrs], #0x48]\n"
+ "add x26, x26, x10, LSL #2\n"
+ "st1 { v18.s }[0], [x25]\n"
+ "ldr x25, [%x[outptrs], #0x50]\n"
+ "add x25, x25, x10, LSL #2\n"
+ "st1 { v19.s }[0], [x24]\n"
+ "ldr x24, [%x[outptrs], #0x58]\n"
+ "add x24, x24, x10, LSL #2\n"
+ "st1 { v20.s }[0], [x23]\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
+ "add x23, x23, x10, LSL #2\n"
+ "st1 { v21.s }[0], [x22]\n"
+ "ldr x22, [%x[outptrs], #0x68]\n"
+ "add x22, x22, x10, LSL #2\n"
+ "st1 { v22.s }[0], [x21]\n"
+ "ldr x21, [%x[outptrs], #0x70]\n"
+ "add x21, x21, x10, LSL #2\n"
+ "st1 { v23.s }[0], [x20]\n"
+ "ldr x20, [%x[outptrs], #0x78]\n"
+ "add x20, x20, x10, LSL #2\n"
+ "st1 { v24.s }[0], [x27]\n"
+ "st1 { v25.s }[0], [x26]\n"
+ "st1 { v26.s }[0], [x25]\n"
+ "st1 { v27.s }[0], [x24]\n"
+ "st1 { v28.s }[0], [x23]\n"
+ "st1 { v29.s }[0], [x22]\n"
+ "st1 { v30.s }[0], [x21]\n"
+ "st1 { v31.s }[0], [x20]\n"
+ "18:" // Output channel oddments: Done: Store: Bit 1: End
+ "19:" // Done
+ : [weights] "+&r" (weights)
+ : [bias] "r" (bias), [inptrs] "r" (inptrs), [kernel_points] "r" ((uint64_t) kernel_points), [minmax_vals] "r" (minmax_vals), [n_output_channels] "r" ((uint64_t) n_output_channels), [outptrs] "r" (outptrs)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp
new file mode 100644
index 0000000000..e51031ccdb
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp
@@ -0,0 +1,76 @@
+/*
+ * Copyright (c) 2021-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "utils.hpp"
+#include "src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst_impl(const unsigned int, const int8_t *const *const, const int8_t *, const int32_t *, const arm_gemm::Requantize32&, const int32_t *, const int32_t *, int8_t *const *const);
+
+// Strategy wrapper binding the hand-written AArch64 kernel
+// a64_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst_impl (declared above) into the
+// depthfirst depthwise framework for signed 8-bit quantized (s8q) NHWC data.
+class a64_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst : public DepthwiseDepthfirstStrategy<int8_t, int8_t, int8_t, int32_t>
+{
+  using Parent = DepthwiseDepthfirstStrategy<int8_t, int8_t, int8_t, int32_t>;
+
+  public:
+  // 3x3 kernel applied with unit stride, as encoded in the class name.
+  constexpr static unsigned int kernel_rows = 3;
+  constexpr static unsigned int kernel_cols = 3;
+
+  constexpr static unsigned int stride_rows = 1;
+  constexpr static unsigned int stride_cols = 1;
+
+  // Parent(2, 2, 3, 3, 1, 1): 2x2 output tile, 3x3 kernel, 1x1 stride.
+  // The CPUInfo argument is unused; kernel selection happens elsewhere.
+  a64_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst(const CPUInfo *) : Parent(2, 2, 3, 3, 1, 1) {}
+
+  // VLType::None — this kernel uses fixed-width (non-scalable) vectors.
+  arm_gemm::VLType get_vl_type(void) const override { return arm_gemm::VLType::None; }
+
+  Parent::KernelType kernel = a64_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst_impl;
+  Parent::KernelType get_kernel(void) const override { return kernel; }
+  // Size (in bytes) of the buffer needed to hold the repacked weights/biases,
+  // as computed by the matching interleave routine.
+  size_t get_storage_size(const DepthwiseArgs &args) const override
+  {
+    return interleave_a64_s8q_3x3_dot::get_packed_size(args);
+  }
+
+  // Repack weights and biases into the layout the assembly kernel expects.
+  // @param args          Depthwise parameters; total channels is
+  //                      input_channels * channel_multiplier.
+  // @param buffer        Destination buffer of at least get_storage_size(args) bytes.
+  // @param biases        Optional int32 bias array (passed through to the packer).
+  // @param qp            Requantization parameters for the s8q kernel.
+  // @param weights       Source weights, int8, with the given column/row strides.
+  // @param ld_weight_col Stride (bytes) between weight columns.
+  // @param ld_weight_row Stride (bytes) between weight rows.
+  void pack_parameters(
+    const DepthwiseArgs &args, void *buffer, const void *biases, const arm_gemm::Requantize32 &qp,
+    const void *weights, size_t ld_weight_col, size_t ld_weight_row
+  ) const override
+  {
+    interleave_a64_s8q_3x3_dot::pack_parameters(
+      args.input_channels * args.channel_multiplier, buffer, reinterpret_cast<const int32_t *>(biases),
+      reinterpret_cast<const int8_t *>(weights), qp, ld_weight_col, ld_weight_row
+    );
+  }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp
new file mode 100644
index 0000000000..916c8a4afe
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp
@@ -0,0 +1,1658 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if defined(__aarch64__)
+
+#include "arm_gemm.hpp"
+#include <cstdint>
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst_impl(const unsigned int n_channels, const int8_t *const *const inptrs, const int8_t *params, const int32_t *, const arm_gemm::Requantize32& qp, const int32_t *, const int32_t *, int8_t *const *const outptrs)
+{
+ __asm__ __volatile__(
+ "mov x20, #0x1\n"
+ "orr x20, x20, #0x100\n"
+ "ldp x15, x14, [%x[inptrs], #0x0]\n"
+ "ldp x13, x12, [%x[inptrs], #0x10]\n"
+ "orr x20, x20, #0x10000\n"
+ "lsr x11, %x[n_channels], #0x4\n"
+ "dup v12.4s, w20\n"
+ "ldp x10, x9, [%x[inptrs], #0x20]\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_minval]\n"
+ "ld1r { v13.4s }, [x20]\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_maxval]\n"
+ "ld1r { v11.4s }, [x20]\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_b_offset]\n"
+ "ld1r { v16.4s }, [x20]\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_c_offset]\n"
+ "ld1r { v14.4s }, [x20]\n"
+ "mov x28, #0x0\n"
+ "mov x27, #0x0\n"
+ "ldp x26, x21, [%x[inptrs], #0x30]\n"
+ "ldp x25, x24, [%x[outptrs], #0x0]\n"
+ "ldp x23, x22, [%x[outptrs], #0x10]\n"
+ "cbz x11, 3f\n"
+ "ldr q15, [x15, x28]\n"
+ "ldr q28, [x14, x28]\n"
+ "subs x11, x11, #0x1\n"
+ "ldr q30, [x13, x28]\n"
+ "ldr q8, [x12, x28]\n"
+ "zip2 v19.16b, v15.16b, v30.16b\n"
+ "zip1 v15.16b, v15.16b, v30.16b\n"
+ "ldr q26, [x10, x28]\n"
+ "ldr q0, [x9, x28]\n"
+ "zip1 v7.16b, v28.16b, v8.16b\n"
+ "zip2 v8.16b, v28.16b, v8.16b\n"
+ "ldr q29, [x26, x28]\n"
+ "ldr q10, [x21, x28]\n"
+ "zip2 v25.16b, v15.16b, v7.16b\n"
+ "zip1 v15.16b, v15.16b, v7.16b\n"
+ "ldr q1, [%x[params], #0x10]\n"
+ "ldr q6, [%x[params], #0x20]\n"
+ "zip1 v7.16b, v19.16b, v8.16b\n"
+ "zip2 v8.16b, v19.16b, v8.16b\n"
+ "ldr q31, [%x[params], #0x0]\n"
+ "ldr q20, [%x[params], #0x30]\n"
+ "zip2 v21.16b, v26.16b, v29.16b\n"
+ "zip1 v26.16b, v26.16b, v29.16b\n"
+ "ldp x21, x20, [%x[inptrs], #0x40]\n"
+ "ldr q22, [x21, x28]\n"
+ "zip1 v27.16b, v0.16b, v10.16b\n"
+ "zip2 v10.16b, v0.16b, v10.16b\n"
+ "ldr q17, [x20, x28]\n"
+ "ldp x21, x20, [%x[inptrs], #0x50]\n"
+ "zip2 v23.16b, v26.16b, v27.16b\n"
+ "zip1 v26.16b, v26.16b, v27.16b\n"
+ "ldr q9, [x21, x28]\n"
+ "ldr q5, [x20, x28]\n"
+ "zip2 v28.16b, v22.16b, v9.16b\n"
+ "zip1 v22.16b, v22.16b, v9.16b\n"
+ "ldp x21, x20, [%x[inptrs], #0x60]\n"
+ "ldr q27, [x21, x28]\n"
+ "zip1 v24.16b, v17.16b, v5.16b\n"
+ "zip2 v5.16b, v17.16b, v5.16b\n"
+ "ldr q18, [x20, x28]\n"
+ "ldp x21, x20, [%x[inptrs], #0x70]\n"
+ "zip1 v3.16b, v21.16b, v10.16b\n"
+ "zip2 v10.16b, v21.16b, v10.16b\n"
+ "ldr q4, [x21, x28]\n"
+ "ldr q9, [x20, x28]\n"
+ "zip2 v17.16b, v27.16b, v4.16b\n"
+ "zip1 v27.16b, v27.16b, v4.16b\n"
+ "zip1 v4.16b, v18.16b, v9.16b\n"
+ "zip2 v9.16b, v18.16b, v9.16b\n"
+ "ldp x15, x14, [%x[inptrs], #0x0]\n"
+ "ldp x13, x12, [%x[inptrs], #0x10]\n"
+ "ldp x10, x9, [%x[inptrs], #0x20]\n"
+ "ldp x26, x21, [%x[inptrs], #0x30]\n"
+ "zip2 v19.16b, v22.16b, v24.16b\n"
+ "zip1 v22.16b, v22.16b, v24.16b\n"
+ "zip1 v0.16b, v28.16b, v5.16b\n"
+ "zip2 v5.16b, v28.16b, v5.16b\n"
+ "add %x[params], %x[params], #0x40\n"
+ "zip2 v24.16b, v27.16b, v4.16b\n"
+ "zip1 v27.16b, v27.16b, v4.16b\n"
+ "zip1 v2.16b, v17.16b, v9.16b\n"
+ "zip2 v9.16b, v17.16b, v9.16b\n"
+ "mov v30.16b, v31.16b\n"
+ "mov v29.16b, v31.16b\n"
+ "mov v28.16b, v31.16b\n"
+ "beq 2f\n"
+ "1:" // Loop
+ "movi v21.4s, #0x0\n"
+ ".inst 0x4e9a9595 // sdot v21.4s, v12.16b, v26.16b\n"
+ ".inst 0x4e8f943f // sdot v31.4s, v1.16b, v15.16b\n"
+ "add x28, x28, #0x10\n"
+ ".inst 0x4e969595 // sdot v21.4s, v12.16b, v22.16b\n"
+ ".inst 0x4e9a943d // sdot v29.4s, v1.16b, v26.16b\n"
+ "movi v18.4s, #0x0\n"
+ "subs x11, x11, #0x1\n"
+ ".inst 0x4e9a94df // sdot v31.4s, v6.16b, v26.16b\n"
+ "ext v26.16b, v26.16b, v26.16b, #0x1\n"
+ "mov v17.16b, v21.16b\n .inst 0x4e9b9591 // sdot v17.4s, v12.16b, v27.16b\n"
+ ".inst 0x4e8f9595 // sdot v21.4s, v12.16b, v15.16b\n"
+ "ext v15.16b, v15.16b, v15.16b, #0x1\n"
+ ".inst 0x4e9a9592 // sdot v18.4s, v12.16b, v26.16b\n"
+ ".inst 0x4e9694dd // sdot v29.4s, v6.16b, v22.16b\n"
+ ".inst 0x4e96969f // sdot v31.4s, v20.16b, v22.16b\n"
+ "ext v22.16b, v22.16b, v22.16b, #0x1\n"
+ ".inst 0x4e8f943e // sdot v30.4s, v1.16b, v15.16b\n"
+ ".inst 0x4e9a943c // sdot v28.4s, v1.16b, v26.16b\n"
+ "mls v31.4s, v21.4s, v16.4s\n"
+ ".inst 0x4e969592 // sdot v18.4s, v12.16b, v22.16b\n"
+ ".inst 0x4e9b969d // sdot v29.4s, v20.16b, v27.16b\n"
+ "ext v27.16b, v27.16b, v27.16b, #0x1\n"
+ ".inst 0x4e9a94de // sdot v30.4s, v6.16b, v26.16b\n"
+ "ldr q26, [%x[params], #0x10]\n"
+ ".inst 0x4e9694dc // sdot v28.4s, v6.16b, v22.16b\n"
+ "mls v29.4s, v17.4s, v16.4s\n"
+ "mov v21.16b, v18.16b\n .inst 0x4e9b9595 // sdot v21.4s, v12.16b, v27.16b\n"
+ ".inst 0x4e8f9592 // sdot v18.4s, v12.16b, v15.16b\n"
+ "ldr q17, [%x[params], #0x0]\n"
+ "sqrdmulh v31.4s, v31.4s, v17.4s\n"
+ ".inst 0x4e96969e // sdot v30.4s, v20.16b, v22.16b\n"
+ ".inst 0x4e9b969c // sdot v28.4s, v20.16b, v27.16b\n"
+ "mls v30.4s, v18.4s, v16.4s\n"
+ "mls v28.4s, v21.4s, v16.4s\n"
+ "and v15.16b, v31.16b, v26.16b\n"
+ "sshr v15.4s, v15.4s, #0x1f\n"
+ "sqrdmulh v30.4s, v30.4s, v17.4s\n"
+ "sqrdmulh v29.4s, v29.4s, v17.4s\n"
+ "sqrdmulh v28.4s, v28.4s, v17.4s\n"
+ "ldr q1, [%x[params], #0x60]\n"
+ "sqadd v31.4s, v31.4s, v15.4s\n"
+ "and v18.16b, v30.16b, v26.16b\n"
+ "and v21.16b, v29.16b, v26.16b\n"
+ "and v17.16b, v28.16b, v26.16b\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v21.4s, v21.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "srshl v31.4s, v31.4s, v26.4s\n"
+ "sqadd v30.4s, v30.4s, v18.4s\n"
+ "ldr q18, [%x[params], #0x40]\n"
+ "sqadd v29.4s, v29.4s, v21.4s\n"
+ "ldr q27, [%x[params], #0x50]\n"
+ "sqadd v28.4s, v28.4s, v17.4s\n"
+ "ldr q15, [%x[params], #0x30]\n"
+ "add v31.4s, v31.4s, v14.4s\n"
+ "srshl v30.4s, v30.4s, v26.4s\n"
+ "srshl v29.4s, v29.4s, v26.4s\n"
+ "srshl v28.4s, v28.4s, v26.4s\n"
+ "ldr q20, [%x[params], #0x70]\n"
+ "smax v31.4s, v31.4s, v13.4s\n"
+ "add v30.4s, v30.4s, v14.4s\n"
+ "add v29.4s, v29.4s, v14.4s\n"
+ "add v28.4s, v28.4s, v14.4s\n"
+ "smin v31.4s, v31.4s, v11.4s\n"
+ "smax v30.4s, v30.4s, v13.4s\n"
+ "smax v29.4s, v29.4s, v13.4s\n"
+ "smax v28.4s, v28.4s, v13.4s\n"
+ "smin v30.4s, v30.4s, v11.4s\n"
+ "smin v29.4s, v29.4s, v11.4s\n"
+ "smin v28.4s, v28.4s, v11.4s\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "movi v22.4s, #0x0\n"
+ ".inst 0x4e979596 // sdot v22.4s, v12.16b, v23.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "str s31, [x25, x27]\n"
+ "ldr q26, [%x[params], #0x20]\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ ".inst 0x4e939596 // sdot v22.4s, v12.16b, v19.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "str s30, [x24, x27]\n"
+ "mov v6.16b, v22.16b\n .inst 0x4e989586 // sdot v6.4s, v12.16b, v24.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "str s29, [x23, x27]\n"
+ "mov v30.16b, v26.16b\n"
+ ".inst 0x4e999596 // sdot v22.4s, v12.16b, v25.16b\n"
+ "str s28, [x22, x27]\n"
+ "mov v29.16b, v26.16b\n"
+ "mov v21.16b, v26.16b\n"
+ ".inst 0x4e9995fa // sdot v26.4s, v15.16b, v25.16b\n"
+ ".inst 0x4e9795fd // sdot v29.4s, v15.16b, v23.16b\n"
+ ".inst 0x4e97965a // sdot v26.4s, v18.16b, v23.16b\n"
+ "ext v25.16b, v25.16b, v25.16b, #0x1\n"
+ "add x27, x27, #0x4\n"
+ "ext v23.16b, v23.16b, v23.16b, #0x1\n"
+ "movi v28.4s, #0x0\n"
+ ".inst 0x4e9995fe // sdot v30.4s, v15.16b, v25.16b\n"
+ ".inst 0x4e9795f5 // sdot v21.4s, v15.16b, v23.16b\n"
+ ".inst 0x4e97959c // sdot v28.4s, v12.16b, v23.16b\n"
+ ".inst 0x4e93965d // sdot v29.4s, v18.16b, v19.16b\n"
+ ".inst 0x4e93977a // sdot v26.4s, v27.16b, v19.16b\n"
+ "ext v19.16b, v19.16b, v19.16b, #0x1\n"
+ ".inst 0x4e97965e // sdot v30.4s, v18.16b, v23.16b\n"
+ "ldr q4, [x9, x28]\n"
+ ".inst 0x4e939655 // sdot v21.4s, v18.16b, v19.16b\n"
+ "mls v26.4s, v22.4s, v16.4s\n"
+ ".inst 0x4e93959c // sdot v28.4s, v12.16b, v19.16b\n"
+ ".inst 0x4e98977d // sdot v29.4s, v27.16b, v24.16b\n"
+ "ext v24.16b, v24.16b, v24.16b, #0x1\n"
+ ".inst 0x4e93977e // sdot v30.4s, v27.16b, v19.16b\n"
+ ".inst 0x4e989775 // sdot v21.4s, v27.16b, v24.16b\n"
+ "sqrdmulh v26.4s, v26.4s, v1.4s\n"
+ "mov v17.16b, v28.16b\n .inst 0x4e989591 // sdot v17.4s, v12.16b, v24.16b\n"
+ ".inst 0x4e99959c // sdot v28.4s, v12.16b, v25.16b\n"
+ "ldr q31, [x14, x28]\n"
+ "mls v30.4s, v28.4s, v16.4s\n"
+ "mls v29.4s, v6.4s, v16.4s\n"
+ "mls v21.4s, v17.4s, v16.4s\n"
+ "and v17.16b, v26.16b, v20.16b\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sqrdmulh v30.4s, v30.4s, v1.4s\n"
+ "sqrdmulh v29.4s, v29.4s, v1.4s\n"
+ "sqrdmulh v21.4s, v21.4s, v1.4s\n"
+ "ldr q27, [%x[params], #0xc0]\n"
+ "sqadd v26.4s, v26.4s, v17.4s\n"
+ "and v18.16b, v30.16b, v20.16b\n"
+ "and v6.16b, v29.16b, v20.16b\n"
+ "and v17.16b, v21.16b, v20.16b\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "srshl v26.4s, v26.4s, v20.4s\n"
+ "sqadd v30.4s, v30.4s, v18.4s\n"
+ "ldr q28, [%x[params], #0xa0]\n"
+ "sqadd v29.4s, v29.4s, v6.4s\n"
+ "ldr q24, [%x[params], #0xb0]\n"
+ "sqadd v21.4s, v21.4s, v17.4s\n"
+ "ldr q15, [%x[params], #0x90]\n"
+ "add v26.4s, v26.4s, v14.4s\n"
+ "srshl v30.4s, v30.4s, v20.4s\n"
+ "srshl v29.4s, v29.4s, v20.4s\n"
+ "srshl v21.4s, v21.4s, v20.4s\n"
+ "ldr q1, [%x[params], #0xd0]\n"
+ "smax v26.4s, v26.4s, v13.4s\n"
+ "add v30.4s, v30.4s, v14.4s\n"
+ "add v29.4s, v29.4s, v14.4s\n"
+ "add v21.4s, v21.4s, v14.4s\n"
+ "smin v26.4s, v26.4s, v11.4s\n"
+ "smax v30.4s, v30.4s, v13.4s\n"
+ "smax v29.4s, v29.4s, v13.4s\n"
+ "smax v21.4s, v21.4s, v13.4s\n"
+ "smin v30.4s, v30.4s, v11.4s\n"
+ "smin v29.4s, v29.4s, v11.4s\n"
+ "smin v21.4s, v21.4s, v11.4s\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "movi v22.4s, #0x0\n"
+ ".inst 0x4e839596 // sdot v22.4s, v12.16b, v3.16b\n"
+ ".inst 0x4e809596 // sdot v22.4s, v12.16b, v0.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "str s26, [x25, x27]\n"
+ "ldr q26, [%x[params], #0x80]\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "mov v18.16b, v22.16b\n .inst 0x4e829592 // sdot v18.4s, v12.16b, v2.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "str s30, [x24, x27]\n"
+ ".inst 0x4e879596 // sdot v22.4s, v12.16b, v7.16b\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "str s29, [x23, x27]\n"
+ "mov v6.16b, v26.16b\n"
+ "str s21, [x22, x27]\n"
+ "mov v25.16b, v26.16b\n"
+ "mov v20.16b, v26.16b\n"
+ ".inst 0x4e8795fa // sdot v26.4s, v15.16b, v7.16b\n"
+ ".inst 0x4e8395f9 // sdot v25.4s, v15.16b, v3.16b\n"
+ ".inst 0x4e83979a // sdot v26.4s, v28.16b, v3.16b\n"
+ "ext v7.16b, v7.16b, v7.16b, #0x1\n"
+ "add x27, x27, #0x4\n"
+ "ext v3.16b, v3.16b, v3.16b, #0x1\n"
+ "movi v23.4s, #0x0\n"
+ ".inst 0x4e8795e6 // sdot v6.4s, v15.16b, v7.16b\n"
+ ".inst 0x4e8395f4 // sdot v20.4s, v15.16b, v3.16b\n"
+ ".inst 0x4e839597 // sdot v23.4s, v12.16b, v3.16b\n"
+ ".inst 0x4e809799 // sdot v25.4s, v28.16b, v0.16b\n"
+ ".inst 0x4e80971a // sdot v26.4s, v24.16b, v0.16b\n"
+ "ext v0.16b, v0.16b, v0.16b, #0x1\n"
+ ".inst 0x4e839786 // sdot v6.4s, v28.16b, v3.16b\n"
+ "ldr q19, [x26, x28]\n"
+ ".inst 0x4e809794 // sdot v20.4s, v28.16b, v0.16b\n"
+ "mls v26.4s, v22.4s, v16.4s\n"
+ ".inst 0x4e809597 // sdot v23.4s, v12.16b, v0.16b\n"
+ ".inst 0x4e829719 // sdot v25.4s, v24.16b, v2.16b\n"
+ "ext v2.16b, v2.16b, v2.16b, #0x1\n"
+ ".inst 0x4e809706 // sdot v6.4s, v24.16b, v0.16b\n"
+ ".inst 0x4e829714 // sdot v20.4s, v24.16b, v2.16b\n"
+ "sqrdmulh v26.4s, v26.4s, v27.4s\n"
+ "mov v17.16b, v23.16b\n .inst 0x4e829591 // sdot v17.4s, v12.16b, v2.16b\n"
+ ".inst 0x4e879597 // sdot v23.4s, v12.16b, v7.16b\n"
+ "ldr q21, [x13, x28]\n"
+ "mls v6.4s, v23.4s, v16.4s\n"
+ "mls v25.4s, v18.4s, v16.4s\n"
+ "mls v20.4s, v17.4s, v16.4s\n"
+ "and v17.16b, v26.16b, v1.16b\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sqrdmulh v6.4s, v6.4s, v27.4s\n"
+ "sqrdmulh v25.4s, v25.4s, v27.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v27.4s\n"
+ "ldr q15, [%x[params], #0x120]\n"
+ "sqadd v26.4s, v26.4s, v17.4s\n"
+ "and v18.16b, v6.16b, v1.16b\n"
+ "and v22.16b, v25.16b, v1.16b\n"
+ "and v17.16b, v20.16b, v1.16b\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v22.4s, v22.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "srshl v26.4s, v26.4s, v1.4s\n"
+ "sqadd v6.4s, v6.4s, v18.4s\n"
+ "ldr q30, [%x[params], #0x100]\n"
+ "sqadd v25.4s, v25.4s, v22.4s\n"
+ "ldr q27, [%x[params], #0x110]\n"
+ "sqadd v20.4s, v20.4s, v17.4s\n"
+ "ldr q24, [%x[params], #0xf0]\n"
+ "add v26.4s, v26.4s, v14.4s\n"
+ "srshl v6.4s, v6.4s, v1.4s\n"
+ "srshl v25.4s, v25.4s, v1.4s\n"
+ "srshl v20.4s, v20.4s, v1.4s\n"
+ "ldr q23, [%x[params], #0x130]\n"
+ "smax v26.4s, v26.4s, v13.4s\n"
+ "add v6.4s, v6.4s, v14.4s\n"
+ "add v25.4s, v25.4s, v14.4s\n"
+ "add v20.4s, v20.4s, v14.4s\n"
+ "smin v26.4s, v26.4s, v11.4s\n"
+ "smax v6.4s, v6.4s, v13.4s\n"
+ "smax v25.4s, v25.4s, v13.4s\n"
+ "smax v20.4s, v20.4s, v13.4s\n"
+ "smin v6.4s, v6.4s, v11.4s\n"
+ "smin v25.4s, v25.4s, v11.4s\n"
+ "smin v20.4s, v20.4s, v11.4s\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "movi v0.4s, #0x0\n"
+ ".inst 0x4e8a9580 // sdot v0.4s, v12.16b, v10.16b\n"
+ ".inst 0x4e859580 // sdot v0.4s, v12.16b, v5.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "uzp1 v6.16b, v6.16b, v6.16b\n"
+ "str s26, [x25, x27]\n"
+ "ldr q28, [%x[params], #0xe0]\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "mov v22.16b, v0.16b\n .inst 0x4e899596 // sdot v22.4s, v12.16b, v9.16b\n"
+ "uzp1 v6.16b, v6.16b, v6.16b\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "str s6, [x24, x27]\n"
+ ".inst 0x4e889580 // sdot v0.4s, v12.16b, v8.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "str s25, [x23, x27]\n"
+ "mov v29.16b, v28.16b\n"
+ "str s20, [x22, x27]\n"
+ "mov v25.16b, v28.16b\n"
+ "mov v7.16b, v28.16b\n"
+ ".inst 0x4e88971c // sdot v28.4s, v24.16b, v8.16b\n"
+ ".inst 0x4e8a9719 // sdot v25.4s, v24.16b, v10.16b\n"
+ ".inst 0x4e8a97dc // sdot v28.4s, v30.16b, v10.16b\n"
+ "ext v8.16b, v8.16b, v8.16b, #0x1\n"
+ "add x27, x27, #0x4\n"
+ "ext v10.16b, v10.16b, v10.16b, #0x1\n"
+ "movi v17.4s, #0x0\n"
+ ".inst 0x4e88971d // sdot v29.4s, v24.16b, v8.16b\n"
+ ".inst 0x4e8a9707 // sdot v7.4s, v24.16b, v10.16b\n"
+ ".inst 0x4e8a9591 // sdot v17.4s, v12.16b, v10.16b\n"
+ ".inst 0x4e8597d9 // sdot v25.4s, v30.16b, v5.16b\n"
+ ".inst 0x4e85977c // sdot v28.4s, v27.16b, v5.16b\n"
+ "ext v5.16b, v5.16b, v5.16b, #0x1\n"
+ ".inst 0x4e8a97dd // sdot v29.4s, v30.16b, v10.16b\n"
+ "ldr q10, [x21, x28]\n"
+ ".inst 0x4e8597c7 // sdot v7.4s, v30.16b, v5.16b\n"
+ "mls v28.4s, v0.4s, v16.4s\n"
+ ".inst 0x4e859591 // sdot v17.4s, v12.16b, v5.16b\n"
+ ".inst 0x4e899779 // sdot v25.4s, v27.16b, v9.16b\n"
+ "ext v9.16b, v9.16b, v9.16b, #0x1\n"
+ ".inst 0x4e85977d // sdot v29.4s, v27.16b, v5.16b\n"
+ ".inst 0x4e899767 // sdot v7.4s, v27.16b, v9.16b\n"
+ "sqrdmulh v28.4s, v28.4s, v15.4s\n"
+ "mov v18.16b, v17.16b\n .inst 0x4e899592 // sdot v18.4s, v12.16b, v9.16b\n"
+ ".inst 0x4e889591 // sdot v17.4s, v12.16b, v8.16b\n"
+ "ldr q8, [x12, x28]\n"
+ "mls v29.4s, v17.4s, v16.4s\n"
+ "mls v25.4s, v22.4s, v16.4s\n"
+ "mls v7.4s, v18.4s, v16.4s\n"
+ "and v17.16b, v28.16b, v23.16b\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sqrdmulh v29.4s, v29.4s, v15.4s\n"
+ "sqrdmulh v25.4s, v25.4s, v15.4s\n"
+ "sqrdmulh v7.4s, v7.4s, v15.4s\n"
+ "ldr q15, [x15, x28]\n"
+ "sqadd v28.4s, v28.4s, v17.4s\n"
+ "ldp x21, x20, [%x[inptrs], #0x40]\n"
+ "ldr q22, [x21, x28]\n"
+ "ldr q3, [x20, x28]\n"
+ "and v24.16b, v29.16b, v23.16b\n"
+ "and v20.16b, v25.16b, v23.16b\n"
+ "and v17.16b, v7.16b, v23.16b\n"
+ "sshr v24.4s, v24.4s, #0x1f\n"
+ "ldp x21, x20, [%x[inptrs], #0x50]\n"
+ "ldr q2, [x21, x28]\n"
+ "ldr q5, [x20, x28]\n"
+ "sshr v20.4s, v20.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "srshl v28.4s, v28.4s, v23.4s\n"
+ "sqadd v29.4s, v29.4s, v24.4s\n"
+ "ldr q6, [%x[params], #0x160]\n"
+ "sqadd v25.4s, v25.4s, v20.4s\n"
+ "ldr q20, [%x[params], #0x170]\n"
+ "sqadd v7.4s, v7.4s, v17.4s\n"
+ "ldr q1, [%x[params], #0x150]\n"
+ "add v28.4s, v28.4s, v14.4s\n"
+ "srshl v29.4s, v29.4s, v23.4s\n"
+ "srshl v25.4s, v25.4s, v23.4s\n"
+ "srshl v7.4s, v7.4s, v23.4s\n"
+ "ldr q26, [x10, x28]\n"
+ "ldp x21, x20, [%x[inptrs], #0x60]\n"
+ "ldr q27, [x21, x28]\n"
+ "ldr q30, [x20, x28]\n"
+ "smax v28.4s, v28.4s, v13.4s\n"
+ "add v29.4s, v29.4s, v14.4s\n"
+ "add v25.4s, v25.4s, v14.4s\n"
+ "add v7.4s, v7.4s, v14.4s\n"
+ "ldp x21, x20, [%x[inptrs], #0x70]\n"
+ "ldr q23, [x21, x28]\n"
+ "ldr q9, [x20, x28]\n"
+ "smin v28.4s, v28.4s, v11.4s\n"
+ "smax v29.4s, v29.4s, v13.4s\n"
+ "ldp x15, x14, [%x[inptrs], #0x0]\n"
+ "smax v25.4s, v25.4s, v13.4s\n"
+ "smax v7.4s, v7.4s, v13.4s\n"
+ "ldp x13, x12, [%x[inptrs], #0x10]\n"
+ "ldp x10, x9, [%x[inptrs], #0x20]\n"
+ "smin v29.4s, v29.4s, v11.4s\n"
+ "smin v25.4s, v25.4s, v11.4s\n"
+ "ldp x26, x21, [%x[inptrs], #0x30]\n"
+ "smin v7.4s, v7.4s, v11.4s\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "str s28, [x25, x27]\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "uzp1 v7.16b, v7.16b, v7.16b\n"
+ "zip2 v17.16b, v15.16b, v21.16b\n"
+ "zip1 v15.16b, v15.16b, v21.16b\n"
+ "zip1 v18.16b, v31.16b, v8.16b\n"
+ "zip2 v8.16b, v31.16b, v8.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "str s29, [x24, x27]\n"
+ "uzp1 v7.16b, v7.16b, v7.16b\n"
+ "str s25, [x23, x27]\n"
+ "zip2 v25.16b, v15.16b, v18.16b\n"
+ "str s7, [x22, x27]\n"
+ "zip1 v15.16b, v15.16b, v18.16b\n"
+ "zip1 v7.16b, v17.16b, v8.16b\n"
+ "add x27, x27, #0x4\n"
+ "zip2 v8.16b, v17.16b, v8.16b\n"
+ "ldr q31, [%x[params], #0x140]\n"
+ "zip2 v29.16b, v26.16b, v19.16b\n"
+ "add %x[params], %x[params], #0x180\n"
+ "zip1 v26.16b, v26.16b, v19.16b\n"
+ "zip1 v28.16b, v4.16b, v10.16b\n"
+ "zip2 v10.16b, v4.16b, v10.16b\n"
+ "zip2 v24.16b, v22.16b, v2.16b\n"
+ "zip1 v22.16b, v22.16b, v2.16b\n"
+ "zip1 v21.16b, v3.16b, v5.16b\n"
+ "zip2 v5.16b, v3.16b, v5.16b\n"
+ "zip2 v18.16b, v27.16b, v23.16b\n"
+ "zip1 v27.16b, v27.16b, v23.16b\n"
+ "zip1 v17.16b, v30.16b, v9.16b\n"
+ "zip2 v9.16b, v30.16b, v9.16b\n"
+ "zip2 v23.16b, v26.16b, v28.16b\n"
+ "zip1 v26.16b, v26.16b, v28.16b\n"
+ "zip1 v3.16b, v29.16b, v10.16b\n"
+ "zip2 v10.16b, v29.16b, v10.16b\n"
+ "zip2 v19.16b, v22.16b, v21.16b\n"
+ "zip1 v22.16b, v22.16b, v21.16b\n"
+ "zip1 v0.16b, v24.16b, v5.16b\n"
+ "zip2 v5.16b, v24.16b, v5.16b\n"
+ "zip2 v24.16b, v27.16b, v17.16b\n"
+ "zip1 v27.16b, v27.16b, v17.16b\n"
+ "zip1 v2.16b, v18.16b, v9.16b\n"
+ "zip2 v9.16b, v18.16b, v9.16b\n"
+ "mov v30.16b, v31.16b\n"
+ "mov v29.16b, v31.16b\n"
+ "mov v28.16b, v31.16b\n"
+ "bgt 1b\n"
+ "2:" // Detached iteration
+ "movi v21.4s, #0x0\n"
+ ".inst 0x4e9a9595 // sdot v21.4s, v12.16b, v26.16b\n"
+ ".inst 0x4e8f943f // sdot v31.4s, v1.16b, v15.16b\n"
+ "tst %x[n_channels], #0xf\n"
+ ".inst 0x4e969595 // sdot v21.4s, v12.16b, v22.16b\n"
+ ".inst 0x4e9a943d // sdot v29.4s, v1.16b, v26.16b\n"
+ "movi v18.4s, #0x0\n"
+ "add x28, x28, #0x10\n"
+ ".inst 0x4e9a94df // sdot v31.4s, v6.16b, v26.16b\n"
+ "ext v26.16b, v26.16b, v26.16b, #0x1\n"
+ "mov v17.16b, v21.16b\n .inst 0x4e9b9591 // sdot v17.4s, v12.16b, v27.16b\n"
+ ".inst 0x4e8f9595 // sdot v21.4s, v12.16b, v15.16b\n"
+ "ext v15.16b, v15.16b, v15.16b, #0x1\n"
+ ".inst 0x4e9a9592 // sdot v18.4s, v12.16b, v26.16b\n"
+ ".inst 0x4e9694dd // sdot v29.4s, v6.16b, v22.16b\n"
+ ".inst 0x4e96969f // sdot v31.4s, v20.16b, v22.16b\n"
+ "ext v22.16b, v22.16b, v22.16b, #0x1\n"
+ ".inst 0x4e8f943e // sdot v30.4s, v1.16b, v15.16b\n"
+ ".inst 0x4e9a943c // sdot v28.4s, v1.16b, v26.16b\n"
+ "mls v31.4s, v21.4s, v16.4s\n"
+ ".inst 0x4e969592 // sdot v18.4s, v12.16b, v22.16b\n"
+ ".inst 0x4e9b969d // sdot v29.4s, v20.16b, v27.16b\n"
+ "ext v27.16b, v27.16b, v27.16b, #0x1\n"
+ ".inst 0x4e9a94de // sdot v30.4s, v6.16b, v26.16b\n"
+ "ldr q4, [%x[params], #0x10]\n"
+ ".inst 0x4e9694dc // sdot v28.4s, v6.16b, v22.16b\n"
+ "mls v29.4s, v17.4s, v16.4s\n"
+ "mov v21.16b, v18.16b\n .inst 0x4e9b9595 // sdot v21.4s, v12.16b, v27.16b\n"
+ ".inst 0x4e8f9592 // sdot v18.4s, v12.16b, v15.16b\n"
+ "ldr q17, [%x[params], #0x0]\n"
+ "sqrdmulh v31.4s, v31.4s, v17.4s\n"
+ ".inst 0x4e96969e // sdot v30.4s, v20.16b, v22.16b\n"
+ ".inst 0x4e9b969c // sdot v28.4s, v20.16b, v27.16b\n"
+ "mls v30.4s, v18.4s, v16.4s\n"
+ "mls v28.4s, v21.4s, v16.4s\n"
+ "and v27.16b, v31.16b, v4.16b\n"
+ "sshr v27.4s, v27.4s, #0x1f\n"
+ "sqrdmulh v30.4s, v30.4s, v17.4s\n"
+ "sqrdmulh v29.4s, v29.4s, v17.4s\n"
+ "sqrdmulh v28.4s, v28.4s, v17.4s\n"
+ "ldr q15, [%x[params], #0x60]\n"
+ "sqadd v31.4s, v31.4s, v27.4s\n"
+ "and v20.16b, v30.16b, v4.16b\n"
+ "and v18.16b, v29.16b, v4.16b\n"
+ "and v17.16b, v28.16b, v4.16b\n"
+ "sshr v20.4s, v20.4s, #0x1f\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "srshl v31.4s, v31.4s, v4.4s\n"
+ "sqadd v30.4s, v30.4s, v20.4s\n"
+ "ldr q27, [%x[params], #0x40]\n"
+ "sqadd v29.4s, v29.4s, v18.4s\n"
+ "ldr q26, [%x[params], #0x50]\n"
+ "sqadd v28.4s, v28.4s, v17.4s\n"
+ "ldr q6, [%x[params], #0x30]\n"
+ "add v31.4s, v31.4s, v14.4s\n"
+ "srshl v30.4s, v30.4s, v4.4s\n"
+ "srshl v29.4s, v29.4s, v4.4s\n"
+ "srshl v28.4s, v28.4s, v4.4s\n"
+ "ldr q4, [%x[params], #0x70]\n"
+ "smax v31.4s, v31.4s, v13.4s\n"
+ "add v30.4s, v30.4s, v14.4s\n"
+ "add v29.4s, v29.4s, v14.4s\n"
+ "add v28.4s, v28.4s, v14.4s\n"
+ "smin v31.4s, v31.4s, v11.4s\n"
+ "smax v30.4s, v30.4s, v13.4s\n"
+ "smax v29.4s, v29.4s, v13.4s\n"
+ "smax v28.4s, v28.4s, v13.4s\n"
+ "smin v30.4s, v30.4s, v11.4s\n"
+ "smin v29.4s, v29.4s, v11.4s\n"
+ "smin v28.4s, v28.4s, v11.4s\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "movi v1.4s, #0x0\n"
+ ".inst 0x4e979581 // sdot v1.4s, v12.16b, v23.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "str s31, [x25, x27]\n"
+ "ldr q31, [%x[params], #0x20]\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ ".inst 0x4e939581 // sdot v1.4s, v12.16b, v19.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "str s30, [x24, x27]\n"
+ "mov v22.16b, v1.16b\n .inst 0x4e989596 // sdot v22.4s, v12.16b, v24.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "str s29, [x23, x27]\n"
+ "mov v29.16b, v31.16b\n"
+ ".inst 0x4e999581 // sdot v1.4s, v12.16b, v25.16b\n"
+ "str s28, [x22, x27]\n"
+ "mov v21.16b, v31.16b\n"
+ "mov v20.16b, v31.16b\n"
+ ".inst 0x4e9994df // sdot v31.4s, v6.16b, v25.16b\n"
+ ".inst 0x4e9794d5 // sdot v21.4s, v6.16b, v23.16b\n"
+ ".inst 0x4e97977f // sdot v31.4s, v27.16b, v23.16b\n"
+ "ext v25.16b, v25.16b, v25.16b, #0x1\n"
+ "add x27, x27, #0x4\n"
+ "ext v23.16b, v23.16b, v23.16b, #0x1\n"
+ "movi v18.4s, #0x0\n"
+ ".inst 0x4e9994dd // sdot v29.4s, v6.16b, v25.16b\n"
+ ".inst 0x4e9794d4 // sdot v20.4s, v6.16b, v23.16b\n"
+ ".inst 0x4e979592 // sdot v18.4s, v12.16b, v23.16b\n"
+ ".inst 0x4e939775 // sdot v21.4s, v27.16b, v19.16b\n"
+ ".inst 0x4e93975f // sdot v31.4s, v26.16b, v19.16b\n"
+ "ext v19.16b, v19.16b, v19.16b, #0x1\n"
+ ".inst 0x4e97977d // sdot v29.4s, v27.16b, v23.16b\n"
+ ".inst 0x4e939774 // sdot v20.4s, v27.16b, v19.16b\n"
+ "mls v31.4s, v1.4s, v16.4s\n"
+ ".inst 0x4e939592 // sdot v18.4s, v12.16b, v19.16b\n"
+ ".inst 0x4e989755 // sdot v21.4s, v26.16b, v24.16b\n"
+ "ext v24.16b, v24.16b, v24.16b, #0x1\n"
+ ".inst 0x4e93975d // sdot v29.4s, v26.16b, v19.16b\n"
+ ".inst 0x4e989754 // sdot v20.4s, v26.16b, v24.16b\n"
+ "sqrdmulh v31.4s, v31.4s, v15.4s\n"
+ "mov v17.16b, v18.16b\n .inst 0x4e989591 // sdot v17.4s, v12.16b, v24.16b\n"
+ ".inst 0x4e999592 // sdot v18.4s, v12.16b, v25.16b\n"
+ "mls v29.4s, v18.4s, v16.4s\n"
+ "mls v21.4s, v22.4s, v16.4s\n"
+ "mls v20.4s, v17.4s, v16.4s\n"
+ "and v17.16b, v31.16b, v4.16b\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sqrdmulh v29.4s, v29.4s, v15.4s\n"
+ "sqrdmulh v21.4s, v21.4s, v15.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v15.4s\n"
+ "ldr q27, [%x[params], #0xc0]\n"
+ "sqadd v31.4s, v31.4s, v17.4s\n"
+ "and v19.16b, v29.16b, v4.16b\n"
+ "and v18.16b, v21.16b, v4.16b\n"
+ "and v17.16b, v20.16b, v4.16b\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "srshl v31.4s, v31.4s, v4.4s\n"
+ "sqadd v29.4s, v29.4s, v19.4s\n"
+ "ldr q26, [%x[params], #0xa0]\n"
+ "sqadd v21.4s, v21.4s, v18.4s\n"
+ "ldr q25, [%x[params], #0xb0]\n"
+ "sqadd v20.4s, v20.4s, v17.4s\n"
+ "ldr q24, [%x[params], #0x90]\n"
+ "add v31.4s, v31.4s, v14.4s\n"
+ "srshl v29.4s, v29.4s, v4.4s\n"
+ "srshl v21.4s, v21.4s, v4.4s\n"
+ "srshl v20.4s, v20.4s, v4.4s\n"
+ "ldr q1, [%x[params], #0xd0]\n"
+ "smax v31.4s, v31.4s, v13.4s\n"
+ "add v29.4s, v29.4s, v14.4s\n"
+ "add v21.4s, v21.4s, v14.4s\n"
+ "add v20.4s, v20.4s, v14.4s\n"
+ "smin v31.4s, v31.4s, v11.4s\n"
+ "smax v29.4s, v29.4s, v13.4s\n"
+ "smax v21.4s, v21.4s, v13.4s\n"
+ "smax v20.4s, v20.4s, v13.4s\n"
+ "smin v29.4s, v29.4s, v11.4s\n"
+ "smin v21.4s, v21.4s, v11.4s\n"
+ "smin v20.4s, v20.4s, v11.4s\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "movi v23.4s, #0x0\n"
+ ".inst 0x4e839597 // sdot v23.4s, v12.16b, v3.16b\n"
+ ".inst 0x4e809597 // sdot v23.4s, v12.16b, v0.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "str s31, [x25, x27]\n"
+ "ldr q31, [%x[params], #0x80]\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "mov v22.16b, v23.16b\n .inst 0x4e829596 // sdot v22.4s, v12.16b, v2.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "str s29, [x24, x27]\n"
+ ".inst 0x4e879597 // sdot v23.4s, v12.16b, v7.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "str s21, [x23, x27]\n"
+ "mov v21.16b, v31.16b\n"
+ "str s20, [x22, x27]\n"
+ "mov v4.16b, v31.16b\n"
+ "mov v20.16b, v31.16b\n"
+ ".inst 0x4e87971f // sdot v31.4s, v24.16b, v7.16b\n"
+ ".inst 0x4e839704 // sdot v4.4s, v24.16b, v3.16b\n"
+ ".inst 0x4e83975f // sdot v31.4s, v26.16b, v3.16b\n"
+ "ext v7.16b, v7.16b, v7.16b, #0x1\n"
+ "add x27, x27, #0x4\n"
+ "ext v3.16b, v3.16b, v3.16b, #0x1\n"
+ "movi v18.4s, #0x0\n"
+ ".inst 0x4e879715 // sdot v21.4s, v24.16b, v7.16b\n"
+ ".inst 0x4e839714 // sdot v20.4s, v24.16b, v3.16b\n"
+ ".inst 0x4e839592 // sdot v18.4s, v12.16b, v3.16b\n"
+ ".inst 0x4e809744 // sdot v4.4s, v26.16b, v0.16b\n"
+ ".inst 0x4e80973f // sdot v31.4s, v25.16b, v0.16b\n"
+ "ext v0.16b, v0.16b, v0.16b, #0x1\n"
+ ".inst 0x4e839755 // sdot v21.4s, v26.16b, v3.16b\n"
+ ".inst 0x4e809754 // sdot v20.4s, v26.16b, v0.16b\n"
+ "mls v31.4s, v23.4s, v16.4s\n"
+ ".inst 0x4e809592 // sdot v18.4s, v12.16b, v0.16b\n"
+ ".inst 0x4e829724 // sdot v4.4s, v25.16b, v2.16b\n"
+ "ext v2.16b, v2.16b, v2.16b, #0x1\n"
+ ".inst 0x4e809735 // sdot v21.4s, v25.16b, v0.16b\n"
+ ".inst 0x4e829734 // sdot v20.4s, v25.16b, v2.16b\n"
+ "sqrdmulh v31.4s, v31.4s, v27.4s\n"
+ "mov v17.16b, v18.16b\n .inst 0x4e829591 // sdot v17.4s, v12.16b, v2.16b\n"
+ ".inst 0x4e879592 // sdot v18.4s, v12.16b, v7.16b\n"
+ "mls v21.4s, v18.4s, v16.4s\n"
+ "mls v4.4s, v22.4s, v16.4s\n"
+ "mls v20.4s, v17.4s, v16.4s\n"
+ "and v17.16b, v31.16b, v1.16b\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sqrdmulh v21.4s, v21.4s, v27.4s\n"
+ "sqrdmulh v4.4s, v4.4s, v27.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v27.4s\n"
+ "ldr q30, [%x[params], #0x120]\n"
+ "sqadd v31.4s, v31.4s, v17.4s\n"
+ "and v19.16b, v21.16b, v1.16b\n"
+ "and v18.16b, v4.16b, v1.16b\n"
+ "and v17.16b, v20.16b, v1.16b\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "srshl v31.4s, v31.4s, v1.4s\n"
+ "sqadd v21.4s, v21.4s, v19.4s\n"
+ "ldr q29, [%x[params], #0x100]\n"
+ "sqadd v4.4s, v4.4s, v18.4s\n"
+ "ldr q28, [%x[params], #0x110]\n"
+ "sqadd v20.4s, v20.4s, v17.4s\n"
+ "ldr q27, [%x[params], #0xf0]\n"
+ "add v31.4s, v31.4s, v14.4s\n"
+ "srshl v21.4s, v21.4s, v1.4s\n"
+ "srshl v4.4s, v4.4s, v1.4s\n"
+ "srshl v20.4s, v20.4s, v1.4s\n"
+ "ldr q26, [%x[params], #0x130]\n"
+ "smax v31.4s, v31.4s, v13.4s\n"
+ "add v21.4s, v21.4s, v14.4s\n"
+ "add v4.4s, v4.4s, v14.4s\n"
+ "add v20.4s, v20.4s, v14.4s\n"
+ "smin v31.4s, v31.4s, v11.4s\n"
+ "smax v21.4s, v21.4s, v13.4s\n"
+ "smax v4.4s, v4.4s, v13.4s\n"
+ "smax v20.4s, v20.4s, v13.4s\n"
+ "smin v21.4s, v21.4s, v11.4s\n"
+ "smin v4.4s, v4.4s, v11.4s\n"
+ "smin v20.4s, v20.4s, v11.4s\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "movi v25.4s, #0x0\n"
+ ".inst 0x4e8a9599 // sdot v25.4s, v12.16b, v10.16b\n"
+ ".inst 0x4e859599 // sdot v25.4s, v12.16b, v5.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "str s31, [x25, x27]\n"
+ "ldr q24, [%x[params], #0xe0]\n"
+ "uzp1 v4.16b, v4.16b, v4.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "mov v23.16b, v25.16b\n .inst 0x4e899597 // sdot v23.4s, v12.16b, v9.16b\n"
+ "add %x[params], %x[params], #0x140\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "uzp1 v4.16b, v4.16b, v4.16b\n"
+ "str s21, [x24, x27]\n"
+ ".inst 0x4e889599 // sdot v25.4s, v12.16b, v8.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "str s4, [x23, x27]\n"
+ "mov v22.16b, v24.16b\n"
+ "str s20, [x22, x27]\n"
+ "mov v21.16b, v24.16b\n"
+ "mov v20.16b, v24.16b\n"
+ ".inst 0x4e889778 // sdot v24.4s, v27.16b, v8.16b\n"
+ ".inst 0x4e8a9775 // sdot v21.4s, v27.16b, v10.16b\n"
+ ".inst 0x4e8a97b8 // sdot v24.4s, v29.16b, v10.16b\n"
+ "ext v8.16b, v8.16b, v8.16b, #0x1\n"
+ "add x27, x27, #0x4\n"
+ "ext v10.16b, v10.16b, v10.16b, #0x1\n"
+ "movi v18.4s, #0x0\n"
+ ".inst 0x4e889776 // sdot v22.4s, v27.16b, v8.16b\n"
+ ".inst 0x4e8a9774 // sdot v20.4s, v27.16b, v10.16b\n"
+ ".inst 0x4e8a9592 // sdot v18.4s, v12.16b, v10.16b\n"
+ ".inst 0x4e8597b5 // sdot v21.4s, v29.16b, v5.16b\n"
+ ".inst 0x4e859798 // sdot v24.4s, v28.16b, v5.16b\n"
+ "ext v5.16b, v5.16b, v5.16b, #0x1\n"
+ ".inst 0x4e8a97b6 // sdot v22.4s, v29.16b, v10.16b\n"
+ ".inst 0x4e8597b4 // sdot v20.4s, v29.16b, v5.16b\n"
+ "mls v24.4s, v25.4s, v16.4s\n"
+ ".inst 0x4e859592 // sdot v18.4s, v12.16b, v5.16b\n"
+ ".inst 0x4e899795 // sdot v21.4s, v28.16b, v9.16b\n"
+ "ext v9.16b, v9.16b, v9.16b, #0x1\n"
+ ".inst 0x4e859796 // sdot v22.4s, v28.16b, v5.16b\n"
+ ".inst 0x4e899794 // sdot v20.4s, v28.16b, v9.16b\n"
+ "sqrdmulh v24.4s, v24.4s, v30.4s\n"
+ "mov v17.16b, v18.16b\n .inst 0x4e899591 // sdot v17.4s, v12.16b, v9.16b\n"
+ ".inst 0x4e889592 // sdot v18.4s, v12.16b, v8.16b\n"
+ "mls v22.4s, v18.4s, v16.4s\n"
+ "mls v21.4s, v23.4s, v16.4s\n"
+ "mls v20.4s, v17.4s, v16.4s\n"
+ "and v17.16b, v24.16b, v26.16b\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sqrdmulh v22.4s, v22.4s, v30.4s\n"
+ "sqrdmulh v21.4s, v21.4s, v30.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v30.4s\n"
+ "sqadd v24.4s, v24.4s, v17.4s\n"
+ "and v19.16b, v22.16b, v26.16b\n"
+ "and v18.16b, v21.16b, v26.16b\n"
+ "and v17.16b, v20.16b, v26.16b\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sqadd v22.4s, v22.4s, v19.4s\n"
+ "sqadd v21.4s, v21.4s, v18.4s\n"
+ "sqadd v20.4s, v20.4s, v17.4s\n"
+ "srshl v24.4s, v24.4s, v26.4s\n"
+ "srshl v22.4s, v22.4s, v26.4s\n"
+ "srshl v21.4s, v21.4s, v26.4s\n"
+ "srshl v20.4s, v20.4s, v26.4s\n"
+ "add v24.4s, v24.4s, v14.4s\n"
+ "add v22.4s, v22.4s, v14.4s\n"
+ "add v21.4s, v21.4s, v14.4s\n"
+ "add v20.4s, v20.4s, v14.4s\n"
+ "smax v24.4s, v24.4s, v13.4s\n"
+ "smax v22.4s, v22.4s, v13.4s\n"
+ "smax v21.4s, v21.4s, v13.4s\n"
+ "smax v20.4s, v20.4s, v13.4s\n"
+ "smin v24.4s, v24.4s, v11.4s\n"
+ "smin v22.4s, v22.4s, v11.4s\n"
+ "smin v21.4s, v21.4s, v11.4s\n"
+ "smin v20.4s, v20.4s, v11.4s\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "str s24, [x25, x27]\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "str s22, [x24, x27]\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "str s21, [x23, x27]\n"
+ "str s20, [x22, x27]\n"
+ "add x27, x27, #0x4\n"
+ "beq 35f\n"
+ "3:" // Oddments
+ "and x20, %x[n_channels], #0xf\n"
+ "add x15, x15, x28\n"
+ "add x14, x14, x28\n"
+ "add x13, x13, x28\n"
+ "add x12, x12, x28\n"
+ "add x10, x10, x28\n"
+ "add x9, x9, x28\n"
+ "add x26, x26, x28\n"
+ "add x21, x21, x28\n"
+ "tbz %x[n_channels], #3, 7f\n"
+ "ldr d15, [x15], #0x8\n"
+ "ldr d25, [x14], #0x8\n"
+ "ldr d7, [x13], #0x8\n"
+ "ldr d8, [x12], #0x8\n"
+ "ldr d26, [x10], #0x8\n"
+ "ldr d23, [x9], #0x8\n"
+ "ldr d3, [x26], #0x8\n"
+ "ldr d10, [x21], #0x8\n"
+ "tbz %x[n_channels], #2, 5f\n"
+ "ld1 { v15.s }[2], [x15], #0x4\n"
+ "ld1 { v25.s }[2], [x14], #0x4\n"
+ "ld1 { v7.s }[2], [x13], #0x4\n"
+ "ld1 { v8.s }[2], [x12], #0x4\n"
+ "ld1 { v26.s }[2], [x10], #0x4\n"
+ "ld1 { v23.s }[2], [x9], #0x4\n"
+ "ld1 { v3.s }[2], [x26], #0x4\n"
+ "ld1 { v10.s }[2], [x21], #0x4\n"
+ "tbz %x[n_channels], #1, 4f\n"
+ "ld1 { v15.h }[6], [x15], #0x2\n"
+ "ld1 { v25.h }[6], [x14], #0x2\n"
+ "ld1 { v7.h }[6], [x13], #0x2\n"
+ "ld1 { v8.h }[6], [x12], #0x2\n"
+ "ld1 { v26.h }[6], [x10], #0x2\n"
+ "ld1 { v23.h }[6], [x9], #0x2\n"
+ "ld1 { v3.h }[6], [x26], #0x2\n"
+ "ld1 { v10.h }[6], [x21], #0x2\n"
+ "tbz %x[n_channels], #0, 11f\n"
+ "ld1 { v15.b }[14], [x15], #0x1\n"
+ "ld1 { v25.b }[14], [x14], #0x1\n"
+ "ld1 { v7.b }[14], [x13], #0x1\n"
+ "ld1 { v8.b }[14], [x12], #0x1\n"
+ "ld1 { v26.b }[14], [x10], #0x1\n"
+ "ld1 { v23.b }[14], [x9], #0x1\n"
+ "ld1 { v3.b }[14], [x26], #0x1\n"
+ "ld1 { v10.b }[14], [x21], #0x1\n"
+ "b 11f\n"
+ "4:" // Oddments: Load (A): Bit 3: Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 11f\n"
+ "ld1 { v15.b }[12], [x15], #0x1\n"
+ "ld1 { v25.b }[12], [x14], #0x1\n"
+ "ld1 { v7.b }[12], [x13], #0x1\n"
+ "ld1 { v8.b }[12], [x12], #0x1\n"
+ "ld1 { v26.b }[12], [x10], #0x1\n"
+ "ld1 { v23.b }[12], [x9], #0x1\n"
+ "ld1 { v3.b }[12], [x26], #0x1\n"
+ "ld1 { v10.b }[12], [x21], #0x1\n"
+ "b 11f\n"
+ "5:" // Oddments: Load (A): Bit 3: Bit 2: Unset
+ "tbz %x[n_channels], #1, 6f\n"
+ "ld1 { v15.h }[4], [x15], #0x2\n"
+ "ld1 { v25.h }[4], [x14], #0x2\n"
+ "ld1 { v7.h }[4], [x13], #0x2\n"
+ "ld1 { v8.h }[4], [x12], #0x2\n"
+ "ld1 { v26.h }[4], [x10], #0x2\n"
+ "ld1 { v23.h }[4], [x9], #0x2\n"
+ "ld1 { v3.h }[4], [x26], #0x2\n"
+ "ld1 { v10.h }[4], [x21], #0x2\n"
+ "tbz %x[n_channels], #0, 11f\n"
+ "ld1 { v15.b }[10], [x15], #0x1\n"
+ "ld1 { v25.b }[10], [x14], #0x1\n"
+ "ld1 { v7.b }[10], [x13], #0x1\n"
+ "ld1 { v8.b }[10], [x12], #0x1\n"
+ "ld1 { v26.b }[10], [x10], #0x1\n"
+ "ld1 { v23.b }[10], [x9], #0x1\n"
+ "ld1 { v3.b }[10], [x26], #0x1\n"
+ "ld1 { v10.b }[10], [x21], #0x1\n"
+ "b 11f\n"
+ "6:" // Oddments: Load (A): Bit 3: Bit 2: Unset: Bit 1: Unset
+ "tbz %x[n_channels], #0, 11f\n"
+ "ld1 { v15.b }[8], [x15], #0x1\n"
+ "ld1 { v25.b }[8], [x14], #0x1\n"
+ "ld1 { v7.b }[8], [x13], #0x1\n"
+ "ld1 { v8.b }[8], [x12], #0x1\n"
+ "ld1 { v26.b }[8], [x10], #0x1\n"
+ "ld1 { v23.b }[8], [x9], #0x1\n"
+ "ld1 { v3.b }[8], [x26], #0x1\n"
+ "ld1 { v10.b }[8], [x21], #0x1\n"
+ "b 11f\n"
+ "7:" // Oddments: Load (A): Bit 3: Unset
+ "tbz %x[n_channels], #2, 9f\n"
+ "ldr s15, [x15], #0x4\n"
+ "ldr s25, [x14], #0x4\n"
+ "ldr s7, [x13], #0x4\n"
+ "ldr s8, [x12], #0x4\n"
+ "ldr s26, [x10], #0x4\n"
+ "ldr s23, [x9], #0x4\n"
+ "ldr s3, [x26], #0x4\n"
+ "ldr s10, [x21], #0x4\n"
+ "tbz %x[n_channels], #1, 8f\n"
+ "ld1 { v15.h }[2], [x15], #0x2\n"
+ "ld1 { v25.h }[2], [x14], #0x2\n"
+ "ld1 { v7.h }[2], [x13], #0x2\n"
+ "ld1 { v8.h }[2], [x12], #0x2\n"
+ "ld1 { v26.h }[2], [x10], #0x2\n"
+ "ld1 { v23.h }[2], [x9], #0x2\n"
+ "ld1 { v3.h }[2], [x26], #0x2\n"
+ "ld1 { v10.h }[2], [x21], #0x2\n"
+ "tbz %x[n_channels], #0, 11f\n"
+ "ld1 { v15.b }[6], [x15], #0x1\n"
+ "ld1 { v25.b }[6], [x14], #0x1\n"
+ "ld1 { v7.b }[6], [x13], #0x1\n"
+ "ld1 { v8.b }[6], [x12], #0x1\n"
+ "ld1 { v26.b }[6], [x10], #0x1\n"
+ "ld1 { v23.b }[6], [x9], #0x1\n"
+ "ld1 { v3.b }[6], [x26], #0x1\n"
+ "ld1 { v10.b }[6], [x21], #0x1\n"
+ "b 11f\n"
+ "8:" // Oddments: Load (A): Bit 3: Unset: Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 11f\n"
+ "ld1 { v15.b }[4], [x15], #0x1\n"
+ "ld1 { v25.b }[4], [x14], #0x1\n"
+ "ld1 { v7.b }[4], [x13], #0x1\n"
+ "ld1 { v8.b }[4], [x12], #0x1\n"
+ "ld1 { v26.b }[4], [x10], #0x1\n"
+ "ld1 { v23.b }[4], [x9], #0x1\n"
+ "ld1 { v3.b }[4], [x26], #0x1\n"
+ "ld1 { v10.b }[4], [x21], #0x1\n"
+ "b 11f\n"
+ "9:" // Oddments: Load (A): Bit 3: Unset: Bit 2: Unset
+ "tbz %x[n_channels], #1, 10f\n"
+ "ldr h15, [x15], #0x2\n"
+ "ldr h25, [x14], #0x2\n"
+ "ldr h7, [x13], #0x2\n"
+ "ldr h8, [x12], #0x2\n"
+ "ldr h26, [x10], #0x2\n"
+ "ldr h23, [x9], #0x2\n"
+ "ldr h3, [x26], #0x2\n"
+ "ldr h10, [x21], #0x2\n"
+ "tbz %x[n_channels], #0, 11f\n"
+ "ld1 { v15.b }[2], [x15], #0x1\n"
+ "ld1 { v25.b }[2], [x14], #0x1\n"
+ "ld1 { v7.b }[2], [x13], #0x1\n"
+ "ld1 { v8.b }[2], [x12], #0x1\n"
+ "ld1 { v26.b }[2], [x10], #0x1\n"
+ "ld1 { v23.b }[2], [x9], #0x1\n"
+ "ld1 { v3.b }[2], [x26], #0x1\n"
+ "ld1 { v10.b }[2], [x21], #0x1\n"
+ "b 11f\n"
+ "10:" // Oddments: Load (A): Bit 3: Unset: Bit 2: Unset: Bit 1: Unset
+ "ldr b15, [x15], #0x1\n"
+ "ldr b25, [x14], #0x1\n"
+ "ldr b7, [x13], #0x1\n"
+ "ldr b8, [x12], #0x1\n"
+ "ldr b26, [x10], #0x1\n"
+ "ldr b23, [x9], #0x1\n"
+ "ldr b3, [x26], #0x1\n"
+ "ldr b10, [x21], #0x1\n"
+ "11:" // Oddments: Load (A): Bit 3: End
+ "ldp x15, x14, [%x[inptrs], #0x40]\n"
+ "ldp x13, x12, [%x[inptrs], #0x50]\n"
+ "add x15, x15, x28\n"
+ "add x14, x14, x28\n"
+ "ldp x10, x9, [%x[inptrs], #0x60]\n"
+ "ldp x26, x21, [%x[inptrs], #0x70]\n"
+ "add x13, x13, x28\n"
+ "add x12, x12, x28\n"
+ "add x10, x10, x28\n"
+ "add x9, x9, x28\n"
+ "add x26, x26, x28\n"
+ "add x21, x21, x28\n"
+ "tbz %x[n_channels], #3, 15f\n"
+ "ldr d22, [x15], #0x8\n"
+ "ldr d19, [x14], #0x8\n"
+ "ldr d0, [x13], #0x8\n"
+ "ldr d5, [x12], #0x8\n"
+ "ldr d27, [x10], #0x8\n"
+ "ldr d24, [x9], #0x8\n"
+ "ldr d2, [x26], #0x8\n"
+ "ldr d9, [x21], #0x8\n"
+ "tbz %x[n_channels], #2, 13f\n"
+ "ld1 { v22.s }[2], [x15], #0x4\n"
+ "ld1 { v19.s }[2], [x14], #0x4\n"
+ "ld1 { v0.s }[2], [x13], #0x4\n"
+ "ld1 { v5.s }[2], [x12], #0x4\n"
+ "ld1 { v27.s }[2], [x10], #0x4\n"
+ "ld1 { v24.s }[2], [x9], #0x4\n"
+ "ld1 { v2.s }[2], [x26], #0x4\n"
+ "ld1 { v9.s }[2], [x21], #0x4\n"
+ "tbz %x[n_channels], #1, 12f\n"
+ "ld1 { v22.h }[6], [x15], #0x2\n"
+ "ld1 { v19.h }[6], [x14], #0x2\n"
+ "ld1 { v0.h }[6], [x13], #0x2\n"
+ "ld1 { v5.h }[6], [x12], #0x2\n"
+ "ld1 { v27.h }[6], [x10], #0x2\n"
+ "ld1 { v24.h }[6], [x9], #0x2\n"
+ "ld1 { v2.h }[6], [x26], #0x2\n"
+ "ld1 { v9.h }[6], [x21], #0x2\n"
+ "tbz %x[n_channels], #0, 19f\n"
+ "ld1 { v22.b }[14], [x15], #0x1\n"
+ "ld1 { v19.b }[14], [x14], #0x1\n"
+ "ld1 { v0.b }[14], [x13], #0x1\n"
+ "ld1 { v5.b }[14], [x12], #0x1\n"
+ "ld1 { v27.b }[14], [x10], #0x1\n"
+ "ld1 { v24.b }[14], [x9], #0x1\n"
+ "ld1 { v2.b }[14], [x26], #0x1\n"
+ "ld1 { v9.b }[14], [x21], #0x1\n"
+ "b 19f\n"
+ "12:" // Oddments: Load (B): Bit 3: Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 19f\n"
+ "ld1 { v22.b }[12], [x15], #0x1\n"
+ "ld1 { v19.b }[12], [x14], #0x1\n"
+ "ld1 { v0.b }[12], [x13], #0x1\n"
+ "ld1 { v5.b }[12], [x12], #0x1\n"
+ "ld1 { v27.b }[12], [x10], #0x1\n"
+ "ld1 { v24.b }[12], [x9], #0x1\n"
+ "ld1 { v2.b }[12], [x26], #0x1\n"
+ "ld1 { v9.b }[12], [x21], #0x1\n"
+ "b 19f\n"
+ "13:" // Oddments: Load (B): Bit 3: Bit 2: Unset
+ "tbz %x[n_channels], #1, 14f\n"
+ "ld1 { v22.h }[4], [x15], #0x2\n"
+ "ld1 { v19.h }[4], [x14], #0x2\n"
+ "ld1 { v0.h }[4], [x13], #0x2\n"
+ "ld1 { v5.h }[4], [x12], #0x2\n"
+ "ld1 { v27.h }[4], [x10], #0x2\n"
+ "ld1 { v24.h }[4], [x9], #0x2\n"
+ "ld1 { v2.h }[4], [x26], #0x2\n"
+ "ld1 { v9.h }[4], [x21], #0x2\n"
+ "tbz %x[n_channels], #0, 19f\n"
+ "ld1 { v22.b }[10], [x15], #0x1\n"
+ "ld1 { v19.b }[10], [x14], #0x1\n"
+ "ld1 { v0.b }[10], [x13], #0x1\n"
+ "ld1 { v5.b }[10], [x12], #0x1\n"
+ "ld1 { v27.b }[10], [x10], #0x1\n"
+ "ld1 { v24.b }[10], [x9], #0x1\n"
+ "ld1 { v2.b }[10], [x26], #0x1\n"
+ "ld1 { v9.b }[10], [x21], #0x1\n"
+ "b 19f\n"
+ "14:" // Oddments: Load (B): Bit 3: Bit 2: Unset: Bit 1: Unset
+ "tbz %x[n_channels], #0, 19f\n"
+ "ld1 { v22.b }[8], [x15], #0x1\n"
+ "ld1 { v19.b }[8], [x14], #0x1\n"
+ "ld1 { v0.b }[8], [x13], #0x1\n"
+ "ld1 { v5.b }[8], [x12], #0x1\n"
+ "ld1 { v27.b }[8], [x10], #0x1\n"
+ "ld1 { v24.b }[8], [x9], #0x1\n"
+ "ld1 { v2.b }[8], [x26], #0x1\n"
+ "ld1 { v9.b }[8], [x21], #0x1\n"
+ "b 19f\n"
+ "15:" // Oddments: Load (B): Bit 3: Unset
+ "tbz %x[n_channels], #2, 17f\n"
+ "ldr s22, [x15], #0x4\n"
+ "ldr s19, [x14], #0x4\n"
+ "ldr s0, [x13], #0x4\n"
+ "ldr s5, [x12], #0x4\n"
+ "ldr s27, [x10], #0x4\n"
+ "ldr s24, [x9], #0x4\n"
+ "ldr s2, [x26], #0x4\n"
+ "ldr s9, [x21], #0x4\n"
+ "tbz %x[n_channels], #1, 16f\n"
+ "ld1 { v22.h }[2], [x15], #0x2\n"
+ "ld1 { v19.h }[2], [x14], #0x2\n"
+ "ld1 { v0.h }[2], [x13], #0x2\n"
+ "ld1 { v5.h }[2], [x12], #0x2\n"
+ "ld1 { v27.h }[2], [x10], #0x2\n"
+ "ld1 { v24.h }[2], [x9], #0x2\n"
+ "ld1 { v2.h }[2], [x26], #0x2\n"
+ "ld1 { v9.h }[2], [x21], #0x2\n"
+ "tbz %x[n_channels], #0, 19f\n"
+ "ld1 { v22.b }[6], [x15], #0x1\n"
+ "ld1 { v19.b }[6], [x14], #0x1\n"
+ "ld1 { v0.b }[6], [x13], #0x1\n"
+ "ld1 { v5.b }[6], [x12], #0x1\n"
+ "ld1 { v27.b }[6], [x10], #0x1\n"
+ "ld1 { v24.b }[6], [x9], #0x1\n"
+ "ld1 { v2.b }[6], [x26], #0x1\n"
+ "ld1 { v9.b }[6], [x21], #0x1\n"
+ "b 19f\n"
+ "16:" // Oddments: Load (B): Bit 3: Unset: Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 19f\n"
+ "ld1 { v22.b }[4], [x15], #0x1\n"
+ "ld1 { v19.b }[4], [x14], #0x1\n"
+ "ld1 { v0.b }[4], [x13], #0x1\n"
+ "ld1 { v5.b }[4], [x12], #0x1\n"
+ "ld1 { v27.b }[4], [x10], #0x1\n"
+ "ld1 { v24.b }[4], [x9], #0x1\n"
+ "ld1 { v2.b }[4], [x26], #0x1\n"
+ "ld1 { v9.b }[4], [x21], #0x1\n"
+ "b 19f\n"
+ "17:" // Oddments: Load (B): Bit 3: Unset: Bit 2: Unset
+ "tbz %x[n_channels], #1, 18f\n"
+ "ldr h22, [x15], #0x2\n"
+ "ldr h19, [x14], #0x2\n"
+ "ldr h0, [x13], #0x2\n"
+ "ldr h5, [x12], #0x2\n"
+ "ldr h27, [x10], #0x2\n"
+ "ldr h24, [x9], #0x2\n"
+ "ldr h2, [x26], #0x2\n"
+ "ldr h9, [x21], #0x2\n"
+ "tbz %x[n_channels], #0, 19f\n"
+ "ld1 { v22.b }[2], [x15], #0x1\n"
+ "ld1 { v19.b }[2], [x14], #0x1\n"
+ "ld1 { v0.b }[2], [x13], #0x1\n"
+ "ld1 { v5.b }[2], [x12], #0x1\n"
+ "ld1 { v27.b }[2], [x10], #0x1\n"
+ "ld1 { v24.b }[2], [x9], #0x1\n"
+ "ld1 { v2.b }[2], [x26], #0x1\n"
+ "ld1 { v9.b }[2], [x21], #0x1\n"
+ "b 19f\n"
+ "18:" // Oddments: Load (B): Bit 3: Unset: Bit 2: Unset: Bit 1: Unset
+ "ldr b22, [x15], #0x1\n"
+ "ldr b19, [x14], #0x1\n"
+ "ldr b0, [x13], #0x1\n"
+ "ldr b5, [x12], #0x1\n"
+ "ldr b27, [x10], #0x1\n"
+ "ldr b24, [x9], #0x1\n"
+ "ldr b2, [x26], #0x1\n"
+ "ldr b9, [x21], #0x1\n"
+ "19:" // Oddments: Load (B): Bit 3: End
+ "ldr q20, [%x[params], #0x10]\n"
+ "ldr q6, [%x[params], #0x20]\n"
+ "zip2 v1.16b, v26.16b, v3.16b\n"
+ "zip1 v26.16b, v26.16b, v3.16b\n"
+ "ldr q4, [%x[params], #0x30]\n"
+ "zip1 v18.16b, v23.16b, v10.16b\n"
+ "zip2 v30.16b, v15.16b, v7.16b\n"
+ "cmp x20, #0x4\n"
+ "zip1 v15.16b, v15.16b, v7.16b\n"
+ "zip1 v29.16b, v25.16b, v8.16b\n"
+ "zip2 v8.16b, v25.16b, v8.16b\n"
+ "zip2 v10.16b, v23.16b, v10.16b\n"
+ "zip2 v23.16b, v26.16b, v18.16b\n"
+ "zip1 v26.16b, v26.16b, v18.16b\n"
+ "zip2 v28.16b, v22.16b, v0.16b\n"
+ "zip1 v22.16b, v22.16b, v0.16b\n"
+ "zip1 v21.16b, v19.16b, v5.16b\n"
+ "movi v17.4s, #0x0\n"
+ ".inst 0x4e9a9591 // sdot v17.4s, v12.16b, v26.16b\n"
+ "zip2 v25.16b, v15.16b, v29.16b\n"
+ "zip1 v15.16b, v15.16b, v29.16b\n"
+ "zip1 v7.16b, v30.16b, v8.16b\n"
+ "zip2 v8.16b, v30.16b, v8.16b\n"
+ "ldr q31, [%x[params], #0x0]\n"
+ "zip2 v5.16b, v19.16b, v5.16b\n"
+ "zip2 v30.16b, v27.16b, v2.16b\n"
+ "zip1 v27.16b, v27.16b, v2.16b\n"
+ "zip1 v18.16b, v24.16b, v9.16b\n"
+ "zip2 v9.16b, v24.16b, v9.16b\n"
+ "zip2 v19.16b, v22.16b, v21.16b\n"
+ "zip1 v22.16b, v22.16b, v21.16b\n"
+ "zip1 v3.16b, v1.16b, v10.16b\n"
+ ".inst 0x4e969591 // sdot v17.4s, v12.16b, v22.16b\n"
+ "zip2 v10.16b, v1.16b, v10.16b\n"
+ "zip1 v0.16b, v28.16b, v5.16b\n"
+ "zip2 v5.16b, v28.16b, v5.16b\n"
+ "zip2 v24.16b, v27.16b, v18.16b\n"
+ "zip1 v27.16b, v27.16b, v18.16b\n"
+ "zip1 v2.16b, v30.16b, v9.16b\n"
+ "mov v18.16b, v17.16b\n .inst 0x4e9b9592 // sdot v18.4s, v12.16b, v27.16b\n"
+ "zip2 v9.16b, v30.16b, v9.16b\n"
+ "mov v30.16b, v31.16b\n"
+ ".inst 0x4e8f9591 // sdot v17.4s, v12.16b, v15.16b\n"
+ "mov v29.16b, v31.16b\n"
+ "mov v28.16b, v31.16b\n"
+ ".inst 0x4e8f969f // sdot v31.4s, v20.16b, v15.16b\n"
+ ".inst 0x4e9a969d // sdot v29.4s, v20.16b, v26.16b\n"
+ ".inst 0x4e9a94df // sdot v31.4s, v6.16b, v26.16b\n"
+ "ext v26.16b, v26.16b, v26.16b, #0x1\n"
+ "movi v1.4s, #0x0\n"
+ "ext v15.16b, v15.16b, v15.16b, #0x1\n"
+ ".inst 0x4e9a9581 // sdot v1.4s, v12.16b, v26.16b\n"
+ ".inst 0x4e9694dd // sdot v29.4s, v6.16b, v22.16b\n"
+ ".inst 0x4e96949f // sdot v31.4s, v4.16b, v22.16b\n"
+ "ext v22.16b, v22.16b, v22.16b, #0x1\n"
+ ".inst 0x4e8f969e // sdot v30.4s, v20.16b, v15.16b\n"
+ ".inst 0x4e9a969c // sdot v28.4s, v20.16b, v26.16b\n"
+ "mls v31.4s, v17.4s, v16.4s\n"
+ ".inst 0x4e969581 // sdot v1.4s, v12.16b, v22.16b\n"
+ ".inst 0x4e9b949d // sdot v29.4s, v4.16b, v27.16b\n"
+ "ext v27.16b, v27.16b, v27.16b, #0x1\n"
+ ".inst 0x4e9a94de // sdot v30.4s, v6.16b, v26.16b\n"
+ "ldr q21, [%x[params], #0x50]\n"
+ ".inst 0x4e9694dc // sdot v28.4s, v6.16b, v22.16b\n"
+ "mls v29.4s, v18.4s, v16.4s\n"
+ "mov v20.16b, v1.16b\n .inst 0x4e9b9594 // sdot v20.4s, v12.16b, v27.16b\n"
+ ".inst 0x4e8f9581 // sdot v1.4s, v12.16b, v15.16b\n"
+ "ldr q18, [%x[params], #0x40]\n"
+ "sqrdmulh v31.4s, v31.4s, v18.4s\n"
+ ".inst 0x4e96949e // sdot v30.4s, v4.16b, v22.16b\n"
+ ".inst 0x4e9b949c // sdot v28.4s, v4.16b, v27.16b\n"
+ "mls v30.4s, v1.4s, v16.4s\n"
+ "add %x[params], %x[params], #0x60\n"
+ "mls v28.4s, v20.4s, v16.4s\n"
+ "and v17.16b, v31.16b, v21.16b\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sqrdmulh v30.4s, v30.4s, v18.4s\n"
+ "sqrdmulh v29.4s, v29.4s, v18.4s\n"
+ "sqrdmulh v28.4s, v28.4s, v18.4s\n"
+ "sqadd v31.4s, v31.4s, v17.4s\n"
+ "and v17.16b, v30.16b, v21.16b\n"
+ "and v18.16b, v29.16b, v21.16b\n"
+ "and v26.16b, v28.16b, v21.16b\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v26.4s, v26.4s, #0x1f\n"
+ "sqadd v30.4s, v30.4s, v17.4s\n"
+ "sqadd v29.4s, v29.4s, v18.4s\n"
+ "sqadd v28.4s, v28.4s, v26.4s\n"
+ "srshl v31.4s, v31.4s, v21.4s\n"
+ "srshl v30.4s, v30.4s, v21.4s\n"
+ "srshl v29.4s, v29.4s, v21.4s\n"
+ "srshl v28.4s, v28.4s, v21.4s\n"
+ "add v31.4s, v31.4s, v14.4s\n"
+ "add v30.4s, v30.4s, v14.4s\n"
+ "add v29.4s, v29.4s, v14.4s\n"
+ "add v28.4s, v28.4s, v14.4s\n"
+ "smax v31.4s, v31.4s, v13.4s\n"
+ "smax v30.4s, v30.4s, v13.4s\n"
+ "smax v29.4s, v29.4s, v13.4s\n"
+ "smax v28.4s, v28.4s, v13.4s\n"
+ "smin v31.4s, v31.4s, v11.4s\n"
+ "smin v30.4s, v30.4s, v11.4s\n"
+ "smin v29.4s, v29.4s, v11.4s\n"
+ "smin v28.4s, v28.4s, v11.4s\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "blt 20f\n"
+ "str s31, [x25, x27]\n"
+ "str s30, [x24, x27]\n"
+ "str s29, [x23, x27]\n"
+ "str s28, [x22, x27]\n"
+ "b 23f\n"
+ "20:" // Oddments: Unroll 0: Oddment store
+ "add x25, x25, x27\n"
+ "add x24, x24, x27\n"
+ "add x23, x23, x27\n"
+ "add x22, x22, x27\n"
+ "tbz x20, #1, 21f\n"
+ "st1 { v31.h }[0], [x25], #0x2\n"
+ "st1 { v30.h }[0], [x24], #0x2\n"
+ "st1 { v29.h }[0], [x23], #0x2\n"
+ "st1 { v28.h }[0], [x22], #0x2\n"
+ "tbz x20, #0, 22f\n"
+ "st1 { v31.b }[2], [x25], #0x1\n"
+ "st1 { v30.b }[2], [x24], #0x1\n"
+ "st1 { v29.b }[2], [x23], #0x1\n"
+ "st1 { v28.b }[2], [x22], #0x1\n"
+ "b 22f\n"
+ "21:" // Oddments: Unroll 0: Oddment store: Bit 1: Unset
+ "st1 { v31.b }[0], [x25], #0x1\n"
+ "st1 { v30.b }[0], [x24], #0x1\n"
+ "st1 { v29.b }[0], [x23], #0x1\n"
+ "st1 { v28.b }[0], [x22], #0x1\n"
+ "22:" // Oddments: Unroll 0: Oddment store: Bit 1: End
+ "23:" // Oddments: Unroll 0: After oddment store
+ "subs x20, x20, #0x4\n"
+ "add x27, x27, #0x4\n"
+ "ble 35f\n"
+ "ldr q31, [%x[params], #0x0]\n"
+ "ldr q27, [%x[params], #0x10]\n"
+ "movi v1.4s, #0x0\n"
+ ".inst 0x4e979581 // sdot v1.4s, v12.16b, v23.16b\n"
+ "ldr q26, [%x[params], #0x20]\n"
+ "ldr q22, [%x[params], #0x30]\n"
+ "mov v30.16b, v31.16b\n"
+ "mov v29.16b, v31.16b\n"
+ "ldr q4, [%x[params], #0x40]\n"
+ "ldr q21, [%x[params], #0x50]\n"
+ "mov v28.16b, v31.16b\n"
+ ".inst 0x4e99977f // sdot v31.4s, v27.16b, v25.16b\n"
+ ".inst 0x4e939581 // sdot v1.4s, v12.16b, v19.16b\n"
+ ".inst 0x4e97977d // sdot v29.4s, v27.16b, v23.16b\n"
+ "movi v20.4s, #0x0\n"
+ "cmp x20, #0x4\n"
+ ".inst 0x4e97975f // sdot v31.4s, v26.16b, v23.16b\n"
+ "mov v18.16b, v1.16b\n .inst 0x4e989592 // sdot v18.4s, v12.16b, v24.16b\n"
+ "ext v23.16b, v23.16b, v23.16b, #0x1\n"
+ "add %x[params], %x[params], #0x60\n"
+ ".inst 0x4e999581 // sdot v1.4s, v12.16b, v25.16b\n"
+ "ext v25.16b, v25.16b, v25.16b, #0x1\n"
+ ".inst 0x4e99977e // sdot v30.4s, v27.16b, v25.16b\n"
+ ".inst 0x4e97977c // sdot v28.4s, v27.16b, v23.16b\n"
+ ".inst 0x4e979594 // sdot v20.4s, v12.16b, v23.16b\n"
+ ".inst 0x4e93975d // sdot v29.4s, v26.16b, v19.16b\n"
+ ".inst 0x4e9396df // sdot v31.4s, v22.16b, v19.16b\n"
+ "ext v19.16b, v19.16b, v19.16b, #0x1\n"
+ ".inst 0x4e97975e // sdot v30.4s, v26.16b, v23.16b\n"
+ ".inst 0x4e93975c // sdot v28.4s, v26.16b, v19.16b\n"
+ "mls v31.4s, v1.4s, v16.4s\n"
+ ".inst 0x4e939594 // sdot v20.4s, v12.16b, v19.16b\n"
+ ".inst 0x4e9896dd // sdot v29.4s, v22.16b, v24.16b\n"
+ "ext v24.16b, v24.16b, v24.16b, #0x1\n"
+ ".inst 0x4e9396de // sdot v30.4s, v22.16b, v19.16b\n"
+ ".inst 0x4e9896dc // sdot v28.4s, v22.16b, v24.16b\n"
+ "sqrdmulh v31.4s, v31.4s, v4.4s\n"
+ "mov v17.16b, v20.16b\n .inst 0x4e989591 // sdot v17.4s, v12.16b, v24.16b\n"
+ ".inst 0x4e999594 // sdot v20.4s, v12.16b, v25.16b\n"
+ "mls v30.4s, v20.4s, v16.4s\n"
+ "mls v29.4s, v18.4s, v16.4s\n"
+ "mls v28.4s, v17.4s, v16.4s\n"
+ "and v17.16b, v31.16b, v21.16b\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sqrdmulh v30.4s, v30.4s, v4.4s\n"
+ "sqrdmulh v29.4s, v29.4s, v4.4s\n"
+ "sqrdmulh v28.4s, v28.4s, v4.4s\n"
+ "sqadd v31.4s, v31.4s, v17.4s\n"
+ "and v19.16b, v30.16b, v21.16b\n"
+ "and v18.16b, v29.16b, v21.16b\n"
+ "and v17.16b, v28.16b, v21.16b\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sqadd v30.4s, v30.4s, v19.4s\n"
+ "sqadd v29.4s, v29.4s, v18.4s\n"
+ "sqadd v28.4s, v28.4s, v17.4s\n"
+ "srshl v31.4s, v31.4s, v21.4s\n"
+ "srshl v30.4s, v30.4s, v21.4s\n"
+ "srshl v29.4s, v29.4s, v21.4s\n"
+ "srshl v28.4s, v28.4s, v21.4s\n"
+ "add v31.4s, v31.4s, v14.4s\n"
+ "add v30.4s, v30.4s, v14.4s\n"
+ "add v29.4s, v29.4s, v14.4s\n"
+ "add v28.4s, v28.4s, v14.4s\n"
+ "smax v31.4s, v31.4s, v13.4s\n"
+ "smax v30.4s, v30.4s, v13.4s\n"
+ "smax v29.4s, v29.4s, v13.4s\n"
+ "smax v28.4s, v28.4s, v13.4s\n"
+ "smin v31.4s, v31.4s, v11.4s\n"
+ "smin v30.4s, v30.4s, v11.4s\n"
+ "smin v29.4s, v29.4s, v11.4s\n"
+ "smin v28.4s, v28.4s, v11.4s\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "blt 24f\n"
+ "str s31, [x25, x27]\n"
+ "str s30, [x24, x27]\n"
+ "str s29, [x23, x27]\n"
+ "str s28, [x22, x27]\n"
+ "b 27f\n"
+ "24:" // Oddments: Unroll 1: Oddment store
+ "add x25, x25, x27\n"
+ "add x24, x24, x27\n"
+ "add x23, x23, x27\n"
+ "add x22, x22, x27\n"
+ "tbz x20, #1, 25f\n"
+ "st1 { v31.h }[0], [x25], #0x2\n"
+ "st1 { v30.h }[0], [x24], #0x2\n"
+ "st1 { v29.h }[0], [x23], #0x2\n"
+ "st1 { v28.h }[0], [x22], #0x2\n"
+ "tbz x20, #0, 26f\n"
+ "st1 { v31.b }[2], [x25], #0x1\n"
+ "st1 { v30.b }[2], [x24], #0x1\n"
+ "st1 { v29.b }[2], [x23], #0x1\n"
+ "st1 { v28.b }[2], [x22], #0x1\n"
+ "b 26f\n"
+ "25:" // Oddments: Unroll 1: Oddment store: Bit 1: Unset
+ "st1 { v31.b }[0], [x25], #0x1\n"
+ "st1 { v30.b }[0], [x24], #0x1\n"
+ "st1 { v29.b }[0], [x23], #0x1\n"
+ "st1 { v28.b }[0], [x22], #0x1\n"
+ "26:" // Oddments: Unroll 1: Oddment store: Bit 1: End
+ "27:" // Oddments: Unroll 1: After oddment store
+ "subs x20, x20, #0x4\n"
+ "add x27, x27, #0x4\n"
+ "ble 35f\n"
+ "ldr q31, [%x[params], #0x0]\n"
+ "ldr q25, [%x[params], #0x10]\n"
+ "movi v24.4s, #0x0\n"
+ ".inst 0x4e839598 // sdot v24.4s, v12.16b, v3.16b\n"
+ "ldr q23, [%x[params], #0x20]\n"
+ "ldr q22, [%x[params], #0x30]\n"
+ "mov v30.16b, v31.16b\n"
+ "mov v29.16b, v31.16b\n"
+ "ldr q21, [%x[params], #0x40]\n"
+ "ldr q20, [%x[params], #0x50]\n"
+ "mov v28.16b, v31.16b\n"
+ ".inst 0x4e87973f // sdot v31.4s, v25.16b, v7.16b\n"
+ ".inst 0x4e809598 // sdot v24.4s, v12.16b, v0.16b\n"
+ ".inst 0x4e83973d // sdot v29.4s, v25.16b, v3.16b\n"
+ "movi v19.4s, #0x0\n"
+ "cmp x20, #0x4\n"
+ ".inst 0x4e8396ff // sdot v31.4s, v23.16b, v3.16b\n"
+ "mov v18.16b, v24.16b\n .inst 0x4e829592 // sdot v18.4s, v12.16b, v2.16b\n"
+ "ext v3.16b, v3.16b, v3.16b, #0x1\n"
+ "add %x[params], %x[params], #0x60\n"
+ ".inst 0x4e879598 // sdot v24.4s, v12.16b, v7.16b\n"
+ "ext v7.16b, v7.16b, v7.16b, #0x1\n"
+ ".inst 0x4e87973e // sdot v30.4s, v25.16b, v7.16b\n"
+ ".inst 0x4e83973c // sdot v28.4s, v25.16b, v3.16b\n"
+ ".inst 0x4e839593 // sdot v19.4s, v12.16b, v3.16b\n"
+ ".inst 0x4e8096fd // sdot v29.4s, v23.16b, v0.16b\n"
+ ".inst 0x4e8096df // sdot v31.4s, v22.16b, v0.16b\n"
+ "ext v0.16b, v0.16b, v0.16b, #0x1\n"
+ ".inst 0x4e8396fe // sdot v30.4s, v23.16b, v3.16b\n"
+ ".inst 0x4e8096fc // sdot v28.4s, v23.16b, v0.16b\n"
+ "mls v31.4s, v24.4s, v16.4s\n"
+ ".inst 0x4e809593 // sdot v19.4s, v12.16b, v0.16b\n"
+ ".inst 0x4e8296dd // sdot v29.4s, v22.16b, v2.16b\n"
+ "ext v2.16b, v2.16b, v2.16b, #0x1\n"
+ ".inst 0x4e8096de // sdot v30.4s, v22.16b, v0.16b\n"
+ ".inst 0x4e8296dc // sdot v28.4s, v22.16b, v2.16b\n"
+ "sqrdmulh v31.4s, v31.4s, v21.4s\n"
+ "mov v17.16b, v19.16b\n .inst 0x4e829591 // sdot v17.4s, v12.16b, v2.16b\n"
+ ".inst 0x4e879593 // sdot v19.4s, v12.16b, v7.16b\n"
+ "mls v30.4s, v19.4s, v16.4s\n"
+ "mls v29.4s, v18.4s, v16.4s\n"
+ "mls v28.4s, v17.4s, v16.4s\n"
+ "and v17.16b, v31.16b, v20.16b\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sqrdmulh v30.4s, v30.4s, v21.4s\n"
+ "sqrdmulh v29.4s, v29.4s, v21.4s\n"
+ "sqrdmulh v28.4s, v28.4s, v21.4s\n"
+ "sqadd v31.4s, v31.4s, v17.4s\n"
+ "and v19.16b, v30.16b, v20.16b\n"
+ "and v18.16b, v29.16b, v20.16b\n"
+ "and v17.16b, v28.16b, v20.16b\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sqadd v30.4s, v30.4s, v19.4s\n"
+ "sqadd v29.4s, v29.4s, v18.4s\n"
+ "sqadd v28.4s, v28.4s, v17.4s\n"
+ "srshl v31.4s, v31.4s, v20.4s\n"
+ "srshl v30.4s, v30.4s, v20.4s\n"
+ "srshl v29.4s, v29.4s, v20.4s\n"
+ "srshl v28.4s, v28.4s, v20.4s\n"
+ "add v31.4s, v31.4s, v14.4s\n"
+ "add v30.4s, v30.4s, v14.4s\n"
+ "add v29.4s, v29.4s, v14.4s\n"
+ "add v28.4s, v28.4s, v14.4s\n"
+ "smax v31.4s, v31.4s, v13.4s\n"
+ "smax v30.4s, v30.4s, v13.4s\n"
+ "smax v29.4s, v29.4s, v13.4s\n"
+ "smax v28.4s, v28.4s, v13.4s\n"
+ "smin v31.4s, v31.4s, v11.4s\n"
+ "smin v30.4s, v30.4s, v11.4s\n"
+ "smin v29.4s, v29.4s, v11.4s\n"
+ "smin v28.4s, v28.4s, v11.4s\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "blt 28f\n"
+ "str s31, [x25, x27]\n"
+ "str s30, [x24, x27]\n"
+ "str s29, [x23, x27]\n"
+ "str s28, [x22, x27]\n"
+ "b 31f\n"
+ "28:" // Oddments: Unroll 2: Oddment store
+ "add x25, x25, x27\n"
+ "add x24, x24, x27\n"
+ "add x23, x23, x27\n"
+ "add x22, x22, x27\n"
+ "tbz x20, #1, 29f\n"
+ "st1 { v31.h }[0], [x25], #0x2\n"
+ "st1 { v30.h }[0], [x24], #0x2\n"
+ "st1 { v29.h }[0], [x23], #0x2\n"
+ "st1 { v28.h }[0], [x22], #0x2\n"
+ "tbz x20, #0, 30f\n"
+ "st1 { v31.b }[2], [x25], #0x1\n"
+ "st1 { v30.b }[2], [x24], #0x1\n"
+ "st1 { v29.b }[2], [x23], #0x1\n"
+ "st1 { v28.b }[2], [x22], #0x1\n"
+ "b 30f\n"
+ "29:" // Oddments: Unroll 2: Oddment store: Bit 1: Unset
+ "st1 { v31.b }[0], [x25], #0x1\n"
+ "st1 { v30.b }[0], [x24], #0x1\n"
+ "st1 { v29.b }[0], [x23], #0x1\n"
+ "st1 { v28.b }[0], [x22], #0x1\n"
+ "30:" // Oddments: Unroll 2: Oddment store: Bit 1: End
+ "31:" // Oddments: Unroll 2: After oddment store
+ "subs x20, x20, #0x4\n"
+ "add x27, x27, #0x4\n"
+ "ble 35f\n"
+ "ldr q31, [%x[params], #0x0]\n"
+ "ldr q23, [%x[params], #0x10]\n"
+ "movi v22.4s, #0x0\n"
+ ".inst 0x4e8a9596 // sdot v22.4s, v12.16b, v10.16b\n"
+ "ldr q21, [%x[params], #0x20]\n"
+ "ldr q19, [%x[params], #0x30]\n"
+ "mov v30.16b, v31.16b\n"
+ "mov v29.16b, v31.16b\n"
+ "ldr q20, [%x[params], #0x40]\n"
+ "ldr q26, [%x[params], #0x50]\n"
+ "mov v28.16b, v31.16b\n"
+ ".inst 0x4e8896ff // sdot v31.4s, v23.16b, v8.16b\n"
+ ".inst 0x4e859596 // sdot v22.4s, v12.16b, v5.16b\n"
+ ".inst 0x4e8a96fd // sdot v29.4s, v23.16b, v10.16b\n"
+ "movi v18.4s, #0x0\n"
+ "add %x[params], %x[params], #0x60\n"
+ ".inst 0x4e8a96bf // sdot v31.4s, v21.16b, v10.16b\n"
+ "mov v17.16b, v22.16b\n .inst 0x4e899591 // sdot v17.4s, v12.16b, v9.16b\n"
+ "ext v10.16b, v10.16b, v10.16b, #0x1\n"
+ ".inst 0x4e889596 // sdot v22.4s, v12.16b, v8.16b\n"
+ "ext v8.16b, v8.16b, v8.16b, #0x1\n"
+ ".inst 0x4e8896fe // sdot v30.4s, v23.16b, v8.16b\n"
+ ".inst 0x4e8a96fc // sdot v28.4s, v23.16b, v10.16b\n"
+ ".inst 0x4e8a9592 // sdot v18.4s, v12.16b, v10.16b\n"
+ ".inst 0x4e8596bd // sdot v29.4s, v21.16b, v5.16b\n"
+ ".inst 0x4e85967f // sdot v31.4s, v19.16b, v5.16b\n"
+ "ext v5.16b, v5.16b, v5.16b, #0x1\n"
+ ".inst 0x4e8a96be // sdot v30.4s, v21.16b, v10.16b\n"
+ ".inst 0x4e8596bc // sdot v28.4s, v21.16b, v5.16b\n"
+ "mls v31.4s, v22.4s, v16.4s\n"
+ ".inst 0x4e859592 // sdot v18.4s, v12.16b, v5.16b\n"
+ ".inst 0x4e89967d // sdot v29.4s, v19.16b, v9.16b\n"
+ "ext v9.16b, v9.16b, v9.16b, #0x1\n"
+ ".inst 0x4e85967e // sdot v30.4s, v19.16b, v5.16b\n"
+ ".inst 0x4e89967c // sdot v28.4s, v19.16b, v9.16b\n"
+ "sqrdmulh v31.4s, v31.4s, v20.4s\n"
+ "mov v7.16b, v18.16b\n .inst 0x4e899587 // sdot v7.4s, v12.16b, v9.16b\n"
+ ".inst 0x4e889592 // sdot v18.4s, v12.16b, v8.16b\n"
+ "mls v30.4s, v18.4s, v16.4s\n"
+ "mls v29.4s, v17.4s, v16.4s\n"
+ "mls v28.4s, v7.4s, v16.4s\n"
+ "and v16.16b, v31.16b, v26.16b\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqrdmulh v30.4s, v30.4s, v20.4s\n"
+ "sqrdmulh v29.4s, v29.4s, v20.4s\n"
+ "sqrdmulh v28.4s, v28.4s, v20.4s\n"
+ "sqadd v31.4s, v31.4s, v16.4s\n"
+ "and v18.16b, v30.16b, v26.16b\n"
+ "and v17.16b, v29.16b, v26.16b\n"
+ "and v16.16b, v28.16b, v26.16b\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqadd v30.4s, v30.4s, v18.4s\n"
+ "sqadd v29.4s, v29.4s, v17.4s\n"
+ "sqadd v28.4s, v28.4s, v16.4s\n"
+ "srshl v31.4s, v31.4s, v26.4s\n"
+ "srshl v30.4s, v30.4s, v26.4s\n"
+ "srshl v29.4s, v29.4s, v26.4s\n"
+ "srshl v28.4s, v28.4s, v26.4s\n"
+ "add v31.4s, v31.4s, v14.4s\n"
+ "add v30.4s, v30.4s, v14.4s\n"
+ "add v29.4s, v29.4s, v14.4s\n"
+ "add v28.4s, v28.4s, v14.4s\n"
+ "smax v31.4s, v31.4s, v13.4s\n"
+ "smax v30.4s, v30.4s, v13.4s\n"
+ "smax v29.4s, v29.4s, v13.4s\n"
+ "smax v28.4s, v28.4s, v13.4s\n"
+ "smin v31.4s, v31.4s, v11.4s\n"
+ "smin v30.4s, v30.4s, v11.4s\n"
+ "smin v29.4s, v29.4s, v11.4s\n"
+ "smin v28.4s, v28.4s, v11.4s\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "32:" // Oddments: Unroll 3: Oddment store
+ "add x25, x25, x27\n"
+ "add x24, x24, x27\n"
+ "add x23, x23, x27\n"
+ "add x22, x22, x27\n"
+ "tbz x20, #1, 33f\n"
+ "st1 { v31.h }[0], [x25], #0x2\n"
+ "st1 { v30.h }[0], [x24], #0x2\n"
+ "st1 { v29.h }[0], [x23], #0x2\n"
+ "st1 { v28.h }[0], [x22], #0x2\n"
+ "tbz x20, #0, 34f\n"
+ "st1 { v31.b }[2], [x25], #0x1\n"
+ "st1 { v30.b }[2], [x24], #0x1\n"
+ "st1 { v29.b }[2], [x23], #0x1\n"
+ "st1 { v28.b }[2], [x22], #0x1\n"
+ "b 34f\n"
+ "33:" // Oddments: Unroll 3: Oddment store: Bit 1: Unset
+ "st1 { v31.b }[0], [x25], #0x1\n"
+ "st1 { v30.b }[0], [x24], #0x1\n"
+ "st1 { v29.b }[0], [x23], #0x1\n"
+ "st1 { v28.b }[0], [x22], #0x1\n"
+ "34:" // Oddments: Unroll 3: Oddment store: Bit 1: End
+ "35:" // End
+ : [params] "+&r" (params)
+ : [inptrs] "r" (inptrs), [n_channels] "r" (n_channels), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [outptrs] "r" (outptrs), [qp] "r" (&qp)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
new file mode 100644
index 0000000000..874b18c145
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2021-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "utils.hpp"
+#include "src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl(
+ const unsigned int,
+ const int8_t *const *const,
+ const int8_t *const,
+ const int32_t *const,
+ const arm_gemm::Requantize32 &,
+ const int32_t *const,
+ const int32_t *const,
+ int8_t *const *const
+);
+
+class a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy<int8_t, int8_t, int8_t, int32_t> // s8 input/weights/output, s32 accumulators
+{ // Strategy wrapper: describes the 3x3 stride-1 s8q NHWC MLA kernel (2x2 output tile) to the depthfirst driver.
+  using Parent = DepthwiseDepthfirstStrategy<int8_t, int8_t, int8_t, int32_t>; // shorthand for the base strategy type
+
+  public:
+  constexpr static unsigned int kernel_rows = 3; // 3x3 depthwise filter window
+  constexpr static unsigned int kernel_cols = 3;
+
+  constexpr static unsigned int stride_rows = 1; // unit stride in both spatial dimensions
+  constexpr static unsigned int stride_cols = 1;
+
+  a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst(const CPUInfo *) : Parent(2, 2, 3, 3, 1, 1) {} // Parent(out_rows=2, out_cols=2, kern_rows=3, kern_cols=3, stride_rows=1, stride_cols=1); CPUInfo is unused here
+
+  arm_gemm::VLType get_vl_type(void) const override { return arm_gemm::VLType::None; } // fixed-length vectors (plain A64 NEON, not SVE)
+
+  Parent::KernelType kernel = a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl; // the assembly kernel declared above
+  Parent::KernelType get_kernel(void) const override { return kernel; } // hand the driver the kernel entry point
+  unsigned int get_accumulator_depth_vl(void) const override { return 2; } // kernel accumulates two vector-lengths of channels per pass -- presumably matches the impl's 8-channel main loop; confirm against generic.cpp
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp
new file mode 100644
index 0000000000..4626007afa
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp
@@ -0,0 +1,1166 @@
+/*
+ * Copyright (c) 2021-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_gemm.hpp"
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl(
+ const unsigned int n_channels,
+ const int8_t *const *const inptrs,
+ const int8_t *const weights,
+ const int32_t *const bias,
+ const arm_gemm::Requantize32 &qp,
+ const int32_t *const requant_muls,
+ const int32_t *const requant_shifts,
+ int8_t *const *const outptrs
+)
+{
+ struct Params
+ {
+ uint64_t n_channels;
+ const void *weights;
+ const int32_t *bias;
+ const arm_gemm::Requantize32 *requant;
+ const int32_t *const requant_muls;
+ const int32_t *const requant_shifts;
+ int8_t *const *const outptrs;
+ const int8_t *inptrs[16];
+
+ Params(
+ long unsigned int n_channels,
+ const int8_t *const *inptrs_raw,
+ const void *const weights,
+ const int32_t *const bias,
+ const arm_gemm::Requantize32 &qp,
+ const int32_t *const requant_muls,
+ const int32_t *const requant_shifts,
+ int8_t *const *outptrs
+ ) : n_channels(n_channels), weights(weights), bias(bias),
+ requant(&qp), requant_muls(requant_muls),
+ requant_shifts(requant_shifts), outptrs(outptrs)
+ {
+ inptrs[0] = inptrs_raw[5];
+ inptrs[1] = inptrs_raw[0];
+ inptrs[2] = inptrs_raw[3];
+ inptrs[3] = inptrs_raw[6];
+ inptrs[4] = inptrs_raw[9];
+ inptrs[5] = inptrs_raw[12];
+ inptrs[6] = inptrs_raw[15];
+ inptrs[7] = inptrs_raw[1];
+ inptrs[8] = inptrs_raw[2];
+ inptrs[9] = inptrs_raw[10];
+ inptrs[10] = inptrs_raw[4];
+ inptrs[11] = inptrs_raw[7];
+ inptrs[12] = inptrs_raw[8];
+ inptrs[13] = inptrs_raw[11];
+ inptrs[14] = inptrs_raw[13];
+ inptrs[15] = inptrs_raw[14];
+
+ }
+ };
+
+ const Params params(n_channels, inptrs, weights, bias, qp,
+ requant_muls, requant_shifts, outptrs);
+
+ __asm__ __volatile__(
+ "ldr x7, [%x[params], %[offsetof_Params_n_channels]]\n"
+ "ldr x23, [%x[params], %[offsetof_Params_requant]]\n"
+ "lsr x8, x7, #0x3\n"
+ "add x20, x23, %[offsetof_Requantize32_a_offset]\n"
+ "ld1r { v14.16b }, [x20]\n"
+ "ldr x22, [%x[params], %[offsetof_Params_outptrs]]\n"
+ "add x21, x23, %[offsetof_Requantize32_b_offset]\n"
+ "add x20, x23, %[offsetof_Requantize32_c_offset]\n"
+ "ld1r { v19.16b }, [x21]\n"
+ "ld1r { v13.8h }, [x20]\n"
+ "add x21, x23, %[offsetof_Requantize32_minval]\n"
+ "add x20, x23, %[offsetof_Requantize32_maxval]\n"
+ "ld1r { v29.8h }, [x21]\n"
+ "ld1r { v12.8h }, [x20]\n"
+ "mov x17, #0x0\n"
+ "mov x16, #0x0\n"
+ "add x15, %x[params], %[offsetof_Params_inptrs]\n"
+ "ldr x14, [%x[params], %[offsetof_Params_weights]]\n"
+ "ldr x13, [%x[params], %[offsetof_Params_requant_muls]]\n"
+ "ldr x12, [%x[params], %[offsetof_Params_requant_shifts]]\n"
+ "ldp x11, x10, [x22, #0x0]\n"
+ "ldp x9, x28, [x22, #0x10]\n"
+ "cbz x8, 3f\n"
+ "ldr d23, [x14, #0x0]\n"
+ "ldr d16, [x14, #0x8]\n"
+ "subs x8, x8, #0x1\n"
+ "ssubl v23.8h, v23.8b, v19.8b\n"
+ "ldr d1, [x14, #0x10]\n"
+ "ldr d5, [x14, #0x18]\n"
+ "ssubl v16.8h, v16.8b, v19.8b\n"
+ "ssubl v1.8h, v1.8b, v19.8b\n"
+ "ldr d26, [x14, #0x20]\n"
+ "ldr d18, [x14, #0x28]\n"
+ "ssubl v5.8h, v5.8b, v19.8b\n"
+ "ssubl v26.8h, v26.8b, v19.8b\n"
+ "ldr d31, [x14, #0x30]\n"
+ "ldr d25, [x14, #0x38]\n"
+ "ssubl v18.8h, v18.8b, v19.8b\n"
+ "ssubl v31.8h, v31.8b, v19.8b\n"
+ "ldr d20, [x14, #0x40]\n"
+ "ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "ssubl v25.8h, v25.8b, v19.8b\n"
+ "ssubl v20.8h, v20.8b, v19.8b\n"
+ "ldr q9, [x20, #0x0]\n"
+ "ldr q24, [x20, #0x10]\n"
+ "add x20, x20, #0x20\n"
+ "str x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldp x23, x22, [x15, #0x0]\n"
+ "ldp x21, x20, [x15, #0x10]\n"
+ "mov v7.16b, v9.16b\n"
+ "mov v0.16b, v24.16b\n"
+ "ldr d22, [x23, x17]\n"
+ "ldr d4, [x22, x17]\n"
+ "mov v2.16b, v9.16b\n"
+ "mov v30.16b, v24.16b\n"
+ "ldr d8, [x21, x17]\n"
+ "ldr d27, [x20, x17]\n"
+ "mov v10.16b, v9.16b\n"
+ "mov v6.16b, v24.16b\n"
+ "ldr x20, [x15, #0x20]\n"
+ "ldr d15, [x20, x17]\n"
+ "ssubl v22.8h, v22.8b, v14.8b\n"
+ "ssubl v4.8h, v4.8b, v14.8b\n"
+ "ssubl v8.8h, v8.8b, v14.8b\n"
+ "ssubl v27.8h, v27.8b, v14.8b\n"
+ "ssubl v15.8h, v15.8b, v14.8b\n"
+ "beq 2f\n"
+ "1:" // Loop
+ "ldr q3, [x13, #0x0]\n"
+ "ldr q17, [x12, #0x0]\n"
+ "smlal v9.4s, v22.4h, v26.4h\n"
+ "smlal2 v24.4s, v22.8h, v26.8h\n"
+ "ldr q21, [x13, #0x10]\n"
+ "ldr q28, [x12, #0x10]\n"
+ "smlal v9.4s, v4.4h, v23.4h\n"
+ "smlal v7.4s, v22.4h, v5.4h\n"
+ "ldr x20, [x15, #0x28]\n"
+ "ldr d11, [x20, x17]\n"
+ "smlal v2.4s, v22.4h, v16.4h\n"
+ "smlal v10.4s, v22.4h, v23.4h\n"
+ "smlal2 v24.4s, v4.8h, v23.8h\n"
+ "ldr x20, [x15, #0x38]\n"
+ "ldr d4, [x20, x17]\n"
+ "smlal v9.4s, v27.4h, v18.4h\n"
+ "smlal2 v0.4s, v22.8h, v5.8h\n"
+ "smlal2 v30.4s, v22.8h, v16.8h\n"
+ "ldr x20, [x15, #0x30]\n"
+ "ssubl v11.8h, v11.8b, v14.8b\n"
+ "smlal2 v6.4s, v22.8h, v23.8h\n"
+ "ldr d22, [x20, x17]\n"
+ "smlal v7.4s, v8.4h, v1.4h\n"
+ "ldr x20, [x15, #0x40]\n"
+ "smlal v2.4s, v27.4h, v1.4h\n"
+ "smlal v10.4s, v27.4h, v16.4h\n"
+ "ssubl v4.8h, v4.8b, v14.8b\n"
+ "ldr x27, [x15, #0x48]\n"
+ "smlal2 v24.4s, v27.8h, v18.8h\n"
+ "smlal v9.4s, v15.4h, v25.4h\n"
+ "ssubl v22.8h, v22.8b, v14.8b\n"
+ "ldr x26, [x15, #0x50]\n"
+ "smlal2 v0.4s, v8.8h, v1.8h\n"
+ "ldr d8, [x20, x17]\n"
+ "smlal2 v30.4s, v27.8h, v1.8h\n"
+ "ssubl v8.8h, v8.8b, v14.8b\n"
+ "smlal2 v6.4s, v27.8h, v16.8h\n"
+ "smlal v7.4s, v27.4h, v26.4h\n"
+ "ldr x25, [x15, #0x58]\n"
+ "ldr x24, [x15, #0x60]\n"
+ "smlal v2.4s, v11.4h, v31.4h\n"
+ "smlal v10.4s, v15.4h, v5.4h\n"
+ "ldr x23, [x15, #0x68]\n"
+ "ldr x22, [x15, #0x70]\n"
+ "smlal2 v24.4s, v15.8h, v25.8h\n"
+ "smlal v9.4s, v4.4h, v16.4h\n"
+ "ldr x21, [x15, #0x78]\n"
+ "ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "smlal2 v0.4s, v27.8h, v26.8h\n"
+ "ldr d27, [x27, x17]\n"
+ "smlal2 v30.4s, v11.8h, v31.8h\n"
+ "ldr d11, [x26, x17]\n"
+ "smlal2 v6.4s, v15.8h, v5.8h\n"
+ "smlal v7.4s, v15.4h, v31.4h\n"
+ "ssubl v27.8h, v27.8b, v14.8b\n"
+ "add x14, x14, #0x48\n"
+ "smlal v2.4s, v15.4h, v26.4h\n"
+ "smlal v10.4s, v22.4h, v20.4h\n"
+ "ssubl v11.8h, v11.8b, v14.8b\n"
+ "subs x8, x8, #0x1\n"
+ "smlal2 v24.4s, v4.8h, v16.8h\n"
+ "smlal v9.4s, v8.4h, v1.4h\n"
+ "add x13, x13, #0x20\n"
+ "add x12, x12, #0x20\n"
+ "smlal2 v0.4s, v15.8h, v31.8h\n"
+ "smlal2 v30.4s, v15.8h, v26.8h\n"
+ "ldr d15, [x25, x17]\n"
+ "ssubl v15.8h, v15.8b, v14.8b\n"
+ "smlal2 v6.4s, v22.8h, v20.8h\n"
+ "ldr d22, [x24, x17]\n"
+ "smlal v7.4s, v4.4h, v23.4h\n"
+ "ssubl v22.8h, v22.8b, v14.8b\n"
+ "smlal v2.4s, v27.4h, v18.4h\n"
+ "smlal v10.4s, v27.4h, v26.4h\n"
+ "smlal2 v24.4s, v8.8h, v1.8h\n"
+ "smlal v9.4s, v27.4h, v20.4h\n"
+ "smlal2 v0.4s, v4.8h, v23.8h\n"
+ "ldr d4, [x23, x17]\n"
+ "smlal2 v30.4s, v27.8h, v18.8h\n"
+ "ssubl v4.8h, v4.8b, v14.8b\n"
+ "smlal2 v6.4s, v27.8h, v26.8h\n"
+ "ldr d26, [x22, x17]\n"
+ "smlal v7.4s, v8.4h, v16.4h\n"
+ "ssubl v26.8h, v26.8b, v14.8b\n"
+ "smlal v2.4s, v11.4h, v23.4h\n"
+ "smlal v10.4s, v15.4h, v1.4h\n"
+ "smlal2 v24.4s, v27.8h, v20.8h\n"
+ "smlal v9.4s, v11.4h, v5.4h\n"
+ "smlal2 v0.4s, v8.8h, v16.8h\n"
+ "ldr d8, [x21, x17]\n"
+ "smlal2 v30.4s, v11.8h, v23.8h\n"
+ "ssubl v8.8h, v8.8b, v14.8b\n"
+ "smlal2 v6.4s, v15.8h, v1.8h\n"
+ "smlal v7.4s, v27.4h, v25.4h\n"
+ "add x17, x17, #0x8\n"
+ "smlal v2.4s, v22.4h, v5.4h\n"
+ "smlal v10.4s, v4.4h, v18.4h\n"
+ "smlal2 v24.4s, v11.8h, v5.8h\n"
+ "smlal v9.4s, v22.4h, v31.4h\n"
+ "sqrdmulh v9.4s, v9.4s, v3.4s\n"
+ "smlal2 v0.4s, v27.8h, v25.8h\n"
+ "smlal2 v30.4s, v22.8h, v5.8h\n"
+ "and v27.16b, v9.16b, v17.16b\n"
+ "smlal2 v6.4s, v4.8h, v18.8h\n"
+ "smlal v7.4s, v15.4h, v18.4h\n"
+ "sshr v27.4s, v27.4s, #0x1f\n"
+ "smlal v2.4s, v26.4h, v25.4h\n"
+ "smlal v10.4s, v26.4h, v31.4h\n"
+ "sqadd v9.4s, v9.4s, v27.4s\n"
+ "smlal2 v24.4s, v22.8h, v31.8h\n"
+ "smlal2 v0.4s, v15.8h, v18.8h\n"
+ "sqrdmulh v24.4s, v24.4s, v21.4s\n"
+ "smlal2 v30.4s, v26.8h, v25.8h\n"
+ "smlal2 v6.4s, v26.8h, v31.8h\n"
+ "and v31.16b, v24.16b, v28.16b\n"
+ "smlal v7.4s, v4.4h, v20.4h\n"
+ "smlal v2.4s, v8.4h, v20.4h\n"
+ "sqrdmulh v7.4s, v7.4s, v3.4s\n"
+ "smlal v10.4s, v8.4h, v25.4h\n"
+ "smlal2 v0.4s, v4.8h, v20.8h\n"
+ "sqrdmulh v2.4s, v2.4s, v3.4s\n"
+ "smlal2 v30.4s, v8.8h, v20.8h\n"
+ "smlal2 v6.4s, v8.8h, v25.8h\n"
+ "sqrdmulh v10.4s, v10.4s, v3.4s\n"
+ "sshr v31.4s, v31.4s, #0x1f\n"
+ "and v22.16b, v7.16b, v17.16b\n"
+ "sqrdmulh v0.4s, v0.4s, v21.4s\n"
+ "and v3.16b, v2.16b, v17.16b\n"
+ "sqrdmulh v30.4s, v30.4s, v21.4s\n"
+ "and v11.16b, v10.16b, v17.16b\n"
+ "sqrdmulh v6.4s, v6.4s, v21.4s\n"
+ "sqadd v24.4s, v24.4s, v31.4s\n"
+ "sshr v22.4s, v22.4s, #0x1f\n"
+ "and v20.16b, v0.16b, v28.16b\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "and v31.16b, v30.16b, v28.16b\n"
+ "sshr v11.4s, v11.4s, #0x1f\n"
+ "and v18.16b, v6.16b, v28.16b\n"
+ "sqadd v7.4s, v7.4s, v22.4s\n"
+ "sshr v20.4s, v20.4s, #0x1f\n"
+ "sqadd v2.4s, v2.4s, v3.4s\n"
+ "sshr v31.4s, v31.4s, #0x1f\n"
+ "sqadd v10.4s, v10.4s, v11.4s\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "srshl v9.4s, v9.4s, v17.4s\n"
+ "srshl v7.4s, v7.4s, v17.4s\n"
+ "sqadd v0.4s, v0.4s, v20.4s\n"
+ "srshl v2.4s, v2.4s, v17.4s\n"
+ "sqadd v30.4s, v30.4s, v31.4s\n"
+ "srshl v10.4s, v10.4s, v17.4s\n"
+ "sqadd v6.4s, v6.4s, v18.4s\n"
+ "srshl v24.4s, v24.4s, v28.4s\n"
+ "sqxtn v9.4h, v9.4s\n"
+ "srshl v0.4s, v0.4s, v28.4s\n"
+ "sqxtn v7.4h, v7.4s\n"
+ "srshl v30.4s, v30.4s, v28.4s\n"
+ "sqxtn v2.4h, v2.4s\n"
+ "srshl v6.4s, v6.4s, v28.4s\n"
+ "sqxtn v10.4h, v10.4s\n"
+ "sqxtn2 v9.8h, v24.4s\n"
+ "sqxtn2 v7.8h, v0.4s\n"
+ "sqxtn2 v2.8h, v30.4s\n"
+ "sqxtn2 v10.8h, v6.4s\n"
+ "sqadd v9.8h, v9.8h, v13.8h\n"
+ "sqadd v7.8h, v7.8h, v13.8h\n"
+ "sqadd v2.8h, v2.8h, v13.8h\n"
+ "sqadd v10.8h, v10.8h, v13.8h\n"
+ "smax v9.8h, v9.8h, v29.8h\n"
+ "smax v7.8h, v7.8h, v29.8h\n"
+ "smax v2.8h, v2.8h, v29.8h\n"
+ "smax v10.8h, v10.8h, v29.8h\n"
+ "smin v9.8h, v9.8h, v12.8h\n"
+ "smin v7.8h, v7.8h, v12.8h\n"
+ "smin v2.8h, v2.8h, v12.8h\n"
+ "smin v10.8h, v10.8h, v12.8h\n"
+ "uzp1 v9.16b, v9.16b, v9.16b\n"
+ "str d9, [x11, x16]\n"
+ "uzp1 v7.16b, v7.16b, v7.16b\n"
+ "uzp1 v2.16b, v2.16b, v2.16b\n"
+ "str d7, [x10, x16]\n"
+ "uzp1 v10.16b, v10.16b, v10.16b\n"
+ "str d2, [x9, x16]\n"
+ "str d10, [x28, x16]\n"
+ "ldr q9, [x20, #0x0]\n"
+ "ldr q24, [x20, #0x10]\n"
+ "add x20, x20, #0x20\n"
+ "ldr d23, [x14, #0x0]\n"
+ "ldr d16, [x14, #0x8]\n"
+ "add x16, x16, #0x8\n"
+ "str x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldr d1, [x14, #0x10]\n"
+ "ldr d5, [x14, #0x18]\n"
+ "mov v7.16b, v9.16b\n"
+ "mov v0.16b, v24.16b\n"
+ "ldr d26, [x14, #0x20]\n"
+ "ldr d18, [x14, #0x28]\n"
+ "mov v2.16b, v9.16b\n"
+ "mov v30.16b, v24.16b\n"
+ "ldr d31, [x14, #0x30]\n"
+ "ldr d25, [x14, #0x38]\n"
+ "mov v10.16b, v9.16b\n"
+ "mov v6.16b, v24.16b\n"
+ "ldr d20, [x14, #0x40]\n"
+ "ldp x23, x22, [x15, #0x0]\n"
+ "ssubl v23.8h, v23.8b, v19.8b\n"
+ "ssubl v16.8h, v16.8b, v19.8b\n"
+ "ldp x21, x20, [x15, #0x10]\n"
+ "ldr d22, [x23, x17]\n"
+ "ssubl v1.8h, v1.8b, v19.8b\n"
+ "ssubl v5.8h, v5.8b, v19.8b\n"
+ "ldr d4, [x22, x17]\n"
+ "ldr d8, [x21, x17]\n"
+ "ssubl v26.8h, v26.8b, v19.8b\n"
+ "ssubl v18.8h, v18.8b, v19.8b\n"
+ "ldr d27, [x20, x17]\n"
+ "ldr x20, [x15, #0x20]\n"
+ "ssubl v31.8h, v31.8b, v19.8b\n"
+ "ssubl v25.8h, v25.8b, v19.8b\n"
+ "ldr d15, [x20, x17]\n"
+ "ssubl v20.8h, v20.8b, v19.8b\n"
+ "ssubl v22.8h, v22.8b, v14.8b\n"
+ "ssubl v4.8h, v4.8b, v14.8b\n"
+ "ssubl v8.8h, v8.8b, v14.8b\n"
+ "ssubl v27.8h, v27.8b, v14.8b\n"
+ "ssubl v15.8h, v15.8b, v14.8b\n"
+ "bgt 1b\n"
+ "2:" // Tail
+ "ldr q28, [x13, #0x0]\n"
+ "ldr q17, [x12, #0x0]\n"
+ "smlal v9.4s, v22.4h, v26.4h\n"
+ "smlal2 v24.4s, v22.8h, v26.8h\n"
+ "ldr q21, [x13, #0x10]\n"
+ "ldr q3, [x12, #0x10]\n"
+ "smlal v9.4s, v4.4h, v23.4h\n"
+ "smlal v7.4s, v22.4h, v5.4h\n"
+ "ldr x20, [x15, #0x28]\n"
+ "ldr d11, [x20, x17]\n"
+ "smlal v2.4s, v22.4h, v16.4h\n"
+ "smlal v10.4s, v22.4h, v23.4h\n"
+ "smlal2 v24.4s, v4.8h, v23.8h\n"
+ "ldr x20, [x15, #0x38]\n"
+ "ldr d4, [x20, x17]\n"
+ "smlal v9.4s, v27.4h, v18.4h\n"
+ "smlal2 v0.4s, v22.8h, v5.8h\n"
+ "smlal2 v30.4s, v22.8h, v16.8h\n"
+ "ldr x20, [x15, #0x30]\n"
+ "ssubl v11.8h, v11.8b, v14.8b\n"
+ "smlal2 v6.4s, v22.8h, v23.8h\n"
+ "ldr d22, [x20, x17]\n"
+ "smlal v7.4s, v8.4h, v1.4h\n"
+ "ldr x20, [x15, #0x40]\n"
+ "smlal v2.4s, v27.4h, v1.4h\n"
+ "smlal v10.4s, v27.4h, v16.4h\n"
+ "ssubl v4.8h, v4.8b, v14.8b\n"
+ "ldr x26, [x15, #0x48]\n"
+ "smlal2 v24.4s, v27.8h, v18.8h\n"
+ "smlal v9.4s, v15.4h, v25.4h\n"
+ "ssubl v22.8h, v22.8b, v14.8b\n"
+ "ldr x25, [x15, #0x50]\n"
+ "smlal2 v0.4s, v8.8h, v1.8h\n"
+ "ldr d8, [x20, x17]\n"
+ "smlal2 v30.4s, v27.8h, v1.8h\n"
+ "ssubl v8.8h, v8.8b, v14.8b\n"
+ "smlal2 v6.4s, v27.8h, v16.8h\n"
+ "smlal v7.4s, v27.4h, v26.4h\n"
+ "ldr x24, [x15, #0x58]\n"
+ "ldr x23, [x15, #0x60]\n"
+ "smlal v2.4s, v11.4h, v31.4h\n"
+ "smlal v10.4s, v15.4h, v5.4h\n"
+ "ldr x22, [x15, #0x68]\n"
+ "ldr x21, [x15, #0x70]\n"
+ "smlal2 v24.4s, v15.8h, v25.8h\n"
+ "smlal v9.4s, v4.4h, v16.4h\n"
+ "ldr x20, [x15, #0x78]\n"
+ "tst x7, #0x7\n"
+ "smlal2 v0.4s, v27.8h, v26.8h\n"
+ "ldr d27, [x26, x17]\n"
+ "smlal2 v30.4s, v11.8h, v31.8h\n"
+ "ldr d11, [x25, x17]\n"
+ "smlal2 v6.4s, v15.8h, v5.8h\n"
+ "smlal v7.4s, v15.4h, v31.4h\n"
+ "ssubl v27.8h, v27.8b, v14.8b\n"
+ "add x13, x13, #0x20\n"
+ "smlal v2.4s, v15.4h, v26.4h\n"
+ "smlal v10.4s, v22.4h, v20.4h\n"
+ "ssubl v11.8h, v11.8b, v14.8b\n"
+ "add x12, x12, #0x20\n"
+ "smlal2 v24.4s, v4.8h, v16.8h\n"
+ "smlal v9.4s, v8.4h, v1.4h\n"
+ "smlal2 v0.4s, v15.8h, v31.8h\n"
+ "smlal2 v30.4s, v15.8h, v26.8h\n"
+ "ldr d15, [x24, x17]\n"
+ "ssubl v15.8h, v15.8b, v14.8b\n"
+ "smlal2 v6.4s, v22.8h, v20.8h\n"
+ "ldr d22, [x23, x17]\n"
+ "smlal v7.4s, v4.4h, v23.4h\n"
+ "ssubl v22.8h, v22.8b, v14.8b\n"
+ "smlal v2.4s, v27.4h, v18.4h\n"
+ "smlal v10.4s, v27.4h, v26.4h\n"
+ "smlal2 v24.4s, v8.8h, v1.8h\n"
+ "smlal v9.4s, v27.4h, v20.4h\n"
+ "smlal2 v0.4s, v4.8h, v23.8h\n"
+ "ldr d4, [x22, x17]\n"
+ "smlal2 v30.4s, v27.8h, v18.8h\n"
+ "ssubl v4.8h, v4.8b, v14.8b\n"
+ "smlal2 v6.4s, v27.8h, v26.8h\n"
+ "ldr d26, [x21, x17]\n"
+ "smlal v7.4s, v8.4h, v16.4h\n"
+ "ssubl v26.8h, v26.8b, v14.8b\n"
+ "smlal v2.4s, v11.4h, v23.4h\n"
+ "smlal v10.4s, v15.4h, v1.4h\n"
+ "smlal2 v24.4s, v27.8h, v20.8h\n"
+ "smlal v9.4s, v11.4h, v5.4h\n"
+ "smlal2 v0.4s, v8.8h, v16.8h\n"
+ "ldr d16, [x20, x17]\n"
+ "smlal2 v30.4s, v11.8h, v23.8h\n"
+ "ssubl v16.8h, v16.8b, v14.8b\n"
+ "smlal2 v6.4s, v15.8h, v1.8h\n"
+ "smlal v7.4s, v27.4h, v25.4h\n"
+ "add x17, x17, #0x8\n"
+ "smlal v2.4s, v22.4h, v5.4h\n"
+ "smlal v10.4s, v4.4h, v18.4h\n"
+ "smlal2 v24.4s, v11.8h, v5.8h\n"
+ "smlal v9.4s, v22.4h, v31.4h\n"
+ "sqrdmulh v9.4s, v9.4s, v28.4s\n"
+ "smlal2 v0.4s, v27.8h, v25.8h\n"
+ "smlal2 v30.4s, v22.8h, v5.8h\n"
+ "and v1.16b, v9.16b, v17.16b\n"
+ "smlal2 v6.4s, v4.8h, v18.8h\n"
+ "smlal v7.4s, v15.4h, v18.4h\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "smlal v2.4s, v26.4h, v25.4h\n"
+ "smlal v10.4s, v26.4h, v31.4h\n"
+ "sqadd v9.4s, v9.4s, v1.4s\n"
+ "smlal2 v24.4s, v22.8h, v31.8h\n"
+ "smlal2 v0.4s, v15.8h, v18.8h\n"
+ "sqrdmulh v24.4s, v24.4s, v21.4s\n"
+ "smlal2 v30.4s, v26.8h, v25.8h\n"
+ "smlal2 v6.4s, v26.8h, v31.8h\n"
+ "and v31.16b, v24.16b, v3.16b\n"
+ "smlal v7.4s, v4.4h, v20.4h\n"
+ "smlal v2.4s, v16.4h, v20.4h\n"
+ "sqrdmulh v7.4s, v7.4s, v28.4s\n"
+ "smlal v10.4s, v16.4h, v25.4h\n"
+ "smlal2 v0.4s, v4.8h, v20.8h\n"
+ "sqrdmulh v2.4s, v2.4s, v28.4s\n"
+ "smlal2 v30.4s, v16.8h, v20.8h\n"
+ "smlal2 v6.4s, v16.8h, v25.8h\n"
+ "sqrdmulh v10.4s, v10.4s, v28.4s\n"
+ "sshr v31.4s, v31.4s, #0x1f\n"
+ "and v22.16b, v7.16b, v17.16b\n"
+ "sqrdmulh v0.4s, v0.4s, v21.4s\n"
+ "and v15.16b, v2.16b, v17.16b\n"
+ "sqrdmulh v30.4s, v30.4s, v21.4s\n"
+ "and v11.16b, v10.16b, v17.16b\n"
+ "sqrdmulh v6.4s, v6.4s, v21.4s\n"
+ "sqadd v24.4s, v24.4s, v31.4s\n"
+ "sshr v22.4s, v22.4s, #0x1f\n"
+ "and v18.16b, v0.16b, v3.16b\n"
+ "sshr v15.4s, v15.4s, #0x1f\n"
+ "and v23.16b, v30.16b, v3.16b\n"
+ "sshr v11.4s, v11.4s, #0x1f\n"
+ "and v21.16b, v6.16b, v3.16b\n"
+ "sqadd v7.4s, v7.4s, v22.4s\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sqadd v2.4s, v2.4s, v15.4s\n"
+ "sshr v23.4s, v23.4s, #0x1f\n"
+ "sqadd v10.4s, v10.4s, v11.4s\n"
+ "sshr v21.4s, v21.4s, #0x1f\n"
+ "srshl v9.4s, v9.4s, v17.4s\n"
+ "srshl v7.4s, v7.4s, v17.4s\n"
+ "sqadd v0.4s, v0.4s, v18.4s\n"
+ "srshl v2.4s, v2.4s, v17.4s\n"
+ "sqadd v30.4s, v30.4s, v23.4s\n"
+ "srshl v10.4s, v10.4s, v17.4s\n"
+ "sqadd v6.4s, v6.4s, v21.4s\n"
+ "srshl v24.4s, v24.4s, v3.4s\n"
+ "sqxtn v9.4h, v9.4s\n"
+ "srshl v0.4s, v0.4s, v3.4s\n"
+ "sqxtn v7.4h, v7.4s\n"
+ "srshl v30.4s, v30.4s, v3.4s\n"
+ "sqxtn v2.4h, v2.4s\n"
+ "srshl v6.4s, v6.4s, v3.4s\n"
+ "sqxtn v10.4h, v10.4s\n"
+ "sqxtn2 v9.8h, v24.4s\n"
+ "sqxtn2 v7.8h, v0.4s\n"
+ "sqxtn2 v2.8h, v30.4s\n"
+ "sqxtn2 v10.8h, v6.4s\n"
+ "sqadd v9.8h, v9.8h, v13.8h\n"
+ "sqadd v7.8h, v7.8h, v13.8h\n"
+ "sqadd v2.8h, v2.8h, v13.8h\n"
+ "sqadd v10.8h, v10.8h, v13.8h\n"
+ "smax v9.8h, v9.8h, v29.8h\n"
+ "smax v7.8h, v7.8h, v29.8h\n"
+ "smax v2.8h, v2.8h, v29.8h\n"
+ "smax v10.8h, v10.8h, v29.8h\n"
+ "smin v9.8h, v9.8h, v12.8h\n"
+ "smin v7.8h, v7.8h, v12.8h\n"
+ "smin v2.8h, v2.8h, v12.8h\n"
+ "smin v10.8h, v10.8h, v12.8h\n"
+ "uzp1 v9.16b, v9.16b, v9.16b\n"
+ "str d9, [x11, x16]\n"
+ "uzp1 v7.16b, v7.16b, v7.16b\n"
+ "uzp1 v2.16b, v2.16b, v2.16b\n"
+ "str d7, [x10, x16]\n"
+ "uzp1 v10.16b, v10.16b, v10.16b\n"
+ "str d2, [x9, x16]\n"
+ "str d10, [x28, x16]\n"
+ "add x16, x16, #0x8\n"
+ "beq 64f\n"
+ "add x14, x14, #0x48\n"
+ "3:" // Oddments
+ "ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "tbz x7, #2, 5f\n"
+ "ld1 { v9.4s }, [x20], #0x10\n"
+ "tbz x7, #1, 4f\n"
+ "ld1 { v24.d }[0], [x20], #0x8\n"
+ "tbz x7, #0, 7f\n"
+ "ld1 { v24.s }[2], [x20]\n"
+ "b 7f\n"
+ "4:" // Oddments: Load bias: Bit 2: Bit 1: Unset
+ "tbz x7, #0, 7f\n"
+ "ld1 { v24.s }[0], [x20]\n"
+ "b 7f\n"
+ "5:" // Oddments: Load bias: Bit 2: Unset
+ "tbz x7, #1, 6f\n"
+ "ld1 { v9.d }[0], [x20], #0x8\n"
+ "tbz x7, #0, 7f\n"
+ "ld1 { v9.s }[2], [x20]\n"
+ "b 7f\n"
+ "6:" // Oddments: Load bias: Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 7f\n"
+ "ld1 { v9.s }[0], [x20]\n"
+ "7:" // Oddments: Load bias: Bit 2: End
+ "ldr d23, [x14, #0x0]\n"
+ "ldr d16, [x14, #0x8]\n"
+ "mov v7.16b, v9.16b\n"
+ "mov v0.16b, v24.16b\n"
+ "ldr d1, [x14, #0x10]\n"
+ "ldr d5, [x14, #0x18]\n"
+ "mov v2.16b, v9.16b\n"
+ "mov v30.16b, v24.16b\n"
+ "ldr d26, [x14, #0x20]\n"
+ "ldr d18, [x14, #0x28]\n"
+ "mov v10.16b, v9.16b\n"
+ "mov v6.16b, v24.16b\n"
+ "ldr d31, [x14, #0x30]\n"
+ "ldr d25, [x14, #0x38]\n"
+ "ssubl v23.8h, v23.8b, v19.8b\n"
+ "ssubl v16.8h, v16.8b, v19.8b\n"
+ "ldr d20, [x14, #0x40]\n"
+ "ldp x24, x23, [x15, #0x0]\n"
+ "ssubl v1.8h, v1.8b, v19.8b\n"
+ "ssubl v5.8h, v5.8b, v19.8b\n"
+ "ldp x22, x21, [x15, #0x10]\n"
+ "ldr x20, [x15, #0x20]\n"
+ "ssubl v26.8h, v26.8b, v19.8b\n"
+ "ssubl v18.8h, v18.8b, v19.8b\n"
+ "ssubl v31.8h, v31.8b, v19.8b\n"
+ "ssubl v25.8h, v25.8b, v19.8b\n"
+ "ssubl v20.8h, v20.8b, v19.8b\n"
+ "add x24, x24, x17\n"
+ "add x23, x23, x17\n"
+ "add x22, x22, x17\n"
+ "add x21, x21, x17\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 9f\n"
+ "ld1 { v22.s }[0], [x24], #0x4\n"
+ "ld1 { v4.s }[0], [x23], #0x4\n"
+ "ld1 { v8.s }[0], [x22], #0x4\n"
+ "ld1 { v27.s }[0], [x21], #0x4\n"
+ "ld1 { v15.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 8f\n"
+ "ld1 { v22.h }[2], [x24], #0x2\n"
+ "ld1 { v4.h }[2], [x23], #0x2\n"
+ "ld1 { v8.h }[2], [x22], #0x2\n"
+ "ld1 { v27.h }[2], [x21], #0x2\n"
+ "ld1 { v15.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 11f\n"
+ "ld1 { v22.b }[6], [x24]\n"
+ "ld1 { v4.b }[6], [x23]\n"
+ "ld1 { v8.b }[6], [x22]\n"
+ "ld1 { v27.b }[6], [x21]\n"
+ "ld1 { v15.b }[6], [x20]\n"
+ "b 11f\n"
+ "8:" // Oddments: Initial loads: Bit 2: Bit 1: Unset
+ "tbz x7, #0, 11f\n"
+ "ld1 { v22.b }[4], [x24]\n"
+ "ld1 { v4.b }[4], [x23]\n"
+ "ld1 { v8.b }[4], [x22]\n"
+ "ld1 { v27.b }[4], [x21]\n"
+ "ld1 { v15.b }[4], [x20]\n"
+ "b 11f\n"
+ "9:" // Oddments: Initial loads: Bit 2: Unset
+ "tbz x7, #1, 10f\n"
+ "ld1 { v22.h }[0], [x24], #0x2\n"
+ "ld1 { v4.h }[0], [x23], #0x2\n"
+ "ld1 { v8.h }[0], [x22], #0x2\n"
+ "ld1 { v27.h }[0], [x21], #0x2\n"
+ "ld1 { v15.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 11f\n"
+ "ld1 { v22.b }[2], [x24]\n"
+ "ld1 { v4.b }[2], [x23]\n"
+ "ld1 { v8.b }[2], [x22]\n"
+ "ld1 { v27.b }[2], [x21]\n"
+ "ld1 { v15.b }[2], [x20]\n"
+ "b 11f\n"
+ "10:" // Oddments: Initial loads: Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 11f\n"
+ "ld1 { v22.b }[0], [x24]\n"
+ "ld1 { v4.b }[0], [x23]\n"
+ "ld1 { v8.b }[0], [x22]\n"
+ "ld1 { v27.b }[0], [x21]\n"
+ "ld1 { v15.b }[0], [x20]\n"
+ "11:" // Oddments: Initial loads: Bit 2: End
+ "ssubl v22.8h, v22.8b, v14.8b\n"
+ "smlal v9.4s, v22.4h, v26.4h\n"
+ "smlal2 v24.4s, v22.8h, v26.8h\n"
+ "ldr x20, [x15, #0x28]\n"
+ "smlal v7.4s, v22.4h, v5.4h\n"
+ "smlal2 v0.4s, v22.8h, v5.8h\n"
+ "ssubl v4.8h, v4.8b, v14.8b\n"
+ "ssubl v8.8h, v8.8b, v14.8b\n"
+ "smlal v2.4s, v22.4h, v16.4h\n"
+ "smlal2 v30.4s, v22.8h, v16.8h\n"
+ "add x20, x20, x17\n"
+ "smlal v10.4s, v22.4h, v23.4h\n"
+ "smlal2 v6.4s, v22.8h, v23.8h\n"
+ "ssubl v27.8h, v27.8b, v14.8b\n"
+ "smlal v9.4s, v4.4h, v23.4h\n"
+ "smlal2 v24.4s, v4.8h, v23.8h\n"
+ "ssubl v15.8h, v15.8b, v14.8b\n"
+ "smlal v7.4s, v8.4h, v1.4h\n"
+ "smlal2 v0.4s, v8.8h, v1.8h\n"
+ "smlal v9.4s, v27.4h, v18.4h\n"
+ "smlal2 v24.4s, v27.8h, v18.8h\n"
+ "smlal v7.4s, v27.4h, v26.4h\n"
+ "smlal2 v0.4s, v27.8h, v26.8h\n"
+ "smlal v2.4s, v27.4h, v1.4h\n"
+ "smlal2 v30.4s, v27.8h, v1.8h\n"
+ "smlal v10.4s, v27.4h, v16.4h\n"
+ "smlal2 v6.4s, v27.8h, v16.8h\n"
+ "tbz x7, #2, 13f\n"
+ "ld1 { v21.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 12f\n"
+ "ld1 { v21.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 15f\n"
+ "ld1 { v21.b }[6], [x20]\n"
+ "b 15f\n"
+ "12:" // Oddments: Load (3, 0): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 15f\n"
+ "ld1 { v21.b }[4], [x20]\n"
+ "b 15f\n"
+ "13:" // Oddments: Load (3, 0): Bit 2: Unset
+ "tbz x7, #1, 14f\n"
+ "ld1 { v21.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 15f\n"
+ "ld1 { v21.b }[2], [x20]\n"
+ "b 15f\n"
+ "14:" // Oddments: Load (3, 0): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 15f\n"
+ "ld1 { v21.b }[0], [x20]\n"
+ "15:" // Oddments: Load (3, 0): Bit 2: End
+ "ssubl v21.8h, v21.8b, v14.8b\n"
+ "smlal v2.4s, v21.4h, v31.4h\n"
+ "smlal2 v30.4s, v21.8h, v31.8h\n"
+ "ldr x20, [x15, #0x30]\n"
+ "smlal v9.4s, v15.4h, v25.4h\n"
+ "smlal2 v24.4s, v15.8h, v25.8h\n"
+ "add x20, x20, x17\n"
+ "smlal v7.4s, v15.4h, v31.4h\n"
+ "smlal2 v0.4s, v15.8h, v31.8h\n"
+ "smlal v2.4s, v15.4h, v26.4h\n"
+ "smlal2 v30.4s, v15.8h, v26.8h\n"
+ "smlal v10.4s, v15.4h, v5.4h\n"
+ "smlal2 v6.4s, v15.8h, v5.8h\n"
+ "tbz x7, #2, 17f\n"
+ "ld1 { v28.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 16f\n"
+ "ld1 { v28.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 19f\n"
+ "ld1 { v28.b }[6], [x20]\n"
+ "b 19f\n"
+ "16:" // Oddments: Load (3, 3): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 19f\n"
+ "ld1 { v28.b }[4], [x20]\n"
+ "b 19f\n"
+ "17:" // Oddments: Load (3, 3): Bit 2: Unset
+ "tbz x7, #1, 18f\n"
+ "ld1 { v28.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 19f\n"
+ "ld1 { v28.b }[2], [x20]\n"
+ "b 19f\n"
+ "18:" // Oddments: Load (3, 3): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 19f\n"
+ "ld1 { v28.b }[0], [x20]\n"
+ "19:" // Oddments: Load (3, 3): Bit 2: End
+ "ssubl v28.8h, v28.8b, v14.8b\n"
+ "ldr x20, [x15, #0x38]\n"
+ "smlal v10.4s, v28.4h, v20.4h\n"
+ "smlal2 v6.4s, v28.8h, v20.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 21f\n"
+ "ld1 { v22.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 20f\n"
+ "ld1 { v22.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 23f\n"
+ "ld1 { v22.b }[6], [x20]\n"
+ "b 23f\n"
+ "20:" // Oddments: Load (0, 1): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 23f\n"
+ "ld1 { v22.b }[4], [x20]\n"
+ "b 23f\n"
+ "21:" // Oddments: Load (0, 1): Bit 2: Unset
+ "tbz x7, #1, 22f\n"
+ "ld1 { v22.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 23f\n"
+ "ld1 { v22.b }[2], [x20]\n"
+ "b 23f\n"
+ "22:" // Oddments: Load (0, 1): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 23f\n"
+ "ld1 { v22.b }[0], [x20]\n"
+ "23:" // Oddments: Load (0, 1): Bit 2: End
+ "ssubl v22.8h, v22.8b, v14.8b\n"
+ "ldr x20, [x15, #0x40]\n"
+ "smlal v9.4s, v22.4h, v16.4h\n"
+ "smlal2 v24.4s, v22.8h, v16.8h\n"
+ "smlal v7.4s, v22.4h, v23.4h\n"
+ "smlal2 v0.4s, v22.8h, v23.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 25f\n"
+ "ld1 { v21.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 24f\n"
+ "ld1 { v21.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 27f\n"
+ "ld1 { v21.b }[6], [x20]\n"
+ "b 27f\n"
+ "24:" // Oddments: Load (0, 2): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 27f\n"
+ "ld1 { v21.b }[4], [x20]\n"
+ "b 27f\n"
+ "25:" // Oddments: Load (0, 2): Bit 2: Unset
+ "tbz x7, #1, 26f\n"
+ "ld1 { v21.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 27f\n"
+ "ld1 { v21.b }[2], [x20]\n"
+ "b 27f\n"
+ "26:" // Oddments: Load (0, 2): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 27f\n"
+ "ld1 { v21.b }[0], [x20]\n"
+ "27:" // Oddments: Load (0, 2): Bit 2: End
+ "ssubl v21.8h, v21.8b, v14.8b\n"
+ "ldr x20, [x15, #0x48]\n"
+ "smlal v9.4s, v21.4h, v1.4h\n"
+ "smlal2 v24.4s, v21.8h, v1.8h\n"
+ "smlal v7.4s, v21.4h, v16.4h\n"
+ "smlal2 v0.4s, v21.8h, v16.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 29f\n"
+ "ld1 { v28.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 28f\n"
+ "ld1 { v28.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 31f\n"
+ "ld1 { v28.b }[6], [x20]\n"
+ "b 31f\n"
+ "28:" // Oddments: Load (2, 2): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 31f\n"
+ "ld1 { v28.b }[4], [x20]\n"
+ "b 31f\n"
+ "29:" // Oddments: Load (2, 2): Bit 2: Unset
+ "tbz x7, #1, 30f\n"
+ "ld1 { v28.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 31f\n"
+ "ld1 { v28.b }[2], [x20]\n"
+ "b 31f\n"
+ "30:" // Oddments: Load (2, 2): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 31f\n"
+ "ld1 { v28.b }[0], [x20]\n"
+ "31:" // Oddments: Load (2, 2): Bit 2: End
+ "ssubl v28.8h, v28.8b, v14.8b\n"
+ "ldr x20, [x15, #0x50]\n"
+ "smlal v9.4s, v28.4h, v20.4h\n"
+ "smlal2 v24.4s, v28.8h, v20.8h\n"
+ "smlal v7.4s, v28.4h, v25.4h\n"
+ "smlal2 v0.4s, v28.8h, v25.8h\n"
+ "add x20, x20, x17\n"
+ "smlal v2.4s, v28.4h, v18.4h\n"
+ "smlal2 v30.4s, v28.8h, v18.8h\n"
+ "smlal v10.4s, v28.4h, v26.4h\n"
+ "smlal2 v6.4s, v28.8h, v26.8h\n"
+ "tbz x7, #2, 33f\n"
+ "ld1 { v8.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 32f\n"
+ "ld1 { v8.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 35f\n"
+ "ld1 { v8.b }[6], [x20]\n"
+ "b 35f\n"
+ "32:" // Oddments: Load (1, 0): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 35f\n"
+ "ld1 { v8.b }[4], [x20]\n"
+ "b 35f\n"
+ "33:" // Oddments: Load (1, 0): Bit 2: Unset
+ "tbz x7, #1, 34f\n"
+ "ld1 { v8.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 35f\n"
+ "ld1 { v8.b }[2], [x20]\n"
+ "b 35f\n"
+ "34:" // Oddments: Load (1, 0): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 35f\n"
+ "ld1 { v8.b }[0], [x20]\n"
+ "35:" // Oddments: Load (1, 0): Bit 2: End
+ "ssubl v8.8h, v8.8b, v14.8b\n"
+ "ldr x20, [x15, #0x58]\n"
+ "smlal v9.4s, v8.4h, v5.4h\n"
+ "smlal2 v24.4s, v8.8h, v5.8h\n"
+ "smlal v2.4s, v8.4h, v23.4h\n"
+ "smlal2 v30.4s, v8.8h, v23.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 37f\n"
+ "ld1 { v8.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 36f\n"
+ "ld1 { v8.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 39f\n"
+ "ld1 { v8.b }[6], [x20]\n"
+ "b 39f\n"
+ "36:" // Oddments: Load (1, 3): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 39f\n"
+ "ld1 { v8.b }[4], [x20]\n"
+ "b 39f\n"
+ "37:" // Oddments: Load (1, 3): Bit 2: Unset
+ "tbz x7, #1, 38f\n"
+ "ld1 { v8.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 39f\n"
+ "ld1 { v8.b }[2], [x20]\n"
+ "b 39f\n"
+ "38:" // Oddments: Load (1, 3): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 39f\n"
+ "ld1 { v8.b }[0], [x20]\n"
+ "39:" // Oddments: Load (1, 3): Bit 2: End
+ "ssubl v8.8h, v8.8b, v14.8b\n"
+ "ldr x20, [x15, #0x60]\n"
+ "smlal v7.4s, v8.4h, v18.4h\n"
+ "smlal2 v0.4s, v8.8h, v18.8h\n"
+ "smlal v10.4s, v8.4h, v1.4h\n"
+ "smlal2 v6.4s, v8.8h, v1.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 41f\n"
+ "ld1 { v17.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 40f\n"
+ "ld1 { v17.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 43f\n"
+ "ld1 { v17.b }[6], [x20]\n"
+ "b 43f\n"
+ "40:" // Oddments: Load (2, 0): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 43f\n"
+ "ld1 { v17.b }[4], [x20]\n"
+ "b 43f\n"
+ "41:" // Oddments: Load (2, 0): Bit 2: Unset
+ "tbz x7, #1, 42f\n"
+ "ld1 { v17.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 43f\n"
+ "ld1 { v17.b }[2], [x20]\n"
+ "b 43f\n"
+ "42:" // Oddments: Load (2, 0): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 43f\n"
+ "ld1 { v17.b }[0], [x20]\n"
+ "43:" // Oddments: Load (2, 0): Bit 2: End
+ "ssubl v17.8h, v17.8b, v14.8b\n"
+ "ldr x20, [x15, #0x68]\n"
+ "smlal v9.4s, v17.4h, v31.4h\n"
+ "smlal2 v24.4s, v17.8h, v31.8h\n"
+ "smlal v2.4s, v17.4h, v5.4h\n"
+ "smlal2 v30.4s, v17.8h, v5.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 45f\n"
+ "ld1 { v23.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 44f\n"
+ "ld1 { v23.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 47f\n"
+ "ld1 { v23.b }[6], [x20]\n"
+ "b 47f\n"
+ "44:" // Oddments: Load (2, 3): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 47f\n"
+ "ld1 { v23.b }[4], [x20]\n"
+ "b 47f\n"
+ "45:" // Oddments: Load (2, 3): Bit 2: Unset
+ "tbz x7, #1, 46f\n"
+ "ld1 { v23.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 47f\n"
+ "ld1 { v23.b }[2], [x20]\n"
+ "b 47f\n"
+ "46:" // Oddments: Load (2, 3): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 47f\n"
+ "ld1 { v23.b }[0], [x20]\n"
+ "47:" // Oddments: Load (2, 3): Bit 2: End
+ "ssubl v23.8h, v23.8b, v14.8b\n"
+ "ldr x20, [x15, #0x70]\n"
+ "smlal v7.4s, v23.4h, v20.4h\n"
+ "smlal2 v0.4s, v23.8h, v20.8h\n"
+ "smlal v10.4s, v23.4h, v18.4h\n"
+ "smlal2 v6.4s, v23.8h, v18.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 49f\n"
+ "ld1 { v5.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 48f\n"
+ "ld1 { v5.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 51f\n"
+ "ld1 { v5.b }[6], [x20]\n"
+ "b 51f\n"
+ "48:" // Oddments: Load (3, 1): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 51f\n"
+ "ld1 { v5.b }[4], [x20]\n"
+ "b 51f\n"
+ "49:" // Oddments: Load (3, 1): Bit 2: Unset
+ "tbz x7, #1, 50f\n"
+ "ld1 { v5.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 51f\n"
+ "ld1 { v5.b }[2], [x20]\n"
+ "b 51f\n"
+ "50:" // Oddments: Load (3, 1): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 51f\n"
+ "ld1 { v5.b }[0], [x20]\n"
+ "51:" // Oddments: Load (3, 1): Bit 2: End
+ "ssubl v5.8h, v5.8b, v14.8b\n"
+ "ldr x20, [x15, #0x78]\n"
+ "smlal v2.4s, v5.4h, v25.4h\n"
+ "smlal2 v30.4s, v5.8h, v25.8h\n"
+ "smlal v10.4s, v5.4h, v31.4h\n"
+ "smlal2 v6.4s, v5.8h, v31.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 53f\n"
+ "ld1 { v23.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 52f\n"
+ "ld1 { v23.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 55f\n"
+ "ld1 { v23.b }[6], [x20]\n"
+ "b 55f\n"
+ "52:" // Oddments: Load (3, 2): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 55f\n"
+ "ld1 { v23.b }[4], [x20]\n"
+ "b 55f\n"
+ "53:" // Oddments: Load (3, 2): Bit 2: Unset
+ "tbz x7, #1, 54f\n"
+ "ld1 { v23.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 55f\n"
+ "ld1 { v23.b }[2], [x20]\n"
+ "b 55f\n"
+ "54:" // Oddments: Load (3, 2): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 55f\n"
+ "ld1 { v23.b }[0], [x20]\n"
+ "55:" // Oddments: Load (3, 2): Bit 2: End
+ "ssubl v23.8h, v23.8b, v14.8b\n"
+ "smlal v2.4s, v23.4h, v20.4h\n"
+ "smlal2 v30.4s, v23.8h, v20.8h\n"
+ "smlal v10.4s, v23.4h, v25.4h\n"
+ "smlal2 v6.4s, v23.8h, v25.8h\n"
+ "tbz x7, #2, 57f\n"
+ "ld1 { v15.4s }, [x13], #0x10\n"
+ "ld1 { v19.4s }, [x12], #0x10\n"
+ "tbz x7, #1, 56f\n"
+ "ld1 { v18.d }[0], [x13], #0x8\n"
+ "ld1 { v22.d }[0], [x12], #0x8\n"
+ "tbz x7, #0, 59f\n"
+ "ld1 { v18.s }[2], [x13]\n"
+ "ld1 { v22.s }[2], [x12]\n"
+ "b 59f\n"
+ "56:" // Oddments: Load requant params: Bit 2: Bit 1: Unset
+ "tbz x7, #0, 59f\n"
+ "ld1 { v18.s }[0], [x13]\n"
+ "ld1 { v22.s }[0], [x12]\n"
+ "b 59f\n"
+ "57:" // Oddments: Load requant params: Bit 2: Unset
+ "tbz x7, #1, 58f\n"
+ "ld1 { v15.d }[0], [x13], #0x8\n"
+ "ld1 { v19.d }[0], [x12], #0x8\n"
+ "tbz x7, #0, 59f\n"
+ "ld1 { v15.s }[2], [x13]\n"
+ "ld1 { v19.s }[2], [x12]\n"
+ "b 59f\n"
+ "58:" // Oddments: Load requant params: Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 59f\n"
+ "ld1 { v15.s }[0], [x13]\n"
+ "ld1 { v19.s }[0], [x12]\n"
+ "59:" // Oddments: Load requant params: Bit 2: End
+ "sqrdmulh v9.4s, v9.4s, v15.4s\n"
+ "and v17.16b, v9.16b, v19.16b\n"
+ "add x11, x11, x16\n"
+ "add x10, x10, x16\n"
+ "sqrdmulh v24.4s, v24.4s, v18.4s\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "add x9, x9, x16\n"
+ "add x28, x28, x16\n"
+ "and v20.16b, v24.16b, v22.16b\n"
+ "sqrdmulh v7.4s, v7.4s, v15.4s\n"
+ "sqrdmulh v2.4s, v2.4s, v15.4s\n"
+ "sqrdmulh v10.4s, v10.4s, v15.4s\n"
+ "sqadd v9.4s, v9.4s, v17.4s\n"
+ "sshr v20.4s, v20.4s, #0x1f\n"
+ "and v21.16b, v7.16b, v19.16b\n"
+ "sqrdmulh v0.4s, v0.4s, v18.4s\n"
+ "and v15.16b, v2.16b, v19.16b\n"
+ "sqrdmulh v30.4s, v30.4s, v18.4s\n"
+ "and v23.16b, v10.16b, v19.16b\n"
+ "sqrdmulh v6.4s, v6.4s, v18.4s\n"
+ "sqadd v24.4s, v24.4s, v20.4s\n"
+ "sshr v21.4s, v21.4s, #0x1f\n"
+ "and v18.16b, v0.16b, v22.16b\n"
+ "sshr v15.4s, v15.4s, #0x1f\n"
+ "and v17.16b, v30.16b, v22.16b\n"
+ "sshr v23.4s, v23.4s, #0x1f\n"
+ "and v28.16b, v6.16b, v22.16b\n"
+ "sqadd v7.4s, v7.4s, v21.4s\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sqadd v2.4s, v2.4s, v15.4s\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sqadd v10.4s, v10.4s, v23.4s\n"
+ "sshr v28.4s, v28.4s, #0x1f\n"
+ "srshl v9.4s, v9.4s, v19.4s\n"
+ "srshl v7.4s, v7.4s, v19.4s\n"
+ "sqadd v0.4s, v0.4s, v18.4s\n"
+ "srshl v2.4s, v2.4s, v19.4s\n"
+ "sqadd v30.4s, v30.4s, v17.4s\n"
+ "srshl v10.4s, v10.4s, v19.4s\n"
+ "sqadd v6.4s, v6.4s, v28.4s\n"
+ "srshl v24.4s, v24.4s, v22.4s\n"
+ "sqxtn v9.4h, v9.4s\n"
+ "srshl v0.4s, v0.4s, v22.4s\n"
+ "sqxtn v7.4h, v7.4s\n"
+ "srshl v30.4s, v30.4s, v22.4s\n"
+ "sqxtn v2.4h, v2.4s\n"
+ "srshl v6.4s, v6.4s, v22.4s\n"
+ "sqxtn v10.4h, v10.4s\n"
+ "sqxtn2 v9.8h, v24.4s\n"
+ "sqxtn2 v7.8h, v0.4s\n"
+ "sqxtn2 v2.8h, v30.4s\n"
+ "sqxtn2 v10.8h, v6.4s\n"
+ "sqadd v9.8h, v9.8h, v13.8h\n"
+ "sqadd v7.8h, v7.8h, v13.8h\n"
+ "sqadd v2.8h, v2.8h, v13.8h\n"
+ "sqadd v10.8h, v10.8h, v13.8h\n"
+ "smax v9.8h, v9.8h, v29.8h\n"
+ "smax v7.8h, v7.8h, v29.8h\n"
+ "smax v2.8h, v2.8h, v29.8h\n"
+ "smax v10.8h, v10.8h, v29.8h\n"
+ "smin v9.8h, v9.8h, v12.8h\n"
+ "smin v7.8h, v7.8h, v12.8h\n"
+ "smin v2.8h, v2.8h, v12.8h\n"
+ "smin v10.8h, v10.8h, v12.8h\n"
+ "uzp1 v9.16b, v9.16b, v9.16b\n"
+ "uzp1 v7.16b, v7.16b, v7.16b\n"
+ "uzp1 v2.16b, v2.16b, v2.16b\n"
+ "uzp1 v10.16b, v10.16b, v10.16b\n"
+ "tbz x7, #2, 61f\n"
+ "st1 { v9.s }[0], [x11], #0x4\n"
+ "st1 { v7.s }[0], [x10], #0x4\n"
+ "st1 { v2.s }[0], [x9], #0x4\n"
+ "st1 { v10.s }[0], [x28], #0x4\n"
+ "tbz x7, #1, 60f\n"
+ "st1 { v9.h }[2], [x11], #0x2\n"
+ "st1 { v7.h }[2], [x10], #0x2\n"
+ "st1 { v2.h }[2], [x9], #0x2\n"
+ "st1 { v10.h }[2], [x28], #0x2\n"
+ "tbz x7, #0, 63f\n"
+ "st1 { v9.b }[6], [x11], #0x1\n"
+ "st1 { v7.b }[6], [x10], #0x1\n"
+ "st1 { v2.b }[6], [x9], #0x1\n"
+ "st1 { v10.b }[6], [x28], #0x1\n"
+ "b 63f\n"
+ "60:" // Oddments: Bit 2: Bit 1: Unset
+ "tbz x7, #0, 63f\n"
+ "st1 { v9.b }[4], [x11], #0x1\n"
+ "st1 { v7.b }[4], [x10], #0x1\n"
+ "st1 { v2.b }[4], [x9], #0x1\n"
+ "st1 { v10.b }[4], [x28], #0x1\n"
+ "b 63f\n"
+ "61:" // Oddments: Bit 2: Unset
+ "tbz x7, #1, 62f\n"
+ "st1 { v9.h }[0], [x11], #0x2\n"
+ "st1 { v7.h }[0], [x10], #0x2\n"
+ "st1 { v2.h }[0], [x9], #0x2\n"
+ "st1 { v10.h }[0], [x28], #0x2\n"
+ "tbz x7, #0, 63f\n"
+ "st1 { v9.b }[2], [x11], #0x1\n"
+ "st1 { v7.b }[2], [x10], #0x1\n"
+ "st1 { v2.b }[2], [x9], #0x1\n"
+ "st1 { v10.b }[2], [x28], #0x1\n"
+ "b 63f\n"
+ "62:" // Oddments: Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 63f\n"
+ "st1 { v9.b }[0], [x11], #0x1\n"
+ "st1 { v7.b }[0], [x10], #0x1\n"
+ "st1 { v2.b }[0], [x9], #0x1\n"
+ "st1 { v10.b }[0], [x28], #0x1\n"
+ "63:" // Oddments: Bit 2: End
+ "64:" // End
+ :
+ : [offsetof_Params_bias] "I" (offsetof(Params, bias)), [offsetof_Params_inptrs] "I" (offsetof(Params, inptrs)), [offsetof_Params_n_channels] "I" (offsetof(Params, n_channels)), [offsetof_Params_outptrs] "I" (offsetof(Params, outptrs)), [offsetof_Params_requant] "I" (offsetof(Params, requant)), [offsetof_Params_requant_muls] "I" (offsetof(Params, requant_muls)), [offsetof_Params_requant_shifts] "I" (offsetof(Params, requant_shifts)), [offsetof_Params_weights] "I" (offsetof(Params, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [params] "r" (&params)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
new file mode 100644
index 0000000000..893260362a
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2021-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "utils.hpp"
+#include "src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+// Forward declaration of the generated AArch64 kernel: signed 8-bit quantized,
+// NHWC, 3x3 kernel, stride 2, 2x2 output tile, multiply-accumulate depthfirst.
+// Parameter meanings follow the definition in the matching generic.cpp:
+void a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(
+  const unsigned int,               // n_channels: number of channels to process
+  const int8_t *const *const,       // inptrs: array of input row/col pointers
+  const int8_t *const,              // weights: packed 3x3 kernel weights
+  const int32_t *const,             // bias: per-channel bias (may be consumed in quadwords)
+  const arm_gemm::Requantize32 &,   // qp: requantization parameters (offsets, clamp bounds)
+  const int32_t *const,             // requant_muls: per-channel requantize multipliers
+  const int32_t *const,             // requant_shifts: per-channel requantize shifts
+  int8_t *const *const              // outptrs: output tile pointers (2x2 = 4 rows)
+);
+
+
+// Strategy wrapper binding the generated s8q 3x3/s2 MLA kernel into the
+// depthwise-depthfirst dispatch framework (selected via depthwise_s8q.cpp).
+class a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy<int8_t, int8_t, int8_t, int32_t>
+{
+  using Parent = DepthwiseDepthfirstStrategy<int8_t, int8_t, int8_t, int32_t>;
+
+  public:
+  constexpr static unsigned int kernel_rows = 3;  // 3x3 convolution window
+  constexpr static unsigned int kernel_cols = 3;
+
+  constexpr static unsigned int stride_rows = 2;  // stride 2 in both dimensions
+  constexpr static unsigned int stride_cols = 2;
+
+  // Parent args: (output rows, output cols, kernel rows, kernel cols, stride rows, stride cols).
+  // CPUInfo is unused: this kernel has no CPU-feature-specific variants.
+  a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst(const CPUInfo *) : Parent(2, 2, 3, 3, 2, 2) {}
+
+  // Plain NEON kernel; no scalable-vector (SVE) variant.
+  arm_gemm::VLType get_vl_type(void) const override { return arm_gemm::VLType::None; }
+
+  Parent::KernelType kernel = a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl;
+  Parent::KernelType get_kernel(void) const override { return kernel; }
+  // Asm loop processes 8 channels per iteration (two 4-lane accumulator vectors).
+  unsigned int get_accumulator_depth_vl(void) const override { return 2; }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp
new file mode 100644
index 0000000000..d98ab71cb8
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp
@@ -0,0 +1,1397 @@
+/*
+ * Copyright (c) 2021-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_gemm.hpp"
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(
+ const unsigned int n_channels,
+ const int8_t *const *const inptrs,
+ const int8_t *const weights,
+ const int32_t *const bias,
+ const arm_gemm::Requantize32 &qp,
+ const int32_t *const requant_muls,
+ const int32_t *const requant_shifts,
+ int8_t *const *const outptrs
+)
+{
+ struct Params
+ {
+ uint64_t n_channels;
+ const void *weights;
+ const int32_t *bias;
+ const arm_gemm::Requantize32 *requant;
+ const int32_t *const requant_muls;
+ const int32_t *const requant_shifts;
+ int8_t *const *const outptrs;
+ const int8_t *inptrs[25];
+
+ Params(
+ long unsigned int n_channels,
+ const int8_t *const *inptrs_raw,
+ const void *const weights,
+ const int32_t *const bias,
+ const arm_gemm::Requantize32 &qp,
+ const int32_t *const requant_muls,
+ const int32_t *const requant_shifts,
+ int8_t *const *outptrs
+ ) : n_channels(n_channels), weights(weights), bias(bias),
+ requant(&qp), requant_muls(requant_muls),
+ requant_shifts(requant_shifts), outptrs(outptrs)
+ {
+ inptrs[0] = inptrs_raw[12];
+ inptrs[1] = inptrs_raw[0];
+ inptrs[2] = inptrs_raw[1];
+ inptrs[3] = inptrs_raw[3];
+ inptrs[4] = inptrs_raw[4];
+ inptrs[5] = inptrs_raw[5];
+ inptrs[6] = inptrs_raw[6];
+ inptrs[7] = inptrs_raw[2];
+ inptrs[8] = inptrs_raw[8];
+ inptrs[9] = inptrs_raw[9];
+ inptrs[10] = inptrs_raw[7];
+ inptrs[11] = inptrs_raw[15];
+ inptrs[12] = inptrs_raw[10];
+ inptrs[13] = inptrs_raw[16];
+ inptrs[14] = inptrs_raw[11];
+ inptrs[15] = inptrs_raw[18];
+ inptrs[16] = inptrs_raw[13];
+ inptrs[17] = inptrs_raw[19];
+ inptrs[18] = inptrs_raw[20];
+ inptrs[19] = inptrs_raw[14];
+ inptrs[20] = inptrs_raw[21];
+ inptrs[21] = inptrs_raw[17];
+ inptrs[22] = inptrs_raw[23];
+ inptrs[23] = inptrs_raw[22];
+ inptrs[24] = inptrs_raw[24];
+
+ }
+ };
+
+ const Params params(n_channels, inptrs, weights, bias, qp,
+ requant_muls, requant_shifts, outptrs);
+
+ __asm__ __volatile__(
+ "ldr x7, [%x[params], %[offsetof_Params_n_channels]]\n"
+ "ldr x23, [%x[params], %[offsetof_Params_requant]]\n"
+ "lsr x8, x7, #0x3\n"
+ "add x20, x23, %[offsetof_Requantize32_a_offset]\n"
+ "ld1r { v6.16b }, [x20]\n"
+ "ldr x22, [%x[params], %[offsetof_Params_outptrs]]\n"
+ "add x21, x23, %[offsetof_Requantize32_b_offset]\n"
+ "add x20, x23, %[offsetof_Requantize32_c_offset]\n"
+ "ld1r { v15.16b }, [x21]\n"
+ "ld1r { v13.8h }, [x20]\n"
+ "add x21, x23, %[offsetof_Requantize32_minval]\n"
+ "add x20, x23, %[offsetof_Requantize32_maxval]\n"
+ "ld1r { v17.8h }, [x21]\n"
+ "ld1r { v24.8h }, [x20]\n"
+ "mov x17, #0x0\n"
+ "mov x16, #0x0\n"
+ "add x15, %x[params], %[offsetof_Params_inptrs]\n"
+ "ldr x14, [%x[params], %[offsetof_Params_weights]]\n"
+ "ldr x13, [%x[params], %[offsetof_Params_requant_muls]]\n"
+ "ldr x12, [%x[params], %[offsetof_Params_requant_shifts]]\n"
+ "ldp x11, x10, [x22, #0x0]\n"
+ "ldp x9, x28, [x22, #0x10]\n"
+ "cbz x8, 3f\n"
+ "ldr d11, [x14, #0x0]\n"
+ "ldr d22, [x14, #0x8]\n"
+ "subs x8, x8, #0x1\n"
+ "ssubl v11.8h, v11.8b, v15.8b\n"
+ "ldr d14, [x14, #0x10]\n"
+ "ldr d28, [x14, #0x18]\n"
+ "ssubl v22.8h, v22.8b, v15.8b\n"
+ "ssubl v14.8h, v14.8b, v15.8b\n"
+ "ldr d18, [x14, #0x20]\n"
+ "ldr d9, [x14, #0x28]\n"
+ "ssubl v28.8h, v28.8b, v15.8b\n"
+ "ssubl v18.8h, v18.8b, v15.8b\n"
+ "ldr d26, [x14, #0x30]\n"
+ "ldr d7, [x14, #0x38]\n"
+ "ssubl v9.8h, v9.8b, v15.8b\n"
+ "ssubl v26.8h, v26.8b, v15.8b\n"
+ "ldr d4, [x14, #0x40]\n"
+ "ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "ssubl v7.8h, v7.8b, v15.8b\n"
+ "ssubl v4.8h, v4.8b, v15.8b\n"
+ "ldr q5, [x20, #0x0]\n"
+ "ldr q3, [x20, #0x10]\n"
+ "add x20, x20, #0x20\n"
+ "str x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldp x27, x26, [x15, #0x0]\n"
+ "ldp x25, x24, [x15, #0x10]\n"
+ "mov v21.16b, v5.16b\n"
+ "mov v8.16b, v3.16b\n"
+ "ldp x23, x22, [x15, #0x20]\n"
+ "ldp x21, x20, [x15, #0x30]\n"
+ "mov v20.16b, v5.16b\n"
+ "mov v0.16b, v3.16b\n"
+ "ldr d25, [x27, x17]\n"
+ "ldr d27, [x26, x17]\n"
+ "mov v19.16b, v5.16b\n"
+ "mov v31.16b, v3.16b\n"
+ "ldr d1, [x25, x17]\n"
+ "ldr d2, [x24, x17]\n"
+ "ssubl v25.8h, v25.8b, v6.8b\n"
+ "ssubl v27.8h, v27.8b, v6.8b\n"
+ "ldr d12, [x23, x17]\n"
+ "ldr d16, [x22, x17]\n"
+ "ssubl v1.8h, v1.8b, v6.8b\n"
+ "ssubl v2.8h, v2.8b, v6.8b\n"
+ "ldr d23, [x21, x17]\n"
+ "ldr d10, [x20, x17]\n"
+ "ssubl v12.8h, v12.8b, v6.8b\n"
+ "ssubl v16.8h, v16.8b, v6.8b\n"
+ "ssubl v23.8h, v23.8b, v6.8b\n"
+ "ssubl v10.8h, v10.8b, v6.8b\n"
+ "beq 2f\n"
+ "1:" // Loop
+ "ldr q30, [x13, #0x0]\n"
+ "ldr q29, [x12, #0x0]\n"
+ "smlal v5.4s, v25.4h, v4.4h\n"
+ "smlal2 v3.4s, v25.8h, v4.8h\n"
+ "ldr x21, [x15, #0x58]\n"
+ "ldr x20, [x15, #0x78]\n"
+ "smlal v5.4s, v27.4h, v11.4h\n"
+ "smlal v21.4s, v25.4h, v26.4h\n"
+ "ldr x25, [x15, #0x60]\n"
+ "ldr x24, [x15, #0x80]\n"
+ "smlal v20.4s, v25.4h, v14.4h\n"
+ "smlal v19.4s, v25.4h, v11.4h\n"
+ "smlal2 v3.4s, v27.8h, v11.8h\n"
+ "ldr d27, [x21, x17]\n"
+ "ssubl v27.8h, v27.8b, v6.8b\n"
+ "smlal v5.4s, v1.4h, v22.4h\n"
+ "smlal2 v8.4s, v25.8h, v26.8h\n"
+ "smlal2 v0.4s, v25.8h, v14.8h\n"
+ "ldr x23, [x15, #0x68]\n"
+ "ldr x22, [x15, #0x88]\n"
+ "smlal2 v31.4s, v25.8h, v11.8h\n"
+ "ldr d25, [x20, x17]\n"
+ "ssubl v25.8h, v25.8b, v6.8b\n"
+ "smlal v21.4s, v2.4h, v22.4h\n"
+ "smlal v20.4s, v27.4h, v28.4h\n"
+ "smlal v19.4s, v25.4h, v18.4h\n"
+ "ldr x21, [x15, #0x40]\n"
+ "ldr x20, [x15, #0x70]\n"
+ "smlal2 v3.4s, v1.8h, v22.8h\n"
+ "ldr d1, [x25, x17]\n"
+ "ssubl v1.8h, v1.8b, v6.8b\n"
+ "smlal v5.4s, v16.4h, v28.4h\n"
+ "smlal2 v8.4s, v2.8h, v22.8h\n"
+ "ldr d2, [x24, x17]\n"
+ "ssubl v2.8h, v2.8b, v6.8b\n"
+ "smlal2 v0.4s, v27.8h, v28.8h\n"
+ "ldr d27, [x23, x17]\n"
+ "smlal2 v31.4s, v25.8h, v18.8h\n"
+ "ldr d25, [x22, x17]\n"
+ "smlal v21.4s, v12.4h, v14.4h\n"
+ "ldr x25, [x15, #0x98]\n"
+ "smlal v20.4s, v1.4h, v11.4h\n"
+ "smlal v19.4s, v2.4h, v22.4h\n"
+ "ldr x24, [x15, #0x50]\n"
+ "smlal2 v3.4s, v16.8h, v28.8h\n"
+ "ldr d16, [x21, x17]\n"
+ "ssubl v27.8h, v27.8b, v6.8b\n"
+ "smlal v5.4s, v23.4h, v18.4h\n"
+ "ssubl v25.8h, v25.8b, v6.8b\n"
+ "smlal2 v8.4s, v12.8h, v14.8h\n"
+ "ldr d12, [x20, x17]\n"
+ "ldr x23, [x15, #0x48]\n"
+ "smlal2 v0.4s, v1.8h, v11.8h\n"
+ "smlal2 v31.4s, v2.8h, v22.8h\n"
+ "ldr x21, [x15, #0x90]\n"
+ "ldr x20, [x15, #0xa8]\n"
+ "smlal v21.4s, v10.4h, v11.4h\n"
+ "smlal v20.4s, v27.4h, v18.4h\n"
+ "ssubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x22, [x15, #0xa0]\n"
+ "smlal v19.4s, v25.4h, v9.4h\n"
+ "smlal2 v3.4s, v23.8h, v18.8h\n"
+ "ldr d23, [x25, x17]\n"
+ "ssubl v12.8h, v12.8b, v6.8b\n"
+ "ssubl v23.8h, v23.8b, v6.8b\n"
+ "smlal v5.4s, v10.4h, v14.4h\n"
+ "smlal2 v8.4s, v10.8h, v11.8h\n"
+ "ldr d11, [x24, x17]\n"
+ "ssubl v11.8h, v11.8b, v6.8b\n"
+ "smlal2 v0.4s, v27.8h, v18.8h\n"
+ "ldr d27, [x23, x17]\n"
+ "smlal2 v31.4s, v25.8h, v9.8h\n"
+ "ldr d25, [x21, x17]\n"
+ "ldr x21, [x15, #0xb0]\n"
+ "smlal v21.4s, v16.4h, v18.4h\n"
+ "smlal v20.4s, v12.4h, v22.4h\n"
+ "smlal v19.4s, v23.4h, v14.4h\n"
+ "smlal2 v3.4s, v10.8h, v14.8h\n"
+ "ldr d10, [x20, x17]\n"
+ "ssubl v27.8h, v27.8b, v6.8b\n"
+ "ssubl v25.8h, v25.8b, v6.8b\n"
+ "ssubl v10.8h, v10.8b, v6.8b\n"
+ "smlal v5.4s, v11.4h, v9.4h\n"
+ "ldr x20, [x15, #0xb8]\n"
+ "smlal2 v8.4s, v16.8h, v18.8h\n"
+ "ldr d18, [x22, x17]\n"
+ "ldr d16, [x21, x17]\n"
+ "smlal2 v0.4s, v12.8h, v22.8h\n"
+ "ldr d22, [x20, x17]\n"
+ "smlal2 v31.4s, v23.8h, v14.8h\n"
+ "ldr q14, [x13, #0x10]\n"
+ "smlal v21.4s, v27.4h, v9.4h\n"
+ "smlal v20.4s, v25.4h, v26.4h\n"
+ "smlal v19.4s, v10.4h, v28.4h\n"
+ "ssubl v18.8h, v18.8b, v6.8b\n"
+ "ldr x21, [x15, #0xc0]\n"
+ "smlal2 v3.4s, v11.8h, v9.8h\n"
+ "ssubl v16.8h, v16.8b, v6.8b\n"
+ "smlal v5.4s, v1.4h, v26.4h\n"
+ "ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "smlal2 v8.4s, v27.8h, v9.8h\n"
+ "ldr d27, [x21, x17]\n"
+ "smlal2 v0.4s, v25.8h, v26.8h\n"
+ "ldr q25, [x12, #0x10]\n"
+ "smlal2 v31.4s, v10.8h, v28.8h\n"
+ "smlal v21.4s, v11.4h, v28.4h\n"
+ "ssubl v22.8h, v22.8b, v6.8b\n"
+ "add x14, x14, #0x48\n"
+ "smlal v20.4s, v18.4h, v7.4h\n"
+ "smlal v19.4s, v16.4h, v7.4h\n"
+ "ssubl v27.8h, v27.8b, v6.8b\n"
+ "add x17, x17, #0x8\n"
+ "smlal2 v3.4s, v1.8h, v26.8h\n"
+ "smlal v5.4s, v12.4h, v7.4h\n"
+ "sqrdmulh v5.4s, v5.4s, v30.4s\n"
+ "subs x8, x8, #0x1\n"
+ "smlal2 v8.4s, v11.8h, v28.8h\n"
+ "smlal2 v0.4s, v18.8h, v7.8h\n"
+ "and v28.16b, v5.16b, v29.16b\n"
+ "add x13, x13, #0x20\n"
+ "smlal2 v31.4s, v16.8h, v7.8h\n"
+ "smlal v21.4s, v2.4h, v7.4h\n"
+ "sshr v28.4s, v28.4s, #0x1f\n"
+ "add x12, x12, #0x20\n"
+ "smlal v20.4s, v10.4h, v9.4h\n"
+ "smlal v19.4s, v22.4h, v26.4h\n"
+ "sqadd v5.4s, v5.4s, v28.4s\n"
+ "smlal2 v3.4s, v12.8h, v7.8h\n"
+ "smlal2 v8.4s, v2.8h, v7.8h\n"
+ "sqrdmulh v3.4s, v3.4s, v14.4s\n"
+ "smlal2 v0.4s, v10.8h, v9.8h\n"
+ "smlal2 v31.4s, v22.8h, v26.8h\n"
+ "and v16.16b, v3.16b, v25.16b\n"
+ "smlal v21.4s, v23.4h, v4.4h\n"
+ "smlal v20.4s, v22.4h, v4.4h\n"
+ "sqrdmulh v21.4s, v21.4s, v30.4s\n"
+ "smlal v19.4s, v27.4h, v4.4h\n"
+ "smlal2 v8.4s, v23.8h, v4.8h\n"
+ "sqrdmulh v20.4s, v20.4s, v30.4s\n"
+ "smlal2 v0.4s, v22.8h, v4.8h\n"
+ "smlal2 v31.4s, v27.8h, v4.8h\n"
+ "sqrdmulh v19.4s, v19.4s, v30.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "and v12.16b, v21.16b, v29.16b\n"
+ "sqrdmulh v8.4s, v8.4s, v14.4s\n"
+ "and v23.16b, v20.16b, v29.16b\n"
+ "sqrdmulh v0.4s, v0.4s, v14.4s\n"
+ "and v9.16b, v19.16b, v29.16b\n"
+ "sqrdmulh v31.4s, v31.4s, v14.4s\n"
+ "sqadd v3.4s, v3.4s, v16.4s\n"
+ "sshr v12.4s, v12.4s, #0x1f\n"
+ "and v18.16b, v8.16b, v25.16b\n"
+ "sshr v23.4s, v23.4s, #0x1f\n"
+ "and v22.16b, v0.16b, v25.16b\n"
+ "sshr v9.4s, v9.4s, #0x1f\n"
+ "and v16.16b, v31.16b, v25.16b\n"
+ "sqadd v21.4s, v21.4s, v12.4s\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sqadd v20.4s, v20.4s, v23.4s\n"
+ "sshr v22.4s, v22.4s, #0x1f\n"
+ "sqadd v19.4s, v19.4s, v9.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "srshl v5.4s, v5.4s, v29.4s\n"
+ "srshl v21.4s, v21.4s, v29.4s\n"
+ "sqadd v8.4s, v8.4s, v18.4s\n"
+ "srshl v20.4s, v20.4s, v29.4s\n"
+ "sqadd v0.4s, v0.4s, v22.4s\n"
+ "srshl v19.4s, v19.4s, v29.4s\n"
+ "sqadd v31.4s, v31.4s, v16.4s\n"
+ "srshl v3.4s, v3.4s, v25.4s\n"
+ "sqxtn v5.4h, v5.4s\n"
+ "srshl v8.4s, v8.4s, v25.4s\n"
+ "sqxtn v21.4h, v21.4s\n"
+ "srshl v0.4s, v0.4s, v25.4s\n"
+ "sqxtn v20.4h, v20.4s\n"
+ "srshl v31.4s, v31.4s, v25.4s\n"
+ "sqxtn v19.4h, v19.4s\n"
+ "sqxtn2 v5.8h, v3.4s\n"
+ "sqxtn2 v21.8h, v8.4s\n"
+ "sqxtn2 v20.8h, v0.4s\n"
+ "sqxtn2 v19.8h, v31.4s\n"
+ "sqadd v5.8h, v5.8h, v13.8h\n"
+ "sqadd v21.8h, v21.8h, v13.8h\n"
+ "sqadd v20.8h, v20.8h, v13.8h\n"
+ "sqadd v19.8h, v19.8h, v13.8h\n"
+ "smax v5.8h, v5.8h, v17.8h\n"
+ "smax v21.8h, v21.8h, v17.8h\n"
+ "smax v20.8h, v20.8h, v17.8h\n"
+ "smax v19.8h, v19.8h, v17.8h\n"
+ "smin v5.8h, v5.8h, v24.8h\n"
+ "smin v21.8h, v21.8h, v24.8h\n"
+ "smin v20.8h, v20.8h, v24.8h\n"
+ "smin v19.8h, v19.8h, v24.8h\n"
+ "uzp1 v5.16b, v5.16b, v5.16b\n"
+ "str d5, [x11, x16]\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "str d21, [x10, x16]\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "str d20, [x9, x16]\n"
+ "str d19, [x28, x16]\n"
+ "ldr q5, [x20, #0x0]\n"
+ "ldr q3, [x20, #0x10]\n"
+ "add x20, x20, #0x20\n"
+ "ldr d11, [x14, #0x0]\n"
+ "ldr d22, [x14, #0x8]\n"
+ "add x16, x16, #0x8\n"
+ "str x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldr d14, [x14, #0x10]\n"
+ "ldr d28, [x14, #0x18]\n"
+ "mov v21.16b, v5.16b\n"
+ "mov v8.16b, v3.16b\n"
+ "ldr d18, [x14, #0x20]\n"
+ "ldr d9, [x14, #0x28]\n"
+ "mov v20.16b, v5.16b\n"
+ "mov v0.16b, v3.16b\n"
+ "ldr d26, [x14, #0x30]\n"
+ "ldr d7, [x14, #0x38]\n"
+ "mov v19.16b, v5.16b\n"
+ "mov v31.16b, v3.16b\n"
+ "ldr d4, [x14, #0x40]\n"
+ "ldp x27, x26, [x15, #0x0]\n"
+ "ssubl v11.8h, v11.8b, v15.8b\n"
+ "ssubl v22.8h, v22.8b, v15.8b\n"
+ "ldp x25, x24, [x15, #0x10]\n"
+ "ldp x23, x22, [x15, #0x20]\n"
+ "ssubl v14.8h, v14.8b, v15.8b\n"
+ "ssubl v28.8h, v28.8b, v15.8b\n"
+ "ldp x21, x20, [x15, #0x30]\n"
+ "ldr d25, [x27, x17]\n"
+ "ssubl v18.8h, v18.8b, v15.8b\n"
+ "ssubl v9.8h, v9.8b, v15.8b\n"
+ "ldr d27, [x26, x17]\n"
+ "ldr d1, [x25, x17]\n"
+ "ssubl v26.8h, v26.8b, v15.8b\n"
+ "ssubl v7.8h, v7.8b, v15.8b\n"
+ "ldr d2, [x24, x17]\n"
+ "ldr d12, [x23, x17]\n"
+ "ssubl v4.8h, v4.8b, v15.8b\n"
+ "ssubl v25.8h, v25.8b, v6.8b\n"
+ "ldr d16, [x22, x17]\n"
+ "ldr d23, [x21, x17]\n"
+ "ssubl v27.8h, v27.8b, v6.8b\n"
+ "ssubl v1.8h, v1.8b, v6.8b\n"
+ "ldr d10, [x20, x17]\n"
+ "ssubl v2.8h, v2.8b, v6.8b\n"
+ "ssubl v12.8h, v12.8b, v6.8b\n"
+ "ssubl v16.8h, v16.8b, v6.8b\n"
+ "ssubl v23.8h, v23.8b, v6.8b\n"
+ "ssubl v10.8h, v10.8b, v6.8b\n"
+ "bgt 1b\n"
+ "2:" // Tail
+ "ldr q29, [x13, #0x0]\n"
+ "ldr q30, [x12, #0x0]\n"
+ "smlal v5.4s, v25.4h, v4.4h\n"
+ "smlal2 v3.4s, v25.8h, v4.8h\n"
+ "ldr x21, [x15, #0x58]\n"
+ "ldr x20, [x15, #0x78]\n"
+ "smlal v5.4s, v27.4h, v11.4h\n"
+ "smlal v21.4s, v25.4h, v26.4h\n"
+ "ldr x25, [x15, #0x60]\n"
+ "ldr x24, [x15, #0x80]\n"
+ "smlal v20.4s, v25.4h, v14.4h\n"
+ "smlal v19.4s, v25.4h, v11.4h\n"
+ "smlal2 v3.4s, v27.8h, v11.8h\n"
+ "ldr d27, [x21, x17]\n"
+ "ssubl v27.8h, v27.8b, v6.8b\n"
+ "smlal v5.4s, v1.4h, v22.4h\n"
+ "smlal2 v8.4s, v25.8h, v26.8h\n"
+ "smlal2 v0.4s, v25.8h, v14.8h\n"
+ "ldr x23, [x15, #0x68]\n"
+ "ldr x22, [x15, #0x88]\n"
+ "smlal2 v31.4s, v25.8h, v11.8h\n"
+ "ldr d25, [x20, x17]\n"
+ "ssubl v25.8h, v25.8b, v6.8b\n"
+ "smlal v21.4s, v2.4h, v22.4h\n"
+ "smlal v20.4s, v27.4h, v28.4h\n"
+ "smlal v19.4s, v25.4h, v18.4h\n"
+ "ldr x21, [x15, #0x40]\n"
+ "ldr x20, [x15, #0x70]\n"
+ "smlal2 v3.4s, v1.8h, v22.8h\n"
+ "ldr d1, [x25, x17]\n"
+ "ssubl v1.8h, v1.8b, v6.8b\n"
+ "smlal v5.4s, v16.4h, v28.4h\n"
+ "smlal2 v8.4s, v2.8h, v22.8h\n"
+ "ldr d2, [x24, x17]\n"
+ "ssubl v2.8h, v2.8b, v6.8b\n"
+ "smlal2 v0.4s, v27.8h, v28.8h\n"
+ "ldr d27, [x23, x17]\n"
+ "smlal2 v31.4s, v25.8h, v18.8h\n"
+ "ldr d25, [x22, x17]\n"
+ "smlal v21.4s, v12.4h, v14.4h\n"
+ "ldr x25, [x15, #0x98]\n"
+ "smlal v20.4s, v1.4h, v11.4h\n"
+ "smlal v19.4s, v2.4h, v22.4h\n"
+ "ldr x24, [x15, #0x50]\n"
+ "smlal2 v3.4s, v16.8h, v28.8h\n"
+ "ldr d16, [x21, x17]\n"
+ "ssubl v27.8h, v27.8b, v6.8b\n"
+ "smlal v5.4s, v23.4h, v18.4h\n"
+ "ssubl v25.8h, v25.8b, v6.8b\n"
+ "smlal2 v8.4s, v12.8h, v14.8h\n"
+ "ldr d12, [x20, x17]\n"
+ "ldr x23, [x15, #0x48]\n"
+ "smlal2 v0.4s, v1.8h, v11.8h\n"
+ "smlal2 v31.4s, v2.8h, v22.8h\n"
+ "ldr x21, [x15, #0x90]\n"
+ "ldr x20, [x15, #0xa8]\n"
+ "smlal v21.4s, v10.4h, v11.4h\n"
+ "smlal v20.4s, v27.4h, v18.4h\n"
+ "ssubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x22, [x15, #0xa0]\n"
+ "smlal v19.4s, v25.4h, v9.4h\n"
+ "smlal2 v3.4s, v23.8h, v18.8h\n"
+ "ldr d23, [x25, x17]\n"
+ "ssubl v12.8h, v12.8b, v6.8b\n"
+ "ssubl v23.8h, v23.8b, v6.8b\n"
+ "smlal v5.4s, v10.4h, v14.4h\n"
+ "smlal2 v8.4s, v10.8h, v11.8h\n"
+ "ldr d11, [x24, x17]\n"
+ "ssubl v11.8h, v11.8b, v6.8b\n"
+ "smlal2 v0.4s, v27.8h, v18.8h\n"
+ "ldr d27, [x23, x17]\n"
+ "smlal2 v31.4s, v25.8h, v9.8h\n"
+ "ldr d25, [x21, x17]\n"
+ "ldr x21, [x15, #0xb0]\n"
+ "smlal v21.4s, v16.4h, v18.4h\n"
+ "smlal v20.4s, v12.4h, v22.4h\n"
+ "smlal v19.4s, v23.4h, v14.4h\n"
+ "smlal2 v3.4s, v10.8h, v14.8h\n"
+ "ldr d10, [x20, x17]\n"
+ "ssubl v27.8h, v27.8b, v6.8b\n"
+ "ssubl v25.8h, v25.8b, v6.8b\n"
+ "ssubl v10.8h, v10.8b, v6.8b\n"
+ "smlal v5.4s, v11.4h, v9.4h\n"
+ "ldr x20, [x15, #0xb8]\n"
+ "smlal2 v8.4s, v16.8h, v18.8h\n"
+ "ldr d16, [x22, x17]\n"
+ "ldr d18, [x21, x17]\n"
+ "smlal2 v0.4s, v12.8h, v22.8h\n"
+ "ldr d22, [x20, x17]\n"
+ "smlal2 v31.4s, v23.8h, v14.8h\n"
+ "ldr q14, [x13, #0x10]\n"
+ "smlal v21.4s, v27.4h, v9.4h\n"
+ "smlal v20.4s, v25.4h, v26.4h\n"
+ "smlal v19.4s, v10.4h, v28.4h\n"
+ "ssubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x20, [x15, #0xc0]\n"
+ "smlal2 v3.4s, v11.8h, v9.8h\n"
+ "ssubl v18.8h, v18.8b, v6.8b\n"
+ "smlal v5.4s, v1.4h, v26.4h\n"
+ "tst x7, #0x7\n"
+ "smlal2 v8.4s, v27.8h, v9.8h\n"
+ "ldr d27, [x20, x17]\n"
+ "smlal2 v0.4s, v25.8h, v26.8h\n"
+ "ldr q25, [x12, #0x10]\n"
+ "smlal2 v31.4s, v10.8h, v28.8h\n"
+ "smlal v21.4s, v11.4h, v28.4h\n"
+ "ssubl v22.8h, v22.8b, v6.8b\n"
+ "add x17, x17, #0x8\n"
+ "smlal v20.4s, v16.4h, v7.4h\n"
+ "smlal v19.4s, v18.4h, v7.4h\n"
+ "ssubl v27.8h, v27.8b, v6.8b\n"
+ "add x13, x13, #0x20\n"
+ "smlal2 v3.4s, v1.8h, v26.8h\n"
+ "smlal v5.4s, v12.4h, v7.4h\n"
+ "sqrdmulh v5.4s, v5.4s, v29.4s\n"
+ "add x12, x12, #0x20\n"
+ "smlal2 v8.4s, v11.8h, v28.8h\n"
+ "smlal2 v0.4s, v16.8h, v7.8h\n"
+ "and v16.16b, v5.16b, v30.16b\n"
+ "smlal2 v31.4s, v18.8h, v7.8h\n"
+ "smlal v21.4s, v2.4h, v7.4h\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "smlal v20.4s, v10.4h, v9.4h\n"
+ "smlal v19.4s, v22.4h, v26.4h\n"
+ "sqadd v5.4s, v5.4s, v16.4s\n"
+ "smlal2 v3.4s, v12.8h, v7.8h\n"
+ "smlal2 v8.4s, v2.8h, v7.8h\n"
+ "sqrdmulh v3.4s, v3.4s, v14.4s\n"
+ "smlal2 v0.4s, v10.8h, v9.8h\n"
+ "smlal2 v31.4s, v22.8h, v26.8h\n"
+ "and v16.16b, v3.16b, v25.16b\n"
+ "smlal v21.4s, v23.4h, v4.4h\n"
+ "smlal v20.4s, v22.4h, v4.4h\n"
+ "sqrdmulh v21.4s, v21.4s, v29.4s\n"
+ "smlal v19.4s, v27.4h, v4.4h\n"
+ "smlal2 v8.4s, v23.8h, v4.8h\n"
+ "sqrdmulh v20.4s, v20.4s, v29.4s\n"
+ "smlal2 v0.4s, v22.8h, v4.8h\n"
+ "smlal2 v31.4s, v27.8h, v4.8h\n"
+ "sqrdmulh v19.4s, v19.4s, v29.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "and v23.16b, v21.16b, v30.16b\n"
+ "sqrdmulh v8.4s, v8.4s, v14.4s\n"
+ "and v27.16b, v20.16b, v30.16b\n"
+ "sqrdmulh v0.4s, v0.4s, v14.4s\n"
+ "and v22.16b, v19.16b, v30.16b\n"
+ "sqrdmulh v31.4s, v31.4s, v14.4s\n"
+ "sqadd v3.4s, v3.4s, v16.4s\n"
+ "sshr v23.4s, v23.4s, #0x1f\n"
+ "and v14.16b, v8.16b, v25.16b\n"
+ "sshr v27.4s, v27.4s, #0x1f\n"
+ "and v18.16b, v0.16b, v25.16b\n"
+ "sshr v22.4s, v22.4s, #0x1f\n"
+ "and v16.16b, v31.16b, v25.16b\n"
+ "sqadd v21.4s, v21.4s, v23.4s\n"
+ "sshr v14.4s, v14.4s, #0x1f\n"
+ "sqadd v20.4s, v20.4s, v27.4s\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sqadd v19.4s, v19.4s, v22.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "srshl v5.4s, v5.4s, v30.4s\n"
+ "srshl v21.4s, v21.4s, v30.4s\n"
+ "sqadd v8.4s, v8.4s, v14.4s\n"
+ "srshl v20.4s, v20.4s, v30.4s\n"
+ "sqadd v0.4s, v0.4s, v18.4s\n"
+ "srshl v19.4s, v19.4s, v30.4s\n"
+ "sqadd v31.4s, v31.4s, v16.4s\n"
+ "srshl v3.4s, v3.4s, v25.4s\n"
+ "sqxtn v5.4h, v5.4s\n"
+ "srshl v8.4s, v8.4s, v25.4s\n"
+ "sqxtn v21.4h, v21.4s\n"
+ "srshl v0.4s, v0.4s, v25.4s\n"
+ "sqxtn v20.4h, v20.4s\n"
+ "srshl v31.4s, v31.4s, v25.4s\n"
+ "sqxtn v19.4h, v19.4s\n"
+ "sqxtn2 v5.8h, v3.4s\n"
+ "sqxtn2 v21.8h, v8.4s\n"
+ "sqxtn2 v20.8h, v0.4s\n"
+ "sqxtn2 v19.8h, v31.4s\n"
+ "sqadd v5.8h, v5.8h, v13.8h\n"
+ "sqadd v21.8h, v21.8h, v13.8h\n"
+ "sqadd v20.8h, v20.8h, v13.8h\n"
+ "sqadd v19.8h, v19.8h, v13.8h\n"
+ "smax v5.8h, v5.8h, v17.8h\n"
+ "smax v21.8h, v21.8h, v17.8h\n"
+ "smax v20.8h, v20.8h, v17.8h\n"
+ "smax v19.8h, v19.8h, v17.8h\n"
+ "smin v5.8h, v5.8h, v24.8h\n"
+ "smin v21.8h, v21.8h, v24.8h\n"
+ "smin v20.8h, v20.8h, v24.8h\n"
+ "smin v19.8h, v19.8h, v24.8h\n"
+ "uzp1 v5.16b, v5.16b, v5.16b\n"
+ "str d5, [x11, x16]\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "str d21, [x10, x16]\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "str d20, [x9, x16]\n"
+ "str d19, [x28, x16]\n"
+ "add x16, x16, #0x8\n"
+ "beq 88f\n"
+ "add x14, x14, #0x48\n"
+ "3:" // Oddments
+ "ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "tbz x7, #2, 5f\n"
+ "ld1 { v5.4s }, [x20], #0x10\n"
+ "tbz x7, #1, 4f\n"
+ "ld1 { v3.d }[0], [x20], #0x8\n"
+ "tbz x7, #0, 7f\n"
+ "ld1 { v3.s }[2], [x20]\n"
+ "b 7f\n"
+ "4:" // Oddments: Load bias: Bit 2: Bit 1: Unset
+ "tbz x7, #0, 7f\n"
+ "ld1 { v3.s }[0], [x20]\n"
+ "b 7f\n"
+ "5:" // Oddments: Load bias: Bit 2: Unset
+ "tbz x7, #1, 6f\n"
+ "ld1 { v5.d }[0], [x20], #0x8\n"
+ "tbz x7, #0, 7f\n"
+ "ld1 { v5.s }[2], [x20]\n"
+ "b 7f\n"
+ "6:" // Oddments: Load bias: Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 7f\n"
+ "ld1 { v5.s }[0], [x20]\n"
+ "7:" // Oddments: Load bias: Bit 2: End
+ "ldr d11, [x14, #0x0]\n"
+ "ldr d22, [x14, #0x8]\n"
+ "mov v21.16b, v5.16b\n"
+ "mov v8.16b, v3.16b\n"
+ "ldr d14, [x14, #0x10]\n"
+ "ldr d28, [x14, #0x18]\n"
+ "mov v20.16b, v5.16b\n"
+ "mov v0.16b, v3.16b\n"
+ "ldr d18, [x14, #0x20]\n"
+ "ldr d9, [x14, #0x28]\n"
+ "mov v19.16b, v5.16b\n"
+ "mov v31.16b, v3.16b\n"
+ "ldr d26, [x14, #0x30]\n"
+ "ldr d7, [x14, #0x38]\n"
+ "ssubl v11.8h, v11.8b, v15.8b\n"
+ "ssubl v22.8h, v22.8b, v15.8b\n"
+ "ldr d4, [x14, #0x40]\n"
+ "ldp x27, x26, [x15, #0x0]\n"
+ "ssubl v14.8h, v14.8b, v15.8b\n"
+ "ssubl v28.8h, v28.8b, v15.8b\n"
+ "ldp x25, x24, [x15, #0x10]\n"
+ "ldp x23, x22, [x15, #0x20]\n"
+ "ssubl v18.8h, v18.8b, v15.8b\n"
+ "ssubl v9.8h, v9.8b, v15.8b\n"
+ "ldp x21, x20, [x15, #0x30]\n"
+ "ssubl v26.8h, v26.8b, v15.8b\n"
+ "ssubl v7.8h, v7.8b, v15.8b\n"
+ "ssubl v4.8h, v4.8b, v15.8b\n"
+ "add x27, x27, x17\n"
+ "add x26, x26, x17\n"
+ "add x25, x25, x17\n"
+ "add x24, x24, x17\n"
+ "add x23, x23, x17\n"
+ "add x22, x22, x17\n"
+ "add x21, x21, x17\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 9f\n"
+ "ld1 { v25.s }[0], [x27], #0x4\n"
+ "ld1 { v27.s }[0], [x26], #0x4\n"
+ "ld1 { v1.s }[0], [x25], #0x4\n"
+ "ld1 { v2.s }[0], [x24], #0x4\n"
+ "ld1 { v12.s }[0], [x23], #0x4\n"
+ "ld1 { v16.s }[0], [x22], #0x4\n"
+ "ld1 { v23.s }[0], [x21], #0x4\n"
+ "ld1 { v10.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 8f\n"
+ "ld1 { v25.h }[2], [x27], #0x2\n"
+ "ld1 { v27.h }[2], [x26], #0x2\n"
+ "ld1 { v1.h }[2], [x25], #0x2\n"
+ "ld1 { v2.h }[2], [x24], #0x2\n"
+ "ld1 { v12.h }[2], [x23], #0x2\n"
+ "ld1 { v16.h }[2], [x22], #0x2\n"
+ "ld1 { v23.h }[2], [x21], #0x2\n"
+ "ld1 { v10.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 11f\n"
+ "ld1 { v25.b }[6], [x27]\n"
+ "ld1 { v27.b }[6], [x26]\n"
+ "ld1 { v1.b }[6], [x25]\n"
+ "ld1 { v2.b }[6], [x24]\n"
+ "ld1 { v12.b }[6], [x23]\n"
+ "ld1 { v16.b }[6], [x22]\n"
+ "ld1 { v23.b }[6], [x21]\n"
+ "ld1 { v10.b }[6], [x20]\n"
+ "b 11f\n"
+ "8:" // Oddments: Initial loads: Bit 2: Bit 1: Unset
+ "tbz x7, #0, 11f\n"
+ "ld1 { v25.b }[4], [x27]\n"
+ "ld1 { v27.b }[4], [x26]\n"
+ "ld1 { v1.b }[4], [x25]\n"
+ "ld1 { v2.b }[4], [x24]\n"
+ "ld1 { v12.b }[4], [x23]\n"
+ "ld1 { v16.b }[4], [x22]\n"
+ "ld1 { v23.b }[4], [x21]\n"
+ "ld1 { v10.b }[4], [x20]\n"
+ "b 11f\n"
+ "9:" // Oddments: Initial loads: Bit 2: Unset
+ "tbz x7, #1, 10f\n"
+ "ld1 { v25.h }[0], [x27], #0x2\n"
+ "ld1 { v27.h }[0], [x26], #0x2\n"
+ "ld1 { v1.h }[0], [x25], #0x2\n"
+ "ld1 { v2.h }[0], [x24], #0x2\n"
+ "ld1 { v12.h }[0], [x23], #0x2\n"
+ "ld1 { v16.h }[0], [x22], #0x2\n"
+ "ld1 { v23.h }[0], [x21], #0x2\n"
+ "ld1 { v10.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 11f\n"
+ "ld1 { v25.b }[2], [x27]\n"
+ "ld1 { v27.b }[2], [x26]\n"
+ "ld1 { v1.b }[2], [x25]\n"
+ "ld1 { v2.b }[2], [x24]\n"
+ "ld1 { v12.b }[2], [x23]\n"
+ "ld1 { v16.b }[2], [x22]\n"
+ "ld1 { v23.b }[2], [x21]\n"
+ "ld1 { v10.b }[2], [x20]\n"
+ "b 11f\n"
+ "10:" // Oddments: Initial loads: Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 11f\n"
+ "ld1 { v25.b }[0], [x27]\n"
+ "ld1 { v27.b }[0], [x26]\n"
+ "ld1 { v1.b }[0], [x25]\n"
+ "ld1 { v2.b }[0], [x24]\n"
+ "ld1 { v12.b }[0], [x23]\n"
+ "ld1 { v16.b }[0], [x22]\n"
+ "ld1 { v23.b }[0], [x21]\n"
+ "ld1 { v10.b }[0], [x20]\n"
+ "11:" // Oddments: Initial loads: Bit 2: End
+ "ssubl v25.8h, v25.8b, v6.8b\n"
+ "smlal v5.4s, v25.4h, v4.4h\n"
+ "smlal2 v3.4s, v25.8h, v4.8h\n"
+ "ldr x20, [x15, #0x40]\n"
+ "ssubl v27.8h, v27.8b, v6.8b\n"
+ "smlal v5.4s, v27.4h, v11.4h\n"
+ "smlal2 v3.4s, v27.8h, v11.8h\n"
+ "ssubl v1.8h, v1.8b, v6.8b\n"
+ "smlal v21.4s, v25.4h, v26.4h\n"
+ "smlal2 v8.4s, v25.8h, v26.8h\n"
+ "add x20, x20, x17\n"
+ "smlal v5.4s, v1.4h, v22.4h\n"
+ "smlal2 v3.4s, v1.8h, v22.8h\n"
+ "ssubl v2.8h, v2.8b, v6.8b\n"
+ "ssubl v16.8h, v16.8b, v6.8b\n"
+ "smlal v21.4s, v2.4h, v22.4h\n"
+ "smlal2 v8.4s, v2.8h, v22.8h\n"
+ "smlal v5.4s, v16.4h, v28.4h\n"
+ "smlal2 v3.4s, v16.8h, v28.8h\n"
+ "ssubl v12.8h, v12.8b, v6.8b\n"
+ "ssubl v23.8h, v23.8b, v6.8b\n"
+ "smlal v21.4s, v12.4h, v14.4h\n"
+ "smlal2 v8.4s, v12.8h, v14.8h\n"
+ "smlal v5.4s, v23.4h, v18.4h\n"
+ "smlal2 v3.4s, v23.8h, v18.8h\n"
+ "ssubl v10.8h, v10.8b, v6.8b\n"
+ "smlal v20.4s, v25.4h, v14.4h\n"
+ "smlal2 v0.4s, v25.8h, v14.8h\n"
+ "smlal v19.4s, v25.4h, v11.4h\n"
+ "smlal2 v31.4s, v25.8h, v11.8h\n"
+ "smlal v5.4s, v10.4h, v14.4h\n"
+ "smlal2 v3.4s, v10.8h, v14.8h\n"
+ "smlal v21.4s, v10.4h, v11.4h\n"
+ "smlal2 v8.4s, v10.8h, v11.8h\n"
+ "tbz x7, #2, 13f\n"
+ "ld1 { v15.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 12f\n"
+ "ld1 { v15.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 15f\n"
+ "ld1 { v15.b }[6], [x20]\n"
+ "b 15f\n"
+ "12:" // Oddments: Load (1, 3): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 15f\n"
+ "ld1 { v15.b }[4], [x20]\n"
+ "b 15f\n"
+ "13:" // Oddments: Load (1, 3): Bit 2: Unset
+ "tbz x7, #1, 14f\n"
+ "ld1 { v15.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 15f\n"
+ "ld1 { v15.b }[2], [x20]\n"
+ "b 15f\n"
+ "14:" // Oddments: Load (1, 3): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 15f\n"
+ "ld1 { v15.b }[0], [x20]\n"
+ "15:" // Oddments: Load (1, 3): Bit 2: End
+ "ssubl v15.8h, v15.8b, v6.8b\n"
+ "ldr x20, [x15, #0x48]\n"
+ "smlal v21.4s, v15.4h, v18.4h\n"
+ "smlal2 v8.4s, v15.8h, v18.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 17f\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 16f\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 19f\n"
+ "ld1 { v16.b }[6], [x20]\n"
+ "b 19f\n"
+ "16:" // Oddments: Load (1, 4): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 19f\n"
+ "ld1 { v16.b }[4], [x20]\n"
+ "b 19f\n"
+ "17:" // Oddments: Load (1, 4): Bit 2: Unset
+ "tbz x7, #1, 18f\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 19f\n"
+ "ld1 { v16.b }[2], [x20]\n"
+ "b 19f\n"
+ "18:" // Oddments: Load (1, 4): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 19f\n"
+ "ld1 { v16.b }[0], [x20]\n"
+ "19:" // Oddments: Load (1, 4): Bit 2: End
+ "ssubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x20, [x15, #0x50]\n"
+ "smlal v21.4s, v16.4h, v9.4h\n"
+ "smlal2 v8.4s, v16.8h, v9.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 21f\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 20f\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 23f\n"
+ "ld1 { v16.b }[6], [x20]\n"
+ "b 23f\n"
+ "20:" // Oddments: Load (1, 2): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 23f\n"
+ "ld1 { v16.b }[4], [x20]\n"
+ "b 23f\n"
+ "21:" // Oddments: Load (1, 2): Bit 2: Unset
+ "tbz x7, #1, 22f\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 23f\n"
+ "ld1 { v16.b }[2], [x20]\n"
+ "b 23f\n"
+ "22:" // Oddments: Load (1, 2): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 23f\n"
+ "ld1 { v16.b }[0], [x20]\n"
+ "23:" // Oddments: Load (1, 2): Bit 2: End
+ "ssubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x20, [x15, #0x58]\n"
+ "smlal v5.4s, v16.4h, v9.4h\n"
+ "smlal2 v3.4s, v16.8h, v9.8h\n"
+ "smlal v21.4s, v16.4h, v28.4h\n"
+ "smlal2 v8.4s, v16.8h, v28.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 25f\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 24f\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 27f\n"
+ "ld1 { v16.b }[6], [x20]\n"
+ "b 27f\n"
+ "24:" // Oddments: Load (3, 0): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 27f\n"
+ "ld1 { v16.b }[4], [x20]\n"
+ "b 27f\n"
+ "25:" // Oddments: Load (3, 0): Bit 2: Unset
+ "tbz x7, #1, 26f\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 27f\n"
+ "ld1 { v16.b }[2], [x20]\n"
+ "b 27f\n"
+ "26:" // Oddments: Load (3, 0): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 27f\n"
+ "ld1 { v16.b }[0], [x20]\n"
+ "27:" // Oddments: Load (3, 0): Bit 2: End
+ "ssubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x20, [x15, #0x60]\n"
+ "smlal v20.4s, v16.4h, v28.4h\n"
+ "smlal2 v0.4s, v16.8h, v28.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 29f\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 28f\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 31f\n"
+ "ld1 { v16.b }[6], [x20]\n"
+ "b 31f\n"
+ "28:" // Oddments: Load (2, 0): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 31f\n"
+ "ld1 { v16.b }[4], [x20]\n"
+ "b 31f\n"
+ "29:" // Oddments: Load (2, 0): Bit 2: Unset
+ "tbz x7, #1, 30f\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 31f\n"
+ "ld1 { v16.b }[2], [x20]\n"
+ "b 31f\n"
+ "30:" // Oddments: Load (2, 0): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 31f\n"
+ "ld1 { v16.b }[0], [x20]\n"
+ "31:" // Oddments: Load (2, 0): Bit 2: End
+ "ssubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x20, [x15, #0x68]\n"
+ "smlal v5.4s, v16.4h, v26.4h\n"
+ "smlal2 v3.4s, v16.8h, v26.8h\n"
+ "smlal v20.4s, v16.4h, v11.4h\n"
+ "smlal2 v0.4s, v16.8h, v11.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 33f\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 32f\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 35f\n"
+ "ld1 { v16.b }[6], [x20]\n"
+ "b 35f\n"
+ "32:" // Oddments: Load (3, 1): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 35f\n"
+ "ld1 { v16.b }[4], [x20]\n"
+ "b 35f\n"
+ "33:" // Oddments: Load (3, 1): Bit 2: Unset
+ "tbz x7, #1, 34f\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 35f\n"
+ "ld1 { v16.b }[2], [x20]\n"
+ "b 35f\n"
+ "34:" // Oddments: Load (3, 1): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 35f\n"
+ "ld1 { v16.b }[0], [x20]\n"
+ "35:" // Oddments: Load (3, 1): Bit 2: End
+ "ssubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x20, [x15, #0x70]\n"
+ "smlal v20.4s, v16.4h, v18.4h\n"
+ "smlal2 v0.4s, v16.8h, v18.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 37f\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 36f\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 39f\n"
+ "ld1 { v16.b }[6], [x20]\n"
+ "b 39f\n"
+ "36:" // Oddments: Load (2, 1): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 39f\n"
+ "ld1 { v16.b }[4], [x20]\n"
+ "b 39f\n"
+ "37:" // Oddments: Load (2, 1): Bit 2: Unset
+ "tbz x7, #1, 38f\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 39f\n"
+ "ld1 { v16.b }[2], [x20]\n"
+ "b 39f\n"
+ "38:" // Oddments: Load (2, 1): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 39f\n"
+ "ld1 { v16.b }[0], [x20]\n"
+ "39:" // Oddments: Load (2, 1): Bit 2: End
+ "ssubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x20, [x15, #0x78]\n"
+ "smlal v5.4s, v16.4h, v7.4h\n"
+ "smlal2 v3.4s, v16.8h, v7.8h\n"
+ "smlal v20.4s, v16.4h, v22.4h\n"
+ "smlal2 v0.4s, v16.8h, v22.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 41f\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 40f\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 43f\n"
+ "ld1 { v16.b }[6], [x20]\n"
+ "b 43f\n"
+ "40:" // Oddments: Load (3, 3): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 43f\n"
+ "ld1 { v16.b }[4], [x20]\n"
+ "b 43f\n"
+ "41:" // Oddments: Load (3, 3): Bit 2: Unset
+ "tbz x7, #1, 42f\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 43f\n"
+ "ld1 { v16.b }[2], [x20]\n"
+ "b 43f\n"
+ "42:" // Oddments: Load (3, 3): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 43f\n"
+ "ld1 { v16.b }[0], [x20]\n"
+ "43:" // Oddments: Load (3, 3): Bit 2: End
+ "ssubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x20, [x15, #0x80]\n"
+ "smlal v19.4s, v16.4h, v18.4h\n"
+ "smlal2 v31.4s, v16.8h, v18.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 45f\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 44f\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 47f\n"
+ "ld1 { v16.b }[6], [x20]\n"
+ "b 47f\n"
+ "44:" // Oddments: Load (2, 3): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 47f\n"
+ "ld1 { v16.b }[4], [x20]\n"
+ "b 47f\n"
+ "45:" // Oddments: Load (2, 3): Bit 2: Unset
+ "tbz x7, #1, 46f\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 47f\n"
+ "ld1 { v16.b }[2], [x20]\n"
+ "b 47f\n"
+ "46:" // Oddments: Load (2, 3): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 47f\n"
+ "ld1 { v16.b }[0], [x20]\n"
+ "47:" // Oddments: Load (2, 3): Bit 2: End
+ "ssubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x20, [x15, #0x88]\n"
+ "smlal v21.4s, v16.4h, v7.4h\n"
+ "smlal2 v8.4s, v16.8h, v7.8h\n"
+ "smlal v19.4s, v16.4h, v22.4h\n"
+ "smlal2 v31.4s, v16.8h, v22.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 49f\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 48f\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 51f\n"
+ "ld1 { v16.b }[6], [x20]\n"
+ "b 51f\n"
+ "48:" // Oddments: Load (3, 4): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 51f\n"
+ "ld1 { v16.b }[4], [x20]\n"
+ "b 51f\n"
+ "49:" // Oddments: Load (3, 4): Bit 2: Unset
+ "tbz x7, #1, 50f\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 51f\n"
+ "ld1 { v16.b }[2], [x20]\n"
+ "b 51f\n"
+ "50:" // Oddments: Load (3, 4): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 51f\n"
+ "ld1 { v16.b }[0], [x20]\n"
+ "51:" // Oddments: Load (3, 4): Bit 2: End
+ "ssubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x20, [x15, #0x90]\n"
+ "smlal v19.4s, v16.4h, v9.4h\n"
+ "smlal2 v31.4s, v16.8h, v9.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 53f\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 52f\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 55f\n"
+ "ld1 { v16.b }[6], [x20]\n"
+ "b 55f\n"
+ "52:" // Oddments: Load (4, 0): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 55f\n"
+ "ld1 { v16.b }[4], [x20]\n"
+ "b 55f\n"
+ "53:" // Oddments: Load (4, 0): Bit 2: Unset
+ "tbz x7, #1, 54f\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 55f\n"
+ "ld1 { v16.b }[2], [x20]\n"
+ "b 55f\n"
+ "54:" // Oddments: Load (4, 0): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 55f\n"
+ "ld1 { v16.b }[0], [x20]\n"
+ "55:" // Oddments: Load (4, 0): Bit 2: End
+ "ssubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x20, [x15, #0x98]\n"
+ "smlal v20.4s, v16.4h, v26.4h\n"
+ "smlal2 v0.4s, v16.8h, v26.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 57f\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 56f\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 59f\n"
+ "ld1 { v16.b }[6], [x20]\n"
+ "b 59f\n"
+ "56:" // Oddments: Load (2, 4): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 59f\n"
+ "ld1 { v16.b }[4], [x20]\n"
+ "b 59f\n"
+ "57:" // Oddments: Load (2, 4): Bit 2: Unset
+ "tbz x7, #1, 58f\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 59f\n"
+ "ld1 { v16.b }[2], [x20]\n"
+ "b 59f\n"
+ "58:" // Oddments: Load (2, 4): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 59f\n"
+ "ld1 { v16.b }[0], [x20]\n"
+ "59:" // Oddments: Load (2, 4): Bit 2: End
+ "ssubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x20, [x15, #0xa0]\n"
+ "smlal v21.4s, v16.4h, v4.4h\n"
+ "smlal2 v8.4s, v16.8h, v4.8h\n"
+ "smlal v19.4s, v16.4h, v14.4h\n"
+ "smlal2 v31.4s, v16.8h, v14.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 61f\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 60f\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 63f\n"
+ "ld1 { v16.b }[6], [x20]\n"
+ "b 63f\n"
+ "60:" // Oddments: Load (4, 1): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 63f\n"
+ "ld1 { v16.b }[4], [x20]\n"
+ "b 63f\n"
+ "61:" // Oddments: Load (4, 1): Bit 2: Unset
+ "tbz x7, #1, 62f\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 63f\n"
+ "ld1 { v16.b }[2], [x20]\n"
+ "b 63f\n"
+ "62:" // Oddments: Load (4, 1): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 63f\n"
+ "ld1 { v16.b }[0], [x20]\n"
+ "63:" // Oddments: Load (4, 1): Bit 2: End
+ "ssubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x20, [x15, #0xa8]\n"
+ "smlal v20.4s, v16.4h, v7.4h\n"
+ "smlal2 v0.4s, v16.8h, v7.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 65f\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 64f\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 67f\n"
+ "ld1 { v16.b }[6], [x20]\n"
+ "b 67f\n"
+ "64:" // Oddments: Load (3, 2): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 67f\n"
+ "ld1 { v16.b }[4], [x20]\n"
+ "b 67f\n"
+ "65:" // Oddments: Load (3, 2): Bit 2: Unset
+ "tbz x7, #1, 66f\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 67f\n"
+ "ld1 { v16.b }[2], [x20]\n"
+ "b 67f\n"
+ "66:" // Oddments: Load (3, 2): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 67f\n"
+ "ld1 { v16.b }[0], [x20]\n"
+ "67:" // Oddments: Load (3, 2): Bit 2: End
+ "ssubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x20, [x15, #0xb0]\n"
+ "smlal v20.4s, v16.4h, v9.4h\n"
+ "smlal2 v0.4s, v16.8h, v9.8h\n"
+ "smlal v19.4s, v16.4h, v28.4h\n"
+ "smlal2 v31.4s, v16.8h, v28.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 69f\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 68f\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 71f\n"
+ "ld1 { v16.b }[6], [x20]\n"
+ "b 71f\n"
+ "68:" // Oddments: Load (4, 3): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 71f\n"
+ "ld1 { v16.b }[4], [x20]\n"
+ "b 71f\n"
+ "69:" // Oddments: Load (4, 3): Bit 2: Unset
+ "tbz x7, #1, 70f\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 71f\n"
+ "ld1 { v16.b }[2], [x20]\n"
+ "b 71f\n"
+ "70:" // Oddments: Load (4, 3): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 71f\n"
+ "ld1 { v16.b }[0], [x20]\n"
+ "71:" // Oddments: Load (4, 3): Bit 2: End
+ "ssubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x20, [x15, #0xb8]\n"
+ "smlal v19.4s, v16.4h, v7.4h\n"
+ "smlal2 v31.4s, v16.8h, v7.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 73f\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 72f\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 75f\n"
+ "ld1 { v16.b }[6], [x20]\n"
+ "b 75f\n"
+ "72:" // Oddments: Load (4, 2): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 75f\n"
+ "ld1 { v16.b }[4], [x20]\n"
+ "b 75f\n"
+ "73:" // Oddments: Load (4, 2): Bit 2: Unset
+ "tbz x7, #1, 74f\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 75f\n"
+ "ld1 { v16.b }[2], [x20]\n"
+ "b 75f\n"
+ "74:" // Oddments: Load (4, 2): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 75f\n"
+ "ld1 { v16.b }[0], [x20]\n"
+ "75:" // Oddments: Load (4, 2): Bit 2: End
+ "ssubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x20, [x15, #0xc0]\n"
+ "smlal v20.4s, v16.4h, v4.4h\n"
+ "smlal2 v0.4s, v16.8h, v4.8h\n"
+ "smlal v19.4s, v16.4h, v26.4h\n"
+ "smlal2 v31.4s, v16.8h, v26.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 77f\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 76f\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 79f\n"
+ "ld1 { v16.b }[6], [x20]\n"
+ "b 79f\n"
+ "76:" // Oddments: Load (4, 4): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 79f\n"
+ "ld1 { v16.b }[4], [x20]\n"
+ "b 79f\n"
+ "77:" // Oddments: Load (4, 4): Bit 2: Unset
+ "tbz x7, #1, 78f\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 79f\n"
+ "ld1 { v16.b }[2], [x20]\n"
+ "b 79f\n"
+ "78:" // Oddments: Load (4, 4): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 79f\n"
+ "ld1 { v16.b }[0], [x20]\n"
+ "79:" // Oddments: Load (4, 4): Bit 2: End
+ "ssubl v16.8h, v16.8b, v6.8b\n"
+ "smlal v19.4s, v16.4h, v4.4h\n"
+ "smlal2 v31.4s, v16.8h, v4.8h\n"
+ "tbz x7, #2, 81f\n"
+ "ld1 { v14.4s }, [x13], #0x10\n"
+ "ld1 { v25.4s }, [x12], #0x10\n"
+ "tbz x7, #1, 80f\n"
+ "ld1 { v18.d }[0], [x13], #0x8\n"
+ "ld1 { v12.d }[0], [x12], #0x8\n"
+ "tbz x7, #0, 83f\n"
+ "ld1 { v18.s }[2], [x13]\n"
+ "ld1 { v12.s }[2], [x12]\n"
+ "b 83f\n"
+ "80:" // Oddments: Load requant params: Bit 2: Bit 1: Unset
+ "tbz x7, #0, 83f\n"
+ "ld1 { v18.s }[0], [x13]\n"
+ "ld1 { v12.s }[0], [x12]\n"
+ "b 83f\n"
+ "81:" // Oddments: Load requant params: Bit 2: Unset
+ "tbz x7, #1, 82f\n"
+ "ld1 { v14.d }[0], [x13], #0x8\n"
+ "ld1 { v25.d }[0], [x12], #0x8\n"
+ "tbz x7, #0, 83f\n"
+ "ld1 { v14.s }[2], [x13]\n"
+ "ld1 { v25.s }[2], [x12]\n"
+ "b 83f\n"
+ "82:" // Oddments: Load requant params: Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 83f\n"
+ "ld1 { v14.s }[0], [x13]\n"
+ "ld1 { v25.s }[0], [x12]\n"
+ "83:" // Oddments: Load requant params: Bit 2: End
+ "sqrdmulh v5.4s, v5.4s, v14.4s\n"
+ "and v28.16b, v5.16b, v25.16b\n"
+ "add x11, x11, x16\n"
+ "add x10, x10, x16\n"
+ "sqrdmulh v3.4s, v3.4s, v18.4s\n"
+ "sshr v28.4s, v28.4s, #0x1f\n"
+ "add x9, x9, x16\n"
+ "add x28, x28, x16\n"
+ "and v16.16b, v3.16b, v12.16b\n"
+ "sqrdmulh v21.4s, v21.4s, v14.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v14.4s\n"
+ "sqrdmulh v19.4s, v19.4s, v14.4s\n"
+ "sqadd v5.4s, v5.4s, v28.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "and v14.16b, v21.16b, v25.16b\n"
+ "sqrdmulh v8.4s, v8.4s, v18.4s\n"
+ "and v6.16b, v20.16b, v25.16b\n"
+ "sqrdmulh v0.4s, v0.4s, v18.4s\n"
+ "and v4.16b, v19.16b, v25.16b\n"
+ "sqrdmulh v31.4s, v31.4s, v18.4s\n"
+ "sqadd v3.4s, v3.4s, v16.4s\n"
+ "sshr v14.4s, v14.4s, #0x1f\n"
+ "and v18.16b, v8.16b, v12.16b\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
+ "and v7.16b, v0.16b, v12.16b\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "and v16.16b, v31.16b, v12.16b\n"
+ "sqadd v21.4s, v21.4s, v14.4s\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sqadd v20.4s, v20.4s, v6.4s\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
+ "sqadd v19.4s, v19.4s, v4.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "srshl v5.4s, v5.4s, v25.4s\n"
+ "srshl v21.4s, v21.4s, v25.4s\n"
+ "sqadd v8.4s, v8.4s, v18.4s\n"
+ "srshl v20.4s, v20.4s, v25.4s\n"
+ "sqadd v0.4s, v0.4s, v7.4s\n"
+ "srshl v19.4s, v19.4s, v25.4s\n"
+ "sqadd v31.4s, v31.4s, v16.4s\n"
+ "srshl v3.4s, v3.4s, v12.4s\n"
+ "sqxtn v5.4h, v5.4s\n"
+ "srshl v8.4s, v8.4s, v12.4s\n"
+ "sqxtn v21.4h, v21.4s\n"
+ "srshl v0.4s, v0.4s, v12.4s\n"
+ "sqxtn v20.4h, v20.4s\n"
+ "srshl v31.4s, v31.4s, v12.4s\n"
+ "sqxtn v19.4h, v19.4s\n"
+ "sqxtn2 v5.8h, v3.4s\n"
+ "sqxtn2 v21.8h, v8.4s\n"
+ "sqxtn2 v20.8h, v0.4s\n"
+ "sqxtn2 v19.8h, v31.4s\n"
+ "sqadd v5.8h, v5.8h, v13.8h\n"
+ "sqadd v21.8h, v21.8h, v13.8h\n"
+ "sqadd v20.8h, v20.8h, v13.8h\n"
+ "sqadd v19.8h, v19.8h, v13.8h\n"
+ "smax v5.8h, v5.8h, v17.8h\n"
+ "smax v21.8h, v21.8h, v17.8h\n"
+ "smax v20.8h, v20.8h, v17.8h\n"
+ "smax v19.8h, v19.8h, v17.8h\n"
+ "smin v5.8h, v5.8h, v24.8h\n"
+ "smin v21.8h, v21.8h, v24.8h\n"
+ "smin v20.8h, v20.8h, v24.8h\n"
+ "smin v19.8h, v19.8h, v24.8h\n"
+ "uzp1 v5.16b, v5.16b, v5.16b\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "tbz x7, #2, 85f\n"
+ "st1 { v5.s }[0], [x11], #0x4\n"
+ "st1 { v21.s }[0], [x10], #0x4\n"
+ "st1 { v20.s }[0], [x9], #0x4\n"
+ "st1 { v19.s }[0], [x28], #0x4\n"
+ "tbz x7, #1, 84f\n"
+ "st1 { v5.h }[2], [x11], #0x2\n"
+ "st1 { v21.h }[2], [x10], #0x2\n"
+ "st1 { v20.h }[2], [x9], #0x2\n"
+ "st1 { v19.h }[2], [x28], #0x2\n"
+ "tbz x7, #0, 87f\n"
+ "st1 { v5.b }[6], [x11], #0x1\n"
+ "st1 { v21.b }[6], [x10], #0x1\n"
+ "st1 { v20.b }[6], [x9], #0x1\n"
+ "st1 { v19.b }[6], [x28], #0x1\n"
+ "b 87f\n"
+ "84:" // Oddments: Bit 2: Bit 1: Unset
+ "tbz x7, #0, 87f\n"
+ "st1 { v5.b }[4], [x11], #0x1\n"
+ "st1 { v21.b }[4], [x10], #0x1\n"
+ "st1 { v20.b }[4], [x9], #0x1\n"
+ "st1 { v19.b }[4], [x28], #0x1\n"
+ "b 87f\n"
+ "85:" // Oddments: Bit 2: Unset
+ "tbz x7, #1, 86f\n"
+ "st1 { v5.h }[0], [x11], #0x2\n"
+ "st1 { v21.h }[0], [x10], #0x2\n"
+ "st1 { v20.h }[0], [x9], #0x2\n"
+ "st1 { v19.h }[0], [x28], #0x2\n"
+ "tbz x7, #0, 87f\n"
+ "st1 { v5.b }[2], [x11], #0x1\n"
+ "st1 { v21.b }[2], [x10], #0x1\n"
+ "st1 { v20.b }[2], [x9], #0x1\n"
+ "st1 { v19.b }[2], [x28], #0x1\n"
+ "b 87f\n"
+ "86:" // Oddments: Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 87f\n"
+ "st1 { v5.b }[0], [x11], #0x1\n"
+ "st1 { v21.b }[0], [x10], #0x1\n"
+ "st1 { v20.b }[0], [x9], #0x1\n"
+ "st1 { v19.b }[0], [x28], #0x1\n"
+ "87:" // Oddments: Bit 2: End
+ "88:" // End
+ :
+ : [offsetof_Params_bias] "I" (offsetof(Params, bias)), [offsetof_Params_inptrs] "I" (offsetof(Params, inptrs)), [offsetof_Params_n_channels] "I" (offsetof(Params, n_channels)), [offsetof_Params_outptrs] "I" (offsetof(Params, outptrs)), [offsetof_Params_requant] "I" (offsetof(Params, requant)), [offsetof_Params_requant_muls] "I" (offsetof(Params, requant_muls)), [offsetof_Params_requant_shifts] "I" (offsetof(Params, requant_shifts)), [offsetof_Params_weights] "I" (offsetof(Params, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [params] "r" (&params)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
new file mode 100644
index 0000000000..ccab35ce57
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) 2021-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "utils.hpp"
+#include "src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+// Forward declaration of the hand-written AArch64 assembly kernel; the
+// definition lives in the matching generic.cpp for this strategy.
+void a64_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl(
+  const unsigned int,                // n_channels
+  const int8_t *const *const,        // inptrs: array of input pointers
+  const int8_t *const,               // weights
+  const int32_t *const,              // bias
+  const arm_gemm::Requantize32 &,    // quantisation parameters
+  const int32_t *const,              // requant_muls
+  const int32_t *const,              // requant_shifts
+  int8_t *const *const               // outptrs: array of output pointers
+);
+
+
+
+// Strategy class binding the s8q (signed 8-bit quantised) NHWC depthwise
+// kernel — 5x5 filter, unit stride, 2x2 output tile, multiply-accumulate
+// (MLA) based — to the generic depthfirst driver machinery.
+class a64_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy<int8_t, int8_t, int8_t, int32_t>
+{
+  using Parent = DepthwiseDepthfirstStrategy<int8_t, int8_t, int8_t, int32_t>;
+
+  public:
+  // Filter dimensions handled by this kernel.
+  constexpr static unsigned int kernel_rows = 5;
+  constexpr static unsigned int kernel_cols = 5;
+
+  // Convolution stride in each spatial dimension.
+  constexpr static unsigned int stride_rows = 1;
+  constexpr static unsigned int stride_cols = 1;
+
+  // Parent arguments (2, 2, 5, 5, 1, 1) match the 2x2 output tile, 5x5
+  // kernel and unit stride encoded in the class name.
+  a64_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst(const CPUInfo *) : Parent(2, 2, 5, 5, 1, 1) {}
+
+  // Fixed-width NEON implementation: no scalable-vector (SVE) length here.
+  arm_gemm::VLType get_vl_type(void) const override { return arm_gemm::VLType::None; }
+
+  // The actual compute routine is the assembly function declared above.
+  Parent::KernelType kernel = a64_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl;
+  Parent::KernelType get_kernel(void) const override { return kernel; }
+  unsigned int get_accumulator_depth_vl(void) const override { return 2; }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp
new file mode 100644
index 0000000000..b1648bae14
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp
@@ -0,0 +1,2187 @@
+/*
+ * Copyright (c) 2021-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_gemm.hpp"
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl(
+ const unsigned int n_channels,
+ const int8_t *const *const inptrs,
+ const int8_t *const weights,
+ const int32_t *const bias,
+ const arm_gemm::Requantize32 &qp,
+ const int32_t *const requant_muls,
+ const int32_t *const requant_shifts,
+ int8_t *const *const outptrs
+)
+{
+ struct Params
+ {
+ uint64_t n_channels;
+ const void *weights;
+ const int32_t *bias;
+ const arm_gemm::Requantize32 *requant;
+ const int32_t *const requant_muls;
+ const int32_t *const requant_shifts;
+ int8_t *const *const outptrs;
+ const int8_t *inptrs[36];
+
+ Params(
+ long unsigned int n_channels,
+ const int8_t *const *inptrs_raw,
+ const void *const weights,
+ const int32_t *const bias,
+ const arm_gemm::Requantize32 &qp,
+ const int32_t *const requant_muls,
+ const int32_t *const requant_shifts,
+ int8_t *const *outptrs
+ ) : n_channels(n_channels), weights(weights), bias(bias),
+ requant(&qp), requant_muls(requant_muls),
+ requant_shifts(requant_shifts), outptrs(outptrs)
+ {
+ inptrs[0] = inptrs_raw[0];
+ inptrs[1] = inptrs_raw[1];
+ inptrs[2] = inptrs_raw[6];
+ inptrs[3] = inptrs_raw[7];
+ inptrs[4] = inptrs_raw[2];
+ inptrs[5] = inptrs_raw[8];
+ inptrs[6] = inptrs_raw[3];
+ inptrs[7] = inptrs_raw[4];
+ inptrs[8] = inptrs_raw[11];
+ inptrs[9] = inptrs_raw[12];
+ inptrs[10] = inptrs_raw[9];
+ inptrs[11] = inptrs_raw[10];
+ inptrs[12] = inptrs_raw[5];
+ inptrs[13] = inptrs_raw[13];
+ inptrs[14] = inptrs_raw[14];
+ inptrs[15] = inptrs_raw[15];
+ inptrs[16] = inptrs_raw[16];
+ inptrs[17] = inptrs_raw[17];
+ inptrs[18] = inptrs_raw[18];
+ inptrs[19] = inptrs_raw[19];
+ inptrs[20] = inptrs_raw[20];
+ inptrs[21] = inptrs_raw[21];
+ inptrs[22] = inptrs_raw[22];
+ inptrs[23] = inptrs_raw[23];
+ inptrs[24] = inptrs_raw[24];
+ inptrs[25] = inptrs_raw[25];
+ inptrs[26] = inptrs_raw[26];
+ inptrs[27] = inptrs_raw[27];
+ inptrs[28] = inptrs_raw[28];
+ inptrs[29] = inptrs_raw[29];
+ inptrs[30] = inptrs_raw[30];
+ inptrs[31] = inptrs_raw[31];
+ inptrs[32] = inptrs_raw[32];
+ inptrs[33] = inptrs_raw[33];
+ inptrs[34] = inptrs_raw[34];
+ inptrs[35] = inptrs_raw[35];
+
+ }
+ };
+
+ const Params params(n_channels, inptrs, weights, bias, qp,
+ requant_muls, requant_shifts, outptrs);
+
+ __asm__ __volatile__(
+ "ldr x1, [%x[params], %[offsetof_Params_n_channels]]\n"
+ "ldr x23, [%x[params], %[offsetof_Params_requant]]\n"
+ "lsr x2, x1, #0x3\n"
+ "add x20, x23, %[offsetof_Requantize32_a_offset]\n"
+ "ld1r { v18.16b }, [x20]\n"
+ "ldr x22, [%x[params], %[offsetof_Params_outptrs]]\n"
+ "add x21, x23, %[offsetof_Requantize32_b_offset]\n"
+ "add x20, x23, %[offsetof_Requantize32_c_offset]\n"
+ "ld1r { v13.16b }, [x21]\n"
+ "ld1r { v26.8h }, [x20]\n"
+ "add x21, x23, %[offsetof_Requantize32_minval]\n"
+ "add x20, x23, %[offsetof_Requantize32_maxval]\n"
+ "ld1r { v11.8h }, [x21]\n"
+ "ld1r { v0.8h }, [x20]\n"
+ "mov x3, #0x0\n"
+ "mov x4, #0x0\n"
+ "add x5, %x[params], %[offsetof_Params_inptrs]\n"
+ "ldr x6, [%x[params], %[offsetof_Params_weights]]\n"
+ "ldr x7, [%x[params], %[offsetof_Params_requant_muls]]\n"
+ "ldr x8, [%x[params], %[offsetof_Params_requant_shifts]]\n"
+ "ldp x17, x16, [x22, #0x0]\n"
+ "ldp x15, x14, [x22, #0x10]\n"
+ "cbz x2, 3f\n"
+ "ldr d6, [x6, #0x0]\n"
+ "ldr d14, [x6, #0x8]\n"
+ "subs x2, x2, #0x1\n"
+ "ssubl v6.8h, v6.8b, v13.8b\n"
+ "ldr d10, [x6, #0x10]\n"
+ "ldr d21, [x6, #0x18]\n"
+ "ssubl v14.8h, v14.8b, v13.8b\n"
+ "ssubl v10.8h, v10.8b, v13.8b\n"
+ "ldr d12, [x6, #0x20]\n"
+ "ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "ssubl v21.8h, v21.8b, v13.8b\n"
+ "ssubl v12.8h, v12.8b, v13.8b\n"
+ "ldr q7, [x20, #0x0]\n"
+ "ldr q15, [x20, #0x10]\n"
+ "add x20, x20, #0x20\n"
+ "str x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldp x9, x28, [x5, #0x0]\n"
+ "ldp x27, x26, [x5, #0x10]\n"
+ "mov v20.16b, v7.16b\n"
+ "mov v5.16b, v15.16b\n"
+ "ldp x25, x24, [x5, #0x20]\n"
+ "ldp x23, x22, [x5, #0x30]\n"
+ "mov v24.16b, v7.16b\n"
+ "mov v22.16b, v15.16b\n"
+ "ldp x21, x20, [x5, #0x40]\n"
+ "ldr d31, [x9, x3]\n"
+ "mov v23.16b, v7.16b\n"
+ "mov v19.16b, v15.16b\n"
+ "ldr d17, [x28, x3]\n"
+ "ldr d30, [x27, x3]\n"
+ "ssubl v31.8h, v31.8b, v18.8b\n"
+ "ssubl v17.8h, v17.8b, v18.8b\n"
+ "ldr d16, [x26, x3]\n"
+ "ldr d3, [x25, x3]\n"
+ "ssubl v30.8h, v30.8b, v18.8b\n"
+ "ssubl v16.8h, v16.8b, v18.8b\n"
+ "ldr d4, [x24, x3]\n"
+ "ldr d25, [x23, x3]\n"
+ "ssubl v3.8h, v3.8b, v18.8b\n"
+ "ssubl v4.8h, v4.8b, v18.8b\n"
+ "ldr d9, [x22, x3]\n"
+ "ldr d29, [x21, x3]\n"
+ "ssubl v25.8h, v25.8b, v18.8b\n"
+ "ssubl v9.8h, v9.8b, v18.8b\n"
+ "ldr d28, [x20, x3]\n"
+ "ssubl v29.8h, v29.8b, v18.8b\n"
+ "ssubl v28.8h, v28.8b, v18.8b\n"
+ "beq 2f\n"
+ "1:" // Loop
+ "ldr d2, [x6, #0x28]\n"
+ "ldr d27, [x6, #0x30]\n"
+ "smlal v7.4s, v31.4h, v6.4h\n"
+ "smlal2 v15.4s, v31.8h, v6.8h\n"
+ "ldr d1, [x6, #0x38]\n"
+ "ldr d31, [x6, #0x40]\n"
+ "smlal v7.4s, v17.4h, v14.4h\n"
+ "smlal v20.4s, v17.4h, v6.4h\n"
+ "ldr d8, [x6, #0x48]\n"
+ "ldr x22, [x5, #0x50]\n"
+ "smlal v24.4s, v30.4h, v6.4h\n"
+ "smlal v23.4s, v16.4h, v6.4h\n"
+ "smlal2 v15.4s, v17.8h, v14.8h\n"
+ "smlal v7.4s, v3.4h, v10.4h\n"
+ "ldr x20, [x5, #0x58]\n"
+ "ldr x21, [x5, #0x60]\n"
+ "smlal2 v5.4s, v17.8h, v6.8h\n"
+ "ldr d17, [x22, x3]\n"
+ "smlal2 v22.4s, v30.8h, v6.8h\n"
+ "ssubl v17.8h, v17.8b, v18.8b\n"
+ "smlal2 v19.4s, v16.8h, v6.8h\n"
+ "ldr d6, [x20, x3]\n"
+ "smlal v20.4s, v3.4h, v14.4h\n"
+ "ssubl v6.8h, v6.8b, v18.8b\n"
+ "smlal v24.4s, v16.4h, v14.4h\n"
+ "smlal v23.4s, v4.4h, v14.4h\n"
+ "ssubl v2.8h, v2.8b, v13.8b\n"
+ "ldr x20, [x5, #0x68]\n"
+ "smlal2 v15.4s, v3.8h, v10.8h\n"
+ "smlal v7.4s, v25.4h, v21.4h\n"
+ "ssubl v27.8h, v27.8b, v13.8b\n"
+ "ldr x22, [x5, #0x70]\n"
+ "smlal2 v5.4s, v3.8h, v14.8h\n"
+ "ldr d3, [x21, x3]\n"
+ "smlal2 v22.4s, v16.8h, v14.8h\n"
+ "ssubl v3.8h, v3.8b, v18.8b\n"
+ "smlal2 v19.4s, v4.8h, v14.8h\n"
+ "ldr d14, [x20, x3]\n"
+ "smlal v20.4s, v25.4h, v10.4h\n"
+ "ssubl v14.8h, v14.8b, v18.8b\n"
+ "smlal v24.4s, v4.4h, v10.4h\n"
+ "smlal v23.4s, v17.4h, v10.4h\n"
+ "ssubl v1.8h, v1.8b, v13.8b\n"
+ "ldr x20, [x5, #0x78]\n"
+ "smlal2 v15.4s, v25.8h, v21.8h\n"
+ "smlal v7.4s, v9.4h, v12.4h\n"
+ "ssubl v31.8h, v31.8b, v13.8b\n"
+ "ldr x21, [x5, #0x80]\n"
+ "smlal2 v5.4s, v25.8h, v10.8h\n"
+ "ldr d25, [x22, x3]\n"
+ "smlal2 v22.4s, v4.8h, v10.8h\n"
+ "ssubl v25.8h, v25.8b, v18.8b\n"
+ "smlal2 v19.4s, v17.8h, v10.8h\n"
+ "ldr d10, [x20, x3]\n"
+ "smlal v20.4s, v9.4h, v21.4h\n"
+ "ssubl v10.8h, v10.8b, v18.8b\n"
+ "smlal v24.4s, v17.4h, v21.4h\n"
+ "smlal v23.4s, v6.4h, v21.4h\n"
+ "ssubl v8.8h, v8.8b, v13.8b\n"
+ "ldr x24, [x5, #0x88]\n"
+ "smlal2 v15.4s, v9.8h, v12.8h\n"
+ "smlal v7.4s, v30.4h, v2.4h\n"
+ "ldr x20, [x5, #0x90]\n"
+ "ldr x23, [x5, #0x98]\n"
+ "smlal2 v5.4s, v9.8h, v21.8h\n"
+ "ldr d9, [x21, x3]\n"
+ "smlal2 v22.4s, v17.8h, v21.8h\n"
+ "ssubl v9.8h, v9.8b, v18.8b\n"
+ "smlal2 v19.4s, v6.8h, v21.8h\n"
+ "ldr d21, [x6, #0x50]\n"
+ "smlal v20.4s, v3.4h, v12.4h\n"
+ "ssubl v21.8h, v21.8b, v13.8b\n"
+ "smlal v24.4s, v6.4h, v12.4h\n"
+ "smlal v23.4s, v29.4h, v12.4h\n"
+ "ldr x22, [x5, #0xa0]\n"
+ "ldr x21, [x5, #0xa8]\n"
+ "smlal2 v15.4s, v30.8h, v2.8h\n"
+ "ldr d30, [x24, x3]\n"
+ "smlal v7.4s, v16.4h, v27.4h\n"
+ "ssubl v30.8h, v30.8b, v18.8b\n"
+ "smlal2 v5.4s, v3.8h, v12.8h\n"
+ "ldr d3, [x6, #0x58]\n"
+ "smlal2 v22.4s, v6.8h, v12.8h\n"
+ "ssubl v3.8h, v3.8b, v13.8b\n"
+ "smlal2 v19.4s, v29.8h, v12.8h\n"
+ "ldr d12, [x20, x3]\n"
+ "smlal v20.4s, v16.4h, v2.4h\n"
+ "ssubl v12.8h, v12.8b, v18.8b\n"
+ "smlal v24.4s, v28.4h, v2.4h\n"
+ "smlal v23.4s, v14.4h, v2.4h\n"
+ "ldr x20, [x5, #0xb0]\n"
+ "ldr x13, [x5, #0xb8]\n"
+ "smlal2 v15.4s, v16.8h, v27.8h\n"
+ "smlal v7.4s, v4.4h, v1.4h\n"
+ "ldr x12, [x5, #0xc0]\n"
+ "ldr x11, [x5, #0xc8]\n"
+ "smlal2 v5.4s, v16.8h, v2.8h\n"
+ "ldr d16, [x23, x3]\n"
+ "smlal2 v22.4s, v28.8h, v2.8h\n"
+ "ssubl v16.8h, v16.8b, v18.8b\n"
+ "smlal2 v19.4s, v14.8h, v2.8h\n"
+ "ldr d2, [x6, #0x60]\n"
+ "smlal v20.4s, v4.4h, v27.4h\n"
+ "ssubl v2.8h, v2.8b, v13.8b\n"
+ "smlal v24.4s, v14.4h, v27.4h\n"
+ "smlal v23.4s, v25.4h, v27.4h\n"
+ "ldr x10, [x5, #0xd0]\n"
+ "ldr x9, [x5, #0xd8]\n"
+ "smlal2 v15.4s, v4.8h, v1.8h\n"
+ "smlal v7.4s, v17.4h, v31.4h\n"
+ "ldr x28, [x5, #0xe0]\n"
+ "ldr x27, [x5, #0xe8]\n"
+ "smlal2 v5.4s, v4.8h, v27.8h\n"
+ "ldr d4, [x22, x3]\n"
+ "smlal2 v22.4s, v14.8h, v27.8h\n"
+ "ssubl v4.8h, v4.8b, v18.8b\n"
+ "smlal2 v19.4s, v25.8h, v27.8h\n"
+ "ldr d27, [x6, #0x68]\n"
+ "smlal v20.4s, v17.4h, v1.4h\n"
+ "ssubl v27.8h, v27.8b, v13.8b\n"
+ "smlal v24.4s, v25.4h, v1.4h\n"
+ "smlal v23.4s, v10.4h, v1.4h\n"
+ "ldr x26, [x5, #0xf0]\n"
+ "ldr x25, [x5, #0xf8]\n"
+ "smlal2 v15.4s, v17.8h, v31.8h\n"
+ "smlal v7.4s, v6.4h, v8.4h\n"
+ "ldr x24, [x5, #0x100]\n"
+ "ldr x23, [x5, #0x108]\n"
+ "smlal2 v5.4s, v17.8h, v1.8h\n"
+ "ldr d17, [x21, x3]\n"
+ "smlal2 v22.4s, v25.8h, v1.8h\n"
+ "ssubl v17.8h, v17.8b, v18.8b\n"
+ "smlal2 v19.4s, v10.8h, v1.8h\n"
+ "ldr d1, [x6, #0x70]\n"
+ "smlal v20.4s, v6.4h, v31.4h\n"
+ "ssubl v1.8h, v1.8b, v13.8b\n"
+ "smlal v24.4s, v10.4h, v31.4h\n"
+ "smlal v23.4s, v9.4h, v31.4h\n"
+ "ldr x22, [x5, #0x110]\n"
+ "ldr x21, [x5, #0x118]\n"
+ "smlal2 v15.4s, v6.8h, v8.8h\n"
+ "smlal v7.4s, v28.4h, v21.4h\n"
+ "subs x2, x2, #0x1\n"
+ "smlal2 v5.4s, v6.8h, v31.8h\n"
+ "ldr d6, [x20, x3]\n"
+ "smlal2 v22.4s, v10.8h, v31.8h\n"
+ "ssubl v6.8h, v6.8b, v18.8b\n"
+ "smlal2 v19.4s, v9.8h, v31.8h\n"
+ "ldr d31, [x6, #0x78]\n"
+ "smlal v20.4s, v29.4h, v8.4h\n"
+ "ssubl v31.8h, v31.8b, v13.8b\n"
+ "smlal v24.4s, v9.4h, v8.4h\n"
+ "smlal v23.4s, v30.4h, v8.4h\n"
+ "ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "smlal2 v15.4s, v28.8h, v21.8h\n"
+ "ldr d28, [x13, x3]\n"
+ "smlal v7.4s, v14.4h, v3.4h\n"
+ "ssubl v28.8h, v28.8b, v18.8b\n"
+ "smlal2 v5.4s, v29.8h, v8.8h\n"
+ "ldr d29, [x6, #0x80]\n"
+ "smlal2 v22.4s, v9.8h, v8.8h\n"
+ "ssubl v29.8h, v29.8b, v13.8b\n"
+ "smlal2 v19.4s, v30.8h, v8.8h\n"
+ "ldr d8, [x12, x3]\n"
+ "smlal v20.4s, v14.4h, v21.4h\n"
+ "ssubl v8.8h, v8.8b, v18.8b\n"
+ "smlal v24.4s, v12.4h, v21.4h\n"
+ "smlal v23.4s, v16.4h, v21.4h\n"
+ "smlal2 v15.4s, v14.8h, v3.8h\n"
+ "smlal v7.4s, v25.4h, v2.4h\n"
+ "smlal2 v5.4s, v14.8h, v21.8h\n"
+ "ldr d14, [x11, x3]\n"
+ "smlal2 v22.4s, v12.8h, v21.8h\n"
+ "ssubl v14.8h, v14.8b, v18.8b\n"
+ "smlal2 v19.4s, v16.8h, v21.8h\n"
+ "ldr d21, [x6, #0x88]\n"
+ "smlal v20.4s, v25.4h, v3.4h\n"
+ "ssubl v21.8h, v21.8b, v13.8b\n"
+ "smlal v24.4s, v16.4h, v3.4h\n"
+ "smlal v23.4s, v4.4h, v3.4h\n"
+ "smlal2 v15.4s, v25.8h, v2.8h\n"
+ "smlal v7.4s, v10.4h, v27.4h\n"
+ "smlal2 v5.4s, v25.8h, v3.8h\n"
+ "ldr d25, [x10, x3]\n"
+ "smlal2 v22.4s, v16.8h, v3.8h\n"
+ "ssubl v25.8h, v25.8b, v18.8b\n"
+ "smlal2 v19.4s, v4.8h, v3.8h\n"
+ "ldr d3, [x6, #0x90]\n"
+ "smlal v20.4s, v10.4h, v2.4h\n"
+ "ssubl v3.8h, v3.8b, v13.8b\n"
+ "smlal v24.4s, v4.4h, v2.4h\n"
+ "smlal v23.4s, v17.4h, v2.4h\n"
+ "smlal2 v15.4s, v10.8h, v27.8h\n"
+ "smlal v7.4s, v9.4h, v1.4h\n"
+ "smlal2 v5.4s, v10.8h, v2.8h\n"
+ "ldr d10, [x9, x3]\n"
+ "smlal2 v22.4s, v4.8h, v2.8h\n"
+ "ssubl v10.8h, v10.8b, v18.8b\n"
+ "smlal2 v19.4s, v17.8h, v2.8h\n"
+ "ldr d2, [x6, #0x98]\n"
+ "smlal v20.4s, v9.4h, v27.4h\n"
+ "ssubl v2.8h, v2.8b, v13.8b\n"
+ "smlal v24.4s, v17.4h, v27.4h\n"
+ "smlal v23.4s, v6.4h, v27.4h\n"
+ "smlal2 v15.4s, v9.8h, v1.8h\n"
+ "smlal v7.4s, v12.4h, v31.4h\n"
+ "smlal2 v5.4s, v9.8h, v27.8h\n"
+ "ldr d9, [x28, x3]\n"
+ "smlal2 v22.4s, v17.8h, v27.8h\n"
+ "ssubl v9.8h, v9.8b, v18.8b\n"
+ "smlal2 v19.4s, v6.8h, v27.8h\n"
+ "ldr d27, [x6, #0xa0]\n"
+ "smlal v20.4s, v30.4h, v1.4h\n"
+ "ssubl v27.8h, v27.8b, v13.8b\n"
+ "smlal v24.4s, v6.4h, v1.4h\n"
+ "smlal v23.4s, v28.4h, v1.4h\n"
+ "smlal2 v15.4s, v12.8h, v31.8h\n"
+ "ldr d12, [x27, x3]\n"
+ "smlal v7.4s, v16.4h, v29.4h\n"
+ "ssubl v12.8h, v12.8b, v18.8b\n"
+ "smlal2 v5.4s, v30.8h, v1.8h\n"
+ "ldr d30, [x6, #0xa8]\n"
+ "smlal2 v22.4s, v6.8h, v1.8h\n"
+ "ssubl v30.8h, v30.8b, v13.8b\n"
+ "smlal2 v19.4s, v28.8h, v1.8h\n"
+ "ldr d1, [x26, x3]\n"
+ "smlal v20.4s, v16.4h, v31.4h\n"
+ "ssubl v1.8h, v1.8b, v18.8b\n"
+ "smlal v24.4s, v8.4h, v31.4h\n"
+ "smlal v23.4s, v14.4h, v31.4h\n"
+ "smlal2 v15.4s, v16.8h, v29.8h\n"
+ "smlal v7.4s, v4.4h, v21.4h\n"
+ "smlal2 v5.4s, v16.8h, v31.8h\n"
+ "ldr d16, [x25, x3]\n"
+ "smlal2 v22.4s, v8.8h, v31.8h\n"
+ "ssubl v16.8h, v16.8b, v18.8b\n"
+ "smlal2 v19.4s, v14.8h, v31.8h\n"
+ "ldr d31, [x6, #0xb0]\n"
+ "smlal v20.4s, v4.4h, v29.4h\n"
+ "ssubl v31.8h, v31.8b, v13.8b\n"
+ "smlal v24.4s, v14.4h, v29.4h\n"
+ "smlal v23.4s, v25.4h, v29.4h\n"
+ "smlal2 v15.4s, v4.8h, v21.8h\n"
+ "smlal v7.4s, v17.4h, v3.4h\n"
+ "smlal2 v5.4s, v4.8h, v29.8h\n"
+ "ldr d4, [x24, x3]\n"
+ "smlal2 v22.4s, v14.8h, v29.8h\n"
+ "ssubl v4.8h, v4.8b, v18.8b\n"
+ "smlal2 v19.4s, v25.8h, v29.8h\n"
+ "ldr d29, [x6, #0xb8]\n"
+ "smlal v20.4s, v17.4h, v21.4h\n"
+ "ssubl v29.8h, v29.8b, v13.8b\n"
+ "smlal v24.4s, v25.4h, v21.4h\n"
+ "smlal v23.4s, v10.4h, v21.4h\n"
+ "smlal2 v15.4s, v17.8h, v3.8h\n"
+ "smlal v7.4s, v6.4h, v2.4h\n"
+ "smlal2 v5.4s, v17.8h, v21.8h\n"
+ "ldr d17, [x23, x3]\n"
+ "smlal2 v22.4s, v25.8h, v21.8h\n"
+ "ssubl v17.8h, v17.8b, v18.8b\n"
+ "smlal2 v19.4s, v10.8h, v21.8h\n"
+ "ldr d21, [x6, #0xc0]\n"
+ "smlal v20.4s, v6.4h, v3.4h\n"
+ "ssubl v21.8h, v21.8b, v13.8b\n"
+ "smlal v24.4s, v10.4h, v3.4h\n"
+ "smlal v23.4s, v9.4h, v3.4h\n"
+ "add x6, x6, #0xc8\n"
+ "smlal2 v15.4s, v6.8h, v2.8h\n"
+ "smlal v7.4s, v8.4h, v27.4h\n"
+ "smlal2 v5.4s, v6.8h, v3.8h\n"
+ "ldr d6, [x22, x3]\n"
+ "smlal2 v22.4s, v10.8h, v3.8h\n"
+ "ssubl v6.8h, v6.8b, v18.8b\n"
+ "smlal2 v19.4s, v9.8h, v3.8h\n"
+ "ldr d3, [x21, x3]\n"
+ "smlal v20.4s, v28.4h, v2.4h\n"
+ "ssubl v3.8h, v3.8b, v18.8b\n"
+ "smlal v24.4s, v9.4h, v2.4h\n"
+ "smlal v23.4s, v12.4h, v2.4h\n"
+ "add x3, x3, #0x8\n"
+ "smlal2 v15.4s, v8.8h, v27.8h\n"
+ "ldr q8, [x7, #0x0]\n"
+ "smlal v7.4s, v14.4h, v30.4h\n"
+ "smlal2 v5.4s, v28.8h, v2.8h\n"
+ "ldr q28, [x8, #0x0]\n"
+ "smlal2 v22.4s, v9.8h, v2.8h\n"
+ "smlal2 v19.4s, v12.8h, v2.8h\n"
+ "ldr q2, [x7, #0x10]\n"
+ "smlal v20.4s, v14.4h, v27.4h\n"
+ "add x7, x7, #0x20\n"
+ "smlal v24.4s, v1.4h, v27.4h\n"
+ "smlal v23.4s, v16.4h, v27.4h\n"
+ "smlal2 v15.4s, v14.8h, v30.8h\n"
+ "smlal v7.4s, v25.4h, v31.4h\n"
+ "smlal2 v5.4s, v14.8h, v27.8h\n"
+ "ldr q14, [x8, #0x10]\n"
+ "smlal2 v22.4s, v1.8h, v27.8h\n"
+ "add x8, x8, #0x20\n"
+ "smlal2 v19.4s, v16.8h, v27.8h\n"
+ "smlal v20.4s, v25.4h, v30.4h\n"
+ "smlal v24.4s, v16.4h, v30.4h\n"
+ "smlal v23.4s, v4.4h, v30.4h\n"
+ "smlal2 v15.4s, v25.8h, v31.8h\n"
+ "smlal v7.4s, v10.4h, v29.4h\n"
+ "smlal2 v5.4s, v25.8h, v30.8h\n"
+ "smlal2 v22.4s, v16.8h, v30.8h\n"
+ "smlal2 v19.4s, v4.8h, v30.8h\n"
+ "smlal v20.4s, v10.4h, v31.4h\n"
+ "smlal v24.4s, v4.4h, v31.4h\n"
+ "smlal v23.4s, v17.4h, v31.4h\n"
+ "smlal2 v15.4s, v10.8h, v29.8h\n"
+ "smlal v7.4s, v9.4h, v21.4h\n"
+ "sqrdmulh v7.4s, v7.4s, v8.4s\n"
+ "smlal2 v5.4s, v10.8h, v31.8h\n"
+ "smlal2 v22.4s, v4.8h, v31.8h\n"
+ "and v27.16b, v7.16b, v28.16b\n"
+ "smlal2 v19.4s, v17.8h, v31.8h\n"
+ "smlal v20.4s, v9.4h, v29.4h\n"
+ "sshr v27.4s, v27.4s, #0x1f\n"
+ "smlal v24.4s, v17.4h, v29.4h\n"
+ "smlal v23.4s, v6.4h, v29.4h\n"
+ "sqadd v7.4s, v7.4s, v27.4s\n"
+ "smlal2 v15.4s, v9.8h, v21.8h\n"
+ "smlal2 v5.4s, v9.8h, v29.8h\n"
+ "sqrdmulh v15.4s, v15.4s, v2.4s\n"
+ "smlal2 v22.4s, v17.8h, v29.8h\n"
+ "smlal2 v19.4s, v6.8h, v29.8h\n"
+ "and v9.16b, v15.16b, v14.16b\n"
+ "smlal v20.4s, v12.4h, v21.4h\n"
+ "smlal v24.4s, v6.4h, v21.4h\n"
+ "sqrdmulh v20.4s, v20.4s, v8.4s\n"
+ "smlal v23.4s, v3.4h, v21.4h\n"
+ "smlal2 v5.4s, v12.8h, v21.8h\n"
+ "sqrdmulh v24.4s, v24.4s, v8.4s\n"
+ "smlal2 v22.4s, v6.8h, v21.8h\n"
+ "smlal2 v19.4s, v3.8h, v21.8h\n"
+ "sqrdmulh v23.4s, v23.4s, v8.4s\n"
+ "sshr v9.4s, v9.4s, #0x1f\n"
+ "and v25.16b, v20.16b, v28.16b\n"
+ "sqrdmulh v5.4s, v5.4s, v2.4s\n"
+ "and v10.16b, v24.16b, v28.16b\n"
+ "sqrdmulh v22.4s, v22.4s, v2.4s\n"
+ "and v21.16b, v23.16b, v28.16b\n"
+ "sqrdmulh v19.4s, v19.4s, v2.4s\n"
+ "sqadd v15.4s, v15.4s, v9.4s\n"
+ "sshr v25.4s, v25.4s, #0x1f\n"
+ "and v9.16b, v5.16b, v14.16b\n"
+ "sshr v10.4s, v10.4s, #0x1f\n"
+ "and v12.16b, v22.16b, v14.16b\n"
+ "sshr v21.4s, v21.4s, #0x1f\n"
+ "and v17.16b, v19.16b, v14.16b\n"
+ "sqadd v20.4s, v20.4s, v25.4s\n"
+ "sshr v9.4s, v9.4s, #0x1f\n"
+ "sqadd v24.4s, v24.4s, v10.4s\n"
+ "sshr v12.4s, v12.4s, #0x1f\n"
+ "sqadd v23.4s, v23.4s, v21.4s\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "srshl v7.4s, v7.4s, v28.4s\n"
+ "srshl v20.4s, v20.4s, v28.4s\n"
+ "sqadd v5.4s, v5.4s, v9.4s\n"
+ "srshl v24.4s, v24.4s, v28.4s\n"
+ "sqadd v22.4s, v22.4s, v12.4s\n"
+ "srshl v23.4s, v23.4s, v28.4s\n"
+ "sqadd v19.4s, v19.4s, v17.4s\n"
+ "srshl v15.4s, v15.4s, v14.4s\n"
+ "sqxtn v7.4h, v7.4s\n"
+ "srshl v5.4s, v5.4s, v14.4s\n"
+ "sqxtn v20.4h, v20.4s\n"
+ "srshl v22.4s, v22.4s, v14.4s\n"
+ "sqxtn v24.4h, v24.4s\n"
+ "srshl v19.4s, v19.4s, v14.4s\n"
+ "sqxtn v23.4h, v23.4s\n"
+ "sqxtn2 v7.8h, v15.4s\n"
+ "sqxtn2 v20.8h, v5.4s\n"
+ "sqxtn2 v24.8h, v22.4s\n"
+ "sqxtn2 v23.8h, v19.4s\n"
+ "sqadd v7.8h, v7.8h, v26.8h\n"
+ "sqadd v20.8h, v20.8h, v26.8h\n"
+ "sqadd v24.8h, v24.8h, v26.8h\n"
+ "sqadd v23.8h, v23.8h, v26.8h\n"
+ "smax v7.8h, v7.8h, v11.8h\n"
+ "smax v20.8h, v20.8h, v11.8h\n"
+ "smax v24.8h, v24.8h, v11.8h\n"
+ "smax v23.8h, v23.8h, v11.8h\n"
+ "smin v7.8h, v7.8h, v0.8h\n"
+ "smin v20.8h, v20.8h, v0.8h\n"
+ "smin v24.8h, v24.8h, v0.8h\n"
+ "smin v23.8h, v23.8h, v0.8h\n"
+ "uzp1 v7.16b, v7.16b, v7.16b\n"
+ "str d7, [x17, x4]\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "str d20, [x16, x4]\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "str d24, [x15, x4]\n"
+ "str d23, [x14, x4]\n"
+ "ldr q7, [x20, #0x0]\n"
+ "ldr q15, [x20, #0x10]\n"
+ "add x20, x20, #0x20\n"
+ "ldr d6, [x6, #0x0]\n"
+ "ldr d14, [x6, #0x8]\n"
+ "add x4, x4, #0x8\n"
+ "str x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldr d10, [x6, #0x10]\n"
+ "ldr d21, [x6, #0x18]\n"
+ "mov v20.16b, v7.16b\n"
+ "mov v5.16b, v15.16b\n"
+ "ldr d12, [x6, #0x20]\n"
+ "ldp x9, x28, [x5, #0x0]\n"
+ "mov v24.16b, v7.16b\n"
+ "mov v22.16b, v15.16b\n"
+ "ldp x27, x26, [x5, #0x10]\n"
+ "ldp x25, x24, [x5, #0x20]\n"
+ "mov v23.16b, v7.16b\n"
+ "mov v19.16b, v15.16b\n"
+ "ldp x23, x22, [x5, #0x30]\n"
+ "ldp x21, x20, [x5, #0x40]\n"
+ "ssubl v6.8h, v6.8b, v13.8b\n"
+ "ssubl v14.8h, v14.8b, v13.8b\n"
+ "ldr d31, [x9, x3]\n"
+ "ldr d17, [x28, x3]\n"
+ "ssubl v10.8h, v10.8b, v13.8b\n"
+ "ssubl v21.8h, v21.8b, v13.8b\n"
+ "ldr d30, [x27, x3]\n"
+ "ldr d16, [x26, x3]\n"
+ "ssubl v12.8h, v12.8b, v13.8b\n"
+ "ssubl v31.8h, v31.8b, v18.8b\n"
+ "ldr d3, [x25, x3]\n"
+ "ldr d4, [x24, x3]\n"
+ "ssubl v17.8h, v17.8b, v18.8b\n"
+ "ssubl v30.8h, v30.8b, v18.8b\n"
+ "ldr d25, [x23, x3]\n"
+ "ldr d9, [x22, x3]\n"
+ "ssubl v16.8h, v16.8b, v18.8b\n"
+ "ssubl v3.8h, v3.8b, v18.8b\n"
+ "ldr d29, [x21, x3]\n"
+ "ldr d28, [x20, x3]\n"
+ "ssubl v4.8h, v4.8b, v18.8b\n"
+ "ssubl v25.8h, v25.8b, v18.8b\n"
+ "ssubl v9.8h, v9.8b, v18.8b\n"
+ "ssubl v29.8h, v29.8b, v18.8b\n"
+ "ssubl v28.8h, v28.8b, v18.8b\n"
+ "bgt 1b\n"
+ "2:" // Tail
+ "ldr d27, [x6, #0x28]\n"
+ "ldr d1, [x6, #0x30]\n"
+ "smlal v7.4s, v31.4h, v6.4h\n"
+ "smlal2 v15.4s, v31.8h, v6.8h\n"
+ "ldr d2, [x6, #0x38]\n"
+ "ldr d31, [x6, #0x40]\n"
+ "smlal v7.4s, v17.4h, v14.4h\n"
+ "smlal v20.4s, v17.4h, v6.4h\n"
+ "ldr d8, [x6, #0x48]\n"
+ "ldr x22, [x5, #0x50]\n"
+ "smlal v24.4s, v30.4h, v6.4h\n"
+ "smlal v23.4s, v16.4h, v6.4h\n"
+ "smlal2 v15.4s, v17.8h, v14.8h\n"
+ "smlal v7.4s, v3.4h, v10.4h\n"
+ "ldr x20, [x5, #0x58]\n"
+ "ldr x21, [x5, #0x60]\n"
+ "smlal2 v5.4s, v17.8h, v6.8h\n"
+ "ldr d17, [x22, x3]\n"
+ "smlal2 v22.4s, v30.8h, v6.8h\n"
+ "ssubl v17.8h, v17.8b, v18.8b\n"
+ "smlal2 v19.4s, v16.8h, v6.8h\n"
+ "ldr d6, [x20, x3]\n"
+ "smlal v20.4s, v3.4h, v14.4h\n"
+ "ssubl v6.8h, v6.8b, v18.8b\n"
+ "smlal v24.4s, v16.4h, v14.4h\n"
+ "smlal v23.4s, v4.4h, v14.4h\n"
+ "ssubl v27.8h, v27.8b, v13.8b\n"
+ "ldr x20, [x5, #0x68]\n"
+ "smlal2 v15.4s, v3.8h, v10.8h\n"
+ "smlal v7.4s, v25.4h, v21.4h\n"
+ "ssubl v1.8h, v1.8b, v13.8b\n"
+ "ldr x22, [x5, #0x70]\n"
+ "smlal2 v5.4s, v3.8h, v14.8h\n"
+ "ldr d3, [x21, x3]\n"
+ "smlal2 v22.4s, v16.8h, v14.8h\n"
+ "ssubl v3.8h, v3.8b, v18.8b\n"
+ "smlal2 v19.4s, v4.8h, v14.8h\n"
+ "ldr d14, [x20, x3]\n"
+ "smlal v20.4s, v25.4h, v10.4h\n"
+ "ssubl v14.8h, v14.8b, v18.8b\n"
+ "smlal v24.4s, v4.4h, v10.4h\n"
+ "smlal v23.4s, v17.4h, v10.4h\n"
+ "ssubl v2.8h, v2.8b, v13.8b\n"
+ "ldr x21, [x5, #0x78]\n"
+ "smlal2 v15.4s, v25.8h, v21.8h\n"
+ "smlal v7.4s, v9.4h, v12.4h\n"
+ "ssubl v31.8h, v31.8b, v13.8b\n"
+ "ldr x20, [x5, #0x80]\n"
+ "smlal2 v5.4s, v25.8h, v10.8h\n"
+ "ldr d25, [x22, x3]\n"
+ "smlal2 v22.4s, v4.8h, v10.8h\n"
+ "ssubl v25.8h, v25.8b, v18.8b\n"
+ "smlal2 v19.4s, v17.8h, v10.8h\n"
+ "ldr d10, [x21, x3]\n"
+ "smlal v20.4s, v9.4h, v21.4h\n"
+ "ssubl v10.8h, v10.8b, v18.8b\n"
+ "smlal v24.4s, v17.4h, v21.4h\n"
+ "smlal v23.4s, v6.4h, v21.4h\n"
+ "ssubl v8.8h, v8.8b, v13.8b\n"
+ "ldr x24, [x5, #0x88]\n"
+ "smlal2 v15.4s, v9.8h, v12.8h\n"
+ "smlal v7.4s, v30.4h, v27.4h\n"
+ "ldr x23, [x5, #0x90]\n"
+ "ldr x22, [x5, #0x98]\n"
+ "smlal2 v5.4s, v9.8h, v21.8h\n"
+ "ldr d9, [x20, x3]\n"
+ "smlal2 v22.4s, v17.8h, v21.8h\n"
+ "ssubl v9.8h, v9.8b, v18.8b\n"
+ "smlal2 v19.4s, v6.8h, v21.8h\n"
+ "ldr d21, [x6, #0x50]\n"
+ "smlal v20.4s, v3.4h, v12.4h\n"
+ "ssubl v21.8h, v21.8b, v13.8b\n"
+ "smlal v24.4s, v6.4h, v12.4h\n"
+ "smlal v23.4s, v29.4h, v12.4h\n"
+ "ldr x21, [x5, #0xa0]\n"
+ "ldr x20, [x5, #0xa8]\n"
+ "smlal2 v15.4s, v30.8h, v27.8h\n"
+ "ldr d30, [x24, x3]\n"
+ "smlal v7.4s, v16.4h, v1.4h\n"
+ "ssubl v30.8h, v30.8b, v18.8b\n"
+ "smlal2 v5.4s, v3.8h, v12.8h\n"
+ "ldr d3, [x6, #0x58]\n"
+ "smlal2 v22.4s, v6.8h, v12.8h\n"
+ "ssubl v3.8h, v3.8b, v13.8b\n"
+ "smlal2 v19.4s, v29.8h, v12.8h\n"
+ "ldr d12, [x23, x3]\n"
+ "smlal v20.4s, v16.4h, v27.4h\n"
+ "ssubl v12.8h, v12.8b, v18.8b\n"
+ "smlal v24.4s, v28.4h, v27.4h\n"
+ "smlal v23.4s, v14.4h, v27.4h\n"
+ "ldr x13, [x5, #0xb0]\n"
+ "ldr x12, [x5, #0xb8]\n"
+ "smlal2 v15.4s, v16.8h, v1.8h\n"
+ "smlal v7.4s, v4.4h, v2.4h\n"
+ "ldr x11, [x5, #0xc0]\n"
+ "ldr x10, [x5, #0xc8]\n"
+ "smlal2 v5.4s, v16.8h, v27.8h\n"
+ "ldr d16, [x22, x3]\n"
+ "smlal2 v22.4s, v28.8h, v27.8h\n"
+ "ssubl v16.8h, v16.8b, v18.8b\n"
+ "smlal2 v19.4s, v14.8h, v27.8h\n"
+ "ldr d27, [x6, #0x60]\n"
+ "smlal v20.4s, v4.4h, v1.4h\n"
+ "ssubl v27.8h, v27.8b, v13.8b\n"
+ "smlal v24.4s, v14.4h, v1.4h\n"
+ "smlal v23.4s, v25.4h, v1.4h\n"
+ "ldr x9, [x5, #0xd0]\n"
+ "ldr x28, [x5, #0xd8]\n"
+ "smlal2 v15.4s, v4.8h, v2.8h\n"
+ "smlal v7.4s, v17.4h, v31.4h\n"
+ "ldr x27, [x5, #0xe0]\n"
+ "ldr x26, [x5, #0xe8]\n"
+ "smlal2 v5.4s, v4.8h, v1.8h\n"
+ "ldr d4, [x21, x3]\n"
+ "smlal2 v22.4s, v14.8h, v1.8h\n"
+ "ssubl v4.8h, v4.8b, v18.8b\n"
+ "smlal2 v19.4s, v25.8h, v1.8h\n"
+ "ldr d1, [x6, #0x68]\n"
+ "smlal v20.4s, v17.4h, v2.4h\n"
+ "ssubl v1.8h, v1.8b, v13.8b\n"
+ "smlal v24.4s, v25.4h, v2.4h\n"
+ "smlal v23.4s, v10.4h, v2.4h\n"
+ "ldr x25, [x5, #0xf0]\n"
+ "ldr x24, [x5, #0xf8]\n"
+ "smlal2 v15.4s, v17.8h, v31.8h\n"
+ "smlal v7.4s, v6.4h, v8.4h\n"
+ "ldr x23, [x5, #0x100]\n"
+ "ldr x22, [x5, #0x108]\n"
+ "smlal2 v5.4s, v17.8h, v2.8h\n"
+ "ldr d17, [x20, x3]\n"
+ "smlal2 v22.4s, v25.8h, v2.8h\n"
+ "ssubl v17.8h, v17.8b, v18.8b\n"
+ "smlal2 v19.4s, v10.8h, v2.8h\n"
+ "ldr d2, [x6, #0x70]\n"
+ "smlal v20.4s, v6.4h, v31.4h\n"
+ "ssubl v2.8h, v2.8b, v13.8b\n"
+ "smlal v24.4s, v10.4h, v31.4h\n"
+ "smlal v23.4s, v9.4h, v31.4h\n"
+ "ldr x21, [x5, #0x110]\n"
+ "ldr x20, [x5, #0x118]\n"
+ "smlal2 v15.4s, v6.8h, v8.8h\n"
+ "smlal v7.4s, v28.4h, v21.4h\n"
+ "tst x1, #0x7\n"
+ "smlal2 v5.4s, v6.8h, v31.8h\n"
+ "ldr d6, [x13, x3]\n"
+ "smlal2 v22.4s, v10.8h, v31.8h\n"
+ "ssubl v6.8h, v6.8b, v18.8b\n"
+ "smlal2 v19.4s, v9.8h, v31.8h\n"
+ "ldr d31, [x6, #0x78]\n"
+ "smlal v20.4s, v29.4h, v8.4h\n"
+ "ssubl v31.8h, v31.8b, v13.8b\n"
+ "smlal v24.4s, v9.4h, v8.4h\n"
+ "smlal v23.4s, v30.4h, v8.4h\n"
+ "smlal2 v15.4s, v28.8h, v21.8h\n"
+ "ldr d28, [x12, x3]\n"
+ "smlal v7.4s, v14.4h, v3.4h\n"
+ "ssubl v28.8h, v28.8b, v18.8b\n"
+ "smlal2 v5.4s, v29.8h, v8.8h\n"
+ "ldr d29, [x6, #0x80]\n"
+ "smlal2 v22.4s, v9.8h, v8.8h\n"
+ "ssubl v29.8h, v29.8b, v13.8b\n"
+ "smlal2 v19.4s, v30.8h, v8.8h\n"
+ "ldr d8, [x11, x3]\n"
+ "smlal v20.4s, v14.4h, v21.4h\n"
+ "ssubl v8.8h, v8.8b, v18.8b\n"
+ "smlal v24.4s, v12.4h, v21.4h\n"
+ "smlal v23.4s, v16.4h, v21.4h\n"
+ "smlal2 v15.4s, v14.8h, v3.8h\n"
+ "smlal v7.4s, v25.4h, v27.4h\n"
+ "smlal2 v5.4s, v14.8h, v21.8h\n"
+ "ldr d14, [x10, x3]\n"
+ "smlal2 v22.4s, v12.8h, v21.8h\n"
+ "ssubl v14.8h, v14.8b, v18.8b\n"
+ "smlal2 v19.4s, v16.8h, v21.8h\n"
+ "ldr d21, [x6, #0x88]\n"
+ "smlal v20.4s, v25.4h, v3.4h\n"
+ "ssubl v21.8h, v21.8b, v13.8b\n"
+ "smlal v24.4s, v16.4h, v3.4h\n"
+ "smlal v23.4s, v4.4h, v3.4h\n"
+ "smlal2 v15.4s, v25.8h, v27.8h\n"
+ "smlal v7.4s, v10.4h, v1.4h\n"
+ "smlal2 v5.4s, v25.8h, v3.8h\n"
+ "ldr d25, [x9, x3]\n"
+ "smlal2 v22.4s, v16.8h, v3.8h\n"
+ "ssubl v25.8h, v25.8b, v18.8b\n"
+ "smlal2 v19.4s, v4.8h, v3.8h\n"
+ "ldr d3, [x6, #0x90]\n"
+ "smlal v20.4s, v10.4h, v27.4h\n"
+ "ssubl v3.8h, v3.8b, v13.8b\n"
+ "smlal v24.4s, v4.4h, v27.4h\n"
+ "smlal v23.4s, v17.4h, v27.4h\n"
+ "smlal2 v15.4s, v10.8h, v1.8h\n"
+ "smlal v7.4s, v9.4h, v2.4h\n"
+ "smlal2 v5.4s, v10.8h, v27.8h\n"
+ "ldr d10, [x28, x3]\n"
+ "smlal2 v22.4s, v4.8h, v27.8h\n"
+ "ssubl v10.8h, v10.8b, v18.8b\n"
+ "smlal2 v19.4s, v17.8h, v27.8h\n"
+ "ldr d27, [x6, #0x98]\n"
+ "smlal v20.4s, v9.4h, v1.4h\n"
+ "ssubl v27.8h, v27.8b, v13.8b\n"
+ "smlal v24.4s, v17.4h, v1.4h\n"
+ "smlal v23.4s, v6.4h, v1.4h\n"
+ "smlal2 v15.4s, v9.8h, v2.8h\n"
+ "smlal v7.4s, v12.4h, v31.4h\n"
+ "smlal2 v5.4s, v9.8h, v1.8h\n"
+ "ldr d9, [x27, x3]\n"
+ "smlal2 v22.4s, v17.8h, v1.8h\n"
+ "ssubl v9.8h, v9.8b, v18.8b\n"
+ "smlal2 v19.4s, v6.8h, v1.8h\n"
+ "ldr d1, [x6, #0xa0]\n"
+ "smlal v20.4s, v30.4h, v2.4h\n"
+ "ssubl v1.8h, v1.8b, v13.8b\n"
+ "smlal v24.4s, v6.4h, v2.4h\n"
+ "smlal v23.4s, v28.4h, v2.4h\n"
+ "smlal2 v15.4s, v12.8h, v31.8h\n"
+ "ldr d12, [x26, x3]\n"
+ "smlal v7.4s, v16.4h, v29.4h\n"
+ "ssubl v12.8h, v12.8b, v18.8b\n"
+ "smlal2 v5.4s, v30.8h, v2.8h\n"
+ "ldr d30, [x6, #0xa8]\n"
+ "smlal2 v22.4s, v6.8h, v2.8h\n"
+ "ssubl v30.8h, v30.8b, v13.8b\n"
+ "smlal2 v19.4s, v28.8h, v2.8h\n"
+ "ldr d2, [x25, x3]\n"
+ "smlal v20.4s, v16.4h, v31.4h\n"
+ "ssubl v2.8h, v2.8b, v18.8b\n"
+ "smlal v24.4s, v8.4h, v31.4h\n"
+ "smlal v23.4s, v14.4h, v31.4h\n"
+ "smlal2 v15.4s, v16.8h, v29.8h\n"
+ "smlal v7.4s, v4.4h, v21.4h\n"
+ "smlal2 v5.4s, v16.8h, v31.8h\n"
+ "ldr d16, [x24, x3]\n"
+ "smlal2 v22.4s, v8.8h, v31.8h\n"
+ "ssubl v16.8h, v16.8b, v18.8b\n"
+ "smlal2 v19.4s, v14.8h, v31.8h\n"
+ "ldr d31, [x6, #0xb0]\n"
+ "smlal v20.4s, v4.4h, v29.4h\n"
+ "ssubl v31.8h, v31.8b, v13.8b\n"
+ "smlal v24.4s, v14.4h, v29.4h\n"
+ "smlal v23.4s, v25.4h, v29.4h\n"
+ "smlal2 v15.4s, v4.8h, v21.8h\n"
+ "smlal v7.4s, v17.4h, v3.4h\n"
+ "smlal2 v5.4s, v4.8h, v29.8h\n"
+ "ldr d4, [x23, x3]\n"
+ "smlal2 v22.4s, v14.8h, v29.8h\n"
+ "ssubl v4.8h, v4.8b, v18.8b\n"
+ "smlal2 v19.4s, v25.8h, v29.8h\n"
+ "ldr d29, [x6, #0xb8]\n"
+ "smlal v20.4s, v17.4h, v21.4h\n"
+ "ssubl v29.8h, v29.8b, v13.8b\n"
+ "smlal v24.4s, v25.4h, v21.4h\n"
+ "smlal v23.4s, v10.4h, v21.4h\n"
+ "smlal2 v15.4s, v17.8h, v3.8h\n"
+ "smlal v7.4s, v6.4h, v27.4h\n"
+ "smlal2 v5.4s, v17.8h, v21.8h\n"
+ "ldr d17, [x22, x3]\n"
+ "smlal2 v22.4s, v25.8h, v21.8h\n"
+ "ssubl v17.8h, v17.8b, v18.8b\n"
+ "smlal2 v19.4s, v10.8h, v21.8h\n"
+ "ldr d21, [x6, #0xc0]\n"
+ "smlal v20.4s, v6.4h, v3.4h\n"
+ "ssubl v21.8h, v21.8b, v13.8b\n"
+ "smlal v24.4s, v10.4h, v3.4h\n"
+ "smlal v23.4s, v9.4h, v3.4h\n"
+ "smlal2 v15.4s, v6.8h, v27.8h\n"
+ "smlal v7.4s, v8.4h, v1.4h\n"
+ "smlal2 v5.4s, v6.8h, v3.8h\n"
+ "ldr d6, [x21, x3]\n"
+ "smlal2 v22.4s, v10.8h, v3.8h\n"
+ "ssubl v6.8h, v6.8b, v18.8b\n"
+ "smlal2 v19.4s, v9.8h, v3.8h\n"
+ "ldr d3, [x20, x3]\n"
+ "smlal v20.4s, v28.4h, v27.4h\n"
+ "ssubl v3.8h, v3.8b, v18.8b\n"
+ "smlal v24.4s, v9.4h, v27.4h\n"
+ "smlal v23.4s, v12.4h, v27.4h\n"
+ "add x3, x3, #0x8\n"
+ "smlal2 v15.4s, v8.8h, v1.8h\n"
+ "ldr q8, [x7, #0x0]\n"
+ "smlal v7.4s, v14.4h, v30.4h\n"
+ "smlal2 v5.4s, v28.8h, v27.8h\n"
+ "ldr q28, [x8, #0x0]\n"
+ "smlal2 v22.4s, v9.8h, v27.8h\n"
+ "smlal2 v19.4s, v12.8h, v27.8h\n"
+ "ldr q27, [x7, #0x10]\n"
+ "smlal v20.4s, v14.4h, v1.4h\n"
+ "add x7, x7, #0x20\n"
+ "smlal v24.4s, v2.4h, v1.4h\n"
+ "smlal v23.4s, v16.4h, v1.4h\n"
+ "smlal2 v15.4s, v14.8h, v30.8h\n"
+ "smlal v7.4s, v25.4h, v31.4h\n"
+ "smlal2 v5.4s, v14.8h, v1.8h\n"
+ "ldr q14, [x8, #0x10]\n"
+ "smlal2 v22.4s, v2.8h, v1.8h\n"
+ "add x8, x8, #0x20\n"
+ "smlal2 v19.4s, v16.8h, v1.8h\n"
+ "smlal v20.4s, v25.4h, v30.4h\n"
+ "smlal v24.4s, v16.4h, v30.4h\n"
+ "smlal v23.4s, v4.4h, v30.4h\n"
+ "smlal2 v15.4s, v25.8h, v31.8h\n"
+ "smlal v7.4s, v10.4h, v29.4h\n"
+ "smlal2 v5.4s, v25.8h, v30.8h\n"
+ "smlal2 v22.4s, v16.8h, v30.8h\n"
+ "smlal2 v19.4s, v4.8h, v30.8h\n"
+ "smlal v20.4s, v10.4h, v31.4h\n"
+ "smlal v24.4s, v4.4h, v31.4h\n"
+ "smlal v23.4s, v17.4h, v31.4h\n"
+ "smlal2 v15.4s, v10.8h, v29.8h\n"
+ "smlal v7.4s, v9.4h, v21.4h\n"
+ "sqrdmulh v7.4s, v7.4s, v8.4s\n"
+ "smlal2 v5.4s, v10.8h, v31.8h\n"
+ "smlal2 v22.4s, v4.8h, v31.8h\n"
+ "and v4.16b, v7.16b, v28.16b\n"
+ "smlal2 v19.4s, v17.8h, v31.8h\n"
+ "smlal v20.4s, v9.4h, v29.4h\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "smlal v24.4s, v17.4h, v29.4h\n"
+ "smlal v23.4s, v6.4h, v29.4h\n"
+ "sqadd v7.4s, v7.4s, v4.4s\n"
+ "smlal2 v15.4s, v9.8h, v21.8h\n"
+ "smlal2 v5.4s, v9.8h, v29.8h\n"
+ "sqrdmulh v15.4s, v15.4s, v27.4s\n"
+ "smlal2 v22.4s, v17.8h, v29.8h\n"
+ "smlal2 v19.4s, v6.8h, v29.8h\n"
+ "and v30.16b, v15.16b, v14.16b\n"
+ "smlal v20.4s, v12.4h, v21.4h\n"
+ "smlal v24.4s, v6.4h, v21.4h\n"
+ "sqrdmulh v20.4s, v20.4s, v8.4s\n"
+ "smlal v23.4s, v3.4h, v21.4h\n"
+ "smlal2 v5.4s, v12.8h, v21.8h\n"
+ "sqrdmulh v24.4s, v24.4s, v8.4s\n"
+ "smlal2 v22.4s, v6.8h, v21.8h\n"
+ "smlal2 v19.4s, v3.8h, v21.8h\n"
+ "sqrdmulh v23.4s, v23.4s, v8.4s\n"
+ "sshr v30.4s, v30.4s, #0x1f\n"
+ "and v3.16b, v20.16b, v28.16b\n"
+ "sqrdmulh v5.4s, v5.4s, v27.4s\n"
+ "and v25.16b, v24.16b, v28.16b\n"
+ "sqrdmulh v22.4s, v22.4s, v27.4s\n"
+ "and v16.16b, v23.16b, v28.16b\n"
+ "sqrdmulh v19.4s, v19.4s, v27.4s\n"
+ "sqadd v15.4s, v15.4s, v30.4s\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "and v4.16b, v5.16b, v14.16b\n"
+ "sshr v25.4s, v25.4s, #0x1f\n"
+ "and v10.16b, v22.16b, v14.16b\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "and v12.16b, v19.16b, v14.16b\n"
+ "sqadd v20.4s, v20.4s, v3.4s\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sqadd v24.4s, v24.4s, v25.4s\n"
+ "sshr v10.4s, v10.4s, #0x1f\n"
+ "sqadd v23.4s, v23.4s, v16.4s\n"
+ "sshr v12.4s, v12.4s, #0x1f\n"
+ "srshl v7.4s, v7.4s, v28.4s\n"
+ "srshl v20.4s, v20.4s, v28.4s\n"
+ "sqadd v5.4s, v5.4s, v4.4s\n"
+ "srshl v24.4s, v24.4s, v28.4s\n"
+ "sqadd v22.4s, v22.4s, v10.4s\n"
+ "srshl v23.4s, v23.4s, v28.4s\n"
+ "sqadd v19.4s, v19.4s, v12.4s\n"
+ "srshl v15.4s, v15.4s, v14.4s\n"
+ "sqxtn v7.4h, v7.4s\n"
+ "srshl v5.4s, v5.4s, v14.4s\n"
+ "sqxtn v20.4h, v20.4s\n"
+ "srshl v22.4s, v22.4s, v14.4s\n"
+ "sqxtn v24.4h, v24.4s\n"
+ "srshl v19.4s, v19.4s, v14.4s\n"
+ "sqxtn v23.4h, v23.4s\n"
+ "sqxtn2 v7.8h, v15.4s\n"
+ "sqxtn2 v20.8h, v5.4s\n"
+ "sqxtn2 v24.8h, v22.4s\n"
+ "sqxtn2 v23.8h, v19.4s\n"
+ "sqadd v7.8h, v7.8h, v26.8h\n"
+ "sqadd v20.8h, v20.8h, v26.8h\n"
+ "sqadd v24.8h, v24.8h, v26.8h\n"
+ "sqadd v23.8h, v23.8h, v26.8h\n"
+ "smax v7.8h, v7.8h, v11.8h\n"
+ "smax v20.8h, v20.8h, v11.8h\n"
+ "smax v24.8h, v24.8h, v11.8h\n"
+ "smax v23.8h, v23.8h, v11.8h\n"
+ "smin v7.8h, v7.8h, v0.8h\n"
+ "smin v20.8h, v20.8h, v0.8h\n"
+ "smin v24.8h, v24.8h, v0.8h\n"
+ "smin v23.8h, v23.8h, v0.8h\n"
+ "uzp1 v7.16b, v7.16b, v7.16b\n"
+ "str d7, [x17, x4]\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "str d20, [x16, x4]\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "str d24, [x15, x4]\n"
+ "str d23, [x14, x4]\n"
+ "add x4, x4, #0x8\n"
+ "beq 124f\n"
+ "add x6, x6, #0xc8\n"
+ "3:" // Oddments
+ "ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "tbz x1, #2, 5f\n"
+ "ld1 { v7.4s }, [x20], #0x10\n"
+ "tbz x1, #1, 4f\n"
+ "ld1 { v15.d }[0], [x20], #0x8\n"
+ "tbz x1, #0, 7f\n"
+ "ld1 { v15.s }[2], [x20]\n"
+ "b 7f\n"
+ "4:" // Oddments: Load bias: Bit 2: Bit 1: Unset
+ "tbz x1, #0, 7f\n"
+ "ld1 { v15.s }[0], [x20]\n"
+ "b 7f\n"
+ "5:" // Oddments: Load bias: Bit 2: Unset
+ "tbz x1, #1, 6f\n"
+ "ld1 { v7.d }[0], [x20], #0x8\n"
+ "tbz x1, #0, 7f\n"
+ "ld1 { v7.s }[2], [x20]\n"
+ "b 7f\n"
+ "6:" // Oddments: Load bias: Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 7f\n"
+ "ld1 { v7.s }[0], [x20]\n"
+ "7:" // Oddments: Load bias: Bit 2: End
+ "ldr d6, [x6, #0x0]\n"
+ "ldr d14, [x6, #0x8]\n"
+ "mov v20.16b, v7.16b\n"
+ "mov v5.16b, v15.16b\n"
+ "ldr d10, [x6, #0x10]\n"
+ "ldr d21, [x6, #0x18]\n"
+ "mov v24.16b, v7.16b\n"
+ "mov v22.16b, v15.16b\n"
+ "ldr d12, [x6, #0x20]\n"
+ "ldp x9, x28, [x5, #0x0]\n"
+ "mov v23.16b, v7.16b\n"
+ "mov v19.16b, v15.16b\n"
+ "ldp x27, x26, [x5, #0x10]\n"
+ "ldp x25, x24, [x5, #0x20]\n"
+ "ssubl v6.8h, v6.8b, v13.8b\n"
+ "ssubl v14.8h, v14.8b, v13.8b\n"
+ "ldp x23, x22, [x5, #0x30]\n"
+ "ldp x21, x20, [x5, #0x40]\n"
+ "ssubl v10.8h, v10.8b, v13.8b\n"
+ "ssubl v21.8h, v21.8b, v13.8b\n"
+ "ssubl v12.8h, v12.8b, v13.8b\n"
+ "add x9, x9, x3\n"
+ "add x28, x28, x3\n"
+ "add x27, x27, x3\n"
+ "add x26, x26, x3\n"
+ "add x25, x25, x3\n"
+ "add x24, x24, x3\n"
+ "add x23, x23, x3\n"
+ "add x22, x22, x3\n"
+ "add x21, x21, x3\n"
+ "add x20, x20, x3\n"
+ "tbz x1, #2, 9f\n"
+ "ld1 { v31.s }[0], [x9], #0x4\n"
+ "ld1 { v17.s }[0], [x28], #0x4\n"
+ "ld1 { v30.s }[0], [x27], #0x4\n"
+ "ld1 { v16.s }[0], [x26], #0x4\n"
+ "ld1 { v3.s }[0], [x25], #0x4\n"
+ "ld1 { v4.s }[0], [x24], #0x4\n"
+ "ld1 { v25.s }[0], [x23], #0x4\n"
+ "ld1 { v9.s }[0], [x22], #0x4\n"
+ "ld1 { v29.s }[0], [x21], #0x4\n"
+ "ld1 { v28.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 8f\n"
+ "ld1 { v31.h }[2], [x9], #0x2\n"
+ "ld1 { v17.h }[2], [x28], #0x2\n"
+ "ld1 { v30.h }[2], [x27], #0x2\n"
+ "ld1 { v16.h }[2], [x26], #0x2\n"
+ "ld1 { v3.h }[2], [x25], #0x2\n"
+ "ld1 { v4.h }[2], [x24], #0x2\n"
+ "ld1 { v25.h }[2], [x23], #0x2\n"
+ "ld1 { v9.h }[2], [x22], #0x2\n"
+ "ld1 { v29.h }[2], [x21], #0x2\n"
+ "ld1 { v28.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 11f\n"
+ "ld1 { v31.b }[6], [x9]\n"
+ "ld1 { v17.b }[6], [x28]\n"
+ "ld1 { v30.b }[6], [x27]\n"
+ "ld1 { v16.b }[6], [x26]\n"
+ "ld1 { v3.b }[6], [x25]\n"
+ "ld1 { v4.b }[6], [x24]\n"
+ "ld1 { v25.b }[6], [x23]\n"
+ "ld1 { v9.b }[6], [x22]\n"
+ "ld1 { v29.b }[6], [x21]\n"
+ "ld1 { v28.b }[6], [x20]\n"
+ "b 11f\n"
+ "8:" // Oddments: Initial loads: Bit 2: Bit 1: Unset
+ "tbz x1, #0, 11f\n"
+ "ld1 { v31.b }[4], [x9]\n"
+ "ld1 { v17.b }[4], [x28]\n"
+ "ld1 { v30.b }[4], [x27]\n"
+ "ld1 { v16.b }[4], [x26]\n"
+ "ld1 { v3.b }[4], [x25]\n"
+ "ld1 { v4.b }[4], [x24]\n"
+ "ld1 { v25.b }[4], [x23]\n"
+ "ld1 { v9.b }[4], [x22]\n"
+ "ld1 { v29.b }[4], [x21]\n"
+ "ld1 { v28.b }[4], [x20]\n"
+ "b 11f\n"
+ "9:" // Oddments: Initial loads: Bit 2: Unset
+ "tbz x1, #1, 10f\n"
+ "ld1 { v31.h }[0], [x9], #0x2\n"
+ "ld1 { v17.h }[0], [x28], #0x2\n"
+ "ld1 { v30.h }[0], [x27], #0x2\n"
+ "ld1 { v16.h }[0], [x26], #0x2\n"
+ "ld1 { v3.h }[0], [x25], #0x2\n"
+ "ld1 { v4.h }[0], [x24], #0x2\n"
+ "ld1 { v25.h }[0], [x23], #0x2\n"
+ "ld1 { v9.h }[0], [x22], #0x2\n"
+ "ld1 { v29.h }[0], [x21], #0x2\n"
+ "ld1 { v28.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 11f\n"
+ "ld1 { v31.b }[2], [x9]\n"
+ "ld1 { v17.b }[2], [x28]\n"
+ "ld1 { v30.b }[2], [x27]\n"
+ "ld1 { v16.b }[2], [x26]\n"
+ "ld1 { v3.b }[2], [x25]\n"
+ "ld1 { v4.b }[2], [x24]\n"
+ "ld1 { v25.b }[2], [x23]\n"
+ "ld1 { v9.b }[2], [x22]\n"
+ "ld1 { v29.b }[2], [x21]\n"
+ "ld1 { v28.b }[2], [x20]\n"
+ "b 11f\n"
+ "10:" // Oddments: Initial loads: Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 11f\n"
+ "ld1 { v31.b }[0], [x9]\n"
+ "ld1 { v17.b }[0], [x28]\n"
+ "ld1 { v30.b }[0], [x27]\n"
+ "ld1 { v16.b }[0], [x26]\n"
+ "ld1 { v3.b }[0], [x25]\n"
+ "ld1 { v4.b }[0], [x24]\n"
+ "ld1 { v25.b }[0], [x23]\n"
+ "ld1 { v9.b }[0], [x22]\n"
+ "ld1 { v29.b }[0], [x21]\n"
+ "ld1 { v28.b }[0], [x20]\n"
+ "11:" // Oddments: Initial loads: Bit 2: End
+ "ssubl v31.8h, v31.8b, v18.8b\n"
+ "ssubl v17.8h, v17.8b, v18.8b\n"
+ "smlal v7.4s, v31.4h, v6.4h\n"
+ "ldr x20, [x5, #0x50]\n"
+ "ssubl v30.8h, v30.8b, v18.8b\n"
+ "smlal2 v15.4s, v31.8h, v6.8h\n"
+ "smlal v20.4s, v17.4h, v6.4h\n"
+ "smlal2 v5.4s, v17.8h, v6.8h\n"
+ "smlal v24.4s, v30.4h, v6.4h\n"
+ "ssubl v16.8h, v16.8b, v18.8b\n"
+ "add x20, x20, x3\n"
+ "smlal2 v22.4s, v30.8h, v6.8h\n"
+ "ssubl v3.8h, v3.8b, v18.8b\n"
+ "smlal v23.4s, v16.4h, v6.4h\n"
+ "smlal2 v19.4s, v16.8h, v6.8h\n"
+ "smlal v7.4s, v17.4h, v14.4h\n"
+ "ssubl v4.8h, v4.8b, v18.8b\n"
+ "smlal2 v15.4s, v17.8h, v14.8h\n"
+ "smlal v20.4s, v3.4h, v14.4h\n"
+ "ssubl v25.8h, v25.8b, v18.8b\n"
+ "smlal2 v5.4s, v3.8h, v14.8h\n"
+ "smlal v24.4s, v16.4h, v14.4h\n"
+ "ssubl v9.8h, v9.8b, v18.8b\n"
+ "smlal2 v22.4s, v16.8h, v14.8h\n"
+ "ssubl v29.8h, v29.8b, v18.8b\n"
+ "smlal v23.4s, v4.4h, v14.4h\n"
+ "ssubl v28.8h, v28.8b, v18.8b\n"
+ "smlal2 v19.4s, v4.8h, v14.8h\n"
+ "smlal v7.4s, v3.4h, v10.4h\n"
+ "smlal2 v15.4s, v3.8h, v10.8h\n"
+ "smlal v20.4s, v25.4h, v10.4h\n"
+ "smlal2 v5.4s, v25.8h, v10.8h\n"
+ "smlal v24.4s, v4.4h, v10.4h\n"
+ "smlal2 v22.4s, v4.8h, v10.8h\n"
+ "tbz x1, #2, 13f\n"
+ "ld1 { v27.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 12f\n"
+ "ld1 { v27.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 15f\n"
+ "ld1 { v27.b }[6], [x20]\n"
+ "b 15f\n"
+ "12:" // Oddments: Load (1, 3): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 15f\n"
+ "ld1 { v27.b }[4], [x20]\n"
+ "b 15f\n"
+ "13:" // Oddments: Load (1, 3): Bit 2: Unset
+ "tbz x1, #1, 14f\n"
+ "ld1 { v27.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 15f\n"
+ "ld1 { v27.b }[2], [x20]\n"
+ "b 15f\n"
+ "14:" // Oddments: Load (1, 3): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 15f\n"
+ "ld1 { v27.b }[0], [x20]\n"
+ "15:" // Oddments: Load (1, 3): Bit 2: End
+ "ssubl v27.8h, v27.8b, v18.8b\n"
+ "ldr x20, [x5, #0x58]\n"
+ "smlal v23.4s, v27.4h, v10.4h\n"
+ "smlal2 v19.4s, v27.8h, v10.8h\n"
+ "smlal v7.4s, v25.4h, v21.4h\n"
+ "smlal2 v15.4s, v25.8h, v21.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v20.4s, v9.4h, v21.4h\n"
+ "smlal2 v5.4s, v9.8h, v21.8h\n"
+ "smlal v24.4s, v27.4h, v21.4h\n"
+ "smlal2 v22.4s, v27.8h, v21.8h\n"
+ "tbz x1, #2, 17f\n"
+ "ld1 { v6.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 16f\n"
+ "ld1 { v6.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 19f\n"
+ "ld1 { v6.b }[6], [x20]\n"
+ "b 19f\n"
+ "16:" // Oddments: Load (1, 4): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 19f\n"
+ "ld1 { v6.b }[4], [x20]\n"
+ "b 19f\n"
+ "17:" // Oddments: Load (1, 4): Bit 2: Unset
+ "tbz x1, #1, 18f\n"
+ "ld1 { v6.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 19f\n"
+ "ld1 { v6.b }[2], [x20]\n"
+ "b 19f\n"
+ "18:" // Oddments: Load (1, 4): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 19f\n"
+ "ld1 { v6.b }[0], [x20]\n"
+ "19:" // Oddments: Load (1, 4): Bit 2: End
+ "ssubl v6.8h, v6.8b, v18.8b\n"
+ "ldr x20, [x5, #0x60]\n"
+ "smlal v23.4s, v6.4h, v21.4h\n"
+ "smlal2 v19.4s, v6.8h, v21.8h\n"
+ "smlal v7.4s, v9.4h, v12.4h\n"
+ "smlal2 v15.4s, v9.8h, v12.8h\n"
+ "add x20, x20, x3\n"
+ "tbz x1, #2, 21f\n"
+ "ld1 { v9.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 20f\n"
+ "ld1 { v9.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 23f\n"
+ "ld1 { v9.b }[6], [x20]\n"
+ "b 23f\n"
+ "20:" // Oddments: Load (0, 5): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 23f\n"
+ "ld1 { v9.b }[4], [x20]\n"
+ "b 23f\n"
+ "21:" // Oddments: Load (0, 5): Bit 2: Unset
+ "tbz x1, #1, 22f\n"
+ "ld1 { v9.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 23f\n"
+ "ld1 { v9.b }[2], [x20]\n"
+ "b 23f\n"
+ "22:" // Oddments: Load (0, 5): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 23f\n"
+ "ld1 { v9.b }[0], [x20]\n"
+ "23:" // Oddments: Load (0, 5): Bit 2: End
+ "ldr d14, [x6, #0x28]\n"
+ "ssubl v9.8h, v9.8b, v18.8b\n"
+ "smlal v20.4s, v9.4h, v12.4h\n"
+ "smlal2 v5.4s, v9.8h, v12.8h\n"
+ "smlal v24.4s, v6.4h, v12.4h\n"
+ "smlal2 v22.4s, v6.8h, v12.8h\n"
+ "ssubl v14.8h, v14.8b, v13.8b\n"
+ "ldr x20, [x5, #0x68]\n"
+ "smlal v23.4s, v29.4h, v12.4h\n"
+ "smlal2 v19.4s, v29.8h, v12.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v30.4h, v14.4h\n"
+ "smlal2 v15.4s, v30.8h, v14.8h\n"
+ "smlal v20.4s, v16.4h, v14.4h\n"
+ "smlal2 v5.4s, v16.8h, v14.8h\n"
+ "smlal v24.4s, v28.4h, v14.4h\n"
+ "smlal2 v22.4s, v28.8h, v14.8h\n"
+ "tbz x1, #2, 25f\n"
+ "ld1 { v25.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 24f\n"
+ "ld1 { v25.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 27f\n"
+ "ld1 { v25.b }[6], [x20]\n"
+ "b 27f\n"
+ "24:" // Oddments: Load (2, 1): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 27f\n"
+ "ld1 { v25.b }[4], [x20]\n"
+ "b 27f\n"
+ "25:" // Oddments: Load (2, 1): Bit 2: Unset
+ "tbz x1, #1, 26f\n"
+ "ld1 { v25.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 27f\n"
+ "ld1 { v25.b }[2], [x20]\n"
+ "b 27f\n"
+ "26:" // Oddments: Load (2, 1): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 27f\n"
+ "ld1 { v25.b }[0], [x20]\n"
+ "27:" // Oddments: Load (2, 1): Bit 2: End
+ "ldr d21, [x6, #0x30]\n"
+ "ssubl v25.8h, v25.8b, v18.8b\n"
+ "ssubl v21.8h, v21.8b, v13.8b\n"
+ "ldr x20, [x5, #0x70]\n"
+ "smlal v23.4s, v25.4h, v14.4h\n"
+ "smlal2 v19.4s, v25.8h, v14.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v16.4h, v21.4h\n"
+ "smlal2 v15.4s, v16.8h, v21.8h\n"
+ "smlal v20.4s, v4.4h, v21.4h\n"
+ "smlal2 v5.4s, v4.8h, v21.8h\n"
+ "smlal v24.4s, v25.4h, v21.4h\n"
+ "smlal2 v22.4s, v25.8h, v21.8h\n"
+ "tbz x1, #2, 29f\n"
+ "ld1 { v10.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 28f\n"
+ "ld1 { v10.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 31f\n"
+ "ld1 { v10.b }[6], [x20]\n"
+ "b 31f\n"
+ "28:" // Oddments: Load (2, 2): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 31f\n"
+ "ld1 { v10.b }[4], [x20]\n"
+ "b 31f\n"
+ "29:" // Oddments: Load (2, 2): Bit 2: Unset
+ "tbz x1, #1, 30f\n"
+ "ld1 { v10.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 31f\n"
+ "ld1 { v10.b }[2], [x20]\n"
+ "b 31f\n"
+ "30:" // Oddments: Load (2, 2): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 31f\n"
+ "ld1 { v10.b }[0], [x20]\n"
+ "31:" // Oddments: Load (2, 2): Bit 2: End
+ "ldr d9, [x6, #0x38]\n"
+ "ssubl v10.8h, v10.8b, v18.8b\n"
+ "ssubl v9.8h, v9.8b, v13.8b\n"
+ "ldr x20, [x5, #0x78]\n"
+ "smlal v23.4s, v10.4h, v21.4h\n"
+ "smlal2 v19.4s, v10.8h, v21.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v4.4h, v9.4h\n"
+ "smlal2 v15.4s, v4.8h, v9.8h\n"
+ "smlal v20.4s, v27.4h, v9.4h\n"
+ "smlal2 v5.4s, v27.8h, v9.8h\n"
+ "smlal v24.4s, v10.4h, v9.4h\n"
+ "smlal2 v22.4s, v10.8h, v9.8h\n"
+ "tbz x1, #2, 33f\n"
+ "ld1 { v12.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 32f\n"
+ "ld1 { v12.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 35f\n"
+ "ld1 { v12.b }[6], [x20]\n"
+ "b 35f\n"
+ "32:" // Oddments: Load (2, 3): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 35f\n"
+ "ld1 { v12.b }[4], [x20]\n"
+ "b 35f\n"
+ "33:" // Oddments: Load (2, 3): Bit 2: Unset
+ "tbz x1, #1, 34f\n"
+ "ld1 { v12.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 35f\n"
+ "ld1 { v12.b }[2], [x20]\n"
+ "b 35f\n"
+ "34:" // Oddments: Load (2, 3): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 35f\n"
+ "ld1 { v12.b }[0], [x20]\n"
+ "35:" // Oddments: Load (2, 3): Bit 2: End
+ "ldr d31, [x6, #0x40]\n"
+ "ssubl v12.8h, v12.8b, v18.8b\n"
+ "ssubl v31.8h, v31.8b, v13.8b\n"
+ "ldr x20, [x5, #0x80]\n"
+ "smlal v23.4s, v12.4h, v9.4h\n"
+ "smlal2 v19.4s, v12.8h, v9.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v27.4h, v31.4h\n"
+ "smlal2 v15.4s, v27.8h, v31.8h\n"
+ "smlal v20.4s, v6.4h, v31.4h\n"
+ "smlal2 v5.4s, v6.8h, v31.8h\n"
+ "smlal v24.4s, v12.4h, v31.4h\n"
+ "smlal2 v22.4s, v12.8h, v31.8h\n"
+ "tbz x1, #2, 37f\n"
+ "ld1 { v8.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 36f\n"
+ "ld1 { v8.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 39f\n"
+ "ld1 { v8.b }[6], [x20]\n"
+ "b 39f\n"
+ "36:" // Oddments: Load (2, 4): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 39f\n"
+ "ld1 { v8.b }[4], [x20]\n"
+ "b 39f\n"
+ "37:" // Oddments: Load (2, 4): Bit 2: Unset
+ "tbz x1, #1, 38f\n"
+ "ld1 { v8.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 39f\n"
+ "ld1 { v8.b }[2], [x20]\n"
+ "b 39f\n"
+ "38:" // Oddments: Load (2, 4): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 39f\n"
+ "ld1 { v8.b }[0], [x20]\n"
+ "39:" // Oddments: Load (2, 4): Bit 2: End
+ "ldr d16, [x6, #0x48]\n"
+ "ssubl v8.8h, v8.8b, v18.8b\n"
+ "ssubl v16.8h, v16.8b, v13.8b\n"
+ "ldr x20, [x5, #0x88]\n"
+ "smlal v23.4s, v8.4h, v31.4h\n"
+ "smlal2 v19.4s, v8.8h, v31.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v6.4h, v16.4h\n"
+ "smlal2 v15.4s, v6.8h, v16.8h\n"
+ "smlal v20.4s, v29.4h, v16.4h\n"
+ "smlal2 v5.4s, v29.8h, v16.8h\n"
+ "smlal v24.4s, v8.4h, v16.4h\n"
+ "smlal2 v22.4s, v8.8h, v16.8h\n"
+ "tbz x1, #2, 41f\n"
+ "ld1 { v27.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 40f\n"
+ "ld1 { v27.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 43f\n"
+ "ld1 { v27.b }[6], [x20]\n"
+ "b 43f\n"
+ "40:" // Oddments: Load (2, 5): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 43f\n"
+ "ld1 { v27.b }[4], [x20]\n"
+ "b 43f\n"
+ "41:" // Oddments: Load (2, 5): Bit 2: Unset
+ "tbz x1, #1, 42f\n"
+ "ld1 { v27.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 43f\n"
+ "ld1 { v27.b }[2], [x20]\n"
+ "b 43f\n"
+ "42:" // Oddments: Load (2, 5): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 43f\n"
+ "ld1 { v27.b }[0], [x20]\n"
+ "43:" // Oddments: Load (2, 5): Bit 2: End
+ "ldr d21, [x6, #0x50]\n"
+ "ssubl v27.8h, v27.8b, v18.8b\n"
+ "ssubl v21.8h, v21.8b, v13.8b\n"
+ "ldr x20, [x5, #0x90]\n"
+ "smlal v23.4s, v27.4h, v16.4h\n"
+ "smlal2 v19.4s, v27.8h, v16.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v28.4h, v21.4h\n"
+ "smlal2 v15.4s, v28.8h, v21.8h\n"
+ "smlal v20.4s, v25.4h, v21.4h\n"
+ "smlal2 v5.4s, v25.8h, v21.8h\n"
+ "tbz x1, #2, 45f\n"
+ "ld1 { v31.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 44f\n"
+ "ld1 { v31.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 47f\n"
+ "ld1 { v31.b }[6], [x20]\n"
+ "b 47f\n"
+ "44:" // Oddments: Load (3, 0): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 47f\n"
+ "ld1 { v31.b }[4], [x20]\n"
+ "b 47f\n"
+ "45:" // Oddments: Load (3, 0): Bit 2: Unset
+ "tbz x1, #1, 46f\n"
+ "ld1 { v31.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 47f\n"
+ "ld1 { v31.b }[2], [x20]\n"
+ "b 47f\n"
+ "46:" // Oddments: Load (3, 0): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 47f\n"
+ "ld1 { v31.b }[0], [x20]\n"
+ "47:" // Oddments: Load (3, 0): Bit 2: End
+ "ssubl v31.8h, v31.8b, v18.8b\n"
+ "ldr x20, [x5, #0x98]\n"
+ "smlal v24.4s, v31.4h, v21.4h\n"
+ "smlal2 v22.4s, v31.8h, v21.8h\n"
+ "add x20, x20, x3\n"
+ "tbz x1, #2, 49f\n"
+ "ld1 { v28.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 48f\n"
+ "ld1 { v28.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 51f\n"
+ "ld1 { v28.b }[6], [x20]\n"
+ "b 51f\n"
+ "48:" // Oddments: Load (3, 1): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 51f\n"
+ "ld1 { v28.b }[4], [x20]\n"
+ "b 51f\n"
+ "49:" // Oddments: Load (3, 1): Bit 2: Unset
+ "tbz x1, #1, 50f\n"
+ "ld1 { v28.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 51f\n"
+ "ld1 { v28.b }[2], [x20]\n"
+ "b 51f\n"
+ "50:" // Oddments: Load (3, 1): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 51f\n"
+ "ld1 { v28.b }[0], [x20]\n"
+ "51:" // Oddments: Load (3, 1): Bit 2: End
+ "ldr d2, [x6, #0x58]\n"
+ "ssubl v28.8h, v28.8b, v18.8b\n"
+ "ssubl v2.8h, v2.8b, v13.8b\n"
+ "ldr x20, [x5, #0xa0]\n"
+ "smlal v23.4s, v28.4h, v21.4h\n"
+ "smlal2 v19.4s, v28.8h, v21.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v25.4h, v2.4h\n"
+ "smlal2 v15.4s, v25.8h, v2.8h\n"
+ "smlal v20.4s, v10.4h, v2.4h\n"
+ "smlal2 v5.4s, v10.8h, v2.8h\n"
+ "smlal v24.4s, v28.4h, v2.4h\n"
+ "smlal2 v22.4s, v28.8h, v2.8h\n"
+ "tbz x1, #2, 53f\n"
+ "ld1 { v21.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 52f\n"
+ "ld1 { v21.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 55f\n"
+ "ld1 { v21.b }[6], [x20]\n"
+ "b 55f\n"
+ "52:" // Oddments: Load (3, 2): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 55f\n"
+ "ld1 { v21.b }[4], [x20]\n"
+ "b 55f\n"
+ "53:" // Oddments: Load (3, 2): Bit 2: Unset
+ "tbz x1, #1, 54f\n"
+ "ld1 { v21.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 55f\n"
+ "ld1 { v21.b }[2], [x20]\n"
+ "b 55f\n"
+ "54:" // Oddments: Load (3, 2): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 55f\n"
+ "ld1 { v21.b }[0], [x20]\n"
+ "55:" // Oddments: Load (3, 2): Bit 2: End
+ "ldr d25, [x6, #0x60]\n"
+ "ssubl v21.8h, v21.8b, v18.8b\n"
+ "ssubl v25.8h, v25.8b, v13.8b\n"
+ "ldr x20, [x5, #0xa8]\n"
+ "smlal v23.4s, v21.4h, v2.4h\n"
+ "smlal2 v19.4s, v21.8h, v2.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v10.4h, v25.4h\n"
+ "smlal2 v15.4s, v10.8h, v25.8h\n"
+ "smlal v20.4s, v12.4h, v25.4h\n"
+ "smlal2 v5.4s, v12.8h, v25.8h\n"
+ "smlal v24.4s, v21.4h, v25.4h\n"
+ "smlal2 v22.4s, v21.8h, v25.8h\n"
+ "tbz x1, #2, 57f\n"
+ "ld1 { v9.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 56f\n"
+ "ld1 { v9.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 59f\n"
+ "ld1 { v9.b }[6], [x20]\n"
+ "b 59f\n"
+ "56:" // Oddments: Load (3, 3): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 59f\n"
+ "ld1 { v9.b }[4], [x20]\n"
+ "b 59f\n"
+ "57:" // Oddments: Load (3, 3): Bit 2: Unset
+ "tbz x1, #1, 58f\n"
+ "ld1 { v9.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 59f\n"
+ "ld1 { v9.b }[2], [x20]\n"
+ "b 59f\n"
+ "58:" // Oddments: Load (3, 3): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 59f\n"
+ "ld1 { v9.b }[0], [x20]\n"
+ "59:" // Oddments: Load (3, 3): Bit 2: End
+ "ldr d1, [x6, #0x68]\n"
+ "ssubl v9.8h, v9.8b, v18.8b\n"
+ "ssubl v1.8h, v1.8b, v13.8b\n"
+ "ldr x20, [x5, #0xb0]\n"
+ "smlal v23.4s, v9.4h, v25.4h\n"
+ "smlal2 v19.4s, v9.8h, v25.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v12.4h, v1.4h\n"
+ "smlal2 v15.4s, v12.8h, v1.8h\n"
+ "smlal v20.4s, v8.4h, v1.4h\n"
+ "smlal2 v5.4s, v8.8h, v1.8h\n"
+ "smlal v24.4s, v9.4h, v1.4h\n"
+ "smlal2 v22.4s, v9.8h, v1.8h\n"
+ "tbz x1, #2, 61f\n"
+ "ld1 { v3.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 60f\n"
+ "ld1 { v3.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 63f\n"
+ "ld1 { v3.b }[6], [x20]\n"
+ "b 63f\n"
+ "60:" // Oddments: Load (3, 4): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 63f\n"
+ "ld1 { v3.b }[4], [x20]\n"
+ "b 63f\n"
+ "61:" // Oddments: Load (3, 4): Bit 2: Unset
+ "tbz x1, #1, 62f\n"
+ "ld1 { v3.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 63f\n"
+ "ld1 { v3.b }[2], [x20]\n"
+ "b 63f\n"
+ "62:" // Oddments: Load (3, 4): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 63f\n"
+ "ld1 { v3.b }[0], [x20]\n"
+ "63:" // Oddments: Load (3, 4): Bit 2: End
+ "ldr d16, [x6, #0x70]\n"
+ "ssubl v3.8h, v3.8b, v18.8b\n"
+ "ssubl v16.8h, v16.8b, v13.8b\n"
+ "ldr x20, [x5, #0xb8]\n"
+ "smlal v23.4s, v3.4h, v1.4h\n"
+ "smlal2 v19.4s, v3.8h, v1.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v8.4h, v16.4h\n"
+ "smlal2 v15.4s, v8.8h, v16.8h\n"
+ "smlal v20.4s, v27.4h, v16.4h\n"
+ "smlal2 v5.4s, v27.8h, v16.8h\n"
+ "smlal v24.4s, v3.4h, v16.4h\n"
+ "smlal2 v22.4s, v3.8h, v16.8h\n"
+ "tbz x1, #2, 65f\n"
+ "ld1 { v14.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 64f\n"
+ "ld1 { v14.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 67f\n"
+ "ld1 { v14.b }[6], [x20]\n"
+ "b 67f\n"
+ "64:" // Oddments: Load (3, 5): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 67f\n"
+ "ld1 { v14.b }[4], [x20]\n"
+ "b 67f\n"
+ "65:" // Oddments: Load (3, 5): Bit 2: Unset
+ "tbz x1, #1, 66f\n"
+ "ld1 { v14.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 67f\n"
+ "ld1 { v14.b }[2], [x20]\n"
+ "b 67f\n"
+ "66:" // Oddments: Load (3, 5): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 67f\n"
+ "ld1 { v14.b }[0], [x20]\n"
+ "67:" // Oddments: Load (3, 5): Bit 2: End
+ "ldr d17, [x6, #0x78]\n"
+ "ssubl v14.8h, v14.8b, v18.8b\n"
+ "ssubl v17.8h, v17.8b, v13.8b\n"
+ "ldr x20, [x5, #0xc0]\n"
+ "smlal v23.4s, v14.4h, v16.4h\n"
+ "smlal2 v19.4s, v14.8h, v16.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v31.4h, v17.4h\n"
+ "smlal2 v15.4s, v31.8h, v17.8h\n"
+ "smlal v20.4s, v28.4h, v17.4h\n"
+ "smlal2 v5.4s, v28.8h, v17.8h\n"
+ "tbz x1, #2, 69f\n"
+ "ld1 { v1.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 68f\n"
+ "ld1 { v1.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 71f\n"
+ "ld1 { v1.b }[6], [x20]\n"
+ "b 71f\n"
+ "68:" // Oddments: Load (4, 0): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 71f\n"
+ "ld1 { v1.b }[4], [x20]\n"
+ "b 71f\n"
+ "69:" // Oddments: Load (4, 0): Bit 2: Unset
+ "tbz x1, #1, 70f\n"
+ "ld1 { v1.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 71f\n"
+ "ld1 { v1.b }[2], [x20]\n"
+ "b 71f\n"
+ "70:" // Oddments: Load (4, 0): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 71f\n"
+ "ld1 { v1.b }[0], [x20]\n"
+ "71:" // Oddments: Load (4, 0): Bit 2: End
+ "ssubl v1.8h, v1.8b, v18.8b\n"
+ "ldr x20, [x5, #0xc8]\n"
+ "smlal v24.4s, v1.4h, v17.4h\n"
+ "smlal2 v22.4s, v1.8h, v17.8h\n"
+ "add x20, x20, x3\n"
+ "tbz x1, #2, 73f\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 72f\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 75f\n"
+ "ld1 { v16.b }[6], [x20]\n"
+ "b 75f\n"
+ "72:" // Oddments: Load (4, 1): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 75f\n"
+ "ld1 { v16.b }[4], [x20]\n"
+ "b 75f\n"
+ "73:" // Oddments: Load (4, 1): Bit 2: Unset
+ "tbz x1, #1, 74f\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 75f\n"
+ "ld1 { v16.b }[2], [x20]\n"
+ "b 75f\n"
+ "74:" // Oddments: Load (4, 1): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 75f\n"
+ "ld1 { v16.b }[0], [x20]\n"
+ "75:" // Oddments: Load (4, 1): Bit 2: End
+ "ldr d29, [x6, #0x80]\n"
+ "ssubl v16.8h, v16.8b, v18.8b\n"
+ "ssubl v29.8h, v29.8b, v13.8b\n"
+ "ldr x20, [x5, #0xd0]\n"
+ "smlal v23.4s, v16.4h, v17.4h\n"
+ "smlal2 v19.4s, v16.8h, v17.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v28.4h, v29.4h\n"
+ "smlal2 v15.4s, v28.8h, v29.8h\n"
+ "smlal v20.4s, v21.4h, v29.4h\n"
+ "smlal2 v5.4s, v21.8h, v29.8h\n"
+ "smlal v24.4s, v16.4h, v29.4h\n"
+ "smlal2 v22.4s, v16.8h, v29.8h\n"
+ "tbz x1, #2, 77f\n"
+ "ld1 { v30.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 76f\n"
+ "ld1 { v30.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 79f\n"
+ "ld1 { v30.b }[6], [x20]\n"
+ "b 79f\n"
+ "76:" // Oddments: Load (4, 2): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 79f\n"
+ "ld1 { v30.b }[4], [x20]\n"
+ "b 79f\n"
+ "77:" // Oddments: Load (4, 2): Bit 2: Unset
+ "tbz x1, #1, 78f\n"
+ "ld1 { v30.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 79f\n"
+ "ld1 { v30.b }[2], [x20]\n"
+ "b 79f\n"
+ "78:" // Oddments: Load (4, 2): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 79f\n"
+ "ld1 { v30.b }[0], [x20]\n"
+ "79:" // Oddments: Load (4, 2): Bit 2: End
+ "ldr d12, [x6, #0x88]\n"
+ "ssubl v30.8h, v30.8b, v18.8b\n"
+ "ssubl v12.8h, v12.8b, v13.8b\n"
+ "ldr x20, [x5, #0xd8]\n"
+ "smlal v23.4s, v30.4h, v29.4h\n"
+ "smlal2 v19.4s, v30.8h, v29.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v21.4h, v12.4h\n"
+ "smlal2 v15.4s, v21.8h, v12.8h\n"
+ "smlal v20.4s, v9.4h, v12.4h\n"
+ "smlal2 v5.4s, v9.8h, v12.8h\n"
+ "smlal v24.4s, v30.4h, v12.4h\n"
+ "smlal2 v22.4s, v30.8h, v12.8h\n"
+ "tbz x1, #2, 81f\n"
+ "ld1 { v29.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 80f\n"
+ "ld1 { v29.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 83f\n"
+ "ld1 { v29.b }[6], [x20]\n"
+ "b 83f\n"
+ "80:" // Oddments: Load (4, 3): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 83f\n"
+ "ld1 { v29.b }[4], [x20]\n"
+ "b 83f\n"
+ "81:" // Oddments: Load (4, 3): Bit 2: Unset
+ "tbz x1, #1, 82f\n"
+ "ld1 { v29.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 83f\n"
+ "ld1 { v29.b }[2], [x20]\n"
+ "b 83f\n"
+ "82:" // Oddments: Load (4, 3): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 83f\n"
+ "ld1 { v29.b }[0], [x20]\n"
+ "83:" // Oddments: Load (4, 3): Bit 2: End
+ "ldr d21, [x6, #0x90]\n"
+ "ssubl v29.8h, v29.8b, v18.8b\n"
+ "ssubl v21.8h, v21.8b, v13.8b\n"
+ "ldr x20, [x5, #0xe0]\n"
+ "smlal v23.4s, v29.4h, v12.4h\n"
+ "smlal2 v19.4s, v29.8h, v12.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v9.4h, v21.4h\n"
+ "smlal2 v15.4s, v9.8h, v21.8h\n"
+ "smlal v20.4s, v3.4h, v21.4h\n"
+ "smlal2 v5.4s, v3.8h, v21.8h\n"
+ "smlal v24.4s, v29.4h, v21.4h\n"
+ "smlal2 v22.4s, v29.8h, v21.8h\n"
+ "tbz x1, #2, 85f\n"
+ "ld1 { v25.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 84f\n"
+ "ld1 { v25.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 87f\n"
+ "ld1 { v25.b }[6], [x20]\n"
+ "b 87f\n"
+ "84:" // Oddments: Load (4, 4): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 87f\n"
+ "ld1 { v25.b }[4], [x20]\n"
+ "b 87f\n"
+ "85:" // Oddments: Load (4, 4): Bit 2: Unset
+ "tbz x1, #1, 86f\n"
+ "ld1 { v25.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 87f\n"
+ "ld1 { v25.b }[2], [x20]\n"
+ "b 87f\n"
+ "86:" // Oddments: Load (4, 4): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 87f\n"
+ "ld1 { v25.b }[0], [x20]\n"
+ "87:" // Oddments: Load (4, 4): Bit 2: End
+ "ldr d8, [x6, #0x98]\n"
+ "ssubl v25.8h, v25.8b, v18.8b\n"
+ "ssubl v8.8h, v8.8b, v13.8b\n"
+ "ldr x20, [x5, #0xe8]\n"
+ "smlal v23.4s, v25.4h, v21.4h\n"
+ "smlal2 v19.4s, v25.8h, v21.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v3.4h, v8.4h\n"
+ "smlal2 v15.4s, v3.8h, v8.8h\n"
+ "smlal v20.4s, v14.4h, v8.4h\n"
+ "smlal2 v5.4s, v14.8h, v8.8h\n"
+ "smlal v24.4s, v25.4h, v8.4h\n"
+ "smlal2 v22.4s, v25.8h, v8.8h\n"
+ "tbz x1, #2, 89f\n"
+ "ld1 { v21.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 88f\n"
+ "ld1 { v21.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 91f\n"
+ "ld1 { v21.b }[6], [x20]\n"
+ "b 91f\n"
+ "88:" // Oddments: Load (4, 5): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 91f\n"
+ "ld1 { v21.b }[4], [x20]\n"
+ "b 91f\n"
+ "89:" // Oddments: Load (4, 5): Bit 2: Unset
+ "tbz x1, #1, 90f\n"
+ "ld1 { v21.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 91f\n"
+ "ld1 { v21.b }[2], [x20]\n"
+ "b 91f\n"
+ "90:" // Oddments: Load (4, 5): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 91f\n"
+ "ld1 { v21.b }[0], [x20]\n"
+ "91:" // Oddments: Load (4, 5): Bit 2: End
+ "ldr d9, [x6, #0xa0]\n"
+ "ssubl v21.8h, v21.8b, v18.8b\n"
+ "ssubl v9.8h, v9.8b, v13.8b\n"
+ "ldr x20, [x5, #0xf0]\n"
+ "smlal v23.4s, v21.4h, v8.4h\n"
+ "smlal2 v19.4s, v21.8h, v8.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v1.4h, v9.4h\n"
+ "smlal2 v15.4s, v1.8h, v9.8h\n"
+ "smlal v20.4s, v16.4h, v9.4h\n"
+ "smlal2 v5.4s, v16.8h, v9.8h\n"
+ "tbz x1, #2, 93f\n"
+ "ld1 { v12.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 92f\n"
+ "ld1 { v12.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 95f\n"
+ "ld1 { v12.b }[6], [x20]\n"
+ "b 95f\n"
+ "92:" // Oddments: Load (5, 0): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 95f\n"
+ "ld1 { v12.b }[4], [x20]\n"
+ "b 95f\n"
+ "93:" // Oddments: Load (5, 0): Bit 2: Unset
+ "tbz x1, #1, 94f\n"
+ "ld1 { v12.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 95f\n"
+ "ld1 { v12.b }[2], [x20]\n"
+ "b 95f\n"
+ "94:" // Oddments: Load (5, 0): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 95f\n"
+ "ld1 { v12.b }[0], [x20]\n"
+ "95:" // Oddments: Load (5, 0): Bit 2: End
+ "ssubl v12.8h, v12.8b, v18.8b\n"
+ "ldr x20, [x5, #0xf8]\n"
+ "smlal v24.4s, v12.4h, v9.4h\n"
+ "smlal2 v22.4s, v12.8h, v9.8h\n"
+ "add x20, x20, x3\n"
+ "tbz x1, #2, 97f\n"
+ "ld1 { v10.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 96f\n"
+ "ld1 { v10.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 99f\n"
+ "ld1 { v10.b }[6], [x20]\n"
+ "b 99f\n"
+ "96:" // Oddments: Load (5, 1): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 99f\n"
+ "ld1 { v10.b }[4], [x20]\n"
+ "b 99f\n"
+ "97:" // Oddments: Load (5, 1): Bit 2: Unset
+ "tbz x1, #1, 98f\n"
+ "ld1 { v10.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 99f\n"
+ "ld1 { v10.b }[2], [x20]\n"
+ "b 99f\n"
+ "98:" // Oddments: Load (5, 1): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 99f\n"
+ "ld1 { v10.b }[0], [x20]\n"
+ "99:" // Oddments: Load (5, 1): Bit 2: End
+ "ldr d12, [x6, #0xa8]\n"
+ "ssubl v10.8h, v10.8b, v18.8b\n"
+ "ssubl v12.8h, v12.8b, v13.8b\n"
+ "ldr x20, [x5, #0x100]\n"
+ "smlal v23.4s, v10.4h, v9.4h\n"
+ "smlal2 v19.4s, v10.8h, v9.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v16.4h, v12.4h\n"
+ "smlal2 v15.4s, v16.8h, v12.8h\n"
+ "smlal v20.4s, v30.4h, v12.4h\n"
+ "smlal2 v5.4s, v30.8h, v12.8h\n"
+ "smlal v24.4s, v10.4h, v12.4h\n"
+ "smlal2 v22.4s, v10.8h, v12.8h\n"
+ "tbz x1, #2, 101f\n"
+ "ld1 { v9.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 100f\n"
+ "ld1 { v9.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 103f\n"
+ "ld1 { v9.b }[6], [x20]\n"
+ "b 103f\n"
+ "100:" // Oddments: Load (5, 2): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 103f\n"
+ "ld1 { v9.b }[4], [x20]\n"
+ "b 103f\n"
+ "101:" // Oddments: Load (5, 2): Bit 2: Unset
+ "tbz x1, #1, 102f\n"
+ "ld1 { v9.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 103f\n"
+ "ld1 { v9.b }[2], [x20]\n"
+ "b 103f\n"
+ "102:" // Oddments: Load (5, 2): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 103f\n"
+ "ld1 { v9.b }[0], [x20]\n"
+ "103:" // Oddments: Load (5, 2): Bit 2: End
+ "ldr d28, [x6, #0xb0]\n"
+ "ssubl v9.8h, v9.8b, v18.8b\n"
+ "ssubl v28.8h, v28.8b, v13.8b\n"
+ "ldr x20, [x5, #0x108]\n"
+ "smlal v23.4s, v9.4h, v12.4h\n"
+ "smlal2 v19.4s, v9.8h, v12.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v30.4h, v28.4h\n"
+ "smlal2 v15.4s, v30.8h, v28.8h\n"
+ "smlal v20.4s, v29.4h, v28.4h\n"
+ "smlal2 v5.4s, v29.8h, v28.8h\n"
+ "smlal v24.4s, v9.4h, v28.4h\n"
+ "smlal2 v22.4s, v9.8h, v28.8h\n"
+ "tbz x1, #2, 105f\n"
+ "ld1 { v2.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 104f\n"
+ "ld1 { v2.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 107f\n"
+ "ld1 { v2.b }[6], [x20]\n"
+ "b 107f\n"
+ "104:" // Oddments: Load (5, 3): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 107f\n"
+ "ld1 { v2.b }[4], [x20]\n"
+ "b 107f\n"
+ "105:" // Oddments: Load (5, 3): Bit 2: Unset
+ "tbz x1, #1, 106f\n"
+ "ld1 { v2.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 107f\n"
+ "ld1 { v2.b }[2], [x20]\n"
+ "b 107f\n"
+ "106:" // Oddments: Load (5, 3): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 107f\n"
+ "ld1 { v2.b }[0], [x20]\n"
+ "107:" // Oddments: Load (5, 3): Bit 2: End
+ "ldr d30, [x6, #0xb8]\n"
+ "ssubl v2.8h, v2.8b, v18.8b\n"
+ "ssubl v30.8h, v30.8b, v13.8b\n"
+ "ldr x20, [x5, #0x110]\n"
+ "smlal v23.4s, v2.4h, v28.4h\n"
+ "smlal2 v19.4s, v2.8h, v28.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v29.4h, v30.4h\n"
+ "smlal2 v15.4s, v29.8h, v30.8h\n"
+ "smlal v20.4s, v25.4h, v30.4h\n"
+ "smlal2 v5.4s, v25.8h, v30.8h\n"
+ "smlal v24.4s, v2.4h, v30.4h\n"
+ "smlal2 v22.4s, v2.8h, v30.8h\n"
+ "tbz x1, #2, 109f\n"
+ "ld1 { v27.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 108f\n"
+ "ld1 { v27.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 111f\n"
+ "ld1 { v27.b }[6], [x20]\n"
+ "b 111f\n"
+ "108:" // Oddments: Load (5, 4): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 111f\n"
+ "ld1 { v27.b }[4], [x20]\n"
+ "b 111f\n"
+ "109:" // Oddments: Load (5, 4): Bit 2: Unset
+ "tbz x1, #1, 110f\n"
+ "ld1 { v27.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 111f\n"
+ "ld1 { v27.b }[2], [x20]\n"
+ "b 111f\n"
+ "110:" // Oddments: Load (5, 4): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 111f\n"
+ "ld1 { v27.b }[0], [x20]\n"
+ "111:" // Oddments: Load (5, 4): Bit 2: End
+ "ldr d8, [x6, #0xc0]\n"
+ "ssubl v27.8h, v27.8b, v18.8b\n"
+ "ssubl v8.8h, v8.8b, v13.8b\n"
+ "ldr x20, [x5, #0x118]\n"
+ "smlal v23.4s, v27.4h, v30.4h\n"
+ "smlal2 v19.4s, v27.8h, v30.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v25.4h, v8.4h\n"
+ "smlal2 v15.4s, v25.8h, v8.8h\n"
+ "smlal v20.4s, v21.4h, v8.4h\n"
+ "smlal2 v5.4s, v21.8h, v8.8h\n"
+ "smlal v24.4s, v27.4h, v8.4h\n"
+ "smlal2 v22.4s, v27.8h, v8.8h\n"
+ "tbz x1, #2, 113f\n"
+ "ld1 { v9.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 112f\n"
+ "ld1 { v9.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 115f\n"
+ "ld1 { v9.b }[6], [x20]\n"
+ "b 115f\n"
+ "112:" // Oddments: Load (5, 5): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 115f\n"
+ "ld1 { v9.b }[4], [x20]\n"
+ "b 115f\n"
+ "113:" // Oddments: Load (5, 5): Bit 2: Unset
+ "tbz x1, #1, 114f\n"
+ "ld1 { v9.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 115f\n"
+ "ld1 { v9.b }[2], [x20]\n"
+ "b 115f\n"
+ "114:" // Oddments: Load (5, 5): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 115f\n"
+ "ld1 { v9.b }[0], [x20]\n"
+ "115:" // Oddments: Load (5, 5): Bit 2: End
+ "ssubl v9.8h, v9.8b, v18.8b\n"
+ "smlal v23.4s, v9.4h, v8.4h\n"
+ "smlal2 v19.4s, v9.8h, v8.8h\n"
+ "tbz x1, #2, 117f\n"
+ "ld1 { v30.4s }, [x7], #0x10\n"
+ "ld1 { v12.4s }, [x8], #0x10\n"
+ "tbz x1, #1, 116f\n"
+ "ld1 { v14.d }[0], [x7], #0x8\n"
+ "ld1 { v27.d }[0], [x8], #0x8\n"
+ "tbz x1, #0, 119f\n"
+ "ld1 { v14.s }[2], [x7]\n"
+ "ld1 { v27.s }[2], [x8]\n"
+ "b 119f\n"
+ "116:" // Oddments: Load requant params: Bit 2: Bit 1: Unset
+ "tbz x1, #0, 119f\n"
+ "ld1 { v14.s }[0], [x7]\n"
+ "ld1 { v27.s }[0], [x8]\n"
+ "b 119f\n"
+ "117:" // Oddments: Load requant params: Bit 2: Unset
+ "tbz x1, #1, 118f\n"
+ "ld1 { v30.d }[0], [x7], #0x8\n"
+ "ld1 { v12.d }[0], [x8], #0x8\n"
+ "tbz x1, #0, 119f\n"
+ "ld1 { v30.s }[2], [x7]\n"
+ "ld1 { v12.s }[2], [x8]\n"
+ "b 119f\n"
+ "118:" // Oddments: Load requant params: Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 119f\n"
+ "ld1 { v30.s }[0], [x7]\n"
+ "ld1 { v12.s }[0], [x8]\n"
+ "119:" // Oddments: Load requant params: Bit 2: End
+ "sqrdmulh v7.4s, v7.4s, v30.4s\n"
+ "and v16.16b, v7.16b, v12.16b\n"
+ "add x17, x17, x4\n"
+ "add x16, x16, x4\n"
+ "sqrdmulh v15.4s, v15.4s, v14.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "add x15, x15, x4\n"
+ "add x14, x14, x4\n"
+ "and v2.16b, v15.16b, v27.16b\n"
+ "sqrdmulh v20.4s, v20.4s, v30.4s\n"
+ "sqrdmulh v24.4s, v24.4s, v30.4s\n"
+ "sqrdmulh v23.4s, v23.4s, v30.4s\n"
+ "sqadd v7.4s, v7.4s, v16.4s\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "and v21.16b, v20.16b, v12.16b\n"
+ "sqrdmulh v5.4s, v5.4s, v14.4s\n"
+ "and v18.16b, v24.16b, v12.16b\n"
+ "sqrdmulh v22.4s, v22.4s, v14.4s\n"
+ "and v31.16b, v23.16b, v12.16b\n"
+ "sqrdmulh v19.4s, v19.4s, v14.4s\n"
+ "sqadd v15.4s, v15.4s, v2.4s\n"
+ "sshr v21.4s, v21.4s, #0x1f\n"
+ "and v9.16b, v5.16b, v27.16b\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "and v4.16b, v22.16b, v27.16b\n"
+ "sshr v31.4s, v31.4s, #0x1f\n"
+ "and v28.16b, v19.16b, v27.16b\n"
+ "sqadd v20.4s, v20.4s, v21.4s\n"
+ "sshr v9.4s, v9.4s, #0x1f\n"
+ "sqadd v24.4s, v24.4s, v18.4s\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sqadd v23.4s, v23.4s, v31.4s\n"
+ "sshr v28.4s, v28.4s, #0x1f\n"
+ "srshl v7.4s, v7.4s, v12.4s\n"
+ "srshl v20.4s, v20.4s, v12.4s\n"
+ "sqadd v5.4s, v5.4s, v9.4s\n"
+ "srshl v24.4s, v24.4s, v12.4s\n"
+ "sqadd v22.4s, v22.4s, v4.4s\n"
+ "srshl v23.4s, v23.4s, v12.4s\n"
+ "sqadd v19.4s, v19.4s, v28.4s\n"
+ "srshl v15.4s, v15.4s, v27.4s\n"
+ "sqxtn v7.4h, v7.4s\n"
+ "srshl v5.4s, v5.4s, v27.4s\n"
+ "sqxtn v20.4h, v20.4s\n"
+ "srshl v22.4s, v22.4s, v27.4s\n"
+ "sqxtn v24.4h, v24.4s\n"
+ "srshl v19.4s, v19.4s, v27.4s\n"
+ "sqxtn v23.4h, v23.4s\n"
+ "sqxtn2 v7.8h, v15.4s\n"
+ "sqxtn2 v20.8h, v5.4s\n"
+ "sqxtn2 v24.8h, v22.4s\n"
+ "sqxtn2 v23.8h, v19.4s\n"
+ "sqadd v7.8h, v7.8h, v26.8h\n"
+ "sqadd v20.8h, v20.8h, v26.8h\n"
+ "sqadd v24.8h, v24.8h, v26.8h\n"
+ "sqadd v23.8h, v23.8h, v26.8h\n"
+ "smax v7.8h, v7.8h, v11.8h\n"
+ "smax v20.8h, v20.8h, v11.8h\n"
+ "smax v24.8h, v24.8h, v11.8h\n"
+ "smax v23.8h, v23.8h, v11.8h\n"
+ "smin v7.8h, v7.8h, v0.8h\n"
+ "smin v20.8h, v20.8h, v0.8h\n"
+ "smin v24.8h, v24.8h, v0.8h\n"
+ "smin v23.8h, v23.8h, v0.8h\n"
+ "uzp1 v7.16b, v7.16b, v7.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "tbz x1, #2, 121f\n"
+ "st1 { v7.s }[0], [x17], #0x4\n"
+ "st1 { v20.s }[0], [x16], #0x4\n"
+ "st1 { v24.s }[0], [x15], #0x4\n"
+ "st1 { v23.s }[0], [x14], #0x4\n"
+ "tbz x1, #1, 120f\n"
+ "st1 { v7.h }[2], [x17], #0x2\n"
+ "st1 { v20.h }[2], [x16], #0x2\n"
+ "st1 { v24.h }[2], [x15], #0x2\n"
+ "st1 { v23.h }[2], [x14], #0x2\n"
+ "tbz x1, #0, 123f\n"
+ "st1 { v7.b }[6], [x17], #0x1\n"
+ "st1 { v20.b }[6], [x16], #0x1\n"
+ "st1 { v24.b }[6], [x15], #0x1\n"
+ "st1 { v23.b }[6], [x14], #0x1\n"
+ "b 123f\n"
+ "120:" // Oddments: Bit 2: Bit 1: Unset
+ "tbz x1, #0, 123f\n"
+ "st1 { v7.b }[4], [x17], #0x1\n"
+ "st1 { v20.b }[4], [x16], #0x1\n"
+ "st1 { v24.b }[4], [x15], #0x1\n"
+ "st1 { v23.b }[4], [x14], #0x1\n"
+ "b 123f\n"
+ "121:" // Oddments: Bit 2: Unset
+ "tbz x1, #1, 122f\n"
+ "st1 { v7.h }[0], [x17], #0x2\n"
+ "st1 { v20.h }[0], [x16], #0x2\n"
+ "st1 { v24.h }[0], [x15], #0x2\n"
+ "st1 { v23.h }[0], [x14], #0x2\n"
+ "tbz x1, #0, 123f\n"
+ "st1 { v7.b }[2], [x17], #0x1\n"
+ "st1 { v20.b }[2], [x16], #0x1\n"
+ "st1 { v24.b }[2], [x15], #0x1\n"
+ "st1 { v23.b }[2], [x14], #0x1\n"
+ "b 123f\n"
+ "122:" // Oddments: Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 123f\n"
+ "st1 { v7.b }[0], [x17], #0x1\n"
+ "st1 { v20.b }[0], [x16], #0x1\n"
+ "st1 { v24.b }[0], [x15], #0x1\n"
+ "st1 { v23.b }[0], [x14], #0x1\n"
+ "123:" // Oddments: Bit 2: End
+ "124:" // End
+ :
+ : [offsetof_Params_bias] "I" (offsetof(Params, bias)), [offsetof_Params_inptrs] "I" (offsetof(Params, inptrs)), [offsetof_Params_n_channels] "I" (offsetof(Params, n_channels)), [offsetof_Params_outptrs] "I" (offsetof(Params, outptrs)), [offsetof_Params_requant] "I" (offsetof(Params, requant)), [offsetof_Params_requant_muls] "I" (offsetof(Params, requant_muls)), [offsetof_Params_requant_shifts] "I" (offsetof(Params, requant_shifts)), [offsetof_Params_weights] "I" (offsetof(Params, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [params] "r" (&params)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_generic_output9_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_generic_output9_mla_depthfirst.hpp
new file mode 100644
index 0000000000..9c92a9dd46
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_generic_output9_mla_depthfirst.hpp
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_s8q_nhwc_generic_output9_mla_depthfirst_impl(const int8_t *const *const, int8_t *const *const, const void *, const arm_gemm::Requantize32&, const unsigned int, const unsigned int);
+
+class a64_s8q_nhwc_generic_output9_mla_depthfirst : public GenericDepthfirstKernelStrategy<int8_t, int8_t, int8_t, int32_t>
+{
+ KernelType kernel = a64_s8q_nhwc_generic_output9_mla_depthfirst_impl;
+
+ public:
+ a64_s8q_nhwc_generic_output9_mla_depthfirst(const CPUInfo *) : GenericDepthfirstKernelStrategy<int8_t, int8_t, int8_t, int32_t>(9, arm_gemm::VLType::None) {}
+
+ KernelType get_kernel() const override { return kernel; }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_generic_output9_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_generic_output9_mla_depthfirst/generic.cpp
new file mode 100644
index 0000000000..77b7d231e0
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_generic_output9_mla_depthfirst/generic.cpp
@@ -0,0 +1,618 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+
+#include "arm_gemm.hpp"
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_s8q_nhwc_generic_output9_mla_depthfirst_impl(
+ const int8_t *const *const inptrs,
+ int8_t *const *const outptrs,
+ const void *params,
+ const arm_gemm::Requantize32& qp,
+ const unsigned int n_points,
+ const unsigned int n_channels
+)
+{
+ __asm__ __volatile__(
+ "lsr x9, %x[n_channels], #0x2\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_minval]\n"
+ "ld1r { v8.4s }, [x20]\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_maxval]\n"
+ "ld1r { v7.4s }, [x20]\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_a_offset]\n"
+ "ld1r { v6.16b }, [x20]\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_b_offset]\n"
+ "ld1r { v5.16b }, [x20]\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_c_offset]\n"
+ "ld1r { v4.4s }, [x20]\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_per_layer_left_shift]\n"
+ "ld1r { v3.4s }, [x20]\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_per_layer_mul]\n"
+ "ld1r { v2.4s }, [x20]\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_per_layer_right_shift]\n"
+ "ld1r { v1.4s }, [x20]\n"
+ "mov x11, #0x0\n"
+ "cbz x9, 6f\n"
+ "1:" // Channel loop
+ "movi v23.4s, #0x0\n"
+ "cbz %x[bias], 2f\n"
+ "lsl x20, x11, #0x2\n"
+ "ldr q23, [%x[bias], x20]\n"
+ "2:" // Channel loop: Load bias: Done
+ "ldr s0, [%x[params]], #0x4\n"
+ "mov x25, %x[inptrs]\n"
+ "ldp x21, x20, [x25], #0x10\n"
+ "subs x24, %x[n_points], #0x1\n"
+ "ldr s14, [x21, x11]\n"
+ "ldr s15, [x20, x11]\n"
+ "mov v24.16b, v23.16b\n"
+ "mov v25.16b, v23.16b\n"
+ "ldp x21, x20, [x25], #0x10\n"
+ "ldr s16, [x21, x11]\n"
+ "mov v26.16b, v23.16b\n"
+ "mov v27.16b, v23.16b\n"
+ "ldr s17, [x20, x11]\n"
+ "ldp x21, x20, [x25], #0x10\n"
+ "mov v28.16b, v23.16b\n"
+ "mov v29.16b, v23.16b\n"
+ "ldr s18, [x21, x11]\n"
+ "ldr s19, [x20, x11]\n"
+ "mov v30.16b, v23.16b\n"
+ "mov v31.16b, v23.16b\n"
+ "ldp x21, x20, [x25], #0x10\n"
+ "ldr s20, [x21, x11]\n"
+ "ssubl v0.8h, v0.8b, v5.8b\n"
+ "ssubl v14.8h, v14.8b, v6.8b\n"
+ "ldr s21, [x20, x11]\n"
+ "ldr x20, [x25], #0x8\n"
+ "ssubl v15.8h, v15.8b, v6.8b\n"
+ "ssubl v16.8h, v16.8b, v6.8b\n"
+ "ldr s22, [x20, x11]\n"
+ "ssubl v17.8h, v17.8b, v6.8b\n"
+ "ssubl v18.8h, v18.8b, v6.8b\n"
+ "ssubl v19.8h, v19.8b, v6.8b\n"
+ "ssubl v20.8h, v20.8b, v6.8b\n"
+ "ssubl v21.8h, v21.8b, v6.8b\n"
+ "ssubl v22.8h, v22.8b, v6.8b\n"
+ "ble 4f\n"
+ "3:" // Channel loop: Planar loop
+ "ldp x23, x22, [x25], #0x10\n"
+ "ldp x21, x20, [x25], #0x10\n"
+ "smlal v23.4s, v14.4h, v0.4h\n"
+ "smlal v24.4s, v15.4h, v0.4h\n"
+ "ldr s14, [x23, x11]\n"
+ "ldr s15, [x22, x11]\n"
+ "smlal v25.4s, v16.4h, v0.4h\n"
+ "smlal v26.4s, v17.4h, v0.4h\n"
+ "ldr s16, [x21, x11]\n"
+ "ldr s17, [x20, x11]\n"
+ "smlal v27.4s, v18.4h, v0.4h\n"
+ "smlal v28.4s, v19.4h, v0.4h\n"
+ "ldp x21, x20, [x25], #0x10\n"
+ "ldr s18, [x21, x11]\n"
+ "smlal v29.4s, v20.4h, v0.4h\n"
+ "smlal v30.4s, v21.4h, v0.4h\n"
+ "ldr s19, [x20, x11]\n"
+ "ldp x21, x20, [x25], #0x10\n"
+ "smlal v31.4s, v22.4h, v0.4h\n"
+ "subs x24, x24, #0x1\n"
+ "ldr s0, [%x[params]], #0x4\n"
+ "ldr s20, [x21, x11]\n"
+ "ssubl v0.8h, v0.8b, v5.8b\n"
+ "ssubl v14.8h, v14.8b, v6.8b\n"
+ "ldr s21, [x20, x11]\n"
+ "ldr x20, [x25], #0x8\n"
+ "ssubl v15.8h, v15.8b, v6.8b\n"
+ "ssubl v16.8h, v16.8b, v6.8b\n"
+ "ldr s22, [x20, x11]\n"
+ "ssubl v17.8h, v17.8b, v6.8b\n"
+ "ssubl v18.8h, v18.8b, v6.8b\n"
+ "ssubl v19.8h, v19.8b, v6.8b\n"
+ "ssubl v20.8h, v20.8b, v6.8b\n"
+ "ssubl v21.8h, v21.8b, v6.8b\n"
+ "ssubl v22.8h, v22.8b, v6.8b\n"
+ "bgt 3b\n"
+ "4:" // Channel loop: Planar tail
+ "smlal v23.4s, v14.4h, v0.4h\n"
+ "smlal v24.4s, v15.4h, v0.4h\n"
+ "smlal v25.4s, v16.4h, v0.4h\n"
+ "smlal v26.4s, v17.4h, v0.4h\n"
+ "smlal v27.4s, v18.4h, v0.4h\n"
+ "smlal v28.4s, v19.4h, v0.4h\n"
+ "smlal v29.4s, v20.4h, v0.4h\n"
+ "smlal v30.4s, v21.4h, v0.4h\n"
+ "smlal v31.4s, v22.4h, v0.4h\n"
+ "cbz %x[rq_mul_ptr], 5f\n"
+ "lsl x20, x11, #0x2\n"
+ "ldr q2, [%x[rq_mul_ptr], x20]\n"
+ "ldr q1, [%x[rq_right_shift_ptr], x20]\n"
+ "cbz %x[rq_left_shift_ptr], 5f\n"
+ "ldr q3, [%x[rq_left_shift_ptr], x20]\n"
+ "5:" // Channel loop: Load quantisation parameters: Done
+ "sshl v23.4s, v23.4s, v3.4s\n"
+ "sshl v24.4s, v24.4s, v3.4s\n"
+ "ldp x28, x27, [%x[outptrs], #0x0]\n"
+ "ldp x26, x25, [%x[outptrs], #0x10]\n"
+ "sshl v25.4s, v25.4s, v3.4s\n"
+ "sqrdmulh v23.4s, v23.4s, v2.4s\n"
+ "ldp x24, x23, [%x[outptrs], #0x20]\n"
+ "ldp x22, x21, [%x[outptrs], #0x30]\n"
+ "sqrdmulh v24.4s, v24.4s, v2.4s\n"
+ "sqrdmulh v25.4s, v25.4s, v2.4s\n"
+ "ldr x20, [%x[outptrs], #0x40]\n"
+ "and v18.16b, v23.16b, v1.16b\n"
+ "and v17.16b, v24.16b, v1.16b\n"
+ "and v16.16b, v25.16b, v1.16b\n"
+ "sshl v26.4s, v26.4s, v3.4s\n"
+ "sshl v27.4s, v27.4s, v3.4s\n"
+ "sshl v28.4s, v28.4s, v3.4s\n"
+ "sshl v29.4s, v29.4s, v3.4s\n"
+ "sshl v30.4s, v30.4s, v3.4s\n"
+ "sshl v31.4s, v31.4s, v3.4s\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqrdmulh v26.4s, v26.4s, v2.4s\n"
+ "sqrdmulh v27.4s, v27.4s, v2.4s\n"
+ "sqrdmulh v28.4s, v28.4s, v2.4s\n"
+ "sqrdmulh v29.4s, v29.4s, v2.4s\n"
+ "sqrdmulh v30.4s, v30.4s, v2.4s\n"
+ "sqrdmulh v31.4s, v31.4s, v2.4s\n"
+ "sqadd v23.4s, v23.4s, v18.4s\n"
+ "sqadd v24.4s, v24.4s, v17.4s\n"
+ "sqadd v25.4s, v25.4s, v16.4s\n"
+ "and v21.16b, v26.16b, v1.16b\n"
+ "and v20.16b, v27.16b, v1.16b\n"
+ "and v19.16b, v28.16b, v1.16b\n"
+ "and v18.16b, v29.16b, v1.16b\n"
+ "and v17.16b, v30.16b, v1.16b\n"
+ "and v16.16b, v31.16b, v1.16b\n"
+ "sshr v21.4s, v21.4s, #0x1f\n"
+ "sshr v20.4s, v20.4s, #0x1f\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqadd v26.4s, v26.4s, v21.4s\n"
+ "sqadd v27.4s, v27.4s, v20.4s\n"
+ "sqadd v28.4s, v28.4s, v19.4s\n"
+ "sqadd v29.4s, v29.4s, v18.4s\n"
+ "sqadd v30.4s, v30.4s, v17.4s\n"
+ "sqadd v31.4s, v31.4s, v16.4s\n"
+ "srshl v23.4s, v23.4s, v1.4s\n"
+ "srshl v24.4s, v24.4s, v1.4s\n"
+ "srshl v25.4s, v25.4s, v1.4s\n"
+ "srshl v26.4s, v26.4s, v1.4s\n"
+ "srshl v27.4s, v27.4s, v1.4s\n"
+ "srshl v28.4s, v28.4s, v1.4s\n"
+ "srshl v29.4s, v29.4s, v1.4s\n"
+ "srshl v30.4s, v30.4s, v1.4s\n"
+ "srshl v31.4s, v31.4s, v1.4s\n"
+ "add v23.4s, v23.4s, v4.4s\n"
+ "add v24.4s, v24.4s, v4.4s\n"
+ "add v25.4s, v25.4s, v4.4s\n"
+ "add v26.4s, v26.4s, v4.4s\n"
+ "add v27.4s, v27.4s, v4.4s\n"
+ "add v28.4s, v28.4s, v4.4s\n"
+ "add v29.4s, v29.4s, v4.4s\n"
+ "add v30.4s, v30.4s, v4.4s\n"
+ "add v31.4s, v31.4s, v4.4s\n"
+ "smax v23.4s, v23.4s, v8.4s\n"
+ "smax v24.4s, v24.4s, v8.4s\n"
+ "smax v25.4s, v25.4s, v8.4s\n"
+ "smax v26.4s, v26.4s, v8.4s\n"
+ "smax v27.4s, v27.4s, v8.4s\n"
+ "smax v28.4s, v28.4s, v8.4s\n"
+ "smax v29.4s, v29.4s, v8.4s\n"
+ "smax v30.4s, v30.4s, v8.4s\n"
+ "smax v31.4s, v31.4s, v8.4s\n"
+ "smin v23.4s, v23.4s, v7.4s\n"
+ "smin v24.4s, v24.4s, v7.4s\n"
+ "smin v25.4s, v25.4s, v7.4s\n"
+ "smin v26.4s, v26.4s, v7.4s\n"
+ "smin v27.4s, v27.4s, v7.4s\n"
+ "smin v28.4s, v28.4s, v7.4s\n"
+ "smin v29.4s, v29.4s, v7.4s\n"
+ "smin v30.4s, v30.4s, v7.4s\n"
+ "smin v31.4s, v31.4s, v7.4s\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "str s23, [x28, x11]\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "str s24, [x27, x11]\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "str s25, [x26, x11]\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "str s26, [x25, x11]\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "str s27, [x24, x11]\n"
+ "str s28, [x23, x11]\n"
+ "str s29, [x22, x11]\n"
+ "str s30, [x21, x11]\n"
+ "str s31, [x20, x11]\n"
+ "add x11, x11, #0x4\n"
+ "cmp x11, x9, LSL #2\n"
+ "blt 1b\n"
+ "6:" // Oddments
+ "tst %x[n_channels], #0x3\n"
+ "beq 24f\n"
+ "movi v23.4s, #0x0\n"
+ "cbz %x[bias], 9f\n"
+ "add x20, %x[bias], x11, LSL #2\n"
+ "tbz %x[n_channels], #1, 7f\n"
+ "ld1 { v23.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 8f\n"
+ "ld1 { v23.s }[2], [x20], #0x4\n"
+ "b 8f\n"
+ "7:" // Oddments: Load bias: Bit 1: Unset
+ "ld1 { v23.s }[0], [x20], #0x4\n"
+ "8:" // Oddments: Load bias: Bit 1: End
+ "9:" // Oddments: Load bias: Done
+ "ldr s0, [%x[params]], #0x4\n"
+ "mov x10, %x[inptrs]\n"
+ "ldp x9, x28, [x10], #0x10\n"
+ "mov v24.16b, v23.16b\n"
+ "ldp x27, x26, [x10], #0x10\n"
+ "ldp x25, x24, [x10], #0x10\n"
+ "mov v25.16b, v23.16b\n"
+ "mov v26.16b, v23.16b\n"
+ "ldp x23, x22, [x10], #0x10\n"
+ "ldr x21, [x10], #0x8\n"
+ "mov v27.16b, v23.16b\n"
+ "mov v28.16b, v23.16b\n"
+ "mov v29.16b, v23.16b\n"
+ "mov v30.16b, v23.16b\n"
+ "add x9, x9, x11\n"
+ "add x28, x28, x11\n"
+ "mov v31.16b, v23.16b\n"
+ "ssubl v0.8h, v0.8b, v5.8b\n"
+ "add x27, x27, x11\n"
+ "add x26, x26, x11\n"
+ "add x25, x25, x11\n"
+ "add x24, x24, x11\n"
+ "add x23, x23, x11\n"
+ "add x22, x22, x11\n"
+ "add x21, x21, x11\n"
+ "tbz %x[n_channels], #1, 10f\n"
+ "ldr h14, [x9], #0x2\n"
+ "ldr h15, [x28], #0x2\n"
+ "ldr h16, [x27], #0x2\n"
+ "ldr h17, [x26], #0x2\n"
+ "ldr h18, [x25], #0x2\n"
+ "ldr h19, [x24], #0x2\n"
+ "ldr h20, [x23], #0x2\n"
+ "ldr h21, [x22], #0x2\n"
+ "ldr h22, [x21], #0x2\n"
+ "tbz %x[n_channels], #0, 11f\n"
+ "ld1 { v14.b }[2], [x9], #0x1\n"
+ "ld1 { v15.b }[2], [x28], #0x1\n"
+ "ld1 { v16.b }[2], [x27], #0x1\n"
+ "ld1 { v17.b }[2], [x26], #0x1\n"
+ "ld1 { v18.b }[2], [x25], #0x1\n"
+ "ld1 { v19.b }[2], [x24], #0x1\n"
+ "ld1 { v20.b }[2], [x23], #0x1\n"
+ "ld1 { v21.b }[2], [x22], #0x1\n"
+ "ld1 { v22.b }[2], [x21], #0x1\n"
+ "b 11f\n"
+ "10:" // Oddments: Load: Bit 1: Unset
+ "ldr b14, [x9], #0x1\n"
+ "ldr b15, [x28], #0x1\n"
+ "ldr b16, [x27], #0x1\n"
+ "ldr b17, [x26], #0x1\n"
+ "ldr b18, [x25], #0x1\n"
+ "ldr b19, [x24], #0x1\n"
+ "ldr b20, [x23], #0x1\n"
+ "ldr b21, [x22], #0x1\n"
+ "ldr b22, [x21], #0x1\n"
+ "11:" // Oddments: Load: Bit 1: End
+ "subs x20, %x[n_points], #0x1\n"
+ "ssubl v14.8h, v14.8b, v6.8b\n"
+ "ssubl v15.8h, v15.8b, v6.8b\n"
+ "ssubl v16.8h, v16.8b, v6.8b\n"
+ "ssubl v17.8h, v17.8b, v6.8b\n"
+ "ssubl v18.8h, v18.8b, v6.8b\n"
+ "ssubl v19.8h, v19.8b, v6.8b\n"
+ "ssubl v20.8h, v20.8b, v6.8b\n"
+ "ssubl v21.8h, v21.8b, v6.8b\n"
+ "ssubl v22.8h, v22.8b, v6.8b\n"
+ "ble 15f\n"
+ "12:" // Oddments: Planar loop
+ "ldp x9, x28, [x10], #0x10\n"
+ "ldp x27, x26, [x10], #0x10\n"
+ "smlal v23.4s, v14.4h, v0.4h\n"
+ "smlal v24.4s, v15.4h, v0.4h\n"
+ "ldp x25, x24, [x10], #0x10\n"
+ "ldp x23, x22, [x10], #0x10\n"
+ "smlal v25.4s, v16.4h, v0.4h\n"
+ "smlal v26.4s, v17.4h, v0.4h\n"
+ "smlal v27.4s, v18.4h, v0.4h\n"
+ "smlal v28.4s, v19.4h, v0.4h\n"
+ "ldr x21, [x10], #0x8\n"
+ "add x9, x9, x11\n"
+ "smlal v29.4s, v20.4h, v0.4h\n"
+ "smlal v30.4s, v21.4h, v0.4h\n"
+ "add x28, x28, x11\n"
+ "add x27, x27, x11\n"
+ "smlal v31.4s, v22.4h, v0.4h\n"
+ "ldr s0, [%x[params]], #0x4\n"
+ "ssubl v0.8h, v0.8b, v5.8b\n"
+ "add x26, x26, x11\n"
+ "add x25, x25, x11\n"
+ "add x24, x24, x11\n"
+ "add x23, x23, x11\n"
+ "add x22, x22, x11\n"
+ "add x21, x21, x11\n"
+ "tbz %x[n_channels], #1, 13f\n"
+ "ldr h14, [x9], #0x2\n"
+ "ldr h15, [x28], #0x2\n"
+ "ldr h16, [x27], #0x2\n"
+ "ldr h17, [x26], #0x2\n"
+ "ldr h18, [x25], #0x2\n"
+ "ldr h19, [x24], #0x2\n"
+ "ldr h20, [x23], #0x2\n"
+ "ldr h21, [x22], #0x2\n"
+ "ldr h22, [x21], #0x2\n"
+ "tbz %x[n_channels], #0, 14f\n"
+ "ld1 { v14.b }[2], [x9], #0x1\n"
+ "ld1 { v15.b }[2], [x28], #0x1\n"
+ "ld1 { v16.b }[2], [x27], #0x1\n"
+ "ld1 { v17.b }[2], [x26], #0x1\n"
+ "ld1 { v18.b }[2], [x25], #0x1\n"
+ "ld1 { v19.b }[2], [x24], #0x1\n"
+ "ld1 { v20.b }[2], [x23], #0x1\n"
+ "ld1 { v21.b }[2], [x22], #0x1\n"
+ "ld1 { v22.b }[2], [x21], #0x1\n"
+ "b 14f\n"
+ "13:" // Oddments: Planar loop: Load: Bit 1: Unset
+ "ldr b14, [x9], #0x1\n"
+ "ldr b15, [x28], #0x1\n"
+ "ldr b16, [x27], #0x1\n"
+ "ldr b17, [x26], #0x1\n"
+ "ldr b18, [x25], #0x1\n"
+ "ldr b19, [x24], #0x1\n"
+ "ldr b20, [x23], #0x1\n"
+ "ldr b21, [x22], #0x1\n"
+ "ldr b22, [x21], #0x1\n"
+ "14:" // Oddments: Planar loop: Load: Bit 1: End
+ "subs x20, x20, #0x1\n"
+ "ssubl v14.8h, v14.8b, v6.8b\n"
+ "ssubl v15.8h, v15.8b, v6.8b\n"
+ "ssubl v16.8h, v16.8b, v6.8b\n"
+ "ssubl v17.8h, v17.8b, v6.8b\n"
+ "ssubl v18.8h, v18.8b, v6.8b\n"
+ "ssubl v19.8h, v19.8b, v6.8b\n"
+ "ssubl v20.8h, v20.8b, v6.8b\n"
+ "ssubl v21.8h, v21.8b, v6.8b\n"
+ "ssubl v22.8h, v22.8b, v6.8b\n"
+ "bgt 12b\n"
+ "15:" // Oddments: Planar tail
+ "smlal v23.4s, v14.4h, v0.4h\n"
+ "smlal v24.4s, v15.4h, v0.4h\n"
+ "smlal v25.4s, v16.4h, v0.4h\n"
+ "smlal v26.4s, v17.4h, v0.4h\n"
+ "smlal v27.4s, v18.4h, v0.4h\n"
+ "smlal v28.4s, v19.4h, v0.4h\n"
+ "smlal v29.4s, v20.4h, v0.4h\n"
+ "smlal v30.4s, v21.4h, v0.4h\n"
+ "smlal v31.4s, v22.4h, v0.4h\n"
+ "cbz %x[rq_mul_ptr], 21f\n"
+ "add x22, %x[rq_mul_ptr], x11, LSL #2\n"
+ "add x21, %x[rq_right_shift_ptr], x11, LSL #2\n"
+ "add x20, %x[rq_left_shift_ptr], x11, LSL #2\n"
+ "tbz %x[n_channels], #1, 18f\n"
+ "ld1 { v2.d }[0], [x22], #0x8\n"
+ "ld1 { v1.d }[0], [x21], #0x8\n"
+ "cbz %x[rq_left_shift_ptr], 16f\n"
+ "ld1 { v3.d }[0], [x20], #0x8\n"
+ "16:" // Oddments: Load quantisation parameters: Bit 1: Load left shift: Done
+ "tbz %x[n_channels], #0, 20f\n"
+ "ld1 { v2.s }[2], [x22], #0x4\n"
+ "ld1 { v1.s }[2], [x21], #0x4\n"
+ "cbz %x[rq_left_shift_ptr], 17f\n"
+ "ld1 { v3.s }[2], [x20], #0x4\n"
+ "17:" // Oddments: Load quantisation parameters: Bit 1: Bit 0: Load left shift: Done
+ "b 20f\n"
+ "18:" // Oddments: Load quantisation parameters: Bit 1: Unset
+ "ld1 { v2.s }[0], [x22], #0x4\n"
+ "ld1 { v1.s }[0], [x21], #0x4\n"
+ "cbz %x[rq_left_shift_ptr], 19f\n"
+ "ld1 { v3.s }[0], [x20], #0x4\n"
+ "19:" // Oddments: Load quantisation parameters: Bit 1: Unset: Bit 0: Load left shift: Done
+ "20:" // Oddments: Load quantisation parameters: Bit 1: End
+ "21:" // Oddments: Load quantisation parameters: Done
+ "sshl v23.4s, v23.4s, v3.4s\n"
+ "sshl v24.4s, v24.4s, v3.4s\n"
+ "ldp x28, x27, [%x[outptrs], #0x0]\n"
+ "ldp x26, x25, [%x[outptrs], #0x10]\n"
+ "sshl v25.4s, v25.4s, v3.4s\n"
+ "sqrdmulh v23.4s, v23.4s, v2.4s\n"
+ "ldp x24, x23, [%x[outptrs], #0x20]\n"
+ "ldp x22, x21, [%x[outptrs], #0x30]\n"
+ "sqrdmulh v24.4s, v24.4s, v2.4s\n"
+ "sqrdmulh v25.4s, v25.4s, v2.4s\n"
+ "ldr x20, [%x[outptrs], #0x40]\n"
+ "add x28, x28, x11\n"
+ "and v18.16b, v23.16b, v1.16b\n"
+ "and v17.16b, v24.16b, v1.16b\n"
+ "add x27, x27, x11\n"
+ "add x26, x26, x11\n"
+ "and v16.16b, v25.16b, v1.16b\n"
+ "sshl v26.4s, v26.4s, v3.4s\n"
+ "add x25, x25, x11\n"
+ "add x24, x24, x11\n"
+ "sshl v27.4s, v27.4s, v3.4s\n"
+ "sshl v28.4s, v28.4s, v3.4s\n"
+ "add x23, x23, x11\n"
+ "add x22, x22, x11\n"
+ "sshl v29.4s, v29.4s, v3.4s\n"
+ "sshl v30.4s, v30.4s, v3.4s\n"
+ "add x21, x21, x11\n"
+ "add x20, x20, x11\n"
+ "sshl v31.4s, v31.4s, v3.4s\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqrdmulh v26.4s, v26.4s, v2.4s\n"
+ "sqrdmulh v27.4s, v27.4s, v2.4s\n"
+ "sqrdmulh v28.4s, v28.4s, v2.4s\n"
+ "sqrdmulh v29.4s, v29.4s, v2.4s\n"
+ "sqrdmulh v30.4s, v30.4s, v2.4s\n"
+ "sqrdmulh v31.4s, v31.4s, v2.4s\n"
+ "sqadd v23.4s, v23.4s, v18.4s\n"
+ "sqadd v24.4s, v24.4s, v17.4s\n"
+ "sqadd v25.4s, v25.4s, v16.4s\n"
+ "and v21.16b, v26.16b, v1.16b\n"
+ "and v20.16b, v27.16b, v1.16b\n"
+ "and v19.16b, v28.16b, v1.16b\n"
+ "and v18.16b, v29.16b, v1.16b\n"
+ "and v17.16b, v30.16b, v1.16b\n"
+ "and v16.16b, v31.16b, v1.16b\n"
+ "sshr v21.4s, v21.4s, #0x1f\n"
+ "sshr v20.4s, v20.4s, #0x1f\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqadd v26.4s, v26.4s, v21.4s\n"
+ "sqadd v27.4s, v27.4s, v20.4s\n"
+ "sqadd v28.4s, v28.4s, v19.4s\n"
+ "sqadd v29.4s, v29.4s, v18.4s\n"
+ "sqadd v30.4s, v30.4s, v17.4s\n"
+ "sqadd v31.4s, v31.4s, v16.4s\n"
+ "srshl v23.4s, v23.4s, v1.4s\n"
+ "srshl v24.4s, v24.4s, v1.4s\n"
+ "srshl v25.4s, v25.4s, v1.4s\n"
+ "srshl v26.4s, v26.4s, v1.4s\n"
+ "srshl v27.4s, v27.4s, v1.4s\n"
+ "srshl v28.4s, v28.4s, v1.4s\n"
+ "srshl v29.4s, v29.4s, v1.4s\n"
+ "srshl v30.4s, v30.4s, v1.4s\n"
+ "srshl v31.4s, v31.4s, v1.4s\n"
+ "add v23.4s, v23.4s, v4.4s\n"
+ "add v24.4s, v24.4s, v4.4s\n"
+ "add v25.4s, v25.4s, v4.4s\n"
+ "add v26.4s, v26.4s, v4.4s\n"
+ "add v27.4s, v27.4s, v4.4s\n"
+ "add v28.4s, v28.4s, v4.4s\n"
+ "add v29.4s, v29.4s, v4.4s\n"
+ "add v30.4s, v30.4s, v4.4s\n"
+ "add v31.4s, v31.4s, v4.4s\n"
+ "smax v23.4s, v23.4s, v8.4s\n"
+ "smax v24.4s, v24.4s, v8.4s\n"
+ "smax v25.4s, v25.4s, v8.4s\n"
+ "smax v26.4s, v26.4s, v8.4s\n"
+ "smax v27.4s, v27.4s, v8.4s\n"
+ "smax v28.4s, v28.4s, v8.4s\n"
+ "smax v29.4s, v29.4s, v8.4s\n"
+ "smax v30.4s, v30.4s, v8.4s\n"
+ "smax v31.4s, v31.4s, v8.4s\n"
+ "smin v23.4s, v23.4s, v7.4s\n"
+ "smin v24.4s, v24.4s, v7.4s\n"
+ "smin v25.4s, v25.4s, v7.4s\n"
+ "smin v26.4s, v26.4s, v7.4s\n"
+ "smin v27.4s, v27.4s, v7.4s\n"
+ "smin v28.4s, v28.4s, v7.4s\n"
+ "smin v29.4s, v29.4s, v7.4s\n"
+ "smin v30.4s, v30.4s, v7.4s\n"
+ "smin v31.4s, v31.4s, v7.4s\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "tbz %x[n_channels], #1, 22f\n"
+ "st1 { v23.h }[0], [x28], #0x2\n"
+ "st1 { v24.h }[0], [x27], #0x2\n"
+ "st1 { v25.h }[0], [x26], #0x2\n"
+ "st1 { v26.h }[0], [x25], #0x2\n"
+ "st1 { v27.h }[0], [x24], #0x2\n"
+ "st1 { v28.h }[0], [x23], #0x2\n"
+ "st1 { v29.h }[0], [x22], #0x2\n"
+ "st1 { v30.h }[0], [x21], #0x2\n"
+ "st1 { v31.h }[0], [x20], #0x2\n"
+ "tbz %x[n_channels], #0, 23f\n"
+ "st1 { v23.b }[2], [x28], #0x1\n"
+ "st1 { v24.b }[2], [x27], #0x1\n"
+ "st1 { v25.b }[2], [x26], #0x1\n"
+ "st1 { v26.b }[2], [x25], #0x1\n"
+ "st1 { v27.b }[2], [x24], #0x1\n"
+ "st1 { v28.b }[2], [x23], #0x1\n"
+ "st1 { v29.b }[2], [x22], #0x1\n"
+ "st1 { v30.b }[2], [x21], #0x1\n"
+ "st1 { v31.b }[2], [x20], #0x1\n"
+ "b 23f\n"
+ "22:" // Oddments: Store: Bit 1: Unset
+ "st1 { v23.b }[0], [x28], #0x1\n"
+ "st1 { v24.b }[0], [x27], #0x1\n"
+ "st1 { v25.b }[0], [x26], #0x1\n"
+ "st1 { v26.b }[0], [x25], #0x1\n"
+ "st1 { v27.b }[0], [x24], #0x1\n"
+ "st1 { v28.b }[0], [x23], #0x1\n"
+ "st1 { v29.b }[0], [x22], #0x1\n"
+ "st1 { v30.b }[0], [x21], #0x1\n"
+ "st1 { v31.b }[0], [x20], #0x1\n"
+ "23:" // Oddments: Store: Bit 1: End
+ "24:" // End
+ : [params] "+&r" (params)
+ : [bias] "r" (qp.bias), [inptrs] "r" (inptrs), [n_channels] "r" ((uint64_t) n_channels), [n_points] "r" ((uint64_t) n_points), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [offsetof_Requantize32_per_layer_left_shift] "I" (offsetof(arm_gemm::Requantize32, per_layer_left_shift)), [offsetof_Requantize32_per_layer_mul] "I" (offsetof(arm_gemm::Requantize32, per_layer_mul)), [offsetof_Requantize32_per_layer_right_shift] "I" (offsetof(arm_gemm::Requantize32, per_layer_right_shift)), [outptrs] "r" (outptrs), [qp] "r" (&qp), [rq_left_shift_ptr] "r" (qp.per_channel_left_shifts), [rq_mul_ptr] "r" (qp.per_channel_muls), [rq_right_shift_ptr] "r" (qp.per_channel_right_shifts)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst.hpp
new file mode 100644
index 0000000000..14adf8880f
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst.hpp
@@ -0,0 +1,61 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+// Raw kernel entry point; implemented in hand-written assembly in the matching
+// generic.cpp.  Arguments: input pointer array, output pointer array, packed
+// parameter blob, number of output channels, requantisation parameters.
+void a64_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst_impl(const int8_t *const *const, int8_t *const *const, const void *, unsigned int, const arm_gemm::Requantize32&);
+
+// Strategy descriptor for the 3x3, stride-2, s8 requantising depthwise kernel
+// with channel multiplier; the kernel emits a 2x4 output tile per invocation.
+struct a64_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst : DepthfirstMultiplierStrategy<int8_t, int8_t, int8_t, int32_t>
+{
+ using Parent = DepthfirstMultiplierStrategy<int8_t, int8_t, int8_t, int32_t>;
+ constexpr static unsigned int kernel_rows = 3;
+ constexpr static unsigned int kernel_cols = 3;
+
+ constexpr static unsigned int stride_rows = 2;
+ constexpr static unsigned int stride_cols = 2;
+
+ a64_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst(const CPUInfo *)
+ : Parent(2, 4, kernel_rows, kernel_cols, stride_rows, stride_cols) // 2 output rows x 4 output columns
+ {
+ }
+
+ // Fixed-width NEON kernel: no scalable-vector (SVE) variant.
+ arm_gemm::VLType get_vl_type() const override { return arm_gemm::VLType::None; }
+
+ Parent::KernelType kernel = a64_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst_impl;
+ Parent::KernelType get_kernel(void) const override { return kernel; }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst/generic.cpp
new file mode 100644
index 0000000000..be8fbfa0e2
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst/generic.cpp
@@ -0,0 +1,519 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+
+#include "arm_gemm.hpp"
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+// Depthwise convolution kernel: 3x3 filter, stride 2, with channel
+// multiplier, int8 in / int8 out with requantisation.  The whole body is one
+// hand-scheduled AArch64 inline-assembly statement; the ".inst" words are
+// directly-encoded SDOT instructions (so the file assembles even when the
+// assembler does not accept the dot-product mnemonics).
+//
+// Parameters:
+//   inptrs            - array of pointers to packed input vectors (the asm
+//                       loads entries at offsets 0x0..0x20).
+//   outptrs           - array of eight output pointers (offsets 0x0..0x30),
+//                       one per element of the 2x4 output tile.
+//   params            - packed bias / weight / requantisation parameter blob,
+//                       consumed sequentially as the kernel advances %[params].
+//   n_output_channels - number of output channels; the main loop handles four
+//                       channels per iteration, the tail stores the remaining
+//                       0-3 channels byte by byte ("Tail: Oddments").
+//   qp                - requantisation parameters; b_offset, c_offset, minval
+//                       and maxval are read via the offsetof() operands below.
+void a64_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst_impl(
+ const int8_t *const *const inptrs,
+ int8_t *const *const outptrs,
+ const void *params,
+ unsigned int n_output_channels,
+ const arm_gemm::Requantize32& qp
+)
+{
+ // NOTE(review): the setup below appears to use SDOT against a constant
+ // byte-mask vector (v8) to form per-lane sums of the input bytes for the
+ // b_offset correction, before entering the requantising main loop —
+ // generated code; do not edit by hand.
+ __asm__ __volatile__(
+ "ldr q11, [%x[params], #0x0]\n"
+ "ldr q5, [%x[params], #0x10]\n"
+ "movi v8.16b, #0x1\n"
+ "ushr v8.4s, v8.4s, #0x8\n"
+ "ldr q6, [%x[params], #0x20]\n"
+ "ldr q7, [%x[params], #0x30]\n"
+ "movi v24.4s, #0x0\n"
+ "movi v25.4s, #0x0\n"
+ "ldr x20, [%x[inptrs], #0x8]\n"
+ "ld1 { v1.16b }, [x20]\n"
+ "mov v28.16b, v1.16b\n"
+ "mov v23.16b, v1.16b\n"
+ "ldr x20, [%x[inptrs], #0x10]\n"
+ "ld1 { v2.16b }, [x20]\n"
+ "mov v30.16b, v1.16b\n"
+ "mov v21.16b, v2.16b\n"
+ "ldr x20, [%x[inptrs], #0x20]\n"
+ "ld1 { v4.16b }, [x20]\n"
+ "mov v20.16b, v2.16b\n"
+ "mov v29.16b, v2.16b\n"
+ "ldr x20, [%x[inptrs], #0x0]\n"
+ "ld1 { v0.16b }, [x20]\n"
+ "mov v9.16b, v4.16b\n"
+ "mov v22.16b, v4.16b\n"
+ "ldr x20, [%x[inptrs], #0x18]\n"
+ "ld1 { v3.16b }, [x20]\n"
+ "mov v31.16b, v4.16b\n"
+ "ext v28.16b, v28.16b, v28.16b, #0x2\n"
+ "ext v23.16b, v23.16b, v23.16b, #0x4\n"
+ "ext v30.16b, v30.16b, v30.16b, #0x6\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_b_offset]\n"
+ "ld1r { v12.4s }, [x20]\n"
+ "ext v21.16b, v21.16b, v21.16b, #0x2\n"
+ "ext v20.16b, v20.16b, v20.16b, #0x4\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_c_offset]\n"
+ "ld1r { v14.4s }, [x20]\n"
+ "ext v29.16b, v29.16b, v29.16b, #0x6\n"
+ "ext v9.16b, v9.16b, v9.16b, #0x2\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_minval]\n"
+ "ld1r { v13.4s }, [x20]\n"
+ "ext v22.16b, v22.16b, v22.16b, #0x4\n"
+ "ext v31.16b, v31.16b, v31.16b, #0x6\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_maxval]\n"
+ "ld1r { v15.4s }, [x20]\n"
+ "mov v27.16b, v0.16b\n"
+ "mov v19.16b, v0.16b\n"
+ "cmp %x[n_channels], #0x4\n"
+ "mov x9, #0x0\n"
+ "mov v18.16b, v0.16b\n"
+ "mov v26.16b, v3.16b\n"
+ "mov x28, #0x0\n"
+ "ldp x27, x26, [%x[outptrs], #0x0]\n"
+ "mov v17.16b, v3.16b\n"
+ "mov v16.16b, v3.16b\n"
+ "ldp x25, x24, [%x[outptrs], #0x10]\n"
+ "ldp x23, x22, [%x[outptrs], #0x20]\n"
+ "ext v27.16b, v27.16b, v27.16b, #0x2\n"
+ "ext v19.16b, v19.16b, v19.16b, #0x4\n"
+ "ldp x21, x20, [%x[outptrs], #0x30]\n"
+ "add %x[params], %x[params], #0x40\n"
+ "ext v18.16b, v18.16b, v18.16b, #0x6\n"
+ "zip1 v1.4s, v1.4s, v23.4s\n"
+ "zip1 v28.4s, v28.4s, v30.4s\n"
+ "zip1 v2.4s, v2.4s, v20.4s\n"
+ "zip1 v21.4s, v21.4s, v29.4s\n"
+ "ext v26.16b, v26.16b, v26.16b, #0x2\n"
+ "ext v17.16b, v17.16b, v17.16b, #0x4\n"
+ "ext v16.16b, v16.16b, v16.16b, #0x6\n"
+ "zip1 v4.4s, v4.4s, v22.4s\n"
+ "zip1 v9.4s, v9.4s, v31.4s\n"
+ "zip1 v0.4s, v0.4s, v19.4s\n"
+ "zip1 v27.4s, v27.4s, v18.4s\n"
+ "zip1 v1.4s, v1.4s, v28.4s\n"
+ "zip1 v2.4s, v2.4s, v21.4s\n"
+ ".inst 0x4f81e118 // sdot v24.4s, v8.16b, v1.4b[0]\n"
+ "zip1 v3.4s, v3.4s, v17.4s\n"
+ "zip1 v26.4s, v26.4s, v16.4s\n"
+ ".inst 0x4fa1e119 // sdot v25.4s, v8.16b, v1.4b[1]\n"
+ "zip1 v4.4s, v4.4s, v9.4s\n"
+ "movi v23.4s, #0x0\n"
+ ".inst 0x4f81e917 // sdot v23.4s, v8.16b, v1.4b[2]\n"
+ "movi v22.4s, #0x0\n"
+ "movi v21.4s, #0x0\n"
+ ".inst 0x4fa1e916 // sdot v22.4s, v8.16b, v1.4b[3]\n"
+ "movi v19.4s, #0x0\n"
+ "movi v9.4s, #0x0\n"
+ ".inst 0x4f82e115 // sdot v21.4s, v8.16b, v2.4b[0]\n"
+ "movi v10.4s, #0x0\n"
+ "movi v20.4s, #0x0\n"
+ ".inst 0x4fa2e113 // sdot v19.4s, v8.16b, v2.4b[1]\n"
+ "movi v18.4s, #0x0\n"
+ "movi v17.4s, #0x0\n"
+ ".inst 0x4f82e909 // sdot v9.4s, v8.16b, v2.4b[2]\n"
+ "movi v16.4s, #0x0\n"
+ "zip1 v0.4s, v0.4s, v27.4s\n"
+ ".inst 0x4fa2e90a // sdot v10.4s, v8.16b, v2.4b[3]\n"
+ "zip1 v3.4s, v3.4s, v26.4s\n"
+ ".inst 0x4f84e114 // sdot v20.4s, v8.16b, v4.4b[0]\n"
+ ".inst 0x4fa4e112 // sdot v18.4s, v8.16b, v4.4b[1]\n"
+ ".inst 0x4f84e911 // sdot v17.4s, v8.16b, v4.4b[2]\n"
+ ".inst 0x4fa4e910 // sdot v16.4s, v8.16b, v4.4b[3]\n"
+ "movi v31.4s, #0x0\n"
+ "movi v30.4s, #0x0\n"
+ "movi v26.4s, #0x0\n"
+ ".inst 0x4f80e11f // sdot v31.4s, v8.16b, v0.4b[0]\n"
+ "movi v27.4s, #0x0\n"
+ "movi v28.4s, #0x0\n"
+ ".inst 0x4fa0e11e // sdot v30.4s, v8.16b, v0.4b[1]\n"
+ "movi v29.4s, #0x0\n"
+ ".inst 0x4f80e91a // sdot v26.4s, v8.16b, v0.4b[2]\n"
+ ".inst 0x4fa0e91b // sdot v27.4s, v8.16b, v0.4b[3]\n"
+ ".inst 0x4f83e11c // sdot v28.4s, v8.16b, v3.4b[0]\n"
+ ".inst 0x4fa3e11d // sdot v29.4s, v8.16b, v3.4b[1]\n"
+ "add v24.4s, v24.4s, v21.4s\n"
+ "add v25.4s, v25.4s, v19.4s\n"
+ "add v23.4s, v23.4s, v9.4s\n"
+ "add v22.4s, v22.4s, v10.4s\n"
+ "add v21.4s, v20.4s, v21.4s\n"
+ "movi v20.4s, #0x0\n"
+ ".inst 0x4f83e914 // sdot v20.4s, v8.16b, v3.4b[2]\n"
+ "add v19.4s, v18.4s, v19.4s\n"
+ "movi v18.4s, #0x0\n"
+ ".inst 0x4fa3e912 // sdot v18.4s, v8.16b, v3.4b[3]\n"
+ "add v17.4s, v17.4s, v9.4s\n"
+ "add v16.4s, v16.4s, v10.4s\n"
+ "add v24.4s, v24.4s, v31.4s\n"
+ "add v25.4s, v25.4s, v30.4s\n"
+ "add v26.4s, v23.4s, v26.4s\n"
+ "add v27.4s, v22.4s, v27.4s\n"
+ "add v28.4s, v21.4s, v28.4s\n"
+ "add v29.4s, v19.4s, v29.4s\n"
+ "add v30.4s, v17.4s, v20.4s\n"
+ "add v31.4s, v16.4s, v18.4s\n"
+ "neg v12.4s, v12.4s\n"
+ "mul v24.4s, v24.4s, v12.4s\n"
+ "mul v25.4s, v25.4s, v12.4s\n"
+ "mul v26.4s, v26.4s, v12.4s\n"
+ "mul v27.4s, v27.4s, v12.4s\n"
+ "mul v28.4s, v28.4s, v12.4s\n"
+ "mul v29.4s, v29.4s, v12.4s\n"
+ "mul v30.4s, v30.4s, v12.4s\n"
+ "mul v31.4s, v31.4s, v12.4s\n"
+ "zip1 v19.4s, v24.4s, v26.4s\n"
+ "zip1 v18.4s, v25.4s, v27.4s\n"
+ "zip1 v17.4s, v28.4s, v30.4s\n"
+ "zip1 v16.4s, v29.4s, v31.4s\n"
+ "zip1 v22.4s, v19.4s, v18.4s\n"
+ "zip1 v23.4s, v17.4s, v16.4s\n"
+ "add v24.4s, v24.4s, v11.4s\n"
+ "add v25.4s, v25.4s, v11.4s\n"
+ "add v26.4s, v26.4s, v11.4s\n"
+ "add v27.4s, v27.4s, v11.4s\n"
+ "add v28.4s, v28.4s, v11.4s\n"
+ "add v29.4s, v29.4s, v11.4s\n"
+ "add v30.4s, v30.4s, v11.4s\n"
+ "add v31.4s, v31.4s, v11.4s\n"
+ "ble 2f\n"
+ "1:" // Loop
+ "ldr q8, [%x[params], #0x0]\n"
+ "ldr q21, [%x[params], #0x10]\n"
+ ".inst 0x4f80e0b8 // sdot v24.4s, v5.16b, v0.4b[0]\n"
+ ".inst 0x4fa0e0b9 // sdot v25.4s, v5.16b, v0.4b[1]\n"
+ "ldr q20, [%x[params], #0x20]\n"
+ ".inst 0x4f80e8ba // sdot v26.4s, v5.16b, v0.4b[2]\n"
+ ".inst 0x4fa0e8bb // sdot v27.4s, v5.16b, v0.4b[3]\n"
+ "sub %x[n_channels], %x[n_channels], #0x4\n"
+ ".inst 0x4f81e0d8 // sdot v24.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4fa1e0d9 // sdot v25.4s, v6.16b, v1.4b[1]\n"
+ "cmp %x[n_channels], #0x4\n"
+ "add x9, x9, #0x10\n"
+ ".inst 0x4f81e8da // sdot v26.4s, v6.16b, v1.4b[2]\n"
+ ".inst 0x4fa1e8db // sdot v27.4s, v6.16b, v1.4b[3]\n"
+ ".inst 0x4f82e0bc // sdot v28.4s, v5.16b, v2.4b[0]\n"
+ ".inst 0x4fa2e0bd // sdot v29.4s, v5.16b, v2.4b[1]\n"
+ ".inst 0x4f82e8be // sdot v30.4s, v5.16b, v2.4b[2]\n"
+ ".inst 0x4fa2e8bf // sdot v31.4s, v5.16b, v2.4b[3]\n"
+ "ldr q5, [%x[params], #0x30]\n"
+ ".inst 0x4f82e0f8 // sdot v24.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x4fa2e0f9 // sdot v25.4s, v7.16b, v2.4b[1]\n"
+ "sqrdmulh v24.4s, v24.4s, v8.4s\n"
+ ".inst 0x4f82e8fa // sdot v26.4s, v7.16b, v2.4b[2]\n"
+ ".inst 0x4fa2e8fb // sdot v27.4s, v7.16b, v2.4b[3]\n"
+ "sqrdmulh v25.4s, v25.4s, v8.4s\n"
+ ".inst 0x4f83e0dc // sdot v28.4s, v6.16b, v3.4b[0]\n"
+ ".inst 0x4fa3e0dd // sdot v29.4s, v6.16b, v3.4b[1]\n"
+ "sqrdmulh v26.4s, v26.4s, v8.4s\n"
+ ".inst 0x4f83e8de // sdot v30.4s, v6.16b, v3.4b[2]\n"
+ ".inst 0x4fa3e8df // sdot v31.4s, v6.16b, v3.4b[3]\n"
+ "ldr q6, [%x[params], #0x40]\n"
+ "sqrdmulh v27.4s, v27.4s, v8.4s\n"
+ ".inst 0x4f84e0fc // sdot v28.4s, v7.16b, v4.4b[0]\n"
+ ".inst 0x4fa4e0fd // sdot v29.4s, v7.16b, v4.4b[1]\n"
+ "and v19.16b, v24.16b, v21.16b\n"
+ ".inst 0x4f84e8fe // sdot v30.4s, v7.16b, v4.4b[2]\n"
+ ".inst 0x4fa4e8ff // sdot v31.4s, v7.16b, v4.4b[3]\n"
+ "ldr q7, [%x[params], #0x50]\n"
+ "and v18.16b, v25.16b, v21.16b\n"
+ "and v17.16b, v26.16b, v21.16b\n"
+ "and v16.16b, v27.16b, v21.16b\n"
+ "add %x[params], %x[params], #0x60\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqrdmulh v28.4s, v28.4s, v8.4s\n"
+ "sqrdmulh v29.4s, v29.4s, v8.4s\n"
+ "sqrdmulh v30.4s, v30.4s, v8.4s\n"
+ "sqrdmulh v31.4s, v31.4s, v8.4s\n"
+ "sqadd v24.4s, v24.4s, v19.4s\n"
+ "sqadd v25.4s, v25.4s, v18.4s\n"
+ "sqadd v26.4s, v26.4s, v17.4s\n"
+ "sqadd v27.4s, v27.4s, v16.4s\n"
+ "and v19.16b, v28.16b, v21.16b\n"
+ "and v18.16b, v29.16b, v21.16b\n"
+ "and v17.16b, v30.16b, v21.16b\n"
+ "and v16.16b, v31.16b, v21.16b\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqadd v28.4s, v28.4s, v19.4s\n"
+ "sqadd v29.4s, v29.4s, v18.4s\n"
+ "sqadd v30.4s, v30.4s, v17.4s\n"
+ "sqadd v31.4s, v31.4s, v16.4s\n"
+ "srshl v24.4s, v24.4s, v21.4s\n"
+ "srshl v25.4s, v25.4s, v21.4s\n"
+ "srshl v26.4s, v26.4s, v21.4s\n"
+ "srshl v27.4s, v27.4s, v21.4s\n"
+ "srshl v28.4s, v28.4s, v21.4s\n"
+ "srshl v29.4s, v29.4s, v21.4s\n"
+ "srshl v30.4s, v30.4s, v21.4s\n"
+ "srshl v31.4s, v31.4s, v21.4s\n"
+ "add v24.4s, v24.4s, v14.4s\n"
+ "add v25.4s, v25.4s, v14.4s\n"
+ "add v26.4s, v26.4s, v14.4s\n"
+ "add v27.4s, v27.4s, v14.4s\n"
+ "add v28.4s, v28.4s, v14.4s\n"
+ "add v29.4s, v29.4s, v14.4s\n"
+ "add v30.4s, v30.4s, v14.4s\n"
+ "add v31.4s, v31.4s, v14.4s\n"
+ "smin v24.4s, v24.4s, v15.4s\n"
+ "smin v25.4s, v25.4s, v15.4s\n"
+ "smin v26.4s, v26.4s, v15.4s\n"
+ "smin v27.4s, v27.4s, v15.4s\n"
+ "smin v28.4s, v28.4s, v15.4s\n"
+ "smin v29.4s, v29.4s, v15.4s\n"
+ "smin v30.4s, v30.4s, v15.4s\n"
+ "smin v31.4s, v31.4s, v15.4s\n"
+ "smax v24.4s, v24.4s, v13.4s\n"
+ "smax v25.4s, v25.4s, v13.4s\n"
+ "smax v26.4s, v26.4s, v13.4s\n"
+ "smax v27.4s, v27.4s, v13.4s\n"
+ "smax v28.4s, v28.4s, v13.4s\n"
+ "smax v29.4s, v29.4s, v13.4s\n"
+ "smax v30.4s, v30.4s, v13.4s\n"
+ "smax v31.4s, v31.4s, v13.4s\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "str s24, [x27, x28]\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "str s25, [x26, x28]\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "str s26, [x25, x28]\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "str s27, [x24, x28]\n"
+ "str s28, [x23, x28]\n"
+ "dup v24.4s, v22.s[0]\n"
+ "dup v25.4s, v22.s[1]\n"
+ "str s29, [x22, x28]\n"
+ "dup v26.4s, v22.s[2]\n"
+ "dup v27.4s, v22.s[3]\n"
+ "str s30, [x21, x28]\n"
+ "dup v28.4s, v23.s[0]\n"
+ "dup v29.4s, v23.s[1]\n"
+ "str s31, [x20, x28]\n"
+ "dup v30.4s, v23.s[2]\n"
+ "dup v31.4s, v23.s[3]\n"
+ "add x28, x28, #0x4\n"
+ "add v24.4s, v24.4s, v20.4s\n"
+ "add v25.4s, v25.4s, v20.4s\n"
+ "add v26.4s, v26.4s, v20.4s\n"
+ "add v27.4s, v27.4s, v20.4s\n"
+ "add v28.4s, v28.4s, v20.4s\n"
+ "add v29.4s, v29.4s, v20.4s\n"
+ "add v30.4s, v30.4s, v20.4s\n"
+ "add v31.4s, v31.4s, v20.4s\n"
+ "bgt 1b\n"
+ "2:" // Tail
+ "ldr q21, [%x[params], #0x0]\n"
+ "ldr q20, [%x[params], #0x10]\n"
+ ".inst 0x4f80e0b8 // sdot v24.4s, v5.16b, v0.4b[0]\n"
+ ".inst 0x4fa0e0b9 // sdot v25.4s, v5.16b, v0.4b[1]\n"
+ ".inst 0x4f80e8ba // sdot v26.4s, v5.16b, v0.4b[2]\n"
+ ".inst 0x4fa0e8bb // sdot v27.4s, v5.16b, v0.4b[3]\n"
+ "cmp %x[n_channels], #0x4\n"
+ "add x27, x27, x28\n"
+ ".inst 0x4f81e0d8 // sdot v24.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4fa1e0d9 // sdot v25.4s, v6.16b, v1.4b[1]\n"
+ "add x26, x26, x28\n"
+ "add x25, x25, x28\n"
+ ".inst 0x4f81e8da // sdot v26.4s, v6.16b, v1.4b[2]\n"
+ ".inst 0x4fa1e8db // sdot v27.4s, v6.16b, v1.4b[3]\n"
+ "add x24, x24, x28\n"
+ "add x23, x23, x28\n"
+ ".inst 0x4f82e0bc // sdot v28.4s, v5.16b, v2.4b[0]\n"
+ ".inst 0x4fa2e0bd // sdot v29.4s, v5.16b, v2.4b[1]\n"
+ "add x22, x22, x28\n"
+ "add x21, x21, x28\n"
+ ".inst 0x4f82e8be // sdot v30.4s, v5.16b, v2.4b[2]\n"
+ ".inst 0x4fa2e8bf // sdot v31.4s, v5.16b, v2.4b[3]\n"
+ "add x20, x20, x28\n"
+ "add %x[params], %x[params], #0x20\n"
+ ".inst 0x4f82e0f8 // sdot v24.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x4fa2e0f9 // sdot v25.4s, v7.16b, v2.4b[1]\n"
+ "sqrdmulh v24.4s, v24.4s, v21.4s\n"
+ ".inst 0x4f82e8fa // sdot v26.4s, v7.16b, v2.4b[2]\n"
+ ".inst 0x4fa2e8fb // sdot v27.4s, v7.16b, v2.4b[3]\n"
+ "sqrdmulh v25.4s, v25.4s, v21.4s\n"
+ ".inst 0x4f83e0dc // sdot v28.4s, v6.16b, v3.4b[0]\n"
+ ".inst 0x4fa3e0dd // sdot v29.4s, v6.16b, v3.4b[1]\n"
+ "sqrdmulh v26.4s, v26.4s, v21.4s\n"
+ ".inst 0x4f83e8de // sdot v30.4s, v6.16b, v3.4b[2]\n"
+ ".inst 0x4fa3e8df // sdot v31.4s, v6.16b, v3.4b[3]\n"
+ "sqrdmulh v27.4s, v27.4s, v21.4s\n"
+ ".inst 0x4f84e0fc // sdot v28.4s, v7.16b, v4.4b[0]\n"
+ ".inst 0x4fa4e0fd // sdot v29.4s, v7.16b, v4.4b[1]\n"
+ "and v19.16b, v24.16b, v20.16b\n"
+ ".inst 0x4f84e8fe // sdot v30.4s, v7.16b, v4.4b[2]\n"
+ ".inst 0x4fa4e8ff // sdot v31.4s, v7.16b, v4.4b[3]\n"
+ "and v18.16b, v25.16b, v20.16b\n"
+ "and v17.16b, v26.16b, v20.16b\n"
+ "and v16.16b, v27.16b, v20.16b\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqrdmulh v28.4s, v28.4s, v21.4s\n"
+ "sqrdmulh v29.4s, v29.4s, v21.4s\n"
+ "sqrdmulh v30.4s, v30.4s, v21.4s\n"
+ "sqrdmulh v31.4s, v31.4s, v21.4s\n"
+ "sqadd v24.4s, v24.4s, v19.4s\n"
+ "sqadd v25.4s, v25.4s, v18.4s\n"
+ "sqadd v26.4s, v26.4s, v17.4s\n"
+ "sqadd v27.4s, v27.4s, v16.4s\n"
+ "and v19.16b, v28.16b, v20.16b\n"
+ "and v18.16b, v29.16b, v20.16b\n"
+ "and v17.16b, v30.16b, v20.16b\n"
+ "and v16.16b, v31.16b, v20.16b\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqadd v28.4s, v28.4s, v19.4s\n"
+ "sqadd v29.4s, v29.4s, v18.4s\n"
+ "sqadd v30.4s, v30.4s, v17.4s\n"
+ "sqadd v31.4s, v31.4s, v16.4s\n"
+ "srshl v24.4s, v24.4s, v20.4s\n"
+ "srshl v25.4s, v25.4s, v20.4s\n"
+ "srshl v26.4s, v26.4s, v20.4s\n"
+ "srshl v27.4s, v27.4s, v20.4s\n"
+ "srshl v28.4s, v28.4s, v20.4s\n"
+ "srshl v29.4s, v29.4s, v20.4s\n"
+ "srshl v30.4s, v30.4s, v20.4s\n"
+ "srshl v31.4s, v31.4s, v20.4s\n"
+ "add v24.4s, v24.4s, v14.4s\n"
+ "add v25.4s, v25.4s, v14.4s\n"
+ "add v26.4s, v26.4s, v14.4s\n"
+ "add v27.4s, v27.4s, v14.4s\n"
+ "add v28.4s, v28.4s, v14.4s\n"
+ "add v29.4s, v29.4s, v14.4s\n"
+ "add v30.4s, v30.4s, v14.4s\n"
+ "add v31.4s, v31.4s, v14.4s\n"
+ "smin v24.4s, v24.4s, v15.4s\n"
+ "smin v25.4s, v25.4s, v15.4s\n"
+ "smin v26.4s, v26.4s, v15.4s\n"
+ "smin v27.4s, v27.4s, v15.4s\n"
+ "smin v28.4s, v28.4s, v15.4s\n"
+ "smin v29.4s, v29.4s, v15.4s\n"
+ "smin v30.4s, v30.4s, v15.4s\n"
+ "smin v31.4s, v31.4s, v15.4s\n"
+ "smax v24.4s, v24.4s, v13.4s\n"
+ "smax v25.4s, v25.4s, v13.4s\n"
+ "smax v26.4s, v26.4s, v13.4s\n"
+ "smax v27.4s, v27.4s, v13.4s\n"
+ "smax v28.4s, v28.4s, v13.4s\n"
+ "smax v29.4s, v29.4s, v13.4s\n"
+ "smax v30.4s, v30.4s, v13.4s\n"
+ "smax v31.4s, v31.4s, v13.4s\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "blt 3f\n"
+ "str s24, [x27, #0x0]\n"
+ "str s25, [x26, #0x0]\n"
+ "str s26, [x25, #0x0]\n"
+ "str s27, [x24, #0x0]\n"
+ "str s28, [x23, #0x0]\n"
+ "str s29, [x22, #0x0]\n"
+ "str s30, [x21, #0x0]\n"
+ "str s31, [x20, #0x0]\n"
+ "b 4f\n"
+ "3:" // Tail: Oddments
+ "subs %x[n_channels], %x[n_channels], #0x1\n"
+ "st1 { v24.b }[0], [x27], #0x1\n"
+ "st1 { v25.b }[0], [x26], #0x1\n"
+ "st1 { v26.b }[0], [x25], #0x1\n"
+ "st1 { v27.b }[0], [x24], #0x1\n"
+ "st1 { v28.b }[0], [x23], #0x1\n"
+ "st1 { v29.b }[0], [x22], #0x1\n"
+ "st1 { v30.b }[0], [x21], #0x1\n"
+ "st1 { v31.b }[0], [x20], #0x1\n"
+ "beq 4f\n"
+ "subs %x[n_channels], %x[n_channels], #0x1\n"
+ "st1 { v24.b }[1], [x27], #0x1\n"
+ "st1 { v25.b }[1], [x26], #0x1\n"
+ "st1 { v26.b }[1], [x25], #0x1\n"
+ "st1 { v27.b }[1], [x24], #0x1\n"
+ "st1 { v28.b }[1], [x23], #0x1\n"
+ "st1 { v29.b }[1], [x22], #0x1\n"
+ "st1 { v30.b }[1], [x21], #0x1\n"
+ "st1 { v31.b }[1], [x20], #0x1\n"
+ "beq 4f\n"
+ "subs %x[n_channels], %x[n_channels], #0x1\n"
+ "st1 { v24.b }[2], [x27], #0x1\n"
+ "st1 { v25.b }[2], [x26], #0x1\n"
+ "st1 { v26.b }[2], [x25], #0x1\n"
+ "st1 { v27.b }[2], [x24], #0x1\n"
+ "st1 { v28.b }[2], [x23], #0x1\n"
+ "st1 { v29.b }[2], [x22], #0x1\n"
+ "st1 { v30.b }[2], [x21], #0x1\n"
+ "st1 { v31.b }[2], [x20], #0x1\n"
+ "beq 4f\n"
+ "st1 { v24.b }[3], [x27], #0x1\n"
+ "subs %x[n_channels], %x[n_channels], #0x1\n"
+ "st1 { v25.b }[3], [x26], #0x1\n"
+ "st1 { v26.b }[3], [x25], #0x1\n"
+ "st1 { v27.b }[3], [x24], #0x1\n"
+ "st1 { v28.b }[3], [x23], #0x1\n"
+ "st1 { v29.b }[3], [x22], #0x1\n"
+ "st1 { v30.b }[3], [x21], #0x1\n"
+ "st1 { v31.b }[3], [x20], #0x1\n"
+ "4:" // Tail: End
+ : [n_channels] "+&r" (n_output_channels), [params] "+&r" (params)
+ : [inptrs] "r" (inptrs), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [outptrs] "r" (outptrs), [qp] "r" (&qp)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst.hpp
new file mode 100644
index 0000000000..62b033f48d
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst.hpp
@@ -0,0 +1,61 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+// Raw kernel entry point; implemented in hand-written assembly in the matching
+// generic.cpp.  Arguments: input pointer array, output pointer array, packed
+// parameter blob, number of output channels, requantisation parameters.
+void a64_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst_impl(const int8_t *const *const, int8_t *const *const, const void *, unsigned int, const arm_gemm::Requantize32&);
+
+// Strategy descriptor for the 5x5, stride-1, s8 requantising depthwise kernel
+// with channel multiplier; the kernel emits a 4x2 output tile per invocation.
+struct a64_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst : DepthfirstMultiplierStrategy<int8_t, int8_t, int8_t, int32_t>
+{
+ using Parent = DepthfirstMultiplierStrategy<int8_t, int8_t, int8_t, int32_t>;
+ constexpr static unsigned int kernel_rows = 5;
+ constexpr static unsigned int kernel_cols = 5;
+
+ constexpr static unsigned int stride_rows = 1;
+ constexpr static unsigned int stride_cols = 1;
+
+ a64_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst(const CPUInfo *)
+ : Parent(4, 2, kernel_rows, kernel_cols, stride_rows, stride_cols) // 4 output rows x 2 output columns
+ {
+ }
+
+ // Fixed-width NEON kernel: no scalable-vector (SVE) variant.
+ arm_gemm::VLType get_vl_type() const override { return arm_gemm::VLType::None; }
+
+ Parent::KernelType kernel = a64_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst_impl;
+ Parent::KernelType get_kernel(void) const override { return kernel; }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst/generic.cpp
new file mode 100644
index 0000000000..17afc92e30
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst/generic.cpp
@@ -0,0 +1,640 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+
+#include "arm_gemm.hpp"
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+// Hand-scheduled AArch64 NEON implementation of the 5x5/stride-1 int8
+// quantised depthwise kernel (SDOT variant, 4x2 output tile).
+//
+//   inptrs            - pointers to the eight packed input rows (read via
+//                       offsets 0x0..0x38 from %x[inptrs])
+//   outptrs           - pointers to the eight output positions of the tile
+//   params            - packed parameter blob walked linearly: presumably
+//                       bias, interleaved weights, then per-block
+//                       requantisation multiplier/shift (TODO confirm layout
+//                       against the matching interleave routine)
+//   n_output_channels - channel count; processed four at a time with a
+//                       byte-wise oddment tail
+//   qp                - supplies b_offset, c_offset, minval, maxval
+//
+// NOTE(review): the assembly below is auto-generated and its instruction
+// order is load/latency-scheduled — do not hand-edit individual lines.
+void a64_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst_impl(
+  const int8_t *const *const inptrs,
+  int8_t *const *const outptrs,
+  const void *params,
+  unsigned int n_output_channels,
+  const arm_gemm::Requantize32& qp
+)
+{
+  __asm__ __volatile__(
+    // Preamble: load the leading params block, gather/interleave the eight
+    // input rows, and compute per-position input sums with SDOT against
+    // vectors of 1 (v30/v28) for the input-offset correction below.
+    "ldr q12, [%x[params], #0x0]\n"
+    "ldr q8, [%x[params], #0x10]\n"
+    "movi v30.16b, #0x1\n"
+    "movi v17.4s, #0x0\n"
+    "ldr q9, [%x[params], #0x20]\n"
+    "ldr q10, [%x[params], #0x30]\n"
+    "movi v16.4s, #0x0\n"
+    "movi v25.4s, #0x0\n"
+    "ldr q11, [%x[params], #0x40]\n"
+    "ldr x20, [%x[inptrs], #0x18]\n"
+    "movi v24.4s, #0x0\n"
+    "movi v31.4s, #0x0\n"
+    "ld1 { v3.16b }, [x20]\n"
+    "ldr x20, [%x[inptrs], #0x20]\n"
+    "mov v26.16b, v3.16b\n"
+    "ext v26.16b, v26.16b, v26.16b, #0x1\n"
+    "ld1 { v4.16b }, [x20]\n"
+    "ldr x20, [%x[inptrs], #0x10]\n"
+    "mov v21.16b, v4.16b\n"
+    "ext v21.16b, v21.16b, v21.16b, #0x1\n"
+    "ld1 { v2.16b }, [x20]\n"
+    "ldr x20, [%x[inptrs], #0x8]\n"
+    "mov v27.16b, v2.16b\n"
+    "ext v27.16b, v27.16b, v27.16b, #0x1\n"
+    "ld1 { v1.16b }, [x20]\n"
+    "ldr x20, [%x[inptrs], #0x28]\n"
+    "zip1 v3.2d, v3.2d, v26.2d\n"
+    "zip1 v4.2d, v4.2d, v21.2d\n"
+    "ld1 { v5.16b }, [x20]\n"
+    "ldr x20, [%x[inptrs], #0x30]\n"
+    "mov v26.16b, v1.16b\n"
+    "mov v22.16b, v5.16b\n"
+    "ld1 { v6.16b }, [x20]\n"
+    "ldr x20, [%x[inptrs], #0x38]\n"
+    "mov v19.16b, v6.16b\n"
+    "ext v26.16b, v26.16b, v26.16b, #0x1\n"
+    "ld1 { v7.16b }, [x20]\n"
+    "ldr x20, [%x[inptrs], #0x0]\n"
+    "mov v21.16b, v7.16b\n"
+    "zip1 v2.2d, v2.2d, v27.2d\n"
+    "ld1 { v0.16b }, [x20]\n"
+    "ext v22.16b, v22.16b, v22.16b, #0x1\n"
+    "ext v19.16b, v19.16b, v19.16b, #0x1\n"
+    ".inst 0x4f83e3d1 // sdot v17.4s, v30.16b, v3.4b[0]\n"
+    "ext v21.16b, v21.16b, v21.16b, #0x1\n"
+    ".inst 0x4f83ebd0 // sdot v16.4s, v30.16b, v3.4b[2]\n"
+    ".inst 0x4f84e3d9 // sdot v25.4s, v30.16b, v4.4b[0]\n"
+    "add x20, %x[qp], %[offsetof_Requantize32_b_offset]\n"
+    "ld1r { v23.4s }, [x20]\n"
+    ".inst 0x4f84ebd8 // sdot v24.4s, v30.16b, v4.4b[2]\n"
+    "mov v18.16b, v0.16b\n"
+    ".inst 0x4f82e3df // sdot v31.4s, v30.16b, v2.4b[0]\n"
+    "movi v29.4s, #0x0\n"
+    "movi v28.4s, #0x1\n"
+    ".inst 0x4f82ebdd // sdot v29.4s, v30.16b, v2.4b[2]\n"
+    "add x20, %x[qp], %[offsetof_Requantize32_c_offset]\n"
+    "ld1r { v13.4s }, [x20]\n"
+    "ext v18.16b, v18.16b, v18.16b, #0x1\n"
+    "zip1 v1.2d, v1.2d, v26.2d\n"
+    ".inst 0x4fa3e391 // sdot v17.4s, v28.16b, v3.4b[1]\n"
+    "zip1 v5.2d, v5.2d, v22.2d\n"
+    "zip1 v6.2d, v6.2d, v19.2d\n"
+    ".inst 0x4fa3eb90 // sdot v16.4s, v28.16b, v3.4b[3]\n"
+    "add x20, %x[qp], %[offsetof_Requantize32_minval]\n"
+    "ld1r { v14.4s }, [x20]\n"
+    "zip1 v7.2d, v7.2d, v21.2d\n"
+    "movi v22.4s, #0x0\n"
+    ".inst 0x4fa4e399 // sdot v25.4s, v28.16b, v4.4b[1]\n"
+    "movi v21.4s, #0x0\n"
+    ".inst 0x4fa4eb98 // sdot v24.4s, v28.16b, v4.4b[3]\n"
+    ".inst 0x4f81e3d6 // sdot v22.4s, v30.16b, v1.4b[0]\n"
+    "add x20, %x[qp], %[offsetof_Requantize32_maxval]\n"
+    "ld1r { v15.4s }, [x20]\n"
+    "movi v26.4s, #0x0\n"
+    "movi v27.4s, #0x0\n"
+    ".inst 0x4f81ebd5 // sdot v21.4s, v30.16b, v1.4b[2]\n"
+    "movi v20.4s, #0x0\n"
+    "movi v19.4s, #0x0\n"
+    ".inst 0x4f85e3da // sdot v26.4s, v30.16b, v5.4b[0]\n"
+    "cmp %x[n_channels], #0x4\n"
+    "zip1 v0.2d, v0.2d, v18.2d\n"
+    "movi v18.4s, #0x0\n"
+    ".inst 0x4f85ebdb // sdot v27.4s, v30.16b, v5.4b[2]\n"
+    "mov x9, #0x0\n"
+    ".inst 0x4f86e3d4 // sdot v20.4s, v30.16b, v6.4b[0]\n"
+    ".inst 0x4f86ebd3 // sdot v19.4s, v30.16b, v6.4b[2]\n"
+    "add v17.4s, v17.4s, v25.4s\n"
+    "mov x28, #0x0\n"
+    "movi v25.4s, #0x0\n"
+    ".inst 0x4f87e3d2 // sdot v18.4s, v30.16b, v7.4b[0]\n"
+    ".inst 0x4f87ebd9 // sdot v25.4s, v30.16b, v7.4b[2]\n"
+    "ldp x27, x26, [%x[outptrs], #0x0]\n"
+    ".inst 0x4fa2e39f // sdot v31.4s, v28.16b, v2.4b[1]\n"
+    ".inst 0x4fa2eb9d // sdot v29.4s, v28.16b, v2.4b[3]\n"
+    "add v16.4s, v16.4s, v24.4s\n"
+    "ldp x25, x24, [%x[outptrs], #0x10]\n"
+    "movi v24.4s, #0x0\n"
+    ".inst 0x4f80e3d8 // sdot v24.4s, v30.16b, v0.4b[0]\n"
+    ".inst 0x4fa1e396 // sdot v22.4s, v28.16b, v1.4b[1]\n"
+    "ldp x23, x22, [%x[outptrs], #0x20]\n"
+    ".inst 0x4fa1eb95 // sdot v21.4s, v28.16b, v1.4b[3]\n"
+    ".inst 0x4fa5e39a // sdot v26.4s, v28.16b, v5.4b[1]\n"
+    "add v31.4s, v31.4s, v17.4s\n"
+    "ldp x21, x20, [%x[outptrs], #0x30]\n"
+    ".inst 0x4fa5eb9b // sdot v27.4s, v28.16b, v5.4b[3]\n"
+    ".inst 0x4fa6e394 // sdot v20.4s, v28.16b, v6.4b[1]\n"
+    "add v29.4s, v29.4s, v16.4s\n"
+    "add %x[params], %x[params], #0x50\n"
+    ".inst 0x4fa6eb93 // sdot v19.4s, v28.16b, v6.4b[3]\n"
+    ".inst 0x4fa7e392 // sdot v18.4s, v28.16b, v7.4b[1]\n"
+    "add v22.4s, v22.4s, v31.4s\n"
+    ".inst 0x4fa7eb99 // sdot v25.4s, v28.16b, v7.4b[3]\n"
+    ".inst 0x4fa0e398 // sdot v24.4s, v28.16b, v0.4b[1]\n"
+    "add v21.4s, v21.4s, v29.4s\n"
+    "add v20.4s, v26.4s, v20.4s\n"
+    "add v19.4s, v27.4s, v19.4s\n"
+    "add v18.4s, v18.4s, v17.4s\n"
+    "movi v17.4s, #0x0\n"
+    ".inst 0x4f80ebd1 // sdot v17.4s, v30.16b, v0.4b[2]\n"
+    ".inst 0x4fa0eb91 // sdot v17.4s, v28.16b, v0.4b[3]\n"
+    "add v16.4s, v25.4s, v16.4s\n"
+    "add v24.4s, v22.4s, v24.4s\n"
+    "add v25.4s, v21.4s, v17.4s\n"
+    "add v26.4s, v26.4s, v22.4s\n"
+    "add v27.4s, v27.4s, v21.4s\n"
+    "add v28.4s, v20.4s, v31.4s\n"
+    "add v29.4s, v19.4s, v29.4s\n"
+    "add v30.4s, v20.4s, v18.4s\n"
+    "add v31.4s, v19.4s, v16.4s\n"
+    // Input-offset correction: negate b_offset (v23) and fold
+    // -b_offset * sum(inputs) into each accumulator, then add the first
+    // params block (v12 — presumably the bias; TODO confirm blob layout).
+    "neg v23.4s, v23.4s\n"
+    "mul v24.4s, v24.4s, v23.4s\n"
+    "mul v25.4s, v25.4s, v23.4s\n"
+    "mul v26.4s, v26.4s, v23.4s\n"
+    "mul v27.4s, v27.4s, v23.4s\n"
+    "mul v28.4s, v28.4s, v23.4s\n"
+    "mul v29.4s, v29.4s, v23.4s\n"
+    "mul v30.4s, v30.4s, v23.4s\n"
+    "mul v31.4s, v31.4s, v23.4s\n"
+    "zip1 v19.4s, v24.4s, v26.4s\n"
+    "zip1 v18.4s, v25.4s, v27.4s\n"
+    "zip1 v17.4s, v28.4s, v30.4s\n"
+    "zip1 v16.4s, v29.4s, v31.4s\n"
+    "zip1 v22.4s, v19.4s, v18.4s\n"
+    "zip1 v23.4s, v17.4s, v16.4s\n"
+    "add v24.4s, v24.4s, v12.4s\n"
+    "add v25.4s, v25.4s, v12.4s\n"
+    "add v26.4s, v26.4s, v12.4s\n"
+    "add v27.4s, v27.4s, v12.4s\n"
+    "add v28.4s, v28.4s, v12.4s\n"
+    "add v29.4s, v29.4s, v12.4s\n"
+    "add v30.4s, v30.4s, v12.4s\n"
+    "add v31.4s, v31.4s, v12.4s\n"
+    "ble 2f\n"
+    // Main loop: four channels per iteration. Accumulate 25 SDOT
+    // multiply-adds per output, requantise (sqrdmulh + rounding-shift
+    // srshl with sign-correcting sqadd), add c_offset (v13), clamp to
+    // [minval v14, maxval v15], narrow with uzp1 and store 4 bytes per
+    // output pointer; then re-seed the accumulators from the saved sums.
+    "1:" // Loop
+    "ldr q12, [%x[params], #0x60]\n"
+    "ldr q21, [%x[params], #0x70]\n"
+    ".inst 0x4f80e118 // sdot v24.4s, v8.16b, v0.4b[0]\n"
+    ".inst 0x4f80e919 // sdot v25.4s, v8.16b, v0.4b[2]\n"
+    "ldr q20, [%x[params], #0x80]\n"
+    ".inst 0x4f81e11a // sdot v26.4s, v8.16b, v1.4b[0]\n"
+    ".inst 0x4f81e91b // sdot v27.4s, v8.16b, v1.4b[2]\n"
+    "sub %x[n_channels], %x[n_channels], #0x4\n"
+    ".inst 0x4fa0e138 // sdot v24.4s, v9.16b, v0.4b[1]\n"
+    ".inst 0x4fa0e939 // sdot v25.4s, v9.16b, v0.4b[3]\n"
+    "cmp %x[n_channels], #0x4\n"
+    "add x9, x9, #0x10\n"
+    ".inst 0x4fa1e13a // sdot v26.4s, v9.16b, v1.4b[1]\n"
+    ".inst 0x4fa1e93b // sdot v27.4s, v9.16b, v1.4b[3]\n"
+    ".inst 0x4f82e11c // sdot v28.4s, v8.16b, v2.4b[0]\n"
+    ".inst 0x4f82e91d // sdot v29.4s, v8.16b, v2.4b[2]\n"
+    ".inst 0x4f83e11e // sdot v30.4s, v8.16b, v3.4b[0]\n"
+    ".inst 0x4f83e91f // sdot v31.4s, v8.16b, v3.4b[2]\n"
+    "ldr q17, [%x[params], #0x0]\n"
+    ".inst 0x4f81e158 // sdot v24.4s, v10.16b, v1.4b[0]\n"
+    ".inst 0x4f81e959 // sdot v25.4s, v10.16b, v1.4b[2]\n"
+    ".inst 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
+    ".inst 0x4f82e95b // sdot v27.4s, v10.16b, v2.4b[2]\n"
+    ".inst 0x4fa2e13c // sdot v28.4s, v9.16b, v2.4b[1]\n"
+    ".inst 0x4fa2e93d // sdot v29.4s, v9.16b, v2.4b[3]\n"
+    ".inst 0x4fa3e13e // sdot v30.4s, v9.16b, v3.4b[1]\n"
+    ".inst 0x4fa3e93f // sdot v31.4s, v9.16b, v3.4b[3]\n"
+    "ldr q16, [%x[params], #0x10]\n"
+    ".inst 0x4fa1e178 // sdot v24.4s, v11.16b, v1.4b[1]\n"
+    ".inst 0x4fa1e979 // sdot v25.4s, v11.16b, v1.4b[3]\n"
+    ".inst 0x4fa2e17a // sdot v26.4s, v11.16b, v2.4b[1]\n"
+    ".inst 0x4fa2e97b // sdot v27.4s, v11.16b, v2.4b[3]\n"
+    ".inst 0x4f83e15c // sdot v28.4s, v10.16b, v3.4b[0]\n"
+    ".inst 0x4f83e95d // sdot v29.4s, v10.16b, v3.4b[2]\n"
+    ".inst 0x4f84e15e // sdot v30.4s, v10.16b, v4.4b[0]\n"
+    ".inst 0x4f84e95f // sdot v31.4s, v10.16b, v4.4b[2]\n"
+    "ldr q19, [%x[params], #0x20]\n"
+    ".inst 0x4f82e238 // sdot v24.4s, v17.16b, v2.4b[0]\n"
+    ".inst 0x4f82ea39 // sdot v25.4s, v17.16b, v2.4b[2]\n"
+    ".inst 0x4f83e23a // sdot v26.4s, v17.16b, v3.4b[0]\n"
+    ".inst 0x4f83ea3b // sdot v27.4s, v17.16b, v3.4b[2]\n"
+    ".inst 0x4fa3e17c // sdot v28.4s, v11.16b, v3.4b[1]\n"
+    ".inst 0x4fa3e97d // sdot v29.4s, v11.16b, v3.4b[3]\n"
+    ".inst 0x4fa4e17e // sdot v30.4s, v11.16b, v4.4b[1]\n"
+    ".inst 0x4fa4e97f // sdot v31.4s, v11.16b, v4.4b[3]\n"
+    "ldr q18, [%x[params], #0x30]\n"
+    ".inst 0x4fa2e218 // sdot v24.4s, v16.16b, v2.4b[1]\n"
+    ".inst 0x4fa2ea19 // sdot v25.4s, v16.16b, v2.4b[3]\n"
+    ".inst 0x4fa3e21a // sdot v26.4s, v16.16b, v3.4b[1]\n"
+    ".inst 0x4fa3ea1b // sdot v27.4s, v16.16b, v3.4b[3]\n"
+    ".inst 0x4f84e23c // sdot v28.4s, v17.16b, v4.4b[0]\n"
+    ".inst 0x4f84ea3d // sdot v29.4s, v17.16b, v4.4b[2]\n"
+    ".inst 0x4f85e23e // sdot v30.4s, v17.16b, v5.4b[0]\n"
+    ".inst 0x4f85ea3f // sdot v31.4s, v17.16b, v5.4b[2]\n"
+    "ldr q17, [%x[params], #0x40]\n"
+    ".inst 0x4f83e278 // sdot v24.4s, v19.16b, v3.4b[0]\n"
+    ".inst 0x4f83ea79 // sdot v25.4s, v19.16b, v3.4b[2]\n"
+    ".inst 0x4f84e27a // sdot v26.4s, v19.16b, v4.4b[0]\n"
+    ".inst 0x4f84ea7b // sdot v27.4s, v19.16b, v4.4b[2]\n"
+    ".inst 0x4fa4e21c // sdot v28.4s, v16.16b, v4.4b[1]\n"
+    ".inst 0x4fa4ea1d // sdot v29.4s, v16.16b, v4.4b[3]\n"
+    ".inst 0x4fa5e21e // sdot v30.4s, v16.16b, v5.4b[1]\n"
+    ".inst 0x4fa5ea1f // sdot v31.4s, v16.16b, v5.4b[3]\n"
+    "ldr q16, [%x[params], #0x50]\n"
+    ".inst 0x4fa3e258 // sdot v24.4s, v18.16b, v3.4b[1]\n"
+    ".inst 0x4fa3ea59 // sdot v25.4s, v18.16b, v3.4b[3]\n"
+    ".inst 0x4fa4e25a // sdot v26.4s, v18.16b, v4.4b[1]\n"
+    ".inst 0x4fa4ea5b // sdot v27.4s, v18.16b, v4.4b[3]\n"
+    ".inst 0x4f85e27c // sdot v28.4s, v19.16b, v5.4b[0]\n"
+    ".inst 0x4f85ea7d // sdot v29.4s, v19.16b, v5.4b[2]\n"
+    ".inst 0x4f86e27e // sdot v30.4s, v19.16b, v6.4b[0]\n"
+    ".inst 0x4f86ea7f // sdot v31.4s, v19.16b, v6.4b[2]\n"
+    "ldr q10, [%x[params], #0xb0]\n"
+    ".inst 0x4f84e238 // sdot v24.4s, v17.16b, v4.4b[0]\n"
+    ".inst 0x4f84ea39 // sdot v25.4s, v17.16b, v4.4b[2]\n"
+    ".inst 0x4f85e23a // sdot v26.4s, v17.16b, v5.4b[0]\n"
+    ".inst 0x4f85ea3b // sdot v27.4s, v17.16b, v5.4b[2]\n"
+    ".inst 0x4fa5e25c // sdot v28.4s, v18.16b, v5.4b[1]\n"
+    ".inst 0x4fa5ea5d // sdot v29.4s, v18.16b, v5.4b[3]\n"
+    ".inst 0x4fa6e25e // sdot v30.4s, v18.16b, v6.4b[1]\n"
+    ".inst 0x4fa6ea5f // sdot v31.4s, v18.16b, v6.4b[3]\n"
+    "ldr q11, [%x[params], #0xc0]\n"
+    ".inst 0x4fa4e218 // sdot v24.4s, v16.16b, v4.4b[1]\n"
+    ".inst 0x4fa4ea19 // sdot v25.4s, v16.16b, v4.4b[3]\n"
+    "sqrdmulh v24.4s, v24.4s, v12.4s\n"
+    ".inst 0x4fa5e21a // sdot v26.4s, v16.16b, v5.4b[1]\n"
+    ".inst 0x4fa5ea1b // sdot v27.4s, v16.16b, v5.4b[3]\n"
+    "sqrdmulh v25.4s, v25.4s, v12.4s\n"
+    ".inst 0x4f86e23c // sdot v28.4s, v17.16b, v6.4b[0]\n"
+    ".inst 0x4f86ea3d // sdot v29.4s, v17.16b, v6.4b[2]\n"
+    "sqrdmulh v26.4s, v26.4s, v12.4s\n"
+    ".inst 0x4f87e23e // sdot v30.4s, v17.16b, v7.4b[0]\n"
+    ".inst 0x4f87ea3f // sdot v31.4s, v17.16b, v7.4b[2]\n"
+    "ldr q8, [%x[params], #0x90]\n"
+    "sqrdmulh v27.4s, v27.4s, v12.4s\n"
+    ".inst 0x4fa6e21c // sdot v28.4s, v16.16b, v6.4b[1]\n"
+    ".inst 0x4fa6ea1d // sdot v29.4s, v16.16b, v6.4b[3]\n"
+    "and v19.16b, v24.16b, v21.16b\n"
+    ".inst 0x4fa7e21e // sdot v30.4s, v16.16b, v7.4b[1]\n"
+    ".inst 0x4fa7ea1f // sdot v31.4s, v16.16b, v7.4b[3]\n"
+    "ldr q9, [%x[params], #0xa0]\n"
+    "and v18.16b, v25.16b, v21.16b\n"
+    "and v17.16b, v26.16b, v21.16b\n"
+    "and v16.16b, v27.16b, v21.16b\n"
+    "add %x[params], %x[params], #0xd0\n"
+    "sshr v19.4s, v19.4s, #0x1f\n"
+    "sshr v18.4s, v18.4s, #0x1f\n"
+    "sshr v17.4s, v17.4s, #0x1f\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "sqrdmulh v28.4s, v28.4s, v12.4s\n"
+    "sqrdmulh v29.4s, v29.4s, v12.4s\n"
+    "sqrdmulh v30.4s, v30.4s, v12.4s\n"
+    "sqrdmulh v31.4s, v31.4s, v12.4s\n"
+    "sqadd v24.4s, v24.4s, v19.4s\n"
+    "sqadd v25.4s, v25.4s, v18.4s\n"
+    "sqadd v26.4s, v26.4s, v17.4s\n"
+    "sqadd v27.4s, v27.4s, v16.4s\n"
+    "and v19.16b, v28.16b, v21.16b\n"
+    "and v18.16b, v29.16b, v21.16b\n"
+    "and v17.16b, v30.16b, v21.16b\n"
+    "and v16.16b, v31.16b, v21.16b\n"
+    "sshr v19.4s, v19.4s, #0x1f\n"
+    "sshr v18.4s, v18.4s, #0x1f\n"
+    "sshr v17.4s, v17.4s, #0x1f\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "sqadd v28.4s, v28.4s, v19.4s\n"
+    "sqadd v29.4s, v29.4s, v18.4s\n"
+    "sqadd v30.4s, v30.4s, v17.4s\n"
+    "sqadd v31.4s, v31.4s, v16.4s\n"
+    "srshl v24.4s, v24.4s, v21.4s\n"
+    "srshl v25.4s, v25.4s, v21.4s\n"
+    "srshl v26.4s, v26.4s, v21.4s\n"
+    "srshl v27.4s, v27.4s, v21.4s\n"
+    "srshl v28.4s, v28.4s, v21.4s\n"
+    "srshl v29.4s, v29.4s, v21.4s\n"
+    "srshl v30.4s, v30.4s, v21.4s\n"
+    "srshl v31.4s, v31.4s, v21.4s\n"
+    "add v24.4s, v24.4s, v13.4s\n"
+    "add v25.4s, v25.4s, v13.4s\n"
+    "add v26.4s, v26.4s, v13.4s\n"
+    "add v27.4s, v27.4s, v13.4s\n"
+    "add v28.4s, v28.4s, v13.4s\n"
+    "add v29.4s, v29.4s, v13.4s\n"
+    "add v30.4s, v30.4s, v13.4s\n"
+    "add v31.4s, v31.4s, v13.4s\n"
+    "smin v24.4s, v24.4s, v15.4s\n"
+    "smin v25.4s, v25.4s, v15.4s\n"
+    "smin v26.4s, v26.4s, v15.4s\n"
+    "smin v27.4s, v27.4s, v15.4s\n"
+    "smin v28.4s, v28.4s, v15.4s\n"
+    "smin v29.4s, v29.4s, v15.4s\n"
+    "smin v30.4s, v30.4s, v15.4s\n"
+    "smin v31.4s, v31.4s, v15.4s\n"
+    "smax v24.4s, v24.4s, v14.4s\n"
+    "smax v25.4s, v25.4s, v14.4s\n"
+    "smax v26.4s, v26.4s, v14.4s\n"
+    "smax v27.4s, v27.4s, v14.4s\n"
+    "smax v28.4s, v28.4s, v14.4s\n"
+    "smax v29.4s, v29.4s, v14.4s\n"
+    "smax v30.4s, v30.4s, v14.4s\n"
+    "smax v31.4s, v31.4s, v14.4s\n"
+    "uzp1 v24.16b, v24.16b, v24.16b\n"
+    "uzp1 v25.16b, v25.16b, v25.16b\n"
+    "uzp1 v26.16b, v26.16b, v26.16b\n"
+    "uzp1 v27.16b, v27.16b, v27.16b\n"
+    "uzp1 v28.16b, v28.16b, v28.16b\n"
+    "uzp1 v29.16b, v29.16b, v29.16b\n"
+    "uzp1 v30.16b, v30.16b, v30.16b\n"
+    "uzp1 v31.16b, v31.16b, v31.16b\n"
+    "uzp1 v24.16b, v24.16b, v24.16b\n"
+    "uzp1 v25.16b, v25.16b, v25.16b\n"
+    "str s24, [x27, x28]\n"
+    "uzp1 v26.16b, v26.16b, v26.16b\n"
+    "uzp1 v27.16b, v27.16b, v27.16b\n"
+    "str s25, [x26, x28]\n"
+    "uzp1 v28.16b, v28.16b, v28.16b\n"
+    "uzp1 v29.16b, v29.16b, v29.16b\n"
+    "str s26, [x25, x28]\n"
+    "uzp1 v30.16b, v30.16b, v30.16b\n"
+    "uzp1 v31.16b, v31.16b, v31.16b\n"
+    "str s27, [x24, x28]\n"
+    "str s28, [x23, x28]\n"
+    "dup v24.4s, v22.s[0]\n"
+    "dup v25.4s, v22.s[1]\n"
+    "str s29, [x22, x28]\n"
+    "dup v26.4s, v22.s[2]\n"
+    "dup v27.4s, v22.s[3]\n"
+    "str s30, [x21, x28]\n"
+    "dup v28.4s, v23.s[0]\n"
+    "dup v29.4s, v23.s[1]\n"
+    "str s31, [x20, x28]\n"
+    "dup v30.4s, v23.s[2]\n"
+    "dup v31.4s, v23.s[3]\n"
+    "add x28, x28, #0x4\n"
+    "add v24.4s, v24.4s, v20.4s\n"
+    "add v25.4s, v25.4s, v20.4s\n"
+    "add v26.4s, v26.4s, v20.4s\n"
+    "add v27.4s, v27.4s, v20.4s\n"
+    "add v28.4s, v28.4s, v20.4s\n"
+    "add v29.4s, v29.4s, v20.4s\n"
+    "add v30.4s, v30.4s, v20.4s\n"
+    "add v31.4s, v31.4s, v20.4s\n"
+    "bgt 1b\n"
+    // Tail: same MAC + requantise sequence for the final (<= 4-channel)
+    // block; output pointers are advanced to their absolute positions.
+    "2:" // Tail
+    "ldr q21, [%x[params], #0x60]\n"
+    "ldr q20, [%x[params], #0x70]\n"
+    ".inst 0x4f80e118 // sdot v24.4s, v8.16b, v0.4b[0]\n"
+    ".inst 0x4f80e919 // sdot v25.4s, v8.16b, v0.4b[2]\n"
+    ".inst 0x4f81e11a // sdot v26.4s, v8.16b, v1.4b[0]\n"
+    ".inst 0x4f81e91b // sdot v27.4s, v8.16b, v1.4b[2]\n"
+    "cmp %x[n_channels], #0x4\n"
+    "add x27, x27, x28\n"
+    ".inst 0x4fa0e138 // sdot v24.4s, v9.16b, v0.4b[1]\n"
+    ".inst 0x4fa0e939 // sdot v25.4s, v9.16b, v0.4b[3]\n"
+    "add x26, x26, x28\n"
+    "add x25, x25, x28\n"
+    ".inst 0x4fa1e13a // sdot v26.4s, v9.16b, v1.4b[1]\n"
+    ".inst 0x4fa1e93b // sdot v27.4s, v9.16b, v1.4b[3]\n"
+    "add x24, x24, x28\n"
+    "add x23, x23, x28\n"
+    ".inst 0x4f82e11c // sdot v28.4s, v8.16b, v2.4b[0]\n"
+    ".inst 0x4f82e91d // sdot v29.4s, v8.16b, v2.4b[2]\n"
+    "add x22, x22, x28\n"
+    "add x21, x21, x28\n"
+    ".inst 0x4f83e11e // sdot v30.4s, v8.16b, v3.4b[0]\n"
+    ".inst 0x4f83e91f // sdot v31.4s, v8.16b, v3.4b[2]\n"
+    "ldr q17, [%x[params], #0x0]\n"
+    "add x20, x20, x28\n"
+    ".inst 0x4f81e158 // sdot v24.4s, v10.16b, v1.4b[0]\n"
+    ".inst 0x4f81e959 // sdot v25.4s, v10.16b, v1.4b[2]\n"
+    ".inst 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
+    ".inst 0x4f82e95b // sdot v27.4s, v10.16b, v2.4b[2]\n"
+    ".inst 0x4fa2e13c // sdot v28.4s, v9.16b, v2.4b[1]\n"
+    ".inst 0x4fa2e93d // sdot v29.4s, v9.16b, v2.4b[3]\n"
+    ".inst 0x4fa3e13e // sdot v30.4s, v9.16b, v3.4b[1]\n"
+    ".inst 0x4fa3e93f // sdot v31.4s, v9.16b, v3.4b[3]\n"
+    "ldr q16, [%x[params], #0x10]\n"
+    ".inst 0x4fa1e178 // sdot v24.4s, v11.16b, v1.4b[1]\n"
+    ".inst 0x4fa1e979 // sdot v25.4s, v11.16b, v1.4b[3]\n"
+    ".inst 0x4fa2e17a // sdot v26.4s, v11.16b, v2.4b[1]\n"
+    ".inst 0x4fa2e97b // sdot v27.4s, v11.16b, v2.4b[3]\n"
+    ".inst 0x4f83e15c // sdot v28.4s, v10.16b, v3.4b[0]\n"
+    ".inst 0x4f83e95d // sdot v29.4s, v10.16b, v3.4b[2]\n"
+    ".inst 0x4f84e15e // sdot v30.4s, v10.16b, v4.4b[0]\n"
+    ".inst 0x4f84e95f // sdot v31.4s, v10.16b, v4.4b[2]\n"
+    "ldr q19, [%x[params], #0x20]\n"
+    ".inst 0x4f82e238 // sdot v24.4s, v17.16b, v2.4b[0]\n"
+    ".inst 0x4f82ea39 // sdot v25.4s, v17.16b, v2.4b[2]\n"
+    ".inst 0x4f83e23a // sdot v26.4s, v17.16b, v3.4b[0]\n"
+    ".inst 0x4f83ea3b // sdot v27.4s, v17.16b, v3.4b[2]\n"
+    ".inst 0x4fa3e17c // sdot v28.4s, v11.16b, v3.4b[1]\n"
+    ".inst 0x4fa3e97d // sdot v29.4s, v11.16b, v3.4b[3]\n"
+    ".inst 0x4fa4e17e // sdot v30.4s, v11.16b, v4.4b[1]\n"
+    ".inst 0x4fa4e97f // sdot v31.4s, v11.16b, v4.4b[3]\n"
+    "ldr q18, [%x[params], #0x30]\n"
+    ".inst 0x4fa2e218 // sdot v24.4s, v16.16b, v2.4b[1]\n"
+    ".inst 0x4fa2ea19 // sdot v25.4s, v16.16b, v2.4b[3]\n"
+    ".inst 0x4fa3e21a // sdot v26.4s, v16.16b, v3.4b[1]\n"
+    ".inst 0x4fa3ea1b // sdot v27.4s, v16.16b, v3.4b[3]\n"
+    ".inst 0x4f84e23c // sdot v28.4s, v17.16b, v4.4b[0]\n"
+    ".inst 0x4f84ea3d // sdot v29.4s, v17.16b, v4.4b[2]\n"
+    ".inst 0x4f85e23e // sdot v30.4s, v17.16b, v5.4b[0]\n"
+    ".inst 0x4f85ea3f // sdot v31.4s, v17.16b, v5.4b[2]\n"
+    "ldr q17, [%x[params], #0x40]\n"
+    ".inst 0x4f83e278 // sdot v24.4s, v19.16b, v3.4b[0]\n"
+    ".inst 0x4f83ea79 // sdot v25.4s, v19.16b, v3.4b[2]\n"
+    ".inst 0x4f84e27a // sdot v26.4s, v19.16b, v4.4b[0]\n"
+    ".inst 0x4f84ea7b // sdot v27.4s, v19.16b, v4.4b[2]\n"
+    ".inst 0x4fa4e21c // sdot v28.4s, v16.16b, v4.4b[1]\n"
+    ".inst 0x4fa4ea1d // sdot v29.4s, v16.16b, v4.4b[3]\n"
+    ".inst 0x4fa5e21e // sdot v30.4s, v16.16b, v5.4b[1]\n"
+    ".inst 0x4fa5ea1f // sdot v31.4s, v16.16b, v5.4b[3]\n"
+    "ldr q16, [%x[params], #0x50]\n"
+    "add %x[params], %x[params], #0x80\n"
+    ".inst 0x4fa3e258 // sdot v24.4s, v18.16b, v3.4b[1]\n"
+    ".inst 0x4fa3ea59 // sdot v25.4s, v18.16b, v3.4b[3]\n"
+    ".inst 0x4fa4e25a // sdot v26.4s, v18.16b, v4.4b[1]\n"
+    ".inst 0x4fa4ea5b // sdot v27.4s, v18.16b, v4.4b[3]\n"
+    ".inst 0x4f85e27c // sdot v28.4s, v19.16b, v5.4b[0]\n"
+    ".inst 0x4f85ea7d // sdot v29.4s, v19.16b, v5.4b[2]\n"
+    ".inst 0x4f86e27e // sdot v30.4s, v19.16b, v6.4b[0]\n"
+    ".inst 0x4f86ea7f // sdot v31.4s, v19.16b, v6.4b[2]\n"
+    ".inst 0x4f84e238 // sdot v24.4s, v17.16b, v4.4b[0]\n"
+    ".inst 0x4f84ea39 // sdot v25.4s, v17.16b, v4.4b[2]\n"
+    ".inst 0x4f85e23a // sdot v26.4s, v17.16b, v5.4b[0]\n"
+    ".inst 0x4f85ea3b // sdot v27.4s, v17.16b, v5.4b[2]\n"
+    ".inst 0x4fa5e25c // sdot v28.4s, v18.16b, v5.4b[1]\n"
+    ".inst 0x4fa5ea5d // sdot v29.4s, v18.16b, v5.4b[3]\n"
+    ".inst 0x4fa6e25e // sdot v30.4s, v18.16b, v6.4b[1]\n"
+    ".inst 0x4fa6ea5f // sdot v31.4s, v18.16b, v6.4b[3]\n"
+    ".inst 0x4fa4e218 // sdot v24.4s, v16.16b, v4.4b[1]\n"
+    ".inst 0x4fa4ea19 // sdot v25.4s, v16.16b, v4.4b[3]\n"
+    "sqrdmulh v24.4s, v24.4s, v21.4s\n"
+    ".inst 0x4fa5e21a // sdot v26.4s, v16.16b, v5.4b[1]\n"
+    ".inst 0x4fa5ea1b // sdot v27.4s, v16.16b, v5.4b[3]\n"
+    "sqrdmulh v25.4s, v25.4s, v21.4s\n"
+    ".inst 0x4f86e23c // sdot v28.4s, v17.16b, v6.4b[0]\n"
+    ".inst 0x4f86ea3d // sdot v29.4s, v17.16b, v6.4b[2]\n"
+    "sqrdmulh v26.4s, v26.4s, v21.4s\n"
+    ".inst 0x4f87e23e // sdot v30.4s, v17.16b, v7.4b[0]\n"
+    ".inst 0x4f87ea3f // sdot v31.4s, v17.16b, v7.4b[2]\n"
+    "sqrdmulh v27.4s, v27.4s, v21.4s\n"
+    ".inst 0x4fa6e21c // sdot v28.4s, v16.16b, v6.4b[1]\n"
+    ".inst 0x4fa6ea1d // sdot v29.4s, v16.16b, v6.4b[3]\n"
+    "and v19.16b, v24.16b, v20.16b\n"
+    ".inst 0x4fa7e21e // sdot v30.4s, v16.16b, v7.4b[1]\n"
+    ".inst 0x4fa7ea1f // sdot v31.4s, v16.16b, v7.4b[3]\n"
+    "and v18.16b, v25.16b, v20.16b\n"
+    "and v17.16b, v26.16b, v20.16b\n"
+    "and v16.16b, v27.16b, v20.16b\n"
+    "sshr v19.4s, v19.4s, #0x1f\n"
+    "sshr v18.4s, v18.4s, #0x1f\n"
+    "sshr v17.4s, v17.4s, #0x1f\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "sqrdmulh v28.4s, v28.4s, v21.4s\n"
+    "sqrdmulh v29.4s, v29.4s, v21.4s\n"
+    "sqrdmulh v30.4s, v30.4s, v21.4s\n"
+    "sqrdmulh v31.4s, v31.4s, v21.4s\n"
+    "sqadd v24.4s, v24.4s, v19.4s\n"
+    "sqadd v25.4s, v25.4s, v18.4s\n"
+    "sqadd v26.4s, v26.4s, v17.4s\n"
+    "sqadd v27.4s, v27.4s, v16.4s\n"
+    "and v19.16b, v28.16b, v20.16b\n"
+    "and v18.16b, v29.16b, v20.16b\n"
+    "and v17.16b, v30.16b, v20.16b\n"
+    "and v16.16b, v31.16b, v20.16b\n"
+    "sshr v19.4s, v19.4s, #0x1f\n"
+    "sshr v18.4s, v18.4s, #0x1f\n"
+    "sshr v17.4s, v17.4s, #0x1f\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "sqadd v28.4s, v28.4s, v19.4s\n"
+    "sqadd v29.4s, v29.4s, v18.4s\n"
+    "sqadd v30.4s, v30.4s, v17.4s\n"
+    "sqadd v31.4s, v31.4s, v16.4s\n"
+    "srshl v24.4s, v24.4s, v20.4s\n"
+    "srshl v25.4s, v25.4s, v20.4s\n"
+    "srshl v26.4s, v26.4s, v20.4s\n"
+    "srshl v27.4s, v27.4s, v20.4s\n"
+    "srshl v28.4s, v28.4s, v20.4s\n"
+    "srshl v29.4s, v29.4s, v20.4s\n"
+    "srshl v30.4s, v30.4s, v20.4s\n"
+    "srshl v31.4s, v31.4s, v20.4s\n"
+    "add v24.4s, v24.4s, v13.4s\n"
+    "add v25.4s, v25.4s, v13.4s\n"
+    "add v26.4s, v26.4s, v13.4s\n"
+    "add v27.4s, v27.4s, v13.4s\n"
+    "add v28.4s, v28.4s, v13.4s\n"
+    "add v29.4s, v29.4s, v13.4s\n"
+    "add v30.4s, v30.4s, v13.4s\n"
+    "add v31.4s, v31.4s, v13.4s\n"
+    "smin v24.4s, v24.4s, v15.4s\n"
+    "smin v25.4s, v25.4s, v15.4s\n"
+    "smin v26.4s, v26.4s, v15.4s\n"
+    "smin v27.4s, v27.4s, v15.4s\n"
+    "smin v28.4s, v28.4s, v15.4s\n"
+    "smin v29.4s, v29.4s, v15.4s\n"
+    "smin v30.4s, v30.4s, v15.4s\n"
+    "smin v31.4s, v31.4s, v15.4s\n"
+    "smax v24.4s, v24.4s, v14.4s\n"
+    "smax v25.4s, v25.4s, v14.4s\n"
+    "smax v26.4s, v26.4s, v14.4s\n"
+    "smax v27.4s, v27.4s, v14.4s\n"
+    "smax v28.4s, v28.4s, v14.4s\n"
+    "smax v29.4s, v29.4s, v14.4s\n"
+    "smax v30.4s, v30.4s, v14.4s\n"
+    "smax v31.4s, v31.4s, v14.4s\n"
+    "uzp1 v24.16b, v24.16b, v24.16b\n"
+    "uzp1 v25.16b, v25.16b, v25.16b\n"
+    "uzp1 v26.16b, v26.16b, v26.16b\n"
+    "uzp1 v27.16b, v27.16b, v27.16b\n"
+    "uzp1 v28.16b, v28.16b, v28.16b\n"
+    "uzp1 v29.16b, v29.16b, v29.16b\n"
+    "uzp1 v30.16b, v30.16b, v30.16b\n"
+    "uzp1 v31.16b, v31.16b, v31.16b\n"
+    "uzp1 v24.16b, v24.16b, v24.16b\n"
+    "uzp1 v25.16b, v25.16b, v25.16b\n"
+    "uzp1 v26.16b, v26.16b, v26.16b\n"
+    "uzp1 v27.16b, v27.16b, v27.16b\n"
+    "uzp1 v28.16b, v28.16b, v28.16b\n"
+    "uzp1 v29.16b, v29.16b, v29.16b\n"
+    "uzp1 v30.16b, v30.16b, v30.16b\n"
+    "uzp1 v31.16b, v31.16b, v31.16b\n"
+    "blt 3f\n"
+    "str s24, [x27, #0x0]\n"
+    "str s25, [x26, #0x0]\n"
+    "str s26, [x25, #0x0]\n"
+    "str s27, [x24, #0x0]\n"
+    "str s28, [x23, #0x0]\n"
+    "str s29, [x22, #0x0]\n"
+    "str s30, [x21, #0x0]\n"
+    "str s31, [x20, #0x0]\n"
+    "b 4f\n"
+    // Oddments: fewer than four channels remain — store the surviving
+    // result bytes one lane at a time to each output pointer.
+    "3:" // Tail: Oddments
+    "subs %x[n_channels], %x[n_channels], #0x1\n"
+    "st1 { v24.b }[0], [x27], #0x1\n"
+    "st1 { v25.b }[0], [x26], #0x1\n"
+    "st1 { v26.b }[0], [x25], #0x1\n"
+    "st1 { v27.b }[0], [x24], #0x1\n"
+    "st1 { v28.b }[0], [x23], #0x1\n"
+    "st1 { v29.b }[0], [x22], #0x1\n"
+    "st1 { v30.b }[0], [x21], #0x1\n"
+    "st1 { v31.b }[0], [x20], #0x1\n"
+    "beq 4f\n"
+    "subs %x[n_channels], %x[n_channels], #0x1\n"
+    "st1 { v24.b }[1], [x27], #0x1\n"
+    "st1 { v25.b }[1], [x26], #0x1\n"
+    "st1 { v26.b }[1], [x25], #0x1\n"
+    "st1 { v27.b }[1], [x24], #0x1\n"
+    "st1 { v28.b }[1], [x23], #0x1\n"
+    "st1 { v29.b }[1], [x22], #0x1\n"
+    "st1 { v30.b }[1], [x21], #0x1\n"
+    "st1 { v31.b }[1], [x20], #0x1\n"
+    "beq 4f\n"
+    "subs %x[n_channels], %x[n_channels], #0x1\n"
+    "st1 { v24.b }[2], [x27], #0x1\n"
+    "st1 { v25.b }[2], [x26], #0x1\n"
+    "st1 { v26.b }[2], [x25], #0x1\n"
+    "st1 { v27.b }[2], [x24], #0x1\n"
+    "st1 { v28.b }[2], [x23], #0x1\n"
+    "st1 { v29.b }[2], [x22], #0x1\n"
+    "st1 { v30.b }[2], [x21], #0x1\n"
+    "st1 { v31.b }[2], [x20], #0x1\n"
+    "beq 4f\n"
+    "st1 { v24.b }[3], [x27], #0x1\n"
+    "subs %x[n_channels], %x[n_channels], #0x1\n"
+    "st1 { v25.b }[3], [x26], #0x1\n"
+    "st1 { v26.b }[3], [x25], #0x1\n"
+    "st1 { v27.b }[3], [x24], #0x1\n"
+    "st1 { v28.b }[3], [x23], #0x1\n"
+    "st1 { v29.b }[3], [x22], #0x1\n"
+    "st1 { v30.b }[3], [x21], #0x1\n"
+    "st1 { v31.b }[3], [x20], #0x1\n"
+    "4:" // Tail: End
+    : [n_channels] "+&r" (n_output_channels), [params] "+&r" (params)
+    : [inptrs] "r" (inptrs), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [outptrs] "r" (outptrs), [qp] "r" (&qp)
+    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+  );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp
new file mode 100644
index 0000000000..3f71c5fb64
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp
@@ -0,0 +1,52 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+// Raw kernel entry point (defined in the accompanying generic.cpp).
+// Generic (runtime kernel-point count) int8 quantised depthwise kernel with
+// channel multiplier; takes input/output pointer arrays, weights, bias,
+// kernel-point count, output-channel count, per-channel left shifts /
+// multipliers / right shifts, and the requantisation parameters.
+void a64_s8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_impl(const int8_t *const *const, int8_t *const *const, const int8_t *, const int32_t *, const unsigned int, const unsigned int, const int32_t *, const int32_t *, const int32_t *, const arm_gemm::Requantize32&);
+
+// Strategy descriptor: MLA-based fallback for arbitrary kernel sizes,
+// producing a 2-row x 8-column output tile per invocation.
+struct a64_s8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst : GenericDepthfirstMultiplierKernelStrategy<int8_t, int8_t, int8_t, int32_t>
+{
+  using Parent = GenericDepthfirstMultiplierKernelStrategy<int8_t, int8_t, int8_t, int32_t>;
+  // CPUInfo is accepted for interface uniformity but unused; 2x8 tile,
+  // fixed-width NEON (no SVE vector-length dependence).
+  a64_s8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst(const CPUInfo *)
+  : Parent(2, 8, arm_gemm::VLType::None)
+  {
+  }
+  Parent::KernelType kernel = a64_s8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_impl;
+  Parent::KernelType get_kernel(void) const override { return kernel; }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp
new file mode 100644
index 0000000000..b21ad484e5
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp
@@ -0,0 +1,1480 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+
+#include "arm_gemm.hpp"
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_s8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_impl(
+ const int8_t *const *const inptrs,
+ int8_t *const *const outptrs,
+ const int8_t *weights,
+ const int32_t *bias,
+ const unsigned int kernel_points,
+ const unsigned int n_output_channels,
+ const int32_t *per_channel_left_shifts,
+ const int32_t *per_channel_muls,
+ const int32_t *per_channel_right_shifts,
+ const arm_gemm::Requantize32& qp
+)
+{
+ __asm__ __volatile__(
+ "lsr x10, %x[n_output_channels], #0x2\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_minval]\n"
+ "ld1r { v15.4s }, [x20]\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_maxval]\n"
+ "ld1r { v14.4s }, [x20]\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_a_offset]\n"
+ "ld1r { v13.16b }, [x20]\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_b_offset]\n"
+ "ld1r { v12.16b }, [x20]\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_c_offset]\n"
+ "ld1r { v11.4s }, [x20]\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_per_layer_left_shift]\n"
+ "ld1r { v10.4s }, [x20]\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_per_layer_mul]\n"
+ "ld1r { v9.4s }, [x20]\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_per_layer_right_shift]\n"
+ "ld1r { v8.4s }, [x20]\n"
+ "mov x9, #0x0\n"
+ "cbz x10, 9f\n"
+ "1:" // Output channel loop
+ "movi v31.4s, #0x0\n"
+ "cbz %x[bias], 2f\n"
+ "lsl x20, x9, #0x2\n"
+ "ldr q31, [%x[bias], x20]\n"
+ "2:" // Output channel loop: Load bias: Done
+ "mov v16.16b, v31.16b\n"
+ "mov v17.16b, v31.16b\n"
+ "mov v18.16b, v31.16b\n"
+ "mov v19.16b, v31.16b\n"
+ "mov v20.16b, v31.16b\n"
+ "mov v21.16b, v31.16b\n"
+ "mov v22.16b, v31.16b\n"
+ "mov v23.16b, v31.16b\n"
+ "mov v24.16b, v31.16b\n"
+ "mov v25.16b, v31.16b\n"
+ "mov v26.16b, v31.16b\n"
+ "mov v27.16b, v31.16b\n"
+ "mov v28.16b, v31.16b\n"
+ "mov v29.16b, v31.16b\n"
+ "mov v30.16b, v31.16b\n"
+ "mov v31.16b, v31.16b\n"
+ "cbz %x[rq_mul_ptr], 3f\n"
+ "lsl x20, x9, #0x2\n"
+ "ldr q9, [%x[rq_mul_ptr], x20]\n"
+ "ldr q8, [%x[rq_right_shift_ptr], x20]\n"
+ "cbz %x[rq_left_shift_ptr], 3f\n"
+ "ldr q10, [%x[rq_left_shift_ptr], x20]\n"
+ "3:" // Output channel loop: Load quantization parameters: Done
+ "ldr s5, [%x[weights]], #0x4\n"
+ "mov x22, %x[inptrs]\n"
+ "ldp x21, x20, [x22], #0x10\n"
+ "lsr x23, %x[kernel_points], #0x1\n"
+ "ldr d0, [x21, #0x0]\n"
+ "ldr d4, [x20, #0x0]\n"
+ "ssubl v0.8h, v0.8b, v13.8b\n"
+ "ssubl v4.8h, v4.8b, v13.8b\n"
+ "ssubl v5.8h, v5.8b, v12.8b\n"
+ "cbz x23, 7f\n"
+ "ldr s7, [%x[weights]], #0x4\n"
+ "ldp x21, x20, [x22], #0x10\n"
+ "subs x23, x23, #0x1\n"
+ "ssubl v7.8h, v7.8b, v12.8b\n"
+ "ldr d3, [x21, #0x0]\n"
+ "ldr d6, [x20, #0x0]\n"
+ "ssubl v3.8h, v3.8b, v13.8b\n"
+ "ssubl v6.8h, v6.8b, v13.8b\n"
+ "beq 5f\n"
+ "4:" // Output channel loop: Kernel loop
+ "ldp x21, x20, [x22], #0x10\n"
+ "smlal v16.4s, v5.4h, v0.h[0]\n"
+ "smlal v17.4s, v5.4h, v0.h[1]\n"
+ "subs x23, x23, #0x1\n"
+ "smlal v18.4s, v5.4h, v0.h[2]\n"
+ "smlal v19.4s, v5.4h, v0.h[3]\n"
+ "smlal v20.4s, v5.4h, v0.h[4]\n"
+ "smlal v21.4s, v5.4h, v0.h[5]\n"
+ "smlal v22.4s, v5.4h, v0.h[6]\n"
+ "smlal v23.4s, v5.4h, v0.h[7]\n"
+ "ldr d0, [x21, #0x0]\n"
+ "ssubl v0.8h, v0.8b, v13.8b\n"
+ "smlal v24.4s, v5.4h, v4.h[0]\n"
+ "smlal v25.4s, v5.4h, v4.h[1]\n"
+ "smlal v26.4s, v5.4h, v4.h[2]\n"
+ "smlal v27.4s, v5.4h, v4.h[3]\n"
+ "smlal v28.4s, v5.4h, v4.h[4]\n"
+ "smlal v29.4s, v5.4h, v4.h[5]\n"
+ "smlal v30.4s, v5.4h, v4.h[6]\n"
+ "smlal v31.4s, v5.4h, v4.h[7]\n"
+ "ldr d4, [x20, #0x0]\n"
+ "ldr s5, [%x[weights]], #0x4\n"
+ "ldp x21, x20, [x22], #0x10\n"
+ "smlal v16.4s, v7.4h, v3.h[0]\n"
+ "smlal v17.4s, v7.4h, v3.h[1]\n"
+ "ssubl v4.8h, v4.8b, v13.8b\n"
+ "smlal v18.4s, v7.4h, v3.h[2]\n"
+ "smlal v19.4s, v7.4h, v3.h[3]\n"
+ "ssubl v5.8h, v5.8b, v12.8b\n"
+ "smlal v20.4s, v7.4h, v3.h[4]\n"
+ "smlal v21.4s, v7.4h, v3.h[5]\n"
+ "smlal v22.4s, v7.4h, v3.h[6]\n"
+ "smlal v23.4s, v7.4h, v3.h[7]\n"
+ "ldr d3, [x21, #0x0]\n"
+ "ssubl v3.8h, v3.8b, v13.8b\n"
+ "smlal v24.4s, v7.4h, v6.h[0]\n"
+ "smlal v25.4s, v7.4h, v6.h[1]\n"
+ "smlal v26.4s, v7.4h, v6.h[2]\n"
+ "smlal v27.4s, v7.4h, v6.h[3]\n"
+ "smlal v28.4s, v7.4h, v6.h[4]\n"
+ "smlal v29.4s, v7.4h, v6.h[5]\n"
+ "smlal v30.4s, v7.4h, v6.h[6]\n"
+ "smlal v31.4s, v7.4h, v6.h[7]\n"
+ "ldr d6, [x20, #0x0]\n"
+ "ldr s7, [%x[weights]], #0x4\n"
+ "ssubl v6.8h, v6.8b, v13.8b\n"
+ "ssubl v7.8h, v7.8b, v12.8b\n"
+ "bgt 4b\n"
+ "5:" // Output channel loop: Kernel loop tail
+ "tbnz %x[kernel_points], #0, 6f\n"
+ "smlal v16.4s, v5.4h, v0.h[0]\n"
+ "smlal v17.4s, v5.4h, v0.h[1]\n"
+ "ldr x27, [%x[outptrs], #0x0]\n"
+ "ldr x26, [%x[outptrs], #0x8]\n"
+ "smlal v18.4s, v5.4h, v0.h[2]\n"
+ "smlal v19.4s, v5.4h, v0.h[3]\n"
+ "ldr x25, [%x[outptrs], #0x10]\n"
+ "ldr x24, [%x[outptrs], #0x18]\n"
+ "smlal v16.4s, v7.4h, v3.h[0]\n"
+ "smlal v17.4s, v7.4h, v3.h[1]\n"
+ "sshl v16.4s, v16.4s, v10.4s\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "smlal v18.4s, v7.4h, v3.h[2]\n"
+ "smlal v19.4s, v7.4h, v3.h[3]\n"
+ "sshl v17.4s, v17.4s, v10.4s\n"
+ "ldr x22, [%x[outptrs], #0x28]\n"
+ "smlal v20.4s, v5.4h, v0.h[4]\n"
+ "smlal v21.4s, v5.4h, v0.h[5]\n"
+ "sshl v18.4s, v18.4s, v10.4s\n"
+ "ldr x21, [%x[outptrs], #0x30]\n"
+ "smlal v22.4s, v5.4h, v0.h[6]\n"
+ "smlal v23.4s, v5.4h, v0.h[7]\n"
+ "sshl v19.4s, v19.4s, v10.4s\n"
+ "ldr x20, [%x[outptrs], #0x38]\n"
+ "smlal v24.4s, v5.4h, v4.h[0]\n"
+ "smlal v25.4s, v5.4h, v4.h[1]\n"
+ "sqrdmulh v16.4s, v16.4s, v9.4s\n"
+ "smlal v20.4s, v7.4h, v3.h[4]\n"
+ "smlal v21.4s, v7.4h, v3.h[5]\n"
+ "sqrdmulh v17.4s, v17.4s, v9.4s\n"
+ "smlal v22.4s, v7.4h, v3.h[6]\n"
+ "smlal v23.4s, v7.4h, v3.h[7]\n"
+ "sqrdmulh v18.4s, v18.4s, v9.4s\n"
+ "smlal v24.4s, v7.4h, v6.h[0]\n"
+ "smlal v25.4s, v7.4h, v6.h[1]\n"
+ "sqrdmulh v19.4s, v19.4s, v9.4s\n"
+ "smlal v26.4s, v5.4h, v4.h[2]\n"
+ "smlal v27.4s, v5.4h, v4.h[3]\n"
+ "and v3.16b, v16.16b, v8.16b\n"
+ "smlal v28.4s, v5.4h, v4.h[4]\n"
+ "smlal v29.4s, v5.4h, v4.h[5]\n"
+ "and v2.16b, v17.16b, v8.16b\n"
+ "smlal v30.4s, v5.4h, v4.h[6]\n"
+ "smlal v31.4s, v5.4h, v4.h[7]\n"
+ "and v1.16b, v18.16b, v8.16b\n"
+ "and v0.16b, v19.16b, v8.16b\n"
+ "sshl v20.4s, v20.4s, v10.4s\n"
+ "smlal v26.4s, v7.4h, v6.h[2]\n"
+ "sshl v21.4s, v21.4s, v10.4s\n"
+ "sshl v22.4s, v22.4s, v10.4s\n"
+ "smlal v27.4s, v7.4h, v6.h[3]\n"
+ "sshl v23.4s, v23.4s, v10.4s\n"
+ "sshl v24.4s, v24.4s, v10.4s\n"
+ "smlal v28.4s, v7.4h, v6.h[4]\n"
+ "sshl v25.4s, v25.4s, v10.4s\n"
+ "smlal v29.4s, v7.4h, v6.h[5]\n"
+ "smlal v30.4s, v7.4h, v6.h[6]\n"
+ "smlal v31.4s, v7.4h, v6.h[7]\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "sqrdmulh v20.4s, v20.4s, v9.4s\n"
+ "sqrdmulh v21.4s, v21.4s, v9.4s\n"
+ "sqrdmulh v22.4s, v22.4s, v9.4s\n"
+ "sqrdmulh v23.4s, v23.4s, v9.4s\n"
+ "sqrdmulh v24.4s, v24.4s, v9.4s\n"
+ "sqrdmulh v25.4s, v25.4s, v9.4s\n"
+ "sqadd v16.4s, v16.4s, v3.4s\n"
+ "sqadd v17.4s, v17.4s, v2.4s\n"
+ "sqadd v18.4s, v18.4s, v1.4s\n"
+ "sqadd v19.4s, v19.4s, v0.4s\n"
+ "and v5.16b, v20.16b, v8.16b\n"
+ "and v4.16b, v21.16b, v8.16b\n"
+ "and v3.16b, v22.16b, v8.16b\n"
+ "and v2.16b, v23.16b, v8.16b\n"
+ "and v1.16b, v24.16b, v8.16b\n"
+ "and v0.16b, v25.16b, v8.16b\n"
+ "sshl v26.4s, v26.4s, v10.4s\n"
+ "sshl v27.4s, v27.4s, v10.4s\n"
+ "sshl v28.4s, v28.4s, v10.4s\n"
+ "sshl v29.4s, v29.4s, v10.4s\n"
+ "sshl v30.4s, v30.4s, v10.4s\n"
+ "sshl v31.4s, v31.4s, v10.4s\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "sqrdmulh v26.4s, v26.4s, v9.4s\n"
+ "sqrdmulh v27.4s, v27.4s, v9.4s\n"
+ "sqrdmulh v28.4s, v28.4s, v9.4s\n"
+ "sqrdmulh v29.4s, v29.4s, v9.4s\n"
+ "sqrdmulh v30.4s, v30.4s, v9.4s\n"
+ "sqrdmulh v31.4s, v31.4s, v9.4s\n"
+ "sqadd v20.4s, v20.4s, v5.4s\n"
+ "sqadd v21.4s, v21.4s, v4.4s\n"
+ "sqadd v22.4s, v22.4s, v3.4s\n"
+ "sqadd v23.4s, v23.4s, v2.4s\n"
+ "sqadd v24.4s, v24.4s, v1.4s\n"
+ "sqadd v25.4s, v25.4s, v0.4s\n"
+ "and v5.16b, v26.16b, v8.16b\n"
+ "and v4.16b, v27.16b, v8.16b\n"
+ "and v3.16b, v28.16b, v8.16b\n"
+ "and v2.16b, v29.16b, v8.16b\n"
+ "and v1.16b, v30.16b, v8.16b\n"
+ "and v0.16b, v31.16b, v8.16b\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "srshl v16.4s, v16.4s, v8.4s\n"
+ "srshl v17.4s, v17.4s, v8.4s\n"
+ "srshl v18.4s, v18.4s, v8.4s\n"
+ "srshl v19.4s, v19.4s, v8.4s\n"
+ "srshl v20.4s, v20.4s, v8.4s\n"
+ "srshl v21.4s, v21.4s, v8.4s\n"
+ "srshl v22.4s, v22.4s, v8.4s\n"
+ "srshl v23.4s, v23.4s, v8.4s\n"
+ "sqadd v26.4s, v26.4s, v5.4s\n"
+ "sqadd v27.4s, v27.4s, v4.4s\n"
+ "sqadd v28.4s, v28.4s, v3.4s\n"
+ "sqadd v29.4s, v29.4s, v2.4s\n"
+ "sqadd v30.4s, v30.4s, v1.4s\n"
+ "sqadd v31.4s, v31.4s, v0.4s\n"
+ "add v16.4s, v16.4s, v11.4s\n"
+ "add v17.4s, v17.4s, v11.4s\n"
+ "add v18.4s, v18.4s, v11.4s\n"
+ "add v19.4s, v19.4s, v11.4s\n"
+ "add v20.4s, v20.4s, v11.4s\n"
+ "add v21.4s, v21.4s, v11.4s\n"
+ "add v22.4s, v22.4s, v11.4s\n"
+ "add v23.4s, v23.4s, v11.4s\n"
+ "srshl v24.4s, v24.4s, v8.4s\n"
+ "srshl v25.4s, v25.4s, v8.4s\n"
+ "srshl v26.4s, v26.4s, v8.4s\n"
+ "srshl v27.4s, v27.4s, v8.4s\n"
+ "srshl v28.4s, v28.4s, v8.4s\n"
+ "srshl v29.4s, v29.4s, v8.4s\n"
+ "srshl v30.4s, v30.4s, v8.4s\n"
+ "srshl v31.4s, v31.4s, v8.4s\n"
+ "smin v16.4s, v16.4s, v14.4s\n"
+ "smin v17.4s, v17.4s, v14.4s\n"
+ "smin v18.4s, v18.4s, v14.4s\n"
+ "smin v19.4s, v19.4s, v14.4s\n"
+ "smin v20.4s, v20.4s, v14.4s\n"
+ "smin v21.4s, v21.4s, v14.4s\n"
+ "smin v22.4s, v22.4s, v14.4s\n"
+ "smin v23.4s, v23.4s, v14.4s\n"
+ "add v24.4s, v24.4s, v11.4s\n"
+ "add v25.4s, v25.4s, v11.4s\n"
+ "add v26.4s, v26.4s, v11.4s\n"
+ "add v27.4s, v27.4s, v11.4s\n"
+ "add v28.4s, v28.4s, v11.4s\n"
+ "add v29.4s, v29.4s, v11.4s\n"
+ "add v30.4s, v30.4s, v11.4s\n"
+ "add v31.4s, v31.4s, v11.4s\n"
+ "smax v16.4s, v16.4s, v15.4s\n"
+ "smax v17.4s, v17.4s, v15.4s\n"
+ "smax v18.4s, v18.4s, v15.4s\n"
+ "smax v19.4s, v19.4s, v15.4s\n"
+ "smax v20.4s, v20.4s, v15.4s\n"
+ "smax v21.4s, v21.4s, v15.4s\n"
+ "smax v22.4s, v22.4s, v15.4s\n"
+ "smax v23.4s, v23.4s, v15.4s\n"
+ "smin v24.4s, v24.4s, v14.4s\n"
+ "smin v25.4s, v25.4s, v14.4s\n"
+ "smin v26.4s, v26.4s, v14.4s\n"
+ "smin v27.4s, v27.4s, v14.4s\n"
+ "smin v28.4s, v28.4s, v14.4s\n"
+ "smin v29.4s, v29.4s, v14.4s\n"
+ "smin v30.4s, v30.4s, v14.4s\n"
+ "smin v31.4s, v31.4s, v14.4s\n"
+ "uzp1 v16.16b, v16.16b, v16.16b\n"
+ "uzp1 v17.16b, v17.16b, v17.16b\n"
+ "uzp1 v18.16b, v18.16b, v18.16b\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "smax v24.4s, v24.4s, v15.4s\n"
+ "smax v25.4s, v25.4s, v15.4s\n"
+ "smax v26.4s, v26.4s, v15.4s\n"
+ "smax v27.4s, v27.4s, v15.4s\n"
+ "smax v28.4s, v28.4s, v15.4s\n"
+ "smax v29.4s, v29.4s, v15.4s\n"
+ "smax v30.4s, v30.4s, v15.4s\n"
+ "smax v31.4s, v31.4s, v15.4s\n"
+ "uzp1 v16.16b, v16.16b, v16.16b\n"
+ "str s16, [x27, x9]\n"
+ "ldr x27, [%x[outptrs], #0x40]\n"
+ "uzp1 v17.16b, v17.16b, v17.16b\n"
+ "uzp1 v18.16b, v18.16b, v18.16b\n"
+ "str s17, [x26, x9]\n"
+ "ldr x26, [%x[outptrs], #0x48]\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "str s18, [x25, x9]\n"
+ "ldr x25, [%x[outptrs], #0x50]\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "str s19, [x24, x9]\n"
+ "ldr x24, [%x[outptrs], #0x58]\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "str s20, [x23, x9]\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "str s21, [x22, x9]\n"
+ "ldr x22, [%x[outptrs], #0x68]\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "str s22, [x21, x9]\n"
+ "ldr x21, [%x[outptrs], #0x70]\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "str s23, [x20, x9]\n"
+ "ldr x20, [%x[outptrs], #0x78]\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "str s24, [x27, x9]\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "str s25, [x26, x9]\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "str s26, [x25, x9]\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "str s27, [x24, x9]\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "str s28, [x23, x9]\n"
+ "str s29, [x22, x9]\n"
+ "str s30, [x21, x9]\n"
+ "str s31, [x20, x9]\n"
+ "b 8f\n"
+ "6:" // Output channel loop: Odd tail
+ "ldp x20, x28, [x22], #0x10\n"
+ "smlal v16.4s, v5.4h, v0.h[0]\n"
+ "smlal v17.4s, v5.4h, v0.h[1]\n"
+ "ldr x27, [%x[outptrs], #0x0]\n"
+ "smlal v18.4s, v5.4h, v0.h[2]\n"
+ "smlal v19.4s, v5.4h, v0.h[3]\n"
+ "ldr x26, [%x[outptrs], #0x8]\n"
+ "ldr x25, [%x[outptrs], #0x10]\n"
+ "smlal v20.4s, v5.4h, v0.h[4]\n"
+ "smlal v21.4s, v5.4h, v0.h[5]\n"
+ "ldr x24, [%x[outptrs], #0x18]\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "smlal v22.4s, v5.4h, v0.h[6]\n"
+ "smlal v23.4s, v5.4h, v0.h[7]\n"
+ "ldr d0, [x20, #0x0]\n"
+ "ssubl v0.8h, v0.8b, v13.8b\n"
+ "smlal v24.4s, v5.4h, v4.h[0]\n"
+ "smlal v25.4s, v5.4h, v4.h[1]\n"
+ "ldr x22, [%x[outptrs], #0x28]\n"
+ "ldr x21, [%x[outptrs], #0x30]\n"
+ "smlal v26.4s, v5.4h, v4.h[2]\n"
+ "smlal v27.4s, v5.4h, v4.h[3]\n"
+ "ldr x20, [%x[outptrs], #0x38]\n"
+ "smlal v28.4s, v5.4h, v4.h[4]\n"
+ "smlal v29.4s, v5.4h, v4.h[5]\n"
+ "smlal v30.4s, v5.4h, v4.h[6]\n"
+ "smlal v31.4s, v5.4h, v4.h[7]\n"
+ "ldr s5, [%x[weights]], #0x4\n"
+ "ldr d4, [x28, #0x0]\n"
+ "smlal v16.4s, v7.4h, v3.h[0]\n"
+ "smlal v17.4s, v7.4h, v3.h[1]\n"
+ "ssubl v5.8h, v5.8b, v12.8b\n"
+ "smlal v18.4s, v7.4h, v3.h[2]\n"
+ "smlal v19.4s, v7.4h, v3.h[3]\n"
+ "ssubl v4.8h, v4.8b, v13.8b\n"
+ "smlal v16.4s, v5.4h, v0.h[0]\n"
+ "smlal v17.4s, v5.4h, v0.h[1]\n"
+ "sshl v16.4s, v16.4s, v10.4s\n"
+ "smlal v18.4s, v5.4h, v0.h[2]\n"
+ "smlal v19.4s, v5.4h, v0.h[3]\n"
+ "sshl v17.4s, v17.4s, v10.4s\n"
+ "smlal v20.4s, v7.4h, v3.h[4]\n"
+ "smlal v21.4s, v7.4h, v3.h[5]\n"
+ "sshl v18.4s, v18.4s, v10.4s\n"
+ "smlal v22.4s, v7.4h, v3.h[6]\n"
+ "smlal v23.4s, v7.4h, v3.h[7]\n"
+ "sshl v19.4s, v19.4s, v10.4s\n"
+ "smlal v24.4s, v7.4h, v6.h[0]\n"
+ "smlal v25.4s, v7.4h, v6.h[1]\n"
+ "sqrdmulh v16.4s, v16.4s, v9.4s\n"
+ "smlal v20.4s, v5.4h, v0.h[4]\n"
+ "smlal v21.4s, v5.4h, v0.h[5]\n"
+ "sqrdmulh v17.4s, v17.4s, v9.4s\n"
+ "smlal v22.4s, v5.4h, v0.h[6]\n"
+ "smlal v23.4s, v5.4h, v0.h[7]\n"
+ "sqrdmulh v18.4s, v18.4s, v9.4s\n"
+ "smlal v24.4s, v5.4h, v4.h[0]\n"
+ "smlal v25.4s, v5.4h, v4.h[1]\n"
+ "sqrdmulh v19.4s, v19.4s, v9.4s\n"
+ "smlal v26.4s, v7.4h, v6.h[2]\n"
+ "smlal v27.4s, v7.4h, v6.h[3]\n"
+ "and v3.16b, v16.16b, v8.16b\n"
+ "smlal v28.4s, v7.4h, v6.h[4]\n"
+ "smlal v29.4s, v7.4h, v6.h[5]\n"
+ "and v2.16b, v17.16b, v8.16b\n"
+ "smlal v30.4s, v7.4h, v6.h[6]\n"
+ "smlal v31.4s, v7.4h, v6.h[7]\n"
+ "and v1.16b, v18.16b, v8.16b\n"
+ "and v0.16b, v19.16b, v8.16b\n"
+ "sshl v20.4s, v20.4s, v10.4s\n"
+ "smlal v26.4s, v5.4h, v4.h[2]\n"
+ "sshl v21.4s, v21.4s, v10.4s\n"
+ "sshl v22.4s, v22.4s, v10.4s\n"
+ "smlal v27.4s, v5.4h, v4.h[3]\n"
+ "sshl v23.4s, v23.4s, v10.4s\n"
+ "sshl v24.4s, v24.4s, v10.4s\n"
+ "smlal v28.4s, v5.4h, v4.h[4]\n"
+ "sshl v25.4s, v25.4s, v10.4s\n"
+ "smlal v29.4s, v5.4h, v4.h[5]\n"
+ "smlal v30.4s, v5.4h, v4.h[6]\n"
+ "smlal v31.4s, v5.4h, v4.h[7]\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "sqrdmulh v20.4s, v20.4s, v9.4s\n"
+ "sqrdmulh v21.4s, v21.4s, v9.4s\n"
+ "sqrdmulh v22.4s, v22.4s, v9.4s\n"
+ "sqrdmulh v23.4s, v23.4s, v9.4s\n"
+ "sqrdmulh v24.4s, v24.4s, v9.4s\n"
+ "sqrdmulh v25.4s, v25.4s, v9.4s\n"
+ "sqadd v16.4s, v16.4s, v3.4s\n"
+ "sqadd v17.4s, v17.4s, v2.4s\n"
+ "sqadd v18.4s, v18.4s, v1.4s\n"
+ "sqadd v19.4s, v19.4s, v0.4s\n"
+ "and v5.16b, v20.16b, v8.16b\n"
+ "and v4.16b, v21.16b, v8.16b\n"
+ "and v3.16b, v22.16b, v8.16b\n"
+ "and v2.16b, v23.16b, v8.16b\n"
+ "and v1.16b, v24.16b, v8.16b\n"
+ "and v0.16b, v25.16b, v8.16b\n"
+ "sshl v26.4s, v26.4s, v10.4s\n"
+ "sshl v27.4s, v27.4s, v10.4s\n"
+ "sshl v28.4s, v28.4s, v10.4s\n"
+ "sshl v29.4s, v29.4s, v10.4s\n"
+ "sshl v30.4s, v30.4s, v10.4s\n"
+ "sshl v31.4s, v31.4s, v10.4s\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "sqrdmulh v26.4s, v26.4s, v9.4s\n"
+ "sqrdmulh v27.4s, v27.4s, v9.4s\n"
+ "sqrdmulh v28.4s, v28.4s, v9.4s\n"
+ "sqrdmulh v29.4s, v29.4s, v9.4s\n"
+ "sqrdmulh v30.4s, v30.4s, v9.4s\n"
+ "sqrdmulh v31.4s, v31.4s, v9.4s\n"
+ "sqadd v20.4s, v20.4s, v5.4s\n"
+ "sqadd v21.4s, v21.4s, v4.4s\n"
+ "sqadd v22.4s, v22.4s, v3.4s\n"
+ "sqadd v23.4s, v23.4s, v2.4s\n"
+ "sqadd v24.4s, v24.4s, v1.4s\n"
+ "sqadd v25.4s, v25.4s, v0.4s\n"
+ "and v5.16b, v26.16b, v8.16b\n"
+ "and v4.16b, v27.16b, v8.16b\n"
+ "and v3.16b, v28.16b, v8.16b\n"
+ "and v2.16b, v29.16b, v8.16b\n"
+ "and v1.16b, v30.16b, v8.16b\n"
+ "and v0.16b, v31.16b, v8.16b\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "srshl v16.4s, v16.4s, v8.4s\n"
+ "srshl v17.4s, v17.4s, v8.4s\n"
+ "srshl v18.4s, v18.4s, v8.4s\n"
+ "srshl v19.4s, v19.4s, v8.4s\n"
+ "srshl v20.4s, v20.4s, v8.4s\n"
+ "srshl v21.4s, v21.4s, v8.4s\n"
+ "srshl v22.4s, v22.4s, v8.4s\n"
+ "srshl v23.4s, v23.4s, v8.4s\n"
+ "sqadd v26.4s, v26.4s, v5.4s\n"
+ "sqadd v27.4s, v27.4s, v4.4s\n"
+ "sqadd v28.4s, v28.4s, v3.4s\n"
+ "sqadd v29.4s, v29.4s, v2.4s\n"
+ "sqadd v30.4s, v30.4s, v1.4s\n"
+ "sqadd v31.4s, v31.4s, v0.4s\n"
+ "add v16.4s, v16.4s, v11.4s\n"
+ "add v17.4s, v17.4s, v11.4s\n"
+ "add v18.4s, v18.4s, v11.4s\n"
+ "add v19.4s, v19.4s, v11.4s\n"
+ "add v20.4s, v20.4s, v11.4s\n"
+ "add v21.4s, v21.4s, v11.4s\n"
+ "add v22.4s, v22.4s, v11.4s\n"
+ "add v23.4s, v23.4s, v11.4s\n"
+ "srshl v24.4s, v24.4s, v8.4s\n"
+ "srshl v25.4s, v25.4s, v8.4s\n"
+ "srshl v26.4s, v26.4s, v8.4s\n"
+ "srshl v27.4s, v27.4s, v8.4s\n"
+ "srshl v28.4s, v28.4s, v8.4s\n"
+ "srshl v29.4s, v29.4s, v8.4s\n"
+ "srshl v30.4s, v30.4s, v8.4s\n"
+ "srshl v31.4s, v31.4s, v8.4s\n"
+ "smin v16.4s, v16.4s, v14.4s\n"
+ "smin v17.4s, v17.4s, v14.4s\n"
+ "smin v18.4s, v18.4s, v14.4s\n"
+ "smin v19.4s, v19.4s, v14.4s\n"
+ "smin v20.4s, v20.4s, v14.4s\n"
+ "smin v21.4s, v21.4s, v14.4s\n"
+ "smin v22.4s, v22.4s, v14.4s\n"
+ "smin v23.4s, v23.4s, v14.4s\n"
+ "add v24.4s, v24.4s, v11.4s\n"
+ "add v25.4s, v25.4s, v11.4s\n"
+ "add v26.4s, v26.4s, v11.4s\n"
+ "add v27.4s, v27.4s, v11.4s\n"
+ "add v28.4s, v28.4s, v11.4s\n"
+ "add v29.4s, v29.4s, v11.4s\n"
+ "add v30.4s, v30.4s, v11.4s\n"
+ "add v31.4s, v31.4s, v11.4s\n"
+ "smax v16.4s, v16.4s, v15.4s\n"
+ "smax v17.4s, v17.4s, v15.4s\n"
+ "smax v18.4s, v18.4s, v15.4s\n"
+ "smax v19.4s, v19.4s, v15.4s\n"
+ "smax v20.4s, v20.4s, v15.4s\n"
+ "smax v21.4s, v21.4s, v15.4s\n"
+ "smax v22.4s, v22.4s, v15.4s\n"
+ "smax v23.4s, v23.4s, v15.4s\n"
+ "smin v24.4s, v24.4s, v14.4s\n"
+ "smin v25.4s, v25.4s, v14.4s\n"
+ "smin v26.4s, v26.4s, v14.4s\n"
+ "smin v27.4s, v27.4s, v14.4s\n"
+ "smin v28.4s, v28.4s, v14.4s\n"
+ "smin v29.4s, v29.4s, v14.4s\n"
+ "smin v30.4s, v30.4s, v14.4s\n"
+ "smin v31.4s, v31.4s, v14.4s\n"
+ "uzp1 v16.16b, v16.16b, v16.16b\n"
+ "uzp1 v17.16b, v17.16b, v17.16b\n"
+ "uzp1 v18.16b, v18.16b, v18.16b\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "smax v24.4s, v24.4s, v15.4s\n"
+ "smax v25.4s, v25.4s, v15.4s\n"
+ "smax v26.4s, v26.4s, v15.4s\n"
+ "smax v27.4s, v27.4s, v15.4s\n"
+ "smax v28.4s, v28.4s, v15.4s\n"
+ "smax v29.4s, v29.4s, v15.4s\n"
+ "smax v30.4s, v30.4s, v15.4s\n"
+ "smax v31.4s, v31.4s, v15.4s\n"
+ "uzp1 v16.16b, v16.16b, v16.16b\n"
+ "str s16, [x27, x9]\n"
+ "ldr x27, [%x[outptrs], #0x40]\n"
+ "uzp1 v17.16b, v17.16b, v17.16b\n"
+ "uzp1 v18.16b, v18.16b, v18.16b\n"
+ "str s17, [x26, x9]\n"
+ "ldr x26, [%x[outptrs], #0x48]\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "str s18, [x25, x9]\n"
+ "ldr x25, [%x[outptrs], #0x50]\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "str s19, [x24, x9]\n"
+ "ldr x24, [%x[outptrs], #0x58]\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "str s20, [x23, x9]\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "str s21, [x22, x9]\n"
+ "ldr x22, [%x[outptrs], #0x68]\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "str s22, [x21, x9]\n"
+ "ldr x21, [%x[outptrs], #0x70]\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "str s23, [x20, x9]\n"
+ "ldr x20, [%x[outptrs], #0x78]\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "str s24, [x27, x9]\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "str s25, [x26, x9]\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "str s26, [x25, x9]\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "str s27, [x24, x9]\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "str s28, [x23, x9]\n"
+ "str s29, [x22, x9]\n"
+ "str s30, [x21, x9]\n"
+ "str s31, [x20, x9]\n"
+ "b 8f\n"
+ "7:" // Output channel loop: Single kernel point
+ "smlal v16.4s, v5.4h, v0.h[0]\n"
+ "smlal v17.4s, v5.4h, v0.h[1]\n"
+ "sshl v16.4s, v16.4s, v10.4s\n"
+ "ldr x27, [%x[outptrs], #0x0]\n"
+ "smlal v18.4s, v5.4h, v0.h[2]\n"
+ "smlal v19.4s, v5.4h, v0.h[3]\n"
+ "sshl v17.4s, v17.4s, v10.4s\n"
+ "ldr x26, [%x[outptrs], #0x8]\n"
+ "sshl v18.4s, v18.4s, v10.4s\n"
+ "sshl v19.4s, v19.4s, v10.4s\n"
+ "smlal v20.4s, v5.4h, v0.h[4]\n"
+ "ldr x25, [%x[outptrs], #0x10]\n"
+ "smlal v21.4s, v5.4h, v0.h[5]\n"
+ "smlal v22.4s, v5.4h, v0.h[6]\n"
+ "sqrdmulh v16.4s, v16.4s, v9.4s\n"
+ "ldr x24, [%x[outptrs], #0x18]\n"
+ "smlal v23.4s, v5.4h, v0.h[7]\n"
+ "smlal v24.4s, v5.4h, v4.h[0]\n"
+ "sqrdmulh v17.4s, v17.4s, v9.4s\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "smlal v25.4s, v5.4h, v4.h[1]\n"
+ "sqrdmulh v18.4s, v18.4s, v9.4s\n"
+ "smlal v26.4s, v5.4h, v4.h[2]\n"
+ "ldr x22, [%x[outptrs], #0x28]\n"
+ "sqrdmulh v19.4s, v19.4s, v9.4s\n"
+ "and v3.16b, v16.16b, v8.16b\n"
+ "smlal v27.4s, v5.4h, v4.h[3]\n"
+ "ldr x21, [%x[outptrs], #0x30]\n"
+ "and v2.16b, v17.16b, v8.16b\n"
+ "and v1.16b, v18.16b, v8.16b\n"
+ "smlal v28.4s, v5.4h, v4.h[4]\n"
+ "ldr x20, [%x[outptrs], #0x38]\n"
+ "and v0.16b, v19.16b, v8.16b\n"
+ "sshl v20.4s, v20.4s, v10.4s\n"
+ "smlal v29.4s, v5.4h, v4.h[5]\n"
+ "sshl v21.4s, v21.4s, v10.4s\n"
+ "sshl v22.4s, v22.4s, v10.4s\n"
+ "smlal v30.4s, v5.4h, v4.h[6]\n"
+ "sshl v23.4s, v23.4s, v10.4s\n"
+ "sshl v24.4s, v24.4s, v10.4s\n"
+ "smlal v31.4s, v5.4h, v4.h[7]\n"
+ "sshl v25.4s, v25.4s, v10.4s\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "sqrdmulh v20.4s, v20.4s, v9.4s\n"
+ "sqrdmulh v21.4s, v21.4s, v9.4s\n"
+ "sqrdmulh v22.4s, v22.4s, v9.4s\n"
+ "sqrdmulh v23.4s, v23.4s, v9.4s\n"
+ "sqrdmulh v24.4s, v24.4s, v9.4s\n"
+ "sqrdmulh v25.4s, v25.4s, v9.4s\n"
+ "sqadd v16.4s, v16.4s, v3.4s\n"
+ "sqadd v17.4s, v17.4s, v2.4s\n"
+ "sqadd v18.4s, v18.4s, v1.4s\n"
+ "sqadd v19.4s, v19.4s, v0.4s\n"
+ "and v5.16b, v20.16b, v8.16b\n"
+ "and v4.16b, v21.16b, v8.16b\n"
+ "and v3.16b, v22.16b, v8.16b\n"
+ "and v2.16b, v23.16b, v8.16b\n"
+ "and v1.16b, v24.16b, v8.16b\n"
+ "and v0.16b, v25.16b, v8.16b\n"
+ "sshl v26.4s, v26.4s, v10.4s\n"
+ "sshl v27.4s, v27.4s, v10.4s\n"
+ "sshl v28.4s, v28.4s, v10.4s\n"
+ "sshl v29.4s, v29.4s, v10.4s\n"
+ "sshl v30.4s, v30.4s, v10.4s\n"
+ "sshl v31.4s, v31.4s, v10.4s\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "sqrdmulh v26.4s, v26.4s, v9.4s\n"
+ "sqrdmulh v27.4s, v27.4s, v9.4s\n"
+ "sqrdmulh v28.4s, v28.4s, v9.4s\n"
+ "sqrdmulh v29.4s, v29.4s, v9.4s\n"
+ "sqrdmulh v30.4s, v30.4s, v9.4s\n"
+ "sqrdmulh v31.4s, v31.4s, v9.4s\n"
+ "sqadd v20.4s, v20.4s, v5.4s\n"
+ "sqadd v21.4s, v21.4s, v4.4s\n"
+ "sqadd v22.4s, v22.4s, v3.4s\n"
+ "sqadd v23.4s, v23.4s, v2.4s\n"
+ "sqadd v24.4s, v24.4s, v1.4s\n"
+ "sqadd v25.4s, v25.4s, v0.4s\n"
+ "and v5.16b, v26.16b, v8.16b\n"
+ "and v4.16b, v27.16b, v8.16b\n"
+ "and v3.16b, v28.16b, v8.16b\n"
+ "and v2.16b, v29.16b, v8.16b\n"
+ "and v1.16b, v30.16b, v8.16b\n"
+ "and v0.16b, v31.16b, v8.16b\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "srshl v16.4s, v16.4s, v8.4s\n"
+ "srshl v17.4s, v17.4s, v8.4s\n"
+ "srshl v18.4s, v18.4s, v8.4s\n"
+ "srshl v19.4s, v19.4s, v8.4s\n"
+ "srshl v20.4s, v20.4s, v8.4s\n"
+ "srshl v21.4s, v21.4s, v8.4s\n"
+ "srshl v22.4s, v22.4s, v8.4s\n"
+ "srshl v23.4s, v23.4s, v8.4s\n"
+ "sqadd v26.4s, v26.4s, v5.4s\n"
+ "sqadd v27.4s, v27.4s, v4.4s\n"
+ "sqadd v28.4s, v28.4s, v3.4s\n"
+ "sqadd v29.4s, v29.4s, v2.4s\n"
+ "sqadd v30.4s, v30.4s, v1.4s\n"
+ "sqadd v31.4s, v31.4s, v0.4s\n"
+ "add v16.4s, v16.4s, v11.4s\n"
+ "add v17.4s, v17.4s, v11.4s\n"
+ "add v18.4s, v18.4s, v11.4s\n"
+ "add v19.4s, v19.4s, v11.4s\n"
+ "add v20.4s, v20.4s, v11.4s\n"
+ "add v21.4s, v21.4s, v11.4s\n"
+ "add v22.4s, v22.4s, v11.4s\n"
+ "add v23.4s, v23.4s, v11.4s\n"
+ "srshl v24.4s, v24.4s, v8.4s\n"
+ "srshl v25.4s, v25.4s, v8.4s\n"
+ "srshl v26.4s, v26.4s, v8.4s\n"
+ "srshl v27.4s, v27.4s, v8.4s\n"
+ "srshl v28.4s, v28.4s, v8.4s\n"
+ "srshl v29.4s, v29.4s, v8.4s\n"
+ "srshl v30.4s, v30.4s, v8.4s\n"
+ "srshl v31.4s, v31.4s, v8.4s\n"
+ "smin v16.4s, v16.4s, v14.4s\n"
+ "smin v17.4s, v17.4s, v14.4s\n"
+ "smin v18.4s, v18.4s, v14.4s\n"
+ "smin v19.4s, v19.4s, v14.4s\n"
+ "smin v20.4s, v20.4s, v14.4s\n"
+ "smin v21.4s, v21.4s, v14.4s\n"
+ "smin v22.4s, v22.4s, v14.4s\n"
+ "smin v23.4s, v23.4s, v14.4s\n"
+ "add v24.4s, v24.4s, v11.4s\n"
+ "add v25.4s, v25.4s, v11.4s\n"
+ "add v26.4s, v26.4s, v11.4s\n"
+ "add v27.4s, v27.4s, v11.4s\n"
+ "add v28.4s, v28.4s, v11.4s\n"
+ "add v29.4s, v29.4s, v11.4s\n"
+ "add v30.4s, v30.4s, v11.4s\n"
+ "add v31.4s, v31.4s, v11.4s\n"
+ "smax v16.4s, v16.4s, v15.4s\n"
+ "smax v17.4s, v17.4s, v15.4s\n"
+ "smax v18.4s, v18.4s, v15.4s\n"
+ "smax v19.4s, v19.4s, v15.4s\n"
+ "smax v20.4s, v20.4s, v15.4s\n"
+ "smax v21.4s, v21.4s, v15.4s\n"
+ "smax v22.4s, v22.4s, v15.4s\n"
+ "smax v23.4s, v23.4s, v15.4s\n"
+ "smin v24.4s, v24.4s, v14.4s\n"
+ "smin v25.4s, v25.4s, v14.4s\n"
+ "smin v26.4s, v26.4s, v14.4s\n"
+ "smin v27.4s, v27.4s, v14.4s\n"
+ "smin v28.4s, v28.4s, v14.4s\n"
+ "smin v29.4s, v29.4s, v14.4s\n"
+ "smin v30.4s, v30.4s, v14.4s\n"
+ "smin v31.4s, v31.4s, v14.4s\n"
+ "uzp1 v16.16b, v16.16b, v16.16b\n"
+ "uzp1 v17.16b, v17.16b, v17.16b\n"
+ "uzp1 v18.16b, v18.16b, v18.16b\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "smax v24.4s, v24.4s, v15.4s\n"
+ "smax v25.4s, v25.4s, v15.4s\n"
+ "smax v26.4s, v26.4s, v15.4s\n"
+ "smax v27.4s, v27.4s, v15.4s\n"
+ "smax v28.4s, v28.4s, v15.4s\n"
+ "smax v29.4s, v29.4s, v15.4s\n"
+ "smax v30.4s, v30.4s, v15.4s\n"
+ "smax v31.4s, v31.4s, v15.4s\n"
+ "uzp1 v16.16b, v16.16b, v16.16b\n"
+ "str s16, [x27, x9]\n"
+ "ldr x27, [%x[outptrs], #0x40]\n"
+ "uzp1 v17.16b, v17.16b, v17.16b\n"
+ "uzp1 v18.16b, v18.16b, v18.16b\n"
+ "str s17, [x26, x9]\n"
+ "ldr x26, [%x[outptrs], #0x48]\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "str s18, [x25, x9]\n"
+ "ldr x25, [%x[outptrs], #0x50]\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "str s19, [x24, x9]\n"
+ "ldr x24, [%x[outptrs], #0x58]\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "str s20, [x23, x9]\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "str s21, [x22, x9]\n"
+ "ldr x22, [%x[outptrs], #0x68]\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "str s22, [x21, x9]\n"
+ "ldr x21, [%x[outptrs], #0x70]\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "str s23, [x20, x9]\n"
+ "ldr x20, [%x[outptrs], #0x78]\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "str s24, [x27, x9]\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "str s25, [x26, x9]\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "str s26, [x25, x9]\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "str s27, [x24, x9]\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "str s28, [x23, x9]\n"
+ "str s29, [x22, x9]\n"
+ "str s30, [x21, x9]\n"
+ "str s31, [x20, x9]\n"
+ "8:" // Output channel loop: Done
+ "add x9, x9, #0x4\n"
+ "cmp x9, x10, LSL #2\n"
+ "blt 1b\n"
+ "tst %x[n_output_channels], #0x3\n"
+ "beq 26f\n"
+ "9:" // Output channel oddments
+ "movi v31.4s, #0x0\n"
+ "cbz %x[bias], 12f\n"
+ "add x20, %x[bias], x9, LSL #2\n"
+ "tbz %x[n_output_channels], #1, 10f\n"
+ "ld1 { v31.d }[0], [x20], #0x8\n"
+ "tbz %x[n_output_channels], #0, 11f\n"
+ "ld1 { v31.s }[2], [x20]\n"
+ "b 11f\n"
+ "10:" // Output channel oddments: Load bias: Bit 1: Unset
+ "ld1 { v31.s }[0], [x20]\n"
+ "11:" // Output channel oddments: Load bias: Bit 1: End
+ "12:" // Output channel oddments: Load bias: Done
+ "mov v16.16b, v31.16b\n"
+ "mov v17.16b, v31.16b\n"
+ "mov v18.16b, v31.16b\n"
+ "mov v19.16b, v31.16b\n"
+ "mov v20.16b, v31.16b\n"
+ "mov v21.16b, v31.16b\n"
+ "mov v22.16b, v31.16b\n"
+ "mov v23.16b, v31.16b\n"
+ "mov v24.16b, v31.16b\n"
+ "mov v25.16b, v31.16b\n"
+ "mov v26.16b, v31.16b\n"
+ "mov v27.16b, v31.16b\n"
+ "mov v28.16b, v31.16b\n"
+ "mov v29.16b, v31.16b\n"
+ "mov v30.16b, v31.16b\n"
+ "mov v31.16b, v31.16b\n"
+ "cbz %x[rq_mul_ptr], 18f\n"
+ "add x22, %x[rq_mul_ptr], x9, LSL #2\n"
+ "add x21, %x[rq_right_shift_ptr], x9, LSL #2\n"
+ "add x20, %x[rq_left_shift_ptr], x9, LSL #2\n"
+ "cbz %x[rq_left_shift_ptr], 15f\n"
+ "tbz %x[n_output_channels], #1, 13f\n"
+ "ld1 { v9.d }[0], [x22], #0x8\n"
+ "ld1 { v8.d }[0], [x21], #0x8\n"
+ "ld1 { v10.d }[0], [x20], #0x8\n"
+ "tbz %x[n_output_channels], #0, 14f\n"
+ "ld1 { v9.s }[2], [x22], #0x4\n"
+ "ld1 { v8.s }[2], [x21], #0x4\n"
+ "ld1 { v10.s }[2], [x20], #0x4\n"
+ "b 14f\n"
+ "13:" // Output channel oddments: Load quantization parameters: With left shift: Bit 1: Unset
+ "ld1 { v9.s }[0], [x22], #0x4\n"
+ "ld1 { v8.s }[0], [x21], #0x4\n"
+ "ld1 { v10.s }[0], [x20], #0x4\n"
+ "14:" // Output channel oddments: Load quantization parameters: With left shift: Bit 1: End
+ "b 18f\n"
+ "15:" // Output channel oddments: Load quantization parameters: No left shift
+ "tbz %x[n_output_channels], #1, 16f\n"
+ "ld1 { v9.d }[0], [x22], #0x8\n"
+ "ld1 { v8.d }[0], [x21], #0x8\n"
+ "tbz %x[n_output_channels], #0, 17f\n"
+ "ld1 { v9.s }[2], [x22], #0x4\n"
+ "ld1 { v8.s }[2], [x21], #0x4\n"
+ "b 17f\n"
+ "16:" // Output channel oddments: Load quantization parameters: No left shift: Bit 1: Unset
+ "ld1 { v9.s }[0], [x22], #0x4\n"
+ "ld1 { v8.s }[0], [x21], #0x4\n"
+ "17:" // Output channel oddments: Load quantization parameters: No left shift: Bit 1: End
+ "18:" // Output channel oddments: Load quantization parameters: Done
+ "ldr s5, [%x[weights]], #0x4\n"
+ "mov x22, %x[inptrs]\n"
+ "ldp x21, x20, [x22], #0x10\n"
+ "lsr x23, %x[kernel_points], #0x1\n"
+ "ldr d0, [x21, #0x0]\n"
+ "ldr d4, [x20, #0x0]\n"
+ "ssubl v0.8h, v0.8b, v13.8b\n"
+ "ssubl v4.8h, v4.8b, v13.8b\n"
+ "ssubl v5.8h, v5.8b, v12.8b\n"
+ "cbz x23, 22f\n"
+ "ldr s7, [%x[weights]], #0x4\n"
+ "ldp x21, x20, [x22], #0x10\n"
+ "subs x23, x23, #0x1\n"
+ "ssubl v7.8h, v7.8b, v12.8b\n"
+ "ldr d3, [x21, #0x0]\n"
+ "ldr d6, [x20, #0x0]\n"
+ "ssubl v3.8h, v3.8b, v13.8b\n"
+ "ssubl v6.8h, v6.8b, v13.8b\n"
+ "beq 20f\n"
+ "19:" // Output channel oddments: Kernel loop
+ "ldp x21, x20, [x22], #0x10\n"
+ "smlal v16.4s, v5.4h, v0.h[0]\n"
+ "smlal v17.4s, v5.4h, v0.h[1]\n"
+ "subs x23, x23, #0x1\n"
+ "smlal v18.4s, v5.4h, v0.h[2]\n"
+ "smlal v19.4s, v5.4h, v0.h[3]\n"
+ "smlal v20.4s, v5.4h, v0.h[4]\n"
+ "smlal v21.4s, v5.4h, v0.h[5]\n"
+ "smlal v22.4s, v5.4h, v0.h[6]\n"
+ "smlal v23.4s, v5.4h, v0.h[7]\n"
+ "ldr d0, [x21, #0x0]\n"
+ "ssubl v0.8h, v0.8b, v13.8b\n"
+ "smlal v24.4s, v5.4h, v4.h[0]\n"
+ "smlal v25.4s, v5.4h, v4.h[1]\n"
+ "smlal v26.4s, v5.4h, v4.h[2]\n"
+ "smlal v27.4s, v5.4h, v4.h[3]\n"
+ "smlal v28.4s, v5.4h, v4.h[4]\n"
+ "smlal v29.4s, v5.4h, v4.h[5]\n"
+ "smlal v30.4s, v5.4h, v4.h[6]\n"
+ "smlal v31.4s, v5.4h, v4.h[7]\n"
+ "ldr d4, [x20, #0x0]\n"
+ "ldr s5, [%x[weights]], #0x4\n"
+ "ldp x21, x20, [x22], #0x10\n"
+ "smlal v16.4s, v7.4h, v3.h[0]\n"
+ "smlal v17.4s, v7.4h, v3.h[1]\n"
+ "ssubl v4.8h, v4.8b, v13.8b\n"
+ "smlal v18.4s, v7.4h, v3.h[2]\n"
+ "smlal v19.4s, v7.4h, v3.h[3]\n"
+ "ssubl v5.8h, v5.8b, v12.8b\n"
+ "smlal v20.4s, v7.4h, v3.h[4]\n"
+ "smlal v21.4s, v7.4h, v3.h[5]\n"
+ "smlal v22.4s, v7.4h, v3.h[6]\n"
+ "smlal v23.4s, v7.4h, v3.h[7]\n"
+ "ldr d3, [x21, #0x0]\n"
+ "ssubl v3.8h, v3.8b, v13.8b\n"
+ "smlal v24.4s, v7.4h, v6.h[0]\n"
+ "smlal v25.4s, v7.4h, v6.h[1]\n"
+ "smlal v26.4s, v7.4h, v6.h[2]\n"
+ "smlal v27.4s, v7.4h, v6.h[3]\n"
+ "smlal v28.4s, v7.4h, v6.h[4]\n"
+ "smlal v29.4s, v7.4h, v6.h[5]\n"
+ "smlal v30.4s, v7.4h, v6.h[6]\n"
+ "smlal v31.4s, v7.4h, v6.h[7]\n"
+ "ldr d6, [x20, #0x0]\n"
+ "ldr s7, [%x[weights]], #0x4\n"
+ "ssubl v6.8h, v6.8b, v13.8b\n"
+ "ssubl v7.8h, v7.8b, v12.8b\n"
+ "bgt 19b\n"
+ "20:" // Output channel oddments: Kernel loop tail
+ "tbnz %x[kernel_points], #0, 21f\n"
+ "smlal v16.4s, v5.4h, v0.h[0]\n"
+ "smlal v17.4s, v5.4h, v0.h[1]\n"
+ "smlal v18.4s, v5.4h, v0.h[2]\n"
+ "smlal v19.4s, v5.4h, v0.h[3]\n"
+ "smlal v20.4s, v5.4h, v0.h[4]\n"
+ "smlal v21.4s, v5.4h, v0.h[5]\n"
+ "smlal v22.4s, v5.4h, v0.h[6]\n"
+ "smlal v23.4s, v5.4h, v0.h[7]\n"
+ "smlal v24.4s, v5.4h, v4.h[0]\n"
+ "smlal v25.4s, v5.4h, v4.h[1]\n"
+ "smlal v26.4s, v5.4h, v4.h[2]\n"
+ "smlal v27.4s, v5.4h, v4.h[3]\n"
+ "smlal v28.4s, v5.4h, v4.h[4]\n"
+ "smlal v29.4s, v5.4h, v4.h[5]\n"
+ "smlal v30.4s, v5.4h, v4.h[6]\n"
+ "smlal v31.4s, v5.4h, v4.h[7]\n"
+ "smlal v16.4s, v7.4h, v3.h[0]\n"
+ "smlal v17.4s, v7.4h, v3.h[1]\n"
+ "smlal v18.4s, v7.4h, v3.h[2]\n"
+ "smlal v19.4s, v7.4h, v3.h[3]\n"
+ "smlal v20.4s, v7.4h, v3.h[4]\n"
+ "smlal v21.4s, v7.4h, v3.h[5]\n"
+ "smlal v22.4s, v7.4h, v3.h[6]\n"
+ "smlal v23.4s, v7.4h, v3.h[7]\n"
+ "smlal v24.4s, v7.4h, v6.h[0]\n"
+ "smlal v25.4s, v7.4h, v6.h[1]\n"
+ "smlal v26.4s, v7.4h, v6.h[2]\n"
+ "smlal v27.4s, v7.4h, v6.h[3]\n"
+ "smlal v28.4s, v7.4h, v6.h[4]\n"
+ "smlal v29.4s, v7.4h, v6.h[5]\n"
+ "smlal v30.4s, v7.4h, v6.h[6]\n"
+ "smlal v31.4s, v7.4h, v6.h[7]\n"
+ "b 23f\n"
+ "21:" // Output channel oddments: Odd tail
+ "ldp x21, x20, [x22], #0x10\n"
+ "smlal v16.4s, v5.4h, v0.h[0]\n"
+ "smlal v17.4s, v5.4h, v0.h[1]\n"
+ "smlal v18.4s, v5.4h, v0.h[2]\n"
+ "smlal v19.4s, v5.4h, v0.h[3]\n"
+ "smlal v20.4s, v5.4h, v0.h[4]\n"
+ "smlal v21.4s, v5.4h, v0.h[5]\n"
+ "smlal v22.4s, v5.4h, v0.h[6]\n"
+ "smlal v23.4s, v5.4h, v0.h[7]\n"
+ "ldr d2, [x21, #0x0]\n"
+ "ssubl v2.8h, v2.8b, v13.8b\n"
+ "smlal v24.4s, v5.4h, v4.h[0]\n"
+ "smlal v25.4s, v5.4h, v4.h[1]\n"
+ "smlal v26.4s, v5.4h, v4.h[2]\n"
+ "smlal v27.4s, v5.4h, v4.h[3]\n"
+ "smlal v28.4s, v5.4h, v4.h[4]\n"
+ "smlal v29.4s, v5.4h, v4.h[5]\n"
+ "smlal v30.4s, v5.4h, v4.h[6]\n"
+ "smlal v31.4s, v5.4h, v4.h[7]\n"
+ "ldr d1, [x20, #0x0]\n"
+ "ldr s0, [%x[weights]], #0x4\n"
+ "smlal v16.4s, v7.4h, v3.h[0]\n"
+ "smlal v17.4s, v7.4h, v3.h[1]\n"
+ "ssubl v1.8h, v1.8b, v13.8b\n"
+ "smlal v18.4s, v7.4h, v3.h[2]\n"
+ "smlal v19.4s, v7.4h, v3.h[3]\n"
+ "ssubl v0.8h, v0.8b, v12.8b\n"
+ "smlal v20.4s, v7.4h, v3.h[4]\n"
+ "smlal v21.4s, v7.4h, v3.h[5]\n"
+ "smlal v22.4s, v7.4h, v3.h[6]\n"
+ "smlal v23.4s, v7.4h, v3.h[7]\n"
+ "smlal v24.4s, v7.4h, v6.h[0]\n"
+ "smlal v25.4s, v7.4h, v6.h[1]\n"
+ "smlal v26.4s, v7.4h, v6.h[2]\n"
+ "smlal v27.4s, v7.4h, v6.h[3]\n"
+ "smlal v28.4s, v7.4h, v6.h[4]\n"
+ "smlal v29.4s, v7.4h, v6.h[5]\n"
+ "smlal v30.4s, v7.4h, v6.h[6]\n"
+ "smlal v31.4s, v7.4h, v6.h[7]\n"
+ "smlal v16.4s, v0.4h, v2.h[0]\n"
+ "smlal v17.4s, v0.4h, v2.h[1]\n"
+ "smlal v18.4s, v0.4h, v2.h[2]\n"
+ "smlal v19.4s, v0.4h, v2.h[3]\n"
+ "smlal v20.4s, v0.4h, v2.h[4]\n"
+ "smlal v21.4s, v0.4h, v2.h[5]\n"
+ "smlal v22.4s, v0.4h, v2.h[6]\n"
+ "smlal v23.4s, v0.4h, v2.h[7]\n"
+ "smlal v24.4s, v0.4h, v1.h[0]\n"
+ "smlal v25.4s, v0.4h, v1.h[1]\n"
+ "smlal v26.4s, v0.4h, v1.h[2]\n"
+ "smlal v27.4s, v0.4h, v1.h[3]\n"
+ "smlal v28.4s, v0.4h, v1.h[4]\n"
+ "smlal v29.4s, v0.4h, v1.h[5]\n"
+ "smlal v30.4s, v0.4h, v1.h[6]\n"
+ "smlal v31.4s, v0.4h, v1.h[7]\n"
+ "b 23f\n"
+ "22:" // Output channel oddments: Single kernel point
+ "smlal v16.4s, v5.4h, v0.h[0]\n"
+ "smlal v17.4s, v5.4h, v0.h[1]\n"
+ "smlal v18.4s, v5.4h, v0.h[2]\n"
+ "smlal v19.4s, v5.4h, v0.h[3]\n"
+ "smlal v20.4s, v5.4h, v0.h[4]\n"
+ "smlal v21.4s, v5.4h, v0.h[5]\n"
+ "smlal v22.4s, v5.4h, v0.h[6]\n"
+ "smlal v23.4s, v5.4h, v0.h[7]\n"
+ "smlal v24.4s, v5.4h, v4.h[0]\n"
+ "smlal v25.4s, v5.4h, v4.h[1]\n"
+ "smlal v26.4s, v5.4h, v4.h[2]\n"
+ "smlal v27.4s, v5.4h, v4.h[3]\n"
+ "smlal v28.4s, v5.4h, v4.h[4]\n"
+ "smlal v29.4s, v5.4h, v4.h[5]\n"
+ "smlal v30.4s, v5.4h, v4.h[6]\n"
+ "smlal v31.4s, v5.4h, v4.h[7]\n"
+ "23:" // Output channel oddments: Done
+ "sshl v16.4s, v16.4s, v10.4s\n"
+ "sshl v17.4s, v17.4s, v10.4s\n"
+ "sshl v18.4s, v18.4s, v10.4s\n"
+ "sshl v19.4s, v19.4s, v10.4s\n"
+ "sqrdmulh v16.4s, v16.4s, v9.4s\n"
+ "sqrdmulh v17.4s, v17.4s, v9.4s\n"
+ "sqrdmulh v18.4s, v18.4s, v9.4s\n"
+ "sqrdmulh v19.4s, v19.4s, v9.4s\n"
+ "and v3.16b, v16.16b, v8.16b\n"
+ "and v2.16b, v17.16b, v8.16b\n"
+ "and v1.16b, v18.16b, v8.16b\n"
+ "and v0.16b, v19.16b, v8.16b\n"
+ "sshl v20.4s, v20.4s, v10.4s\n"
+ "sshl v21.4s, v21.4s, v10.4s\n"
+ "sshl v22.4s, v22.4s, v10.4s\n"
+ "sshl v23.4s, v23.4s, v10.4s\n"
+ "sshl v24.4s, v24.4s, v10.4s\n"
+ "sshl v25.4s, v25.4s, v10.4s\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "sqrdmulh v20.4s, v20.4s, v9.4s\n"
+ "sqrdmulh v21.4s, v21.4s, v9.4s\n"
+ "sqrdmulh v22.4s, v22.4s, v9.4s\n"
+ "sqrdmulh v23.4s, v23.4s, v9.4s\n"
+ "sqrdmulh v24.4s, v24.4s, v9.4s\n"
+ "sqrdmulh v25.4s, v25.4s, v9.4s\n"
+ "sqadd v16.4s, v16.4s, v3.4s\n"
+ "sqadd v17.4s, v17.4s, v2.4s\n"
+ "sqadd v18.4s, v18.4s, v1.4s\n"
+ "sqadd v19.4s, v19.4s, v0.4s\n"
+ "and v5.16b, v20.16b, v8.16b\n"
+ "and v4.16b, v21.16b, v8.16b\n"
+ "and v3.16b, v22.16b, v8.16b\n"
+ "and v2.16b, v23.16b, v8.16b\n"
+ "and v1.16b, v24.16b, v8.16b\n"
+ "and v0.16b, v25.16b, v8.16b\n"
+ "sshl v26.4s, v26.4s, v10.4s\n"
+ "sshl v27.4s, v27.4s, v10.4s\n"
+ "sshl v28.4s, v28.4s, v10.4s\n"
+ "sshl v29.4s, v29.4s, v10.4s\n"
+ "sshl v30.4s, v30.4s, v10.4s\n"
+ "sshl v31.4s, v31.4s, v10.4s\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "sqrdmulh v26.4s, v26.4s, v9.4s\n"
+ "sqrdmulh v27.4s, v27.4s, v9.4s\n"
+ "sqrdmulh v28.4s, v28.4s, v9.4s\n"
+ "sqrdmulh v29.4s, v29.4s, v9.4s\n"
+ "sqrdmulh v30.4s, v30.4s, v9.4s\n"
+ "sqrdmulh v31.4s, v31.4s, v9.4s\n"
+ "sqadd v20.4s, v20.4s, v5.4s\n"
+ "sqadd v21.4s, v21.4s, v4.4s\n"
+ "sqadd v22.4s, v22.4s, v3.4s\n"
+ "sqadd v23.4s, v23.4s, v2.4s\n"
+ "sqadd v24.4s, v24.4s, v1.4s\n"
+ "sqadd v25.4s, v25.4s, v0.4s\n"
+ "and v5.16b, v26.16b, v8.16b\n"
+ "and v4.16b, v27.16b, v8.16b\n"
+ "and v3.16b, v28.16b, v8.16b\n"
+ "and v2.16b, v29.16b, v8.16b\n"
+ "and v1.16b, v30.16b, v8.16b\n"
+ "and v0.16b, v31.16b, v8.16b\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "sqadd v26.4s, v26.4s, v5.4s\n"
+ "sqadd v27.4s, v27.4s, v4.4s\n"
+ "sqadd v28.4s, v28.4s, v3.4s\n"
+ "sqadd v29.4s, v29.4s, v2.4s\n"
+ "sqadd v30.4s, v30.4s, v1.4s\n"
+ "sqadd v31.4s, v31.4s, v0.4s\n"
+ "srshl v16.4s, v16.4s, v8.4s\n"
+ "srshl v17.4s, v17.4s, v8.4s\n"
+ "srshl v18.4s, v18.4s, v8.4s\n"
+ "srshl v19.4s, v19.4s, v8.4s\n"
+ "srshl v20.4s, v20.4s, v8.4s\n"
+ "srshl v21.4s, v21.4s, v8.4s\n"
+ "srshl v22.4s, v22.4s, v8.4s\n"
+ "srshl v23.4s, v23.4s, v8.4s\n"
+ "srshl v24.4s, v24.4s, v8.4s\n"
+ "srshl v25.4s, v25.4s, v8.4s\n"
+ "srshl v26.4s, v26.4s, v8.4s\n"
+ "srshl v27.4s, v27.4s, v8.4s\n"
+ "srshl v28.4s, v28.4s, v8.4s\n"
+ "srshl v29.4s, v29.4s, v8.4s\n"
+ "srshl v30.4s, v30.4s, v8.4s\n"
+ "srshl v31.4s, v31.4s, v8.4s\n"
+ "add v16.4s, v16.4s, v11.4s\n"
+ "add v17.4s, v17.4s, v11.4s\n"
+ "add v18.4s, v18.4s, v11.4s\n"
+ "add v19.4s, v19.4s, v11.4s\n"
+ "add v20.4s, v20.4s, v11.4s\n"
+ "add v21.4s, v21.4s, v11.4s\n"
+ "add v22.4s, v22.4s, v11.4s\n"
+ "add v23.4s, v23.4s, v11.4s\n"
+ "add v24.4s, v24.4s, v11.4s\n"
+ "add v25.4s, v25.4s, v11.4s\n"
+ "add v26.4s, v26.4s, v11.4s\n"
+ "add v27.4s, v27.4s, v11.4s\n"
+ "add v28.4s, v28.4s, v11.4s\n"
+ "add v29.4s, v29.4s, v11.4s\n"
+ "add v30.4s, v30.4s, v11.4s\n"
+ "add v31.4s, v31.4s, v11.4s\n"
+ "smin v16.4s, v16.4s, v14.4s\n"
+ "smin v17.4s, v17.4s, v14.4s\n"
+ "smin v18.4s, v18.4s, v14.4s\n"
+ "smin v19.4s, v19.4s, v14.4s\n"
+ "smin v20.4s, v20.4s, v14.4s\n"
+ "smin v21.4s, v21.4s, v14.4s\n"
+ "smin v22.4s, v22.4s, v14.4s\n"
+ "smin v23.4s, v23.4s, v14.4s\n"
+ "smin v24.4s, v24.4s, v14.4s\n"
+ "smin v25.4s, v25.4s, v14.4s\n"
+ "smin v26.4s, v26.4s, v14.4s\n"
+ "smin v27.4s, v27.4s, v14.4s\n"
+ "smin v28.4s, v28.4s, v14.4s\n"
+ "smin v29.4s, v29.4s, v14.4s\n"
+ "smin v30.4s, v30.4s, v14.4s\n"
+ "smin v31.4s, v31.4s, v14.4s\n"
+ "smax v16.4s, v16.4s, v15.4s\n"
+ "smax v17.4s, v17.4s, v15.4s\n"
+ "smax v18.4s, v18.4s, v15.4s\n"
+ "smax v19.4s, v19.4s, v15.4s\n"
+ "smax v20.4s, v20.4s, v15.4s\n"
+ "smax v21.4s, v21.4s, v15.4s\n"
+ "smax v22.4s, v22.4s, v15.4s\n"
+ "smax v23.4s, v23.4s, v15.4s\n"
+ "smax v24.4s, v24.4s, v15.4s\n"
+ "smax v25.4s, v25.4s, v15.4s\n"
+ "smax v26.4s, v26.4s, v15.4s\n"
+ "smax v27.4s, v27.4s, v15.4s\n"
+ "smax v28.4s, v28.4s, v15.4s\n"
+ "smax v29.4s, v29.4s, v15.4s\n"
+ "smax v30.4s, v30.4s, v15.4s\n"
+ "smax v31.4s, v31.4s, v15.4s\n"
+ "uzp1 v16.16b, v16.16b, v16.16b\n"
+ "uzp1 v17.16b, v17.16b, v17.16b\n"
+ "uzp1 v18.16b, v18.16b, v18.16b\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "uzp1 v16.16b, v16.16b, v16.16b\n"
+ "uzp1 v17.16b, v17.16b, v17.16b\n"
+ "uzp1 v18.16b, v18.16b, v18.16b\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "tbz %x[n_output_channels], #1, 24f\n"
+ "ldr x27, [%x[outptrs], #0x0]\n"
+ "ldr x26, [%x[outptrs], #0x8]\n"
+ "add x27, x27, x9\n"
+ "add x26, x26, x9\n"
+ "ldr x25, [%x[outptrs], #0x10]\n"
+ "ldr x24, [%x[outptrs], #0x18]\n"
+ "add x25, x25, x9\n"
+ "add x24, x24, x9\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "ldr x22, [%x[outptrs], #0x28]\n"
+ "add x23, x23, x9\n"
+ "add x22, x22, x9\n"
+ "ldr x21, [%x[outptrs], #0x30]\n"
+ "ldr x20, [%x[outptrs], #0x38]\n"
+ "add x21, x21, x9\n"
+ "add x20, x20, x9\n"
+ "st1 { v16.h }[0], [x27]\n"
+ "ldr x27, [%x[outptrs], #0x40]\n"
+ "add x27, x27, x9\n"
+ "st1 { v17.h }[0], [x26]\n"
+ "ldr x26, [%x[outptrs], #0x48]\n"
+ "add x26, x26, x9\n"
+ "st1 { v18.h }[0], [x25]\n"
+ "ldr x25, [%x[outptrs], #0x50]\n"
+ "add x25, x25, x9\n"
+ "st1 { v19.h }[0], [x24]\n"
+ "ldr x24, [%x[outptrs], #0x58]\n"
+ "add x24, x24, x9\n"
+ "st1 { v20.h }[0], [x23]\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
+ "add x23, x23, x9\n"
+ "st1 { v21.h }[0], [x22]\n"
+ "ldr x22, [%x[outptrs], #0x68]\n"
+ "add x22, x22, x9\n"
+ "st1 { v22.h }[0], [x21]\n"
+ "ldr x21, [%x[outptrs], #0x70]\n"
+ "add x21, x21, x9\n"
+ "st1 { v23.h }[0], [x20]\n"
+ "ldr x20, [%x[outptrs], #0x78]\n"
+ "add x20, x20, x9\n"
+ "add x9, x9, #0x2\n"
+ "st1 { v24.h }[0], [x27]\n"
+ "st1 { v25.h }[0], [x26]\n"
+ "st1 { v26.h }[0], [x25]\n"
+ "st1 { v27.h }[0], [x24]\n"
+ "st1 { v28.h }[0], [x23]\n"
+ "st1 { v29.h }[0], [x22]\n"
+ "st1 { v30.h }[0], [x21]\n"
+ "st1 { v31.h }[0], [x20]\n"
+ "tbz %x[n_output_channels], #0, 25f\n"
+ "ldr x27, [%x[outptrs], #0x0]\n"
+ "ldr x26, [%x[outptrs], #0x8]\n"
+ "add x27, x27, x9\n"
+ "add x26, x26, x9\n"
+ "ldr x25, [%x[outptrs], #0x10]\n"
+ "ldr x24, [%x[outptrs], #0x18]\n"
+ "add x25, x25, x9\n"
+ "add x24, x24, x9\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "ldr x22, [%x[outptrs], #0x28]\n"
+ "add x23, x23, x9\n"
+ "add x22, x22, x9\n"
+ "ldr x21, [%x[outptrs], #0x30]\n"
+ "ldr x20, [%x[outptrs], #0x38]\n"
+ "add x21, x21, x9\n"
+ "add x20, x20, x9\n"
+ "st1 { v16.b }[2], [x27]\n"
+ "ldr x27, [%x[outptrs], #0x40]\n"
+ "add x27, x27, x9\n"
+ "st1 { v17.b }[2], [x26]\n"
+ "ldr x26, [%x[outptrs], #0x48]\n"
+ "add x26, x26, x9\n"
+ "st1 { v18.b }[2], [x25]\n"
+ "ldr x25, [%x[outptrs], #0x50]\n"
+ "add x25, x25, x9\n"
+ "st1 { v19.b }[2], [x24]\n"
+ "ldr x24, [%x[outptrs], #0x58]\n"
+ "add x24, x24, x9\n"
+ "st1 { v20.b }[2], [x23]\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
+ "add x23, x23, x9\n"
+ "st1 { v21.b }[2], [x22]\n"
+ "ldr x22, [%x[outptrs], #0x68]\n"
+ "add x22, x22, x9\n"
+ "st1 { v22.b }[2], [x21]\n"
+ "ldr x21, [%x[outptrs], #0x70]\n"
+ "add x21, x21, x9\n"
+ "st1 { v23.b }[2], [x20]\n"
+ "ldr x20, [%x[outptrs], #0x78]\n"
+ "add x20, x20, x9\n"
+ "st1 { v24.b }[2], [x27]\n"
+ "st1 { v25.b }[2], [x26]\n"
+ "st1 { v26.b }[2], [x25]\n"
+ "st1 { v27.b }[2], [x24]\n"
+ "st1 { v28.b }[2], [x23]\n"
+ "st1 { v29.b }[2], [x22]\n"
+ "st1 { v30.b }[2], [x21]\n"
+ "st1 { v31.b }[2], [x20]\n"
+ "b 25f\n"
+ "24:" // Output channel oddments: Done: Store: Bit 1: Unset
+ "ldr x27, [%x[outptrs], #0x0]\n"
+ "ldr x26, [%x[outptrs], #0x8]\n"
+ "add x27, x27, x9\n"
+ "add x26, x26, x9\n"
+ "ldr x25, [%x[outptrs], #0x10]\n"
+ "ldr x24, [%x[outptrs], #0x18]\n"
+ "add x25, x25, x9\n"
+ "add x24, x24, x9\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "ldr x22, [%x[outptrs], #0x28]\n"
+ "add x23, x23, x9\n"
+ "add x22, x22, x9\n"
+ "ldr x21, [%x[outptrs], #0x30]\n"
+ "ldr x20, [%x[outptrs], #0x38]\n"
+ "add x21, x21, x9\n"
+ "add x20, x20, x9\n"
+ "st1 { v16.b }[0], [x27]\n"
+ "ldr x27, [%x[outptrs], #0x40]\n"
+ "add x27, x27, x9\n"
+ "st1 { v17.b }[0], [x26]\n"
+ "ldr x26, [%x[outptrs], #0x48]\n"
+ "add x26, x26, x9\n"
+ "st1 { v18.b }[0], [x25]\n"
+ "ldr x25, [%x[outptrs], #0x50]\n"
+ "add x25, x25, x9\n"
+ "st1 { v19.b }[0], [x24]\n"
+ "ldr x24, [%x[outptrs], #0x58]\n"
+ "add x24, x24, x9\n"
+ "st1 { v20.b }[0], [x23]\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
+ "add x23, x23, x9\n"
+ "st1 { v21.b }[0], [x22]\n"
+ "ldr x22, [%x[outptrs], #0x68]\n"
+ "add x22, x22, x9\n"
+ "st1 { v22.b }[0], [x21]\n"
+ "ldr x21, [%x[outptrs], #0x70]\n"
+ "add x21, x21, x9\n"
+ "st1 { v23.b }[0], [x20]\n"
+ "ldr x20, [%x[outptrs], #0x78]\n"
+ "add x20, x20, x9\n"
+ "st1 { v24.b }[0], [x27]\n"
+ "st1 { v25.b }[0], [x26]\n"
+ "st1 { v26.b }[0], [x25]\n"
+ "st1 { v27.b }[0], [x24]\n"
+ "st1 { v28.b }[0], [x23]\n"
+ "st1 { v29.b }[0], [x22]\n"
+ "st1 { v30.b }[0], [x21]\n"
+ "st1 { v31.b }[0], [x20]\n"
+ "25:" // Output channel oddments: Done: Store: Bit 1: End
+ "26:" // Done
+ : [weights] "+&r" (weights)
+ : [bias] "r" (bias), [inptrs] "r" (inptrs), [kernel_points] "r" ((uint64_t) kernel_points), [n_output_channels] "r" ((uint64_t) n_output_channels), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [offsetof_Requantize32_per_layer_left_shift] "I" (offsetof(arm_gemm::Requantize32, per_layer_left_shift)), [offsetof_Requantize32_per_layer_mul] "I" (offsetof(arm_gemm::Requantize32, per_layer_mul)), [offsetof_Requantize32_per_layer_right_shift] "I" (offsetof(arm_gemm::Requantize32, per_layer_right_shift)), [outptrs] "r" (outptrs), [qp] "r" (&qp), [rq_left_shift_ptr] "r" (per_channel_left_shifts), [rq_mul_ptr] "r" (per_channel_muls), [rq_right_shift_ptr] "r" (per_channel_right_shifts)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp
new file mode 100644
index 0000000000..3190cbfbf0
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp
@@ -0,0 +1,76 @@
+/*
+ * Copyright (c) 2021-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "utils.hpp"
+#include "src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst_impl(const unsigned int , const int8_t *const *const , const int8_t *, const int32_t *, const arm_gemm::Requantize32& , const int32_t *, const int32_t *, int8_t *const *const );
+
+// Strategy wrapper for the s8 (symmetric-quantized) NHWC 3x3/stride-1 depthwise kernel
+// producing a 2x2 output tile per iteration via dot-product (SDOT) arithmetic.
+class a64_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst : public DepthwiseDepthfirstStrategy<int8_t, int8_t, int8_t, int32_t>  // in/weight/out = int8_t, accumulator = int32_t
+{
+  using Parent = DepthwiseDepthfirstStrategy<int8_t, int8_t, int8_t, int32_t>;
+
+  public:
+  constexpr static unsigned int kernel_rows = 3;  // 3x3 depthwise filter
+  constexpr static unsigned int kernel_cols = 3;
+
+  constexpr static unsigned int stride_rows = 1;  // unit stride in both dimensions
+  constexpr static unsigned int stride_cols = 1;
+
+  a64_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst(const CPUInfo *) : Parent(2, 2, 3, 3, 1, 1) {}  // args presumably (out_rows, out_cols, kernel_rows, kernel_cols, stride_rows, stride_cols) -- confirm against Parent's ctor
+
+  arm_gemm::VLType get_vl_type(void) const override { return arm_gemm::VLType::None; }  // plain NEON: fixed-length vectors, no SVE
+
+  Parent::KernelType kernel = a64_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst_impl;  // free function declared above in this header
+  Parent::KernelType get_kernel(void) const override { return kernel; }
+  size_t get_storage_size(const DepthwiseArgs &args) const override  // bytes needed for the packed weight/bias buffer
+  {
+    return interleave_a64_s8q_3x3_dot::get_packed_size(args);  // sizing must match the pack_parameters() layout below
+  }
+
+  // Interleaves weights and biases into the dot-product-friendly layout consumed by the asm kernel.
+  void pack_parameters(
+    const DepthwiseArgs &args, void *buffer, const void *biases, const arm_gemm::Requantize32 &qp,
+    const void *weights, size_t ld_weight_col, size_t ld_weight_row
+  ) const override
+  {
+    interleave_a64_s8q_3x3_dot::pack_parameters(
+      args.input_channels * args.channel_multiplier, buffer, reinterpret_cast<const int32_t *>(biases),  // total output channels
+      reinterpret_cast<const int8_t *>(weights), qp, ld_weight_col, ld_weight_row
+    );
+  }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp
new file mode 100644
index 0000000000..aad34c4c25
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp
@@ -0,0 +1,1484 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if defined(__aarch64__)
+
+#include "arm_gemm.hpp"
+#include <cstdint>
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst_impl(const unsigned int n_channels, const int8_t *const *const inptrs, const int8_t *params, const int32_t *, const arm_gemm::Requantize32& qp, const int32_t *, const int32_t *, int8_t *const *const outptrs)
+{
+ __asm__ __volatile__(
+ "lsr x15, %x[n_channels], #0x4\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_minval]\n"
+ "ld1r { v8.4s }, [x20]\n"
+ "ldp x14, x13, [%x[inptrs], #0x0]\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_maxval]\n"
+ "ld1r { v12.4s }, [x20]\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_c_offset]\n"
+ "ld1r { v15.4s }, [x20]\n"
+ "mov x12, #0x0\n"
+ "mov x11, #0x0\n"
+ "ldp x10, x9, [%x[inptrs], #0x10]\n"
+ "ldp x28, x27, [%x[inptrs], #0x20]\n"
+ "ldp x26, x21, [%x[inptrs], #0x30]\n"
+ "ldp x25, x24, [%x[outptrs], #0x0]\n"
+ "ldp x23, x22, [%x[outptrs], #0x10]\n"
+ "cbz x15, 3f\n"
+ "ldr q11, [x14, x12]\n"
+ "ldr q20, [x13, x12]\n"
+ "subs x15, x15, #0x1\n"
+ "ldr q16, [x10, x12]\n"
+ "ldr q14, [x9, x12]\n"
+ "zip2 v19.16b, v11.16b, v16.16b\n"
+ "zip1 v11.16b, v11.16b, v16.16b\n"
+ "ldr q13, [x28, x12]\n"
+ "ldr q18, [x27, x12]\n"
+ "zip1 v17.16b, v20.16b, v14.16b\n"
+ "zip2 v14.16b, v20.16b, v14.16b\n"
+ "ldr q16, [x26, x12]\n"
+ "ldr q27, [x21, x12]\n"
+ "zip2 v10.16b, v11.16b, v17.16b\n"
+ "zip1 v11.16b, v11.16b, v17.16b\n"
+ "ldr q24, [%x[params], #0x10]\n"
+ "ldr q9, [%x[params], #0x20]\n"
+ "zip1 v3.16b, v19.16b, v14.16b\n"
+ "zip2 v14.16b, v19.16b, v14.16b\n"
+ "ldr q31, [%x[params], #0x0]\n"
+ "ldr q6, [%x[params], #0x30]\n"
+ "zip2 v30.16b, v13.16b, v16.16b\n"
+ "zip1 v13.16b, v13.16b, v16.16b\n"
+ "ldp x21, x20, [%x[inptrs], #0x40]\n"
+ "ldr q5, [x21, x12]\n"
+ "zip1 v16.16b, v18.16b, v27.16b\n"
+ "zip2 v27.16b, v18.16b, v27.16b\n"
+ "ldr q17, [x20, x12]\n"
+ "ldp x21, x20, [%x[inptrs], #0x50]\n"
+ "zip2 v28.16b, v13.16b, v16.16b\n"
+ "zip1 v13.16b, v13.16b, v16.16b\n"
+ "ldr q16, [x21, x12]\n"
+ "ldr q7, [x20, x12]\n"
+ "zip2 v20.16b, v5.16b, v16.16b\n"
+ "zip1 v5.16b, v5.16b, v16.16b\n"
+ "ldp x21, x20, [%x[inptrs], #0x60]\n"
+ "ldr q16, [x21, x12]\n"
+ "zip1 v22.16b, v17.16b, v7.16b\n"
+ "zip2 v7.16b, v17.16b, v7.16b\n"
+ "ldr q19, [x20, x12]\n"
+ "ldp x21, x20, [%x[inptrs], #0x70]\n"
+ "zip1 v21.16b, v30.16b, v27.16b\n"
+ "zip2 v27.16b, v30.16b, v27.16b\n"
+ "ldr q30, [x21, x12]\n"
+ "ldr q1, [x20, x12]\n"
+ "zip2 v17.16b, v16.16b, v30.16b\n"
+ "zip1 v16.16b, v16.16b, v30.16b\n"
+ "zip1 v18.16b, v19.16b, v1.16b\n"
+ "zip2 v1.16b, v19.16b, v1.16b\n"
+ "ldp x14, x13, [%x[inptrs], #0x0]\n"
+ "ldp x10, x9, [%x[inptrs], #0x10]\n"
+ "ldp x28, x27, [%x[inptrs], #0x20]\n"
+ "ldp x26, x21, [%x[inptrs], #0x30]\n"
+ "zip2 v29.16b, v5.16b, v22.16b\n"
+ "zip1 v5.16b, v5.16b, v22.16b\n"
+ "zip1 v0.16b, v20.16b, v7.16b\n"
+ "zip2 v7.16b, v20.16b, v7.16b\n"
+ "add %x[params], %x[params], #0x40\n"
+ "zip2 v30.16b, v16.16b, v18.16b\n"
+ "zip1 v16.16b, v16.16b, v18.16b\n"
+ "zip1 v2.16b, v17.16b, v1.16b\n"
+ "zip2 v1.16b, v17.16b, v1.16b\n"
+ "mov v26.16b, v31.16b\n"
+ "mov v18.16b, v31.16b\n"
+ "mov v4.16b, v31.16b\n"
+ "beq 2f\n"
+ "1:" // Loop
+ ".inst 0x4e8b971f // sdot v31.4s, v24.16b, v11.16b\n"
+ ".inst 0x4e8d9712 // sdot v18.4s, v24.16b, v13.16b\n"
+ "ext v11.16b, v11.16b, v11.16b, #0x1\n"
+ "add x12, x12, #0x10\n"
+ ".inst 0x4e8d953f // sdot v31.4s, v9.16b, v13.16b\n"
+ "ext v13.16b, v13.16b, v13.16b, #0x1\n"
+ ".inst 0x4e8b971a // sdot v26.4s, v24.16b, v11.16b\n"
+ "ldr q17, [%x[params], #0x0]\n"
+ ".inst 0x4e8d9704 // sdot v4.4s, v24.16b, v13.16b\n"
+ ".inst 0x4e859532 // sdot v18.4s, v9.16b, v5.16b\n"
+ "subs x15, x15, #0x1\n"
+ ".inst 0x4e8594df // sdot v31.4s, v6.16b, v5.16b\n"
+ "ext v5.16b, v5.16b, v5.16b, #0x1\n"
+ ".inst 0x4e8d953a // sdot v26.4s, v9.16b, v13.16b\n"
+ "ldr q20, [%x[params], #0x10]\n"
+ ".inst 0x4e859524 // sdot v4.4s, v9.16b, v5.16b\n"
+ ".inst 0x4e9094d2 // sdot v18.4s, v6.16b, v16.16b\n"
+ "ext v16.16b, v16.16b, v16.16b, #0x1\n"
+ "sqrdmulh v31.4s, v31.4s, v17.4s\n"
+ ".inst 0x4e8594da // sdot v26.4s, v6.16b, v5.16b\n"
+ ".inst 0x4e9094c4 // sdot v4.4s, v6.16b, v16.16b\n"
+ "and v16.16b, v31.16b, v20.16b\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqrdmulh v26.4s, v26.4s, v17.4s\n"
+ "sqrdmulh v18.4s, v18.4s, v17.4s\n"
+ "sqrdmulh v4.4s, v4.4s, v17.4s\n"
+ "ldr q5, [%x[params], #0x60]\n"
+ "sqadd v31.4s, v31.4s, v16.4s\n"
+ "and v19.16b, v26.16b, v20.16b\n"
+ "and v17.16b, v18.16b, v20.16b\n"
+ "and v16.16b, v4.16b, v20.16b\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "srshl v31.4s, v31.4s, v20.4s\n"
+ "sqadd v26.4s, v26.4s, v19.4s\n"
+ "ldr q13, [%x[params], #0x40]\n"
+ "sqadd v18.4s, v18.4s, v17.4s\n"
+ "ldr q17, [%x[params], #0x50]\n"
+ "sqadd v4.4s, v4.4s, v16.4s\n"
+ "ldr q16, [%x[params], #0x30]\n"
+ "add v31.4s, v31.4s, v15.4s\n"
+ "srshl v26.4s, v26.4s, v20.4s\n"
+ "srshl v18.4s, v18.4s, v20.4s\n"
+ "srshl v4.4s, v4.4s, v20.4s\n"
+ "ldr q22, [%x[params], #0x70]\n"
+ "smax v31.4s, v31.4s, v8.4s\n"
+ "add v26.4s, v26.4s, v15.4s\n"
+ "add v18.4s, v18.4s, v15.4s\n"
+ "add v4.4s, v4.4s, v15.4s\n"
+ "smin v31.4s, v31.4s, v12.4s\n"
+ "smax v26.4s, v26.4s, v8.4s\n"
+ "smax v18.4s, v18.4s, v8.4s\n"
+ "smax v4.4s, v4.4s, v8.4s\n"
+ "smin v26.4s, v26.4s, v12.4s\n"
+ "smin v18.4s, v18.4s, v12.4s\n"
+ "smin v4.4s, v4.4s, v12.4s\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "str s31, [x25, x11]\n"
+ "ldr q24, [%x[params], #0x20]\n"
+ "uzp1 v18.16b, v18.16b, v18.16b\n"
+ "uzp1 v4.16b, v4.16b, v4.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "uzp1 v18.16b, v18.16b, v18.16b\n"
+ "str s26, [x24, x11]\n"
+ "uzp1 v4.16b, v4.16b, v4.16b\n"
+ "str s18, [x23, x11]\n"
+ "mov v26.16b, v24.16b\n"
+ "str s4, [x22, x11]\n"
+ "mov v25.16b, v24.16b\n"
+ "mov v23.16b, v24.16b\n"
+ ".inst 0x4e8a9618 // sdot v24.4s, v16.16b, v10.16b\n"
+ ".inst 0x4e9c9619 // sdot v25.4s, v16.16b, v28.16b\n"
+ ".inst 0x4e9c95b8 // sdot v24.4s, v13.16b, v28.16b\n"
+ "ext v10.16b, v10.16b, v10.16b, #0x1\n"
+ "add x11, x11, #0x4\n"
+ "ext v28.16b, v28.16b, v28.16b, #0x1\n"
+ ".inst 0x4e8a961a // sdot v26.4s, v16.16b, v10.16b\n"
+ "ldr q10, [x13, x12]\n"
+ ".inst 0x4e9c9617 // sdot v23.4s, v16.16b, v28.16b\n"
+ ".inst 0x4e9d95b9 // sdot v25.4s, v13.16b, v29.16b\n"
+ ".inst 0x4e9d9638 // sdot v24.4s, v17.16b, v29.16b\n"
+ "ext v29.16b, v29.16b, v29.16b, #0x1\n"
+ ".inst 0x4e9c95ba // sdot v26.4s, v13.16b, v28.16b\n"
+ "ldr q20, [x27, x12]\n"
+ ".inst 0x4e9d95b7 // sdot v23.4s, v13.16b, v29.16b\n"
+ "sqrdmulh v24.4s, v24.4s, v5.4s\n"
+ ".inst 0x4e9e9639 // sdot v25.4s, v17.16b, v30.16b\n"
+ "ext v30.16b, v30.16b, v30.16b, #0x1\n"
+ ".inst 0x4e9d963a // sdot v26.4s, v17.16b, v29.16b\n"
+ ".inst 0x4e9e9637 // sdot v23.4s, v17.16b, v30.16b\n"
+ "and v16.16b, v24.16b, v22.16b\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqrdmulh v26.4s, v26.4s, v5.4s\n"
+ "sqrdmulh v25.4s, v25.4s, v5.4s\n"
+ "sqrdmulh v23.4s, v23.4s, v5.4s\n"
+ "ldr q19, [%x[params], #0xc0]\n"
+ "sqadd v24.4s, v24.4s, v16.4s\n"
+ "and v18.16b, v26.16b, v22.16b\n"
+ "and v17.16b, v25.16b, v22.16b\n"
+ "and v16.16b, v23.16b, v22.16b\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "srshl v24.4s, v24.4s, v22.4s\n"
+ "sqadd v26.4s, v26.4s, v18.4s\n"
+ "ldr q18, [%x[params], #0xa0]\n"
+ "sqadd v25.4s, v25.4s, v17.4s\n"
+ "ldr q17, [%x[params], #0xb0]\n"
+ "sqadd v23.4s, v23.4s, v16.4s\n"
+ "ldr q16, [%x[params], #0x90]\n"
+ "add v24.4s, v24.4s, v15.4s\n"
+ "srshl v26.4s, v26.4s, v22.4s\n"
+ "srshl v25.4s, v25.4s, v22.4s\n"
+ "srshl v23.4s, v23.4s, v22.4s\n"
+ "ldr q22, [%x[params], #0xd0]\n"
+ "smax v24.4s, v24.4s, v8.4s\n"
+ "add v26.4s, v26.4s, v15.4s\n"
+ "add v25.4s, v25.4s, v15.4s\n"
+ "add v23.4s, v23.4s, v15.4s\n"
+ "smin v24.4s, v24.4s, v12.4s\n"
+ "smax v26.4s, v26.4s, v8.4s\n"
+ "smax v25.4s, v25.4s, v8.4s\n"
+ "smax v23.4s, v23.4s, v8.4s\n"
+ "smin v26.4s, v26.4s, v12.4s\n"
+ "smin v25.4s, v25.4s, v12.4s\n"
+ "smin v23.4s, v23.4s, v12.4s\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "str s24, [x25, x11]\n"
+ "ldr q24, [%x[params], #0x80]\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "str s26, [x24, x11]\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "str s25, [x23, x11]\n"
+ "str s23, [x22, x11]\n"
+ "mov v23.16b, v24.16b\n"
+ "mov v31.16b, v24.16b\n"
+ ".inst 0x4e95961f // sdot v31.4s, v16.16b, v21.16b\n"
+ "mov v13.16b, v24.16b\n"
+ ".inst 0x4e839618 // sdot v24.4s, v16.16b, v3.16b\n"
+ ".inst 0x4e959658 // sdot v24.4s, v18.16b, v21.16b\n"
+ "add x11, x11, #0x4\n"
+ "ext v3.16b, v3.16b, v3.16b, #0x1\n"
+ "ext v21.16b, v21.16b, v21.16b, #0x1\n"
+ ".inst 0x4e839617 // sdot v23.4s, v16.16b, v3.16b\n"
+ "ldr q3, [x10, x12]\n"
+ ".inst 0x4e95960d // sdot v13.4s, v16.16b, v21.16b\n"
+ ".inst 0x4e80965f // sdot v31.4s, v18.16b, v0.16b\n"
+ ".inst 0x4e809638 // sdot v24.4s, v17.16b, v0.16b\n"
+ "ext v0.16b, v0.16b, v0.16b, #0x1\n"
+ ".inst 0x4e959657 // sdot v23.4s, v18.16b, v21.16b\n"
+ "ldr q4, [x26, x12]\n"
+ ".inst 0x4e80964d // sdot v13.4s, v18.16b, v0.16b\n"
+ ".inst 0x4e82963f // sdot v31.4s, v17.16b, v2.16b\n"
+ "ext v2.16b, v2.16b, v2.16b, #0x1\n"
+ "sqrdmulh v24.4s, v24.4s, v19.4s\n"
+ ".inst 0x4e809637 // sdot v23.4s, v17.16b, v0.16b\n"
+ ".inst 0x4e82962d // sdot v13.4s, v17.16b, v2.16b\n"
+ "and v16.16b, v24.16b, v22.16b\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqrdmulh v23.4s, v23.4s, v19.4s\n"
+ "sqrdmulh v31.4s, v31.4s, v19.4s\n"
+ "sqrdmulh v13.4s, v13.4s, v19.4s\n"
+ "ldr q19, [%x[params], #0x120]\n"
+ "sqadd v24.4s, v24.4s, v16.4s\n"
+ "and v18.16b, v23.16b, v22.16b\n"
+ "and v17.16b, v31.16b, v22.16b\n"
+ "and v16.16b, v13.16b, v22.16b\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "srshl v24.4s, v24.4s, v22.4s\n"
+ "sqadd v23.4s, v23.4s, v18.4s\n"
+ "ldr q18, [%x[params], #0x100]\n"
+ "sqadd v31.4s, v31.4s, v17.4s\n"
+ "ldr q17, [%x[params], #0x110]\n"
+ "sqadd v13.4s, v13.4s, v16.4s\n"
+ "ldr q16, [%x[params], #0xf0]\n"
+ "add v24.4s, v24.4s, v15.4s\n"
+ "srshl v23.4s, v23.4s, v22.4s\n"
+ "srshl v31.4s, v31.4s, v22.4s\n"
+ "srshl v13.4s, v13.4s, v22.4s\n"
+ "ldr q22, [%x[params], #0x130]\n"
+ "smax v24.4s, v24.4s, v8.4s\n"
+ "add v23.4s, v23.4s, v15.4s\n"
+ "add v31.4s, v31.4s, v15.4s\n"
+ "add v13.4s, v13.4s, v15.4s\n"
+ "smin v24.4s, v24.4s, v12.4s\n"
+ "smax v23.4s, v23.4s, v8.4s\n"
+ "smax v31.4s, v31.4s, v8.4s\n"
+ "smax v13.4s, v13.4s, v8.4s\n"
+ "smin v23.4s, v23.4s, v12.4s\n"
+ "smin v31.4s, v31.4s, v12.4s\n"
+ "smin v13.4s, v13.4s, v12.4s\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "str s24, [x25, x11]\n"
+ "ldr q2, [%x[params], #0xe0]\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "uzp1 v13.16b, v13.16b, v13.16b\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "str s23, [x24, x11]\n"
+ "uzp1 v13.16b, v13.16b, v13.16b\n"
+ "str s31, [x23, x11]\n"
+ "mov v25.16b, v2.16b\n"
+ "str s13, [x22, x11]\n"
+ "mov v21.16b, v2.16b\n"
+ "mov v30.16b, v2.16b\n"
+ ".inst 0x4e8e9602 // sdot v2.4s, v16.16b, v14.16b\n"
+ ".inst 0x4e9b9615 // sdot v21.4s, v16.16b, v27.16b\n"
+ ".inst 0x4e9b9642 // sdot v2.4s, v18.16b, v27.16b\n"
+ "ext v14.16b, v14.16b, v14.16b, #0x1\n"
+ "add x11, x11, #0x4\n"
+ "ext v27.16b, v27.16b, v27.16b, #0x1\n"
+ ".inst 0x4e8e9619 // sdot v25.4s, v16.16b, v14.16b\n"
+ "ldr q14, [x9, x12]\n"
+ ".inst 0x4e9b961e // sdot v30.4s, v16.16b, v27.16b\n"
+ ".inst 0x4e879655 // sdot v21.4s, v18.16b, v7.16b\n"
+ ".inst 0x4e879622 // sdot v2.4s, v17.16b, v7.16b\n"
+ "ext v7.16b, v7.16b, v7.16b, #0x1\n"
+ ".inst 0x4e9b9659 // sdot v25.4s, v18.16b, v27.16b\n"
+ "ldr q27, [x21, x12]\n"
+ ".inst 0x4e87965e // sdot v30.4s, v18.16b, v7.16b\n"
+ "sqrdmulh v2.4s, v2.4s, v19.4s\n"
+ ".inst 0x4e819635 // sdot v21.4s, v17.16b, v1.16b\n"
+ "ext v1.16b, v1.16b, v1.16b, #0x1\n"
+ ".inst 0x4e879639 // sdot v25.4s, v17.16b, v7.16b\n"
+ ".inst 0x4e81963e // sdot v30.4s, v17.16b, v1.16b\n"
+ "and v16.16b, v2.16b, v22.16b\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqrdmulh v25.4s, v25.4s, v19.4s\n"
+ "sqrdmulh v21.4s, v21.4s, v19.4s\n"
+ "sqrdmulh v30.4s, v30.4s, v19.4s\n"
+ "ldr q11, [x14, x12]\n"
+ "ldp x21, x20, [%x[inptrs], #0x40]\n"
+ "ldr q5, [x21, x12]\n"
+ "ldr q29, [x20, x12]\n"
+ "sqadd v2.4s, v2.4s, v16.4s\n"
+ "and v19.16b, v25.16b, v22.16b\n"
+ "and v17.16b, v21.16b, v22.16b\n"
+ "and v16.16b, v30.16b, v22.16b\n"
+ "ldp x21, x20, [%x[inptrs], #0x50]\n"
+ "ldr q26, [x21, x12]\n"
+ "ldr q7, [x20, x12]\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "srshl v2.4s, v2.4s, v22.4s\n"
+ "sqadd v25.4s, v25.4s, v19.4s\n"
+ "ldr q9, [%x[params], #0x160]\n"
+ "sqadd v21.4s, v21.4s, v17.4s\n"
+ "ldr q6, [%x[params], #0x170]\n"
+ "sqadd v30.4s, v30.4s, v16.4s\n"
+ "ldr q24, [%x[params], #0x150]\n"
+ "add v2.4s, v2.4s, v15.4s\n"
+ "srshl v25.4s, v25.4s, v22.4s\n"
+ "srshl v21.4s, v21.4s, v22.4s\n"
+ "srshl v30.4s, v30.4s, v22.4s\n"
+ "ldr q13, [x28, x12]\n"
+ "smax v2.4s, v2.4s, v8.4s\n"
+ "ldp x21, x20, [%x[inptrs], #0x60]\n"
+ "ldr q16, [x21, x12]\n"
+ "ldr q28, [x20, x12]\n"
+ "add v25.4s, v25.4s, v15.4s\n"
+ "add v21.4s, v21.4s, v15.4s\n"
+ "add v30.4s, v30.4s, v15.4s\n"
+ "smin v2.4s, v2.4s, v12.4s\n"
+ "ldp x21, x20, [%x[inptrs], #0x70]\n"
+ "ldr q23, [x21, x12]\n"
+ "ldr q1, [x20, x12]\n"
+ "smax v25.4s, v25.4s, v8.4s\n"
+ "smax v21.4s, v21.4s, v8.4s\n"
+ "ldp x14, x13, [%x[inptrs], #0x0]\n"
+ "smax v30.4s, v30.4s, v8.4s\n"
+ "smin v25.4s, v25.4s, v12.4s\n"
+ "ldp x10, x9, [%x[inptrs], #0x10]\n"
+ "ldp x28, x27, [%x[inptrs], #0x20]\n"
+ "smin v21.4s, v21.4s, v12.4s\n"
+ "smin v30.4s, v30.4s, v12.4s\n"
+ "ldp x26, x21, [%x[inptrs], #0x30]\n"
+ "uzp1 v2.16b, v2.16b, v2.16b\n"
+ "uzp1 v2.16b, v2.16b, v2.16b\n"
+ "str s2, [x25, x11]\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "zip2 v18.16b, v11.16b, v3.16b\n"
+ "zip1 v11.16b, v11.16b, v3.16b\n"
+ "zip1 v17.16b, v10.16b, v14.16b\n"
+ "zip2 v14.16b, v10.16b, v14.16b\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "str s25, [x24, x11]\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "str s21, [x23, x11]\n"
+ "str s30, [x22, x11]\n"
+ "zip2 v10.16b, v11.16b, v17.16b\n"
+ "zip1 v11.16b, v11.16b, v17.16b\n"
+ "add x11, x11, #0x4\n"
+ "zip1 v3.16b, v18.16b, v14.16b\n"
+ "zip2 v14.16b, v18.16b, v14.16b\n"
+ "ldr q31, [%x[params], #0x140]\n"
+ "add %x[params], %x[params], #0x180\n"
+ "zip2 v22.16b, v13.16b, v4.16b\n"
+ "zip1 v13.16b, v13.16b, v4.16b\n"
+ "zip1 v2.16b, v20.16b, v27.16b\n"
+ "zip2 v27.16b, v20.16b, v27.16b\n"
+ "zip2 v19.16b, v5.16b, v26.16b\n"
+ "zip1 v5.16b, v5.16b, v26.16b\n"
+ "zip1 v18.16b, v29.16b, v7.16b\n"
+ "zip2 v7.16b, v29.16b, v7.16b\n"
+ "zip2 v4.16b, v16.16b, v23.16b\n"
+ "zip1 v16.16b, v16.16b, v23.16b\n"
+ "zip1 v17.16b, v28.16b, v1.16b\n"
+ "zip2 v1.16b, v28.16b, v1.16b\n"
+ "zip2 v28.16b, v13.16b, v2.16b\n"
+ "zip1 v13.16b, v13.16b, v2.16b\n"
+ "zip1 v21.16b, v22.16b, v27.16b\n"
+ "zip2 v27.16b, v22.16b, v27.16b\n"
+ "zip2 v29.16b, v5.16b, v18.16b\n"
+ "zip1 v5.16b, v5.16b, v18.16b\n"
+ "zip1 v0.16b, v19.16b, v7.16b\n"
+ "zip2 v7.16b, v19.16b, v7.16b\n"
+ "zip2 v30.16b, v16.16b, v17.16b\n"
+ "zip1 v16.16b, v16.16b, v17.16b\n"
+ "zip1 v2.16b, v4.16b, v1.16b\n"
+ "zip2 v1.16b, v4.16b, v1.16b\n"
+ "mov v26.16b, v31.16b\n"
+ "mov v18.16b, v31.16b\n"
+ "mov v4.16b, v31.16b\n"
+ "bgt 1b\n"
+ "2:" // Detached iteration
+ ".inst 0x4e8b971f // sdot v31.4s, v24.16b, v11.16b\n"
+ ".inst 0x4e8d9712 // sdot v18.4s, v24.16b, v13.16b\n"
+ "ext v11.16b, v11.16b, v11.16b, #0x1\n"
+ "tst %x[n_channels], #0xf\n"
+ ".inst 0x4e8d953f // sdot v31.4s, v9.16b, v13.16b\n"
+ "ext v13.16b, v13.16b, v13.16b, #0x1\n"
+ ".inst 0x4e8b971a // sdot v26.4s, v24.16b, v11.16b\n"
+ "ldr q17, [%x[params], #0x0]\n"
+ ".inst 0x4e8d9704 // sdot v4.4s, v24.16b, v13.16b\n"
+ ".inst 0x4e859532 // sdot v18.4s, v9.16b, v5.16b\n"
+ "add x12, x12, #0x10\n"
+ ".inst 0x4e8594df // sdot v31.4s, v6.16b, v5.16b\n"
+ "ext v5.16b, v5.16b, v5.16b, #0x1\n"
+ ".inst 0x4e8d953a // sdot v26.4s, v9.16b, v13.16b\n"
+ "ldr q19, [%x[params], #0x10]\n"
+ ".inst 0x4e859524 // sdot v4.4s, v9.16b, v5.16b\n"
+ ".inst 0x4e9094d2 // sdot v18.4s, v6.16b, v16.16b\n"
+ "ext v16.16b, v16.16b, v16.16b, #0x1\n"
+ "sqrdmulh v31.4s, v31.4s, v17.4s\n"
+ ".inst 0x4e8594da // sdot v26.4s, v6.16b, v5.16b\n"
+ ".inst 0x4e9094c4 // sdot v4.4s, v6.16b, v16.16b\n"
+ "and v16.16b, v31.16b, v19.16b\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqrdmulh v26.4s, v26.4s, v17.4s\n"
+ "sqrdmulh v18.4s, v18.4s, v17.4s\n"
+ "sqrdmulh v4.4s, v4.4s, v17.4s\n"
+ "ldr q24, [%x[params], #0x60]\n"
+ "sqadd v31.4s, v31.4s, v16.4s\n"
+ "and v20.16b, v26.16b, v19.16b\n"
+ "and v17.16b, v18.16b, v19.16b\n"
+ "and v16.16b, v4.16b, v19.16b\n"
+ "sshr v20.4s, v20.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "srshl v31.4s, v31.4s, v19.4s\n"
+ "sqadd v26.4s, v26.4s, v20.4s\n"
+ "ldr q5, [%x[params], #0x40]\n"
+ "sqadd v18.4s, v18.4s, v17.4s\n"
+ "ldr q17, [%x[params], #0x50]\n"
+ "sqadd v4.4s, v4.4s, v16.4s\n"
+ "ldr q16, [%x[params], #0x30]\n"
+ "add v31.4s, v31.4s, v15.4s\n"
+ "srshl v26.4s, v26.4s, v19.4s\n"
+ "srshl v18.4s, v18.4s, v19.4s\n"
+ "srshl v4.4s, v4.4s, v19.4s\n"
+ "ldr q23, [%x[params], #0x70]\n"
+ "smax v31.4s, v31.4s, v8.4s\n"
+ "add v26.4s, v26.4s, v15.4s\n"
+ "add v18.4s, v18.4s, v15.4s\n"
+ "add v4.4s, v4.4s, v15.4s\n"
+ "smin v31.4s, v31.4s, v12.4s\n"
+ "smax v26.4s, v26.4s, v8.4s\n"
+ "smax v18.4s, v18.4s, v8.4s\n"
+ "smax v4.4s, v4.4s, v8.4s\n"
+ "smin v26.4s, v26.4s, v12.4s\n"
+ "smin v18.4s, v18.4s, v12.4s\n"
+ "smin v4.4s, v4.4s, v12.4s\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "str s31, [x25, x11]\n"
+ "ldr q25, [%x[params], #0x20]\n"
+ "uzp1 v18.16b, v18.16b, v18.16b\n"
+ "uzp1 v4.16b, v4.16b, v4.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "uzp1 v18.16b, v18.16b, v18.16b\n"
+ "str s26, [x24, x11]\n"
+ "uzp1 v4.16b, v4.16b, v4.16b\n"
+ "str s18, [x23, x11]\n"
+ "mov v22.16b, v25.16b\n"
+ "str s4, [x22, x11]\n"
+ "mov v20.16b, v25.16b\n"
+ "mov v19.16b, v25.16b\n"
+ ".inst 0x4e8a9619 // sdot v25.4s, v16.16b, v10.16b\n"
+ ".inst 0x4e9c9614 // sdot v20.4s, v16.16b, v28.16b\n"
+ ".inst 0x4e9c94b9 // sdot v25.4s, v5.16b, v28.16b\n"
+ "ext v10.16b, v10.16b, v10.16b, #0x1\n"
+ "add x11, x11, #0x4\n"
+ "ext v28.16b, v28.16b, v28.16b, #0x1\n"
+ ".inst 0x4e8a9616 // sdot v22.4s, v16.16b, v10.16b\n"
+ ".inst 0x4e9c9613 // sdot v19.4s, v16.16b, v28.16b\n"
+ ".inst 0x4e9d94b4 // sdot v20.4s, v5.16b, v29.16b\n"
+ ".inst 0x4e9d9639 // sdot v25.4s, v17.16b, v29.16b\n"
+ "ext v29.16b, v29.16b, v29.16b, #0x1\n"
+ ".inst 0x4e9c94b6 // sdot v22.4s, v5.16b, v28.16b\n"
+ ".inst 0x4e9d94b3 // sdot v19.4s, v5.16b, v29.16b\n"
+ "sqrdmulh v25.4s, v25.4s, v24.4s\n"
+ ".inst 0x4e9e9634 // sdot v20.4s, v17.16b, v30.16b\n"
+ "ext v30.16b, v30.16b, v30.16b, #0x1\n"
+ ".inst 0x4e9d9636 // sdot v22.4s, v17.16b, v29.16b\n"
+ ".inst 0x4e9e9633 // sdot v19.4s, v17.16b, v30.16b\n"
+ "and v16.16b, v25.16b, v23.16b\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqrdmulh v22.4s, v22.4s, v24.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v24.4s\n"
+ "sqrdmulh v19.4s, v19.4s, v24.4s\n"
+ "ldr q24, [%x[params], #0xc0]\n"
+ "sqadd v25.4s, v25.4s, v16.4s\n"
+ "and v18.16b, v22.16b, v23.16b\n"
+ "and v17.16b, v20.16b, v23.16b\n"
+ "and v16.16b, v19.16b, v23.16b\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "srshl v25.4s, v25.4s, v23.4s\n"
+ "sqadd v22.4s, v22.4s, v18.4s\n"
+ "ldr q18, [%x[params], #0xa0]\n"
+ "sqadd v20.4s, v20.4s, v17.4s\n"
+ "ldr q17, [%x[params], #0xb0]\n"
+ "sqadd v19.4s, v19.4s, v16.4s\n"
+ "ldr q16, [%x[params], #0x90]\n"
+ "add v25.4s, v25.4s, v15.4s\n"
+ "srshl v22.4s, v22.4s, v23.4s\n"
+ "srshl v20.4s, v20.4s, v23.4s\n"
+ "srshl v19.4s, v19.4s, v23.4s\n"
+ "ldr q23, [%x[params], #0xd0]\n"
+ "smax v25.4s, v25.4s, v8.4s\n"
+ "add v22.4s, v22.4s, v15.4s\n"
+ "add v20.4s, v20.4s, v15.4s\n"
+ "add v19.4s, v19.4s, v15.4s\n"
+ "smin v25.4s, v25.4s, v12.4s\n"
+ "smax v22.4s, v22.4s, v8.4s\n"
+ "smax v20.4s, v20.4s, v8.4s\n"
+ "smax v19.4s, v19.4s, v8.4s\n"
+ "smin v22.4s, v22.4s, v12.4s\n"
+ "smin v20.4s, v20.4s, v12.4s\n"
+ "smin v19.4s, v19.4s, v12.4s\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "str s25, [x25, x11]\n"
+ "ldr q10, [%x[params], #0x80]\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "str s22, [x24, x11]\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "str s20, [x23, x11]\n"
+ "str s19, [x22, x11]\n"
+ "mov v28.16b, v10.16b\n"
+ "mov v20.16b, v10.16b\n"
+ ".inst 0x4e959614 // sdot v20.4s, v16.16b, v21.16b\n"
+ "mov v19.16b, v10.16b\n"
+ ".inst 0x4e83960a // sdot v10.4s, v16.16b, v3.16b\n"
+ ".inst 0x4e95964a // sdot v10.4s, v18.16b, v21.16b\n"
+ "add x11, x11, #0x4\n"
+ "ext v3.16b, v3.16b, v3.16b, #0x1\n"
+ "ext v21.16b, v21.16b, v21.16b, #0x1\n"
+ ".inst 0x4e83961c // sdot v28.4s, v16.16b, v3.16b\n"
+ ".inst 0x4e959613 // sdot v19.4s, v16.16b, v21.16b\n"
+ ".inst 0x4e809654 // sdot v20.4s, v18.16b, v0.16b\n"
+ ".inst 0x4e80962a // sdot v10.4s, v17.16b, v0.16b\n"
+ "ext v0.16b, v0.16b, v0.16b, #0x1\n"
+ ".inst 0x4e95965c // sdot v28.4s, v18.16b, v21.16b\n"
+ ".inst 0x4e809653 // sdot v19.4s, v18.16b, v0.16b\n"
+ ".inst 0x4e829634 // sdot v20.4s, v17.16b, v2.16b\n"
+ "ext v2.16b, v2.16b, v2.16b, #0x1\n"
+ "sqrdmulh v10.4s, v10.4s, v24.4s\n"
+ ".inst 0x4e80963c // sdot v28.4s, v17.16b, v0.16b\n"
+ ".inst 0x4e829633 // sdot v19.4s, v17.16b, v2.16b\n"
+ "and v16.16b, v10.16b, v23.16b\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqrdmulh v28.4s, v28.4s, v24.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v24.4s\n"
+ "sqrdmulh v19.4s, v19.4s, v24.4s\n"
+ "ldr q24, [%x[params], #0x120]\n"
+ "sqadd v10.4s, v10.4s, v16.4s\n"
+ "and v18.16b, v28.16b, v23.16b\n"
+ "and v17.16b, v20.16b, v23.16b\n"
+ "and v16.16b, v19.16b, v23.16b\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "srshl v10.4s, v10.4s, v23.4s\n"
+ "sqadd v28.4s, v28.4s, v18.4s\n"
+ "ldr q18, [%x[params], #0x100]\n"
+ "sqadd v20.4s, v20.4s, v17.4s\n"
+ "ldr q17, [%x[params], #0x110]\n"
+ "sqadd v19.4s, v19.4s, v16.4s\n"
+ "ldr q16, [%x[params], #0xf0]\n"
+ "add v10.4s, v10.4s, v15.4s\n"
+ "srshl v28.4s, v28.4s, v23.4s\n"
+ "srshl v20.4s, v20.4s, v23.4s\n"
+ "srshl v19.4s, v19.4s, v23.4s\n"
+ "ldr q23, [%x[params], #0x130]\n"
+ "smax v10.4s, v10.4s, v8.4s\n"
+ "add v28.4s, v28.4s, v15.4s\n"
+ "add v20.4s, v20.4s, v15.4s\n"
+ "add v19.4s, v19.4s, v15.4s\n"
+ "smin v10.4s, v10.4s, v12.4s\n"
+ "smax v28.4s, v28.4s, v8.4s\n"
+ "smax v20.4s, v20.4s, v8.4s\n"
+ "smax v19.4s, v19.4s, v8.4s\n"
+ "smin v28.4s, v28.4s, v12.4s\n"
+ "smin v20.4s, v20.4s, v12.4s\n"
+ "smin v19.4s, v19.4s, v12.4s\n"
+ "uzp1 v10.16b, v10.16b, v10.16b\n"
+ "uzp1 v10.16b, v10.16b, v10.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "str s10, [x25, x11]\n"
+ "ldr q22, [%x[params], #0xe0]\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "add %x[params], %x[params], #0x140\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "str s28, [x24, x11]\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "str s20, [x23, x11]\n"
+ "mov v21.16b, v22.16b\n"
+ "str s19, [x22, x11]\n"
+ "mov v20.16b, v22.16b\n"
+ "mov v19.16b, v22.16b\n"
+ ".inst 0x4e8e9616 // sdot v22.4s, v16.16b, v14.16b\n"
+ ".inst 0x4e9b9614 // sdot v20.4s, v16.16b, v27.16b\n"
+ ".inst 0x4e9b9656 // sdot v22.4s, v18.16b, v27.16b\n"
+ "ext v14.16b, v14.16b, v14.16b, #0x1\n"
+ "add x11, x11, #0x4\n"
+ "ext v27.16b, v27.16b, v27.16b, #0x1\n"
+ ".inst 0x4e8e9615 // sdot v21.4s, v16.16b, v14.16b\n"
+ ".inst 0x4e9b9613 // sdot v19.4s, v16.16b, v27.16b\n"
+ ".inst 0x4e879654 // sdot v20.4s, v18.16b, v7.16b\n"
+ ".inst 0x4e879636 // sdot v22.4s, v17.16b, v7.16b\n"
+ "ext v7.16b, v7.16b, v7.16b, #0x1\n"
+ ".inst 0x4e9b9655 // sdot v21.4s, v18.16b, v27.16b\n"
+ ".inst 0x4e879653 // sdot v19.4s, v18.16b, v7.16b\n"
+ "sqrdmulh v22.4s, v22.4s, v24.4s\n"
+ ".inst 0x4e819634 // sdot v20.4s, v17.16b, v1.16b\n"
+ "ext v1.16b, v1.16b, v1.16b, #0x1\n"
+ ".inst 0x4e879635 // sdot v21.4s, v17.16b, v7.16b\n"
+ ".inst 0x4e819633 // sdot v19.4s, v17.16b, v1.16b\n"
+ "and v16.16b, v22.16b, v23.16b\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqrdmulh v21.4s, v21.4s, v24.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v24.4s\n"
+ "sqrdmulh v19.4s, v19.4s, v24.4s\n"
+ "sqadd v22.4s, v22.4s, v16.4s\n"
+ "and v18.16b, v21.16b, v23.16b\n"
+ "and v17.16b, v20.16b, v23.16b\n"
+ "and v16.16b, v19.16b, v23.16b\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqadd v21.4s, v21.4s, v18.4s\n"
+ "sqadd v20.4s, v20.4s, v17.4s\n"
+ "sqadd v19.4s, v19.4s, v16.4s\n"
+ "srshl v22.4s, v22.4s, v23.4s\n"
+ "srshl v21.4s, v21.4s, v23.4s\n"
+ "srshl v20.4s, v20.4s, v23.4s\n"
+ "srshl v19.4s, v19.4s, v23.4s\n"
+ "add v22.4s, v22.4s, v15.4s\n"
+ "add v21.4s, v21.4s, v15.4s\n"
+ "add v20.4s, v20.4s, v15.4s\n"
+ "add v19.4s, v19.4s, v15.4s\n"
+ "smax v22.4s, v22.4s, v8.4s\n"
+ "smax v21.4s, v21.4s, v8.4s\n"
+ "smax v20.4s, v20.4s, v8.4s\n"
+ "smax v19.4s, v19.4s, v8.4s\n"
+ "smin v22.4s, v22.4s, v12.4s\n"
+ "smin v21.4s, v21.4s, v12.4s\n"
+ "smin v20.4s, v20.4s, v12.4s\n"
+ "smin v19.4s, v19.4s, v12.4s\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "str s22, [x25, x11]\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "str s21, [x24, x11]\n"
+ "str s20, [x23, x11]\n"
+ "str s19, [x22, x11]\n"
+ "add x11, x11, #0x4\n"
+ "beq 35f\n"
+ "3:" // Oddments
+ "and x20, %x[n_channels], #0xf\n"
+ "add x14, x14, x12\n"
+ "add x13, x13, x12\n"
+ "add x10, x10, x12\n"
+ "add x9, x9, x12\n"
+ "add x28, x28, x12\n"
+ "add x27, x27, x12\n"
+ "add x26, x26, x12\n"
+ "add x21, x21, x12\n"
+ "tbz %x[n_channels], #3, 7f\n"
+ "ldr d11, [x14], #0x8\n"
+ "ldr d10, [x13], #0x8\n"
+ "ldr d3, [x10], #0x8\n"
+ "ldr d14, [x9], #0x8\n"
+ "ldr d13, [x28], #0x8\n"
+ "ldr d28, [x27], #0x8\n"
+ "ldr d21, [x26], #0x8\n"
+ "ldr d27, [x21], #0x8\n"
+ "tbz %x[n_channels], #2, 5f\n"
+ "ld1 { v11.s }[2], [x14], #0x4\n"
+ "ld1 { v10.s }[2], [x13], #0x4\n"
+ "ld1 { v3.s }[2], [x10], #0x4\n"
+ "ld1 { v14.s }[2], [x9], #0x4\n"
+ "ld1 { v13.s }[2], [x28], #0x4\n"
+ "ld1 { v28.s }[2], [x27], #0x4\n"
+ "ld1 { v21.s }[2], [x26], #0x4\n"
+ "ld1 { v27.s }[2], [x21], #0x4\n"
+ "tbz %x[n_channels], #1, 4f\n"
+ "ld1 { v11.h }[6], [x14], #0x2\n"
+ "ld1 { v10.h }[6], [x13], #0x2\n"
+ "ld1 { v3.h }[6], [x10], #0x2\n"
+ "ld1 { v14.h }[6], [x9], #0x2\n"
+ "ld1 { v13.h }[6], [x28], #0x2\n"
+ "ld1 { v28.h }[6], [x27], #0x2\n"
+ "ld1 { v21.h }[6], [x26], #0x2\n"
+ "ld1 { v27.h }[6], [x21], #0x2\n"
+ "tbz %x[n_channels], #0, 11f\n"
+ "ld1 { v11.b }[14], [x14], #0x1\n"
+ "ld1 { v10.b }[14], [x13], #0x1\n"
+ "ld1 { v3.b }[14], [x10], #0x1\n"
+ "ld1 { v14.b }[14], [x9], #0x1\n"
+ "ld1 { v13.b }[14], [x28], #0x1\n"
+ "ld1 { v28.b }[14], [x27], #0x1\n"
+ "ld1 { v21.b }[14], [x26], #0x1\n"
+ "ld1 { v27.b }[14], [x21], #0x1\n"
+ "b 11f\n"
+ "4:" // Oddments: Load (A): Bit 3: Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 11f\n"
+ "ld1 { v11.b }[12], [x14], #0x1\n"
+ "ld1 { v10.b }[12], [x13], #0x1\n"
+ "ld1 { v3.b }[12], [x10], #0x1\n"
+ "ld1 { v14.b }[12], [x9], #0x1\n"
+ "ld1 { v13.b }[12], [x28], #0x1\n"
+ "ld1 { v28.b }[12], [x27], #0x1\n"
+ "ld1 { v21.b }[12], [x26], #0x1\n"
+ "ld1 { v27.b }[12], [x21], #0x1\n"
+ "b 11f\n"
+ "5:" // Oddments: Load (A): Bit 3: Bit 2: Unset
+ "tbz %x[n_channels], #1, 6f\n"
+ "ld1 { v11.h }[4], [x14], #0x2\n"
+ "ld1 { v10.h }[4], [x13], #0x2\n"
+ "ld1 { v3.h }[4], [x10], #0x2\n"
+ "ld1 { v14.h }[4], [x9], #0x2\n"
+ "ld1 { v13.h }[4], [x28], #0x2\n"
+ "ld1 { v28.h }[4], [x27], #0x2\n"
+ "ld1 { v21.h }[4], [x26], #0x2\n"
+ "ld1 { v27.h }[4], [x21], #0x2\n"
+ "tbz %x[n_channels], #0, 11f\n"
+ "ld1 { v11.b }[10], [x14], #0x1\n"
+ "ld1 { v10.b }[10], [x13], #0x1\n"
+ "ld1 { v3.b }[10], [x10], #0x1\n"
+ "ld1 { v14.b }[10], [x9], #0x1\n"
+ "ld1 { v13.b }[10], [x28], #0x1\n"
+ "ld1 { v28.b }[10], [x27], #0x1\n"
+ "ld1 { v21.b }[10], [x26], #0x1\n"
+ "ld1 { v27.b }[10], [x21], #0x1\n"
+ "b 11f\n"
+ "6:" // Oddments: Load (A): Bit 3: Bit 2: Unset: Bit 1: Unset
+ "tbz %x[n_channels], #0, 11f\n"
+ "ld1 { v11.b }[8], [x14], #0x1\n"
+ "ld1 { v10.b }[8], [x13], #0x1\n"
+ "ld1 { v3.b }[8], [x10], #0x1\n"
+ "ld1 { v14.b }[8], [x9], #0x1\n"
+ "ld1 { v13.b }[8], [x28], #0x1\n"
+ "ld1 { v28.b }[8], [x27], #0x1\n"
+ "ld1 { v21.b }[8], [x26], #0x1\n"
+ "ld1 { v27.b }[8], [x21], #0x1\n"
+ "b 11f\n"
+ "7:" // Oddments: Load (A): Bit 3: Unset
+ "tbz %x[n_channels], #2, 9f\n"
+ "ldr s11, [x14], #0x4\n"
+ "ldr s10, [x13], #0x4\n"
+ "ldr s3, [x10], #0x4\n"
+ "ldr s14, [x9], #0x4\n"
+ "ldr s13, [x28], #0x4\n"
+ "ldr s28, [x27], #0x4\n"
+ "ldr s21, [x26], #0x4\n"
+ "ldr s27, [x21], #0x4\n"
+ "tbz %x[n_channels], #1, 8f\n"
+ "ld1 { v11.h }[2], [x14], #0x2\n"
+ "ld1 { v10.h }[2], [x13], #0x2\n"
+ "ld1 { v3.h }[2], [x10], #0x2\n"
+ "ld1 { v14.h }[2], [x9], #0x2\n"
+ "ld1 { v13.h }[2], [x28], #0x2\n"
+ "ld1 { v28.h }[2], [x27], #0x2\n"
+ "ld1 { v21.h }[2], [x26], #0x2\n"
+ "ld1 { v27.h }[2], [x21], #0x2\n"
+ "tbz %x[n_channels], #0, 11f\n"
+ "ld1 { v11.b }[6], [x14], #0x1\n"
+ "ld1 { v10.b }[6], [x13], #0x1\n"
+ "ld1 { v3.b }[6], [x10], #0x1\n"
+ "ld1 { v14.b }[6], [x9], #0x1\n"
+ "ld1 { v13.b }[6], [x28], #0x1\n"
+ "ld1 { v28.b }[6], [x27], #0x1\n"
+ "ld1 { v21.b }[6], [x26], #0x1\n"
+ "ld1 { v27.b }[6], [x21], #0x1\n"
+ "b 11f\n"
+ "8:" // Oddments: Load (A): Bit 3: Unset: Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 11f\n"
+ "ld1 { v11.b }[4], [x14], #0x1\n"
+ "ld1 { v10.b }[4], [x13], #0x1\n"
+ "ld1 { v3.b }[4], [x10], #0x1\n"
+ "ld1 { v14.b }[4], [x9], #0x1\n"
+ "ld1 { v13.b }[4], [x28], #0x1\n"
+ "ld1 { v28.b }[4], [x27], #0x1\n"
+ "ld1 { v21.b }[4], [x26], #0x1\n"
+ "ld1 { v27.b }[4], [x21], #0x1\n"
+ "b 11f\n"
+ "9:" // Oddments: Load (A): Bit 3: Unset: Bit 2: Unset
+ "tbz %x[n_channels], #1, 10f\n"
+ "ldr h11, [x14], #0x2\n"
+ "ldr h10, [x13], #0x2\n"
+ "ldr h3, [x10], #0x2\n"
+ "ldr h14, [x9], #0x2\n"
+ "ldr h13, [x28], #0x2\n"
+ "ldr h28, [x27], #0x2\n"
+ "ldr h21, [x26], #0x2\n"
+ "ldr h27, [x21], #0x2\n"
+ "tbz %x[n_channels], #0, 11f\n"
+ "ld1 { v11.b }[2], [x14], #0x1\n"
+ "ld1 { v10.b }[2], [x13], #0x1\n"
+ "ld1 { v3.b }[2], [x10], #0x1\n"
+ "ld1 { v14.b }[2], [x9], #0x1\n"
+ "ld1 { v13.b }[2], [x28], #0x1\n"
+ "ld1 { v28.b }[2], [x27], #0x1\n"
+ "ld1 { v21.b }[2], [x26], #0x1\n"
+ "ld1 { v27.b }[2], [x21], #0x1\n"
+ "b 11f\n"
+ "10:" // Oddments: Load (A): Bit 3: Unset: Bit 2: Unset: Bit 1: Unset
+ "ldr b11, [x14], #0x1\n"
+ "ldr b10, [x13], #0x1\n"
+ "ldr b3, [x10], #0x1\n"
+ "ldr b14, [x9], #0x1\n"
+ "ldr b13, [x28], #0x1\n"
+ "ldr b28, [x27], #0x1\n"
+ "ldr b21, [x26], #0x1\n"
+ "ldr b27, [x21], #0x1\n"
+ "11:" // Oddments: Load (A): Bit 3: End
+ "ldp x14, x13, [%x[inptrs], #0x40]\n"
+ "ldp x10, x9, [%x[inptrs], #0x50]\n"
+ "add x14, x14, x12\n"
+ "add x13, x13, x12\n"
+ "ldp x28, x27, [%x[inptrs], #0x60]\n"
+ "ldp x26, x21, [%x[inptrs], #0x70]\n"
+ "add x10, x10, x12\n"
+ "add x9, x9, x12\n"
+ "add x28, x28, x12\n"
+ "add x27, x27, x12\n"
+ "add x26, x26, x12\n"
+ "add x21, x21, x12\n"
+ "tbz %x[n_channels], #3, 15f\n"
+ "ldr d5, [x14], #0x8\n"
+ "ldr d29, [x13], #0x8\n"
+ "ldr d0, [x10], #0x8\n"
+ "ldr d7, [x9], #0x8\n"
+ "ldr d16, [x28], #0x8\n"
+ "ldr d30, [x27], #0x8\n"
+ "ldr d2, [x26], #0x8\n"
+ "ldr d1, [x21], #0x8\n"
+ "tbz %x[n_channels], #2, 13f\n"
+ "ld1 { v5.s }[2], [x14], #0x4\n"
+ "ld1 { v29.s }[2], [x13], #0x4\n"
+ "ld1 { v0.s }[2], [x10], #0x4\n"
+ "ld1 { v7.s }[2], [x9], #0x4\n"
+ "ld1 { v16.s }[2], [x28], #0x4\n"
+ "ld1 { v30.s }[2], [x27], #0x4\n"
+ "ld1 { v2.s }[2], [x26], #0x4\n"
+ "ld1 { v1.s }[2], [x21], #0x4\n"
+ "tbz %x[n_channels], #1, 12f\n"
+ "ld1 { v5.h }[6], [x14], #0x2\n"
+ "ld1 { v29.h }[6], [x13], #0x2\n"
+ "ld1 { v0.h }[6], [x10], #0x2\n"
+ "ld1 { v7.h }[6], [x9], #0x2\n"
+ "ld1 { v16.h }[6], [x28], #0x2\n"
+ "ld1 { v30.h }[6], [x27], #0x2\n"
+ "ld1 { v2.h }[6], [x26], #0x2\n"
+ "ld1 { v1.h }[6], [x21], #0x2\n"
+ "tbz %x[n_channels], #0, 19f\n"
+ "ld1 { v5.b }[14], [x14], #0x1\n"
+ "ld1 { v29.b }[14], [x13], #0x1\n"
+ "ld1 { v0.b }[14], [x10], #0x1\n"
+ "ld1 { v7.b }[14], [x9], #0x1\n"
+ "ld1 { v16.b }[14], [x28], #0x1\n"
+ "ld1 { v30.b }[14], [x27], #0x1\n"
+ "ld1 { v2.b }[14], [x26], #0x1\n"
+ "ld1 { v1.b }[14], [x21], #0x1\n"
+ "b 19f\n"
+ "12:" // Oddments: Load (B): Bit 3: Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 19f\n"
+ "ld1 { v5.b }[12], [x14], #0x1\n"
+ "ld1 { v29.b }[12], [x13], #0x1\n"
+ "ld1 { v0.b }[12], [x10], #0x1\n"
+ "ld1 { v7.b }[12], [x9], #0x1\n"
+ "ld1 { v16.b }[12], [x28], #0x1\n"
+ "ld1 { v30.b }[12], [x27], #0x1\n"
+ "ld1 { v2.b }[12], [x26], #0x1\n"
+ "ld1 { v1.b }[12], [x21], #0x1\n"
+ "b 19f\n"
+ "13:" // Oddments: Load (B): Bit 3: Bit 2: Unset
+ "tbz %x[n_channels], #1, 14f\n"
+ "ld1 { v5.h }[4], [x14], #0x2\n"
+ "ld1 { v29.h }[4], [x13], #0x2\n"
+ "ld1 { v0.h }[4], [x10], #0x2\n"
+ "ld1 { v7.h }[4], [x9], #0x2\n"
+ "ld1 { v16.h }[4], [x28], #0x2\n"
+ "ld1 { v30.h }[4], [x27], #0x2\n"
+ "ld1 { v2.h }[4], [x26], #0x2\n"
+ "ld1 { v1.h }[4], [x21], #0x2\n"
+ "tbz %x[n_channels], #0, 19f\n"
+ "ld1 { v5.b }[10], [x14], #0x1\n"
+ "ld1 { v29.b }[10], [x13], #0x1\n"
+ "ld1 { v0.b }[10], [x10], #0x1\n"
+ "ld1 { v7.b }[10], [x9], #0x1\n"
+ "ld1 { v16.b }[10], [x28], #0x1\n"
+ "ld1 { v30.b }[10], [x27], #0x1\n"
+ "ld1 { v2.b }[10], [x26], #0x1\n"
+ "ld1 { v1.b }[10], [x21], #0x1\n"
+ "b 19f\n"
+ "14:" // Oddments: Load (B): Bit 3: Bit 2: Unset: Bit 1: Unset
+ "tbz %x[n_channels], #0, 19f\n"
+ "ld1 { v5.b }[8], [x14], #0x1\n"
+ "ld1 { v29.b }[8], [x13], #0x1\n"
+ "ld1 { v0.b }[8], [x10], #0x1\n"
+ "ld1 { v7.b }[8], [x9], #0x1\n"
+ "ld1 { v16.b }[8], [x28], #0x1\n"
+ "ld1 { v30.b }[8], [x27], #0x1\n"
+ "ld1 { v2.b }[8], [x26], #0x1\n"
+ "ld1 { v1.b }[8], [x21], #0x1\n"
+ "b 19f\n"
+ "15:" // Oddments: Load (B): Bit 3: Unset
+ "tbz %x[n_channels], #2, 17f\n"
+ "ldr s5, [x14], #0x4\n"
+ "ldr s29, [x13], #0x4\n"
+ "ldr s0, [x10], #0x4\n"
+ "ldr s7, [x9], #0x4\n"
+ "ldr s16, [x28], #0x4\n"
+ "ldr s30, [x27], #0x4\n"
+ "ldr s2, [x26], #0x4\n"
+ "ldr s1, [x21], #0x4\n"
+ "tbz %x[n_channels], #1, 16f\n"
+ "ld1 { v5.h }[2], [x14], #0x2\n"
+ "ld1 { v29.h }[2], [x13], #0x2\n"
+ "ld1 { v0.h }[2], [x10], #0x2\n"
+ "ld1 { v7.h }[2], [x9], #0x2\n"
+ "ld1 { v16.h }[2], [x28], #0x2\n"
+ "ld1 { v30.h }[2], [x27], #0x2\n"
+ "ld1 { v2.h }[2], [x26], #0x2\n"
+ "ld1 { v1.h }[2], [x21], #0x2\n"
+ "tbz %x[n_channels], #0, 19f\n"
+ "ld1 { v5.b }[6], [x14], #0x1\n"
+ "ld1 { v29.b }[6], [x13], #0x1\n"
+ "ld1 { v0.b }[6], [x10], #0x1\n"
+ "ld1 { v7.b }[6], [x9], #0x1\n"
+ "ld1 { v16.b }[6], [x28], #0x1\n"
+ "ld1 { v30.b }[6], [x27], #0x1\n"
+ "ld1 { v2.b }[6], [x26], #0x1\n"
+ "ld1 { v1.b }[6], [x21], #0x1\n"
+ "b 19f\n"
+ "16:" // Oddments: Load (B): Bit 3: Unset: Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 19f\n"
+ "ld1 { v5.b }[4], [x14], #0x1\n"
+ "ld1 { v29.b }[4], [x13], #0x1\n"
+ "ld1 { v0.b }[4], [x10], #0x1\n"
+ "ld1 { v7.b }[4], [x9], #0x1\n"
+ "ld1 { v16.b }[4], [x28], #0x1\n"
+ "ld1 { v30.b }[4], [x27], #0x1\n"
+ "ld1 { v2.b }[4], [x26], #0x1\n"
+ "ld1 { v1.b }[4], [x21], #0x1\n"
+ "b 19f\n"
+ "17:" // Oddments: Load (B): Bit 3: Unset: Bit 2: Unset
+ "tbz %x[n_channels], #1, 18f\n"
+ "ldr h5, [x14], #0x2\n"
+ "ldr h29, [x13], #0x2\n"
+ "ldr h0, [x10], #0x2\n"
+ "ldr h7, [x9], #0x2\n"
+ "ldr h16, [x28], #0x2\n"
+ "ldr h30, [x27], #0x2\n"
+ "ldr h2, [x26], #0x2\n"
+ "ldr h1, [x21], #0x2\n"
+ "tbz %x[n_channels], #0, 19f\n"
+ "ld1 { v5.b }[2], [x14], #0x1\n"
+ "ld1 { v29.b }[2], [x13], #0x1\n"
+ "ld1 { v0.b }[2], [x10], #0x1\n"
+ "ld1 { v7.b }[2], [x9], #0x1\n"
+ "ld1 { v16.b }[2], [x28], #0x1\n"
+ "ld1 { v30.b }[2], [x27], #0x1\n"
+ "ld1 { v2.b }[2], [x26], #0x1\n"
+ "ld1 { v1.b }[2], [x21], #0x1\n"
+ "b 19f\n"
+ "18:" // Oddments: Load (B): Bit 3: Unset: Bit 2: Unset: Bit 1: Unset
+ "ldr b5, [x14], #0x1\n"
+ "ldr b29, [x13], #0x1\n"
+ "ldr b0, [x10], #0x1\n"
+ "ldr b7, [x9], #0x1\n"
+ "ldr b16, [x28], #0x1\n"
+ "ldr b30, [x27], #0x1\n"
+ "ldr b2, [x26], #0x1\n"
+ "ldr b1, [x21], #0x1\n"
+ "19:" // Oddments: Load (B): Bit 3: End
+ "ldr q25, [%x[params], #0x10]\n"
+ "ldr q24, [%x[params], #0x20]\n"
+ "zip2 v18.16b, v11.16b, v3.16b\n"
+ "zip1 v11.16b, v11.16b, v3.16b\n"
+ "ldr q23, [%x[params], #0x30]\n"
+ "zip1 v17.16b, v10.16b, v14.16b\n"
+ "zip2 v14.16b, v10.16b, v14.16b\n"
+ "cmp x20, #0x4\n"
+ "zip2 v10.16b, v11.16b, v17.16b\n"
+ "zip1 v11.16b, v11.16b, v17.16b\n"
+ "zip1 v3.16b, v18.16b, v14.16b\n"
+ "zip2 v14.16b, v18.16b, v14.16b\n"
+ "ldr q31, [%x[params], #0x0]\n"
+ "zip2 v22.16b, v13.16b, v21.16b\n"
+ "zip1 v13.16b, v13.16b, v21.16b\n"
+ "zip1 v21.16b, v28.16b, v27.16b\n"
+ "zip2 v27.16b, v28.16b, v27.16b\n"
+ "zip2 v20.16b, v5.16b, v0.16b\n"
+ "zip1 v5.16b, v5.16b, v0.16b\n"
+ "zip1 v19.16b, v29.16b, v7.16b\n"
+ "zip2 v7.16b, v29.16b, v7.16b\n"
+ "zip2 v18.16b, v16.16b, v2.16b\n"
+ "zip1 v16.16b, v16.16b, v2.16b\n"
+ "zip1 v17.16b, v30.16b, v1.16b\n"
+ "zip2 v1.16b, v30.16b, v1.16b\n"
+ "zip2 v28.16b, v13.16b, v21.16b\n"
+ "zip1 v13.16b, v13.16b, v21.16b\n"
+ "zip1 v21.16b, v22.16b, v27.16b\n"
+ "zip2 v27.16b, v22.16b, v27.16b\n"
+ "zip2 v29.16b, v5.16b, v19.16b\n"
+ "zip1 v5.16b, v5.16b, v19.16b\n"
+ "zip1 v0.16b, v20.16b, v7.16b\n"
+ "zip2 v7.16b, v20.16b, v7.16b\n"
+ "zip2 v30.16b, v16.16b, v17.16b\n"
+ "zip1 v16.16b, v16.16b, v17.16b\n"
+ "zip1 v2.16b, v18.16b, v1.16b\n"
+ "zip2 v1.16b, v18.16b, v1.16b\n"
+ "mov v26.16b, v31.16b\n"
+ "mov v18.16b, v31.16b\n"
+ ".inst 0x4e8d9732 // sdot v18.4s, v25.16b, v13.16b\n"
+ "mov v4.16b, v31.16b\n"
+ ".inst 0x4e8b973f // sdot v31.4s, v25.16b, v11.16b\n"
+ ".inst 0x4e8d971f // sdot v31.4s, v24.16b, v13.16b\n"
+ "ext v11.16b, v11.16b, v11.16b, #0x1\n"
+ "ext v13.16b, v13.16b, v13.16b, #0x1\n"
+ ".inst 0x4e8b973a // sdot v26.4s, v25.16b, v11.16b\n"
+ "ldr q17, [%x[params], #0x40]\n"
+ ".inst 0x4e8d9724 // sdot v4.4s, v25.16b, v13.16b\n"
+ ".inst 0x4e859712 // sdot v18.4s, v24.16b, v5.16b\n"
+ ".inst 0x4e8596ff // sdot v31.4s, v23.16b, v5.16b\n"
+ "ext v5.16b, v5.16b, v5.16b, #0x1\n"
+ ".inst 0x4e8d971a // sdot v26.4s, v24.16b, v13.16b\n"
+ "ldr q20, [%x[params], #0x50]\n"
+ ".inst 0x4e859704 // sdot v4.4s, v24.16b, v5.16b\n"
+ ".inst 0x4e9096f2 // sdot v18.4s, v23.16b, v16.16b\n"
+ "ext v16.16b, v16.16b, v16.16b, #0x1\n"
+ "add %x[params], %x[params], #0x60\n"
+ "sqrdmulh v31.4s, v31.4s, v17.4s\n"
+ ".inst 0x4e8596fa // sdot v26.4s, v23.16b, v5.16b\n"
+ ".inst 0x4e9096e4 // sdot v4.4s, v23.16b, v16.16b\n"
+ "and v16.16b, v31.16b, v20.16b\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqrdmulh v26.4s, v26.4s, v17.4s\n"
+ "sqrdmulh v18.4s, v18.4s, v17.4s\n"
+ "sqrdmulh v4.4s, v4.4s, v17.4s\n"
+ "sqadd v31.4s, v31.4s, v16.4s\n"
+ "and v19.16b, v26.16b, v20.16b\n"
+ "and v17.16b, v18.16b, v20.16b\n"
+ "and v16.16b, v4.16b, v20.16b\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqadd v26.4s, v26.4s, v19.4s\n"
+ "sqadd v18.4s, v18.4s, v17.4s\n"
+ "sqadd v4.4s, v4.4s, v16.4s\n"
+ "srshl v31.4s, v31.4s, v20.4s\n"
+ "srshl v26.4s, v26.4s, v20.4s\n"
+ "srshl v18.4s, v18.4s, v20.4s\n"
+ "srshl v4.4s, v4.4s, v20.4s\n"
+ "add v31.4s, v31.4s, v15.4s\n"
+ "add v26.4s, v26.4s, v15.4s\n"
+ "add v18.4s, v18.4s, v15.4s\n"
+ "add v4.4s, v4.4s, v15.4s\n"
+ "smax v31.4s, v31.4s, v8.4s\n"
+ "smax v26.4s, v26.4s, v8.4s\n"
+ "smax v18.4s, v18.4s, v8.4s\n"
+ "smax v4.4s, v4.4s, v8.4s\n"
+ "smin v31.4s, v31.4s, v12.4s\n"
+ "smin v26.4s, v26.4s, v12.4s\n"
+ "smin v18.4s, v18.4s, v12.4s\n"
+ "smin v4.4s, v4.4s, v12.4s\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "uzp1 v18.16b, v18.16b, v18.16b\n"
+ "uzp1 v4.16b, v4.16b, v4.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "uzp1 v18.16b, v18.16b, v18.16b\n"
+ "uzp1 v4.16b, v4.16b, v4.16b\n"
+ "blt 20f\n"
+ "str s31, [x25, x11]\n"
+ "str s26, [x24, x11]\n"
+ "str s18, [x23, x11]\n"
+ "str s4, [x22, x11]\n"
+ "b 23f\n"
+ "20:" // Oddments: Unroll 0: Oddment store
+ "add x25, x25, x11\n"
+ "add x24, x24, x11\n"
+ "add x23, x23, x11\n"
+ "add x22, x22, x11\n"
+ "tbz x20, #1, 21f\n"
+ "st1 { v31.h }[0], [x25], #0x2\n"
+ "st1 { v26.h }[0], [x24], #0x2\n"
+ "st1 { v18.h }[0], [x23], #0x2\n"
+ "st1 { v4.h }[0], [x22], #0x2\n"
+ "tbz x20, #0, 22f\n"
+ "st1 { v31.b }[2], [x25], #0x1\n"
+ "st1 { v26.b }[2], [x24], #0x1\n"
+ "st1 { v18.b }[2], [x23], #0x1\n"
+ "st1 { v4.b }[2], [x22], #0x1\n"
+ "b 22f\n"
+ "21:" // Oddments: Unroll 0: Oddment store: Bit 1: Unset
+ "st1 { v31.b }[0], [x25], #0x1\n"
+ "st1 { v26.b }[0], [x24], #0x1\n"
+ "st1 { v18.b }[0], [x23], #0x1\n"
+ "st1 { v4.b }[0], [x22], #0x1\n"
+ "22:" // Oddments: Unroll 0: Oddment store: Bit 1: End
+ "23:" // Oddments: Unroll 0: After oddment store
+ "subs x20, x20, #0x4\n"
+ "add x11, x11, #0x4\n"
+ "ble 35f\n"
+ "ldr q31, [%x[params], #0x0]\n"
+ "ldr q23, [%x[params], #0x10]\n"
+ "mov v26.16b, v31.16b\n"
+ "mov v18.16b, v31.16b\n"
+ "ldr q22, [%x[params], #0x20]\n"
+ "ldr q16, [%x[params], #0x30]\n"
+ "mov v4.16b, v31.16b\n"
+ ".inst 0x4e8a96ff // sdot v31.4s, v23.16b, v10.16b\n"
+ "ldr q17, [%x[params], #0x40]\n"
+ "ldr q20, [%x[params], #0x50]\n"
+ ".inst 0x4e9c96f2 // sdot v18.4s, v23.16b, v28.16b\n"
+ ".inst 0x4e9c96df // sdot v31.4s, v22.16b, v28.16b\n"
+ "ext v10.16b, v10.16b, v10.16b, #0x1\n"
+ "ext v28.16b, v28.16b, v28.16b, #0x1\n"
+ ".inst 0x4e8a96fa // sdot v26.4s, v23.16b, v10.16b\n"
+ "cmp x20, #0x4\n"
+ ".inst 0x4e9c96e4 // sdot v4.4s, v23.16b, v28.16b\n"
+ ".inst 0x4e9d96d2 // sdot v18.4s, v22.16b, v29.16b\n"
+ "add %x[params], %x[params], #0x60\n"
+ ".inst 0x4e9d961f // sdot v31.4s, v16.16b, v29.16b\n"
+ "ext v29.16b, v29.16b, v29.16b, #0x1\n"
+ ".inst 0x4e9c96da // sdot v26.4s, v22.16b, v28.16b\n"
+ ".inst 0x4e9d96c4 // sdot v4.4s, v22.16b, v29.16b\n"
+ ".inst 0x4e9e9612 // sdot v18.4s, v16.16b, v30.16b\n"
+ "ext v30.16b, v30.16b, v30.16b, #0x1\n"
+ "sqrdmulh v31.4s, v31.4s, v17.4s\n"
+ ".inst 0x4e9d961a // sdot v26.4s, v16.16b, v29.16b\n"
+ ".inst 0x4e9e9604 // sdot v4.4s, v16.16b, v30.16b\n"
+ "and v16.16b, v31.16b, v20.16b\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqrdmulh v26.4s, v26.4s, v17.4s\n"
+ "sqrdmulh v18.4s, v18.4s, v17.4s\n"
+ "sqrdmulh v4.4s, v4.4s, v17.4s\n"
+ "sqadd v31.4s, v31.4s, v16.4s\n"
+ "and v19.16b, v26.16b, v20.16b\n"
+ "and v17.16b, v18.16b, v20.16b\n"
+ "and v16.16b, v4.16b, v20.16b\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqadd v26.4s, v26.4s, v19.4s\n"
+ "sqadd v18.4s, v18.4s, v17.4s\n"
+ "sqadd v4.4s, v4.4s, v16.4s\n"
+ "srshl v31.4s, v31.4s, v20.4s\n"
+ "srshl v26.4s, v26.4s, v20.4s\n"
+ "srshl v18.4s, v18.4s, v20.4s\n"
+ "srshl v4.4s, v4.4s, v20.4s\n"
+ "add v31.4s, v31.4s, v15.4s\n"
+ "add v26.4s, v26.4s, v15.4s\n"
+ "add v18.4s, v18.4s, v15.4s\n"
+ "add v4.4s, v4.4s, v15.4s\n"
+ "smax v31.4s, v31.4s, v8.4s\n"
+ "smax v26.4s, v26.4s, v8.4s\n"
+ "smax v18.4s, v18.4s, v8.4s\n"
+ "smax v4.4s, v4.4s, v8.4s\n"
+ "smin v31.4s, v31.4s, v12.4s\n"
+ "smin v26.4s, v26.4s, v12.4s\n"
+ "smin v18.4s, v18.4s, v12.4s\n"
+ "smin v4.4s, v4.4s, v12.4s\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "uzp1 v18.16b, v18.16b, v18.16b\n"
+ "uzp1 v4.16b, v4.16b, v4.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "uzp1 v18.16b, v18.16b, v18.16b\n"
+ "uzp1 v4.16b, v4.16b, v4.16b\n"
+ "blt 24f\n"
+ "str s31, [x25, x11]\n"
+ "str s26, [x24, x11]\n"
+ "str s18, [x23, x11]\n"
+ "str s4, [x22, x11]\n"
+ "b 27f\n"
+ "24:" // Oddments: Unroll 1: Oddment store
+ "add x25, x25, x11\n"
+ "add x24, x24, x11\n"
+ "add x23, x23, x11\n"
+ "add x22, x22, x11\n"
+ "tbz x20, #1, 25f\n"
+ "st1 { v31.h }[0], [x25], #0x2\n"
+ "st1 { v26.h }[0], [x24], #0x2\n"
+ "st1 { v18.h }[0], [x23], #0x2\n"
+ "st1 { v4.h }[0], [x22], #0x2\n"
+ "tbz x20, #0, 26f\n"
+ "st1 { v31.b }[2], [x25], #0x1\n"
+ "st1 { v26.b }[2], [x24], #0x1\n"
+ "st1 { v18.b }[2], [x23], #0x1\n"
+ "st1 { v4.b }[2], [x22], #0x1\n"
+ "b 26f\n"
+ "25:" // Oddments: Unroll 1: Oddment store: Bit 1: Unset
+ "st1 { v31.b }[0], [x25], #0x1\n"
+ "st1 { v26.b }[0], [x24], #0x1\n"
+ "st1 { v18.b }[0], [x23], #0x1\n"
+ "st1 { v4.b }[0], [x22], #0x1\n"
+ "26:" // Oddments: Unroll 1: Oddment store: Bit 1: End
+ "27:" // Oddments: Unroll 1: After oddment store
+ "subs x20, x20, #0x4\n"
+ "add x11, x11, #0x4\n"
+ "ble 35f\n"
+ "ldr q31, [%x[params], #0x0]\n"
+ "ldr q23, [%x[params], #0x10]\n"
+ "mov v26.16b, v31.16b\n"
+ "mov v18.16b, v31.16b\n"
+ "ldr q22, [%x[params], #0x20]\n"
+ "ldr q16, [%x[params], #0x30]\n"
+ "mov v4.16b, v31.16b\n"
+ ".inst 0x4e8396ff // sdot v31.4s, v23.16b, v3.16b\n"
+ "ldr q17, [%x[params], #0x40]\n"
+ "ldr q20, [%x[params], #0x50]\n"
+ ".inst 0x4e9596f2 // sdot v18.4s, v23.16b, v21.16b\n"
+ ".inst 0x4e9596df // sdot v31.4s, v22.16b, v21.16b\n"
+ "ext v3.16b, v3.16b, v3.16b, #0x1\n"
+ "ext v21.16b, v21.16b, v21.16b, #0x1\n"
+ ".inst 0x4e8396fa // sdot v26.4s, v23.16b, v3.16b\n"
+ "cmp x20, #0x4\n"
+ ".inst 0x4e9596e4 // sdot v4.4s, v23.16b, v21.16b\n"
+ ".inst 0x4e8096d2 // sdot v18.4s, v22.16b, v0.16b\n"
+ "add %x[params], %x[params], #0x60\n"
+ ".inst 0x4e80961f // sdot v31.4s, v16.16b, v0.16b\n"
+ "ext v0.16b, v0.16b, v0.16b, #0x1\n"
+ ".inst 0x4e9596da // sdot v26.4s, v22.16b, v21.16b\n"
+ ".inst 0x4e8096c4 // sdot v4.4s, v22.16b, v0.16b\n"
+ ".inst 0x4e829612 // sdot v18.4s, v16.16b, v2.16b\n"
+ "ext v2.16b, v2.16b, v2.16b, #0x1\n"
+ "sqrdmulh v31.4s, v31.4s, v17.4s\n"
+ ".inst 0x4e80961a // sdot v26.4s, v16.16b, v0.16b\n"
+ ".inst 0x4e829604 // sdot v4.4s, v16.16b, v2.16b\n"
+ "and v16.16b, v31.16b, v20.16b\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqrdmulh v26.4s, v26.4s, v17.4s\n"
+ "sqrdmulh v18.4s, v18.4s, v17.4s\n"
+ "sqrdmulh v4.4s, v4.4s, v17.4s\n"
+ "sqadd v31.4s, v31.4s, v16.4s\n"
+ "and v19.16b, v26.16b, v20.16b\n"
+ "and v17.16b, v18.16b, v20.16b\n"
+ "and v16.16b, v4.16b, v20.16b\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqadd v26.4s, v26.4s, v19.4s\n"
+ "sqadd v18.4s, v18.4s, v17.4s\n"
+ "sqadd v4.4s, v4.4s, v16.4s\n"
+ "srshl v31.4s, v31.4s, v20.4s\n"
+ "srshl v26.4s, v26.4s, v20.4s\n"
+ "srshl v18.4s, v18.4s, v20.4s\n"
+ "srshl v4.4s, v4.4s, v20.4s\n"
+ "add v31.4s, v31.4s, v15.4s\n"
+ "add v26.4s, v26.4s, v15.4s\n"
+ "add v18.4s, v18.4s, v15.4s\n"
+ "add v4.4s, v4.4s, v15.4s\n"
+ "smax v31.4s, v31.4s, v8.4s\n"
+ "smax v26.4s, v26.4s, v8.4s\n"
+ "smax v18.4s, v18.4s, v8.4s\n"
+ "smax v4.4s, v4.4s, v8.4s\n"
+ "smin v31.4s, v31.4s, v12.4s\n"
+ "smin v26.4s, v26.4s, v12.4s\n"
+ "smin v18.4s, v18.4s, v12.4s\n"
+ "smin v4.4s, v4.4s, v12.4s\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "uzp1 v18.16b, v18.16b, v18.16b\n"
+ "uzp1 v4.16b, v4.16b, v4.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "uzp1 v18.16b, v18.16b, v18.16b\n"
+ "uzp1 v4.16b, v4.16b, v4.16b\n"
+ "blt 28f\n"
+ "str s31, [x25, x11]\n"
+ "str s26, [x24, x11]\n"
+ "str s18, [x23, x11]\n"
+ "str s4, [x22, x11]\n"
+ "b 31f\n"
+ "28:" // Oddments: Unroll 2: Oddment store
+ "add x25, x25, x11\n"
+ "add x24, x24, x11\n"
+ "add x23, x23, x11\n"
+ "add x22, x22, x11\n"
+ "tbz x20, #1, 29f\n"
+ "st1 { v31.h }[0], [x25], #0x2\n"
+ "st1 { v26.h }[0], [x24], #0x2\n"
+ "st1 { v18.h }[0], [x23], #0x2\n"
+ "st1 { v4.h }[0], [x22], #0x2\n"
+ "tbz x20, #0, 30f\n"
+ "st1 { v31.b }[2], [x25], #0x1\n"
+ "st1 { v26.b }[2], [x24], #0x1\n"
+ "st1 { v18.b }[2], [x23], #0x1\n"
+ "st1 { v4.b }[2], [x22], #0x1\n"
+ "b 30f\n"
+ "29:" // Oddments: Unroll 2: Oddment store: Bit 1: Unset
+ "st1 { v31.b }[0], [x25], #0x1\n"
+ "st1 { v26.b }[0], [x24], #0x1\n"
+ "st1 { v18.b }[0], [x23], #0x1\n"
+ "st1 { v4.b }[0], [x22], #0x1\n"
+ "30:" // Oddments: Unroll 2: Oddment store: Bit 1: End
+ "31:" // Oddments: Unroll 2: After oddment store
+ "subs x20, x20, #0x4\n"
+ "add x11, x11, #0x4\n"
+ "ble 35f\n"
+ "ldr q31, [%x[params], #0x0]\n"
+ "ldr q20, [%x[params], #0x10]\n"
+ "mov v26.16b, v31.16b\n"
+ "mov v18.16b, v31.16b\n"
+ "ldr q19, [%x[params], #0x20]\n"
+ "ldr q16, [%x[params], #0x30]\n"
+ "mov v4.16b, v31.16b\n"
+ ".inst 0x4e8e969f // sdot v31.4s, v20.16b, v14.16b\n"
+ "ldr q17, [%x[params], #0x40]\n"
+ "ldr q22, [%x[params], #0x50]\n"
+ ".inst 0x4e9b9692 // sdot v18.4s, v20.16b, v27.16b\n"
+ ".inst 0x4e9b967f // sdot v31.4s, v19.16b, v27.16b\n"
+ "ext v14.16b, v14.16b, v14.16b, #0x1\n"
+ "ext v27.16b, v27.16b, v27.16b, #0x1\n"
+ ".inst 0x4e8e969a // sdot v26.4s, v20.16b, v14.16b\n"
+ "add %x[params], %x[params], #0x60\n"
+ ".inst 0x4e9b9684 // sdot v4.4s, v20.16b, v27.16b\n"
+ ".inst 0x4e879672 // sdot v18.4s, v19.16b, v7.16b\n"
+ ".inst 0x4e87961f // sdot v31.4s, v16.16b, v7.16b\n"
+ "ext v7.16b, v7.16b, v7.16b, #0x1\n"
+ ".inst 0x4e9b967a // sdot v26.4s, v19.16b, v27.16b\n"
+ ".inst 0x4e879664 // sdot v4.4s, v19.16b, v7.16b\n"
+ ".inst 0x4e819612 // sdot v18.4s, v16.16b, v1.16b\n"
+ "ext v1.16b, v1.16b, v1.16b, #0x1\n"
+ "sqrdmulh v31.4s, v31.4s, v17.4s\n"
+ ".inst 0x4e87961a // sdot v26.4s, v16.16b, v7.16b\n"
+ ".inst 0x4e819604 // sdot v4.4s, v16.16b, v1.16b\n"
+ "and v16.16b, v31.16b, v22.16b\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqrdmulh v26.4s, v26.4s, v17.4s\n"
+ "sqrdmulh v18.4s, v18.4s, v17.4s\n"
+ "sqrdmulh v4.4s, v4.4s, v17.4s\n"
+ "sqadd v31.4s, v31.4s, v16.4s\n"
+ "and v23.16b, v26.16b, v22.16b\n"
+ "and v17.16b, v18.16b, v22.16b\n"
+ "and v16.16b, v4.16b, v22.16b\n"
+ "sshr v23.4s, v23.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqadd v26.4s, v26.4s, v23.4s\n"
+ "sqadd v18.4s, v18.4s, v17.4s\n"
+ "sqadd v4.4s, v4.4s, v16.4s\n"
+ "srshl v31.4s, v31.4s, v22.4s\n"
+ "srshl v26.4s, v26.4s, v22.4s\n"
+ "srshl v18.4s, v18.4s, v22.4s\n"
+ "srshl v4.4s, v4.4s, v22.4s\n"
+ "add v31.4s, v31.4s, v15.4s\n"
+ "add v26.4s, v26.4s, v15.4s\n"
+ "add v18.4s, v18.4s, v15.4s\n"
+ "add v4.4s, v4.4s, v15.4s\n"
+ "smax v31.4s, v31.4s, v8.4s\n"
+ "smax v26.4s, v26.4s, v8.4s\n"
+ "smax v18.4s, v18.4s, v8.4s\n"
+ "smax v4.4s, v4.4s, v8.4s\n"
+ "smin v31.4s, v31.4s, v12.4s\n"
+ "smin v26.4s, v26.4s, v12.4s\n"
+ "smin v18.4s, v18.4s, v12.4s\n"
+ "smin v4.4s, v4.4s, v12.4s\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "uzp1 v18.16b, v18.16b, v18.16b\n"
+ "uzp1 v4.16b, v4.16b, v4.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "uzp1 v18.16b, v18.16b, v18.16b\n"
+ "uzp1 v4.16b, v4.16b, v4.16b\n"
+ "32:" // Oddments: Unroll 3: Oddment store
+ "add x25, x25, x11\n"
+ "add x24, x24, x11\n"
+ "add x23, x23, x11\n"
+ "add x22, x22, x11\n"
+ "tbz x20, #1, 33f\n"
+ "st1 { v31.h }[0], [x25], #0x2\n"
+ "st1 { v26.h }[0], [x24], #0x2\n"
+ "st1 { v18.h }[0], [x23], #0x2\n"
+ "st1 { v4.h }[0], [x22], #0x2\n"
+ "tbz x20, #0, 34f\n"
+ "st1 { v31.b }[2], [x25], #0x1\n"
+ "st1 { v26.b }[2], [x24], #0x1\n"
+ "st1 { v18.b }[2], [x23], #0x1\n"
+ "st1 { v4.b }[2], [x22], #0x1\n"
+ "b 34f\n"
+ "33:" // Oddments: Unroll 3: Oddment store: Bit 1: Unset
+ "st1 { v31.b }[0], [x25], #0x1\n"
+ "st1 { v26.b }[0], [x24], #0x1\n"
+ "st1 { v18.b }[0], [x23], #0x1\n"
+ "st1 { v4.b }[0], [x22], #0x1\n"
+ "34:" // Oddments: Unroll 3: Oddment store: Bit 1: End
+ "35:" // End
+ : [params] "+&r" (params)
+ : [inptrs] "r" (inptrs), [n_channels] "r" (n_channels), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [outptrs] "r" (outptrs), [qp] "r" (&qp)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp
new file mode 100644
index 0000000000..4026855617
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp
@@ -0,0 +1,76 @@
+/*
+ * Copyright (c) 2021-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "utils.hpp"
+#include "src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst_impl(const unsigned int, const uint8_t *const *const, const uint8_t *, const int32_t *, const arm_gemm::Requantize32&, const int32_t *, const int32_t *, uint8_t *const *const);
+
+class a64_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst : public DepthwiseDepthfirstStrategy<uint8_t, uint8_t, uint8_t, int32_t>  // Strategy descriptor binding the hand-written AArch64 dot-product kernel (uint8 quantized, NHWC, 3x3 filter, stride 1, 2x2 output tile) into the depthfirst depthwise framework.
+{
+ using Parent = DepthwiseDepthfirstStrategy<uint8_t, uint8_t, uint8_t, int32_t>;  // input/weight/output are uint8_t, accumulators are int32_t
+
+ public:
+ constexpr static unsigned int kernel_rows = 3;  // 3x3 depthwise filter
+ constexpr static unsigned int kernel_cols = 3;
+
+ constexpr static unsigned int stride_rows = 1;  // unit stride in both spatial dimensions
+ constexpr static unsigned int stride_cols = 1;
+
+ a64_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst(const CPUInfo *) : Parent(2, 2, 3, 3, 1, 1) {}  // presumably (output_rows, output_cols, kernel_rows, kernel_cols, stride_rows, stride_cols) -- consistent with the constants above and the 2x2 tile in the class name; CPUInfo is unused here
+
+ arm_gemm::VLType get_vl_type(void) const override { return arm_gemm::VLType::None; }  // fixed-width NEON kernel, not a scalable-vector (SVE) variant
+
+ Parent::KernelType kernel = a64_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst_impl;  // the assembly implementation declared above
+ Parent::KernelType get_kernel(void) const override { return kernel; }
+ size_t get_storage_size(const DepthwiseArgs &args) const override  // bytes needed for the packed weight/bias buffer used by pack_parameters()
+ {
+ return interleave_a64_u8q_3x3_dot::get_packed_size(args);
+ }
+
+ void pack_parameters(  // interleave weights, biases and requantisation parameters into the layout the assembly kernel consumes
+ const DepthwiseArgs &args, void *buffer, const void *biases, const arm_gemm::Requantize32 &qp,
+ const void *weights, size_t ld_weight_col, size_t ld_weight_row
+ ) const override
+ {
+ interleave_a64_u8q_3x3_dot::pack_parameters(
+ args.input_channels * args.channel_multiplier, buffer, reinterpret_cast<const int32_t *>(biases),  // total output channels = input channels x multiplier
+ reinterpret_cast<const uint8_t *>(weights), qp, ld_weight_col, ld_weight_row
+ );
+ }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp
new file mode 100644
index 0000000000..5a28daffbf
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp
@@ -0,0 +1,1658 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if defined(__aarch64__)
+
+#include "arm_gemm.hpp"
+#include <cstdint>
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst_impl(const unsigned int n_channels, const uint8_t *const *const inptrs, const uint8_t *params, const int32_t *, const arm_gemm::Requantize32& qp, const int32_t *, const int32_t *, uint8_t *const *const outptrs)
+{
+ __asm__ __volatile__(
+ "mov x20, #0x1\n"
+ "orr x20, x20, #0x100\n"
+ "ldp x15, x14, [%x[inptrs], #0x0]\n"
+ "ldp x13, x12, [%x[inptrs], #0x10]\n"
+ "orr x20, x20, #0x10000\n"
+ "lsr x11, %x[n_channels], #0x4\n"
+ "dup v12.4s, w20\n"
+ "ldp x10, x9, [%x[inptrs], #0x20]\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_minval]\n"
+ "ld1r { v13.4s }, [x20]\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_maxval]\n"
+ "ld1r { v11.4s }, [x20]\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_b_offset]\n"
+ "ld1r { v16.4s }, [x20]\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_c_offset]\n"
+ "ld1r { v14.4s }, [x20]\n"
+ "mov x28, #0x0\n"
+ "mov x27, #0x0\n"
+ "ldp x26, x21, [%x[inptrs], #0x30]\n"
+ "ldp x25, x24, [%x[outptrs], #0x0]\n"
+ "ldp x23, x22, [%x[outptrs], #0x10]\n"
+ "cbz x11, 3f\n"
+ "ldr q15, [x15, x28]\n"
+ "ldr q28, [x14, x28]\n"
+ "subs x11, x11, #0x1\n"
+ "ldr q30, [x13, x28]\n"
+ "ldr q8, [x12, x28]\n"
+ "zip2 v19.16b, v15.16b, v30.16b\n"
+ "zip1 v15.16b, v15.16b, v30.16b\n"
+ "ldr q26, [x10, x28]\n"
+ "ldr q0, [x9, x28]\n"
+ "zip1 v7.16b, v28.16b, v8.16b\n"
+ "zip2 v8.16b, v28.16b, v8.16b\n"
+ "ldr q29, [x26, x28]\n"
+ "ldr q10, [x21, x28]\n"
+ "zip2 v25.16b, v15.16b, v7.16b\n"
+ "zip1 v15.16b, v15.16b, v7.16b\n"
+ "ldr q1, [%x[params], #0x10]\n"
+ "ldr q6, [%x[params], #0x20]\n"
+ "zip1 v7.16b, v19.16b, v8.16b\n"
+ "zip2 v8.16b, v19.16b, v8.16b\n"
+ "ldr q31, [%x[params], #0x0]\n"
+ "ldr q20, [%x[params], #0x30]\n"
+ "zip2 v21.16b, v26.16b, v29.16b\n"
+ "zip1 v26.16b, v26.16b, v29.16b\n"
+ "ldp x21, x20, [%x[inptrs], #0x40]\n"
+ "ldr q22, [x21, x28]\n"
+ "zip1 v27.16b, v0.16b, v10.16b\n"
+ "zip2 v10.16b, v0.16b, v10.16b\n"
+ "ldr q17, [x20, x28]\n"
+ "ldp x21, x20, [%x[inptrs], #0x50]\n"
+ "zip2 v23.16b, v26.16b, v27.16b\n"
+ "zip1 v26.16b, v26.16b, v27.16b\n"
+ "ldr q9, [x21, x28]\n"
+ "ldr q5, [x20, x28]\n"
+ "zip2 v28.16b, v22.16b, v9.16b\n"
+ "zip1 v22.16b, v22.16b, v9.16b\n"
+ "ldp x21, x20, [%x[inptrs], #0x60]\n"
+ "ldr q27, [x21, x28]\n"
+ "zip1 v24.16b, v17.16b, v5.16b\n"
+ "zip2 v5.16b, v17.16b, v5.16b\n"
+ "ldr q18, [x20, x28]\n"
+ "ldp x21, x20, [%x[inptrs], #0x70]\n"
+ "zip1 v3.16b, v21.16b, v10.16b\n"
+ "zip2 v10.16b, v21.16b, v10.16b\n"
+ "ldr q4, [x21, x28]\n"
+ "ldr q9, [x20, x28]\n"
+ "zip2 v17.16b, v27.16b, v4.16b\n"
+ "zip1 v27.16b, v27.16b, v4.16b\n"
+ "zip1 v4.16b, v18.16b, v9.16b\n"
+ "zip2 v9.16b, v18.16b, v9.16b\n"
+ "ldp x15, x14, [%x[inptrs], #0x0]\n"
+ "ldp x13, x12, [%x[inptrs], #0x10]\n"
+ "ldp x10, x9, [%x[inptrs], #0x20]\n"
+ "ldp x26, x21, [%x[inptrs], #0x30]\n"
+ "zip2 v19.16b, v22.16b, v24.16b\n"
+ "zip1 v22.16b, v22.16b, v24.16b\n"
+ "zip1 v0.16b, v28.16b, v5.16b\n"
+ "zip2 v5.16b, v28.16b, v5.16b\n"
+ "add %x[params], %x[params], #0x40\n"
+ "zip2 v24.16b, v27.16b, v4.16b\n"
+ "zip1 v27.16b, v27.16b, v4.16b\n"
+ "zip1 v2.16b, v17.16b, v9.16b\n"
+ "zip2 v9.16b, v17.16b, v9.16b\n"
+ "mov v30.16b, v31.16b\n"
+ "mov v29.16b, v31.16b\n"
+ "mov v28.16b, v31.16b\n"
+ "beq 2f\n"
+ "1:" // Loop
+ "movi v21.4s, #0x0\n"
+ ".inst 0x6e9a9595 // udot v21.4s, v12.16b, v26.16b\n"
+ ".inst 0x6e8f943f // udot v31.4s, v1.16b, v15.16b\n"
+ "add x28, x28, #0x10\n"
+ ".inst 0x6e969595 // udot v21.4s, v12.16b, v22.16b\n"
+ ".inst 0x6e9a943d // udot v29.4s, v1.16b, v26.16b\n"
+ "movi v18.4s, #0x0\n"
+ "subs x11, x11, #0x1\n"
+ ".inst 0x6e9a94df // udot v31.4s, v6.16b, v26.16b\n"
+ "ext v26.16b, v26.16b, v26.16b, #0x1\n"
+ "mov v17.16b, v21.16b\n .inst 0x6e9b9591 // udot v17.4s, v12.16b, v27.16b\n"
+ ".inst 0x6e8f9595 // udot v21.4s, v12.16b, v15.16b\n"
+ "ext v15.16b, v15.16b, v15.16b, #0x1\n"
+ ".inst 0x6e9a9592 // udot v18.4s, v12.16b, v26.16b\n"
+ ".inst 0x6e9694dd // udot v29.4s, v6.16b, v22.16b\n"
+ ".inst 0x6e96969f // udot v31.4s, v20.16b, v22.16b\n"
+ "ext v22.16b, v22.16b, v22.16b, #0x1\n"
+ ".inst 0x6e8f943e // udot v30.4s, v1.16b, v15.16b\n"
+ ".inst 0x6e9a943c // udot v28.4s, v1.16b, v26.16b\n"
+ "mls v31.4s, v21.4s, v16.4s\n"
+ ".inst 0x6e969592 // udot v18.4s, v12.16b, v22.16b\n"
+ ".inst 0x6e9b969d // udot v29.4s, v20.16b, v27.16b\n"
+ "ext v27.16b, v27.16b, v27.16b, #0x1\n"
+ ".inst 0x6e9a94de // udot v30.4s, v6.16b, v26.16b\n"
+ "ldr q26, [%x[params], #0x10]\n"
+ ".inst 0x6e9694dc // udot v28.4s, v6.16b, v22.16b\n"
+ "mls v29.4s, v17.4s, v16.4s\n"
+ "mov v21.16b, v18.16b\n .inst 0x6e9b9595 // udot v21.4s, v12.16b, v27.16b\n"
+ ".inst 0x6e8f9592 // udot v18.4s, v12.16b, v15.16b\n"
+ "ldr q17, [%x[params], #0x0]\n"
+ "sqrdmulh v31.4s, v31.4s, v17.4s\n"
+ ".inst 0x6e96969e // udot v30.4s, v20.16b, v22.16b\n"
+ ".inst 0x6e9b969c // udot v28.4s, v20.16b, v27.16b\n"
+ "mls v30.4s, v18.4s, v16.4s\n"
+ "mls v28.4s, v21.4s, v16.4s\n"
+ "and v15.16b, v31.16b, v26.16b\n"
+ "sshr v15.4s, v15.4s, #0x1f\n"
+ "sqrdmulh v30.4s, v30.4s, v17.4s\n"
+ "sqrdmulh v29.4s, v29.4s, v17.4s\n"
+ "sqrdmulh v28.4s, v28.4s, v17.4s\n"
+ "ldr q1, [%x[params], #0x60]\n"
+ "sqadd v31.4s, v31.4s, v15.4s\n"
+ "and v18.16b, v30.16b, v26.16b\n"
+ "and v21.16b, v29.16b, v26.16b\n"
+ "and v17.16b, v28.16b, v26.16b\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v21.4s, v21.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "srshl v31.4s, v31.4s, v26.4s\n"
+ "sqadd v30.4s, v30.4s, v18.4s\n"
+ "ldr q18, [%x[params], #0x40]\n"
+ "sqadd v29.4s, v29.4s, v21.4s\n"
+ "ldr q27, [%x[params], #0x50]\n"
+ "sqadd v28.4s, v28.4s, v17.4s\n"
+ "ldr q15, [%x[params], #0x30]\n"
+ "add v31.4s, v31.4s, v14.4s\n"
+ "srshl v30.4s, v30.4s, v26.4s\n"
+ "srshl v29.4s, v29.4s, v26.4s\n"
+ "srshl v28.4s, v28.4s, v26.4s\n"
+ "ldr q20, [%x[params], #0x70]\n"
+ "smax v31.4s, v31.4s, v13.4s\n"
+ "add v30.4s, v30.4s, v14.4s\n"
+ "add v29.4s, v29.4s, v14.4s\n"
+ "add v28.4s, v28.4s, v14.4s\n"
+ "smin v31.4s, v31.4s, v11.4s\n"
+ "smax v30.4s, v30.4s, v13.4s\n"
+ "smax v29.4s, v29.4s, v13.4s\n"
+ "smax v28.4s, v28.4s, v13.4s\n"
+ "smin v30.4s, v30.4s, v11.4s\n"
+ "smin v29.4s, v29.4s, v11.4s\n"
+ "smin v28.4s, v28.4s, v11.4s\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "movi v22.4s, #0x0\n"
+ ".inst 0x6e979596 // udot v22.4s, v12.16b, v23.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "str s31, [x25, x27]\n"
+ "ldr q26, [%x[params], #0x20]\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ ".inst 0x6e939596 // udot v22.4s, v12.16b, v19.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "str s30, [x24, x27]\n"
+ "mov v6.16b, v22.16b\n .inst 0x6e989586 // udot v6.4s, v12.16b, v24.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "str s29, [x23, x27]\n"
+ "mov v30.16b, v26.16b\n"
+ ".inst 0x6e999596 // udot v22.4s, v12.16b, v25.16b\n"
+ "str s28, [x22, x27]\n"
+ "mov v29.16b, v26.16b\n"
+ "mov v21.16b, v26.16b\n"
+ ".inst 0x6e9995fa // udot v26.4s, v15.16b, v25.16b\n"
+ ".inst 0x6e9795fd // udot v29.4s, v15.16b, v23.16b\n"
+ ".inst 0x6e97965a // udot v26.4s, v18.16b, v23.16b\n"
+ "ext v25.16b, v25.16b, v25.16b, #0x1\n"
+ "add x27, x27, #0x4\n"
+ "ext v23.16b, v23.16b, v23.16b, #0x1\n"
+ "movi v28.4s, #0x0\n"
+ ".inst 0x6e9995fe // udot v30.4s, v15.16b, v25.16b\n"
+ ".inst 0x6e9795f5 // udot v21.4s, v15.16b, v23.16b\n"
+ ".inst 0x6e97959c // udot v28.4s, v12.16b, v23.16b\n"
+ ".inst 0x6e93965d // udot v29.4s, v18.16b, v19.16b\n"
+ ".inst 0x6e93977a // udot v26.4s, v27.16b, v19.16b\n"
+ "ext v19.16b, v19.16b, v19.16b, #0x1\n"
+ ".inst 0x6e97965e // udot v30.4s, v18.16b, v23.16b\n"
+ "ldr q4, [x9, x28]\n"
+ ".inst 0x6e939655 // udot v21.4s, v18.16b, v19.16b\n"
+ "mls v26.4s, v22.4s, v16.4s\n"
+ ".inst 0x6e93959c // udot v28.4s, v12.16b, v19.16b\n"
+ ".inst 0x6e98977d // udot v29.4s, v27.16b, v24.16b\n"
+ "ext v24.16b, v24.16b, v24.16b, #0x1\n"
+ ".inst 0x6e93977e // udot v30.4s, v27.16b, v19.16b\n"
+ ".inst 0x6e989775 // udot v21.4s, v27.16b, v24.16b\n"
+ "sqrdmulh v26.4s, v26.4s, v1.4s\n"
+ "mov v17.16b, v28.16b\n .inst 0x6e989591 // udot v17.4s, v12.16b, v24.16b\n"
+ ".inst 0x6e99959c // udot v28.4s, v12.16b, v25.16b\n"
+ "ldr q31, [x14, x28]\n"
+ "mls v30.4s, v28.4s, v16.4s\n"
+ "mls v29.4s, v6.4s, v16.4s\n"
+ "mls v21.4s, v17.4s, v16.4s\n"
+ "and v17.16b, v26.16b, v20.16b\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sqrdmulh v30.4s, v30.4s, v1.4s\n"
+ "sqrdmulh v29.4s, v29.4s, v1.4s\n"
+ "sqrdmulh v21.4s, v21.4s, v1.4s\n"
+ "ldr q27, [%x[params], #0xc0]\n"
+ "sqadd v26.4s, v26.4s, v17.4s\n"
+ "and v18.16b, v30.16b, v20.16b\n"
+ "and v6.16b, v29.16b, v20.16b\n"
+ "and v17.16b, v21.16b, v20.16b\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "srshl v26.4s, v26.4s, v20.4s\n"
+ "sqadd v30.4s, v30.4s, v18.4s\n"
+ "ldr q28, [%x[params], #0xa0]\n"
+ "sqadd v29.4s, v29.4s, v6.4s\n"
+ "ldr q24, [%x[params], #0xb0]\n"
+ "sqadd v21.4s, v21.4s, v17.4s\n"
+ "ldr q15, [%x[params], #0x90]\n"
+ "add v26.4s, v26.4s, v14.4s\n"
+ "srshl v30.4s, v30.4s, v20.4s\n"
+ "srshl v29.4s, v29.4s, v20.4s\n"
+ "srshl v21.4s, v21.4s, v20.4s\n"
+ "ldr q1, [%x[params], #0xd0]\n"
+ "smax v26.4s, v26.4s, v13.4s\n"
+ "add v30.4s, v30.4s, v14.4s\n"
+ "add v29.4s, v29.4s, v14.4s\n"
+ "add v21.4s, v21.4s, v14.4s\n"
+ "smin v26.4s, v26.4s, v11.4s\n"
+ "smax v30.4s, v30.4s, v13.4s\n"
+ "smax v29.4s, v29.4s, v13.4s\n"
+ "smax v21.4s, v21.4s, v13.4s\n"
+ "smin v30.4s, v30.4s, v11.4s\n"
+ "smin v29.4s, v29.4s, v11.4s\n"
+ "smin v21.4s, v21.4s, v11.4s\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "movi v22.4s, #0x0\n"
+ ".inst 0x6e839596 // udot v22.4s, v12.16b, v3.16b\n"
+ ".inst 0x6e809596 // udot v22.4s, v12.16b, v0.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "str s26, [x25, x27]\n"
+ "ldr q26, [%x[params], #0x80]\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "mov v18.16b, v22.16b\n .inst 0x6e829592 // udot v18.4s, v12.16b, v2.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "str s30, [x24, x27]\n"
+ ".inst 0x6e879596 // udot v22.4s, v12.16b, v7.16b\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "str s29, [x23, x27]\n"
+ "mov v6.16b, v26.16b\n"
+ "str s21, [x22, x27]\n"
+ "mov v25.16b, v26.16b\n"
+ "mov v20.16b, v26.16b\n"
+ ".inst 0x6e8795fa // udot v26.4s, v15.16b, v7.16b\n"
+ ".inst 0x6e8395f9 // udot v25.4s, v15.16b, v3.16b\n"
+ ".inst 0x6e83979a // udot v26.4s, v28.16b, v3.16b\n"
+ "ext v7.16b, v7.16b, v7.16b, #0x1\n"
+ "add x27, x27, #0x4\n"
+ "ext v3.16b, v3.16b, v3.16b, #0x1\n"
+ "movi v23.4s, #0x0\n"
+ ".inst 0x6e8795e6 // udot v6.4s, v15.16b, v7.16b\n"
+ ".inst 0x6e8395f4 // udot v20.4s, v15.16b, v3.16b\n"
+ ".inst 0x6e839597 // udot v23.4s, v12.16b, v3.16b\n"
+ ".inst 0x6e809799 // udot v25.4s, v28.16b, v0.16b\n"
+ ".inst 0x6e80971a // udot v26.4s, v24.16b, v0.16b\n"
+ "ext v0.16b, v0.16b, v0.16b, #0x1\n"
+ ".inst 0x6e839786 // udot v6.4s, v28.16b, v3.16b\n"
+ "ldr q19, [x26, x28]\n"
+ ".inst 0x6e809794 // udot v20.4s, v28.16b, v0.16b\n"
+ "mls v26.4s, v22.4s, v16.4s\n"
+ ".inst 0x6e809597 // udot v23.4s, v12.16b, v0.16b\n"
+ ".inst 0x6e829719 // udot v25.4s, v24.16b, v2.16b\n"
+ "ext v2.16b, v2.16b, v2.16b, #0x1\n"
+ ".inst 0x6e809706 // udot v6.4s, v24.16b, v0.16b\n"
+ ".inst 0x6e829714 // udot v20.4s, v24.16b, v2.16b\n"
+ "sqrdmulh v26.4s, v26.4s, v27.4s\n"
+ "mov v17.16b, v23.16b\n .inst 0x6e829591 // udot v17.4s, v12.16b, v2.16b\n"
+ ".inst 0x6e879597 // udot v23.4s, v12.16b, v7.16b\n"
+ "ldr q21, [x13, x28]\n"
+ "mls v6.4s, v23.4s, v16.4s\n"
+ "mls v25.4s, v18.4s, v16.4s\n"
+ "mls v20.4s, v17.4s, v16.4s\n"
+ "and v17.16b, v26.16b, v1.16b\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sqrdmulh v6.4s, v6.4s, v27.4s\n"
+ "sqrdmulh v25.4s, v25.4s, v27.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v27.4s\n"
+ "ldr q15, [%x[params], #0x120]\n"
+ "sqadd v26.4s, v26.4s, v17.4s\n"
+ "and v18.16b, v6.16b, v1.16b\n"
+ "and v22.16b, v25.16b, v1.16b\n"
+ "and v17.16b, v20.16b, v1.16b\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v22.4s, v22.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "srshl v26.4s, v26.4s, v1.4s\n"
+ "sqadd v6.4s, v6.4s, v18.4s\n"
+ "ldr q30, [%x[params], #0x100]\n"
+ "sqadd v25.4s, v25.4s, v22.4s\n"
+ "ldr q27, [%x[params], #0x110]\n"
+ "sqadd v20.4s, v20.4s, v17.4s\n"
+ "ldr q24, [%x[params], #0xf0]\n"
+ "add v26.4s, v26.4s, v14.4s\n"
+ "srshl v6.4s, v6.4s, v1.4s\n"
+ "srshl v25.4s, v25.4s, v1.4s\n"
+ "srshl v20.4s, v20.4s, v1.4s\n"
+ "ldr q23, [%x[params], #0x130]\n"
+ "smax v26.4s, v26.4s, v13.4s\n"
+ "add v6.4s, v6.4s, v14.4s\n"
+ "add v25.4s, v25.4s, v14.4s\n"
+ "add v20.4s, v20.4s, v14.4s\n"
+ "smin v26.4s, v26.4s, v11.4s\n"
+ "smax v6.4s, v6.4s, v13.4s\n"
+ "smax v25.4s, v25.4s, v13.4s\n"
+ "smax v20.4s, v20.4s, v13.4s\n"
+ "smin v6.4s, v6.4s, v11.4s\n"
+ "smin v25.4s, v25.4s, v11.4s\n"
+ "smin v20.4s, v20.4s, v11.4s\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "movi v0.4s, #0x0\n"
+ ".inst 0x6e8a9580 // udot v0.4s, v12.16b, v10.16b\n"
+ ".inst 0x6e859580 // udot v0.4s, v12.16b, v5.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "uzp1 v6.16b, v6.16b, v6.16b\n"
+ "str s26, [x25, x27]\n"
+ "ldr q28, [%x[params], #0xe0]\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "mov v22.16b, v0.16b\n .inst 0x6e899596 // udot v22.4s, v12.16b, v9.16b\n"
+ "uzp1 v6.16b, v6.16b, v6.16b\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "str s6, [x24, x27]\n"
+ ".inst 0x6e889580 // udot v0.4s, v12.16b, v8.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "str s25, [x23, x27]\n"
+ "mov v29.16b, v28.16b\n"
+ "str s20, [x22, x27]\n"
+ "mov v25.16b, v28.16b\n"
+ "mov v7.16b, v28.16b\n"
+ ".inst 0x6e88971c // udot v28.4s, v24.16b, v8.16b\n"
+ ".inst 0x6e8a9719 // udot v25.4s, v24.16b, v10.16b\n"
+ ".inst 0x6e8a97dc // udot v28.4s, v30.16b, v10.16b\n"
+ "ext v8.16b, v8.16b, v8.16b, #0x1\n"
+ "add x27, x27, #0x4\n"
+ "ext v10.16b, v10.16b, v10.16b, #0x1\n"
+ "movi v17.4s, #0x0\n"
+ ".inst 0x6e88971d // udot v29.4s, v24.16b, v8.16b\n"
+ ".inst 0x6e8a9707 // udot v7.4s, v24.16b, v10.16b\n"
+ ".inst 0x6e8a9591 // udot v17.4s, v12.16b, v10.16b\n"
+ ".inst 0x6e8597d9 // udot v25.4s, v30.16b, v5.16b\n"
+ ".inst 0x6e85977c // udot v28.4s, v27.16b, v5.16b\n"
+ "ext v5.16b, v5.16b, v5.16b, #0x1\n"
+ ".inst 0x6e8a97dd // udot v29.4s, v30.16b, v10.16b\n"
+ "ldr q10, [x21, x28]\n"
+ ".inst 0x6e8597c7 // udot v7.4s, v30.16b, v5.16b\n"
+ "mls v28.4s, v0.4s, v16.4s\n"
+ ".inst 0x6e859591 // udot v17.4s, v12.16b, v5.16b\n"
+ ".inst 0x6e899779 // udot v25.4s, v27.16b, v9.16b\n"
+ "ext v9.16b, v9.16b, v9.16b, #0x1\n"
+ ".inst 0x6e85977d // udot v29.4s, v27.16b, v5.16b\n"
+ ".inst 0x6e899767 // udot v7.4s, v27.16b, v9.16b\n"
+ "sqrdmulh v28.4s, v28.4s, v15.4s\n"
+ "mov v18.16b, v17.16b\n .inst 0x6e899592 // udot v18.4s, v12.16b, v9.16b\n"
+ ".inst 0x6e889591 // udot v17.4s, v12.16b, v8.16b\n"
+ "ldr q8, [x12, x28]\n"
+ "mls v29.4s, v17.4s, v16.4s\n"
+ "mls v25.4s, v22.4s, v16.4s\n"
+ "mls v7.4s, v18.4s, v16.4s\n"
+ "and v17.16b, v28.16b, v23.16b\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sqrdmulh v29.4s, v29.4s, v15.4s\n"
+ "sqrdmulh v25.4s, v25.4s, v15.4s\n"
+ "sqrdmulh v7.4s, v7.4s, v15.4s\n"
+ "ldr q15, [x15, x28]\n"
+ "sqadd v28.4s, v28.4s, v17.4s\n"
+ "ldp x21, x20, [%x[inptrs], #0x40]\n"
+ "ldr q22, [x21, x28]\n"
+ "ldr q3, [x20, x28]\n"
+ "and v24.16b, v29.16b, v23.16b\n"
+ "and v20.16b, v25.16b, v23.16b\n"
+ "and v17.16b, v7.16b, v23.16b\n"
+ "sshr v24.4s, v24.4s, #0x1f\n"
+ "ldp x21, x20, [%x[inptrs], #0x50]\n"
+ "ldr q2, [x21, x28]\n"
+ "ldr q5, [x20, x28]\n"
+ "sshr v20.4s, v20.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "srshl v28.4s, v28.4s, v23.4s\n"
+ "sqadd v29.4s, v29.4s, v24.4s\n"
+ "ldr q6, [%x[params], #0x160]\n"
+ "sqadd v25.4s, v25.4s, v20.4s\n"
+ "ldr q20, [%x[params], #0x170]\n"
+ "sqadd v7.4s, v7.4s, v17.4s\n"
+ "ldr q1, [%x[params], #0x150]\n"
+ "add v28.4s, v28.4s, v14.4s\n"
+ "srshl v29.4s, v29.4s, v23.4s\n"
+ "srshl v25.4s, v25.4s, v23.4s\n"
+ "srshl v7.4s, v7.4s, v23.4s\n"
+ "ldr q26, [x10, x28]\n"
+ "ldp x21, x20, [%x[inptrs], #0x60]\n"
+ "ldr q27, [x21, x28]\n"
+ "ldr q30, [x20, x28]\n"
+ "smax v28.4s, v28.4s, v13.4s\n"
+ "add v29.4s, v29.4s, v14.4s\n"
+ "add v25.4s, v25.4s, v14.4s\n"
+ "add v7.4s, v7.4s, v14.4s\n"
+ "ldp x21, x20, [%x[inptrs], #0x70]\n"
+ "ldr q23, [x21, x28]\n"
+ "ldr q9, [x20, x28]\n"
+ "smin v28.4s, v28.4s, v11.4s\n"
+ "smax v29.4s, v29.4s, v13.4s\n"
+ "ldp x15, x14, [%x[inptrs], #0x0]\n"
+ "smax v25.4s, v25.4s, v13.4s\n"
+ "smax v7.4s, v7.4s, v13.4s\n"
+ "ldp x13, x12, [%x[inptrs], #0x10]\n"
+ "ldp x10, x9, [%x[inptrs], #0x20]\n"
+ "smin v29.4s, v29.4s, v11.4s\n"
+ "smin v25.4s, v25.4s, v11.4s\n"
+ "ldp x26, x21, [%x[inptrs], #0x30]\n"
+ "smin v7.4s, v7.4s, v11.4s\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "str s28, [x25, x27]\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "uzp1 v7.16b, v7.16b, v7.16b\n"
+ "zip2 v17.16b, v15.16b, v21.16b\n"
+ "zip1 v15.16b, v15.16b, v21.16b\n"
+ "zip1 v18.16b, v31.16b, v8.16b\n"
+ "zip2 v8.16b, v31.16b, v8.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "str s29, [x24, x27]\n"
+ "uzp1 v7.16b, v7.16b, v7.16b\n"
+ "str s25, [x23, x27]\n"
+ "zip2 v25.16b, v15.16b, v18.16b\n"
+ "str s7, [x22, x27]\n"
+ "zip1 v15.16b, v15.16b, v18.16b\n"
+ "zip1 v7.16b, v17.16b, v8.16b\n"
+ "add x27, x27, #0x4\n"
+ "zip2 v8.16b, v17.16b, v8.16b\n"
+ "ldr q31, [%x[params], #0x140]\n"
+ "zip2 v29.16b, v26.16b, v19.16b\n"
+ "add %x[params], %x[params], #0x180\n"
+ "zip1 v26.16b, v26.16b, v19.16b\n"
+ "zip1 v28.16b, v4.16b, v10.16b\n"
+ "zip2 v10.16b, v4.16b, v10.16b\n"
+ "zip2 v24.16b, v22.16b, v2.16b\n"
+ "zip1 v22.16b, v22.16b, v2.16b\n"
+ "zip1 v21.16b, v3.16b, v5.16b\n"
+ "zip2 v5.16b, v3.16b, v5.16b\n"
+ "zip2 v18.16b, v27.16b, v23.16b\n"
+ "zip1 v27.16b, v27.16b, v23.16b\n"
+ "zip1 v17.16b, v30.16b, v9.16b\n"
+ "zip2 v9.16b, v30.16b, v9.16b\n"
+ "zip2 v23.16b, v26.16b, v28.16b\n"
+ "zip1 v26.16b, v26.16b, v28.16b\n"
+ "zip1 v3.16b, v29.16b, v10.16b\n"
+ "zip2 v10.16b, v29.16b, v10.16b\n"
+ "zip2 v19.16b, v22.16b, v21.16b\n"
+ "zip1 v22.16b, v22.16b, v21.16b\n"
+ "zip1 v0.16b, v24.16b, v5.16b\n"
+ "zip2 v5.16b, v24.16b, v5.16b\n"
+ "zip2 v24.16b, v27.16b, v17.16b\n"
+ "zip1 v27.16b, v27.16b, v17.16b\n"
+ "zip1 v2.16b, v18.16b, v9.16b\n"
+ "zip2 v9.16b, v18.16b, v9.16b\n"
+ "mov v30.16b, v31.16b\n"
+ "mov v29.16b, v31.16b\n"
+ "mov v28.16b, v31.16b\n"
+ "bgt 1b\n"
+ "2:" // Detached iteration
+ "movi v21.4s, #0x0\n"
+ ".inst 0x6e9a9595 // udot v21.4s, v12.16b, v26.16b\n"
+ ".inst 0x6e8f943f // udot v31.4s, v1.16b, v15.16b\n"
+ "tst %x[n_channels], #0xf\n"
+ ".inst 0x6e969595 // udot v21.4s, v12.16b, v22.16b\n"
+ ".inst 0x6e9a943d // udot v29.4s, v1.16b, v26.16b\n"
+ "movi v18.4s, #0x0\n"
+ "add x28, x28, #0x10\n"
+ ".inst 0x6e9a94df // udot v31.4s, v6.16b, v26.16b\n"
+ "ext v26.16b, v26.16b, v26.16b, #0x1\n"
+ "mov v17.16b, v21.16b\n .inst 0x6e9b9591 // udot v17.4s, v12.16b, v27.16b\n"
+ ".inst 0x6e8f9595 // udot v21.4s, v12.16b, v15.16b\n"
+ "ext v15.16b, v15.16b, v15.16b, #0x1\n"
+ ".inst 0x6e9a9592 // udot v18.4s, v12.16b, v26.16b\n"
+ ".inst 0x6e9694dd // udot v29.4s, v6.16b, v22.16b\n"
+ ".inst 0x6e96969f // udot v31.4s, v20.16b, v22.16b\n"
+ "ext v22.16b, v22.16b, v22.16b, #0x1\n"
+ ".inst 0x6e8f943e // udot v30.4s, v1.16b, v15.16b\n"
+ ".inst 0x6e9a943c // udot v28.4s, v1.16b, v26.16b\n"
+ "mls v31.4s, v21.4s, v16.4s\n"
+ ".inst 0x6e969592 // udot v18.4s, v12.16b, v22.16b\n"
+ ".inst 0x6e9b969d // udot v29.4s, v20.16b, v27.16b\n"
+ "ext v27.16b, v27.16b, v27.16b, #0x1\n"
+ ".inst 0x6e9a94de // udot v30.4s, v6.16b, v26.16b\n"
+ "ldr q4, [%x[params], #0x10]\n"
+ ".inst 0x6e9694dc // udot v28.4s, v6.16b, v22.16b\n"
+ "mls v29.4s, v17.4s, v16.4s\n"
+ "mov v21.16b, v18.16b\n .inst 0x6e9b9595 // udot v21.4s, v12.16b, v27.16b\n"
+ ".inst 0x6e8f9592 // udot v18.4s, v12.16b, v15.16b\n"
+ "ldr q17, [%x[params], #0x0]\n"
+ "sqrdmulh v31.4s, v31.4s, v17.4s\n"
+ ".inst 0x6e96969e // udot v30.4s, v20.16b, v22.16b\n"
+ ".inst 0x6e9b969c // udot v28.4s, v20.16b, v27.16b\n"
+ "mls v30.4s, v18.4s, v16.4s\n"
+ "mls v28.4s, v21.4s, v16.4s\n"
+ "and v27.16b, v31.16b, v4.16b\n"
+ "sshr v27.4s, v27.4s, #0x1f\n"
+ "sqrdmulh v30.4s, v30.4s, v17.4s\n"
+ "sqrdmulh v29.4s, v29.4s, v17.4s\n"
+ "sqrdmulh v28.4s, v28.4s, v17.4s\n"
+ "ldr q15, [%x[params], #0x60]\n"
+ "sqadd v31.4s, v31.4s, v27.4s\n"
+ "and v20.16b, v30.16b, v4.16b\n"
+ "and v18.16b, v29.16b, v4.16b\n"
+ "and v17.16b, v28.16b, v4.16b\n"
+ "sshr v20.4s, v20.4s, #0x1f\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "srshl v31.4s, v31.4s, v4.4s\n"
+ "sqadd v30.4s, v30.4s, v20.4s\n"
+ "ldr q27, [%x[params], #0x40]\n"
+ "sqadd v29.4s, v29.4s, v18.4s\n"
+ "ldr q26, [%x[params], #0x50]\n"
+ "sqadd v28.4s, v28.4s, v17.4s\n"
+ "ldr q6, [%x[params], #0x30]\n"
+ "add v31.4s, v31.4s, v14.4s\n"
+ "srshl v30.4s, v30.4s, v4.4s\n"
+ "srshl v29.4s, v29.4s, v4.4s\n"
+ "srshl v28.4s, v28.4s, v4.4s\n"
+ "ldr q4, [%x[params], #0x70]\n"
+ "smax v31.4s, v31.4s, v13.4s\n"
+ "add v30.4s, v30.4s, v14.4s\n"
+ "add v29.4s, v29.4s, v14.4s\n"
+ "add v28.4s, v28.4s, v14.4s\n"
+ "smin v31.4s, v31.4s, v11.4s\n"
+ "smax v30.4s, v30.4s, v13.4s\n"
+ "smax v29.4s, v29.4s, v13.4s\n"
+ "smax v28.4s, v28.4s, v13.4s\n"
+ "smin v30.4s, v30.4s, v11.4s\n"
+ "smin v29.4s, v29.4s, v11.4s\n"
+ "smin v28.4s, v28.4s, v11.4s\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "movi v1.4s, #0x0\n"
+ ".inst 0x6e979581 // udot v1.4s, v12.16b, v23.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "str s31, [x25, x27]\n"
+ "ldr q31, [%x[params], #0x20]\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ ".inst 0x6e939581 // udot v1.4s, v12.16b, v19.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "str s30, [x24, x27]\n"
+ "mov v22.16b, v1.16b\n .inst 0x6e989596 // udot v22.4s, v12.16b, v24.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "str s29, [x23, x27]\n"
+ "mov v29.16b, v31.16b\n"
+ ".inst 0x6e999581 // udot v1.4s, v12.16b, v25.16b\n"
+ "str s28, [x22, x27]\n"
+ "mov v21.16b, v31.16b\n"
+ "mov v20.16b, v31.16b\n"
+ ".inst 0x6e9994df // udot v31.4s, v6.16b, v25.16b\n"
+ ".inst 0x6e9794d5 // udot v21.4s, v6.16b, v23.16b\n"
+ ".inst 0x6e97977f // udot v31.4s, v27.16b, v23.16b\n"
+ "ext v25.16b, v25.16b, v25.16b, #0x1\n"
+ "add x27, x27, #0x4\n"
+ "ext v23.16b, v23.16b, v23.16b, #0x1\n"
+ "movi v18.4s, #0x0\n"
+ ".inst 0x6e9994dd // udot v29.4s, v6.16b, v25.16b\n"
+ ".inst 0x6e9794d4 // udot v20.4s, v6.16b, v23.16b\n"
+ ".inst 0x6e979592 // udot v18.4s, v12.16b, v23.16b\n"
+ ".inst 0x6e939775 // udot v21.4s, v27.16b, v19.16b\n"
+ ".inst 0x6e93975f // udot v31.4s, v26.16b, v19.16b\n"
+ "ext v19.16b, v19.16b, v19.16b, #0x1\n"
+ ".inst 0x6e97977d // udot v29.4s, v27.16b, v23.16b\n"
+ ".inst 0x6e939774 // udot v20.4s, v27.16b, v19.16b\n"
+ "mls v31.4s, v1.4s, v16.4s\n"
+ ".inst 0x6e939592 // udot v18.4s, v12.16b, v19.16b\n"
+ ".inst 0x6e989755 // udot v21.4s, v26.16b, v24.16b\n"
+ "ext v24.16b, v24.16b, v24.16b, #0x1\n"
+ ".inst 0x6e93975d // udot v29.4s, v26.16b, v19.16b\n"
+ ".inst 0x6e989754 // udot v20.4s, v26.16b, v24.16b\n"
+ "sqrdmulh v31.4s, v31.4s, v15.4s\n"
+ "mov v17.16b, v18.16b\n .inst 0x6e989591 // udot v17.4s, v12.16b, v24.16b\n"
+ ".inst 0x6e999592 // udot v18.4s, v12.16b, v25.16b\n"
+ "mls v29.4s, v18.4s, v16.4s\n"
+ "mls v21.4s, v22.4s, v16.4s\n"
+ "mls v20.4s, v17.4s, v16.4s\n"
+ "and v17.16b, v31.16b, v4.16b\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sqrdmulh v29.4s, v29.4s, v15.4s\n"
+ "sqrdmulh v21.4s, v21.4s, v15.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v15.4s\n"
+ "ldr q27, [%x[params], #0xc0]\n"
+ "sqadd v31.4s, v31.4s, v17.4s\n"
+ "and v19.16b, v29.16b, v4.16b\n"
+ "and v18.16b, v21.16b, v4.16b\n"
+ "and v17.16b, v20.16b, v4.16b\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "srshl v31.4s, v31.4s, v4.4s\n"
+ "sqadd v29.4s, v29.4s, v19.4s\n"
+ "ldr q26, [%x[params], #0xa0]\n"
+ "sqadd v21.4s, v21.4s, v18.4s\n"
+ "ldr q25, [%x[params], #0xb0]\n"
+ "sqadd v20.4s, v20.4s, v17.4s\n"
+ "ldr q24, [%x[params], #0x90]\n"
+ "add v31.4s, v31.4s, v14.4s\n"
+ "srshl v29.4s, v29.4s, v4.4s\n"
+ "srshl v21.4s, v21.4s, v4.4s\n"
+ "srshl v20.4s, v20.4s, v4.4s\n"
+ "ldr q1, [%x[params], #0xd0]\n"
+ "smax v31.4s, v31.4s, v13.4s\n"
+ "add v29.4s, v29.4s, v14.4s\n"
+ "add v21.4s, v21.4s, v14.4s\n"
+ "add v20.4s, v20.4s, v14.4s\n"
+ "smin v31.4s, v31.4s, v11.4s\n"
+ "smax v29.4s, v29.4s, v13.4s\n"
+ "smax v21.4s, v21.4s, v13.4s\n"
+ "smax v20.4s, v20.4s, v13.4s\n"
+ "smin v29.4s, v29.4s, v11.4s\n"
+ "smin v21.4s, v21.4s, v11.4s\n"
+ "smin v20.4s, v20.4s, v11.4s\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "movi v23.4s, #0x0\n"
+ ".inst 0x6e839597 // udot v23.4s, v12.16b, v3.16b\n"
+ ".inst 0x6e809597 // udot v23.4s, v12.16b, v0.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "str s31, [x25, x27]\n"
+ "ldr q31, [%x[params], #0x80]\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "mov v22.16b, v23.16b\n .inst 0x6e829596 // udot v22.4s, v12.16b, v2.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "str s29, [x24, x27]\n"
+ ".inst 0x6e879597 // udot v23.4s, v12.16b, v7.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "str s21, [x23, x27]\n"
+ "mov v21.16b, v31.16b\n"
+ "str s20, [x22, x27]\n"
+ "mov v4.16b, v31.16b\n"
+ "mov v20.16b, v31.16b\n"
+ ".inst 0x6e87971f // udot v31.4s, v24.16b, v7.16b\n"
+ ".inst 0x6e839704 // udot v4.4s, v24.16b, v3.16b\n"
+ ".inst 0x6e83975f // udot v31.4s, v26.16b, v3.16b\n"
+ "ext v7.16b, v7.16b, v7.16b, #0x1\n"
+ "add x27, x27, #0x4\n"
+ "ext v3.16b, v3.16b, v3.16b, #0x1\n"
+ "movi v18.4s, #0x0\n"
+ ".inst 0x6e879715 // udot v21.4s, v24.16b, v7.16b\n"
+ ".inst 0x6e839714 // udot v20.4s, v24.16b, v3.16b\n"
+ ".inst 0x6e839592 // udot v18.4s, v12.16b, v3.16b\n"
+ ".inst 0x6e809744 // udot v4.4s, v26.16b, v0.16b\n"
+ ".inst 0x6e80973f // udot v31.4s, v25.16b, v0.16b\n"
+ "ext v0.16b, v0.16b, v0.16b, #0x1\n"
+ ".inst 0x6e839755 // udot v21.4s, v26.16b, v3.16b\n"
+ ".inst 0x6e809754 // udot v20.4s, v26.16b, v0.16b\n"
+ "mls v31.4s, v23.4s, v16.4s\n"
+ ".inst 0x6e809592 // udot v18.4s, v12.16b, v0.16b\n"
+ ".inst 0x6e829724 // udot v4.4s, v25.16b, v2.16b\n"
+ "ext v2.16b, v2.16b, v2.16b, #0x1\n"
+ ".inst 0x6e809735 // udot v21.4s, v25.16b, v0.16b\n"
+ ".inst 0x6e829734 // udot v20.4s, v25.16b, v2.16b\n"
+ "sqrdmulh v31.4s, v31.4s, v27.4s\n"
+ "mov v17.16b, v18.16b\n .inst 0x6e829591 // udot v17.4s, v12.16b, v2.16b\n"
+ ".inst 0x6e879592 // udot v18.4s, v12.16b, v7.16b\n"
+ "mls v21.4s, v18.4s, v16.4s\n"
+ "mls v4.4s, v22.4s, v16.4s\n"
+ "mls v20.4s, v17.4s, v16.4s\n"
+ "and v17.16b, v31.16b, v1.16b\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sqrdmulh v21.4s, v21.4s, v27.4s\n"
+ "sqrdmulh v4.4s, v4.4s, v27.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v27.4s\n"
+ "ldr q30, [%x[params], #0x120]\n"
+ "sqadd v31.4s, v31.4s, v17.4s\n"
+ "and v19.16b, v21.16b, v1.16b\n"
+ "and v18.16b, v4.16b, v1.16b\n"
+ "and v17.16b, v20.16b, v1.16b\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "srshl v31.4s, v31.4s, v1.4s\n"
+ "sqadd v21.4s, v21.4s, v19.4s\n"
+ "ldr q29, [%x[params], #0x100]\n"
+ "sqadd v4.4s, v4.4s, v18.4s\n"
+ "ldr q28, [%x[params], #0x110]\n"
+ "sqadd v20.4s, v20.4s, v17.4s\n"
+ "ldr q27, [%x[params], #0xf0]\n"
+ "add v31.4s, v31.4s, v14.4s\n"
+ "srshl v21.4s, v21.4s, v1.4s\n"
+ "srshl v4.4s, v4.4s, v1.4s\n"
+ "srshl v20.4s, v20.4s, v1.4s\n"
+ "ldr q26, [%x[params], #0x130]\n"
+ "smax v31.4s, v31.4s, v13.4s\n"
+ "add v21.4s, v21.4s, v14.4s\n"
+ "add v4.4s, v4.4s, v14.4s\n"
+ "add v20.4s, v20.4s, v14.4s\n"
+ "smin v31.4s, v31.4s, v11.4s\n"
+ "smax v21.4s, v21.4s, v13.4s\n"
+ "smax v4.4s, v4.4s, v13.4s\n"
+ "smax v20.4s, v20.4s, v13.4s\n"
+ "smin v21.4s, v21.4s, v11.4s\n"
+ "smin v4.4s, v4.4s, v11.4s\n"
+ "smin v20.4s, v20.4s, v11.4s\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "movi v25.4s, #0x0\n"
+ ".inst 0x6e8a9599 // udot v25.4s, v12.16b, v10.16b\n"
+ ".inst 0x6e859599 // udot v25.4s, v12.16b, v5.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "str s31, [x25, x27]\n"
+ "ldr q24, [%x[params], #0xe0]\n"
+ "uzp1 v4.16b, v4.16b, v4.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "mov v23.16b, v25.16b\n .inst 0x6e899597 // udot v23.4s, v12.16b, v9.16b\n"
+ "add %x[params], %x[params], #0x140\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "uzp1 v4.16b, v4.16b, v4.16b\n"
+ "str s21, [x24, x27]\n"
+ ".inst 0x6e889599 // udot v25.4s, v12.16b, v8.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "str s4, [x23, x27]\n"
+ "mov v22.16b, v24.16b\n"
+ "str s20, [x22, x27]\n"
+ "mov v21.16b, v24.16b\n"
+ "mov v20.16b, v24.16b\n"
+ ".inst 0x6e889778 // udot v24.4s, v27.16b, v8.16b\n"
+ ".inst 0x6e8a9775 // udot v21.4s, v27.16b, v10.16b\n"
+ ".inst 0x6e8a97b8 // udot v24.4s, v29.16b, v10.16b\n"
+ "ext v8.16b, v8.16b, v8.16b, #0x1\n"
+ "add x27, x27, #0x4\n"
+ "ext v10.16b, v10.16b, v10.16b, #0x1\n"
+ "movi v18.4s, #0x0\n"
+ ".inst 0x6e889776 // udot v22.4s, v27.16b, v8.16b\n"
+ ".inst 0x6e8a9774 // udot v20.4s, v27.16b, v10.16b\n"
+ ".inst 0x6e8a9592 // udot v18.4s, v12.16b, v10.16b\n"
+ ".inst 0x6e8597b5 // udot v21.4s, v29.16b, v5.16b\n"
+ ".inst 0x6e859798 // udot v24.4s, v28.16b, v5.16b\n"
+ "ext v5.16b, v5.16b, v5.16b, #0x1\n"
+ ".inst 0x6e8a97b6 // udot v22.4s, v29.16b, v10.16b\n"
+ ".inst 0x6e8597b4 // udot v20.4s, v29.16b, v5.16b\n"
+ "mls v24.4s, v25.4s, v16.4s\n"
+ ".inst 0x6e859592 // udot v18.4s, v12.16b, v5.16b\n"
+ ".inst 0x6e899795 // udot v21.4s, v28.16b, v9.16b\n"
+ "ext v9.16b, v9.16b, v9.16b, #0x1\n"
+ ".inst 0x6e859796 // udot v22.4s, v28.16b, v5.16b\n"
+ ".inst 0x6e899794 // udot v20.4s, v28.16b, v9.16b\n"
+ "sqrdmulh v24.4s, v24.4s, v30.4s\n"
+ "mov v17.16b, v18.16b\n .inst 0x6e899591 // udot v17.4s, v12.16b, v9.16b\n"
+ ".inst 0x6e889592 // udot v18.4s, v12.16b, v8.16b\n"
+ "mls v22.4s, v18.4s, v16.4s\n"
+ "mls v21.4s, v23.4s, v16.4s\n"
+ "mls v20.4s, v17.4s, v16.4s\n"
+ "and v17.16b, v24.16b, v26.16b\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sqrdmulh v22.4s, v22.4s, v30.4s\n"
+ "sqrdmulh v21.4s, v21.4s, v30.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v30.4s\n"
+ "sqadd v24.4s, v24.4s, v17.4s\n"
+ "and v19.16b, v22.16b, v26.16b\n"
+ "and v18.16b, v21.16b, v26.16b\n"
+ "and v17.16b, v20.16b, v26.16b\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sqadd v22.4s, v22.4s, v19.4s\n"
+ "sqadd v21.4s, v21.4s, v18.4s\n"
+ "sqadd v20.4s, v20.4s, v17.4s\n"
+ "srshl v24.4s, v24.4s, v26.4s\n"
+ "srshl v22.4s, v22.4s, v26.4s\n"
+ "srshl v21.4s, v21.4s, v26.4s\n"
+ "srshl v20.4s, v20.4s, v26.4s\n"
+ "add v24.4s, v24.4s, v14.4s\n"
+ "add v22.4s, v22.4s, v14.4s\n"
+ "add v21.4s, v21.4s, v14.4s\n"
+ "add v20.4s, v20.4s, v14.4s\n"
+ "smax v24.4s, v24.4s, v13.4s\n"
+ "smax v22.4s, v22.4s, v13.4s\n"
+ "smax v21.4s, v21.4s, v13.4s\n"
+ "smax v20.4s, v20.4s, v13.4s\n"
+ "smin v24.4s, v24.4s, v11.4s\n"
+ "smin v22.4s, v22.4s, v11.4s\n"
+ "smin v21.4s, v21.4s, v11.4s\n"
+ "smin v20.4s, v20.4s, v11.4s\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "str s24, [x25, x27]\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "str s22, [x24, x27]\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "str s21, [x23, x27]\n"
+ "str s20, [x22, x27]\n"
+ "add x27, x27, #0x4\n"
+ "beq 35f\n"
+ "3:" // Oddments
+ "and x20, %x[n_channels], #0xf\n"
+ "add x15, x15, x28\n"
+ "add x14, x14, x28\n"
+ "add x13, x13, x28\n"
+ "add x12, x12, x28\n"
+ "add x10, x10, x28\n"
+ "add x9, x9, x28\n"
+ "add x26, x26, x28\n"
+ "add x21, x21, x28\n"
+ "tbz %x[n_channels], #3, 7f\n"
+ "ldr d15, [x15], #0x8\n"
+ "ldr d25, [x14], #0x8\n"
+ "ldr d7, [x13], #0x8\n"
+ "ldr d8, [x12], #0x8\n"
+ "ldr d26, [x10], #0x8\n"
+ "ldr d23, [x9], #0x8\n"
+ "ldr d3, [x26], #0x8\n"
+ "ldr d10, [x21], #0x8\n"
+ "tbz %x[n_channels], #2, 5f\n"
+ "ld1 { v15.s }[2], [x15], #0x4\n"
+ "ld1 { v25.s }[2], [x14], #0x4\n"
+ "ld1 { v7.s }[2], [x13], #0x4\n"
+ "ld1 { v8.s }[2], [x12], #0x4\n"
+ "ld1 { v26.s }[2], [x10], #0x4\n"
+ "ld1 { v23.s }[2], [x9], #0x4\n"
+ "ld1 { v3.s }[2], [x26], #0x4\n"
+ "ld1 { v10.s }[2], [x21], #0x4\n"
+ "tbz %x[n_channels], #1, 4f\n"
+ "ld1 { v15.h }[6], [x15], #0x2\n"
+ "ld1 { v25.h }[6], [x14], #0x2\n"
+ "ld1 { v7.h }[6], [x13], #0x2\n"
+ "ld1 { v8.h }[6], [x12], #0x2\n"
+ "ld1 { v26.h }[6], [x10], #0x2\n"
+ "ld1 { v23.h }[6], [x9], #0x2\n"
+ "ld1 { v3.h }[6], [x26], #0x2\n"
+ "ld1 { v10.h }[6], [x21], #0x2\n"
+ "tbz %x[n_channels], #0, 11f\n"
+ "ld1 { v15.b }[14], [x15], #0x1\n"
+ "ld1 { v25.b }[14], [x14], #0x1\n"
+ "ld1 { v7.b }[14], [x13], #0x1\n"
+ "ld1 { v8.b }[14], [x12], #0x1\n"
+ "ld1 { v26.b }[14], [x10], #0x1\n"
+ "ld1 { v23.b }[14], [x9], #0x1\n"
+ "ld1 { v3.b }[14], [x26], #0x1\n"
+ "ld1 { v10.b }[14], [x21], #0x1\n"
+ "b 11f\n"
+ "4:" // Oddments: Load (A): Bit 3: Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 11f\n"
+ "ld1 { v15.b }[12], [x15], #0x1\n"
+ "ld1 { v25.b }[12], [x14], #0x1\n"
+ "ld1 { v7.b }[12], [x13], #0x1\n"
+ "ld1 { v8.b }[12], [x12], #0x1\n"
+ "ld1 { v26.b }[12], [x10], #0x1\n"
+ "ld1 { v23.b }[12], [x9], #0x1\n"
+ "ld1 { v3.b }[12], [x26], #0x1\n"
+ "ld1 { v10.b }[12], [x21], #0x1\n"
+ "b 11f\n"
+ "5:" // Oddments: Load (A): Bit 3: Bit 2: Unset
+ "tbz %x[n_channels], #1, 6f\n"
+ "ld1 { v15.h }[4], [x15], #0x2\n"
+ "ld1 { v25.h }[4], [x14], #0x2\n"
+ "ld1 { v7.h }[4], [x13], #0x2\n"
+ "ld1 { v8.h }[4], [x12], #0x2\n"
+ "ld1 { v26.h }[4], [x10], #0x2\n"
+ "ld1 { v23.h }[4], [x9], #0x2\n"
+ "ld1 { v3.h }[4], [x26], #0x2\n"
+ "ld1 { v10.h }[4], [x21], #0x2\n"
+ "tbz %x[n_channels], #0, 11f\n"
+ "ld1 { v15.b }[10], [x15], #0x1\n"
+ "ld1 { v25.b }[10], [x14], #0x1\n"
+ "ld1 { v7.b }[10], [x13], #0x1\n"
+ "ld1 { v8.b }[10], [x12], #0x1\n"
+ "ld1 { v26.b }[10], [x10], #0x1\n"
+ "ld1 { v23.b }[10], [x9], #0x1\n"
+ "ld1 { v3.b }[10], [x26], #0x1\n"
+ "ld1 { v10.b }[10], [x21], #0x1\n"
+ "b 11f\n"
+ "6:" // Oddments: Load (A): Bit 3: Bit 2: Unset: Bit 1: Unset
+ "tbz %x[n_channels], #0, 11f\n"
+ "ld1 { v15.b }[8], [x15], #0x1\n"
+ "ld1 { v25.b }[8], [x14], #0x1\n"
+ "ld1 { v7.b }[8], [x13], #0x1\n"
+ "ld1 { v8.b }[8], [x12], #0x1\n"
+ "ld1 { v26.b }[8], [x10], #0x1\n"
+ "ld1 { v23.b }[8], [x9], #0x1\n"
+ "ld1 { v3.b }[8], [x26], #0x1\n"
+ "ld1 { v10.b }[8], [x21], #0x1\n"
+ "b 11f\n"
+ "7:" // Oddments: Load (A): Bit 3: Unset
+ "tbz %x[n_channels], #2, 9f\n"
+ "ldr s15, [x15], #0x4\n"
+ "ldr s25, [x14], #0x4\n"
+ "ldr s7, [x13], #0x4\n"
+ "ldr s8, [x12], #0x4\n"
+ "ldr s26, [x10], #0x4\n"
+ "ldr s23, [x9], #0x4\n"
+ "ldr s3, [x26], #0x4\n"
+ "ldr s10, [x21], #0x4\n"
+ "tbz %x[n_channels], #1, 8f\n"
+ "ld1 { v15.h }[2], [x15], #0x2\n"
+ "ld1 { v25.h }[2], [x14], #0x2\n"
+ "ld1 { v7.h }[2], [x13], #0x2\n"
+ "ld1 { v8.h }[2], [x12], #0x2\n"
+ "ld1 { v26.h }[2], [x10], #0x2\n"
+ "ld1 { v23.h }[2], [x9], #0x2\n"
+ "ld1 { v3.h }[2], [x26], #0x2\n"
+ "ld1 { v10.h }[2], [x21], #0x2\n"
+ "tbz %x[n_channels], #0, 11f\n"
+ "ld1 { v15.b }[6], [x15], #0x1\n"
+ "ld1 { v25.b }[6], [x14], #0x1\n"
+ "ld1 { v7.b }[6], [x13], #0x1\n"
+ "ld1 { v8.b }[6], [x12], #0x1\n"
+ "ld1 { v26.b }[6], [x10], #0x1\n"
+ "ld1 { v23.b }[6], [x9], #0x1\n"
+ "ld1 { v3.b }[6], [x26], #0x1\n"
+ "ld1 { v10.b }[6], [x21], #0x1\n"
+ "b 11f\n"
+ "8:" // Oddments: Load (A): Bit 3: Unset: Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 11f\n"
+ "ld1 { v15.b }[4], [x15], #0x1\n"
+ "ld1 { v25.b }[4], [x14], #0x1\n"
+ "ld1 { v7.b }[4], [x13], #0x1\n"
+ "ld1 { v8.b }[4], [x12], #0x1\n"
+ "ld1 { v26.b }[4], [x10], #0x1\n"
+ "ld1 { v23.b }[4], [x9], #0x1\n"
+ "ld1 { v3.b }[4], [x26], #0x1\n"
+ "ld1 { v10.b }[4], [x21], #0x1\n"
+ "b 11f\n"
+ "9:" // Oddments: Load (A): Bit 3: Unset: Bit 2: Unset
+ "tbz %x[n_channels], #1, 10f\n"
+ "ldr h15, [x15], #0x2\n"
+ "ldr h25, [x14], #0x2\n"
+ "ldr h7, [x13], #0x2\n"
+ "ldr h8, [x12], #0x2\n"
+ "ldr h26, [x10], #0x2\n"
+ "ldr h23, [x9], #0x2\n"
+ "ldr h3, [x26], #0x2\n"
+ "ldr h10, [x21], #0x2\n"
+ "tbz %x[n_channels], #0, 11f\n"
+ "ld1 { v15.b }[2], [x15], #0x1\n"
+ "ld1 { v25.b }[2], [x14], #0x1\n"
+ "ld1 { v7.b }[2], [x13], #0x1\n"
+ "ld1 { v8.b }[2], [x12], #0x1\n"
+ "ld1 { v26.b }[2], [x10], #0x1\n"
+ "ld1 { v23.b }[2], [x9], #0x1\n"
+ "ld1 { v3.b }[2], [x26], #0x1\n"
+ "ld1 { v10.b }[2], [x21], #0x1\n"
+ "b 11f\n"
+ "10:" // Oddments: Load (A): Bit 3: Unset: Bit 2: Unset: Bit 1: Unset
+ "ldr b15, [x15], #0x1\n"
+ "ldr b25, [x14], #0x1\n"
+ "ldr b7, [x13], #0x1\n"
+ "ldr b8, [x12], #0x1\n"
+ "ldr b26, [x10], #0x1\n"
+ "ldr b23, [x9], #0x1\n"
+ "ldr b3, [x26], #0x1\n"
+ "ldr b10, [x21], #0x1\n"
+ "11:" // Oddments: Load (A): Bit 3: End
+ "ldp x15, x14, [%x[inptrs], #0x40]\n"
+ "ldp x13, x12, [%x[inptrs], #0x50]\n"
+ "add x15, x15, x28\n"
+ "add x14, x14, x28\n"
+ "ldp x10, x9, [%x[inptrs], #0x60]\n"
+ "ldp x26, x21, [%x[inptrs], #0x70]\n"
+ "add x13, x13, x28\n"
+ "add x12, x12, x28\n"
+ "add x10, x10, x28\n"
+ "add x9, x9, x28\n"
+ "add x26, x26, x28\n"
+ "add x21, x21, x28\n"
+ "tbz %x[n_channels], #3, 15f\n"
+ "ldr d22, [x15], #0x8\n"
+ "ldr d19, [x14], #0x8\n"
+ "ldr d0, [x13], #0x8\n"
+ "ldr d5, [x12], #0x8\n"
+ "ldr d27, [x10], #0x8\n"
+ "ldr d24, [x9], #0x8\n"
+ "ldr d2, [x26], #0x8\n"
+ "ldr d9, [x21], #0x8\n"
+ "tbz %x[n_channels], #2, 13f\n"
+ "ld1 { v22.s }[2], [x15], #0x4\n"
+ "ld1 { v19.s }[2], [x14], #0x4\n"
+ "ld1 { v0.s }[2], [x13], #0x4\n"
+ "ld1 { v5.s }[2], [x12], #0x4\n"
+ "ld1 { v27.s }[2], [x10], #0x4\n"
+ "ld1 { v24.s }[2], [x9], #0x4\n"
+ "ld1 { v2.s }[2], [x26], #0x4\n"
+ "ld1 { v9.s }[2], [x21], #0x4\n"
+ "tbz %x[n_channels], #1, 12f\n"
+ "ld1 { v22.h }[6], [x15], #0x2\n"
+ "ld1 { v19.h }[6], [x14], #0x2\n"
+ "ld1 { v0.h }[6], [x13], #0x2\n"
+ "ld1 { v5.h }[6], [x12], #0x2\n"
+ "ld1 { v27.h }[6], [x10], #0x2\n"
+ "ld1 { v24.h }[6], [x9], #0x2\n"
+ "ld1 { v2.h }[6], [x26], #0x2\n"
+ "ld1 { v9.h }[6], [x21], #0x2\n"
+ "tbz %x[n_channels], #0, 19f\n"
+ "ld1 { v22.b }[14], [x15], #0x1\n"
+ "ld1 { v19.b }[14], [x14], #0x1\n"
+ "ld1 { v0.b }[14], [x13], #0x1\n"
+ "ld1 { v5.b }[14], [x12], #0x1\n"
+ "ld1 { v27.b }[14], [x10], #0x1\n"
+ "ld1 { v24.b }[14], [x9], #0x1\n"
+ "ld1 { v2.b }[14], [x26], #0x1\n"
+ "ld1 { v9.b }[14], [x21], #0x1\n"
+ "b 19f\n"
+ "12:" // Oddments: Load (B): Bit 3: Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 19f\n"
+ "ld1 { v22.b }[12], [x15], #0x1\n"
+ "ld1 { v19.b }[12], [x14], #0x1\n"
+ "ld1 { v0.b }[12], [x13], #0x1\n"
+ "ld1 { v5.b }[12], [x12], #0x1\n"
+ "ld1 { v27.b }[12], [x10], #0x1\n"
+ "ld1 { v24.b }[12], [x9], #0x1\n"
+ "ld1 { v2.b }[12], [x26], #0x1\n"
+ "ld1 { v9.b }[12], [x21], #0x1\n"
+ "b 19f\n"
+ "13:" // Oddments: Load (B): Bit 3: Bit 2: Unset
+ "tbz %x[n_channels], #1, 14f\n"
+ "ld1 { v22.h }[4], [x15], #0x2\n"
+ "ld1 { v19.h }[4], [x14], #0x2\n"
+ "ld1 { v0.h }[4], [x13], #0x2\n"
+ "ld1 { v5.h }[4], [x12], #0x2\n"
+ "ld1 { v27.h }[4], [x10], #0x2\n"
+ "ld1 { v24.h }[4], [x9], #0x2\n"
+ "ld1 { v2.h }[4], [x26], #0x2\n"
+ "ld1 { v9.h }[4], [x21], #0x2\n"
+ "tbz %x[n_channels], #0, 19f\n"
+ "ld1 { v22.b }[10], [x15], #0x1\n"
+ "ld1 { v19.b }[10], [x14], #0x1\n"
+ "ld1 { v0.b }[10], [x13], #0x1\n"
+ "ld1 { v5.b }[10], [x12], #0x1\n"
+ "ld1 { v27.b }[10], [x10], #0x1\n"
+ "ld1 { v24.b }[10], [x9], #0x1\n"
+ "ld1 { v2.b }[10], [x26], #0x1\n"
+ "ld1 { v9.b }[10], [x21], #0x1\n"
+ "b 19f\n"
+ "14:" // Oddments: Load (B): Bit 3: Bit 2: Unset: Bit 1: Unset
+ "tbz %x[n_channels], #0, 19f\n"
+ "ld1 { v22.b }[8], [x15], #0x1\n"
+ "ld1 { v19.b }[8], [x14], #0x1\n"
+ "ld1 { v0.b }[8], [x13], #0x1\n"
+ "ld1 { v5.b }[8], [x12], #0x1\n"
+ "ld1 { v27.b }[8], [x10], #0x1\n"
+ "ld1 { v24.b }[8], [x9], #0x1\n"
+ "ld1 { v2.b }[8], [x26], #0x1\n"
+ "ld1 { v9.b }[8], [x21], #0x1\n"
+ "b 19f\n"
+ "15:" // Oddments: Load (B): Bit 3: Unset
+ "tbz %x[n_channels], #2, 17f\n"
+ "ldr s22, [x15], #0x4\n"
+ "ldr s19, [x14], #0x4\n"
+ "ldr s0, [x13], #0x4\n"
+ "ldr s5, [x12], #0x4\n"
+ "ldr s27, [x10], #0x4\n"
+ "ldr s24, [x9], #0x4\n"
+ "ldr s2, [x26], #0x4\n"
+ "ldr s9, [x21], #0x4\n"
+ "tbz %x[n_channels], #1, 16f\n"
+ "ld1 { v22.h }[2], [x15], #0x2\n"
+ "ld1 { v19.h }[2], [x14], #0x2\n"
+ "ld1 { v0.h }[2], [x13], #0x2\n"
+ "ld1 { v5.h }[2], [x12], #0x2\n"
+ "ld1 { v27.h }[2], [x10], #0x2\n"
+ "ld1 { v24.h }[2], [x9], #0x2\n"
+ "ld1 { v2.h }[2], [x26], #0x2\n"
+ "ld1 { v9.h }[2], [x21], #0x2\n"
+ "tbz %x[n_channels], #0, 19f\n"
+ "ld1 { v22.b }[6], [x15], #0x1\n"
+ "ld1 { v19.b }[6], [x14], #0x1\n"
+ "ld1 { v0.b }[6], [x13], #0x1\n"
+ "ld1 { v5.b }[6], [x12], #0x1\n"
+ "ld1 { v27.b }[6], [x10], #0x1\n"
+ "ld1 { v24.b }[6], [x9], #0x1\n"
+ "ld1 { v2.b }[6], [x26], #0x1\n"
+ "ld1 { v9.b }[6], [x21], #0x1\n"
+ "b 19f\n"
+ "16:" // Oddments: Load (B): Bit 3: Unset: Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 19f\n"
+ "ld1 { v22.b }[4], [x15], #0x1\n"
+ "ld1 { v19.b }[4], [x14], #0x1\n"
+ "ld1 { v0.b }[4], [x13], #0x1\n"
+ "ld1 { v5.b }[4], [x12], #0x1\n"
+ "ld1 { v27.b }[4], [x10], #0x1\n"
+ "ld1 { v24.b }[4], [x9], #0x1\n"
+ "ld1 { v2.b }[4], [x26], #0x1\n"
+ "ld1 { v9.b }[4], [x21], #0x1\n"
+ "b 19f\n"
+ "17:" // Oddments: Load (B): Bit 3: Unset: Bit 2: Unset
+ "tbz %x[n_channels], #1, 18f\n"
+ "ldr h22, [x15], #0x2\n"
+ "ldr h19, [x14], #0x2\n"
+ "ldr h0, [x13], #0x2\n"
+ "ldr h5, [x12], #0x2\n"
+ "ldr h27, [x10], #0x2\n"
+ "ldr h24, [x9], #0x2\n"
+ "ldr h2, [x26], #0x2\n"
+ "ldr h9, [x21], #0x2\n"
+ "tbz %x[n_channels], #0, 19f\n"
+ "ld1 { v22.b }[2], [x15], #0x1\n"
+ "ld1 { v19.b }[2], [x14], #0x1\n"
+ "ld1 { v0.b }[2], [x13], #0x1\n"
+ "ld1 { v5.b }[2], [x12], #0x1\n"
+ "ld1 { v27.b }[2], [x10], #0x1\n"
+ "ld1 { v24.b }[2], [x9], #0x1\n"
+ "ld1 { v2.b }[2], [x26], #0x1\n"
+ "ld1 { v9.b }[2], [x21], #0x1\n"
+ "b 19f\n"
+ "18:" // Oddments: Load (B): Bit 3: Unset: Bit 2: Unset: Bit 1: Unset
+ "ldr b22, [x15], #0x1\n"
+ "ldr b19, [x14], #0x1\n"
+ "ldr b0, [x13], #0x1\n"
+ "ldr b5, [x12], #0x1\n"
+ "ldr b27, [x10], #0x1\n"
+ "ldr b24, [x9], #0x1\n"
+ "ldr b2, [x26], #0x1\n"
+ "ldr b9, [x21], #0x1\n"
+ "19:" // Oddments: Load (B): Bit 3: End
+ "ldr q20, [%x[params], #0x10]\n"
+ "ldr q6, [%x[params], #0x20]\n"
+ "zip2 v1.16b, v26.16b, v3.16b\n"
+ "zip1 v26.16b, v26.16b, v3.16b\n"
+ "ldr q4, [%x[params], #0x30]\n"
+ "zip1 v18.16b, v23.16b, v10.16b\n"
+ "zip2 v30.16b, v15.16b, v7.16b\n"
+ "cmp x20, #0x4\n"
+ "zip1 v15.16b, v15.16b, v7.16b\n"
+ "zip1 v29.16b, v25.16b, v8.16b\n"
+ "zip2 v8.16b, v25.16b, v8.16b\n"
+ "zip2 v10.16b, v23.16b, v10.16b\n"
+ "zip2 v23.16b, v26.16b, v18.16b\n"
+ "zip1 v26.16b, v26.16b, v18.16b\n"
+ "zip2 v28.16b, v22.16b, v0.16b\n"
+ "zip1 v22.16b, v22.16b, v0.16b\n"
+ "zip1 v21.16b, v19.16b, v5.16b\n"
+ "movi v17.4s, #0x0\n"
+ ".inst 0x6e9a9591 // udot v17.4s, v12.16b, v26.16b\n"
+ "zip2 v25.16b, v15.16b, v29.16b\n"
+ "zip1 v15.16b, v15.16b, v29.16b\n"
+ "zip1 v7.16b, v30.16b, v8.16b\n"
+ "zip2 v8.16b, v30.16b, v8.16b\n"
+ "ldr q31, [%x[params], #0x0]\n"
+ "zip2 v5.16b, v19.16b, v5.16b\n"
+ "zip2 v30.16b, v27.16b, v2.16b\n"
+ "zip1 v27.16b, v27.16b, v2.16b\n"
+ "zip1 v18.16b, v24.16b, v9.16b\n"
+ "zip2 v9.16b, v24.16b, v9.16b\n"
+ "zip2 v19.16b, v22.16b, v21.16b\n"
+ "zip1 v22.16b, v22.16b, v21.16b\n"
+ "zip1 v3.16b, v1.16b, v10.16b\n"
+ ".inst 0x6e969591 // udot v17.4s, v12.16b, v22.16b\n"
+ "zip2 v10.16b, v1.16b, v10.16b\n"
+ "zip1 v0.16b, v28.16b, v5.16b\n"
+ "zip2 v5.16b, v28.16b, v5.16b\n"
+ "zip2 v24.16b, v27.16b, v18.16b\n"
+ "zip1 v27.16b, v27.16b, v18.16b\n"
+ "zip1 v2.16b, v30.16b, v9.16b\n"
+ "mov v18.16b, v17.16b\n .inst 0x6e9b9592 // udot v18.4s, v12.16b, v27.16b\n"
+ "zip2 v9.16b, v30.16b, v9.16b\n"
+ "mov v30.16b, v31.16b\n"
+ ".inst 0x6e8f9591 // udot v17.4s, v12.16b, v15.16b\n"
+ "mov v29.16b, v31.16b\n"
+ "mov v28.16b, v31.16b\n"
+ ".inst 0x6e8f969f // udot v31.4s, v20.16b, v15.16b\n"
+ ".inst 0x6e9a969d // udot v29.4s, v20.16b, v26.16b\n"
+ ".inst 0x6e9a94df // udot v31.4s, v6.16b, v26.16b\n"
+ "ext v26.16b, v26.16b, v26.16b, #0x1\n"
+ "movi v1.4s, #0x0\n"
+ "ext v15.16b, v15.16b, v15.16b, #0x1\n"
+ ".inst 0x6e9a9581 // udot v1.4s, v12.16b, v26.16b\n"
+ ".inst 0x6e9694dd // udot v29.4s, v6.16b, v22.16b\n"
+ ".inst 0x6e96949f // udot v31.4s, v4.16b, v22.16b\n"
+ "ext v22.16b, v22.16b, v22.16b, #0x1\n"
+ ".inst 0x6e8f969e // udot v30.4s, v20.16b, v15.16b\n"
+ ".inst 0x6e9a969c // udot v28.4s, v20.16b, v26.16b\n"
+ "mls v31.4s, v17.4s, v16.4s\n"
+ ".inst 0x6e969581 // udot v1.4s, v12.16b, v22.16b\n"
+ ".inst 0x6e9b949d // udot v29.4s, v4.16b, v27.16b\n"
+ "ext v27.16b, v27.16b, v27.16b, #0x1\n"
+ ".inst 0x6e9a94de // udot v30.4s, v6.16b, v26.16b\n"
+ "ldr q21, [%x[params], #0x50]\n"
+ ".inst 0x6e9694dc // udot v28.4s, v6.16b, v22.16b\n"
+ "mls v29.4s, v18.4s, v16.4s\n"
+ "mov v20.16b, v1.16b\n .inst 0x6e9b9594 // udot v20.4s, v12.16b, v27.16b\n"
+ ".inst 0x6e8f9581 // udot v1.4s, v12.16b, v15.16b\n"
+ "ldr q18, [%x[params], #0x40]\n"
+ "sqrdmulh v31.4s, v31.4s, v18.4s\n"
+ ".inst 0x6e96949e // udot v30.4s, v4.16b, v22.16b\n"
+ ".inst 0x6e9b949c // udot v28.4s, v4.16b, v27.16b\n"
+ "mls v30.4s, v1.4s, v16.4s\n"
+ "add %x[params], %x[params], #0x60\n"
+ "mls v28.4s, v20.4s, v16.4s\n"
+ "and v17.16b, v31.16b, v21.16b\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sqrdmulh v30.4s, v30.4s, v18.4s\n"
+ "sqrdmulh v29.4s, v29.4s, v18.4s\n"
+ "sqrdmulh v28.4s, v28.4s, v18.4s\n"
+ "sqadd v31.4s, v31.4s, v17.4s\n"
+ "and v17.16b, v30.16b, v21.16b\n"
+ "and v18.16b, v29.16b, v21.16b\n"
+ "and v26.16b, v28.16b, v21.16b\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v26.4s, v26.4s, #0x1f\n"
+ "sqadd v30.4s, v30.4s, v17.4s\n"
+ "sqadd v29.4s, v29.4s, v18.4s\n"
+ "sqadd v28.4s, v28.4s, v26.4s\n"
+ "srshl v31.4s, v31.4s, v21.4s\n"
+ "srshl v30.4s, v30.4s, v21.4s\n"
+ "srshl v29.4s, v29.4s, v21.4s\n"
+ "srshl v28.4s, v28.4s, v21.4s\n"
+ "add v31.4s, v31.4s, v14.4s\n"
+ "add v30.4s, v30.4s, v14.4s\n"
+ "add v29.4s, v29.4s, v14.4s\n"
+ "add v28.4s, v28.4s, v14.4s\n"
+ "smax v31.4s, v31.4s, v13.4s\n"
+ "smax v30.4s, v30.4s, v13.4s\n"
+ "smax v29.4s, v29.4s, v13.4s\n"
+ "smax v28.4s, v28.4s, v13.4s\n"
+ "smin v31.4s, v31.4s, v11.4s\n"
+ "smin v30.4s, v30.4s, v11.4s\n"
+ "smin v29.4s, v29.4s, v11.4s\n"
+ "smin v28.4s, v28.4s, v11.4s\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "blt 20f\n"
+ "str s31, [x25, x27]\n"
+ "str s30, [x24, x27]\n"
+ "str s29, [x23, x27]\n"
+ "str s28, [x22, x27]\n"
+ "b 23f\n"
+ "20:" // Oddments: Unroll 0: Oddment store
+ "add x25, x25, x27\n"
+ "add x24, x24, x27\n"
+ "add x23, x23, x27\n"
+ "add x22, x22, x27\n"
+ "tbz x20, #1, 21f\n"
+ "st1 { v31.h }[0], [x25], #0x2\n"
+ "st1 { v30.h }[0], [x24], #0x2\n"
+ "st1 { v29.h }[0], [x23], #0x2\n"
+ "st1 { v28.h }[0], [x22], #0x2\n"
+ "tbz x20, #0, 22f\n"
+ "st1 { v31.b }[2], [x25], #0x1\n"
+ "st1 { v30.b }[2], [x24], #0x1\n"
+ "st1 { v29.b }[2], [x23], #0x1\n"
+ "st1 { v28.b }[2], [x22], #0x1\n"
+ "b 22f\n"
+ "21:" // Oddments: Unroll 0: Oddment store: Bit 1: Unset
+ "st1 { v31.b }[0], [x25], #0x1\n"
+ "st1 { v30.b }[0], [x24], #0x1\n"
+ "st1 { v29.b }[0], [x23], #0x1\n"
+ "st1 { v28.b }[0], [x22], #0x1\n"
+ "22:" // Oddments: Unroll 0: Oddment store: Bit 1: End
+ "23:" // Oddments: Unroll 0: After oddment store
+ "subs x20, x20, #0x4\n"
+ "add x27, x27, #0x4\n"
+ "ble 35f\n"
+ "ldr q31, [%x[params], #0x0]\n"
+ "ldr q27, [%x[params], #0x10]\n"
+ "movi v1.4s, #0x0\n"
+ ".inst 0x6e979581 // udot v1.4s, v12.16b, v23.16b\n"
+ "ldr q26, [%x[params], #0x20]\n"
+ "ldr q22, [%x[params], #0x30]\n"
+ "mov v30.16b, v31.16b\n"
+ "mov v29.16b, v31.16b\n"
+ "ldr q4, [%x[params], #0x40]\n"
+ "ldr q21, [%x[params], #0x50]\n"
+ "mov v28.16b, v31.16b\n"
+ ".inst 0x6e99977f // udot v31.4s, v27.16b, v25.16b\n"
+ ".inst 0x6e939581 // udot v1.4s, v12.16b, v19.16b\n"
+ ".inst 0x6e97977d // udot v29.4s, v27.16b, v23.16b\n"
+ "movi v20.4s, #0x0\n"
+ "cmp x20, #0x4\n"
+ ".inst 0x6e97975f // udot v31.4s, v26.16b, v23.16b\n"
+ "mov v18.16b, v1.16b\n .inst 0x6e989592 // udot v18.4s, v12.16b, v24.16b\n"
+ "ext v23.16b, v23.16b, v23.16b, #0x1\n"
+ "add %x[params], %x[params], #0x60\n"
+ ".inst 0x6e999581 // udot v1.4s, v12.16b, v25.16b\n"
+ "ext v25.16b, v25.16b, v25.16b, #0x1\n"
+ ".inst 0x6e99977e // udot v30.4s, v27.16b, v25.16b\n"
+ ".inst 0x6e97977c // udot v28.4s, v27.16b, v23.16b\n"
+ ".inst 0x6e979594 // udot v20.4s, v12.16b, v23.16b\n"
+ ".inst 0x6e93975d // udot v29.4s, v26.16b, v19.16b\n"
+ ".inst 0x6e9396df // udot v31.4s, v22.16b, v19.16b\n"
+ "ext v19.16b, v19.16b, v19.16b, #0x1\n"
+ ".inst 0x6e97975e // udot v30.4s, v26.16b, v23.16b\n"
+ ".inst 0x6e93975c // udot v28.4s, v26.16b, v19.16b\n"
+ "mls v31.4s, v1.4s, v16.4s\n"
+ ".inst 0x6e939594 // udot v20.4s, v12.16b, v19.16b\n"
+ ".inst 0x6e9896dd // udot v29.4s, v22.16b, v24.16b\n"
+ "ext v24.16b, v24.16b, v24.16b, #0x1\n"
+ ".inst 0x6e9396de // udot v30.4s, v22.16b, v19.16b\n"
+ ".inst 0x6e9896dc // udot v28.4s, v22.16b, v24.16b\n"
+ "sqrdmulh v31.4s, v31.4s, v4.4s\n"
+ "mov v17.16b, v20.16b\n .inst 0x6e989591 // udot v17.4s, v12.16b, v24.16b\n"
+ ".inst 0x6e999594 // udot v20.4s, v12.16b, v25.16b\n"
+ "mls v30.4s, v20.4s, v16.4s\n"
+ "mls v29.4s, v18.4s, v16.4s\n"
+ "mls v28.4s, v17.4s, v16.4s\n"
+ "and v17.16b, v31.16b, v21.16b\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sqrdmulh v30.4s, v30.4s, v4.4s\n"
+ "sqrdmulh v29.4s, v29.4s, v4.4s\n"
+ "sqrdmulh v28.4s, v28.4s, v4.4s\n"
+ "sqadd v31.4s, v31.4s, v17.4s\n"
+ "and v19.16b, v30.16b, v21.16b\n"
+ "and v18.16b, v29.16b, v21.16b\n"
+ "and v17.16b, v28.16b, v21.16b\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sqadd v30.4s, v30.4s, v19.4s\n"
+ "sqadd v29.4s, v29.4s, v18.4s\n"
+ "sqadd v28.4s, v28.4s, v17.4s\n"
+ "srshl v31.4s, v31.4s, v21.4s\n"
+ "srshl v30.4s, v30.4s, v21.4s\n"
+ "srshl v29.4s, v29.4s, v21.4s\n"
+ "srshl v28.4s, v28.4s, v21.4s\n"
+ "add v31.4s, v31.4s, v14.4s\n"
+ "add v30.4s, v30.4s, v14.4s\n"
+ "add v29.4s, v29.4s, v14.4s\n"
+ "add v28.4s, v28.4s, v14.4s\n"
+ "smax v31.4s, v31.4s, v13.4s\n"
+ "smax v30.4s, v30.4s, v13.4s\n"
+ "smax v29.4s, v29.4s, v13.4s\n"
+ "smax v28.4s, v28.4s, v13.4s\n"
+ "smin v31.4s, v31.4s, v11.4s\n"
+ "smin v30.4s, v30.4s, v11.4s\n"
+ "smin v29.4s, v29.4s, v11.4s\n"
+ "smin v28.4s, v28.4s, v11.4s\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "blt 24f\n"
+ "str s31, [x25, x27]\n"
+ "str s30, [x24, x27]\n"
+ "str s29, [x23, x27]\n"
+ "str s28, [x22, x27]\n"
+ "b 27f\n"
+ "24:" // Oddments: Unroll 1: Oddment store
+ "add x25, x25, x27\n"
+ "add x24, x24, x27\n"
+ "add x23, x23, x27\n"
+ "add x22, x22, x27\n"
+ "tbz x20, #1, 25f\n"
+ "st1 { v31.h }[0], [x25], #0x2\n"
+ "st1 { v30.h }[0], [x24], #0x2\n"
+ "st1 { v29.h }[0], [x23], #0x2\n"
+ "st1 { v28.h }[0], [x22], #0x2\n"
+ "tbz x20, #0, 26f\n"
+ "st1 { v31.b }[2], [x25], #0x1\n"
+ "st1 { v30.b }[2], [x24], #0x1\n"
+ "st1 { v29.b }[2], [x23], #0x1\n"
+ "st1 { v28.b }[2], [x22], #0x1\n"
+ "b 26f\n"
+ "25:" // Oddments: Unroll 1: Oddment store: Bit 1: Unset
+ "st1 { v31.b }[0], [x25], #0x1\n"
+ "st1 { v30.b }[0], [x24], #0x1\n"
+ "st1 { v29.b }[0], [x23], #0x1\n"
+ "st1 { v28.b }[0], [x22], #0x1\n"
+ "26:" // Oddments: Unroll 1: Oddment store: Bit 1: End
+ "27:" // Oddments: Unroll 1: After oddment store
+ "subs x20, x20, #0x4\n"
+ "add x27, x27, #0x4\n"
+ "ble 35f\n"
+ "ldr q31, [%x[params], #0x0]\n"
+ "ldr q25, [%x[params], #0x10]\n"
+ "movi v24.4s, #0x0\n"
+ ".inst 0x6e839598 // udot v24.4s, v12.16b, v3.16b\n"
+ "ldr q23, [%x[params], #0x20]\n"
+ "ldr q22, [%x[params], #0x30]\n"
+ "mov v30.16b, v31.16b\n"
+ "mov v29.16b, v31.16b\n"
+ "ldr q21, [%x[params], #0x40]\n"
+ "ldr q20, [%x[params], #0x50]\n"
+ "mov v28.16b, v31.16b\n"
+ ".inst 0x6e87973f // udot v31.4s, v25.16b, v7.16b\n"
+ ".inst 0x6e809598 // udot v24.4s, v12.16b, v0.16b\n"
+ ".inst 0x6e83973d // udot v29.4s, v25.16b, v3.16b\n"
+ "movi v19.4s, #0x0\n"
+ "cmp x20, #0x4\n"
+ ".inst 0x6e8396ff // udot v31.4s, v23.16b, v3.16b\n"
+ "mov v18.16b, v24.16b\n .inst 0x6e829592 // udot v18.4s, v12.16b, v2.16b\n"
+ "ext v3.16b, v3.16b, v3.16b, #0x1\n"
+ "add %x[params], %x[params], #0x60\n"
+ ".inst 0x6e879598 // udot v24.4s, v12.16b, v7.16b\n"
+ "ext v7.16b, v7.16b, v7.16b, #0x1\n"
+ ".inst 0x6e87973e // udot v30.4s, v25.16b, v7.16b\n"
+ ".inst 0x6e83973c // udot v28.4s, v25.16b, v3.16b\n"
+ ".inst 0x6e839593 // udot v19.4s, v12.16b, v3.16b\n"
+ ".inst 0x6e8096fd // udot v29.4s, v23.16b, v0.16b\n"
+ ".inst 0x6e8096df // udot v31.4s, v22.16b, v0.16b\n"
+ "ext v0.16b, v0.16b, v0.16b, #0x1\n"
+ ".inst 0x6e8396fe // udot v30.4s, v23.16b, v3.16b\n"
+ ".inst 0x6e8096fc // udot v28.4s, v23.16b, v0.16b\n"
+ "mls v31.4s, v24.4s, v16.4s\n"
+ ".inst 0x6e809593 // udot v19.4s, v12.16b, v0.16b\n"
+ ".inst 0x6e8296dd // udot v29.4s, v22.16b, v2.16b\n"
+ "ext v2.16b, v2.16b, v2.16b, #0x1\n"
+ ".inst 0x6e8096de // udot v30.4s, v22.16b, v0.16b\n"
+ ".inst 0x6e8296dc // udot v28.4s, v22.16b, v2.16b\n"
+ "sqrdmulh v31.4s, v31.4s, v21.4s\n"
+ "mov v17.16b, v19.16b\n .inst 0x6e829591 // udot v17.4s, v12.16b, v2.16b\n"
+ ".inst 0x6e879593 // udot v19.4s, v12.16b, v7.16b\n"
+ "mls v30.4s, v19.4s, v16.4s\n"
+ "mls v29.4s, v18.4s, v16.4s\n"
+ "mls v28.4s, v17.4s, v16.4s\n"
+ "and v17.16b, v31.16b, v20.16b\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sqrdmulh v30.4s, v30.4s, v21.4s\n"
+ "sqrdmulh v29.4s, v29.4s, v21.4s\n"
+ "sqrdmulh v28.4s, v28.4s, v21.4s\n"
+ "sqadd v31.4s, v31.4s, v17.4s\n"
+ "and v19.16b, v30.16b, v20.16b\n"
+ "and v18.16b, v29.16b, v20.16b\n"
+ "and v17.16b, v28.16b, v20.16b\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sqadd v30.4s, v30.4s, v19.4s\n"
+ "sqadd v29.4s, v29.4s, v18.4s\n"
+ "sqadd v28.4s, v28.4s, v17.4s\n"
+ "srshl v31.4s, v31.4s, v20.4s\n"
+ "srshl v30.4s, v30.4s, v20.4s\n"
+ "srshl v29.4s, v29.4s, v20.4s\n"
+ "srshl v28.4s, v28.4s, v20.4s\n"
+ "add v31.4s, v31.4s, v14.4s\n"
+ "add v30.4s, v30.4s, v14.4s\n"
+ "add v29.4s, v29.4s, v14.4s\n"
+ "add v28.4s, v28.4s, v14.4s\n"
+ "smax v31.4s, v31.4s, v13.4s\n"
+ "smax v30.4s, v30.4s, v13.4s\n"
+ "smax v29.4s, v29.4s, v13.4s\n"
+ "smax v28.4s, v28.4s, v13.4s\n"
+ "smin v31.4s, v31.4s, v11.4s\n"
+ "smin v30.4s, v30.4s, v11.4s\n"
+ "smin v29.4s, v29.4s, v11.4s\n"
+ "smin v28.4s, v28.4s, v11.4s\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "blt 28f\n"
+ "str s31, [x25, x27]\n"
+ "str s30, [x24, x27]\n"
+ "str s29, [x23, x27]\n"
+ "str s28, [x22, x27]\n"
+ "b 31f\n"
+ "28:" // Oddments: Unroll 2: Oddment store
+ "add x25, x25, x27\n"
+ "add x24, x24, x27\n"
+ "add x23, x23, x27\n"
+ "add x22, x22, x27\n"
+ "tbz x20, #1, 29f\n"
+ "st1 { v31.h }[0], [x25], #0x2\n"
+ "st1 { v30.h }[0], [x24], #0x2\n"
+ "st1 { v29.h }[0], [x23], #0x2\n"
+ "st1 { v28.h }[0], [x22], #0x2\n"
+ "tbz x20, #0, 30f\n"
+ "st1 { v31.b }[2], [x25], #0x1\n"
+ "st1 { v30.b }[2], [x24], #0x1\n"
+ "st1 { v29.b }[2], [x23], #0x1\n"
+ "st1 { v28.b }[2], [x22], #0x1\n"
+ "b 30f\n"
+ "29:" // Oddments: Unroll 2: Oddment store: Bit 1: Unset
+ "st1 { v31.b }[0], [x25], #0x1\n"
+ "st1 { v30.b }[0], [x24], #0x1\n"
+ "st1 { v29.b }[0], [x23], #0x1\n"
+ "st1 { v28.b }[0], [x22], #0x1\n"
+ "30:" // Oddments: Unroll 2: Oddment store: Bit 1: End
+ "31:" // Oddments: Unroll 2: After oddment store
+ "subs x20, x20, #0x4\n"
+ "add x27, x27, #0x4\n"
+ "ble 35f\n"
+ "ldr q31, [%x[params], #0x0]\n"
+ "ldr q23, [%x[params], #0x10]\n"
+ "movi v22.4s, #0x0\n"
+ ".inst 0x6e8a9596 // udot v22.4s, v12.16b, v10.16b\n"
+ "ldr q21, [%x[params], #0x20]\n"
+ "ldr q19, [%x[params], #0x30]\n"
+ "mov v30.16b, v31.16b\n"
+ "mov v29.16b, v31.16b\n"
+ "ldr q20, [%x[params], #0x40]\n"
+ "ldr q26, [%x[params], #0x50]\n"
+ "mov v28.16b, v31.16b\n"
+ ".inst 0x6e8896ff // udot v31.4s, v23.16b, v8.16b\n"
+ ".inst 0x6e859596 // udot v22.4s, v12.16b, v5.16b\n"
+ ".inst 0x6e8a96fd // udot v29.4s, v23.16b, v10.16b\n"
+ "movi v18.4s, #0x0\n"
+ "add %x[params], %x[params], #0x60\n"
+ ".inst 0x6e8a96bf // udot v31.4s, v21.16b, v10.16b\n"
+ "mov v17.16b, v22.16b\n .inst 0x6e899591 // udot v17.4s, v12.16b, v9.16b\n"
+ "ext v10.16b, v10.16b, v10.16b, #0x1\n"
+ ".inst 0x6e889596 // udot v22.4s, v12.16b, v8.16b\n"
+ "ext v8.16b, v8.16b, v8.16b, #0x1\n"
+ ".inst 0x6e8896fe // udot v30.4s, v23.16b, v8.16b\n"
+ ".inst 0x6e8a96fc // udot v28.4s, v23.16b, v10.16b\n"
+ ".inst 0x6e8a9592 // udot v18.4s, v12.16b, v10.16b\n"
+ ".inst 0x6e8596bd // udot v29.4s, v21.16b, v5.16b\n"
+ ".inst 0x6e85967f // udot v31.4s, v19.16b, v5.16b\n"
+ "ext v5.16b, v5.16b, v5.16b, #0x1\n"
+ ".inst 0x6e8a96be // udot v30.4s, v21.16b, v10.16b\n"
+ ".inst 0x6e8596bc // udot v28.4s, v21.16b, v5.16b\n"
+ "mls v31.4s, v22.4s, v16.4s\n"
+ ".inst 0x6e859592 // udot v18.4s, v12.16b, v5.16b\n"
+ ".inst 0x6e89967d // udot v29.4s, v19.16b, v9.16b\n"
+ "ext v9.16b, v9.16b, v9.16b, #0x1\n"
+ ".inst 0x6e85967e // udot v30.4s, v19.16b, v5.16b\n"
+ ".inst 0x6e89967c // udot v28.4s, v19.16b, v9.16b\n"
+ "sqrdmulh v31.4s, v31.4s, v20.4s\n"
+ "mov v7.16b, v18.16b\n .inst 0x6e899587 // udot v7.4s, v12.16b, v9.16b\n"
+ ".inst 0x6e889592 // udot v18.4s, v12.16b, v8.16b\n"
+ "mls v30.4s, v18.4s, v16.4s\n"
+ "mls v29.4s, v17.4s, v16.4s\n"
+ "mls v28.4s, v7.4s, v16.4s\n"
+ "and v16.16b, v31.16b, v26.16b\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqrdmulh v30.4s, v30.4s, v20.4s\n"
+ "sqrdmulh v29.4s, v29.4s, v20.4s\n"
+ "sqrdmulh v28.4s, v28.4s, v20.4s\n"
+ "sqadd v31.4s, v31.4s, v16.4s\n"
+ "and v18.16b, v30.16b, v26.16b\n"
+ "and v17.16b, v29.16b, v26.16b\n"
+ "and v16.16b, v28.16b, v26.16b\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqadd v30.4s, v30.4s, v18.4s\n"
+ "sqadd v29.4s, v29.4s, v17.4s\n"
+ "sqadd v28.4s, v28.4s, v16.4s\n"
+ "srshl v31.4s, v31.4s, v26.4s\n"
+ "srshl v30.4s, v30.4s, v26.4s\n"
+ "srshl v29.4s, v29.4s, v26.4s\n"
+ "srshl v28.4s, v28.4s, v26.4s\n"
+ "add v31.4s, v31.4s, v14.4s\n"
+ "add v30.4s, v30.4s, v14.4s\n"
+ "add v29.4s, v29.4s, v14.4s\n"
+ "add v28.4s, v28.4s, v14.4s\n"
+ "smax v31.4s, v31.4s, v13.4s\n"
+ "smax v30.4s, v30.4s, v13.4s\n"
+ "smax v29.4s, v29.4s, v13.4s\n"
+ "smax v28.4s, v28.4s, v13.4s\n"
+ "smin v31.4s, v31.4s, v11.4s\n"
+ "smin v30.4s, v30.4s, v11.4s\n"
+ "smin v29.4s, v29.4s, v11.4s\n"
+ "smin v28.4s, v28.4s, v11.4s\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "32:" // Oddments: Unroll 3: Oddment store
+ "add x25, x25, x27\n"
+ "add x24, x24, x27\n"
+ "add x23, x23, x27\n"
+ "add x22, x22, x27\n"
+ "tbz x20, #1, 33f\n"
+ "st1 { v31.h }[0], [x25], #0x2\n"
+ "st1 { v30.h }[0], [x24], #0x2\n"
+ "st1 { v29.h }[0], [x23], #0x2\n"
+ "st1 { v28.h }[0], [x22], #0x2\n"
+ "tbz x20, #0, 34f\n"
+ "st1 { v31.b }[2], [x25], #0x1\n"
+ "st1 { v30.b }[2], [x24], #0x1\n"
+ "st1 { v29.b }[2], [x23], #0x1\n"
+ "st1 { v28.b }[2], [x22], #0x1\n"
+ "b 34f\n"
+ "33:" // Oddments: Unroll 3: Oddment store: Bit 1: Unset
+ "st1 { v31.b }[0], [x25], #0x1\n"
+ "st1 { v30.b }[0], [x24], #0x1\n"
+ "st1 { v29.b }[0], [x23], #0x1\n"
+ "st1 { v28.b }[0], [x22], #0x1\n"
+ "34:" // Oddments: Unroll 3: Oddment store: Bit 1: End
+ "35:" // End
+ : [params] "+&r" (params)
+ : [inptrs] "r" (inptrs), [n_channels] "r" (n_channels), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [outptrs] "r" (outptrs), [qp] "r" (&qp)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
new file mode 100644
index 0000000000..5ae0be1054
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2021-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "utils.hpp"
+#include "src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl(  // asm kernel entry point (defined in generic.cpp)
+  const unsigned int,                                    // n_channels
+  const uint8_t *const *const,                           // inptrs: input row pointers
+  const uint8_t *const,                                  // weights
+  const int32_t *const,                                  // bias
+  const arm_gemm::Requantize32 &,                        // quantization parameters (offsets, min/max)
+  const int32_t *const,                                  // requant_muls: per-channel multipliers
+  const int32_t *const,                                  // requant_shifts: per-channel shifts
+  uint8_t *const *const                                  // outptrs: output row pointers
+);
+
+class a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy<uint8_t, uint8_t, uint8_t, int32_t>  // strategy descriptor: u8 in/weights/out, s32 accumulators
+{
+  using Parent = DepthwiseDepthfirstStrategy<uint8_t, uint8_t, uint8_t, int32_t>;
+
+  public:
+  constexpr static unsigned int kernel_rows = 3;  // 3x3 convolution window
+  constexpr static unsigned int kernel_cols = 3;
+
+  constexpr static unsigned int stride_rows = 1;  // unit stride in both dimensions
+  constexpr static unsigned int stride_cols = 1;
+
+  a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst(const CPUInfo *) : Parent(2, 2, 3, 3, 1, 1) {}  // args presumably (out_rows, out_cols, kern_rows, kern_cols, stride_rows, stride_cols) — matches the constants above
+
+  arm_gemm::VLType get_vl_type(void) const override { return arm_gemm::VLType::None; }  // fixed-length NEON kernel (no scalable vectors)
+
+  Parent::KernelType kernel = a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl;  // hand-written asm implementation
+  Parent::KernelType get_kernel(void) const override { return kernel; }
+  unsigned int get_accumulator_depth_vl(void) const override { return 2; }  // two vector-lengths of channels per accumulator pass — confirm against impl
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp
new file mode 100644
index 0000000000..d5b55cb9c5
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp
@@ -0,0 +1,1166 @@
+/*
+ * Copyright (c) 2021-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_gemm.hpp"
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl(
+ const unsigned int n_channels,
+ const uint8_t *const *const inptrs,
+ const uint8_t *const weights,
+ const int32_t *const bias,
+ const arm_gemm::Requantize32 &qp,
+ const int32_t *const requant_muls,
+ const int32_t *const requant_shifts,
+ uint8_t *const *const outptrs
+)
+{
+  struct Params  // argument block read by the inline-asm kernel via [%x[params], #offset] loads
+  {
+    uint64_t n_channels;
+    const void *weights;
+    const int32_t *bias;
+    const arm_gemm::Requantize32 *requant;  // quantization offsets and clamp bounds
+    const int32_t *const requant_muls;      // per-channel requantize multipliers
+    const int32_t *const requant_shifts;    // per-channel requantize shifts
+    uint8_t *const *const outptrs;
+    const uint8_t *inptrs[16];              // input row pointers, permuted into the kernel's access order
+
+    Params(
+      long unsigned int n_channels,
+      const uint8_t *const *inptrs_raw,
+      const void *const weights,
+      const int32_t *const bias,
+      const arm_gemm::Requantize32 &qp,
+      const int32_t *const requant_muls,
+      const int32_t *const requant_shifts,
+      uint8_t *const *outptrs
+    ) : n_channels(n_channels), weights(weights), bias(bias),
+        requant(&qp), requant_muls(requant_muls),
+        requant_shifts(requant_shifts), outptrs(outptrs)
+    {
+      inptrs[0] = inptrs_raw[5];   // fixed permutation matching the order the asm body consumes rows in
+      inptrs[1] = inptrs_raw[0];
+      inptrs[2] = inptrs_raw[3];
+      inptrs[3] = inptrs_raw[6];
+      inptrs[4] = inptrs_raw[9];
+      inptrs[5] = inptrs_raw[12];
+      inptrs[6] = inptrs_raw[15];
+      inptrs[7] = inptrs_raw[1];
+      inptrs[8] = inptrs_raw[2];
+      inptrs[9] = inptrs_raw[10];
+      inptrs[10] = inptrs_raw[4];
+      inptrs[11] = inptrs_raw[7];
+      inptrs[12] = inptrs_raw[8];
+      inptrs[13] = inptrs_raw[11];
+      inptrs[14] = inptrs_raw[13];
+      inptrs[15] = inptrs_raw[14];
+
+    }
+  };
+
+ const Params params(n_channels, inptrs, weights, bias, qp,
+ requant_muls, requant_shifts, outptrs);
+
+ __asm__ __volatile__(
+ "ldr x7, [%x[params], %[offsetof_Params_n_channels]]\n"
+ "ldr x23, [%x[params], %[offsetof_Params_requant]]\n"
+ "lsr x8, x7, #0x3\n"
+ "add x20, x23, %[offsetof_Requantize32_a_offset]\n"
+ "ld1r { v14.16b }, [x20]\n"
+ "ldr x22, [%x[params], %[offsetof_Params_outptrs]]\n"
+ "add x21, x23, %[offsetof_Requantize32_b_offset]\n"
+ "add x20, x23, %[offsetof_Requantize32_c_offset]\n"
+ "ld1r { v19.16b }, [x21]\n"
+ "ld1r { v13.8h }, [x20]\n"
+ "add x21, x23, %[offsetof_Requantize32_minval]\n"
+ "add x20, x23, %[offsetof_Requantize32_maxval]\n"
+ "ld1r { v29.8h }, [x21]\n"
+ "ld1r { v12.8h }, [x20]\n"
+ "mov x17, #0x0\n"
+ "mov x16, #0x0\n"
+ "add x15, %x[params], %[offsetof_Params_inptrs]\n"
+ "ldr x14, [%x[params], %[offsetof_Params_weights]]\n"
+ "ldr x13, [%x[params], %[offsetof_Params_requant_muls]]\n"
+ "ldr x12, [%x[params], %[offsetof_Params_requant_shifts]]\n"
+ "ldp x11, x10, [x22, #0x0]\n"
+ "ldp x9, x28, [x22, #0x10]\n"
+ "cbz x8, 3f\n"
+ "ldr d23, [x14, #0x0]\n"
+ "ldr d16, [x14, #0x8]\n"
+ "subs x8, x8, #0x1\n"
+ "usubl v23.8h, v23.8b, v19.8b\n"
+ "ldr d1, [x14, #0x10]\n"
+ "ldr d5, [x14, #0x18]\n"
+ "usubl v16.8h, v16.8b, v19.8b\n"
+ "usubl v1.8h, v1.8b, v19.8b\n"
+ "ldr d26, [x14, #0x20]\n"
+ "ldr d18, [x14, #0x28]\n"
+ "usubl v5.8h, v5.8b, v19.8b\n"
+ "usubl v26.8h, v26.8b, v19.8b\n"
+ "ldr d31, [x14, #0x30]\n"
+ "ldr d25, [x14, #0x38]\n"
+ "usubl v18.8h, v18.8b, v19.8b\n"
+ "usubl v31.8h, v31.8b, v19.8b\n"
+ "ldr d20, [x14, #0x40]\n"
+ "ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "usubl v25.8h, v25.8b, v19.8b\n"
+ "usubl v20.8h, v20.8b, v19.8b\n"
+ "ldr q9, [x20, #0x0]\n"
+ "ldr q24, [x20, #0x10]\n"
+ "add x20, x20, #0x20\n"
+ "str x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldp x23, x22, [x15, #0x0]\n"
+ "ldp x21, x20, [x15, #0x10]\n"
+ "mov v7.16b, v9.16b\n"
+ "mov v0.16b, v24.16b\n"
+ "ldr d22, [x23, x17]\n"
+ "ldr d4, [x22, x17]\n"
+ "mov v2.16b, v9.16b\n"
+ "mov v30.16b, v24.16b\n"
+ "ldr d8, [x21, x17]\n"
+ "ldr d27, [x20, x17]\n"
+ "mov v10.16b, v9.16b\n"
+ "mov v6.16b, v24.16b\n"
+ "ldr x20, [x15, #0x20]\n"
+ "ldr d15, [x20, x17]\n"
+ "usubl v22.8h, v22.8b, v14.8b\n"
+ "usubl v4.8h, v4.8b, v14.8b\n"
+ "usubl v8.8h, v8.8b, v14.8b\n"
+ "usubl v27.8h, v27.8b, v14.8b\n"
+ "usubl v15.8h, v15.8b, v14.8b\n"
+ "beq 2f\n"
+ "1:" // Loop
+ "ldr q3, [x13, #0x0]\n"
+ "ldr q17, [x12, #0x0]\n"
+ "smlal v9.4s, v22.4h, v26.4h\n"
+ "smlal2 v24.4s, v22.8h, v26.8h\n"
+ "ldr q21, [x13, #0x10]\n"
+ "ldr q28, [x12, #0x10]\n"
+ "smlal v9.4s, v4.4h, v23.4h\n"
+ "smlal v7.4s, v22.4h, v5.4h\n"
+ "ldr x20, [x15, #0x28]\n"
+ "ldr d11, [x20, x17]\n"
+ "smlal v2.4s, v22.4h, v16.4h\n"
+ "smlal v10.4s, v22.4h, v23.4h\n"
+ "smlal2 v24.4s, v4.8h, v23.8h\n"
+ "ldr x20, [x15, #0x38]\n"
+ "ldr d4, [x20, x17]\n"
+ "smlal v9.4s, v27.4h, v18.4h\n"
+ "smlal2 v0.4s, v22.8h, v5.8h\n"
+ "smlal2 v30.4s, v22.8h, v16.8h\n"
+ "ldr x20, [x15, #0x30]\n"
+ "usubl v11.8h, v11.8b, v14.8b\n"
+ "smlal2 v6.4s, v22.8h, v23.8h\n"
+ "ldr d22, [x20, x17]\n"
+ "smlal v7.4s, v8.4h, v1.4h\n"
+ "ldr x20, [x15, #0x40]\n"
+ "smlal v2.4s, v27.4h, v1.4h\n"
+ "smlal v10.4s, v27.4h, v16.4h\n"
+ "usubl v4.8h, v4.8b, v14.8b\n"
+ "ldr x27, [x15, #0x48]\n"
+ "smlal2 v24.4s, v27.8h, v18.8h\n"
+ "smlal v9.4s, v15.4h, v25.4h\n"
+ "usubl v22.8h, v22.8b, v14.8b\n"
+ "ldr x26, [x15, #0x50]\n"
+ "smlal2 v0.4s, v8.8h, v1.8h\n"
+ "ldr d8, [x20, x17]\n"
+ "smlal2 v30.4s, v27.8h, v1.8h\n"
+ "usubl v8.8h, v8.8b, v14.8b\n"
+ "smlal2 v6.4s, v27.8h, v16.8h\n"
+ "smlal v7.4s, v27.4h, v26.4h\n"
+ "ldr x25, [x15, #0x58]\n"
+ "ldr x24, [x15, #0x60]\n"
+ "smlal v2.4s, v11.4h, v31.4h\n"
+ "smlal v10.4s, v15.4h, v5.4h\n"
+ "ldr x23, [x15, #0x68]\n"
+ "ldr x22, [x15, #0x70]\n"
+ "smlal2 v24.4s, v15.8h, v25.8h\n"
+ "smlal v9.4s, v4.4h, v16.4h\n"
+ "ldr x21, [x15, #0x78]\n"
+ "ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "smlal2 v0.4s, v27.8h, v26.8h\n"
+ "ldr d27, [x27, x17]\n"
+ "smlal2 v30.4s, v11.8h, v31.8h\n"
+ "ldr d11, [x26, x17]\n"
+ "smlal2 v6.4s, v15.8h, v5.8h\n"
+ "smlal v7.4s, v15.4h, v31.4h\n"
+ "usubl v27.8h, v27.8b, v14.8b\n"
+ "add x14, x14, #0x48\n"
+ "smlal v2.4s, v15.4h, v26.4h\n"
+ "smlal v10.4s, v22.4h, v20.4h\n"
+ "usubl v11.8h, v11.8b, v14.8b\n"
+ "subs x8, x8, #0x1\n"
+ "smlal2 v24.4s, v4.8h, v16.8h\n"
+ "smlal v9.4s, v8.4h, v1.4h\n"
+ "add x13, x13, #0x20\n"
+ "add x12, x12, #0x20\n"
+ "smlal2 v0.4s, v15.8h, v31.8h\n"
+ "smlal2 v30.4s, v15.8h, v26.8h\n"
+ "ldr d15, [x25, x17]\n"
+ "usubl v15.8h, v15.8b, v14.8b\n"
+ "smlal2 v6.4s, v22.8h, v20.8h\n"
+ "ldr d22, [x24, x17]\n"
+ "smlal v7.4s, v4.4h, v23.4h\n"
+ "usubl v22.8h, v22.8b, v14.8b\n"
+ "smlal v2.4s, v27.4h, v18.4h\n"
+ "smlal v10.4s, v27.4h, v26.4h\n"
+ "smlal2 v24.4s, v8.8h, v1.8h\n"
+ "smlal v9.4s, v27.4h, v20.4h\n"
+ "smlal2 v0.4s, v4.8h, v23.8h\n"
+ "ldr d4, [x23, x17]\n"
+ "smlal2 v30.4s, v27.8h, v18.8h\n"
+ "usubl v4.8h, v4.8b, v14.8b\n"
+ "smlal2 v6.4s, v27.8h, v26.8h\n"
+ "ldr d26, [x22, x17]\n"
+ "smlal v7.4s, v8.4h, v16.4h\n"
+ "usubl v26.8h, v26.8b, v14.8b\n"
+ "smlal v2.4s, v11.4h, v23.4h\n"
+ "smlal v10.4s, v15.4h, v1.4h\n"
+ "smlal2 v24.4s, v27.8h, v20.8h\n"
+ "smlal v9.4s, v11.4h, v5.4h\n"
+ "smlal2 v0.4s, v8.8h, v16.8h\n"
+ "ldr d8, [x21, x17]\n"
+ "smlal2 v30.4s, v11.8h, v23.8h\n"
+ "usubl v8.8h, v8.8b, v14.8b\n"
+ "smlal2 v6.4s, v15.8h, v1.8h\n"
+ "smlal v7.4s, v27.4h, v25.4h\n"
+ "add x17, x17, #0x8\n"
+ "smlal v2.4s, v22.4h, v5.4h\n"
+ "smlal v10.4s, v4.4h, v18.4h\n"
+ "smlal2 v24.4s, v11.8h, v5.8h\n"
+ "smlal v9.4s, v22.4h, v31.4h\n"
+ "sqrdmulh v9.4s, v9.4s, v3.4s\n"
+ "smlal2 v0.4s, v27.8h, v25.8h\n"
+ "smlal2 v30.4s, v22.8h, v5.8h\n"
+ "and v27.16b, v9.16b, v17.16b\n"
+ "smlal2 v6.4s, v4.8h, v18.8h\n"
+ "smlal v7.4s, v15.4h, v18.4h\n"
+ "sshr v27.4s, v27.4s, #0x1f\n"
+ "smlal v2.4s, v26.4h, v25.4h\n"
+ "smlal v10.4s, v26.4h, v31.4h\n"
+ "sqadd v9.4s, v9.4s, v27.4s\n"
+ "smlal2 v24.4s, v22.8h, v31.8h\n"
+ "smlal2 v0.4s, v15.8h, v18.8h\n"
+ "sqrdmulh v24.4s, v24.4s, v21.4s\n"
+ "smlal2 v30.4s, v26.8h, v25.8h\n"
+ "smlal2 v6.4s, v26.8h, v31.8h\n"
+ "and v31.16b, v24.16b, v28.16b\n"
+ "smlal v7.4s, v4.4h, v20.4h\n"
+ "smlal v2.4s, v8.4h, v20.4h\n"
+ "sqrdmulh v7.4s, v7.4s, v3.4s\n"
+ "smlal v10.4s, v8.4h, v25.4h\n"
+ "smlal2 v0.4s, v4.8h, v20.8h\n"
+ "sqrdmulh v2.4s, v2.4s, v3.4s\n"
+ "smlal2 v30.4s, v8.8h, v20.8h\n"
+ "smlal2 v6.4s, v8.8h, v25.8h\n"
+ "sqrdmulh v10.4s, v10.4s, v3.4s\n"
+ "sshr v31.4s, v31.4s, #0x1f\n"
+ "and v22.16b, v7.16b, v17.16b\n"
+ "sqrdmulh v0.4s, v0.4s, v21.4s\n"
+ "and v3.16b, v2.16b, v17.16b\n"
+ "sqrdmulh v30.4s, v30.4s, v21.4s\n"
+ "and v11.16b, v10.16b, v17.16b\n"
+ "sqrdmulh v6.4s, v6.4s, v21.4s\n"
+ "sqadd v24.4s, v24.4s, v31.4s\n"
+ "sshr v22.4s, v22.4s, #0x1f\n"
+ "and v20.16b, v0.16b, v28.16b\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "and v31.16b, v30.16b, v28.16b\n"
+ "sshr v11.4s, v11.4s, #0x1f\n"
+ "and v18.16b, v6.16b, v28.16b\n"
+ "sqadd v7.4s, v7.4s, v22.4s\n"
+ "sshr v20.4s, v20.4s, #0x1f\n"
+ "sqadd v2.4s, v2.4s, v3.4s\n"
+ "sshr v31.4s, v31.4s, #0x1f\n"
+ "sqadd v10.4s, v10.4s, v11.4s\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "srshl v9.4s, v9.4s, v17.4s\n"
+ "srshl v7.4s, v7.4s, v17.4s\n"
+ "sqadd v0.4s, v0.4s, v20.4s\n"
+ "srshl v2.4s, v2.4s, v17.4s\n"
+ "sqadd v30.4s, v30.4s, v31.4s\n"
+ "srshl v10.4s, v10.4s, v17.4s\n"
+ "sqadd v6.4s, v6.4s, v18.4s\n"
+ "srshl v24.4s, v24.4s, v28.4s\n"
+ "sqxtn v9.4h, v9.4s\n"
+ "srshl v0.4s, v0.4s, v28.4s\n"
+ "sqxtn v7.4h, v7.4s\n"
+ "srshl v30.4s, v30.4s, v28.4s\n"
+ "sqxtn v2.4h, v2.4s\n"
+ "srshl v6.4s, v6.4s, v28.4s\n"
+ "sqxtn v10.4h, v10.4s\n"
+ "sqxtn2 v9.8h, v24.4s\n"
+ "sqxtn2 v7.8h, v0.4s\n"
+ "sqxtn2 v2.8h, v30.4s\n"
+ "sqxtn2 v10.8h, v6.4s\n"
+ "sqadd v9.8h, v9.8h, v13.8h\n"
+ "sqadd v7.8h, v7.8h, v13.8h\n"
+ "sqadd v2.8h, v2.8h, v13.8h\n"
+ "sqadd v10.8h, v10.8h, v13.8h\n"
+ "smax v9.8h, v9.8h, v29.8h\n"
+ "smax v7.8h, v7.8h, v29.8h\n"
+ "smax v2.8h, v2.8h, v29.8h\n"
+ "smax v10.8h, v10.8h, v29.8h\n"
+ "smin v9.8h, v9.8h, v12.8h\n"
+ "smin v7.8h, v7.8h, v12.8h\n"
+ "smin v2.8h, v2.8h, v12.8h\n"
+ "smin v10.8h, v10.8h, v12.8h\n"
+ "uzp1 v9.16b, v9.16b, v9.16b\n"
+ "str d9, [x11, x16]\n"
+ "uzp1 v7.16b, v7.16b, v7.16b\n"
+ "uzp1 v2.16b, v2.16b, v2.16b\n"
+ "str d7, [x10, x16]\n"
+ "uzp1 v10.16b, v10.16b, v10.16b\n"
+ "str d2, [x9, x16]\n"
+ "str d10, [x28, x16]\n"
+ "ldr q9, [x20, #0x0]\n"
+ "ldr q24, [x20, #0x10]\n"
+ "add x20, x20, #0x20\n"
+ "ldr d23, [x14, #0x0]\n"
+ "ldr d16, [x14, #0x8]\n"
+ "add x16, x16, #0x8\n"
+ "str x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldr d1, [x14, #0x10]\n"
+ "ldr d5, [x14, #0x18]\n"
+ "mov v7.16b, v9.16b\n"
+ "mov v0.16b, v24.16b\n"
+ "ldr d26, [x14, #0x20]\n"
+ "ldr d18, [x14, #0x28]\n"
+ "mov v2.16b, v9.16b\n"
+ "mov v30.16b, v24.16b\n"
+ "ldr d31, [x14, #0x30]\n"
+ "ldr d25, [x14, #0x38]\n"
+ "mov v10.16b, v9.16b\n"
+ "mov v6.16b, v24.16b\n"
+ "ldr d20, [x14, #0x40]\n"
+ "ldp x23, x22, [x15, #0x0]\n"
+ "usubl v23.8h, v23.8b, v19.8b\n"
+ "usubl v16.8h, v16.8b, v19.8b\n"
+ "ldp x21, x20, [x15, #0x10]\n"
+ "ldr d22, [x23, x17]\n"
+ "usubl v1.8h, v1.8b, v19.8b\n"
+ "usubl v5.8h, v5.8b, v19.8b\n"
+ "ldr d4, [x22, x17]\n"
+ "ldr d8, [x21, x17]\n"
+ "usubl v26.8h, v26.8b, v19.8b\n"
+ "usubl v18.8h, v18.8b, v19.8b\n"
+ "ldr d27, [x20, x17]\n"
+ "ldr x20, [x15, #0x20]\n"
+ "usubl v31.8h, v31.8b, v19.8b\n"
+ "usubl v25.8h, v25.8b, v19.8b\n"
+ "ldr d15, [x20, x17]\n"
+ "usubl v20.8h, v20.8b, v19.8b\n"
+ "usubl v22.8h, v22.8b, v14.8b\n"
+ "usubl v4.8h, v4.8b, v14.8b\n"
+ "usubl v8.8h, v8.8b, v14.8b\n"
+ "usubl v27.8h, v27.8b, v14.8b\n"
+ "usubl v15.8h, v15.8b, v14.8b\n"
+ "bgt 1b\n"
+ "2:" // Tail
+ "ldr q28, [x13, #0x0]\n"
+ "ldr q17, [x12, #0x0]\n"
+ "smlal v9.4s, v22.4h, v26.4h\n"
+ "smlal2 v24.4s, v22.8h, v26.8h\n"
+ "ldr q21, [x13, #0x10]\n"
+ "ldr q3, [x12, #0x10]\n"
+ "smlal v9.4s, v4.4h, v23.4h\n"
+ "smlal v7.4s, v22.4h, v5.4h\n"
+ "ldr x20, [x15, #0x28]\n"
+ "ldr d11, [x20, x17]\n"
+ "smlal v2.4s, v22.4h, v16.4h\n"
+ "smlal v10.4s, v22.4h, v23.4h\n"
+ "smlal2 v24.4s, v4.8h, v23.8h\n"
+ "ldr x20, [x15, #0x38]\n"
+ "ldr d4, [x20, x17]\n"
+ "smlal v9.4s, v27.4h, v18.4h\n"
+ "smlal2 v0.4s, v22.8h, v5.8h\n"
+ "smlal2 v30.4s, v22.8h, v16.8h\n"
+ "ldr x20, [x15, #0x30]\n"
+ "usubl v11.8h, v11.8b, v14.8b\n"
+ "smlal2 v6.4s, v22.8h, v23.8h\n"
+ "ldr d22, [x20, x17]\n"
+ "smlal v7.4s, v8.4h, v1.4h\n"
+ "ldr x20, [x15, #0x40]\n"
+ "smlal v2.4s, v27.4h, v1.4h\n"
+ "smlal v10.4s, v27.4h, v16.4h\n"
+ "usubl v4.8h, v4.8b, v14.8b\n"
+ "ldr x26, [x15, #0x48]\n"
+ "smlal2 v24.4s, v27.8h, v18.8h\n"
+ "smlal v9.4s, v15.4h, v25.4h\n"
+ "usubl v22.8h, v22.8b, v14.8b\n"
+ "ldr x25, [x15, #0x50]\n"
+ "smlal2 v0.4s, v8.8h, v1.8h\n"
+ "ldr d8, [x20, x17]\n"
+ "smlal2 v30.4s, v27.8h, v1.8h\n"
+ "usubl v8.8h, v8.8b, v14.8b\n"
+ "smlal2 v6.4s, v27.8h, v16.8h\n"
+ "smlal v7.4s, v27.4h, v26.4h\n"
+ "ldr x24, [x15, #0x58]\n"
+ "ldr x23, [x15, #0x60]\n"
+ "smlal v2.4s, v11.4h, v31.4h\n"
+ "smlal v10.4s, v15.4h, v5.4h\n"
+ "ldr x22, [x15, #0x68]\n"
+ "ldr x21, [x15, #0x70]\n"
+ "smlal2 v24.4s, v15.8h, v25.8h\n"
+ "smlal v9.4s, v4.4h, v16.4h\n"
+ "ldr x20, [x15, #0x78]\n"
+ "tst x7, #0x7\n"
+ "smlal2 v0.4s, v27.8h, v26.8h\n"
+ "ldr d27, [x26, x17]\n"
+ "smlal2 v30.4s, v11.8h, v31.8h\n"
+ "ldr d11, [x25, x17]\n"
+ "smlal2 v6.4s, v15.8h, v5.8h\n"
+ "smlal v7.4s, v15.4h, v31.4h\n"
+ "usubl v27.8h, v27.8b, v14.8b\n"
+ "add x13, x13, #0x20\n"
+ "smlal v2.4s, v15.4h, v26.4h\n"
+ "smlal v10.4s, v22.4h, v20.4h\n"
+ "usubl v11.8h, v11.8b, v14.8b\n"
+ "add x12, x12, #0x20\n"
+ "smlal2 v24.4s, v4.8h, v16.8h\n"
+ "smlal v9.4s, v8.4h, v1.4h\n"
+ "smlal2 v0.4s, v15.8h, v31.8h\n"
+ "smlal2 v30.4s, v15.8h, v26.8h\n"
+ "ldr d15, [x24, x17]\n"
+ "usubl v15.8h, v15.8b, v14.8b\n"
+ "smlal2 v6.4s, v22.8h, v20.8h\n"
+ "ldr d22, [x23, x17]\n"
+ "smlal v7.4s, v4.4h, v23.4h\n"
+ "usubl v22.8h, v22.8b, v14.8b\n"
+ "smlal v2.4s, v27.4h, v18.4h\n"
+ "smlal v10.4s, v27.4h, v26.4h\n"
+ "smlal2 v24.4s, v8.8h, v1.8h\n"
+ "smlal v9.4s, v27.4h, v20.4h\n"
+ "smlal2 v0.4s, v4.8h, v23.8h\n"
+ "ldr d4, [x22, x17]\n"
+ "smlal2 v30.4s, v27.8h, v18.8h\n"
+ "usubl v4.8h, v4.8b, v14.8b\n"
+ "smlal2 v6.4s, v27.8h, v26.8h\n"
+ "ldr d26, [x21, x17]\n"
+ "smlal v7.4s, v8.4h, v16.4h\n"
+ "usubl v26.8h, v26.8b, v14.8b\n"
+ "smlal v2.4s, v11.4h, v23.4h\n"
+ "smlal v10.4s, v15.4h, v1.4h\n"
+ "smlal2 v24.4s, v27.8h, v20.8h\n"
+ "smlal v9.4s, v11.4h, v5.4h\n"
+ "smlal2 v0.4s, v8.8h, v16.8h\n"
+ "ldr d16, [x20, x17]\n"
+ "smlal2 v30.4s, v11.8h, v23.8h\n"
+ "usubl v16.8h, v16.8b, v14.8b\n"
+ "smlal2 v6.4s, v15.8h, v1.8h\n"
+ "smlal v7.4s, v27.4h, v25.4h\n"
+ "add x17, x17, #0x8\n"
+ "smlal v2.4s, v22.4h, v5.4h\n"
+ "smlal v10.4s, v4.4h, v18.4h\n"
+ "smlal2 v24.4s, v11.8h, v5.8h\n"
+ "smlal v9.4s, v22.4h, v31.4h\n"
+ "sqrdmulh v9.4s, v9.4s, v28.4s\n"
+ "smlal2 v0.4s, v27.8h, v25.8h\n"
+ "smlal2 v30.4s, v22.8h, v5.8h\n"
+ "and v1.16b, v9.16b, v17.16b\n"
+ "smlal2 v6.4s, v4.8h, v18.8h\n"
+ "smlal v7.4s, v15.4h, v18.4h\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "smlal v2.4s, v26.4h, v25.4h\n"
+ "smlal v10.4s, v26.4h, v31.4h\n"
+ "sqadd v9.4s, v9.4s, v1.4s\n"
+ "smlal2 v24.4s, v22.8h, v31.8h\n"
+ "smlal2 v0.4s, v15.8h, v18.8h\n"
+ "sqrdmulh v24.4s, v24.4s, v21.4s\n"
+ "smlal2 v30.4s, v26.8h, v25.8h\n"
+ "smlal2 v6.4s, v26.8h, v31.8h\n"
+ "and v31.16b, v24.16b, v3.16b\n"
+ "smlal v7.4s, v4.4h, v20.4h\n"
+ "smlal v2.4s, v16.4h, v20.4h\n"
+ "sqrdmulh v7.4s, v7.4s, v28.4s\n"
+ "smlal v10.4s, v16.4h, v25.4h\n"
+ "smlal2 v0.4s, v4.8h, v20.8h\n"
+ "sqrdmulh v2.4s, v2.4s, v28.4s\n"
+ "smlal2 v30.4s, v16.8h, v20.8h\n"
+ "smlal2 v6.4s, v16.8h, v25.8h\n"
+ "sqrdmulh v10.4s, v10.4s, v28.4s\n"
+ "sshr v31.4s, v31.4s, #0x1f\n"
+ "and v22.16b, v7.16b, v17.16b\n"
+ "sqrdmulh v0.4s, v0.4s, v21.4s\n"
+ "and v15.16b, v2.16b, v17.16b\n"
+ "sqrdmulh v30.4s, v30.4s, v21.4s\n"
+ "and v11.16b, v10.16b, v17.16b\n"
+ "sqrdmulh v6.4s, v6.4s, v21.4s\n"
+ "sqadd v24.4s, v24.4s, v31.4s\n"
+ "sshr v22.4s, v22.4s, #0x1f\n"
+ "and v18.16b, v0.16b, v3.16b\n"
+ "sshr v15.4s, v15.4s, #0x1f\n"
+ "and v23.16b, v30.16b, v3.16b\n"
+ "sshr v11.4s, v11.4s, #0x1f\n"
+ "and v21.16b, v6.16b, v3.16b\n"
+ "sqadd v7.4s, v7.4s, v22.4s\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sqadd v2.4s, v2.4s, v15.4s\n"
+ "sshr v23.4s, v23.4s, #0x1f\n"
+ "sqadd v10.4s, v10.4s, v11.4s\n"
+ "sshr v21.4s, v21.4s, #0x1f\n"
+ "srshl v9.4s, v9.4s, v17.4s\n"
+ "srshl v7.4s, v7.4s, v17.4s\n"
+ "sqadd v0.4s, v0.4s, v18.4s\n"
+ "srshl v2.4s, v2.4s, v17.4s\n"
+ "sqadd v30.4s, v30.4s, v23.4s\n"
+ "srshl v10.4s, v10.4s, v17.4s\n"
+ "sqadd v6.4s, v6.4s, v21.4s\n"
+ "srshl v24.4s, v24.4s, v3.4s\n"
+ "sqxtn v9.4h, v9.4s\n"
+ "srshl v0.4s, v0.4s, v3.4s\n"
+ "sqxtn v7.4h, v7.4s\n"
+ "srshl v30.4s, v30.4s, v3.4s\n"
+ "sqxtn v2.4h, v2.4s\n"
+ "srshl v6.4s, v6.4s, v3.4s\n"
+ "sqxtn v10.4h, v10.4s\n"
+ "sqxtn2 v9.8h, v24.4s\n"
+ "sqxtn2 v7.8h, v0.4s\n"
+ "sqxtn2 v2.8h, v30.4s\n"
+ "sqxtn2 v10.8h, v6.4s\n"
+ "sqadd v9.8h, v9.8h, v13.8h\n"
+ "sqadd v7.8h, v7.8h, v13.8h\n"
+ "sqadd v2.8h, v2.8h, v13.8h\n"
+ "sqadd v10.8h, v10.8h, v13.8h\n"
+ "smax v9.8h, v9.8h, v29.8h\n"
+ "smax v7.8h, v7.8h, v29.8h\n"
+ "smax v2.8h, v2.8h, v29.8h\n"
+ "smax v10.8h, v10.8h, v29.8h\n"
+ "smin v9.8h, v9.8h, v12.8h\n"
+ "smin v7.8h, v7.8h, v12.8h\n"
+ "smin v2.8h, v2.8h, v12.8h\n"
+ "smin v10.8h, v10.8h, v12.8h\n"
+ "uzp1 v9.16b, v9.16b, v9.16b\n"
+ "str d9, [x11, x16]\n"
+ "uzp1 v7.16b, v7.16b, v7.16b\n"
+ "uzp1 v2.16b, v2.16b, v2.16b\n"
+ "str d7, [x10, x16]\n"
+ "uzp1 v10.16b, v10.16b, v10.16b\n"
+ "str d2, [x9, x16]\n"
+ "str d10, [x28, x16]\n"
+ "add x16, x16, #0x8\n"
+ "beq 64f\n"
+ "add x14, x14, #0x48\n"
+ "3:" // Oddments
+ "ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "tbz x7, #2, 5f\n"
+ "ld1 { v9.4s }, [x20], #0x10\n"
+ "tbz x7, #1, 4f\n"
+ "ld1 { v24.d }[0], [x20], #0x8\n"
+ "tbz x7, #0, 7f\n"
+ "ld1 { v24.s }[2], [x20]\n"
+ "b 7f\n"
+ "4:" // Oddments: Load bias: Bit 2: Bit 1: Unset
+ "tbz x7, #0, 7f\n"
+ "ld1 { v24.s }[0], [x20]\n"
+ "b 7f\n"
+ "5:" // Oddments: Load bias: Bit 2: Unset
+ "tbz x7, #1, 6f\n"
+ "ld1 { v9.d }[0], [x20], #0x8\n"
+ "tbz x7, #0, 7f\n"
+ "ld1 { v9.s }[2], [x20]\n"
+ "b 7f\n"
+ "6:" // Oddments: Load bias: Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 7f\n"
+ "ld1 { v9.s }[0], [x20]\n"
+ "7:" // Oddments: Load bias: Bit 2: End
+ "ldr d23, [x14, #0x0]\n"
+ "ldr d16, [x14, #0x8]\n"
+ "mov v7.16b, v9.16b\n"
+ "mov v0.16b, v24.16b\n"
+ "ldr d1, [x14, #0x10]\n"
+ "ldr d5, [x14, #0x18]\n"
+ "mov v2.16b, v9.16b\n"
+ "mov v30.16b, v24.16b\n"
+ "ldr d26, [x14, #0x20]\n"
+ "ldr d18, [x14, #0x28]\n"
+ "mov v10.16b, v9.16b\n"
+ "mov v6.16b, v24.16b\n"
+ "ldr d31, [x14, #0x30]\n"
+ "ldr d25, [x14, #0x38]\n"
+ "usubl v23.8h, v23.8b, v19.8b\n"
+ "usubl v16.8h, v16.8b, v19.8b\n"
+ "ldr d20, [x14, #0x40]\n"
+ "ldp x24, x23, [x15, #0x0]\n"
+ "usubl v1.8h, v1.8b, v19.8b\n"
+ "usubl v5.8h, v5.8b, v19.8b\n"
+ "ldp x22, x21, [x15, #0x10]\n"
+ "ldr x20, [x15, #0x20]\n"
+ "usubl v26.8h, v26.8b, v19.8b\n"
+ "usubl v18.8h, v18.8b, v19.8b\n"
+ "usubl v31.8h, v31.8b, v19.8b\n"
+ "usubl v25.8h, v25.8b, v19.8b\n"
+ "usubl v20.8h, v20.8b, v19.8b\n"
+ "add x24, x24, x17\n"
+ "add x23, x23, x17\n"
+ "add x22, x22, x17\n"
+ "add x21, x21, x17\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 9f\n"
+ "ld1 { v22.s }[0], [x24], #0x4\n"
+ "ld1 { v4.s }[0], [x23], #0x4\n"
+ "ld1 { v8.s }[0], [x22], #0x4\n"
+ "ld1 { v27.s }[0], [x21], #0x4\n"
+ "ld1 { v15.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 8f\n"
+ "ld1 { v22.h }[2], [x24], #0x2\n"
+ "ld1 { v4.h }[2], [x23], #0x2\n"
+ "ld1 { v8.h }[2], [x22], #0x2\n"
+ "ld1 { v27.h }[2], [x21], #0x2\n"
+ "ld1 { v15.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 11f\n"
+ "ld1 { v22.b }[6], [x24]\n"
+ "ld1 { v4.b }[6], [x23]\n"
+ "ld1 { v8.b }[6], [x22]\n"
+ "ld1 { v27.b }[6], [x21]\n"
+ "ld1 { v15.b }[6], [x20]\n"
+ "b 11f\n"
+ "8:" // Oddments: Initial loads: Bit 2: Bit 1: Unset
+ "tbz x7, #0, 11f\n"
+ "ld1 { v22.b }[4], [x24]\n"
+ "ld1 { v4.b }[4], [x23]\n"
+ "ld1 { v8.b }[4], [x22]\n"
+ "ld1 { v27.b }[4], [x21]\n"
+ "ld1 { v15.b }[4], [x20]\n"
+ "b 11f\n"
+ "9:" // Oddments: Initial loads: Bit 2: Unset
+ "tbz x7, #1, 10f\n"
+ "ld1 { v22.h }[0], [x24], #0x2\n"
+ "ld1 { v4.h }[0], [x23], #0x2\n"
+ "ld1 { v8.h }[0], [x22], #0x2\n"
+ "ld1 { v27.h }[0], [x21], #0x2\n"
+ "ld1 { v15.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 11f\n"
+ "ld1 { v22.b }[2], [x24]\n"
+ "ld1 { v4.b }[2], [x23]\n"
+ "ld1 { v8.b }[2], [x22]\n"
+ "ld1 { v27.b }[2], [x21]\n"
+ "ld1 { v15.b }[2], [x20]\n"
+ "b 11f\n"
+ "10:" // Oddments: Initial loads: Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 11f\n"
+ "ld1 { v22.b }[0], [x24]\n"
+ "ld1 { v4.b }[0], [x23]\n"
+ "ld1 { v8.b }[0], [x22]\n"
+ "ld1 { v27.b }[0], [x21]\n"
+ "ld1 { v15.b }[0], [x20]\n"
+ "11:" // Oddments: Initial loads: Bit 2: End
+ "usubl v22.8h, v22.8b, v14.8b\n"
+ "smlal v9.4s, v22.4h, v26.4h\n"
+ "smlal2 v24.4s, v22.8h, v26.8h\n"
+ "ldr x20, [x15, #0x28]\n"
+ "smlal v7.4s, v22.4h, v5.4h\n"
+ "smlal2 v0.4s, v22.8h, v5.8h\n"
+ "usubl v4.8h, v4.8b, v14.8b\n"
+ "usubl v8.8h, v8.8b, v14.8b\n"
+ "smlal v2.4s, v22.4h, v16.4h\n"
+ "smlal2 v30.4s, v22.8h, v16.8h\n"
+ "add x20, x20, x17\n"
+ "smlal v10.4s, v22.4h, v23.4h\n"
+ "smlal2 v6.4s, v22.8h, v23.8h\n"
+ "usubl v27.8h, v27.8b, v14.8b\n"
+ "smlal v9.4s, v4.4h, v23.4h\n"
+ "smlal2 v24.4s, v4.8h, v23.8h\n"
+ "usubl v15.8h, v15.8b, v14.8b\n"
+ "smlal v7.4s, v8.4h, v1.4h\n"
+ "smlal2 v0.4s, v8.8h, v1.8h\n"
+ "smlal v9.4s, v27.4h, v18.4h\n"
+ "smlal2 v24.4s, v27.8h, v18.8h\n"
+ "smlal v7.4s, v27.4h, v26.4h\n"
+ "smlal2 v0.4s, v27.8h, v26.8h\n"
+ "smlal v2.4s, v27.4h, v1.4h\n"
+ "smlal2 v30.4s, v27.8h, v1.8h\n"
+ "smlal v10.4s, v27.4h, v16.4h\n"
+ "smlal2 v6.4s, v27.8h, v16.8h\n"
+ "tbz x7, #2, 13f\n"
+ "ld1 { v21.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 12f\n"
+ "ld1 { v21.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 15f\n"
+ "ld1 { v21.b }[6], [x20]\n"
+ "b 15f\n"
+ "12:" // Oddments: Load (3, 0): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 15f\n"
+ "ld1 { v21.b }[4], [x20]\n"
+ "b 15f\n"
+ "13:" // Oddments: Load (3, 0): Bit 2: Unset
+ "tbz x7, #1, 14f\n"
+ "ld1 { v21.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 15f\n"
+ "ld1 { v21.b }[2], [x20]\n"
+ "b 15f\n"
+ "14:" // Oddments: Load (3, 0): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 15f\n"
+ "ld1 { v21.b }[0], [x20]\n"
+ "15:" // Oddments: Load (3, 0): Bit 2: End
+ "usubl v21.8h, v21.8b, v14.8b\n"
+ "smlal v2.4s, v21.4h, v31.4h\n"
+ "smlal2 v30.4s, v21.8h, v31.8h\n"
+ "ldr x20, [x15, #0x30]\n"
+ "smlal v9.4s, v15.4h, v25.4h\n"
+ "smlal2 v24.4s, v15.8h, v25.8h\n"
+ "add x20, x20, x17\n"
+ "smlal v7.4s, v15.4h, v31.4h\n"
+ "smlal2 v0.4s, v15.8h, v31.8h\n"
+ "smlal v2.4s, v15.4h, v26.4h\n"
+ "smlal2 v30.4s, v15.8h, v26.8h\n"
+ "smlal v10.4s, v15.4h, v5.4h\n"
+ "smlal2 v6.4s, v15.8h, v5.8h\n"
+ "tbz x7, #2, 17f\n"
+ "ld1 { v28.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 16f\n"
+ "ld1 { v28.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 19f\n"
+ "ld1 { v28.b }[6], [x20]\n"
+ "b 19f\n"
+ "16:" // Oddments: Load (3, 3): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 19f\n"
+ "ld1 { v28.b }[4], [x20]\n"
+ "b 19f\n"
+ "17:" // Oddments: Load (3, 3): Bit 2: Unset
+ "tbz x7, #1, 18f\n"
+ "ld1 { v28.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 19f\n"
+ "ld1 { v28.b }[2], [x20]\n"
+ "b 19f\n"
+ "18:" // Oddments: Load (3, 3): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 19f\n"
+ "ld1 { v28.b }[0], [x20]\n"
+ "19:" // Oddments: Load (3, 3): Bit 2: End
+ "usubl v28.8h, v28.8b, v14.8b\n"
+ "ldr x20, [x15, #0x38]\n"
+ "smlal v10.4s, v28.4h, v20.4h\n"
+ "smlal2 v6.4s, v28.8h, v20.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 21f\n"
+ "ld1 { v22.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 20f\n"
+ "ld1 { v22.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 23f\n"
+ "ld1 { v22.b }[6], [x20]\n"
+ "b 23f\n"
+ "20:" // Oddments: Load (0, 1): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 23f\n"
+ "ld1 { v22.b }[4], [x20]\n"
+ "b 23f\n"
+ "21:" // Oddments: Load (0, 1): Bit 2: Unset
+ "tbz x7, #1, 22f\n"
+ "ld1 { v22.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 23f\n"
+ "ld1 { v22.b }[2], [x20]\n"
+ "b 23f\n"
+ "22:" // Oddments: Load (0, 1): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 23f\n"
+ "ld1 { v22.b }[0], [x20]\n"
+ "23:" // Oddments: Load (0, 1): Bit 2: End
+ "usubl v22.8h, v22.8b, v14.8b\n"
+ "ldr x20, [x15, #0x40]\n"
+ "smlal v9.4s, v22.4h, v16.4h\n"
+ "smlal2 v24.4s, v22.8h, v16.8h\n"
+ "smlal v7.4s, v22.4h, v23.4h\n"
+ "smlal2 v0.4s, v22.8h, v23.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 25f\n"
+ "ld1 { v21.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 24f\n"
+ "ld1 { v21.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 27f\n"
+ "ld1 { v21.b }[6], [x20]\n"
+ "b 27f\n"
+ "24:" // Oddments: Load (0, 2): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 27f\n"
+ "ld1 { v21.b }[4], [x20]\n"
+ "b 27f\n"
+ "25:" // Oddments: Load (0, 2): Bit 2: Unset
+ "tbz x7, #1, 26f\n"
+ "ld1 { v21.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 27f\n"
+ "ld1 { v21.b }[2], [x20]\n"
+ "b 27f\n"
+ "26:" // Oddments: Load (0, 2): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 27f\n"
+ "ld1 { v21.b }[0], [x20]\n"
+ "27:" // Oddments: Load (0, 2): Bit 2: End
+ "usubl v21.8h, v21.8b, v14.8b\n"
+ "ldr x20, [x15, #0x48]\n"
+ "smlal v9.4s, v21.4h, v1.4h\n"
+ "smlal2 v24.4s, v21.8h, v1.8h\n"
+ "smlal v7.4s, v21.4h, v16.4h\n"
+ "smlal2 v0.4s, v21.8h, v16.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 29f\n"
+ "ld1 { v28.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 28f\n"
+ "ld1 { v28.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 31f\n"
+ "ld1 { v28.b }[6], [x20]\n"
+ "b 31f\n"
+ "28:" // Oddments: Load (2, 2): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 31f\n"
+ "ld1 { v28.b }[4], [x20]\n"
+ "b 31f\n"
+ "29:" // Oddments: Load (2, 2): Bit 2: Unset
+ "tbz x7, #1, 30f\n"
+ "ld1 { v28.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 31f\n"
+ "ld1 { v28.b }[2], [x20]\n"
+ "b 31f\n"
+ "30:" // Oddments: Load (2, 2): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 31f\n"
+ "ld1 { v28.b }[0], [x20]\n"
+ "31:" // Oddments: Load (2, 2): Bit 2: End
+ "usubl v28.8h, v28.8b, v14.8b\n"
+ "ldr x20, [x15, #0x50]\n"
+ "smlal v9.4s, v28.4h, v20.4h\n"
+ "smlal2 v24.4s, v28.8h, v20.8h\n"
+ "smlal v7.4s, v28.4h, v25.4h\n"
+ "smlal2 v0.4s, v28.8h, v25.8h\n"
+ "add x20, x20, x17\n"
+ "smlal v2.4s, v28.4h, v18.4h\n"
+ "smlal2 v30.4s, v28.8h, v18.8h\n"
+ "smlal v10.4s, v28.4h, v26.4h\n"
+ "smlal2 v6.4s, v28.8h, v26.8h\n"
+ "tbz x7, #2, 33f\n"
+ "ld1 { v8.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 32f\n"
+ "ld1 { v8.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 35f\n"
+ "ld1 { v8.b }[6], [x20]\n"
+ "b 35f\n"
+ "32:" // Oddments: Load (1, 0): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 35f\n"
+ "ld1 { v8.b }[4], [x20]\n"
+ "b 35f\n"
+ "33:" // Oddments: Load (1, 0): Bit 2: Unset
+ "tbz x7, #1, 34f\n"
+ "ld1 { v8.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 35f\n"
+ "ld1 { v8.b }[2], [x20]\n"
+ "b 35f\n"
+ "34:" // Oddments: Load (1, 0): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 35f\n"
+ "ld1 { v8.b }[0], [x20]\n"
+ "35:" // Oddments: Load (1, 0): Bit 2: End
+ "usubl v8.8h, v8.8b, v14.8b\n"
+ "ldr x20, [x15, #0x58]\n"
+ "smlal v9.4s, v8.4h, v5.4h\n"
+ "smlal2 v24.4s, v8.8h, v5.8h\n"
+ "smlal v2.4s, v8.4h, v23.4h\n"
+ "smlal2 v30.4s, v8.8h, v23.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 37f\n"
+ "ld1 { v8.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 36f\n"
+ "ld1 { v8.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 39f\n"
+ "ld1 { v8.b }[6], [x20]\n"
+ "b 39f\n"
+ "36:" // Oddments: Load (1, 3): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 39f\n"
+ "ld1 { v8.b }[4], [x20]\n"
+ "b 39f\n"
+ "37:" // Oddments: Load (1, 3): Bit 2: Unset
+ "tbz x7, #1, 38f\n"
+ "ld1 { v8.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 39f\n"
+ "ld1 { v8.b }[2], [x20]\n"
+ "b 39f\n"
+ "38:" // Oddments: Load (1, 3): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 39f\n"
+ "ld1 { v8.b }[0], [x20]\n"
+ "39:" // Oddments: Load (1, 3): Bit 2: End
+ "usubl v8.8h, v8.8b, v14.8b\n"
+ "ldr x20, [x15, #0x60]\n"
+ "smlal v7.4s, v8.4h, v18.4h\n"
+ "smlal2 v0.4s, v8.8h, v18.8h\n"
+ "smlal v10.4s, v8.4h, v1.4h\n"
+ "smlal2 v6.4s, v8.8h, v1.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 41f\n"
+ "ld1 { v17.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 40f\n"
+ "ld1 { v17.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 43f\n"
+ "ld1 { v17.b }[6], [x20]\n"
+ "b 43f\n"
+ "40:" // Oddments: Load (2, 0): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 43f\n"
+ "ld1 { v17.b }[4], [x20]\n"
+ "b 43f\n"
+ "41:" // Oddments: Load (2, 0): Bit 2: Unset
+ "tbz x7, #1, 42f\n"
+ "ld1 { v17.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 43f\n"
+ "ld1 { v17.b }[2], [x20]\n"
+ "b 43f\n"
+ "42:" // Oddments: Load (2, 0): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 43f\n"
+ "ld1 { v17.b }[0], [x20]\n"
+ "43:" // Oddments: Load (2, 0): Bit 2: End
+ "usubl v17.8h, v17.8b, v14.8b\n"
+ "ldr x20, [x15, #0x68]\n"
+ "smlal v9.4s, v17.4h, v31.4h\n"
+ "smlal2 v24.4s, v17.8h, v31.8h\n"
+ "smlal v2.4s, v17.4h, v5.4h\n"
+ "smlal2 v30.4s, v17.8h, v5.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 45f\n"
+ "ld1 { v23.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 44f\n"
+ "ld1 { v23.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 47f\n"
+ "ld1 { v23.b }[6], [x20]\n"
+ "b 47f\n"
+ "44:" // Oddments: Load (2, 3): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 47f\n"
+ "ld1 { v23.b }[4], [x20]\n"
+ "b 47f\n"
+ "45:" // Oddments: Load (2, 3): Bit 2: Unset
+ "tbz x7, #1, 46f\n"
+ "ld1 { v23.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 47f\n"
+ "ld1 { v23.b }[2], [x20]\n"
+ "b 47f\n"
+ "46:" // Oddments: Load (2, 3): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 47f\n"
+ "ld1 { v23.b }[0], [x20]\n"
+ "47:" // Oddments: Load (2, 3): Bit 2: End
+ "usubl v23.8h, v23.8b, v14.8b\n"
+ "ldr x20, [x15, #0x70]\n"
+ "smlal v7.4s, v23.4h, v20.4h\n"
+ "smlal2 v0.4s, v23.8h, v20.8h\n"
+ "smlal v10.4s, v23.4h, v18.4h\n"
+ "smlal2 v6.4s, v23.8h, v18.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 49f\n"
+ "ld1 { v5.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 48f\n"
+ "ld1 { v5.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 51f\n"
+ "ld1 { v5.b }[6], [x20]\n"
+ "b 51f\n"
+ "48:" // Oddments: Load (3, 1): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 51f\n"
+ "ld1 { v5.b }[4], [x20]\n"
+ "b 51f\n"
+ "49:" // Oddments: Load (3, 1): Bit 2: Unset
+ "tbz x7, #1, 50f\n"
+ "ld1 { v5.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 51f\n"
+ "ld1 { v5.b }[2], [x20]\n"
+ "b 51f\n"
+ "50:" // Oddments: Load (3, 1): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 51f\n"
+ "ld1 { v5.b }[0], [x20]\n"
+ "51:" // Oddments: Load (3, 1): Bit 2: End
+ "usubl v5.8h, v5.8b, v14.8b\n"
+ "ldr x20, [x15, #0x78]\n"
+ "smlal v2.4s, v5.4h, v25.4h\n"
+ "smlal2 v30.4s, v5.8h, v25.8h\n"
+ "smlal v10.4s, v5.4h, v31.4h\n"
+ "smlal2 v6.4s, v5.8h, v31.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 53f\n"
+ "ld1 { v23.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 52f\n"
+ "ld1 { v23.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 55f\n"
+ "ld1 { v23.b }[6], [x20]\n"
+ "b 55f\n"
+ "52:" // Oddments: Load (3, 2): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 55f\n"
+ "ld1 { v23.b }[4], [x20]\n"
+ "b 55f\n"
+ "53:" // Oddments: Load (3, 2): Bit 2: Unset
+ "tbz x7, #1, 54f\n"
+ "ld1 { v23.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 55f\n"
+ "ld1 { v23.b }[2], [x20]\n"
+ "b 55f\n"
+ "54:" // Oddments: Load (3, 2): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 55f\n"
+ "ld1 { v23.b }[0], [x20]\n"
+ "55:" // Oddments: Load (3, 2): Bit 2: End
+ "usubl v23.8h, v23.8b, v14.8b\n"
+ "smlal v2.4s, v23.4h, v20.4h\n"
+ "smlal2 v30.4s, v23.8h, v20.8h\n"
+ "smlal v10.4s, v23.4h, v25.4h\n"
+ "smlal2 v6.4s, v23.8h, v25.8h\n"
+ "tbz x7, #2, 57f\n"
+ "ld1 { v15.4s }, [x13], #0x10\n"
+ "ld1 { v19.4s }, [x12], #0x10\n"
+ "tbz x7, #1, 56f\n"
+ "ld1 { v18.d }[0], [x13], #0x8\n"
+ "ld1 { v22.d }[0], [x12], #0x8\n"
+ "tbz x7, #0, 59f\n"
+ "ld1 { v18.s }[2], [x13]\n"
+ "ld1 { v22.s }[2], [x12]\n"
+ "b 59f\n"
+ "56:" // Oddments: Load requant params: Bit 2: Bit 1: Unset
+ "tbz x7, #0, 59f\n"
+ "ld1 { v18.s }[0], [x13]\n"
+ "ld1 { v22.s }[0], [x12]\n"
+ "b 59f\n"
+ "57:" // Oddments: Load requant params: Bit 2: Unset
+ "tbz x7, #1, 58f\n"
+ "ld1 { v15.d }[0], [x13], #0x8\n"
+ "ld1 { v19.d }[0], [x12], #0x8\n"
+ "tbz x7, #0, 59f\n"
+ "ld1 { v15.s }[2], [x13]\n"
+ "ld1 { v19.s }[2], [x12]\n"
+ "b 59f\n"
+ "58:" // Oddments: Load requant params: Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 59f\n"
+ "ld1 { v15.s }[0], [x13]\n"
+ "ld1 { v19.s }[0], [x12]\n"
+ "59:" // Oddments: Load requant params: Bit 2: End
+ "sqrdmulh v9.4s, v9.4s, v15.4s\n"
+ "and v17.16b, v9.16b, v19.16b\n"
+ "add x11, x11, x16\n"
+ "add x10, x10, x16\n"
+ "sqrdmulh v24.4s, v24.4s, v18.4s\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "add x9, x9, x16\n"
+ "add x28, x28, x16\n"
+ "and v20.16b, v24.16b, v22.16b\n"
+ "sqrdmulh v7.4s, v7.4s, v15.4s\n"
+ "sqrdmulh v2.4s, v2.4s, v15.4s\n"
+ "sqrdmulh v10.4s, v10.4s, v15.4s\n"
+ "sqadd v9.4s, v9.4s, v17.4s\n"
+ "sshr v20.4s, v20.4s, #0x1f\n"
+ "and v21.16b, v7.16b, v19.16b\n"
+ "sqrdmulh v0.4s, v0.4s, v18.4s\n"
+ "and v15.16b, v2.16b, v19.16b\n"
+ "sqrdmulh v30.4s, v30.4s, v18.4s\n"
+ "and v23.16b, v10.16b, v19.16b\n"
+ "sqrdmulh v6.4s, v6.4s, v18.4s\n"
+ "sqadd v24.4s, v24.4s, v20.4s\n"
+ "sshr v21.4s, v21.4s, #0x1f\n"
+ "and v18.16b, v0.16b, v22.16b\n"
+ "sshr v15.4s, v15.4s, #0x1f\n"
+ "and v17.16b, v30.16b, v22.16b\n"
+ "sshr v23.4s, v23.4s, #0x1f\n"
+ "and v28.16b, v6.16b, v22.16b\n"
+ "sqadd v7.4s, v7.4s, v21.4s\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sqadd v2.4s, v2.4s, v15.4s\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sqadd v10.4s, v10.4s, v23.4s\n"
+ "sshr v28.4s, v28.4s, #0x1f\n"
+ "srshl v9.4s, v9.4s, v19.4s\n"
+ "srshl v7.4s, v7.4s, v19.4s\n"
+ "sqadd v0.4s, v0.4s, v18.4s\n"
+ "srshl v2.4s, v2.4s, v19.4s\n"
+ "sqadd v30.4s, v30.4s, v17.4s\n"
+ "srshl v10.4s, v10.4s, v19.4s\n"
+ "sqadd v6.4s, v6.4s, v28.4s\n"
+ "srshl v24.4s, v24.4s, v22.4s\n"
+ "sqxtn v9.4h, v9.4s\n"
+ "srshl v0.4s, v0.4s, v22.4s\n"
+ "sqxtn v7.4h, v7.4s\n"
+ "srshl v30.4s, v30.4s, v22.4s\n"
+ "sqxtn v2.4h, v2.4s\n"
+ "srshl v6.4s, v6.4s, v22.4s\n"
+ "sqxtn v10.4h, v10.4s\n"
+ "sqxtn2 v9.8h, v24.4s\n"
+ "sqxtn2 v7.8h, v0.4s\n"
+ "sqxtn2 v2.8h, v30.4s\n"
+ "sqxtn2 v10.8h, v6.4s\n"
+ "sqadd v9.8h, v9.8h, v13.8h\n"
+ "sqadd v7.8h, v7.8h, v13.8h\n"
+ "sqadd v2.8h, v2.8h, v13.8h\n"
+ "sqadd v10.8h, v10.8h, v13.8h\n"
+ "smax v9.8h, v9.8h, v29.8h\n"
+ "smax v7.8h, v7.8h, v29.8h\n"
+ "smax v2.8h, v2.8h, v29.8h\n"
+ "smax v10.8h, v10.8h, v29.8h\n"
+ "smin v9.8h, v9.8h, v12.8h\n"
+ "smin v7.8h, v7.8h, v12.8h\n"
+ "smin v2.8h, v2.8h, v12.8h\n"
+ "smin v10.8h, v10.8h, v12.8h\n"
+ "uzp1 v9.16b, v9.16b, v9.16b\n"
+ "uzp1 v7.16b, v7.16b, v7.16b\n"
+ "uzp1 v2.16b, v2.16b, v2.16b\n"
+ "uzp1 v10.16b, v10.16b, v10.16b\n"
+ "tbz x7, #2, 61f\n"
+ "st1 { v9.s }[0], [x11], #0x4\n"
+ "st1 { v7.s }[0], [x10], #0x4\n"
+ "st1 { v2.s }[0], [x9], #0x4\n"
+ "st1 { v10.s }[0], [x28], #0x4\n"
+ "tbz x7, #1, 60f\n"
+ "st1 { v9.h }[2], [x11], #0x2\n"
+ "st1 { v7.h }[2], [x10], #0x2\n"
+ "st1 { v2.h }[2], [x9], #0x2\n"
+ "st1 { v10.h }[2], [x28], #0x2\n"
+ "tbz x7, #0, 63f\n"
+ "st1 { v9.b }[6], [x11], #0x1\n"
+ "st1 { v7.b }[6], [x10], #0x1\n"
+ "st1 { v2.b }[6], [x9], #0x1\n"
+ "st1 { v10.b }[6], [x28], #0x1\n"
+ "b 63f\n"
+ "60:" // Oddments: Bit 2: Bit 1: Unset
+ "tbz x7, #0, 63f\n"
+ "st1 { v9.b }[4], [x11], #0x1\n"
+ "st1 { v7.b }[4], [x10], #0x1\n"
+ "st1 { v2.b }[4], [x9], #0x1\n"
+ "st1 { v10.b }[4], [x28], #0x1\n"
+ "b 63f\n"
+ "61:" // Oddments: Bit 2: Unset
+ "tbz x7, #1, 62f\n"
+ "st1 { v9.h }[0], [x11], #0x2\n"
+ "st1 { v7.h }[0], [x10], #0x2\n"
+ "st1 { v2.h }[0], [x9], #0x2\n"
+ "st1 { v10.h }[0], [x28], #0x2\n"
+ "tbz x7, #0, 63f\n"
+ "st1 { v9.b }[2], [x11], #0x1\n"
+ "st1 { v7.b }[2], [x10], #0x1\n"
+ "st1 { v2.b }[2], [x9], #0x1\n"
+ "st1 { v10.b }[2], [x28], #0x1\n"
+ "b 63f\n"
+ "62:" // Oddments: Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 63f\n"
+ "st1 { v9.b }[0], [x11], #0x1\n"
+ "st1 { v7.b }[0], [x10], #0x1\n"
+ "st1 { v2.b }[0], [x9], #0x1\n"
+ "st1 { v10.b }[0], [x28], #0x1\n"
+ "63:" // Oddments: Bit 2: End
+ "64:" // End
+ :
+ : [offsetof_Params_bias] "I" (offsetof(Params, bias)), [offsetof_Params_inptrs] "I" (offsetof(Params, inptrs)), [offsetof_Params_n_channels] "I" (offsetof(Params, n_channels)), [offsetof_Params_outptrs] "I" (offsetof(Params, outptrs)), [offsetof_Params_requant] "I" (offsetof(Params, requant)), [offsetof_Params_requant_muls] "I" (offsetof(Params, requant_muls)), [offsetof_Params_requant_shifts] "I" (offsetof(Params, requant_shifts)), [offsetof_Params_weights] "I" (offsetof(Params, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [params] "r" (&params)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
new file mode 100644
index 0000000000..52280ebe70
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2021-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "utils.hpp"
+#include "src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(  // defined in the accompanying generic.cpp; parameter names below taken from that definition
+  const unsigned int,               // n_channels: number of channels to process
+  const uint8_t *const *const,      // inptrs: array of input pointers (one per input sample point)
+  const uint8_t *const,             // weights: kernel weights (layout as produced by the matching packing routine)
+  const int32_t *const,             // bias: per-channel accumulator initialisers
+  const arm_gemm::Requantize32 &,   // qp: requantisation parameters (a/b/c offsets, minval/maxval)
+  const int32_t *const,             // requant_muls: requantisation multipliers
+  const int32_t *const,             // requant_shifts: requantisation shifts
+  uint8_t *const *const             // outptrs: array of output pointers (one per output sample point)
+);
+
+class a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy<uint8_t, uint8_t, uint8_t, int32_t>  // uint8 input/weights/output, int32 accumulator
+{
+  using Parent = DepthwiseDepthfirstStrategy<uint8_t, uint8_t, uint8_t, int32_t>;
+
+  public:
+  constexpr static unsigned int kernel_rows = 3;  // 3x3 convolution window
+  constexpr static unsigned int kernel_cols = 3;
+
+  constexpr static unsigned int stride_rows = 2;  // stride 2 in both spatial dimensions
+  constexpr static unsigned int stride_cols = 2;
+
+  a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst(const CPUInfo *) : Parent(2, 2, 3, 3, 2, 2) {}  // presumably (out_rows, out_cols, kernel_rows, kernel_cols, stride_rows, stride_cols) -- confirm against DepthwiseDepthfirstStrategy
+
+  arm_gemm::VLType get_vl_type(void) const override { return arm_gemm::VLType::None; }  // no scalable vector length (fixed-width NEON kernel)
+
+  Parent::KernelType kernel = a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl;  // hand-written A64 kernel declared above
+  Parent::KernelType get_kernel(void) const override { return kernel; }
+  unsigned int get_accumulator_depth_vl(void) const override { return 2; }  // accumulator depth in vector lengths -- NOTE(review): confirm meaning in base strategy
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp
new file mode 100644
index 0000000000..c4184622b0
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp
@@ -0,0 +1,1397 @@
+/*
+ * Copyright (c) 2021-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_gemm.hpp"
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(
+ const unsigned int n_channels,
+ const uint8_t *const *const inptrs,
+ const uint8_t *const weights,
+ const int32_t *const bias,
+ const arm_gemm::Requantize32 &qp,
+ const int32_t *const requant_muls,
+ const int32_t *const requant_shifts,
+ uint8_t *const *const outptrs
+)
+{
+ struct Params
+ {
+ uint64_t n_channels;
+ const void *weights;
+ const int32_t *bias;
+ const arm_gemm::Requantize32 *requant;
+ const int32_t *const requant_muls;
+ const int32_t *const requant_shifts;
+ uint8_t *const *const outptrs;
+ const uint8_t *inptrs[25];
+
+ Params(
+ long unsigned int n_channels,
+ const uint8_t *const *inptrs_raw,
+ const void *const weights,
+ const int32_t *const bias,
+ const arm_gemm::Requantize32 &qp,
+ const int32_t *const requant_muls,
+ const int32_t *const requant_shifts,
+ uint8_t *const *outptrs
+ ) : n_channels(n_channels), weights(weights), bias(bias),
+ requant(&qp), requant_muls(requant_muls),
+ requant_shifts(requant_shifts), outptrs(outptrs)
+ {
+ inptrs[0] = inptrs_raw[12];
+ inptrs[1] = inptrs_raw[0];
+ inptrs[2] = inptrs_raw[1];
+ inptrs[3] = inptrs_raw[3];
+ inptrs[4] = inptrs_raw[4];
+ inptrs[5] = inptrs_raw[5];
+ inptrs[6] = inptrs_raw[6];
+ inptrs[7] = inptrs_raw[2];
+ inptrs[8] = inptrs_raw[8];
+ inptrs[9] = inptrs_raw[9];
+ inptrs[10] = inptrs_raw[7];
+ inptrs[11] = inptrs_raw[15];
+ inptrs[12] = inptrs_raw[10];
+ inptrs[13] = inptrs_raw[16];
+ inptrs[14] = inptrs_raw[11];
+ inptrs[15] = inptrs_raw[18];
+ inptrs[16] = inptrs_raw[13];
+ inptrs[17] = inptrs_raw[19];
+ inptrs[18] = inptrs_raw[20];
+ inptrs[19] = inptrs_raw[14];
+ inptrs[20] = inptrs_raw[21];
+ inptrs[21] = inptrs_raw[17];
+ inptrs[22] = inptrs_raw[23];
+ inptrs[23] = inptrs_raw[22];
+ inptrs[24] = inptrs_raw[24];
+
+ }
+ };
+
+ const Params params(n_channels, inptrs, weights, bias, qp,
+ requant_muls, requant_shifts, outptrs);
+
+ __asm__ __volatile__(
+ "ldr x7, [%x[params], %[offsetof_Params_n_channels]]\n"
+ "ldr x23, [%x[params], %[offsetof_Params_requant]]\n"
+ "lsr x8, x7, #0x3\n"
+ "add x20, x23, %[offsetof_Requantize32_a_offset]\n"
+ "ld1r { v6.16b }, [x20]\n"
+ "ldr x22, [%x[params], %[offsetof_Params_outptrs]]\n"
+ "add x21, x23, %[offsetof_Requantize32_b_offset]\n"
+ "add x20, x23, %[offsetof_Requantize32_c_offset]\n"
+ "ld1r { v15.16b }, [x21]\n"
+ "ld1r { v13.8h }, [x20]\n"
+ "add x21, x23, %[offsetof_Requantize32_minval]\n"
+ "add x20, x23, %[offsetof_Requantize32_maxval]\n"
+ "ld1r { v17.8h }, [x21]\n"
+ "ld1r { v24.8h }, [x20]\n"
+ "mov x17, #0x0\n"
+ "mov x16, #0x0\n"
+ "add x15, %x[params], %[offsetof_Params_inptrs]\n"
+ "ldr x14, [%x[params], %[offsetof_Params_weights]]\n"
+ "ldr x13, [%x[params], %[offsetof_Params_requant_muls]]\n"
+ "ldr x12, [%x[params], %[offsetof_Params_requant_shifts]]\n"
+ "ldp x11, x10, [x22, #0x0]\n"
+ "ldp x9, x28, [x22, #0x10]\n"
+ "cbz x8, 3f\n"
+ "ldr d11, [x14, #0x0]\n"
+ "ldr d22, [x14, #0x8]\n"
+ "subs x8, x8, #0x1\n"
+ "usubl v11.8h, v11.8b, v15.8b\n"
+ "ldr d14, [x14, #0x10]\n"
+ "ldr d28, [x14, #0x18]\n"
+ "usubl v22.8h, v22.8b, v15.8b\n"
+ "usubl v14.8h, v14.8b, v15.8b\n"
+ "ldr d18, [x14, #0x20]\n"
+ "ldr d9, [x14, #0x28]\n"
+ "usubl v28.8h, v28.8b, v15.8b\n"
+ "usubl v18.8h, v18.8b, v15.8b\n"
+ "ldr d26, [x14, #0x30]\n"
+ "ldr d7, [x14, #0x38]\n"
+ "usubl v9.8h, v9.8b, v15.8b\n"
+ "usubl v26.8h, v26.8b, v15.8b\n"
+ "ldr d4, [x14, #0x40]\n"
+ "ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "usubl v7.8h, v7.8b, v15.8b\n"
+ "usubl v4.8h, v4.8b, v15.8b\n"
+ "ldr q5, [x20, #0x0]\n"
+ "ldr q3, [x20, #0x10]\n"
+ "add x20, x20, #0x20\n"
+ "str x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldp x27, x26, [x15, #0x0]\n"
+ "ldp x25, x24, [x15, #0x10]\n"
+ "mov v21.16b, v5.16b\n"
+ "mov v8.16b, v3.16b\n"
+ "ldp x23, x22, [x15, #0x20]\n"
+ "ldp x21, x20, [x15, #0x30]\n"
+ "mov v20.16b, v5.16b\n"
+ "mov v0.16b, v3.16b\n"
+ "ldr d25, [x27, x17]\n"
+ "ldr d27, [x26, x17]\n"
+ "mov v19.16b, v5.16b\n"
+ "mov v31.16b, v3.16b\n"
+ "ldr d1, [x25, x17]\n"
+ "ldr d2, [x24, x17]\n"
+ "usubl v25.8h, v25.8b, v6.8b\n"
+ "usubl v27.8h, v27.8b, v6.8b\n"
+ "ldr d12, [x23, x17]\n"
+ "ldr d16, [x22, x17]\n"
+ "usubl v1.8h, v1.8b, v6.8b\n"
+ "usubl v2.8h, v2.8b, v6.8b\n"
+ "ldr d23, [x21, x17]\n"
+ "ldr d10, [x20, x17]\n"
+ "usubl v12.8h, v12.8b, v6.8b\n"
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "usubl v23.8h, v23.8b, v6.8b\n"
+ "usubl v10.8h, v10.8b, v6.8b\n"
+ "beq 2f\n"
+ "1:" // Loop
+ "ldr q30, [x13, #0x0]\n"
+ "ldr q29, [x12, #0x0]\n"
+ "smlal v5.4s, v25.4h, v4.4h\n"
+ "smlal2 v3.4s, v25.8h, v4.8h\n"
+ "ldr x21, [x15, #0x58]\n"
+ "ldr x20, [x15, #0x78]\n"
+ "smlal v5.4s, v27.4h, v11.4h\n"
+ "smlal v21.4s, v25.4h, v26.4h\n"
+ "ldr x25, [x15, #0x60]\n"
+ "ldr x24, [x15, #0x80]\n"
+ "smlal v20.4s, v25.4h, v14.4h\n"
+ "smlal v19.4s, v25.4h, v11.4h\n"
+ "smlal2 v3.4s, v27.8h, v11.8h\n"
+ "ldr d27, [x21, x17]\n"
+ "usubl v27.8h, v27.8b, v6.8b\n"
+ "smlal v5.4s, v1.4h, v22.4h\n"
+ "smlal2 v8.4s, v25.8h, v26.8h\n"
+ "smlal2 v0.4s, v25.8h, v14.8h\n"
+ "ldr x23, [x15, #0x68]\n"
+ "ldr x22, [x15, #0x88]\n"
+ "smlal2 v31.4s, v25.8h, v11.8h\n"
+ "ldr d25, [x20, x17]\n"
+ "usubl v25.8h, v25.8b, v6.8b\n"
+ "smlal v21.4s, v2.4h, v22.4h\n"
+ "smlal v20.4s, v27.4h, v28.4h\n"
+ "smlal v19.4s, v25.4h, v18.4h\n"
+ "ldr x21, [x15, #0x40]\n"
+ "ldr x20, [x15, #0x70]\n"
+ "smlal2 v3.4s, v1.8h, v22.8h\n"
+ "ldr d1, [x25, x17]\n"
+ "usubl v1.8h, v1.8b, v6.8b\n"
+ "smlal v5.4s, v16.4h, v28.4h\n"
+ "smlal2 v8.4s, v2.8h, v22.8h\n"
+ "ldr d2, [x24, x17]\n"
+ "usubl v2.8h, v2.8b, v6.8b\n"
+ "smlal2 v0.4s, v27.8h, v28.8h\n"
+ "ldr d27, [x23, x17]\n"
+ "smlal2 v31.4s, v25.8h, v18.8h\n"
+ "ldr d25, [x22, x17]\n"
+ "smlal v21.4s, v12.4h, v14.4h\n"
+ "ldr x25, [x15, #0x98]\n"
+ "smlal v20.4s, v1.4h, v11.4h\n"
+ "smlal v19.4s, v2.4h, v22.4h\n"
+ "ldr x24, [x15, #0x50]\n"
+ "smlal2 v3.4s, v16.8h, v28.8h\n"
+ "ldr d16, [x21, x17]\n"
+ "usubl v27.8h, v27.8b, v6.8b\n"
+ "smlal v5.4s, v23.4h, v18.4h\n"
+ "usubl v25.8h, v25.8b, v6.8b\n"
+ "smlal2 v8.4s, v12.8h, v14.8h\n"
+ "ldr d12, [x20, x17]\n"
+ "ldr x23, [x15, #0x48]\n"
+ "smlal2 v0.4s, v1.8h, v11.8h\n"
+ "smlal2 v31.4s, v2.8h, v22.8h\n"
+ "ldr x21, [x15, #0x90]\n"
+ "ldr x20, [x15, #0xa8]\n"
+ "smlal v21.4s, v10.4h, v11.4h\n"
+ "smlal v20.4s, v27.4h, v18.4h\n"
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x22, [x15, #0xa0]\n"
+ "smlal v19.4s, v25.4h, v9.4h\n"
+ "smlal2 v3.4s, v23.8h, v18.8h\n"
+ "ldr d23, [x25, x17]\n"
+ "usubl v12.8h, v12.8b, v6.8b\n"
+ "usubl v23.8h, v23.8b, v6.8b\n"
+ "smlal v5.4s, v10.4h, v14.4h\n"
+ "smlal2 v8.4s, v10.8h, v11.8h\n"
+ "ldr d11, [x24, x17]\n"
+ "usubl v11.8h, v11.8b, v6.8b\n"
+ "smlal2 v0.4s, v27.8h, v18.8h\n"
+ "ldr d27, [x23, x17]\n"
+ "smlal2 v31.4s, v25.8h, v9.8h\n"
+ "ldr d25, [x21, x17]\n"
+ "ldr x21, [x15, #0xb0]\n"
+ "smlal v21.4s, v16.4h, v18.4h\n"
+ "smlal v20.4s, v12.4h, v22.4h\n"
+ "smlal v19.4s, v23.4h, v14.4h\n"
+ "smlal2 v3.4s, v10.8h, v14.8h\n"
+ "ldr d10, [x20, x17]\n"
+ "usubl v27.8h, v27.8b, v6.8b\n"
+ "usubl v25.8h, v25.8b, v6.8b\n"
+ "usubl v10.8h, v10.8b, v6.8b\n"
+ "smlal v5.4s, v11.4h, v9.4h\n"
+ "ldr x20, [x15, #0xb8]\n"
+ "smlal2 v8.4s, v16.8h, v18.8h\n"
+ "ldr d18, [x22, x17]\n"
+ "ldr d16, [x21, x17]\n"
+ "smlal2 v0.4s, v12.8h, v22.8h\n"
+ "ldr d22, [x20, x17]\n"
+ "smlal2 v31.4s, v23.8h, v14.8h\n"
+ "ldr q14, [x13, #0x10]\n"
+ "smlal v21.4s, v27.4h, v9.4h\n"
+ "smlal v20.4s, v25.4h, v26.4h\n"
+ "smlal v19.4s, v10.4h, v28.4h\n"
+ "usubl v18.8h, v18.8b, v6.8b\n"
+ "ldr x21, [x15, #0xc0]\n"
+ "smlal2 v3.4s, v11.8h, v9.8h\n"
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "smlal v5.4s, v1.4h, v26.4h\n"
+ "ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "smlal2 v8.4s, v27.8h, v9.8h\n"
+ "ldr d27, [x21, x17]\n"
+ "smlal2 v0.4s, v25.8h, v26.8h\n"
+ "ldr q25, [x12, #0x10]\n"
+ "smlal2 v31.4s, v10.8h, v28.8h\n"
+ "smlal v21.4s, v11.4h, v28.4h\n"
+ "usubl v22.8h, v22.8b, v6.8b\n"
+ "add x14, x14, #0x48\n"
+ "smlal v20.4s, v18.4h, v7.4h\n"
+ "smlal v19.4s, v16.4h, v7.4h\n"
+ "usubl v27.8h, v27.8b, v6.8b\n"
+ "add x17, x17, #0x8\n"
+ "smlal2 v3.4s, v1.8h, v26.8h\n"
+ "smlal v5.4s, v12.4h, v7.4h\n"
+ "sqrdmulh v5.4s, v5.4s, v30.4s\n"
+ "subs x8, x8, #0x1\n"
+ "smlal2 v8.4s, v11.8h, v28.8h\n"
+ "smlal2 v0.4s, v18.8h, v7.8h\n"
+ "and v28.16b, v5.16b, v29.16b\n"
+ "add x13, x13, #0x20\n"
+ "smlal2 v31.4s, v16.8h, v7.8h\n"
+ "smlal v21.4s, v2.4h, v7.4h\n"
+ "sshr v28.4s, v28.4s, #0x1f\n"
+ "add x12, x12, #0x20\n"
+ "smlal v20.4s, v10.4h, v9.4h\n"
+ "smlal v19.4s, v22.4h, v26.4h\n"
+ "sqadd v5.4s, v5.4s, v28.4s\n"
+ "smlal2 v3.4s, v12.8h, v7.8h\n"
+ "smlal2 v8.4s, v2.8h, v7.8h\n"
+ "sqrdmulh v3.4s, v3.4s, v14.4s\n"
+ "smlal2 v0.4s, v10.8h, v9.8h\n"
+ "smlal2 v31.4s, v22.8h, v26.8h\n"
+ "and v16.16b, v3.16b, v25.16b\n"
+ "smlal v21.4s, v23.4h, v4.4h\n"
+ "smlal v20.4s, v22.4h, v4.4h\n"
+ "sqrdmulh v21.4s, v21.4s, v30.4s\n"
+ "smlal v19.4s, v27.4h, v4.4h\n"
+ "smlal2 v8.4s, v23.8h, v4.8h\n"
+ "sqrdmulh v20.4s, v20.4s, v30.4s\n"
+ "smlal2 v0.4s, v22.8h, v4.8h\n"
+ "smlal2 v31.4s, v27.8h, v4.8h\n"
+ "sqrdmulh v19.4s, v19.4s, v30.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "and v12.16b, v21.16b, v29.16b\n"
+ "sqrdmulh v8.4s, v8.4s, v14.4s\n"
+ "and v23.16b, v20.16b, v29.16b\n"
+ "sqrdmulh v0.4s, v0.4s, v14.4s\n"
+ "and v9.16b, v19.16b, v29.16b\n"
+ "sqrdmulh v31.4s, v31.4s, v14.4s\n"
+ "sqadd v3.4s, v3.4s, v16.4s\n"
+ "sshr v12.4s, v12.4s, #0x1f\n"
+ "and v18.16b, v8.16b, v25.16b\n"
+ "sshr v23.4s, v23.4s, #0x1f\n"
+ "and v22.16b, v0.16b, v25.16b\n"
+ "sshr v9.4s, v9.4s, #0x1f\n"
+ "and v16.16b, v31.16b, v25.16b\n"
+ "sqadd v21.4s, v21.4s, v12.4s\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sqadd v20.4s, v20.4s, v23.4s\n"
+ "sshr v22.4s, v22.4s, #0x1f\n"
+ "sqadd v19.4s, v19.4s, v9.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "srshl v5.4s, v5.4s, v29.4s\n"
+ "srshl v21.4s, v21.4s, v29.4s\n"
+ "sqadd v8.4s, v8.4s, v18.4s\n"
+ "srshl v20.4s, v20.4s, v29.4s\n"
+ "sqadd v0.4s, v0.4s, v22.4s\n"
+ "srshl v19.4s, v19.4s, v29.4s\n"
+ "sqadd v31.4s, v31.4s, v16.4s\n"
+ "srshl v3.4s, v3.4s, v25.4s\n"
+ "sqxtn v5.4h, v5.4s\n"
+ "srshl v8.4s, v8.4s, v25.4s\n"
+ "sqxtn v21.4h, v21.4s\n"
+ "srshl v0.4s, v0.4s, v25.4s\n"
+ "sqxtn v20.4h, v20.4s\n"
+ "srshl v31.4s, v31.4s, v25.4s\n"
+ "sqxtn v19.4h, v19.4s\n"
+ "sqxtn2 v5.8h, v3.4s\n"
+ "sqxtn2 v21.8h, v8.4s\n"
+ "sqxtn2 v20.8h, v0.4s\n"
+ "sqxtn2 v19.8h, v31.4s\n"
+ "sqadd v5.8h, v5.8h, v13.8h\n"
+ "sqadd v21.8h, v21.8h, v13.8h\n"
+ "sqadd v20.8h, v20.8h, v13.8h\n"
+ "sqadd v19.8h, v19.8h, v13.8h\n"
+ "smax v5.8h, v5.8h, v17.8h\n"
+ "smax v21.8h, v21.8h, v17.8h\n"
+ "smax v20.8h, v20.8h, v17.8h\n"
+ "smax v19.8h, v19.8h, v17.8h\n"
+ "smin v5.8h, v5.8h, v24.8h\n"
+ "smin v21.8h, v21.8h, v24.8h\n"
+ "smin v20.8h, v20.8h, v24.8h\n"
+ "smin v19.8h, v19.8h, v24.8h\n"
+ "uzp1 v5.16b, v5.16b, v5.16b\n"
+ "str d5, [x11, x16]\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "str d21, [x10, x16]\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "str d20, [x9, x16]\n"
+ "str d19, [x28, x16]\n"
+ "ldr q5, [x20, #0x0]\n"
+ "ldr q3, [x20, #0x10]\n"
+ "add x20, x20, #0x20\n"
+ "ldr d11, [x14, #0x0]\n"
+ "ldr d22, [x14, #0x8]\n"
+ "add x16, x16, #0x8\n"
+ "str x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldr d14, [x14, #0x10]\n"
+ "ldr d28, [x14, #0x18]\n"
+ "mov v21.16b, v5.16b\n"
+ "mov v8.16b, v3.16b\n"
+ "ldr d18, [x14, #0x20]\n"
+ "ldr d9, [x14, #0x28]\n"
+ "mov v20.16b, v5.16b\n"
+ "mov v0.16b, v3.16b\n"
+ "ldr d26, [x14, #0x30]\n"
+ "ldr d7, [x14, #0x38]\n"
+ "mov v19.16b, v5.16b\n"
+ "mov v31.16b, v3.16b\n"
+ "ldr d4, [x14, #0x40]\n"
+ "ldp x27, x26, [x15, #0x0]\n"
+ "usubl v11.8h, v11.8b, v15.8b\n"
+ "usubl v22.8h, v22.8b, v15.8b\n"
+ "ldp x25, x24, [x15, #0x10]\n"
+ "ldp x23, x22, [x15, #0x20]\n"
+ "usubl v14.8h, v14.8b, v15.8b\n"
+ "usubl v28.8h, v28.8b, v15.8b\n"
+ "ldp x21, x20, [x15, #0x30]\n"
+ "ldr d25, [x27, x17]\n"
+ "usubl v18.8h, v18.8b, v15.8b\n"
+ "usubl v9.8h, v9.8b, v15.8b\n"
+ "ldr d27, [x26, x17]\n"
+ "ldr d1, [x25, x17]\n"
+ "usubl v26.8h, v26.8b, v15.8b\n"
+ "usubl v7.8h, v7.8b, v15.8b\n"
+ "ldr d2, [x24, x17]\n"
+ "ldr d12, [x23, x17]\n"
+ "usubl v4.8h, v4.8b, v15.8b\n"
+ "usubl v25.8h, v25.8b, v6.8b\n"
+ "ldr d16, [x22, x17]\n"
+ "ldr d23, [x21, x17]\n"
+ "usubl v27.8h, v27.8b, v6.8b\n"
+ "usubl v1.8h, v1.8b, v6.8b\n"
+ "ldr d10, [x20, x17]\n"
+ "usubl v2.8h, v2.8b, v6.8b\n"
+ "usubl v12.8h, v12.8b, v6.8b\n"
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "usubl v23.8h, v23.8b, v6.8b\n"
+ "usubl v10.8h, v10.8b, v6.8b\n"
+ "bgt 1b\n"
+ "2:" // Tail
+ "ldr q29, [x13, #0x0]\n"
+ "ldr q30, [x12, #0x0]\n"
+ "smlal v5.4s, v25.4h, v4.4h\n"
+ "smlal2 v3.4s, v25.8h, v4.8h\n"
+ "ldr x21, [x15, #0x58]\n"
+ "ldr x20, [x15, #0x78]\n"
+ "smlal v5.4s, v27.4h, v11.4h\n"
+ "smlal v21.4s, v25.4h, v26.4h\n"
+ "ldr x25, [x15, #0x60]\n"
+ "ldr x24, [x15, #0x80]\n"
+ "smlal v20.4s, v25.4h, v14.4h\n"
+ "smlal v19.4s, v25.4h, v11.4h\n"
+ "smlal2 v3.4s, v27.8h, v11.8h\n"
+ "ldr d27, [x21, x17]\n"
+ "usubl v27.8h, v27.8b, v6.8b\n"
+ "smlal v5.4s, v1.4h, v22.4h\n"
+ "smlal2 v8.4s, v25.8h, v26.8h\n"
+ "smlal2 v0.4s, v25.8h, v14.8h\n"
+ "ldr x23, [x15, #0x68]\n"
+ "ldr x22, [x15, #0x88]\n"
+ "smlal2 v31.4s, v25.8h, v11.8h\n"
+ "ldr d25, [x20, x17]\n"
+ "usubl v25.8h, v25.8b, v6.8b\n"
+ "smlal v21.4s, v2.4h, v22.4h\n"
+ "smlal v20.4s, v27.4h, v28.4h\n"
+ "smlal v19.4s, v25.4h, v18.4h\n"
+ "ldr x21, [x15, #0x40]\n"
+ "ldr x20, [x15, #0x70]\n"
+ "smlal2 v3.4s, v1.8h, v22.8h\n"
+ "ldr d1, [x25, x17]\n"
+ "usubl v1.8h, v1.8b, v6.8b\n"
+ "smlal v5.4s, v16.4h, v28.4h\n"
+ "smlal2 v8.4s, v2.8h, v22.8h\n"
+ "ldr d2, [x24, x17]\n"
+ "usubl v2.8h, v2.8b, v6.8b\n"
+ "smlal2 v0.4s, v27.8h, v28.8h\n"
+ "ldr d27, [x23, x17]\n"
+ "smlal2 v31.4s, v25.8h, v18.8h\n"
+ "ldr d25, [x22, x17]\n"
+ "smlal v21.4s, v12.4h, v14.4h\n"
+ "ldr x25, [x15, #0x98]\n"
+ "smlal v20.4s, v1.4h, v11.4h\n"
+ "smlal v19.4s, v2.4h, v22.4h\n"
+ "ldr x24, [x15, #0x50]\n"
+ "smlal2 v3.4s, v16.8h, v28.8h\n"
+ "ldr d16, [x21, x17]\n"
+ "usubl v27.8h, v27.8b, v6.8b\n"
+ "smlal v5.4s, v23.4h, v18.4h\n"
+ "usubl v25.8h, v25.8b, v6.8b\n"
+ "smlal2 v8.4s, v12.8h, v14.8h\n"
+ "ldr d12, [x20, x17]\n"
+ "ldr x23, [x15, #0x48]\n"
+ "smlal2 v0.4s, v1.8h, v11.8h\n"
+ "smlal2 v31.4s, v2.8h, v22.8h\n"
+ "ldr x21, [x15, #0x90]\n"
+ "ldr x20, [x15, #0xa8]\n"
+ "smlal v21.4s, v10.4h, v11.4h\n"
+ "smlal v20.4s, v27.4h, v18.4h\n"
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x22, [x15, #0xa0]\n"
+ "smlal v19.4s, v25.4h, v9.4h\n"
+ "smlal2 v3.4s, v23.8h, v18.8h\n"
+ "ldr d23, [x25, x17]\n"
+ "usubl v12.8h, v12.8b, v6.8b\n"
+ "usubl v23.8h, v23.8b, v6.8b\n"
+ "smlal v5.4s, v10.4h, v14.4h\n"
+ "smlal2 v8.4s, v10.8h, v11.8h\n"
+ "ldr d11, [x24, x17]\n"
+ "usubl v11.8h, v11.8b, v6.8b\n"
+ "smlal2 v0.4s, v27.8h, v18.8h\n"
+ "ldr d27, [x23, x17]\n"
+ "smlal2 v31.4s, v25.8h, v9.8h\n"
+ "ldr d25, [x21, x17]\n"
+ "ldr x21, [x15, #0xb0]\n"
+ "smlal v21.4s, v16.4h, v18.4h\n"
+ "smlal v20.4s, v12.4h, v22.4h\n"
+ "smlal v19.4s, v23.4h, v14.4h\n"
+ "smlal2 v3.4s, v10.8h, v14.8h\n"
+ "ldr d10, [x20, x17]\n"
+ "usubl v27.8h, v27.8b, v6.8b\n"
+ "usubl v25.8h, v25.8b, v6.8b\n"
+ "usubl v10.8h, v10.8b, v6.8b\n"
+ "smlal v5.4s, v11.4h, v9.4h\n"
+ "ldr x20, [x15, #0xb8]\n"
+ "smlal2 v8.4s, v16.8h, v18.8h\n"
+ "ldr d16, [x22, x17]\n"
+ "ldr d18, [x21, x17]\n"
+ "smlal2 v0.4s, v12.8h, v22.8h\n"
+ "ldr d22, [x20, x17]\n"
+ "smlal2 v31.4s, v23.8h, v14.8h\n"
+ "ldr q14, [x13, #0x10]\n"
+ "smlal v21.4s, v27.4h, v9.4h\n"
+ "smlal v20.4s, v25.4h, v26.4h\n"
+ "smlal v19.4s, v10.4h, v28.4h\n"
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x20, [x15, #0xc0]\n"
+ "smlal2 v3.4s, v11.8h, v9.8h\n"
+ "usubl v18.8h, v18.8b, v6.8b\n"
+ "smlal v5.4s, v1.4h, v26.4h\n"
+ "tst x7, #0x7\n"
+ "smlal2 v8.4s, v27.8h, v9.8h\n"
+ "ldr d27, [x20, x17]\n"
+ "smlal2 v0.4s, v25.8h, v26.8h\n"
+ "ldr q25, [x12, #0x10]\n"
+ "smlal2 v31.4s, v10.8h, v28.8h\n"
+ "smlal v21.4s, v11.4h, v28.4h\n"
+ "usubl v22.8h, v22.8b, v6.8b\n"
+ "add x17, x17, #0x8\n"
+ "smlal v20.4s, v16.4h, v7.4h\n"
+ "smlal v19.4s, v18.4h, v7.4h\n"
+ "usubl v27.8h, v27.8b, v6.8b\n"
+ "add x13, x13, #0x20\n"
+ "smlal2 v3.4s, v1.8h, v26.8h\n"
+ "smlal v5.4s, v12.4h, v7.4h\n"
+ "sqrdmulh v5.4s, v5.4s, v29.4s\n"
+ "add x12, x12, #0x20\n"
+ "smlal2 v8.4s, v11.8h, v28.8h\n"
+ "smlal2 v0.4s, v16.8h, v7.8h\n"
+ "and v16.16b, v5.16b, v30.16b\n"
+ "smlal2 v31.4s, v18.8h, v7.8h\n"
+ "smlal v21.4s, v2.4h, v7.4h\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "smlal v20.4s, v10.4h, v9.4h\n"
+ "smlal v19.4s, v22.4h, v26.4h\n"
+ "sqadd v5.4s, v5.4s, v16.4s\n"
+ "smlal2 v3.4s, v12.8h, v7.8h\n"
+ "smlal2 v8.4s, v2.8h, v7.8h\n"
+ "sqrdmulh v3.4s, v3.4s, v14.4s\n"
+ "smlal2 v0.4s, v10.8h, v9.8h\n"
+ "smlal2 v31.4s, v22.8h, v26.8h\n"
+ "and v16.16b, v3.16b, v25.16b\n"
+ "smlal v21.4s, v23.4h, v4.4h\n"
+ "smlal v20.4s, v22.4h, v4.4h\n"
+ "sqrdmulh v21.4s, v21.4s, v29.4s\n"
+ "smlal v19.4s, v27.4h, v4.4h\n"
+ "smlal2 v8.4s, v23.8h, v4.8h\n"
+ "sqrdmulh v20.4s, v20.4s, v29.4s\n"
+ "smlal2 v0.4s, v22.8h, v4.8h\n"
+ "smlal2 v31.4s, v27.8h, v4.8h\n"
+ "sqrdmulh v19.4s, v19.4s, v29.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "and v23.16b, v21.16b, v30.16b\n"
+ "sqrdmulh v8.4s, v8.4s, v14.4s\n"
+ "and v27.16b, v20.16b, v30.16b\n"
+ "sqrdmulh v0.4s, v0.4s, v14.4s\n"
+ "and v22.16b, v19.16b, v30.16b\n"
+ "sqrdmulh v31.4s, v31.4s, v14.4s\n"
+ "sqadd v3.4s, v3.4s, v16.4s\n"
+ "sshr v23.4s, v23.4s, #0x1f\n"
+ "and v14.16b, v8.16b, v25.16b\n"
+ "sshr v27.4s, v27.4s, #0x1f\n"
+ "and v18.16b, v0.16b, v25.16b\n"
+ "sshr v22.4s, v22.4s, #0x1f\n"
+ "and v16.16b, v31.16b, v25.16b\n"
+ "sqadd v21.4s, v21.4s, v23.4s\n"
+ "sshr v14.4s, v14.4s, #0x1f\n"
+ "sqadd v20.4s, v20.4s, v27.4s\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sqadd v19.4s, v19.4s, v22.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "srshl v5.4s, v5.4s, v30.4s\n"
+ "srshl v21.4s, v21.4s, v30.4s\n"
+ "sqadd v8.4s, v8.4s, v14.4s\n"
+ "srshl v20.4s, v20.4s, v30.4s\n"
+ "sqadd v0.4s, v0.4s, v18.4s\n"
+ "srshl v19.4s, v19.4s, v30.4s\n"
+ "sqadd v31.4s, v31.4s, v16.4s\n"
+ "srshl v3.4s, v3.4s, v25.4s\n"
+ "sqxtn v5.4h, v5.4s\n"
+ "srshl v8.4s, v8.4s, v25.4s\n"
+ "sqxtn v21.4h, v21.4s\n"
+ "srshl v0.4s, v0.4s, v25.4s\n"
+ "sqxtn v20.4h, v20.4s\n"
+ "srshl v31.4s, v31.4s, v25.4s\n"
+ "sqxtn v19.4h, v19.4s\n"
+ "sqxtn2 v5.8h, v3.4s\n"
+ "sqxtn2 v21.8h, v8.4s\n"
+ "sqxtn2 v20.8h, v0.4s\n"
+ "sqxtn2 v19.8h, v31.4s\n"
+ "sqadd v5.8h, v5.8h, v13.8h\n"
+ "sqadd v21.8h, v21.8h, v13.8h\n"
+ "sqadd v20.8h, v20.8h, v13.8h\n"
+ "sqadd v19.8h, v19.8h, v13.8h\n"
+ "smax v5.8h, v5.8h, v17.8h\n"
+ "smax v21.8h, v21.8h, v17.8h\n"
+ "smax v20.8h, v20.8h, v17.8h\n"
+ "smax v19.8h, v19.8h, v17.8h\n"
+ "smin v5.8h, v5.8h, v24.8h\n"
+ "smin v21.8h, v21.8h, v24.8h\n"
+ "smin v20.8h, v20.8h, v24.8h\n"
+ "smin v19.8h, v19.8h, v24.8h\n"
+ "uzp1 v5.16b, v5.16b, v5.16b\n"
+ "str d5, [x11, x16]\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "str d21, [x10, x16]\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "str d20, [x9, x16]\n"
+ "str d19, [x28, x16]\n"
+ "add x16, x16, #0x8\n"
+ "beq 88f\n"
+ "add x14, x14, #0x48\n"
+ "3:" // Oddments
+ "ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "tbz x7, #2, 5f\n"
+ "ld1 { v5.4s }, [x20], #0x10\n"
+ "tbz x7, #1, 4f\n"
+ "ld1 { v3.d }[0], [x20], #0x8\n"
+ "tbz x7, #0, 7f\n"
+ "ld1 { v3.s }[2], [x20]\n"
+ "b 7f\n"
+ "4:" // Oddments: Load bias: Bit 2: Bit 1: Unset
+ "tbz x7, #0, 7f\n"
+ "ld1 { v3.s }[0], [x20]\n"
+ "b 7f\n"
+ "5:" // Oddments: Load bias: Bit 2: Unset
+ "tbz x7, #1, 6f\n"
+ "ld1 { v5.d }[0], [x20], #0x8\n"
+ "tbz x7, #0, 7f\n"
+ "ld1 { v5.s }[2], [x20]\n"
+ "b 7f\n"
+ "6:" // Oddments: Load bias: Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 7f\n"
+ "ld1 { v5.s }[0], [x20]\n"
+ "7:" // Oddments: Load bias: Bit 2: End
+ "ldr d11, [x14, #0x0]\n"
+ "ldr d22, [x14, #0x8]\n"
+ "mov v21.16b, v5.16b\n"
+ "mov v8.16b, v3.16b\n"
+ "ldr d14, [x14, #0x10]\n"
+ "ldr d28, [x14, #0x18]\n"
+ "mov v20.16b, v5.16b\n"
+ "mov v0.16b, v3.16b\n"
+ "ldr d18, [x14, #0x20]\n"
+ "ldr d9, [x14, #0x28]\n"
+ "mov v19.16b, v5.16b\n"
+ "mov v31.16b, v3.16b\n"
+ "ldr d26, [x14, #0x30]\n"
+ "ldr d7, [x14, #0x38]\n"
+ "usubl v11.8h, v11.8b, v15.8b\n"
+ "usubl v22.8h, v22.8b, v15.8b\n"
+ "ldr d4, [x14, #0x40]\n"
+ "ldp x27, x26, [x15, #0x0]\n"
+ "usubl v14.8h, v14.8b, v15.8b\n"
+ "usubl v28.8h, v28.8b, v15.8b\n"
+ "ldp x25, x24, [x15, #0x10]\n"
+ "ldp x23, x22, [x15, #0x20]\n"
+ "usubl v18.8h, v18.8b, v15.8b\n"
+ "usubl v9.8h, v9.8b, v15.8b\n"
+ "ldp x21, x20, [x15, #0x30]\n"
+ "usubl v26.8h, v26.8b, v15.8b\n"
+ "usubl v7.8h, v7.8b, v15.8b\n"
+ "usubl v4.8h, v4.8b, v15.8b\n"
+ "add x27, x27, x17\n"
+ "add x26, x26, x17\n"
+ "add x25, x25, x17\n"
+ "add x24, x24, x17\n"
+ "add x23, x23, x17\n"
+ "add x22, x22, x17\n"
+ "add x21, x21, x17\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 9f\n"
+ "ld1 { v25.s }[0], [x27], #0x4\n"
+ "ld1 { v27.s }[0], [x26], #0x4\n"
+ "ld1 { v1.s }[0], [x25], #0x4\n"
+ "ld1 { v2.s }[0], [x24], #0x4\n"
+ "ld1 { v12.s }[0], [x23], #0x4\n"
+ "ld1 { v16.s }[0], [x22], #0x4\n"
+ "ld1 { v23.s }[0], [x21], #0x4\n"
+ "ld1 { v10.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 8f\n"
+ "ld1 { v25.h }[2], [x27], #0x2\n"
+ "ld1 { v27.h }[2], [x26], #0x2\n"
+ "ld1 { v1.h }[2], [x25], #0x2\n"
+ "ld1 { v2.h }[2], [x24], #0x2\n"
+ "ld1 { v12.h }[2], [x23], #0x2\n"
+ "ld1 { v16.h }[2], [x22], #0x2\n"
+ "ld1 { v23.h }[2], [x21], #0x2\n"
+ "ld1 { v10.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 11f\n"
+ "ld1 { v25.b }[6], [x27]\n"
+ "ld1 { v27.b }[6], [x26]\n"
+ "ld1 { v1.b }[6], [x25]\n"
+ "ld1 { v2.b }[6], [x24]\n"
+ "ld1 { v12.b }[6], [x23]\n"
+ "ld1 { v16.b }[6], [x22]\n"
+ "ld1 { v23.b }[6], [x21]\n"
+ "ld1 { v10.b }[6], [x20]\n"
+ "b 11f\n"
+ "8:" // Oddments: Initial loads: Bit 2: Bit 1: Unset
+ "tbz x7, #0, 11f\n"
+ "ld1 { v25.b }[4], [x27]\n"
+ "ld1 { v27.b }[4], [x26]\n"
+ "ld1 { v1.b }[4], [x25]\n"
+ "ld1 { v2.b }[4], [x24]\n"
+ "ld1 { v12.b }[4], [x23]\n"
+ "ld1 { v16.b }[4], [x22]\n"
+ "ld1 { v23.b }[4], [x21]\n"
+ "ld1 { v10.b }[4], [x20]\n"
+ "b 11f\n"
+ "9:" // Oddments: Initial loads: Bit 2: Unset
+ "tbz x7, #1, 10f\n"
+ "ld1 { v25.h }[0], [x27], #0x2\n"
+ "ld1 { v27.h }[0], [x26], #0x2\n"
+ "ld1 { v1.h }[0], [x25], #0x2\n"
+ "ld1 { v2.h }[0], [x24], #0x2\n"
+ "ld1 { v12.h }[0], [x23], #0x2\n"
+ "ld1 { v16.h }[0], [x22], #0x2\n"
+ "ld1 { v23.h }[0], [x21], #0x2\n"
+ "ld1 { v10.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 11f\n"
+ "ld1 { v25.b }[2], [x27]\n"
+ "ld1 { v27.b }[2], [x26]\n"
+ "ld1 { v1.b }[2], [x25]\n"
+ "ld1 { v2.b }[2], [x24]\n"
+ "ld1 { v12.b }[2], [x23]\n"
+ "ld1 { v16.b }[2], [x22]\n"
+ "ld1 { v23.b }[2], [x21]\n"
+ "ld1 { v10.b }[2], [x20]\n"
+ "b 11f\n"
+ "10:" // Oddments: Initial loads: Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 11f\n"
+ "ld1 { v25.b }[0], [x27]\n"
+ "ld1 { v27.b }[0], [x26]\n"
+ "ld1 { v1.b }[0], [x25]\n"
+ "ld1 { v2.b }[0], [x24]\n"
+ "ld1 { v12.b }[0], [x23]\n"
+ "ld1 { v16.b }[0], [x22]\n"
+ "ld1 { v23.b }[0], [x21]\n"
+ "ld1 { v10.b }[0], [x20]\n"
+ "11:" // Oddments: Initial loads: Bit 2: End
+ "usubl v25.8h, v25.8b, v6.8b\n"
+ "smlal v5.4s, v25.4h, v4.4h\n"
+ "smlal2 v3.4s, v25.8h, v4.8h\n"
+ "ldr x20, [x15, #0x40]\n"
+ "usubl v27.8h, v27.8b, v6.8b\n"
+ "smlal v5.4s, v27.4h, v11.4h\n"
+ "smlal2 v3.4s, v27.8h, v11.8h\n"
+ "usubl v1.8h, v1.8b, v6.8b\n"
+ "smlal v21.4s, v25.4h, v26.4h\n"
+ "smlal2 v8.4s, v25.8h, v26.8h\n"
+ "add x20, x20, x17\n"
+ "smlal v5.4s, v1.4h, v22.4h\n"
+ "smlal2 v3.4s, v1.8h, v22.8h\n"
+ "usubl v2.8h, v2.8b, v6.8b\n"
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "smlal v21.4s, v2.4h, v22.4h\n"
+ "smlal2 v8.4s, v2.8h, v22.8h\n"
+ "smlal v5.4s, v16.4h, v28.4h\n"
+ "smlal2 v3.4s, v16.8h, v28.8h\n"
+ "usubl v12.8h, v12.8b, v6.8b\n"
+ "usubl v23.8h, v23.8b, v6.8b\n"
+ "smlal v21.4s, v12.4h, v14.4h\n"
+ "smlal2 v8.4s, v12.8h, v14.8h\n"
+ "smlal v5.4s, v23.4h, v18.4h\n"
+ "smlal2 v3.4s, v23.8h, v18.8h\n"
+ "usubl v10.8h, v10.8b, v6.8b\n"
+ "smlal v20.4s, v25.4h, v14.4h\n"
+ "smlal2 v0.4s, v25.8h, v14.8h\n"
+ "smlal v19.4s, v25.4h, v11.4h\n"
+ "smlal2 v31.4s, v25.8h, v11.8h\n"
+ "smlal v5.4s, v10.4h, v14.4h\n"
+ "smlal2 v3.4s, v10.8h, v14.8h\n"
+ "smlal v21.4s, v10.4h, v11.4h\n"
+ "smlal2 v8.4s, v10.8h, v11.8h\n"
+ "tbz x7, #2, 13f\n"
+ "ld1 { v15.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 12f\n"
+ "ld1 { v15.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 15f\n"
+ "ld1 { v15.b }[6], [x20]\n"
+ "b 15f\n"
+ "12:" // Oddments: Load (1, 3): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 15f\n"
+ "ld1 { v15.b }[4], [x20]\n"
+ "b 15f\n"
+ "13:" // Oddments: Load (1, 3): Bit 2: Unset
+ "tbz x7, #1, 14f\n"
+ "ld1 { v15.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 15f\n"
+ "ld1 { v15.b }[2], [x20]\n"
+ "b 15f\n"
+ "14:" // Oddments: Load (1, 3): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 15f\n"
+ "ld1 { v15.b }[0], [x20]\n"
+ "15:" // Oddments: Load (1, 3): Bit 2: End
+ "usubl v15.8h, v15.8b, v6.8b\n"
+ "ldr x20, [x15, #0x48]\n"
+ "smlal v21.4s, v15.4h, v18.4h\n"
+ "smlal2 v8.4s, v15.8h, v18.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 17f\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 16f\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 19f\n"
+ "ld1 { v16.b }[6], [x20]\n"
+ "b 19f\n"
+ "16:" // Oddments: Load (1, 4): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 19f\n"
+ "ld1 { v16.b }[4], [x20]\n"
+ "b 19f\n"
+ "17:" // Oddments: Load (1, 4): Bit 2: Unset
+ "tbz x7, #1, 18f\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 19f\n"
+ "ld1 { v16.b }[2], [x20]\n"
+ "b 19f\n"
+ "18:" // Oddments: Load (1, 4): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 19f\n"
+ "ld1 { v16.b }[0], [x20]\n"
+ "19:" // Oddments: Load (1, 4): Bit 2: End
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x20, [x15, #0x50]\n"
+ "smlal v21.4s, v16.4h, v9.4h\n"
+ "smlal2 v8.4s, v16.8h, v9.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 21f\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 20f\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 23f\n"
+ "ld1 { v16.b }[6], [x20]\n"
+ "b 23f\n"
+ "20:" // Oddments: Load (1, 2): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 23f\n"
+ "ld1 { v16.b }[4], [x20]\n"
+ "b 23f\n"
+ "21:" // Oddments: Load (1, 2): Bit 2: Unset
+ "tbz x7, #1, 22f\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 23f\n"
+ "ld1 { v16.b }[2], [x20]\n"
+ "b 23f\n"
+ "22:" // Oddments: Load (1, 2): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 23f\n"
+ "ld1 { v16.b }[0], [x20]\n"
+ "23:" // Oddments: Load (1, 2): Bit 2: End
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x20, [x15, #0x58]\n"
+ "smlal v5.4s, v16.4h, v9.4h\n"
+ "smlal2 v3.4s, v16.8h, v9.8h\n"
+ "smlal v21.4s, v16.4h, v28.4h\n"
+ "smlal2 v8.4s, v16.8h, v28.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 25f\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 24f\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 27f\n"
+ "ld1 { v16.b }[6], [x20]\n"
+ "b 27f\n"
+ "24:" // Oddments: Load (3, 0): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 27f\n"
+ "ld1 { v16.b }[4], [x20]\n"
+ "b 27f\n"
+ "25:" // Oddments: Load (3, 0): Bit 2: Unset
+ "tbz x7, #1, 26f\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 27f\n"
+ "ld1 { v16.b }[2], [x20]\n"
+ "b 27f\n"
+ "26:" // Oddments: Load (3, 0): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 27f\n"
+ "ld1 { v16.b }[0], [x20]\n"
+ "27:" // Oddments: Load (3, 0): Bit 2: End
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x20, [x15, #0x60]\n"
+ "smlal v20.4s, v16.4h, v28.4h\n"
+ "smlal2 v0.4s, v16.8h, v28.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 29f\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 28f\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 31f\n"
+ "ld1 { v16.b }[6], [x20]\n"
+ "b 31f\n"
+ "28:" // Oddments: Load (2, 0): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 31f\n"
+ "ld1 { v16.b }[4], [x20]\n"
+ "b 31f\n"
+ "29:" // Oddments: Load (2, 0): Bit 2: Unset
+ "tbz x7, #1, 30f\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 31f\n"
+ "ld1 { v16.b }[2], [x20]\n"
+ "b 31f\n"
+ "30:" // Oddments: Load (2, 0): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 31f\n"
+ "ld1 { v16.b }[0], [x20]\n"
+ "31:" // Oddments: Load (2, 0): Bit 2: End
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x20, [x15, #0x68]\n"
+ "smlal v5.4s, v16.4h, v26.4h\n"
+ "smlal2 v3.4s, v16.8h, v26.8h\n"
+ "smlal v20.4s, v16.4h, v11.4h\n"
+ "smlal2 v0.4s, v16.8h, v11.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 33f\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 32f\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 35f\n"
+ "ld1 { v16.b }[6], [x20]\n"
+ "b 35f\n"
+ "32:" // Oddments: Load (3, 1): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 35f\n"
+ "ld1 { v16.b }[4], [x20]\n"
+ "b 35f\n"
+ "33:" // Oddments: Load (3, 1): Bit 2: Unset
+ "tbz x7, #1, 34f\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 35f\n"
+ "ld1 { v16.b }[2], [x20]\n"
+ "b 35f\n"
+ "34:" // Oddments: Load (3, 1): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 35f\n"
+ "ld1 { v16.b }[0], [x20]\n"
+ "35:" // Oddments: Load (3, 1): Bit 2: End
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x20, [x15, #0x70]\n"
+ "smlal v20.4s, v16.4h, v18.4h\n"
+ "smlal2 v0.4s, v16.8h, v18.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 37f\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 36f\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 39f\n"
+ "ld1 { v16.b }[6], [x20]\n"
+ "b 39f\n"
+ "36:" // Oddments: Load (2, 1): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 39f\n"
+ "ld1 { v16.b }[4], [x20]\n"
+ "b 39f\n"
+ "37:" // Oddments: Load (2, 1): Bit 2: Unset
+ "tbz x7, #1, 38f\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 39f\n"
+ "ld1 { v16.b }[2], [x20]\n"
+ "b 39f\n"
+ "38:" // Oddments: Load (2, 1): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 39f\n"
+ "ld1 { v16.b }[0], [x20]\n"
+ "39:" // Oddments: Load (2, 1): Bit 2: End
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x20, [x15, #0x78]\n"
+ "smlal v5.4s, v16.4h, v7.4h\n"
+ "smlal2 v3.4s, v16.8h, v7.8h\n"
+ "smlal v20.4s, v16.4h, v22.4h\n"
+ "smlal2 v0.4s, v16.8h, v22.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 41f\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 40f\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 43f\n"
+ "ld1 { v16.b }[6], [x20]\n"
+ "b 43f\n"
+ "40:" // Oddments: Load (3, 3): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 43f\n"
+ "ld1 { v16.b }[4], [x20]\n"
+ "b 43f\n"
+ "41:" // Oddments: Load (3, 3): Bit 2: Unset
+ "tbz x7, #1, 42f\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 43f\n"
+ "ld1 { v16.b }[2], [x20]\n"
+ "b 43f\n"
+ "42:" // Oddments: Load (3, 3): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 43f\n"
+ "ld1 { v16.b }[0], [x20]\n"
+ "43:" // Oddments: Load (3, 3): Bit 2: End
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x20, [x15, #0x80]\n"
+ "smlal v19.4s, v16.4h, v18.4h\n"
+ "smlal2 v31.4s, v16.8h, v18.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 45f\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 44f\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 47f\n"
+ "ld1 { v16.b }[6], [x20]\n"
+ "b 47f\n"
+ "44:" // Oddments: Load (2, 3): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 47f\n"
+ "ld1 { v16.b }[4], [x20]\n"
+ "b 47f\n"
+ "45:" // Oddments: Load (2, 3): Bit 2: Unset
+ "tbz x7, #1, 46f\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 47f\n"
+ "ld1 { v16.b }[2], [x20]\n"
+ "b 47f\n"
+ "46:" // Oddments: Load (2, 3): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 47f\n"
+ "ld1 { v16.b }[0], [x20]\n"
+ "47:" // Oddments: Load (2, 3): Bit 2: End
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x20, [x15, #0x88]\n"
+ "smlal v21.4s, v16.4h, v7.4h\n"
+ "smlal2 v8.4s, v16.8h, v7.8h\n"
+ "smlal v19.4s, v16.4h, v22.4h\n"
+ "smlal2 v31.4s, v16.8h, v22.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 49f\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 48f\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 51f\n"
+ "ld1 { v16.b }[6], [x20]\n"
+ "b 51f\n"
+ "48:" // Oddments: Load (3, 4): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 51f\n"
+ "ld1 { v16.b }[4], [x20]\n"
+ "b 51f\n"
+ "49:" // Oddments: Load (3, 4): Bit 2: Unset
+ "tbz x7, #1, 50f\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 51f\n"
+ "ld1 { v16.b }[2], [x20]\n"
+ "b 51f\n"
+ "50:" // Oddments: Load (3, 4): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 51f\n"
+ "ld1 { v16.b }[0], [x20]\n"
+ "51:" // Oddments: Load (3, 4): Bit 2: End
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x20, [x15, #0x90]\n"
+ "smlal v19.4s, v16.4h, v9.4h\n"
+ "smlal2 v31.4s, v16.8h, v9.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 53f\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 52f\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 55f\n"
+ "ld1 { v16.b }[6], [x20]\n"
+ "b 55f\n"
+ "52:" // Oddments: Load (4, 0): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 55f\n"
+ "ld1 { v16.b }[4], [x20]\n"
+ "b 55f\n"
+ "53:" // Oddments: Load (4, 0): Bit 2: Unset
+ "tbz x7, #1, 54f\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 55f\n"
+ "ld1 { v16.b }[2], [x20]\n"
+ "b 55f\n"
+ "54:" // Oddments: Load (4, 0): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 55f\n"
+ "ld1 { v16.b }[0], [x20]\n"
+ "55:" // Oddments: Load (4, 0): Bit 2: End
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x20, [x15, #0x98]\n"
+ "smlal v20.4s, v16.4h, v26.4h\n"
+ "smlal2 v0.4s, v16.8h, v26.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 57f\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 56f\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 59f\n"
+ "ld1 { v16.b }[6], [x20]\n"
+ "b 59f\n"
+ "56:" // Oddments: Load (2, 4): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 59f\n"
+ "ld1 { v16.b }[4], [x20]\n"
+ "b 59f\n"
+ "57:" // Oddments: Load (2, 4): Bit 2: Unset
+ "tbz x7, #1, 58f\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 59f\n"
+ "ld1 { v16.b }[2], [x20]\n"
+ "b 59f\n"
+ "58:" // Oddments: Load (2, 4): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 59f\n"
+ "ld1 { v16.b }[0], [x20]\n"
+ "59:" // Oddments: Load (2, 4): Bit 2: End
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x20, [x15, #0xa0]\n"
+ "smlal v21.4s, v16.4h, v4.4h\n"
+ "smlal2 v8.4s, v16.8h, v4.8h\n"
+ "smlal v19.4s, v16.4h, v14.4h\n"
+ "smlal2 v31.4s, v16.8h, v14.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 61f\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 60f\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 63f\n"
+ "ld1 { v16.b }[6], [x20]\n"
+ "b 63f\n"
+ "60:" // Oddments: Load (4, 1): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 63f\n"
+ "ld1 { v16.b }[4], [x20]\n"
+ "b 63f\n"
+ "61:" // Oddments: Load (4, 1): Bit 2: Unset
+ "tbz x7, #1, 62f\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 63f\n"
+ "ld1 { v16.b }[2], [x20]\n"
+ "b 63f\n"
+ "62:" // Oddments: Load (4, 1): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 63f\n"
+ "ld1 { v16.b }[0], [x20]\n"
+ "63:" // Oddments: Load (4, 1): Bit 2: End
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x20, [x15, #0xa8]\n"
+ "smlal v20.4s, v16.4h, v7.4h\n"
+ "smlal2 v0.4s, v16.8h, v7.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 65f\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 64f\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 67f\n"
+ "ld1 { v16.b }[6], [x20]\n"
+ "b 67f\n"
+ "64:" // Oddments: Load (3, 2): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 67f\n"
+ "ld1 { v16.b }[4], [x20]\n"
+ "b 67f\n"
+ "65:" // Oddments: Load (3, 2): Bit 2: Unset
+ "tbz x7, #1, 66f\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 67f\n"
+ "ld1 { v16.b }[2], [x20]\n"
+ "b 67f\n"
+ "66:" // Oddments: Load (3, 2): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 67f\n"
+ "ld1 { v16.b }[0], [x20]\n"
+ "67:" // Oddments: Load (3, 2): Bit 2: End
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x20, [x15, #0xb0]\n"
+ "smlal v20.4s, v16.4h, v9.4h\n"
+ "smlal2 v0.4s, v16.8h, v9.8h\n"
+ "smlal v19.4s, v16.4h, v28.4h\n"
+ "smlal2 v31.4s, v16.8h, v28.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 69f\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 68f\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 71f\n"
+ "ld1 { v16.b }[6], [x20]\n"
+ "b 71f\n"
+ "68:" // Oddments: Load (4, 3): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 71f\n"
+ "ld1 { v16.b }[4], [x20]\n"
+ "b 71f\n"
+ "69:" // Oddments: Load (4, 3): Bit 2: Unset
+ "tbz x7, #1, 70f\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 71f\n"
+ "ld1 { v16.b }[2], [x20]\n"
+ "b 71f\n"
+ "70:" // Oddments: Load (4, 3): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 71f\n"
+ "ld1 { v16.b }[0], [x20]\n"
+ "71:" // Oddments: Load (4, 3): Bit 2: End
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x20, [x15, #0xb8]\n"
+ "smlal v19.4s, v16.4h, v7.4h\n"
+ "smlal2 v31.4s, v16.8h, v7.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 73f\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 72f\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 75f\n"
+ "ld1 { v16.b }[6], [x20]\n"
+ "b 75f\n"
+ "72:" // Oddments: Load (4, 2): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 75f\n"
+ "ld1 { v16.b }[4], [x20]\n"
+ "b 75f\n"
+ "73:" // Oddments: Load (4, 2): Bit 2: Unset
+ "tbz x7, #1, 74f\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 75f\n"
+ "ld1 { v16.b }[2], [x20]\n"
+ "b 75f\n"
+ "74:" // Oddments: Load (4, 2): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 75f\n"
+ "ld1 { v16.b }[0], [x20]\n"
+ "75:" // Oddments: Load (4, 2): Bit 2: End
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x20, [x15, #0xc0]\n"
+ "smlal v20.4s, v16.4h, v4.4h\n"
+ "smlal2 v0.4s, v16.8h, v4.8h\n"
+ "smlal v19.4s, v16.4h, v26.4h\n"
+ "smlal2 v31.4s, v16.8h, v26.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 77f\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 76f\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 79f\n"
+ "ld1 { v16.b }[6], [x20]\n"
+ "b 79f\n"
+ "76:" // Oddments: Load (4, 4): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 79f\n"
+ "ld1 { v16.b }[4], [x20]\n"
+ "b 79f\n"
+ "77:" // Oddments: Load (4, 4): Bit 2: Unset
+ "tbz x7, #1, 78f\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 79f\n"
+ "ld1 { v16.b }[2], [x20]\n"
+ "b 79f\n"
+ "78:" // Oddments: Load (4, 4): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 79f\n"
+ "ld1 { v16.b }[0], [x20]\n"
+ "79:" // Oddments: Load (4, 4): Bit 2: End
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "smlal v19.4s, v16.4h, v4.4h\n"
+ "smlal2 v31.4s, v16.8h, v4.8h\n"
+ "tbz x7, #2, 81f\n"
+ "ld1 { v14.4s }, [x13], #0x10\n"
+ "ld1 { v25.4s }, [x12], #0x10\n"
+ "tbz x7, #1, 80f\n"
+ "ld1 { v18.d }[0], [x13], #0x8\n"
+ "ld1 { v12.d }[0], [x12], #0x8\n"
+ "tbz x7, #0, 83f\n"
+ "ld1 { v18.s }[2], [x13]\n"
+ "ld1 { v12.s }[2], [x12]\n"
+ "b 83f\n"
+ "80:" // Oddments: Load requant params: Bit 2: Bit 1: Unset
+ "tbz x7, #0, 83f\n"
+ "ld1 { v18.s }[0], [x13]\n"
+ "ld1 { v12.s }[0], [x12]\n"
+ "b 83f\n"
+ "81:" // Oddments: Load requant params: Bit 2: Unset
+ "tbz x7, #1, 82f\n"
+ "ld1 { v14.d }[0], [x13], #0x8\n"
+ "ld1 { v25.d }[0], [x12], #0x8\n"
+ "tbz x7, #0, 83f\n"
+ "ld1 { v14.s }[2], [x13]\n"
+ "ld1 { v25.s }[2], [x12]\n"
+ "b 83f\n"
+ "82:" // Oddments: Load requant params: Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 83f\n"
+ "ld1 { v14.s }[0], [x13]\n"
+ "ld1 { v25.s }[0], [x12]\n"
+ "83:" // Oddments: Load requant params: Bit 2: End
+ "sqrdmulh v5.4s, v5.4s, v14.4s\n"
+ "and v28.16b, v5.16b, v25.16b\n"
+ "add x11, x11, x16\n"
+ "add x10, x10, x16\n"
+ "sqrdmulh v3.4s, v3.4s, v18.4s\n"
+ "sshr v28.4s, v28.4s, #0x1f\n"
+ "add x9, x9, x16\n"
+ "add x28, x28, x16\n"
+ "and v16.16b, v3.16b, v12.16b\n"
+ "sqrdmulh v21.4s, v21.4s, v14.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v14.4s\n"
+ "sqrdmulh v19.4s, v19.4s, v14.4s\n"
+ "sqadd v5.4s, v5.4s, v28.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "and v14.16b, v21.16b, v25.16b\n"
+ "sqrdmulh v8.4s, v8.4s, v18.4s\n"
+ "and v6.16b, v20.16b, v25.16b\n"
+ "sqrdmulh v0.4s, v0.4s, v18.4s\n"
+ "and v4.16b, v19.16b, v25.16b\n"
+ "sqrdmulh v31.4s, v31.4s, v18.4s\n"
+ "sqadd v3.4s, v3.4s, v16.4s\n"
+ "sshr v14.4s, v14.4s, #0x1f\n"
+ "and v18.16b, v8.16b, v12.16b\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
+ "and v7.16b, v0.16b, v12.16b\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "and v16.16b, v31.16b, v12.16b\n"
+ "sqadd v21.4s, v21.4s, v14.4s\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sqadd v20.4s, v20.4s, v6.4s\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
+ "sqadd v19.4s, v19.4s, v4.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "srshl v5.4s, v5.4s, v25.4s\n"
+ "srshl v21.4s, v21.4s, v25.4s\n"
+ "sqadd v8.4s, v8.4s, v18.4s\n"
+ "srshl v20.4s, v20.4s, v25.4s\n"
+ "sqadd v0.4s, v0.4s, v7.4s\n"
+ "srshl v19.4s, v19.4s, v25.4s\n"
+ "sqadd v31.4s, v31.4s, v16.4s\n"
+ "srshl v3.4s, v3.4s, v12.4s\n"
+ "sqxtn v5.4h, v5.4s\n"
+ "srshl v8.4s, v8.4s, v12.4s\n"
+ "sqxtn v21.4h, v21.4s\n"
+ "srshl v0.4s, v0.4s, v12.4s\n"
+ "sqxtn v20.4h, v20.4s\n"
+ "srshl v31.4s, v31.4s, v12.4s\n"
+ "sqxtn v19.4h, v19.4s\n"
+ "sqxtn2 v5.8h, v3.4s\n"
+ "sqxtn2 v21.8h, v8.4s\n"
+ "sqxtn2 v20.8h, v0.4s\n"
+ "sqxtn2 v19.8h, v31.4s\n"
+ "sqadd v5.8h, v5.8h, v13.8h\n"
+ "sqadd v21.8h, v21.8h, v13.8h\n"
+ "sqadd v20.8h, v20.8h, v13.8h\n"
+ "sqadd v19.8h, v19.8h, v13.8h\n"
+ "smax v5.8h, v5.8h, v17.8h\n"
+ "smax v21.8h, v21.8h, v17.8h\n"
+ "smax v20.8h, v20.8h, v17.8h\n"
+ "smax v19.8h, v19.8h, v17.8h\n"
+ "smin v5.8h, v5.8h, v24.8h\n"
+ "smin v21.8h, v21.8h, v24.8h\n"
+ "smin v20.8h, v20.8h, v24.8h\n"
+ "smin v19.8h, v19.8h, v24.8h\n"
+ "uzp1 v5.16b, v5.16b, v5.16b\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "tbz x7, #2, 85f\n"
+ "st1 { v5.s }[0], [x11], #0x4\n"
+ "st1 { v21.s }[0], [x10], #0x4\n"
+ "st1 { v20.s }[0], [x9], #0x4\n"
+ "st1 { v19.s }[0], [x28], #0x4\n"
+ "tbz x7, #1, 84f\n"
+ "st1 { v5.h }[2], [x11], #0x2\n"
+ "st1 { v21.h }[2], [x10], #0x2\n"
+ "st1 { v20.h }[2], [x9], #0x2\n"
+ "st1 { v19.h }[2], [x28], #0x2\n"
+ "tbz x7, #0, 87f\n"
+ "st1 { v5.b }[6], [x11], #0x1\n"
+ "st1 { v21.b }[6], [x10], #0x1\n"
+ "st1 { v20.b }[6], [x9], #0x1\n"
+ "st1 { v19.b }[6], [x28], #0x1\n"
+ "b 87f\n"
+ "84:" // Oddments: Bit 2: Bit 1: Unset
+ "tbz x7, #0, 87f\n"
+ "st1 { v5.b }[4], [x11], #0x1\n"
+ "st1 { v21.b }[4], [x10], #0x1\n"
+ "st1 { v20.b }[4], [x9], #0x1\n"
+ "st1 { v19.b }[4], [x28], #0x1\n"
+ "b 87f\n"
+ "85:" // Oddments: Bit 2: Unset
+ "tbz x7, #1, 86f\n"
+ "st1 { v5.h }[0], [x11], #0x2\n"
+ "st1 { v21.h }[0], [x10], #0x2\n"
+ "st1 { v20.h }[0], [x9], #0x2\n"
+ "st1 { v19.h }[0], [x28], #0x2\n"
+ "tbz x7, #0, 87f\n"
+ "st1 { v5.b }[2], [x11], #0x1\n"
+ "st1 { v21.b }[2], [x10], #0x1\n"
+ "st1 { v20.b }[2], [x9], #0x1\n"
+ "st1 { v19.b }[2], [x28], #0x1\n"
+ "b 87f\n"
+ "86:" // Oddments: Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 87f\n"
+ "st1 { v5.b }[0], [x11], #0x1\n"
+ "st1 { v21.b }[0], [x10], #0x1\n"
+ "st1 { v20.b }[0], [x9], #0x1\n"
+ "st1 { v19.b }[0], [x28], #0x1\n"
+ "87:" // Oddments: Bit 2: End
+ "88:" // End
+ :
+ : [offsetof_Params_bias] "I" (offsetof(Params, bias)), [offsetof_Params_inptrs] "I" (offsetof(Params, inptrs)), [offsetof_Params_n_channels] "I" (offsetof(Params, n_channels)), [offsetof_Params_outptrs] "I" (offsetof(Params, outptrs)), [offsetof_Params_requant] "I" (offsetof(Params, requant)), [offsetof_Params_requant_muls] "I" (offsetof(Params, requant_muls)), [offsetof_Params_requant_shifts] "I" (offsetof(Params, requant_shifts)), [offsetof_Params_weights] "I" (offsetof(Params, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [params] "r" (&params)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
new file mode 100644
index 0000000000..07f66fb482
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2021-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "utils.hpp"
+#include "src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl(  // hand-written asm kernel; defined in the companion generic.cpp
+ const unsigned int,  // n_channels
+ const uint8_t *const *const,  // inptrs: array of input-point pointers
+ const uint8_t *const,  // weights
+ const int32_t *const,  // bias
+ const arm_gemm::Requantize32 &,  // qp: requantisation offsets and clamp limits
+ const int32_t *const,  // requant_muls: per-channel requantisation multipliers
+ const int32_t *const,  // requant_shifts: per-channel requantisation shifts
+ uint8_t *const *const  // outptrs: array of output pointers
+);
+
+class a64_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy<uint8_t, uint8_t, uint8_t, int32_t>  // strategy descriptor binding the asm kernel above into the depthfirst framework (u8 quantized, NHWC)
+{
+ using Parent = DepthwiseDepthfirstStrategy<uint8_t, uint8_t, uint8_t, int32_t>;
+
+ public:
+ constexpr static unsigned int kernel_rows = 5;  // 5x5 filter window
+ constexpr static unsigned int kernel_cols = 5;
+
+ constexpr static unsigned int stride_rows = 1;  // unit stride in both spatial dimensions
+ constexpr static unsigned int stride_cols = 1;
+
+ a64_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst(const CPUInfo *) : Parent(2, 2, 5, 5, 1, 1) {}  // presumably Parent(out_rows, out_cols, kernel_rows, kernel_cols, stride_rows, stride_cols) -- confirm against Parent ctor
+
+ arm_gemm::VLType get_vl_type(void) const override { return arm_gemm::VLType::None; }  // fixed-length (Advanced SIMD) kernel, not scalable-vector
+
+ Parent::KernelType kernel = a64_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl;  // the asm implementation declared above
+ Parent::KernelType get_kernel(void) const override { return kernel; }
+ unsigned int get_accumulator_depth_vl(void) const override { return 2; }  // NOTE(review): presumably two vector-lengths of channels per accumulator pass -- verify in framework
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp
new file mode 100644
index 0000000000..a3fa93df9c
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp
@@ -0,0 +1,2187 @@
+/*
+ * Copyright (c) 2021-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_gemm.hpp"
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl(
+ const unsigned int n_channels,
+ const uint8_t *const *const inptrs,
+ const uint8_t *const weights,
+ const int32_t *const bias,
+ const arm_gemm::Requantize32 &qp,
+ const int32_t *const requant_muls,
+ const int32_t *const requant_shifts,
+ uint8_t *const *const outptrs
+)
+{
+ struct Params  // flat argument block; the inline asm reads every field via offsetof(Params, ...)
+ {
+ uint64_t n_channels;  // number of channels to process per spatial point
+ const void *weights;
+ const int32_t *bias;
+ const arm_gemm::Requantize32 *requant;  // a/b/c offsets and min/max clamp values
+ const int32_t *const requant_muls;  // per-channel requantisation multipliers
+ const int32_t *const requant_shifts;  // per-channel requantisation shifts
+ uint8_t *const *const outptrs;
+ const uint8_t *inptrs[36];  // 36 = 6x6 input patch (5x5 kernel producing a 2x2 output tile)
+
+ Params(
+ long unsigned int n_channels,
+ const uint8_t *const *inptrs_raw,
+ const void *const weights,
+ const int32_t *const bias,
+ const arm_gemm::Requantize32 &qp,
+ const int32_t *const requant_muls,
+ const int32_t *const requant_shifts,
+ uint8_t *const *outptrs
+ ) : n_channels(n_channels), weights(weights), bias(bias),
+ requant(&qp), requant_muls(requant_muls),
+ requant_shifts(requant_shifts), outptrs(outptrs)
+ {
+ inptrs[0] = inptrs_raw[0];  // early entries are permuted into the order the asm visits them; later entries copy straight through
+ inptrs[1] = inptrs_raw[1];
+ inptrs[2] = inptrs_raw[6];
+ inptrs[3] = inptrs_raw[7];
+ inptrs[4] = inptrs_raw[2];
+ inptrs[5] = inptrs_raw[8];
+ inptrs[6] = inptrs_raw[3];
+ inptrs[7] = inptrs_raw[4];
+ inptrs[8] = inptrs_raw[11];
+ inptrs[9] = inptrs_raw[12];
+ inptrs[10] = inptrs_raw[9];
+ inptrs[11] = inptrs_raw[10];
+ inptrs[12] = inptrs_raw[5];
+ inptrs[13] = inptrs_raw[13];  // from here on the mapping is the identity
+ inptrs[14] = inptrs_raw[14];
+ inptrs[15] = inptrs_raw[15];
+ inptrs[16] = inptrs_raw[16];
+ inptrs[17] = inptrs_raw[17];
+ inptrs[18] = inptrs_raw[18];
+ inptrs[19] = inptrs_raw[19];
+ inptrs[20] = inptrs_raw[20];
+ inptrs[21] = inptrs_raw[21];
+ inptrs[22] = inptrs_raw[22];
+ inptrs[23] = inptrs_raw[23];
+ inptrs[24] = inptrs_raw[24];
+ inptrs[25] = inptrs_raw[25];
+ inptrs[26] = inptrs_raw[26];
+ inptrs[27] = inptrs_raw[27];
+ inptrs[28] = inptrs_raw[28];
+ inptrs[29] = inptrs_raw[29];
+ inptrs[30] = inptrs_raw[30];
+ inptrs[31] = inptrs_raw[31];
+ inptrs[32] = inptrs_raw[32];
+ inptrs[33] = inptrs_raw[33];
+ inptrs[34] = inptrs_raw[34];
+ inptrs[35] = inptrs_raw[35];
+
+ }
+ };
+
+ const Params params(n_channels, inptrs, weights, bias, qp,
+ requant_muls, requant_shifts, outptrs);
+
+ __asm__ __volatile__(
+ "ldr x1, [%x[params], %[offsetof_Params_n_channels]]\n"
+ "ldr x23, [%x[params], %[offsetof_Params_requant]]\n"
+ "lsr x2, x1, #0x3\n"
+ "add x20, x23, %[offsetof_Requantize32_a_offset]\n"
+ "ld1r { v18.16b }, [x20]\n"
+ "ldr x22, [%x[params], %[offsetof_Params_outptrs]]\n"
+ "add x21, x23, %[offsetof_Requantize32_b_offset]\n"
+ "add x20, x23, %[offsetof_Requantize32_c_offset]\n"
+ "ld1r { v13.16b }, [x21]\n"
+ "ld1r { v26.8h }, [x20]\n"
+ "add x21, x23, %[offsetof_Requantize32_minval]\n"
+ "add x20, x23, %[offsetof_Requantize32_maxval]\n"
+ "ld1r { v11.8h }, [x21]\n"
+ "ld1r { v0.8h }, [x20]\n"
+ "mov x3, #0x0\n"
+ "mov x4, #0x0\n"
+ "add x5, %x[params], %[offsetof_Params_inptrs]\n"
+ "ldr x6, [%x[params], %[offsetof_Params_weights]]\n"
+ "ldr x7, [%x[params], %[offsetof_Params_requant_muls]]\n"
+ "ldr x8, [%x[params], %[offsetof_Params_requant_shifts]]\n"
+ "ldp x17, x16, [x22, #0x0]\n"
+ "ldp x15, x14, [x22, #0x10]\n"
+ "cbz x2, 3f\n"
+ "ldr d6, [x6, #0x0]\n"
+ "ldr d14, [x6, #0x8]\n"
+ "subs x2, x2, #0x1\n"
+ "usubl v6.8h, v6.8b, v13.8b\n"
+ "ldr d10, [x6, #0x10]\n"
+ "ldr d21, [x6, #0x18]\n"
+ "usubl v14.8h, v14.8b, v13.8b\n"
+ "usubl v10.8h, v10.8b, v13.8b\n"
+ "ldr d12, [x6, #0x20]\n"
+ "ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "usubl v21.8h, v21.8b, v13.8b\n"
+ "usubl v12.8h, v12.8b, v13.8b\n"
+ "ldr q7, [x20, #0x0]\n"
+ "ldr q15, [x20, #0x10]\n"
+ "add x20, x20, #0x20\n"
+ "str x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldp x9, x28, [x5, #0x0]\n"
+ "ldp x27, x26, [x5, #0x10]\n"
+ "mov v20.16b, v7.16b\n"
+ "mov v5.16b, v15.16b\n"
+ "ldp x25, x24, [x5, #0x20]\n"
+ "ldp x23, x22, [x5, #0x30]\n"
+ "mov v24.16b, v7.16b\n"
+ "mov v22.16b, v15.16b\n"
+ "ldp x21, x20, [x5, #0x40]\n"
+ "ldr d31, [x9, x3]\n"
+ "mov v23.16b, v7.16b\n"
+ "mov v19.16b, v15.16b\n"
+ "ldr d17, [x28, x3]\n"
+ "ldr d30, [x27, x3]\n"
+ "usubl v31.8h, v31.8b, v18.8b\n"
+ "usubl v17.8h, v17.8b, v18.8b\n"
+ "ldr d16, [x26, x3]\n"
+ "ldr d3, [x25, x3]\n"
+ "usubl v30.8h, v30.8b, v18.8b\n"
+ "usubl v16.8h, v16.8b, v18.8b\n"
+ "ldr d4, [x24, x3]\n"
+ "ldr d25, [x23, x3]\n"
+ "usubl v3.8h, v3.8b, v18.8b\n"
+ "usubl v4.8h, v4.8b, v18.8b\n"
+ "ldr d9, [x22, x3]\n"
+ "ldr d29, [x21, x3]\n"
+ "usubl v25.8h, v25.8b, v18.8b\n"
+ "usubl v9.8h, v9.8b, v18.8b\n"
+ "ldr d28, [x20, x3]\n"
+ "usubl v29.8h, v29.8b, v18.8b\n"
+ "usubl v28.8h, v28.8b, v18.8b\n"
+ "beq 2f\n"
+ "1:" // Loop
+ "ldr d2, [x6, #0x28]\n"
+ "ldr d27, [x6, #0x30]\n"
+ "smlal v7.4s, v31.4h, v6.4h\n"
+ "smlal2 v15.4s, v31.8h, v6.8h\n"
+ "ldr d1, [x6, #0x38]\n"
+ "ldr d31, [x6, #0x40]\n"
+ "smlal v7.4s, v17.4h, v14.4h\n"
+ "smlal v20.4s, v17.4h, v6.4h\n"
+ "ldr d8, [x6, #0x48]\n"
+ "ldr x22, [x5, #0x50]\n"
+ "smlal v24.4s, v30.4h, v6.4h\n"
+ "smlal v23.4s, v16.4h, v6.4h\n"
+ "smlal2 v15.4s, v17.8h, v14.8h\n"
+ "smlal v7.4s, v3.4h, v10.4h\n"
+ "ldr x20, [x5, #0x58]\n"
+ "ldr x21, [x5, #0x60]\n"
+ "smlal2 v5.4s, v17.8h, v6.8h\n"
+ "ldr d17, [x22, x3]\n"
+ "smlal2 v22.4s, v30.8h, v6.8h\n"
+ "usubl v17.8h, v17.8b, v18.8b\n"
+ "smlal2 v19.4s, v16.8h, v6.8h\n"
+ "ldr d6, [x20, x3]\n"
+ "smlal v20.4s, v3.4h, v14.4h\n"
+ "usubl v6.8h, v6.8b, v18.8b\n"
+ "smlal v24.4s, v16.4h, v14.4h\n"
+ "smlal v23.4s, v4.4h, v14.4h\n"
+ "usubl v2.8h, v2.8b, v13.8b\n"
+ "ldr x20, [x5, #0x68]\n"
+ "smlal2 v15.4s, v3.8h, v10.8h\n"
+ "smlal v7.4s, v25.4h, v21.4h\n"
+ "usubl v27.8h, v27.8b, v13.8b\n"
+ "ldr x22, [x5, #0x70]\n"
+ "smlal2 v5.4s, v3.8h, v14.8h\n"
+ "ldr d3, [x21, x3]\n"
+ "smlal2 v22.4s, v16.8h, v14.8h\n"
+ "usubl v3.8h, v3.8b, v18.8b\n"
+ "smlal2 v19.4s, v4.8h, v14.8h\n"
+ "ldr d14, [x20, x3]\n"
+ "smlal v20.4s, v25.4h, v10.4h\n"
+ "usubl v14.8h, v14.8b, v18.8b\n"
+ "smlal v24.4s, v4.4h, v10.4h\n"
+ "smlal v23.4s, v17.4h, v10.4h\n"
+ "usubl v1.8h, v1.8b, v13.8b\n"
+ "ldr x20, [x5, #0x78]\n"
+ "smlal2 v15.4s, v25.8h, v21.8h\n"
+ "smlal v7.4s, v9.4h, v12.4h\n"
+ "usubl v31.8h, v31.8b, v13.8b\n"
+ "ldr x21, [x5, #0x80]\n"
+ "smlal2 v5.4s, v25.8h, v10.8h\n"
+ "ldr d25, [x22, x3]\n"
+ "smlal2 v22.4s, v4.8h, v10.8h\n"
+ "usubl v25.8h, v25.8b, v18.8b\n"
+ "smlal2 v19.4s, v17.8h, v10.8h\n"
+ "ldr d10, [x20, x3]\n"
+ "smlal v20.4s, v9.4h, v21.4h\n"
+ "usubl v10.8h, v10.8b, v18.8b\n"
+ "smlal v24.4s, v17.4h, v21.4h\n"
+ "smlal v23.4s, v6.4h, v21.4h\n"
+ "usubl v8.8h, v8.8b, v13.8b\n"
+ "ldr x24, [x5, #0x88]\n"
+ "smlal2 v15.4s, v9.8h, v12.8h\n"
+ "smlal v7.4s, v30.4h, v2.4h\n"
+ "ldr x20, [x5, #0x90]\n"
+ "ldr x23, [x5, #0x98]\n"
+ "smlal2 v5.4s, v9.8h, v21.8h\n"
+ "ldr d9, [x21, x3]\n"
+ "smlal2 v22.4s, v17.8h, v21.8h\n"
+ "usubl v9.8h, v9.8b, v18.8b\n"
+ "smlal2 v19.4s, v6.8h, v21.8h\n"
+ "ldr d21, [x6, #0x50]\n"
+ "smlal v20.4s, v3.4h, v12.4h\n"
+ "usubl v21.8h, v21.8b, v13.8b\n"
+ "smlal v24.4s, v6.4h, v12.4h\n"
+ "smlal v23.4s, v29.4h, v12.4h\n"
+ "ldr x22, [x5, #0xa0]\n"
+ "ldr x21, [x5, #0xa8]\n"
+ "smlal2 v15.4s, v30.8h, v2.8h\n"
+ "ldr d30, [x24, x3]\n"
+ "smlal v7.4s, v16.4h, v27.4h\n"
+ "usubl v30.8h, v30.8b, v18.8b\n"
+ "smlal2 v5.4s, v3.8h, v12.8h\n"
+ "ldr d3, [x6, #0x58]\n"
+ "smlal2 v22.4s, v6.8h, v12.8h\n"
+ "usubl v3.8h, v3.8b, v13.8b\n"
+ "smlal2 v19.4s, v29.8h, v12.8h\n"
+ "ldr d12, [x20, x3]\n"
+ "smlal v20.4s, v16.4h, v2.4h\n"
+ "usubl v12.8h, v12.8b, v18.8b\n"
+ "smlal v24.4s, v28.4h, v2.4h\n"
+ "smlal v23.4s, v14.4h, v2.4h\n"
+ "ldr x20, [x5, #0xb0]\n"
+ "ldr x13, [x5, #0xb8]\n"
+ "smlal2 v15.4s, v16.8h, v27.8h\n"
+ "smlal v7.4s, v4.4h, v1.4h\n"
+ "ldr x12, [x5, #0xc0]\n"
+ "ldr x11, [x5, #0xc8]\n"
+ "smlal2 v5.4s, v16.8h, v2.8h\n"
+ "ldr d16, [x23, x3]\n"
+ "smlal2 v22.4s, v28.8h, v2.8h\n"
+ "usubl v16.8h, v16.8b, v18.8b\n"
+ "smlal2 v19.4s, v14.8h, v2.8h\n"
+ "ldr d2, [x6, #0x60]\n"
+ "smlal v20.4s, v4.4h, v27.4h\n"
+ "usubl v2.8h, v2.8b, v13.8b\n"
+ "smlal v24.4s, v14.4h, v27.4h\n"
+ "smlal v23.4s, v25.4h, v27.4h\n"
+ "ldr x10, [x5, #0xd0]\n"
+ "ldr x9, [x5, #0xd8]\n"
+ "smlal2 v15.4s, v4.8h, v1.8h\n"
+ "smlal v7.4s, v17.4h, v31.4h\n"
+ "ldr x28, [x5, #0xe0]\n"
+ "ldr x27, [x5, #0xe8]\n"
+ "smlal2 v5.4s, v4.8h, v27.8h\n"
+ "ldr d4, [x22, x3]\n"
+ "smlal2 v22.4s, v14.8h, v27.8h\n"
+ "usubl v4.8h, v4.8b, v18.8b\n"
+ "smlal2 v19.4s, v25.8h, v27.8h\n"
+ "ldr d27, [x6, #0x68]\n"
+ "smlal v20.4s, v17.4h, v1.4h\n"
+ "usubl v27.8h, v27.8b, v13.8b\n"
+ "smlal v24.4s, v25.4h, v1.4h\n"
+ "smlal v23.4s, v10.4h, v1.4h\n"
+ "ldr x26, [x5, #0xf0]\n"
+ "ldr x25, [x5, #0xf8]\n"
+ "smlal2 v15.4s, v17.8h, v31.8h\n"
+ "smlal v7.4s, v6.4h, v8.4h\n"
+ "ldr x24, [x5, #0x100]\n"
+ "ldr x23, [x5, #0x108]\n"
+ "smlal2 v5.4s, v17.8h, v1.8h\n"
+ "ldr d17, [x21, x3]\n"
+ "smlal2 v22.4s, v25.8h, v1.8h\n"
+ "usubl v17.8h, v17.8b, v18.8b\n"
+ "smlal2 v19.4s, v10.8h, v1.8h\n"
+ "ldr d1, [x6, #0x70]\n"
+ "smlal v20.4s, v6.4h, v31.4h\n"
+ "usubl v1.8h, v1.8b, v13.8b\n"
+ "smlal v24.4s, v10.4h, v31.4h\n"
+ "smlal v23.4s, v9.4h, v31.4h\n"
+ "ldr x22, [x5, #0x110]\n"
+ "ldr x21, [x5, #0x118]\n"
+ "smlal2 v15.4s, v6.8h, v8.8h\n"
+ "smlal v7.4s, v28.4h, v21.4h\n"
+ "subs x2, x2, #0x1\n"
+ "smlal2 v5.4s, v6.8h, v31.8h\n"
+ "ldr d6, [x20, x3]\n"
+ "smlal2 v22.4s, v10.8h, v31.8h\n"
+ "usubl v6.8h, v6.8b, v18.8b\n"
+ "smlal2 v19.4s, v9.8h, v31.8h\n"
+ "ldr d31, [x6, #0x78]\n"
+ "smlal v20.4s, v29.4h, v8.4h\n"
+ "usubl v31.8h, v31.8b, v13.8b\n"
+ "smlal v24.4s, v9.4h, v8.4h\n"
+ "smlal v23.4s, v30.4h, v8.4h\n"
+ "ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "smlal2 v15.4s, v28.8h, v21.8h\n"
+ "ldr d28, [x13, x3]\n"
+ "smlal v7.4s, v14.4h, v3.4h\n"
+ "usubl v28.8h, v28.8b, v18.8b\n"
+ "smlal2 v5.4s, v29.8h, v8.8h\n"
+ "ldr d29, [x6, #0x80]\n"
+ "smlal2 v22.4s, v9.8h, v8.8h\n"
+ "usubl v29.8h, v29.8b, v13.8b\n"
+ "smlal2 v19.4s, v30.8h, v8.8h\n"
+ "ldr d8, [x12, x3]\n"
+ "smlal v20.4s, v14.4h, v21.4h\n"
+ "usubl v8.8h, v8.8b, v18.8b\n"
+ "smlal v24.4s, v12.4h, v21.4h\n"
+ "smlal v23.4s, v16.4h, v21.4h\n"
+ "smlal2 v15.4s, v14.8h, v3.8h\n"
+ "smlal v7.4s, v25.4h, v2.4h\n"
+ "smlal2 v5.4s, v14.8h, v21.8h\n"
+ "ldr d14, [x11, x3]\n"
+ "smlal2 v22.4s, v12.8h, v21.8h\n"
+ "usubl v14.8h, v14.8b, v18.8b\n"
+ "smlal2 v19.4s, v16.8h, v21.8h\n"
+ "ldr d21, [x6, #0x88]\n"
+ "smlal v20.4s, v25.4h, v3.4h\n"
+ "usubl v21.8h, v21.8b, v13.8b\n"
+ "smlal v24.4s, v16.4h, v3.4h\n"
+ "smlal v23.4s, v4.4h, v3.4h\n"
+ "smlal2 v15.4s, v25.8h, v2.8h\n"
+ "smlal v7.4s, v10.4h, v27.4h\n"
+ "smlal2 v5.4s, v25.8h, v3.8h\n"
+ "ldr d25, [x10, x3]\n"
+ "smlal2 v22.4s, v16.8h, v3.8h\n"
+ "usubl v25.8h, v25.8b, v18.8b\n"
+ "smlal2 v19.4s, v4.8h, v3.8h\n"
+ "ldr d3, [x6, #0x90]\n"
+ "smlal v20.4s, v10.4h, v2.4h\n"
+ "usubl v3.8h, v3.8b, v13.8b\n"
+ "smlal v24.4s, v4.4h, v2.4h\n"
+ "smlal v23.4s, v17.4h, v2.4h\n"
+ "smlal2 v15.4s, v10.8h, v27.8h\n"
+ "smlal v7.4s, v9.4h, v1.4h\n"
+ "smlal2 v5.4s, v10.8h, v2.8h\n"
+ "ldr d10, [x9, x3]\n"
+ "smlal2 v22.4s, v4.8h, v2.8h\n"
+ "usubl v10.8h, v10.8b, v18.8b\n"
+ "smlal2 v19.4s, v17.8h, v2.8h\n"
+ "ldr d2, [x6, #0x98]\n"
+ "smlal v20.4s, v9.4h, v27.4h\n"
+ "usubl v2.8h, v2.8b, v13.8b\n"
+ "smlal v24.4s, v17.4h, v27.4h\n"
+ "smlal v23.4s, v6.4h, v27.4h\n"
+ "smlal2 v15.4s, v9.8h, v1.8h\n"
+ "smlal v7.4s, v12.4h, v31.4h\n"
+ "smlal2 v5.4s, v9.8h, v27.8h\n"
+ "ldr d9, [x28, x3]\n"
+ "smlal2 v22.4s, v17.8h, v27.8h\n"
+ "usubl v9.8h, v9.8b, v18.8b\n"
+ "smlal2 v19.4s, v6.8h, v27.8h\n"
+ "ldr d27, [x6, #0xa0]\n"
+ "smlal v20.4s, v30.4h, v1.4h\n"
+ "usubl v27.8h, v27.8b, v13.8b\n"
+ "smlal v24.4s, v6.4h, v1.4h\n"
+ "smlal v23.4s, v28.4h, v1.4h\n"
+ "smlal2 v15.4s, v12.8h, v31.8h\n"
+ "ldr d12, [x27, x3]\n"
+ "smlal v7.4s, v16.4h, v29.4h\n"
+ "usubl v12.8h, v12.8b, v18.8b\n"
+ "smlal2 v5.4s, v30.8h, v1.8h\n"
+ "ldr d30, [x6, #0xa8]\n"
+ "smlal2 v22.4s, v6.8h, v1.8h\n"
+ "usubl v30.8h, v30.8b, v13.8b\n"
+ "smlal2 v19.4s, v28.8h, v1.8h\n"
+ "ldr d1, [x26, x3]\n"
+ "smlal v20.4s, v16.4h, v31.4h\n"
+ "usubl v1.8h, v1.8b, v18.8b\n"
+ "smlal v24.4s, v8.4h, v31.4h\n"
+ "smlal v23.4s, v14.4h, v31.4h\n"
+ "smlal2 v15.4s, v16.8h, v29.8h\n"
+ "smlal v7.4s, v4.4h, v21.4h\n"
+ "smlal2 v5.4s, v16.8h, v31.8h\n"
+ "ldr d16, [x25, x3]\n"
+ "smlal2 v22.4s, v8.8h, v31.8h\n"
+ "usubl v16.8h, v16.8b, v18.8b\n"
+ "smlal2 v19.4s, v14.8h, v31.8h\n"
+ "ldr d31, [x6, #0xb0]\n"
+ "smlal v20.4s, v4.4h, v29.4h\n"
+ "usubl v31.8h, v31.8b, v13.8b\n"
+ "smlal v24.4s, v14.4h, v29.4h\n"
+ "smlal v23.4s, v25.4h, v29.4h\n"
+ "smlal2 v15.4s, v4.8h, v21.8h\n"
+ "smlal v7.4s, v17.4h, v3.4h\n"
+ "smlal2 v5.4s, v4.8h, v29.8h\n"
+ "ldr d4, [x24, x3]\n"
+ "smlal2 v22.4s, v14.8h, v29.8h\n"
+ "usubl v4.8h, v4.8b, v18.8b\n"
+ "smlal2 v19.4s, v25.8h, v29.8h\n"
+ "ldr d29, [x6, #0xb8]\n"
+ "smlal v20.4s, v17.4h, v21.4h\n"
+ "usubl v29.8h, v29.8b, v13.8b\n"
+ "smlal v24.4s, v25.4h, v21.4h\n"
+ "smlal v23.4s, v10.4h, v21.4h\n"
+ "smlal2 v15.4s, v17.8h, v3.8h\n"
+ "smlal v7.4s, v6.4h, v2.4h\n"
+ "smlal2 v5.4s, v17.8h, v21.8h\n"
+ "ldr d17, [x23, x3]\n"
+ "smlal2 v22.4s, v25.8h, v21.8h\n"
+ "usubl v17.8h, v17.8b, v18.8b\n"
+ "smlal2 v19.4s, v10.8h, v21.8h\n"
+ "ldr d21, [x6, #0xc0]\n"
+ "smlal v20.4s, v6.4h, v3.4h\n"
+ "usubl v21.8h, v21.8b, v13.8b\n"
+ "smlal v24.4s, v10.4h, v3.4h\n"
+ "smlal v23.4s, v9.4h, v3.4h\n"
+ "add x6, x6, #0xc8\n"
+ "smlal2 v15.4s, v6.8h, v2.8h\n"
+ "smlal v7.4s, v8.4h, v27.4h\n"
+ "smlal2 v5.4s, v6.8h, v3.8h\n"
+ "ldr d6, [x22, x3]\n"
+ "smlal2 v22.4s, v10.8h, v3.8h\n"
+ "usubl v6.8h, v6.8b, v18.8b\n"
+ "smlal2 v19.4s, v9.8h, v3.8h\n"
+ "ldr d3, [x21, x3]\n"
+ "smlal v20.4s, v28.4h, v2.4h\n"
+ "usubl v3.8h, v3.8b, v18.8b\n"
+ "smlal v24.4s, v9.4h, v2.4h\n"
+ "smlal v23.4s, v12.4h, v2.4h\n"
+ "add x3, x3, #0x8\n"
+ "smlal2 v15.4s, v8.8h, v27.8h\n"
+ "ldr q8, [x7, #0x0]\n"
+ "smlal v7.4s, v14.4h, v30.4h\n"
+ "smlal2 v5.4s, v28.8h, v2.8h\n"
+ "ldr q28, [x8, #0x0]\n"
+ "smlal2 v22.4s, v9.8h, v2.8h\n"
+ "smlal2 v19.4s, v12.8h, v2.8h\n"
+ "ldr q2, [x7, #0x10]\n"
+ "smlal v20.4s, v14.4h, v27.4h\n"
+ "add x7, x7, #0x20\n"
+ "smlal v24.4s, v1.4h, v27.4h\n"
+ "smlal v23.4s, v16.4h, v27.4h\n"
+ "smlal2 v15.4s, v14.8h, v30.8h\n"
+ "smlal v7.4s, v25.4h, v31.4h\n"
+ "smlal2 v5.4s, v14.8h, v27.8h\n"
+ "ldr q14, [x8, #0x10]\n"
+ "smlal2 v22.4s, v1.8h, v27.8h\n"
+ "add x8, x8, #0x20\n"
+ "smlal2 v19.4s, v16.8h, v27.8h\n"
+ "smlal v20.4s, v25.4h, v30.4h\n"
+ "smlal v24.4s, v16.4h, v30.4h\n"
+ "smlal v23.4s, v4.4h, v30.4h\n"
+ "smlal2 v15.4s, v25.8h, v31.8h\n"
+ "smlal v7.4s, v10.4h, v29.4h\n"
+ "smlal2 v5.4s, v25.8h, v30.8h\n"
+ "smlal2 v22.4s, v16.8h, v30.8h\n"
+ "smlal2 v19.4s, v4.8h, v30.8h\n"
+ "smlal v20.4s, v10.4h, v31.4h\n"
+ "smlal v24.4s, v4.4h, v31.4h\n"
+ "smlal v23.4s, v17.4h, v31.4h\n"
+ "smlal2 v15.4s, v10.8h, v29.8h\n"
+ "smlal v7.4s, v9.4h, v21.4h\n"
+ "sqrdmulh v7.4s, v7.4s, v8.4s\n"
+ "smlal2 v5.4s, v10.8h, v31.8h\n"
+ "smlal2 v22.4s, v4.8h, v31.8h\n"
+ "and v27.16b, v7.16b, v28.16b\n"
+ "smlal2 v19.4s, v17.8h, v31.8h\n"
+ "smlal v20.4s, v9.4h, v29.4h\n"
+ "sshr v27.4s, v27.4s, #0x1f\n"
+ "smlal v24.4s, v17.4h, v29.4h\n"
+ "smlal v23.4s, v6.4h, v29.4h\n"
+ "sqadd v7.4s, v7.4s, v27.4s\n"
+ "smlal2 v15.4s, v9.8h, v21.8h\n"
+ "smlal2 v5.4s, v9.8h, v29.8h\n"
+ "sqrdmulh v15.4s, v15.4s, v2.4s\n"
+ "smlal2 v22.4s, v17.8h, v29.8h\n"
+ "smlal2 v19.4s, v6.8h, v29.8h\n"
+ "and v9.16b, v15.16b, v14.16b\n"
+ "smlal v20.4s, v12.4h, v21.4h\n"
+ "smlal v24.4s, v6.4h, v21.4h\n"
+ "sqrdmulh v20.4s, v20.4s, v8.4s\n"
+ "smlal v23.4s, v3.4h, v21.4h\n"
+ "smlal2 v5.4s, v12.8h, v21.8h\n"
+ "sqrdmulh v24.4s, v24.4s, v8.4s\n"
+ "smlal2 v22.4s, v6.8h, v21.8h\n"
+ "smlal2 v19.4s, v3.8h, v21.8h\n"
+ "sqrdmulh v23.4s, v23.4s, v8.4s\n"
+ "sshr v9.4s, v9.4s, #0x1f\n"
+ "and v25.16b, v20.16b, v28.16b\n"
+ "sqrdmulh v5.4s, v5.4s, v2.4s\n"
+ "and v10.16b, v24.16b, v28.16b\n"
+ "sqrdmulh v22.4s, v22.4s, v2.4s\n"
+ "and v21.16b, v23.16b, v28.16b\n"
+ "sqrdmulh v19.4s, v19.4s, v2.4s\n"
+ "sqadd v15.4s, v15.4s, v9.4s\n"
+ "sshr v25.4s, v25.4s, #0x1f\n"
+ "and v9.16b, v5.16b, v14.16b\n"
+ "sshr v10.4s, v10.4s, #0x1f\n"
+ "and v12.16b, v22.16b, v14.16b\n"
+ "sshr v21.4s, v21.4s, #0x1f\n"
+ "and v17.16b, v19.16b, v14.16b\n"
+ "sqadd v20.4s, v20.4s, v25.4s\n"
+ "sshr v9.4s, v9.4s, #0x1f\n"
+ "sqadd v24.4s, v24.4s, v10.4s\n"
+ "sshr v12.4s, v12.4s, #0x1f\n"
+ "sqadd v23.4s, v23.4s, v21.4s\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "srshl v7.4s, v7.4s, v28.4s\n"
+ "srshl v20.4s, v20.4s, v28.4s\n"
+ "sqadd v5.4s, v5.4s, v9.4s\n"
+ "srshl v24.4s, v24.4s, v28.4s\n"
+ "sqadd v22.4s, v22.4s, v12.4s\n"
+ "srshl v23.4s, v23.4s, v28.4s\n"
+ "sqadd v19.4s, v19.4s, v17.4s\n"
+ "srshl v15.4s, v15.4s, v14.4s\n"
+ "sqxtn v7.4h, v7.4s\n"
+ "srshl v5.4s, v5.4s, v14.4s\n"
+ "sqxtn v20.4h, v20.4s\n"
+ "srshl v22.4s, v22.4s, v14.4s\n"
+ "sqxtn v24.4h, v24.4s\n"
+ "srshl v19.4s, v19.4s, v14.4s\n"
+ "sqxtn v23.4h, v23.4s\n"
+ "sqxtn2 v7.8h, v15.4s\n"
+ "sqxtn2 v20.8h, v5.4s\n"
+ "sqxtn2 v24.8h, v22.4s\n"
+ "sqxtn2 v23.8h, v19.4s\n"
+ "sqadd v7.8h, v7.8h, v26.8h\n"
+ "sqadd v20.8h, v20.8h, v26.8h\n"
+ "sqadd v24.8h, v24.8h, v26.8h\n"
+ "sqadd v23.8h, v23.8h, v26.8h\n"
+ "smax v7.8h, v7.8h, v11.8h\n"
+ "smax v20.8h, v20.8h, v11.8h\n"
+ "smax v24.8h, v24.8h, v11.8h\n"
+ "smax v23.8h, v23.8h, v11.8h\n"
+ "smin v7.8h, v7.8h, v0.8h\n"
+ "smin v20.8h, v20.8h, v0.8h\n"
+ "smin v24.8h, v24.8h, v0.8h\n"
+ "smin v23.8h, v23.8h, v0.8h\n"
+ "uzp1 v7.16b, v7.16b, v7.16b\n"
+ "str d7, [x17, x4]\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "str d20, [x16, x4]\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "str d24, [x15, x4]\n"
+ "str d23, [x14, x4]\n"
+ "ldr q7, [x20, #0x0]\n"
+ "ldr q15, [x20, #0x10]\n"
+ "add x20, x20, #0x20\n"
+ "ldr d6, [x6, #0x0]\n"
+ "ldr d14, [x6, #0x8]\n"
+ "add x4, x4, #0x8\n"
+ "str x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldr d10, [x6, #0x10]\n"
+ "ldr d21, [x6, #0x18]\n"
+ "mov v20.16b, v7.16b\n"
+ "mov v5.16b, v15.16b\n"
+ "ldr d12, [x6, #0x20]\n"
+ "ldp x9, x28, [x5, #0x0]\n"
+ "mov v24.16b, v7.16b\n"
+ "mov v22.16b, v15.16b\n"
+ "ldp x27, x26, [x5, #0x10]\n"
+ "ldp x25, x24, [x5, #0x20]\n"
+ "mov v23.16b, v7.16b\n"
+ "mov v19.16b, v15.16b\n"
+ "ldp x23, x22, [x5, #0x30]\n"
+ "ldp x21, x20, [x5, #0x40]\n"
+ "usubl v6.8h, v6.8b, v13.8b\n"
+ "usubl v14.8h, v14.8b, v13.8b\n"
+ "ldr d31, [x9, x3]\n"
+ "ldr d17, [x28, x3]\n"
+ "usubl v10.8h, v10.8b, v13.8b\n"
+ "usubl v21.8h, v21.8b, v13.8b\n"
+ "ldr d30, [x27, x3]\n"
+ "ldr d16, [x26, x3]\n"
+ "usubl v12.8h, v12.8b, v13.8b\n"
+ "usubl v31.8h, v31.8b, v18.8b\n"
+ "ldr d3, [x25, x3]\n"
+ "ldr d4, [x24, x3]\n"
+ "usubl v17.8h, v17.8b, v18.8b\n"
+ "usubl v30.8h, v30.8b, v18.8b\n"
+ "ldr d25, [x23, x3]\n"
+ "ldr d9, [x22, x3]\n"
+ "usubl v16.8h, v16.8b, v18.8b\n"
+ "usubl v3.8h, v3.8b, v18.8b\n"
+ "ldr d29, [x21, x3]\n"
+ "ldr d28, [x20, x3]\n"
+ "usubl v4.8h, v4.8b, v18.8b\n"
+ "usubl v25.8h, v25.8b, v18.8b\n"
+ "usubl v9.8h, v9.8b, v18.8b\n"
+ "usubl v29.8h, v29.8b, v18.8b\n"
+ "usubl v28.8h, v28.8b, v18.8b\n"
+ "bgt 1b\n"
+ "2:" // Tail
+ "ldr d27, [x6, #0x28]\n"
+ "ldr d1, [x6, #0x30]\n"
+ "smlal v7.4s, v31.4h, v6.4h\n"
+ "smlal2 v15.4s, v31.8h, v6.8h\n"
+ "ldr d2, [x6, #0x38]\n"
+ "ldr d31, [x6, #0x40]\n"
+ "smlal v7.4s, v17.4h, v14.4h\n"
+ "smlal v20.4s, v17.4h, v6.4h\n"
+ "ldr d8, [x6, #0x48]\n"
+ "ldr x22, [x5, #0x50]\n"
+ "smlal v24.4s, v30.4h, v6.4h\n"
+ "smlal v23.4s, v16.4h, v6.4h\n"
+ "smlal2 v15.4s, v17.8h, v14.8h\n"
+ "smlal v7.4s, v3.4h, v10.4h\n"
+ "ldr x20, [x5, #0x58]\n"
+ "ldr x21, [x5, #0x60]\n"
+ "smlal2 v5.4s, v17.8h, v6.8h\n"
+ "ldr d17, [x22, x3]\n"
+ "smlal2 v22.4s, v30.8h, v6.8h\n"
+ "usubl v17.8h, v17.8b, v18.8b\n"
+ "smlal2 v19.4s, v16.8h, v6.8h\n"
+ "ldr d6, [x20, x3]\n"
+ "smlal v20.4s, v3.4h, v14.4h\n"
+ "usubl v6.8h, v6.8b, v18.8b\n"
+ "smlal v24.4s, v16.4h, v14.4h\n"
+ "smlal v23.4s, v4.4h, v14.4h\n"
+ "usubl v27.8h, v27.8b, v13.8b\n"
+ "ldr x20, [x5, #0x68]\n"
+ "smlal2 v15.4s, v3.8h, v10.8h\n"
+ "smlal v7.4s, v25.4h, v21.4h\n"
+ "usubl v1.8h, v1.8b, v13.8b\n"
+ "ldr x22, [x5, #0x70]\n"
+ "smlal2 v5.4s, v3.8h, v14.8h\n"
+ "ldr d3, [x21, x3]\n"
+ "smlal2 v22.4s, v16.8h, v14.8h\n"
+ "usubl v3.8h, v3.8b, v18.8b\n"
+ "smlal2 v19.4s, v4.8h, v14.8h\n"
+ "ldr d14, [x20, x3]\n"
+ "smlal v20.4s, v25.4h, v10.4h\n"
+ "usubl v14.8h, v14.8b, v18.8b\n"
+ "smlal v24.4s, v4.4h, v10.4h\n"
+ "smlal v23.4s, v17.4h, v10.4h\n"
+ "usubl v2.8h, v2.8b, v13.8b\n"
+ "ldr x21, [x5, #0x78]\n"
+ "smlal2 v15.4s, v25.8h, v21.8h\n"
+ "smlal v7.4s, v9.4h, v12.4h\n"
+ "usubl v31.8h, v31.8b, v13.8b\n"
+ "ldr x20, [x5, #0x80]\n"
+ "smlal2 v5.4s, v25.8h, v10.8h\n"
+ "ldr d25, [x22, x3]\n"
+ "smlal2 v22.4s, v4.8h, v10.8h\n"
+ "usubl v25.8h, v25.8b, v18.8b\n"
+ "smlal2 v19.4s, v17.8h, v10.8h\n"
+ "ldr d10, [x21, x3]\n"
+ "smlal v20.4s, v9.4h, v21.4h\n"
+ "usubl v10.8h, v10.8b, v18.8b\n"
+ "smlal v24.4s, v17.4h, v21.4h\n"
+ "smlal v23.4s, v6.4h, v21.4h\n"
+ "usubl v8.8h, v8.8b, v13.8b\n"
+ "ldr x24, [x5, #0x88]\n"
+ "smlal2 v15.4s, v9.8h, v12.8h\n"
+ "smlal v7.4s, v30.4h, v27.4h\n"
+ "ldr x23, [x5, #0x90]\n"
+ "ldr x22, [x5, #0x98]\n"
+ "smlal2 v5.4s, v9.8h, v21.8h\n"
+ "ldr d9, [x20, x3]\n"
+ "smlal2 v22.4s, v17.8h, v21.8h\n"
+ "usubl v9.8h, v9.8b, v18.8b\n"
+ "smlal2 v19.4s, v6.8h, v21.8h\n"
+ "ldr d21, [x6, #0x50]\n"
+ "smlal v20.4s, v3.4h, v12.4h\n"
+ "usubl v21.8h, v21.8b, v13.8b\n"
+ "smlal v24.4s, v6.4h, v12.4h\n"
+ "smlal v23.4s, v29.4h, v12.4h\n"
+ "ldr x21, [x5, #0xa0]\n"
+ "ldr x20, [x5, #0xa8]\n"
+ "smlal2 v15.4s, v30.8h, v27.8h\n"
+ "ldr d30, [x24, x3]\n"
+ "smlal v7.4s, v16.4h, v1.4h\n"
+ "usubl v30.8h, v30.8b, v18.8b\n"
+ "smlal2 v5.4s, v3.8h, v12.8h\n"
+ "ldr d3, [x6, #0x58]\n"
+ "smlal2 v22.4s, v6.8h, v12.8h\n"
+ "usubl v3.8h, v3.8b, v13.8b\n"
+ "smlal2 v19.4s, v29.8h, v12.8h\n"
+ "ldr d12, [x23, x3]\n"
+ "smlal v20.4s, v16.4h, v27.4h\n"
+ "usubl v12.8h, v12.8b, v18.8b\n"
+ "smlal v24.4s, v28.4h, v27.4h\n"
+ "smlal v23.4s, v14.4h, v27.4h\n"
+ "ldr x13, [x5, #0xb0]\n"
+ "ldr x12, [x5, #0xb8]\n"
+ "smlal2 v15.4s, v16.8h, v1.8h\n"
+ "smlal v7.4s, v4.4h, v2.4h\n"
+ "ldr x11, [x5, #0xc0]\n"
+ "ldr x10, [x5, #0xc8]\n"
+ "smlal2 v5.4s, v16.8h, v27.8h\n"
+ "ldr d16, [x22, x3]\n"
+ "smlal2 v22.4s, v28.8h, v27.8h\n"
+ "usubl v16.8h, v16.8b, v18.8b\n"
+ "smlal2 v19.4s, v14.8h, v27.8h\n"
+ "ldr d27, [x6, #0x60]\n"
+ "smlal v20.4s, v4.4h, v1.4h\n"
+ "usubl v27.8h, v27.8b, v13.8b\n"
+ "smlal v24.4s, v14.4h, v1.4h\n"
+ "smlal v23.4s, v25.4h, v1.4h\n"
+ "ldr x9, [x5, #0xd0]\n"
+ "ldr x28, [x5, #0xd8]\n"
+ "smlal2 v15.4s, v4.8h, v2.8h\n"
+ "smlal v7.4s, v17.4h, v31.4h\n"
+ "ldr x27, [x5, #0xe0]\n"
+ "ldr x26, [x5, #0xe8]\n"
+ "smlal2 v5.4s, v4.8h, v1.8h\n"
+ "ldr d4, [x21, x3]\n"
+ "smlal2 v22.4s, v14.8h, v1.8h\n"
+ "usubl v4.8h, v4.8b, v18.8b\n"
+ "smlal2 v19.4s, v25.8h, v1.8h\n"
+ "ldr d1, [x6, #0x68]\n"
+ "smlal v20.4s, v17.4h, v2.4h\n"
+ "usubl v1.8h, v1.8b, v13.8b\n"
+ "smlal v24.4s, v25.4h, v2.4h\n"
+ "smlal v23.4s, v10.4h, v2.4h\n"
+ "ldr x25, [x5, #0xf0]\n"
+ "ldr x24, [x5, #0xf8]\n"
+ "smlal2 v15.4s, v17.8h, v31.8h\n"
+ "smlal v7.4s, v6.4h, v8.4h\n"
+ "ldr x23, [x5, #0x100]\n"
+ "ldr x22, [x5, #0x108]\n"
+ "smlal2 v5.4s, v17.8h, v2.8h\n"
+ "ldr d17, [x20, x3]\n"
+ "smlal2 v22.4s, v25.8h, v2.8h\n"
+ "usubl v17.8h, v17.8b, v18.8b\n"
+ "smlal2 v19.4s, v10.8h, v2.8h\n"
+ "ldr d2, [x6, #0x70]\n"
+ "smlal v20.4s, v6.4h, v31.4h\n"
+ "usubl v2.8h, v2.8b, v13.8b\n"
+ "smlal v24.4s, v10.4h, v31.4h\n"
+ "smlal v23.4s, v9.4h, v31.4h\n"
+ "ldr x21, [x5, #0x110]\n"
+ "ldr x20, [x5, #0x118]\n"
+ "smlal2 v15.4s, v6.8h, v8.8h\n"
+ "smlal v7.4s, v28.4h, v21.4h\n"
+ "tst x1, #0x7\n"
+ "smlal2 v5.4s, v6.8h, v31.8h\n"
+ "ldr d6, [x13, x3]\n"
+ "smlal2 v22.4s, v10.8h, v31.8h\n"
+ "usubl v6.8h, v6.8b, v18.8b\n"
+ "smlal2 v19.4s, v9.8h, v31.8h\n"
+ "ldr d31, [x6, #0x78]\n"
+ "smlal v20.4s, v29.4h, v8.4h\n"
+ "usubl v31.8h, v31.8b, v13.8b\n"
+ "smlal v24.4s, v9.4h, v8.4h\n"
+ "smlal v23.4s, v30.4h, v8.4h\n"
+ "smlal2 v15.4s, v28.8h, v21.8h\n"
+ "ldr d28, [x12, x3]\n"
+ "smlal v7.4s, v14.4h, v3.4h\n"
+ "usubl v28.8h, v28.8b, v18.8b\n"
+ "smlal2 v5.4s, v29.8h, v8.8h\n"
+ "ldr d29, [x6, #0x80]\n"
+ "smlal2 v22.4s, v9.8h, v8.8h\n"
+ "usubl v29.8h, v29.8b, v13.8b\n"
+ "smlal2 v19.4s, v30.8h, v8.8h\n"
+ "ldr d8, [x11, x3]\n"
+ "smlal v20.4s, v14.4h, v21.4h\n"
+ "usubl v8.8h, v8.8b, v18.8b\n"
+ "smlal v24.4s, v12.4h, v21.4h\n"
+ "smlal v23.4s, v16.4h, v21.4h\n"
+ "smlal2 v15.4s, v14.8h, v3.8h\n"
+ "smlal v7.4s, v25.4h, v27.4h\n"
+ "smlal2 v5.4s, v14.8h, v21.8h\n"
+ "ldr d14, [x10, x3]\n"
+ "smlal2 v22.4s, v12.8h, v21.8h\n"
+ "usubl v14.8h, v14.8b, v18.8b\n"
+ "smlal2 v19.4s, v16.8h, v21.8h\n"
+ "ldr d21, [x6, #0x88]\n"
+ "smlal v20.4s, v25.4h, v3.4h\n"
+ "usubl v21.8h, v21.8b, v13.8b\n"
+ "smlal v24.4s, v16.4h, v3.4h\n"
+ "smlal v23.4s, v4.4h, v3.4h\n"
+ "smlal2 v15.4s, v25.8h, v27.8h\n"
+ "smlal v7.4s, v10.4h, v1.4h\n"
+ "smlal2 v5.4s, v25.8h, v3.8h\n"
+ "ldr d25, [x9, x3]\n"
+ "smlal2 v22.4s, v16.8h, v3.8h\n"
+ "usubl v25.8h, v25.8b, v18.8b\n"
+ "smlal2 v19.4s, v4.8h, v3.8h\n"
+ "ldr d3, [x6, #0x90]\n"
+ "smlal v20.4s, v10.4h, v27.4h\n"
+ "usubl v3.8h, v3.8b, v13.8b\n"
+ "smlal v24.4s, v4.4h, v27.4h\n"
+ "smlal v23.4s, v17.4h, v27.4h\n"
+ "smlal2 v15.4s, v10.8h, v1.8h\n"
+ "smlal v7.4s, v9.4h, v2.4h\n"
+ "smlal2 v5.4s, v10.8h, v27.8h\n"
+ "ldr d10, [x28, x3]\n"
+ "smlal2 v22.4s, v4.8h, v27.8h\n"
+ "usubl v10.8h, v10.8b, v18.8b\n"
+ "smlal2 v19.4s, v17.8h, v27.8h\n"
+ "ldr d27, [x6, #0x98]\n"
+ "smlal v20.4s, v9.4h, v1.4h\n"
+ "usubl v27.8h, v27.8b, v13.8b\n"
+ "smlal v24.4s, v17.4h, v1.4h\n"
+ "smlal v23.4s, v6.4h, v1.4h\n"
+ "smlal2 v15.4s, v9.8h, v2.8h\n"
+ "smlal v7.4s, v12.4h, v31.4h\n"
+ "smlal2 v5.4s, v9.8h, v1.8h\n"
+ "ldr d9, [x27, x3]\n"
+ "smlal2 v22.4s, v17.8h, v1.8h\n"
+ "usubl v9.8h, v9.8b, v18.8b\n"
+ "smlal2 v19.4s, v6.8h, v1.8h\n"
+ "ldr d1, [x6, #0xa0]\n"
+ "smlal v20.4s, v30.4h, v2.4h\n"
+ "usubl v1.8h, v1.8b, v13.8b\n"
+ "smlal v24.4s, v6.4h, v2.4h\n"
+ "smlal v23.4s, v28.4h, v2.4h\n"
+ "smlal2 v15.4s, v12.8h, v31.8h\n"
+ "ldr d12, [x26, x3]\n"
+ "smlal v7.4s, v16.4h, v29.4h\n"
+ "usubl v12.8h, v12.8b, v18.8b\n"
+ "smlal2 v5.4s, v30.8h, v2.8h\n"
+ "ldr d30, [x6, #0xa8]\n"
+ "smlal2 v22.4s, v6.8h, v2.8h\n"
+ "usubl v30.8h, v30.8b, v13.8b\n"
+ "smlal2 v19.4s, v28.8h, v2.8h\n"
+ "ldr d2, [x25, x3]\n"
+ "smlal v20.4s, v16.4h, v31.4h\n"
+ "usubl v2.8h, v2.8b, v18.8b\n"
+ "smlal v24.4s, v8.4h, v31.4h\n"
+ "smlal v23.4s, v14.4h, v31.4h\n"
+ "smlal2 v15.4s, v16.8h, v29.8h\n"
+ "smlal v7.4s, v4.4h, v21.4h\n"
+ "smlal2 v5.4s, v16.8h, v31.8h\n"
+ "ldr d16, [x24, x3]\n"
+ "smlal2 v22.4s, v8.8h, v31.8h\n"
+ "usubl v16.8h, v16.8b, v18.8b\n"
+ "smlal2 v19.4s, v14.8h, v31.8h\n"
+ "ldr d31, [x6, #0xb0]\n"
+ "smlal v20.4s, v4.4h, v29.4h\n"
+ "usubl v31.8h, v31.8b, v13.8b\n"
+ "smlal v24.4s, v14.4h, v29.4h\n"
+ "smlal v23.4s, v25.4h, v29.4h\n"
+ "smlal2 v15.4s, v4.8h, v21.8h\n"
+ "smlal v7.4s, v17.4h, v3.4h\n"
+ "smlal2 v5.4s, v4.8h, v29.8h\n"
+ "ldr d4, [x23, x3]\n"
+ "smlal2 v22.4s, v14.8h, v29.8h\n"
+ "usubl v4.8h, v4.8b, v18.8b\n"
+ "smlal2 v19.4s, v25.8h, v29.8h\n"
+ "ldr d29, [x6, #0xb8]\n"
+ "smlal v20.4s, v17.4h, v21.4h\n"
+ "usubl v29.8h, v29.8b, v13.8b\n"
+ "smlal v24.4s, v25.4h, v21.4h\n"
+ "smlal v23.4s, v10.4h, v21.4h\n"
+ "smlal2 v15.4s, v17.8h, v3.8h\n"
+ "smlal v7.4s, v6.4h, v27.4h\n"
+ "smlal2 v5.4s, v17.8h, v21.8h\n"
+ "ldr d17, [x22, x3]\n"
+ "smlal2 v22.4s, v25.8h, v21.8h\n"
+ "usubl v17.8h, v17.8b, v18.8b\n"
+ "smlal2 v19.4s, v10.8h, v21.8h\n"
+ "ldr d21, [x6, #0xc0]\n"
+ "smlal v20.4s, v6.4h, v3.4h\n"
+ "usubl v21.8h, v21.8b, v13.8b\n"
+ "smlal v24.4s, v10.4h, v3.4h\n"
+ "smlal v23.4s, v9.4h, v3.4h\n"
+ "smlal2 v15.4s, v6.8h, v27.8h\n"
+ "smlal v7.4s, v8.4h, v1.4h\n"
+ "smlal2 v5.4s, v6.8h, v3.8h\n"
+ "ldr d6, [x21, x3]\n"
+ "smlal2 v22.4s, v10.8h, v3.8h\n"
+ "usubl v6.8h, v6.8b, v18.8b\n"
+ "smlal2 v19.4s, v9.8h, v3.8h\n"
+ "ldr d3, [x20, x3]\n"
+ "smlal v20.4s, v28.4h, v27.4h\n"
+ "usubl v3.8h, v3.8b, v18.8b\n"
+ "smlal v24.4s, v9.4h, v27.4h\n"
+ "smlal v23.4s, v12.4h, v27.4h\n"
+ "add x3, x3, #0x8\n"
+ "smlal2 v15.4s, v8.8h, v1.8h\n"
+ "ldr q8, [x7, #0x0]\n"
+ "smlal v7.4s, v14.4h, v30.4h\n"
+ "smlal2 v5.4s, v28.8h, v27.8h\n"
+ "ldr q28, [x8, #0x0]\n"
+ "smlal2 v22.4s, v9.8h, v27.8h\n"
+ "smlal2 v19.4s, v12.8h, v27.8h\n"
+ "ldr q27, [x7, #0x10]\n"
+ "smlal v20.4s, v14.4h, v1.4h\n"
+ "add x7, x7, #0x20\n"
+ "smlal v24.4s, v2.4h, v1.4h\n"
+ "smlal v23.4s, v16.4h, v1.4h\n"
+ "smlal2 v15.4s, v14.8h, v30.8h\n"
+ "smlal v7.4s, v25.4h, v31.4h\n"
+ "smlal2 v5.4s, v14.8h, v1.8h\n"
+ "ldr q14, [x8, #0x10]\n"
+ "smlal2 v22.4s, v2.8h, v1.8h\n"
+ "add x8, x8, #0x20\n"
+ "smlal2 v19.4s, v16.8h, v1.8h\n"
+ "smlal v20.4s, v25.4h, v30.4h\n"
+ "smlal v24.4s, v16.4h, v30.4h\n"
+ "smlal v23.4s, v4.4h, v30.4h\n"
+ "smlal2 v15.4s, v25.8h, v31.8h\n"
+ "smlal v7.4s, v10.4h, v29.4h\n"
+ "smlal2 v5.4s, v25.8h, v30.8h\n"
+ "smlal2 v22.4s, v16.8h, v30.8h\n"
+ "smlal2 v19.4s, v4.8h, v30.8h\n"
+ "smlal v20.4s, v10.4h, v31.4h\n"
+ "smlal v24.4s, v4.4h, v31.4h\n"
+ "smlal v23.4s, v17.4h, v31.4h\n"
+ "smlal2 v15.4s, v10.8h, v29.8h\n"
+ "smlal v7.4s, v9.4h, v21.4h\n"
+ "sqrdmulh v7.4s, v7.4s, v8.4s\n"
+ "smlal2 v5.4s, v10.8h, v31.8h\n"
+ "smlal2 v22.4s, v4.8h, v31.8h\n"
+ "and v4.16b, v7.16b, v28.16b\n"
+ "smlal2 v19.4s, v17.8h, v31.8h\n"
+ "smlal v20.4s, v9.4h, v29.4h\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "smlal v24.4s, v17.4h, v29.4h\n"
+ "smlal v23.4s, v6.4h, v29.4h\n"
+ "sqadd v7.4s, v7.4s, v4.4s\n"
+ "smlal2 v15.4s, v9.8h, v21.8h\n"
+ "smlal2 v5.4s, v9.8h, v29.8h\n"
+ "sqrdmulh v15.4s, v15.4s, v27.4s\n"
+ "smlal2 v22.4s, v17.8h, v29.8h\n"
+ "smlal2 v19.4s, v6.8h, v29.8h\n"
+ "and v30.16b, v15.16b, v14.16b\n"
+ "smlal v20.4s, v12.4h, v21.4h\n"
+ "smlal v24.4s, v6.4h, v21.4h\n"
+ "sqrdmulh v20.4s, v20.4s, v8.4s\n"
+ "smlal v23.4s, v3.4h, v21.4h\n"
+ "smlal2 v5.4s, v12.8h, v21.8h\n"
+ "sqrdmulh v24.4s, v24.4s, v8.4s\n"
+ "smlal2 v22.4s, v6.8h, v21.8h\n"
+ "smlal2 v19.4s, v3.8h, v21.8h\n"
+ "sqrdmulh v23.4s, v23.4s, v8.4s\n"
+ "sshr v30.4s, v30.4s, #0x1f\n"
+ "and v3.16b, v20.16b, v28.16b\n"
+ "sqrdmulh v5.4s, v5.4s, v27.4s\n"
+ "and v25.16b, v24.16b, v28.16b\n"
+ "sqrdmulh v22.4s, v22.4s, v27.4s\n"
+ "and v16.16b, v23.16b, v28.16b\n"
+ "sqrdmulh v19.4s, v19.4s, v27.4s\n"
+ "sqadd v15.4s, v15.4s, v30.4s\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "and v4.16b, v5.16b, v14.16b\n"
+ "sshr v25.4s, v25.4s, #0x1f\n"
+ "and v10.16b, v22.16b, v14.16b\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "and v12.16b, v19.16b, v14.16b\n"
+ "sqadd v20.4s, v20.4s, v3.4s\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sqadd v24.4s, v24.4s, v25.4s\n"
+ "sshr v10.4s, v10.4s, #0x1f\n"
+ "sqadd v23.4s, v23.4s, v16.4s\n"
+ "sshr v12.4s, v12.4s, #0x1f\n"
+ "srshl v7.4s, v7.4s, v28.4s\n"
+ "srshl v20.4s, v20.4s, v28.4s\n"
+ "sqadd v5.4s, v5.4s, v4.4s\n"
+ "srshl v24.4s, v24.4s, v28.4s\n"
+ "sqadd v22.4s, v22.4s, v10.4s\n"
+ "srshl v23.4s, v23.4s, v28.4s\n"
+ "sqadd v19.4s, v19.4s, v12.4s\n"
+ "srshl v15.4s, v15.4s, v14.4s\n"
+ "sqxtn v7.4h, v7.4s\n"
+ "srshl v5.4s, v5.4s, v14.4s\n"
+ "sqxtn v20.4h, v20.4s\n"
+ "srshl v22.4s, v22.4s, v14.4s\n"
+ "sqxtn v24.4h, v24.4s\n"
+ "srshl v19.4s, v19.4s, v14.4s\n"
+ "sqxtn v23.4h, v23.4s\n"
+ "sqxtn2 v7.8h, v15.4s\n"
+ "sqxtn2 v20.8h, v5.4s\n"
+ "sqxtn2 v24.8h, v22.4s\n"
+ "sqxtn2 v23.8h, v19.4s\n"
+ "sqadd v7.8h, v7.8h, v26.8h\n"
+ "sqadd v20.8h, v20.8h, v26.8h\n"
+ "sqadd v24.8h, v24.8h, v26.8h\n"
+ "sqadd v23.8h, v23.8h, v26.8h\n"
+ "smax v7.8h, v7.8h, v11.8h\n"
+ "smax v20.8h, v20.8h, v11.8h\n"
+ "smax v24.8h, v24.8h, v11.8h\n"
+ "smax v23.8h, v23.8h, v11.8h\n"
+ "smin v7.8h, v7.8h, v0.8h\n"
+ "smin v20.8h, v20.8h, v0.8h\n"
+ "smin v24.8h, v24.8h, v0.8h\n"
+ "smin v23.8h, v23.8h, v0.8h\n"
+ "uzp1 v7.16b, v7.16b, v7.16b\n"
+ "str d7, [x17, x4]\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "str d20, [x16, x4]\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "str d24, [x15, x4]\n"
+ "str d23, [x14, x4]\n"
+ "add x4, x4, #0x8\n"
+ "beq 124f\n"
+ "add x6, x6, #0xc8\n"
+ "3:" // Oddments
+ "ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "tbz x1, #2, 5f\n"
+ "ld1 { v7.4s }, [x20], #0x10\n"
+ "tbz x1, #1, 4f\n"
+ "ld1 { v15.d }[0], [x20], #0x8\n"
+ "tbz x1, #0, 7f\n"
+ "ld1 { v15.s }[2], [x20]\n"
+ "b 7f\n"
+ "4:" // Oddments: Load bias: Bit 2: Bit 1: Unset
+ "tbz x1, #0, 7f\n"
+ "ld1 { v15.s }[0], [x20]\n"
+ "b 7f\n"
+ "5:" // Oddments: Load bias: Bit 2: Unset
+ "tbz x1, #1, 6f\n"
+ "ld1 { v7.d }[0], [x20], #0x8\n"
+ "tbz x1, #0, 7f\n"
+ "ld1 { v7.s }[2], [x20]\n"
+ "b 7f\n"
+ "6:" // Oddments: Load bias: Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 7f\n"
+ "ld1 { v7.s }[0], [x20]\n"
+ "7:" // Oddments: Load bias: Bit 2: End
+ "ldr d6, [x6, #0x0]\n"
+ "ldr d14, [x6, #0x8]\n"
+ "mov v20.16b, v7.16b\n"
+ "mov v5.16b, v15.16b\n"
+ "ldr d10, [x6, #0x10]\n"
+ "ldr d21, [x6, #0x18]\n"
+ "mov v24.16b, v7.16b\n"
+ "mov v22.16b, v15.16b\n"
+ "ldr d12, [x6, #0x20]\n"
+ "ldp x9, x28, [x5, #0x0]\n"
+ "mov v23.16b, v7.16b\n"
+ "mov v19.16b, v15.16b\n"
+ "ldp x27, x26, [x5, #0x10]\n"
+ "ldp x25, x24, [x5, #0x20]\n"
+ "usubl v6.8h, v6.8b, v13.8b\n"
+ "usubl v14.8h, v14.8b, v13.8b\n"
+ "ldp x23, x22, [x5, #0x30]\n"
+ "ldp x21, x20, [x5, #0x40]\n"
+ "usubl v10.8h, v10.8b, v13.8b\n"
+ "usubl v21.8h, v21.8b, v13.8b\n"
+ "usubl v12.8h, v12.8b, v13.8b\n"
+ "add x9, x9, x3\n"
+ "add x28, x28, x3\n"
+ "add x27, x27, x3\n"
+ "add x26, x26, x3\n"
+ "add x25, x25, x3\n"
+ "add x24, x24, x3\n"
+ "add x23, x23, x3\n"
+ "add x22, x22, x3\n"
+ "add x21, x21, x3\n"
+ "add x20, x20, x3\n"
+ "tbz x1, #2, 9f\n"
+ "ld1 { v31.s }[0], [x9], #0x4\n"
+ "ld1 { v17.s }[0], [x28], #0x4\n"
+ "ld1 { v30.s }[0], [x27], #0x4\n"
+ "ld1 { v16.s }[0], [x26], #0x4\n"
+ "ld1 { v3.s }[0], [x25], #0x4\n"
+ "ld1 { v4.s }[0], [x24], #0x4\n"
+ "ld1 { v25.s }[0], [x23], #0x4\n"
+ "ld1 { v9.s }[0], [x22], #0x4\n"
+ "ld1 { v29.s }[0], [x21], #0x4\n"
+ "ld1 { v28.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 8f\n"
+ "ld1 { v31.h }[2], [x9], #0x2\n"
+ "ld1 { v17.h }[2], [x28], #0x2\n"
+ "ld1 { v30.h }[2], [x27], #0x2\n"
+ "ld1 { v16.h }[2], [x26], #0x2\n"
+ "ld1 { v3.h }[2], [x25], #0x2\n"
+ "ld1 { v4.h }[2], [x24], #0x2\n"
+ "ld1 { v25.h }[2], [x23], #0x2\n"
+ "ld1 { v9.h }[2], [x22], #0x2\n"
+ "ld1 { v29.h }[2], [x21], #0x2\n"
+ "ld1 { v28.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 11f\n"
+ "ld1 { v31.b }[6], [x9]\n"
+ "ld1 { v17.b }[6], [x28]\n"
+ "ld1 { v30.b }[6], [x27]\n"
+ "ld1 { v16.b }[6], [x26]\n"
+ "ld1 { v3.b }[6], [x25]\n"
+ "ld1 { v4.b }[6], [x24]\n"
+ "ld1 { v25.b }[6], [x23]\n"
+ "ld1 { v9.b }[6], [x22]\n"
+ "ld1 { v29.b }[6], [x21]\n"
+ "ld1 { v28.b }[6], [x20]\n"
+ "b 11f\n"
+ "8:" // Oddments: Initial loads: Bit 2: Bit 1: Unset
+ "tbz x1, #0, 11f\n"
+ "ld1 { v31.b }[4], [x9]\n"
+ "ld1 { v17.b }[4], [x28]\n"
+ "ld1 { v30.b }[4], [x27]\n"
+ "ld1 { v16.b }[4], [x26]\n"
+ "ld1 { v3.b }[4], [x25]\n"
+ "ld1 { v4.b }[4], [x24]\n"
+ "ld1 { v25.b }[4], [x23]\n"
+ "ld1 { v9.b }[4], [x22]\n"
+ "ld1 { v29.b }[4], [x21]\n"
+ "ld1 { v28.b }[4], [x20]\n"
+ "b 11f\n"
+ "9:" // Oddments: Initial loads: Bit 2: Unset
+ "tbz x1, #1, 10f\n"
+ "ld1 { v31.h }[0], [x9], #0x2\n"
+ "ld1 { v17.h }[0], [x28], #0x2\n"
+ "ld1 { v30.h }[0], [x27], #0x2\n"
+ "ld1 { v16.h }[0], [x26], #0x2\n"
+ "ld1 { v3.h }[0], [x25], #0x2\n"
+ "ld1 { v4.h }[0], [x24], #0x2\n"
+ "ld1 { v25.h }[0], [x23], #0x2\n"
+ "ld1 { v9.h }[0], [x22], #0x2\n"
+ "ld1 { v29.h }[0], [x21], #0x2\n"
+ "ld1 { v28.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 11f\n"
+ "ld1 { v31.b }[2], [x9]\n"
+ "ld1 { v17.b }[2], [x28]\n"
+ "ld1 { v30.b }[2], [x27]\n"
+ "ld1 { v16.b }[2], [x26]\n"
+ "ld1 { v3.b }[2], [x25]\n"
+ "ld1 { v4.b }[2], [x24]\n"
+ "ld1 { v25.b }[2], [x23]\n"
+ "ld1 { v9.b }[2], [x22]\n"
+ "ld1 { v29.b }[2], [x21]\n"
+ "ld1 { v28.b }[2], [x20]\n"
+ "b 11f\n"
+ "10:" // Oddments: Initial loads: Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 11f\n"
+ "ld1 { v31.b }[0], [x9]\n"
+ "ld1 { v17.b }[0], [x28]\n"
+ "ld1 { v30.b }[0], [x27]\n"
+ "ld1 { v16.b }[0], [x26]\n"
+ "ld1 { v3.b }[0], [x25]\n"
+ "ld1 { v4.b }[0], [x24]\n"
+ "ld1 { v25.b }[0], [x23]\n"
+ "ld1 { v9.b }[0], [x22]\n"
+ "ld1 { v29.b }[0], [x21]\n"
+ "ld1 { v28.b }[0], [x20]\n"
+ "11:" // Oddments: Initial loads: Bit 2: End
+ "usubl v31.8h, v31.8b, v18.8b\n"
+ "usubl v17.8h, v17.8b, v18.8b\n"
+ "smlal v7.4s, v31.4h, v6.4h\n"
+ "ldr x20, [x5, #0x50]\n"
+ "usubl v30.8h, v30.8b, v18.8b\n"
+ "smlal2 v15.4s, v31.8h, v6.8h\n"
+ "smlal v20.4s, v17.4h, v6.4h\n"
+ "smlal2 v5.4s, v17.8h, v6.8h\n"
+ "smlal v24.4s, v30.4h, v6.4h\n"
+ "usubl v16.8h, v16.8b, v18.8b\n"
+ "add x20, x20, x3\n"
+ "smlal2 v22.4s, v30.8h, v6.8h\n"
+ "usubl v3.8h, v3.8b, v18.8b\n"
+ "smlal v23.4s, v16.4h, v6.4h\n"
+ "smlal2 v19.4s, v16.8h, v6.8h\n"
+ "smlal v7.4s, v17.4h, v14.4h\n"
+ "usubl v4.8h, v4.8b, v18.8b\n"
+ "smlal2 v15.4s, v17.8h, v14.8h\n"
+ "smlal v20.4s, v3.4h, v14.4h\n"
+ "usubl v25.8h, v25.8b, v18.8b\n"
+ "smlal2 v5.4s, v3.8h, v14.8h\n"
+ "smlal v24.4s, v16.4h, v14.4h\n"
+ "usubl v9.8h, v9.8b, v18.8b\n"
+ "smlal2 v22.4s, v16.8h, v14.8h\n"
+ "usubl v29.8h, v29.8b, v18.8b\n"
+ "smlal v23.4s, v4.4h, v14.4h\n"
+ "usubl v28.8h, v28.8b, v18.8b\n"
+ "smlal2 v19.4s, v4.8h, v14.8h\n"
+ "smlal v7.4s, v3.4h, v10.4h\n"
+ "smlal2 v15.4s, v3.8h, v10.8h\n"
+ "smlal v20.4s, v25.4h, v10.4h\n"
+ "smlal2 v5.4s, v25.8h, v10.8h\n"
+ "smlal v24.4s, v4.4h, v10.4h\n"
+ "smlal2 v22.4s, v4.8h, v10.8h\n"
+ "tbz x1, #2, 13f\n"
+ "ld1 { v27.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 12f\n"
+ "ld1 { v27.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 15f\n"
+ "ld1 { v27.b }[6], [x20]\n"
+ "b 15f\n"
+ "12:" // Oddments: Load (1, 3): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 15f\n"
+ "ld1 { v27.b }[4], [x20]\n"
+ "b 15f\n"
+ "13:" // Oddments: Load (1, 3): Bit 2: Unset
+ "tbz x1, #1, 14f\n"
+ "ld1 { v27.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 15f\n"
+ "ld1 { v27.b }[2], [x20]\n"
+ "b 15f\n"
+ "14:" // Oddments: Load (1, 3): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 15f\n"
+ "ld1 { v27.b }[0], [x20]\n"
+ "15:" // Oddments: Load (1, 3): Bit 2: End
+ "usubl v27.8h, v27.8b, v18.8b\n"
+ "ldr x20, [x5, #0x58]\n"
+ "smlal v23.4s, v27.4h, v10.4h\n"
+ "smlal2 v19.4s, v27.8h, v10.8h\n"
+ "smlal v7.4s, v25.4h, v21.4h\n"
+ "smlal2 v15.4s, v25.8h, v21.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v20.4s, v9.4h, v21.4h\n"
+ "smlal2 v5.4s, v9.8h, v21.8h\n"
+ "smlal v24.4s, v27.4h, v21.4h\n"
+ "smlal2 v22.4s, v27.8h, v21.8h\n"
+ "tbz x1, #2, 17f\n"
+ "ld1 { v6.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 16f\n"
+ "ld1 { v6.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 19f\n"
+ "ld1 { v6.b }[6], [x20]\n"
+ "b 19f\n"
+ "16:" // Oddments: Load (1, 4): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 19f\n"
+ "ld1 { v6.b }[4], [x20]\n"
+ "b 19f\n"
+ "17:" // Oddments: Load (1, 4): Bit 2: Unset
+ "tbz x1, #1, 18f\n"
+ "ld1 { v6.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 19f\n"
+ "ld1 { v6.b }[2], [x20]\n"
+ "b 19f\n"
+ "18:" // Oddments: Load (1, 4): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 19f\n"
+ "ld1 { v6.b }[0], [x20]\n"
+ "19:" // Oddments: Load (1, 4): Bit 2: End
+ "usubl v6.8h, v6.8b, v18.8b\n"
+ "ldr x20, [x5, #0x60]\n"
+ "smlal v23.4s, v6.4h, v21.4h\n"
+ "smlal2 v19.4s, v6.8h, v21.8h\n"
+ "smlal v7.4s, v9.4h, v12.4h\n"
+ "smlal2 v15.4s, v9.8h, v12.8h\n"
+ "add x20, x20, x3\n"
+ "tbz x1, #2, 21f\n"
+ "ld1 { v9.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 20f\n"
+ "ld1 { v9.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 23f\n"
+ "ld1 { v9.b }[6], [x20]\n"
+ "b 23f\n"
+ "20:" // Oddments: Load (0, 5): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 23f\n"
+ "ld1 { v9.b }[4], [x20]\n"
+ "b 23f\n"
+ "21:" // Oddments: Load (0, 5): Bit 2: Unset
+ "tbz x1, #1, 22f\n"
+ "ld1 { v9.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 23f\n"
+ "ld1 { v9.b }[2], [x20]\n"
+ "b 23f\n"
+ "22:" // Oddments: Load (0, 5): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 23f\n"
+ "ld1 { v9.b }[0], [x20]\n"
+ "23:" // Oddments: Load (0, 5): Bit 2: End
+ "ldr d14, [x6, #0x28]\n"
+ "usubl v9.8h, v9.8b, v18.8b\n"
+ "smlal v20.4s, v9.4h, v12.4h\n"
+ "smlal2 v5.4s, v9.8h, v12.8h\n"
+ "smlal v24.4s, v6.4h, v12.4h\n"
+ "smlal2 v22.4s, v6.8h, v12.8h\n"
+ "usubl v14.8h, v14.8b, v13.8b\n"
+ "ldr x20, [x5, #0x68]\n"
+ "smlal v23.4s, v29.4h, v12.4h\n"
+ "smlal2 v19.4s, v29.8h, v12.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v30.4h, v14.4h\n"
+ "smlal2 v15.4s, v30.8h, v14.8h\n"
+ "smlal v20.4s, v16.4h, v14.4h\n"
+ "smlal2 v5.4s, v16.8h, v14.8h\n"
+ "smlal v24.4s, v28.4h, v14.4h\n"
+ "smlal2 v22.4s, v28.8h, v14.8h\n"
+ "tbz x1, #2, 25f\n"
+ "ld1 { v25.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 24f\n"
+ "ld1 { v25.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 27f\n"
+ "ld1 { v25.b }[6], [x20]\n"
+ "b 27f\n"
+ "24:" // Oddments: Load (2, 1): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 27f\n"
+ "ld1 { v25.b }[4], [x20]\n"
+ "b 27f\n"
+ "25:" // Oddments: Load (2, 1): Bit 2: Unset
+ "tbz x1, #1, 26f\n"
+ "ld1 { v25.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 27f\n"
+ "ld1 { v25.b }[2], [x20]\n"
+ "b 27f\n"
+ "26:" // Oddments: Load (2, 1): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 27f\n"
+ "ld1 { v25.b }[0], [x20]\n"
+ "27:" // Oddments: Load (2, 1): Bit 2: End
+ "ldr d21, [x6, #0x30]\n"
+ "usubl v25.8h, v25.8b, v18.8b\n"
+ "usubl v21.8h, v21.8b, v13.8b\n"
+ "ldr x20, [x5, #0x70]\n"
+ "smlal v23.4s, v25.4h, v14.4h\n"
+ "smlal2 v19.4s, v25.8h, v14.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v16.4h, v21.4h\n"
+ "smlal2 v15.4s, v16.8h, v21.8h\n"
+ "smlal v20.4s, v4.4h, v21.4h\n"
+ "smlal2 v5.4s, v4.8h, v21.8h\n"
+ "smlal v24.4s, v25.4h, v21.4h\n"
+ "smlal2 v22.4s, v25.8h, v21.8h\n"
+ "tbz x1, #2, 29f\n"
+ "ld1 { v10.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 28f\n"
+ "ld1 { v10.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 31f\n"
+ "ld1 { v10.b }[6], [x20]\n"
+ "b 31f\n"
+ "28:" // Oddments: Load (2, 2): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 31f\n"
+ "ld1 { v10.b }[4], [x20]\n"
+ "b 31f\n"
+ "29:" // Oddments: Load (2, 2): Bit 2: Unset
+ "tbz x1, #1, 30f\n"
+ "ld1 { v10.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 31f\n"
+ "ld1 { v10.b }[2], [x20]\n"
+ "b 31f\n"
+ "30:" // Oddments: Load (2, 2): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 31f\n"
+ "ld1 { v10.b }[0], [x20]\n"
+ "31:" // Oddments: Load (2, 2): Bit 2: End
+ "ldr d9, [x6, #0x38]\n"
+ "usubl v10.8h, v10.8b, v18.8b\n"
+ "usubl v9.8h, v9.8b, v13.8b\n"
+ "ldr x20, [x5, #0x78]\n"
+ "smlal v23.4s, v10.4h, v21.4h\n"
+ "smlal2 v19.4s, v10.8h, v21.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v4.4h, v9.4h\n"
+ "smlal2 v15.4s, v4.8h, v9.8h\n"
+ "smlal v20.4s, v27.4h, v9.4h\n"
+ "smlal2 v5.4s, v27.8h, v9.8h\n"
+ "smlal v24.4s, v10.4h, v9.4h\n"
+ "smlal2 v22.4s, v10.8h, v9.8h\n"
+ "tbz x1, #2, 33f\n"
+ "ld1 { v12.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 32f\n"
+ "ld1 { v12.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 35f\n"
+ "ld1 { v12.b }[6], [x20]\n"
+ "b 35f\n"
+ "32:" // Oddments: Load (2, 3): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 35f\n"
+ "ld1 { v12.b }[4], [x20]\n"
+ "b 35f\n"
+ "33:" // Oddments: Load (2, 3): Bit 2: Unset
+ "tbz x1, #1, 34f\n"
+ "ld1 { v12.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 35f\n"
+ "ld1 { v12.b }[2], [x20]\n"
+ "b 35f\n"
+ "34:" // Oddments: Load (2, 3): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 35f\n"
+ "ld1 { v12.b }[0], [x20]\n"
+ "35:" // Oddments: Load (2, 3): Bit 2: End
+ "ldr d31, [x6, #0x40]\n"
+ "usubl v12.8h, v12.8b, v18.8b\n"
+ "usubl v31.8h, v31.8b, v13.8b\n"
+ "ldr x20, [x5, #0x80]\n"
+ "smlal v23.4s, v12.4h, v9.4h\n"
+ "smlal2 v19.4s, v12.8h, v9.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v27.4h, v31.4h\n"
+ "smlal2 v15.4s, v27.8h, v31.8h\n"
+ "smlal v20.4s, v6.4h, v31.4h\n"
+ "smlal2 v5.4s, v6.8h, v31.8h\n"
+ "smlal v24.4s, v12.4h, v31.4h\n"
+ "smlal2 v22.4s, v12.8h, v31.8h\n"
+ "tbz x1, #2, 37f\n"
+ "ld1 { v8.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 36f\n"
+ "ld1 { v8.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 39f\n"
+ "ld1 { v8.b }[6], [x20]\n"
+ "b 39f\n"
+ "36:" // Oddments: Load (2, 4): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 39f\n"
+ "ld1 { v8.b }[4], [x20]\n"
+ "b 39f\n"
+ "37:" // Oddments: Load (2, 4): Bit 2: Unset
+ "tbz x1, #1, 38f\n"
+ "ld1 { v8.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 39f\n"
+ "ld1 { v8.b }[2], [x20]\n"
+ "b 39f\n"
+ "38:" // Oddments: Load (2, 4): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 39f\n"
+ "ld1 { v8.b }[0], [x20]\n"
+ "39:" // Oddments: Load (2, 4): Bit 2: End
+ "ldr d16, [x6, #0x48]\n"
+ "usubl v8.8h, v8.8b, v18.8b\n"
+ "usubl v16.8h, v16.8b, v13.8b\n"
+ "ldr x20, [x5, #0x88]\n"
+ "smlal v23.4s, v8.4h, v31.4h\n"
+ "smlal2 v19.4s, v8.8h, v31.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v6.4h, v16.4h\n"
+ "smlal2 v15.4s, v6.8h, v16.8h\n"
+ "smlal v20.4s, v29.4h, v16.4h\n"
+ "smlal2 v5.4s, v29.8h, v16.8h\n"
+ "smlal v24.4s, v8.4h, v16.4h\n"
+ "smlal2 v22.4s, v8.8h, v16.8h\n"
+ "tbz x1, #2, 41f\n"
+ "ld1 { v27.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 40f\n"
+ "ld1 { v27.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 43f\n"
+ "ld1 { v27.b }[6], [x20]\n"
+ "b 43f\n"
+ "40:" // Oddments: Load (2, 5): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 43f\n"
+ "ld1 { v27.b }[4], [x20]\n"
+ "b 43f\n"
+ "41:" // Oddments: Load (2, 5): Bit 2: Unset
+ "tbz x1, #1, 42f\n"
+ "ld1 { v27.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 43f\n"
+ "ld1 { v27.b }[2], [x20]\n"
+ "b 43f\n"
+ "42:" // Oddments: Load (2, 5): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 43f\n"
+ "ld1 { v27.b }[0], [x20]\n"
+ "43:" // Oddments: Load (2, 5): Bit 2: End
+ "ldr d21, [x6, #0x50]\n"
+ "usubl v27.8h, v27.8b, v18.8b\n"
+ "usubl v21.8h, v21.8b, v13.8b\n"
+ "ldr x20, [x5, #0x90]\n"
+ "smlal v23.4s, v27.4h, v16.4h\n"
+ "smlal2 v19.4s, v27.8h, v16.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v28.4h, v21.4h\n"
+ "smlal2 v15.4s, v28.8h, v21.8h\n"
+ "smlal v20.4s, v25.4h, v21.4h\n"
+ "smlal2 v5.4s, v25.8h, v21.8h\n"
+ "tbz x1, #2, 45f\n"
+ "ld1 { v31.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 44f\n"
+ "ld1 { v31.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 47f\n"
+ "ld1 { v31.b }[6], [x20]\n"
+ "b 47f\n"
+ "44:" // Oddments: Load (3, 0): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 47f\n"
+ "ld1 { v31.b }[4], [x20]\n"
+ "b 47f\n"
+ "45:" // Oddments: Load (3, 0): Bit 2: Unset
+ "tbz x1, #1, 46f\n"
+ "ld1 { v31.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 47f\n"
+ "ld1 { v31.b }[2], [x20]\n"
+ "b 47f\n"
+ "46:" // Oddments: Load (3, 0): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 47f\n"
+ "ld1 { v31.b }[0], [x20]\n"
+ "47:" // Oddments: Load (3, 0): Bit 2: End
+ "usubl v31.8h, v31.8b, v18.8b\n"
+ "ldr x20, [x5, #0x98]\n"
+ "smlal v24.4s, v31.4h, v21.4h\n"
+ "smlal2 v22.4s, v31.8h, v21.8h\n"
+ "add x20, x20, x3\n"
+ "tbz x1, #2, 49f\n"
+ "ld1 { v28.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 48f\n"
+ "ld1 { v28.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 51f\n"
+ "ld1 { v28.b }[6], [x20]\n"
+ "b 51f\n"
+ "48:" // Oddments: Load (3, 1): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 51f\n"
+ "ld1 { v28.b }[4], [x20]\n"
+ "b 51f\n"
+ "49:" // Oddments: Load (3, 1): Bit 2: Unset
+ "tbz x1, #1, 50f\n"
+ "ld1 { v28.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 51f\n"
+ "ld1 { v28.b }[2], [x20]\n"
+ "b 51f\n"
+ "50:" // Oddments: Load (3, 1): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 51f\n"
+ "ld1 { v28.b }[0], [x20]\n"
+ "51:" // Oddments: Load (3, 1): Bit 2: End
+ "ldr d2, [x6, #0x58]\n"
+ "usubl v28.8h, v28.8b, v18.8b\n"
+ "usubl v2.8h, v2.8b, v13.8b\n"
+ "ldr x20, [x5, #0xa0]\n"
+ "smlal v23.4s, v28.4h, v21.4h\n"
+ "smlal2 v19.4s, v28.8h, v21.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v25.4h, v2.4h\n"
+ "smlal2 v15.4s, v25.8h, v2.8h\n"
+ "smlal v20.4s, v10.4h, v2.4h\n"
+ "smlal2 v5.4s, v10.8h, v2.8h\n"
+ "smlal v24.4s, v28.4h, v2.4h\n"
+ "smlal2 v22.4s, v28.8h, v2.8h\n"
+ "tbz x1, #2, 53f\n"
+ "ld1 { v21.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 52f\n"
+ "ld1 { v21.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 55f\n"
+ "ld1 { v21.b }[6], [x20]\n"
+ "b 55f\n"
+ "52:" // Oddments: Load (3, 2): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 55f\n"
+ "ld1 { v21.b }[4], [x20]\n"
+ "b 55f\n"
+ "53:" // Oddments: Load (3, 2): Bit 2: Unset
+ "tbz x1, #1, 54f\n"
+ "ld1 { v21.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 55f\n"
+ "ld1 { v21.b }[2], [x20]\n"
+ "b 55f\n"
+ "54:" // Oddments: Load (3, 2): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 55f\n"
+ "ld1 { v21.b }[0], [x20]\n"
+ "55:" // Oddments: Load (3, 2): Bit 2: End
+ "ldr d25, [x6, #0x60]\n"
+ "usubl v21.8h, v21.8b, v18.8b\n"
+ "usubl v25.8h, v25.8b, v13.8b\n"
+ "ldr x20, [x5, #0xa8]\n"
+ "smlal v23.4s, v21.4h, v2.4h\n"
+ "smlal2 v19.4s, v21.8h, v2.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v10.4h, v25.4h\n"
+ "smlal2 v15.4s, v10.8h, v25.8h\n"
+ "smlal v20.4s, v12.4h, v25.4h\n"
+ "smlal2 v5.4s, v12.8h, v25.8h\n"
+ "smlal v24.4s, v21.4h, v25.4h\n"
+ "smlal2 v22.4s, v21.8h, v25.8h\n"
+ "tbz x1, #2, 57f\n"
+ "ld1 { v9.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 56f\n"
+ "ld1 { v9.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 59f\n"
+ "ld1 { v9.b }[6], [x20]\n"
+ "b 59f\n"
+ "56:" // Oddments: Load (3, 3): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 59f\n"
+ "ld1 { v9.b }[4], [x20]\n"
+ "b 59f\n"
+ "57:" // Oddments: Load (3, 3): Bit 2: Unset
+ "tbz x1, #1, 58f\n"
+ "ld1 { v9.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 59f\n"
+ "ld1 { v9.b }[2], [x20]\n"
+ "b 59f\n"
+ "58:" // Oddments: Load (3, 3): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 59f\n"
+ "ld1 { v9.b }[0], [x20]\n"
+ "59:" // Oddments: Load (3, 3): Bit 2: End
+ "ldr d1, [x6, #0x68]\n"
+ "usubl v9.8h, v9.8b, v18.8b\n"
+ "usubl v1.8h, v1.8b, v13.8b\n"
+ "ldr x20, [x5, #0xb0]\n"
+ "smlal v23.4s, v9.4h, v25.4h\n"
+ "smlal2 v19.4s, v9.8h, v25.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v12.4h, v1.4h\n"
+ "smlal2 v15.4s, v12.8h, v1.8h\n"
+ "smlal v20.4s, v8.4h, v1.4h\n"
+ "smlal2 v5.4s, v8.8h, v1.8h\n"
+ "smlal v24.4s, v9.4h, v1.4h\n"
+ "smlal2 v22.4s, v9.8h, v1.8h\n"
+ "tbz x1, #2, 61f\n"
+ "ld1 { v3.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 60f\n"
+ "ld1 { v3.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 63f\n"
+ "ld1 { v3.b }[6], [x20]\n"
+ "b 63f\n"
+ "60:" // Oddments: Load (3, 4): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 63f\n"
+ "ld1 { v3.b }[4], [x20]\n"
+ "b 63f\n"
+ "61:" // Oddments: Load (3, 4): Bit 2: Unset
+ "tbz x1, #1, 62f\n"
+ "ld1 { v3.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 63f\n"
+ "ld1 { v3.b }[2], [x20]\n"
+ "b 63f\n"
+ "62:" // Oddments: Load (3, 4): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 63f\n"
+ "ld1 { v3.b }[0], [x20]\n"
+ "63:" // Oddments: Load (3, 4): Bit 2: End
+ "ldr d16, [x6, #0x70]\n"
+ "usubl v3.8h, v3.8b, v18.8b\n"
+ "usubl v16.8h, v16.8b, v13.8b\n"
+ "ldr x20, [x5, #0xb8]\n"
+ "smlal v23.4s, v3.4h, v1.4h\n"
+ "smlal2 v19.4s, v3.8h, v1.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v8.4h, v16.4h\n"
+ "smlal2 v15.4s, v8.8h, v16.8h\n"
+ "smlal v20.4s, v27.4h, v16.4h\n"
+ "smlal2 v5.4s, v27.8h, v16.8h\n"
+ "smlal v24.4s, v3.4h, v16.4h\n"
+ "smlal2 v22.4s, v3.8h, v16.8h\n"
+ "tbz x1, #2, 65f\n"
+ "ld1 { v14.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 64f\n"
+ "ld1 { v14.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 67f\n"
+ "ld1 { v14.b }[6], [x20]\n"
+ "b 67f\n"
+ "64:" // Oddments: Load (3, 5): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 67f\n"
+ "ld1 { v14.b }[4], [x20]\n"
+ "b 67f\n"
+ "65:" // Oddments: Load (3, 5): Bit 2: Unset
+ "tbz x1, #1, 66f\n"
+ "ld1 { v14.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 67f\n"
+ "ld1 { v14.b }[2], [x20]\n"
+ "b 67f\n"
+ "66:" // Oddments: Load (3, 5): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 67f\n"
+ "ld1 { v14.b }[0], [x20]\n"
+ "67:" // Oddments: Load (3, 5): Bit 2: End
+ "ldr d17, [x6, #0x78]\n"
+ "usubl v14.8h, v14.8b, v18.8b\n"
+ "usubl v17.8h, v17.8b, v13.8b\n"
+ "ldr x20, [x5, #0xc0]\n"
+ "smlal v23.4s, v14.4h, v16.4h\n"
+ "smlal2 v19.4s, v14.8h, v16.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v31.4h, v17.4h\n"
+ "smlal2 v15.4s, v31.8h, v17.8h\n"
+ "smlal v20.4s, v28.4h, v17.4h\n"
+ "smlal2 v5.4s, v28.8h, v17.8h\n"
+ "tbz x1, #2, 69f\n"
+ "ld1 { v1.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 68f\n"
+ "ld1 { v1.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 71f\n"
+ "ld1 { v1.b }[6], [x20]\n"
+ "b 71f\n"
+ "68:" // Oddments: Load (4, 0): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 71f\n"
+ "ld1 { v1.b }[4], [x20]\n"
+ "b 71f\n"
+ "69:" // Oddments: Load (4, 0): Bit 2: Unset
+ "tbz x1, #1, 70f\n"
+ "ld1 { v1.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 71f\n"
+ "ld1 { v1.b }[2], [x20]\n"
+ "b 71f\n"
+ "70:" // Oddments: Load (4, 0): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 71f\n"
+ "ld1 { v1.b }[0], [x20]\n"
+ "71:" // Oddments: Load (4, 0): Bit 2: End
+ "usubl v1.8h, v1.8b, v18.8b\n"
+ "ldr x20, [x5, #0xc8]\n"
+ "smlal v24.4s, v1.4h, v17.4h\n"
+ "smlal2 v22.4s, v1.8h, v17.8h\n"
+ "add x20, x20, x3\n"
+ "tbz x1, #2, 73f\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 72f\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 75f\n"
+ "ld1 { v16.b }[6], [x20]\n"
+ "b 75f\n"
+ "72:" // Oddments: Load (4, 1): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 75f\n"
+ "ld1 { v16.b }[4], [x20]\n"
+ "b 75f\n"
+ "73:" // Oddments: Load (4, 1): Bit 2: Unset
+ "tbz x1, #1, 74f\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 75f\n"
+ "ld1 { v16.b }[2], [x20]\n"
+ "b 75f\n"
+ "74:" // Oddments: Load (4, 1): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 75f\n"
+ "ld1 { v16.b }[0], [x20]\n"
+ "75:" // Oddments: Load (4, 1): Bit 2: End
+ "ldr d29, [x6, #0x80]\n"
+ "usubl v16.8h, v16.8b, v18.8b\n"
+ "usubl v29.8h, v29.8b, v13.8b\n"
+ "ldr x20, [x5, #0xd0]\n"
+ "smlal v23.4s, v16.4h, v17.4h\n"
+ "smlal2 v19.4s, v16.8h, v17.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v28.4h, v29.4h\n"
+ "smlal2 v15.4s, v28.8h, v29.8h\n"
+ "smlal v20.4s, v21.4h, v29.4h\n"
+ "smlal2 v5.4s, v21.8h, v29.8h\n"
+ "smlal v24.4s, v16.4h, v29.4h\n"
+ "smlal2 v22.4s, v16.8h, v29.8h\n"
+ "tbz x1, #2, 77f\n"
+ "ld1 { v30.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 76f\n"
+ "ld1 { v30.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 79f\n"
+ "ld1 { v30.b }[6], [x20]\n"
+ "b 79f\n"
+ "76:" // Oddments: Load (4, 2): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 79f\n"
+ "ld1 { v30.b }[4], [x20]\n"
+ "b 79f\n"
+ "77:" // Oddments: Load (4, 2): Bit 2: Unset
+ "tbz x1, #1, 78f\n"
+ "ld1 { v30.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 79f\n"
+ "ld1 { v30.b }[2], [x20]\n"
+ "b 79f\n"
+ "78:" // Oddments: Load (4, 2): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 79f\n"
+ "ld1 { v30.b }[0], [x20]\n"
+ "79:" // Oddments: Load (4, 2): Bit 2: End
+ "ldr d12, [x6, #0x88]\n"
+ "usubl v30.8h, v30.8b, v18.8b\n"
+ "usubl v12.8h, v12.8b, v13.8b\n"
+ "ldr x20, [x5, #0xd8]\n"
+ "smlal v23.4s, v30.4h, v29.4h\n"
+ "smlal2 v19.4s, v30.8h, v29.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v21.4h, v12.4h\n"
+ "smlal2 v15.4s, v21.8h, v12.8h\n"
+ "smlal v20.4s, v9.4h, v12.4h\n"
+ "smlal2 v5.4s, v9.8h, v12.8h\n"
+ "smlal v24.4s, v30.4h, v12.4h\n"
+ "smlal2 v22.4s, v30.8h, v12.8h\n"
+ "tbz x1, #2, 81f\n"
+ "ld1 { v29.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 80f\n"
+ "ld1 { v29.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 83f\n"
+ "ld1 { v29.b }[6], [x20]\n"
+ "b 83f\n"
+ "80:" // Oddments: Load (4, 3): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 83f\n"
+ "ld1 { v29.b }[4], [x20]\n"
+ "b 83f\n"
+ "81:" // Oddments: Load (4, 3): Bit 2: Unset
+ "tbz x1, #1, 82f\n"
+ "ld1 { v29.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 83f\n"
+ "ld1 { v29.b }[2], [x20]\n"
+ "b 83f\n"
+ "82:" // Oddments: Load (4, 3): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 83f\n"
+ "ld1 { v29.b }[0], [x20]\n"
+ "83:" // Oddments: Load (4, 3): Bit 2: End
+ "ldr d21, [x6, #0x90]\n"
+ "usubl v29.8h, v29.8b, v18.8b\n"
+ "usubl v21.8h, v21.8b, v13.8b\n"
+ "ldr x20, [x5, #0xe0]\n"
+ "smlal v23.4s, v29.4h, v12.4h\n"
+ "smlal2 v19.4s, v29.8h, v12.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v9.4h, v21.4h\n"
+ "smlal2 v15.4s, v9.8h, v21.8h\n"
+ "smlal v20.4s, v3.4h, v21.4h\n"
+ "smlal2 v5.4s, v3.8h, v21.8h\n"
+ "smlal v24.4s, v29.4h, v21.4h\n"
+ "smlal2 v22.4s, v29.8h, v21.8h\n"
+ "tbz x1, #2, 85f\n"
+ "ld1 { v25.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 84f\n"
+ "ld1 { v25.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 87f\n"
+ "ld1 { v25.b }[6], [x20]\n"
+ "b 87f\n"
+ "84:" // Oddments: Load (4, 4): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 87f\n"
+ "ld1 { v25.b }[4], [x20]\n"
+ "b 87f\n"
+ "85:" // Oddments: Load (4, 4): Bit 2: Unset
+ "tbz x1, #1, 86f\n"
+ "ld1 { v25.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 87f\n"
+ "ld1 { v25.b }[2], [x20]\n"
+ "b 87f\n"
+ "86:" // Oddments: Load (4, 4): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 87f\n"
+ "ld1 { v25.b }[0], [x20]\n"
+ "87:" // Oddments: Load (4, 4): Bit 2: End
+ "ldr d8, [x6, #0x98]\n"
+ "usubl v25.8h, v25.8b, v18.8b\n"
+ "usubl v8.8h, v8.8b, v13.8b\n"
+ "ldr x20, [x5, #0xe8]\n"
+ "smlal v23.4s, v25.4h, v21.4h\n"
+ "smlal2 v19.4s, v25.8h, v21.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v3.4h, v8.4h\n"
+ "smlal2 v15.4s, v3.8h, v8.8h\n"
+ "smlal v20.4s, v14.4h, v8.4h\n"
+ "smlal2 v5.4s, v14.8h, v8.8h\n"
+ "smlal v24.4s, v25.4h, v8.4h\n"
+ "smlal2 v22.4s, v25.8h, v8.8h\n"
+ "tbz x1, #2, 89f\n"
+ "ld1 { v21.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 88f\n"
+ "ld1 { v21.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 91f\n"
+ "ld1 { v21.b }[6], [x20]\n"
+ "b 91f\n"
+ "88:" // Oddments: Load (4, 5): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 91f\n"
+ "ld1 { v21.b }[4], [x20]\n"
+ "b 91f\n"
+ "89:" // Oddments: Load (4, 5): Bit 2: Unset
+ "tbz x1, #1, 90f\n"
+ "ld1 { v21.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 91f\n"
+ "ld1 { v21.b }[2], [x20]\n"
+ "b 91f\n"
+ "90:" // Oddments: Load (4, 5): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 91f\n"
+ "ld1 { v21.b }[0], [x20]\n"
+ "91:" // Oddments: Load (4, 5): Bit 2: End
+ "ldr d9, [x6, #0xa0]\n"
+ "usubl v21.8h, v21.8b, v18.8b\n"
+ "usubl v9.8h, v9.8b, v13.8b\n"
+ "ldr x20, [x5, #0xf0]\n"
+ "smlal v23.4s, v21.4h, v8.4h\n"
+ "smlal2 v19.4s, v21.8h, v8.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v1.4h, v9.4h\n"
+ "smlal2 v15.4s, v1.8h, v9.8h\n"
+ "smlal v20.4s, v16.4h, v9.4h\n"
+ "smlal2 v5.4s, v16.8h, v9.8h\n"
+ "tbz x1, #2, 93f\n"
+ "ld1 { v12.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 92f\n"
+ "ld1 { v12.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 95f\n"
+ "ld1 { v12.b }[6], [x20]\n"
+ "b 95f\n"
+ "92:" // Oddments: Load (5, 0): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 95f\n"
+ "ld1 { v12.b }[4], [x20]\n"
+ "b 95f\n"
+ "93:" // Oddments: Load (5, 0): Bit 2: Unset
+ "tbz x1, #1, 94f\n"
+ "ld1 { v12.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 95f\n"
+ "ld1 { v12.b }[2], [x20]\n"
+ "b 95f\n"
+ "94:" // Oddments: Load (5, 0): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 95f\n"
+ "ld1 { v12.b }[0], [x20]\n"
+ "95:" // Oddments: Load (5, 0): Bit 2: End
+ "usubl v12.8h, v12.8b, v18.8b\n"
+ "ldr x20, [x5, #0xf8]\n"
+ "smlal v24.4s, v12.4h, v9.4h\n"
+ "smlal2 v22.4s, v12.8h, v9.8h\n"
+ "add x20, x20, x3\n"
+ "tbz x1, #2, 97f\n"
+ "ld1 { v10.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 96f\n"
+ "ld1 { v10.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 99f\n"
+ "ld1 { v10.b }[6], [x20]\n"
+ "b 99f\n"
+ "96:" // Oddments: Load (5, 1): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 99f\n"
+ "ld1 { v10.b }[4], [x20]\n"
+ "b 99f\n"
+ "97:" // Oddments: Load (5, 1): Bit 2: Unset
+ "tbz x1, #1, 98f\n"
+ "ld1 { v10.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 99f\n"
+ "ld1 { v10.b }[2], [x20]\n"
+ "b 99f\n"
+ "98:" // Oddments: Load (5, 1): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 99f\n"
+ "ld1 { v10.b }[0], [x20]\n"
+ "99:" // Oddments: Load (5, 1): Bit 2: End
+ "ldr d12, [x6, #0xa8]\n"
+ "usubl v10.8h, v10.8b, v18.8b\n"
+ "usubl v12.8h, v12.8b, v13.8b\n"
+ "ldr x20, [x5, #0x100]\n"
+ "smlal v23.4s, v10.4h, v9.4h\n"
+ "smlal2 v19.4s, v10.8h, v9.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v16.4h, v12.4h\n"
+ "smlal2 v15.4s, v16.8h, v12.8h\n"
+ "smlal v20.4s, v30.4h, v12.4h\n"
+ "smlal2 v5.4s, v30.8h, v12.8h\n"
+ "smlal v24.4s, v10.4h, v12.4h\n"
+ "smlal2 v22.4s, v10.8h, v12.8h\n"
+ "tbz x1, #2, 101f\n"
+ "ld1 { v9.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 100f\n"
+ "ld1 { v9.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 103f\n"
+ "ld1 { v9.b }[6], [x20]\n"
+ "b 103f\n"
+ "100:" // Oddments: Load (5, 2): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 103f\n"
+ "ld1 { v9.b }[4], [x20]\n"
+ "b 103f\n"
+ "101:" // Oddments: Load (5, 2): Bit 2: Unset
+ "tbz x1, #1, 102f\n"
+ "ld1 { v9.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 103f\n"
+ "ld1 { v9.b }[2], [x20]\n"
+ "b 103f\n"
+ "102:" // Oddments: Load (5, 2): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 103f\n"
+ "ld1 { v9.b }[0], [x20]\n"
+ "103:" // Oddments: Load (5, 2): Bit 2: End
+ "ldr d28, [x6, #0xb0]\n"
+ "usubl v9.8h, v9.8b, v18.8b\n"
+ "usubl v28.8h, v28.8b, v13.8b\n"
+ "ldr x20, [x5, #0x108]\n"
+ "smlal v23.4s, v9.4h, v12.4h\n"
+ "smlal2 v19.4s, v9.8h, v12.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v30.4h, v28.4h\n"
+ "smlal2 v15.4s, v30.8h, v28.8h\n"
+ "smlal v20.4s, v29.4h, v28.4h\n"
+ "smlal2 v5.4s, v29.8h, v28.8h\n"
+ "smlal v24.4s, v9.4h, v28.4h\n"
+ "smlal2 v22.4s, v9.8h, v28.8h\n"
+ "tbz x1, #2, 105f\n"
+ "ld1 { v2.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 104f\n"
+ "ld1 { v2.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 107f\n"
+ "ld1 { v2.b }[6], [x20]\n"
+ "b 107f\n"
+ "104:" // Oddments: Load (5, 3): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 107f\n"
+ "ld1 { v2.b }[4], [x20]\n"
+ "b 107f\n"
+ "105:" // Oddments: Load (5, 3): Bit 2: Unset
+ "tbz x1, #1, 106f\n"
+ "ld1 { v2.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 107f\n"
+ "ld1 { v2.b }[2], [x20]\n"
+ "b 107f\n"
+ "106:" // Oddments: Load (5, 3): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 107f\n"
+ "ld1 { v2.b }[0], [x20]\n"
+ "107:" // Oddments: Load (5, 3): Bit 2: End
+ "ldr d30, [x6, #0xb8]\n"
+ "usubl v2.8h, v2.8b, v18.8b\n"
+ "usubl v30.8h, v30.8b, v13.8b\n"
+ "ldr x20, [x5, #0x110]\n"
+ "smlal v23.4s, v2.4h, v28.4h\n"
+ "smlal2 v19.4s, v2.8h, v28.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v29.4h, v30.4h\n"
+ "smlal2 v15.4s, v29.8h, v30.8h\n"
+ "smlal v20.4s, v25.4h, v30.4h\n"
+ "smlal2 v5.4s, v25.8h, v30.8h\n"
+ "smlal v24.4s, v2.4h, v30.4h\n"
+ "smlal2 v22.4s, v2.8h, v30.8h\n"
+ "tbz x1, #2, 109f\n"
+ "ld1 { v27.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 108f\n"
+ "ld1 { v27.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 111f\n"
+ "ld1 { v27.b }[6], [x20]\n"
+ "b 111f\n"
+ "108:" // Oddments: Load (5, 4): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 111f\n"
+ "ld1 { v27.b }[4], [x20]\n"
+ "b 111f\n"
+ "109:" // Oddments: Load (5, 4): Bit 2: Unset
+ "tbz x1, #1, 110f\n"
+ "ld1 { v27.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 111f\n"
+ "ld1 { v27.b }[2], [x20]\n"
+ "b 111f\n"
+ "110:" // Oddments: Load (5, 4): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 111f\n"
+ "ld1 { v27.b }[0], [x20]\n"
+ "111:" // Oddments: Load (5, 4): Bit 2: End
+ "ldr d8, [x6, #0xc0]\n"
+ "usubl v27.8h, v27.8b, v18.8b\n"
+ "usubl v8.8h, v8.8b, v13.8b\n"
+ "ldr x20, [x5, #0x118]\n"
+ "smlal v23.4s, v27.4h, v30.4h\n"
+ "smlal2 v19.4s, v27.8h, v30.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v25.4h, v8.4h\n"
+ "smlal2 v15.4s, v25.8h, v8.8h\n"
+ "smlal v20.4s, v21.4h, v8.4h\n"
+ "smlal2 v5.4s, v21.8h, v8.8h\n"
+ "smlal v24.4s, v27.4h, v8.4h\n"
+ "smlal2 v22.4s, v27.8h, v8.8h\n"
+ "tbz x1, #2, 113f\n"
+ "ld1 { v9.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 112f\n"
+ "ld1 { v9.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 115f\n"
+ "ld1 { v9.b }[6], [x20]\n"
+ "b 115f\n"
+ "112:" // Oddments: Load (5, 5): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 115f\n"
+ "ld1 { v9.b }[4], [x20]\n"
+ "b 115f\n"
+ "113:" // Oddments: Load (5, 5): Bit 2: Unset
+ "tbz x1, #1, 114f\n"
+ "ld1 { v9.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 115f\n"
+ "ld1 { v9.b }[2], [x20]\n"
+ "b 115f\n"
+ "114:" // Oddments: Load (5, 5): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 115f\n"
+ "ld1 { v9.b }[0], [x20]\n"
+ "115:" // Oddments: Load (5, 5): Bit 2: End
+ "usubl v9.8h, v9.8b, v18.8b\n"
+ "smlal v23.4s, v9.4h, v8.4h\n"
+ "smlal2 v19.4s, v9.8h, v8.8h\n"
+ "tbz x1, #2, 117f\n"
+ "ld1 { v30.4s }, [x7], #0x10\n"
+ "ld1 { v12.4s }, [x8], #0x10\n"
+ "tbz x1, #1, 116f\n"
+ "ld1 { v14.d }[0], [x7], #0x8\n"
+ "ld1 { v27.d }[0], [x8], #0x8\n"
+ "tbz x1, #0, 119f\n"
+ "ld1 { v14.s }[2], [x7]\n"
+ "ld1 { v27.s }[2], [x8]\n"
+ "b 119f\n"
+ "116:" // Oddments: Load requant params: Bit 2: Bit 1: Unset
+ "tbz x1, #0, 119f\n"
+ "ld1 { v14.s }[0], [x7]\n"
+ "ld1 { v27.s }[0], [x8]\n"
+ "b 119f\n"
+ "117:" // Oddments: Load requant params: Bit 2: Unset
+ "tbz x1, #1, 118f\n"
+ "ld1 { v30.d }[0], [x7], #0x8\n"
+ "ld1 { v12.d }[0], [x8], #0x8\n"
+ "tbz x1, #0, 119f\n"
+ "ld1 { v30.s }[2], [x7]\n"
+ "ld1 { v12.s }[2], [x8]\n"
+ "b 119f\n"
+ "118:" // Oddments: Load requant params: Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 119f\n"
+ "ld1 { v30.s }[0], [x7]\n"
+ "ld1 { v12.s }[0], [x8]\n"
+ "119:" // Oddments: Load requant params: Bit 2: End
+ "sqrdmulh v7.4s, v7.4s, v30.4s\n"
+ "and v16.16b, v7.16b, v12.16b\n"
+ "add x17, x17, x4\n"
+ "add x16, x16, x4\n"
+ "sqrdmulh v15.4s, v15.4s, v14.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "add x15, x15, x4\n"
+ "add x14, x14, x4\n"
+ "and v2.16b, v15.16b, v27.16b\n"
+ "sqrdmulh v20.4s, v20.4s, v30.4s\n"
+ "sqrdmulh v24.4s, v24.4s, v30.4s\n"
+ "sqrdmulh v23.4s, v23.4s, v30.4s\n"
+ "sqadd v7.4s, v7.4s, v16.4s\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "and v21.16b, v20.16b, v12.16b\n"
+ "sqrdmulh v5.4s, v5.4s, v14.4s\n"
+ "and v18.16b, v24.16b, v12.16b\n"
+ "sqrdmulh v22.4s, v22.4s, v14.4s\n"
+ "and v31.16b, v23.16b, v12.16b\n"
+ "sqrdmulh v19.4s, v19.4s, v14.4s\n"
+ "sqadd v15.4s, v15.4s, v2.4s\n"
+ "sshr v21.4s, v21.4s, #0x1f\n"
+ "and v9.16b, v5.16b, v27.16b\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "and v4.16b, v22.16b, v27.16b\n"
+ "sshr v31.4s, v31.4s, #0x1f\n"
+ "and v28.16b, v19.16b, v27.16b\n"
+ "sqadd v20.4s, v20.4s, v21.4s\n"
+ "sshr v9.4s, v9.4s, #0x1f\n"
+ "sqadd v24.4s, v24.4s, v18.4s\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sqadd v23.4s, v23.4s, v31.4s\n"
+ "sshr v28.4s, v28.4s, #0x1f\n"
+ "srshl v7.4s, v7.4s, v12.4s\n"
+ "srshl v20.4s, v20.4s, v12.4s\n"
+ "sqadd v5.4s, v5.4s, v9.4s\n"
+ "srshl v24.4s, v24.4s, v12.4s\n"
+ "sqadd v22.4s, v22.4s, v4.4s\n"
+ "srshl v23.4s, v23.4s, v12.4s\n"
+ "sqadd v19.4s, v19.4s, v28.4s\n"
+ "srshl v15.4s, v15.4s, v27.4s\n"
+ "sqxtn v7.4h, v7.4s\n"
+ "srshl v5.4s, v5.4s, v27.4s\n"
+ "sqxtn v20.4h, v20.4s\n"
+ "srshl v22.4s, v22.4s, v27.4s\n"
+ "sqxtn v24.4h, v24.4s\n"
+ "srshl v19.4s, v19.4s, v27.4s\n"
+ "sqxtn v23.4h, v23.4s\n"
+ "sqxtn2 v7.8h, v15.4s\n"
+ "sqxtn2 v20.8h, v5.4s\n"
+ "sqxtn2 v24.8h, v22.4s\n"
+ "sqxtn2 v23.8h, v19.4s\n"
+ "sqadd v7.8h, v7.8h, v26.8h\n"
+ "sqadd v20.8h, v20.8h, v26.8h\n"
+ "sqadd v24.8h, v24.8h, v26.8h\n"
+ "sqadd v23.8h, v23.8h, v26.8h\n"
+ "smax v7.8h, v7.8h, v11.8h\n"
+ "smax v20.8h, v20.8h, v11.8h\n"
+ "smax v24.8h, v24.8h, v11.8h\n"
+ "smax v23.8h, v23.8h, v11.8h\n"
+ "smin v7.8h, v7.8h, v0.8h\n"
+ "smin v20.8h, v20.8h, v0.8h\n"
+ "smin v24.8h, v24.8h, v0.8h\n"
+ "smin v23.8h, v23.8h, v0.8h\n"
+ "uzp1 v7.16b, v7.16b, v7.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "tbz x1, #2, 121f\n"
+ "st1 { v7.s }[0], [x17], #0x4\n"
+ "st1 { v20.s }[0], [x16], #0x4\n"
+ "st1 { v24.s }[0], [x15], #0x4\n"
+ "st1 { v23.s }[0], [x14], #0x4\n"
+ "tbz x1, #1, 120f\n"
+ "st1 { v7.h }[2], [x17], #0x2\n"
+ "st1 { v20.h }[2], [x16], #0x2\n"
+ "st1 { v24.h }[2], [x15], #0x2\n"
+ "st1 { v23.h }[2], [x14], #0x2\n"
+ "tbz x1, #0, 123f\n"
+ "st1 { v7.b }[6], [x17], #0x1\n"
+ "st1 { v20.b }[6], [x16], #0x1\n"
+ "st1 { v24.b }[6], [x15], #0x1\n"
+ "st1 { v23.b }[6], [x14], #0x1\n"
+ "b 123f\n"
+ "120:" // Oddments: Bit 2: Bit 1: Unset
+ "tbz x1, #0, 123f\n"
+ "st1 { v7.b }[4], [x17], #0x1\n"
+ "st1 { v20.b }[4], [x16], #0x1\n"
+ "st1 { v24.b }[4], [x15], #0x1\n"
+ "st1 { v23.b }[4], [x14], #0x1\n"
+ "b 123f\n"
+ "121:" // Oddments: Bit 2: Unset
+ "tbz x1, #1, 122f\n"
+ "st1 { v7.h }[0], [x17], #0x2\n"
+ "st1 { v20.h }[0], [x16], #0x2\n"
+ "st1 { v24.h }[0], [x15], #0x2\n"
+ "st1 { v23.h }[0], [x14], #0x2\n"
+ "tbz x1, #0, 123f\n"
+ "st1 { v7.b }[2], [x17], #0x1\n"
+ "st1 { v20.b }[2], [x16], #0x1\n"
+ "st1 { v24.b }[2], [x15], #0x1\n"
+ "st1 { v23.b }[2], [x14], #0x1\n"
+ "b 123f\n"
+ "122:" // Oddments: Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 123f\n"
+ "st1 { v7.b }[0], [x17], #0x1\n"
+ "st1 { v20.b }[0], [x16], #0x1\n"
+ "st1 { v24.b }[0], [x15], #0x1\n"
+ "st1 { v23.b }[0], [x14], #0x1\n"
+ "123:" // Oddments: Bit 2: End
+ "124:" // End
+ :
+ : [offsetof_Params_bias] "I" (offsetof(Params, bias)), [offsetof_Params_inptrs] "I" (offsetof(Params, inptrs)), [offsetof_Params_n_channels] "I" (offsetof(Params, n_channels)), [offsetof_Params_outptrs] "I" (offsetof(Params, outptrs)), [offsetof_Params_requant] "I" (offsetof(Params, requant)), [offsetof_Params_requant_muls] "I" (offsetof(Params, requant_muls)), [offsetof_Params_requant_shifts] "I" (offsetof(Params, requant_shifts)), [offsetof_Params_weights] "I" (offsetof(Params, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [params] "r" (&params)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_generic_output9_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_generic_output9_mla_depthfirst.hpp
new file mode 100644
index 0000000000..814efe006e
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_generic_output9_mla_depthfirst.hpp
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_u8q_nhwc_generic_output9_mla_depthfirst_impl(const uint8_t *const *const, uint8_t *const *const, const void *, const arm_gemm::Requantize32&, const unsigned int, const unsigned int);
+
+// Strategy wrapper binding the hand-written AArch64 assembly kernel
+// (a64_u8q_nhwc_generic_output9_mla_depthfirst_impl) into the generic
+// depthfirst-depthwise framework. Template arguments: uint8_t input,
+// uint8_t weights, uint8_t output, int32_t accumulator.
+class a64_u8q_nhwc_generic_output9_mla_depthfirst : public GenericDepthfirstKernelStrategy<uint8_t, uint8_t, uint8_t, int32_t>
+{
+  // The kernel entry point; defined in the accompanying generic.cpp as an
+  // inline-assembly implementation.
+  KernelType kernel = a64_u8q_nhwc_generic_output9_mla_depthfirst_impl;
+
+  public:
+  // Constructed with (9, VLType::None): 9 presumably matches the
+  // "output9" in the kernel name (number of output points handled per
+  // call — TODO confirm against GenericDepthfirstKernelStrategy), and
+  // VLType::None indicates fixed-width NEON rather than SVE.
+  // The CPUInfo argument is accepted for interface uniformity but unused.
+  a64_u8q_nhwc_generic_output9_mla_depthfirst(const CPUInfo *) : GenericDepthfirstKernelStrategy<uint8_t, uint8_t, uint8_t, int32_t>(9, arm_gemm::VLType::None) {}
+
+  KernelType get_kernel() const override { return kernel; }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_generic_output9_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_generic_output9_mla_depthfirst/generic.cpp
new file mode 100644
index 0000000000..f7aa889b56
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_generic_output9_mla_depthfirst/generic.cpp
@@ -0,0 +1,618 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+
+#include "arm_gemm.hpp"
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_u8q_nhwc_generic_output9_mla_depthfirst_impl(
+ const uint8_t *const *const inptrs,
+ uint8_t *const *const outptrs,
+ const void *params,
+ const arm_gemm::Requantize32& qp,
+ const unsigned int n_points,
+ const unsigned int n_channels
+)
+{
+ __asm__ __volatile__(
+ "lsr x9, %x[n_channels], #0x2\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_minval]\n"
+ "ld1r { v8.4s }, [x20]\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_maxval]\n"
+ "ld1r { v7.4s }, [x20]\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_a_offset]\n"
+ "ld1r { v6.16b }, [x20]\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_b_offset]\n"
+ "ld1r { v5.16b }, [x20]\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_c_offset]\n"
+ "ld1r { v4.4s }, [x20]\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_per_layer_left_shift]\n"
+ "ld1r { v3.4s }, [x20]\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_per_layer_mul]\n"
+ "ld1r { v2.4s }, [x20]\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_per_layer_right_shift]\n"
+ "ld1r { v1.4s }, [x20]\n"
+ "mov x11, #0x0\n"
+ "cbz x9, 6f\n"
+ "1:" // Channel loop
+ "movi v23.4s, #0x0\n"
+ "cbz %x[bias], 2f\n"
+ "lsl x20, x11, #0x2\n"
+ "ldr q23, [%x[bias], x20]\n"
+ "2:" // Channel loop: Load bias: Done
+ "ldr s0, [%x[params]], #0x4\n"
+ "mov x25, %x[inptrs]\n"
+ "ldp x21, x20, [x25], #0x10\n"
+ "subs x24, %x[n_points], #0x1\n"
+ "ldr s14, [x21, x11]\n"
+ "ldr s15, [x20, x11]\n"
+ "mov v24.16b, v23.16b\n"
+ "mov v25.16b, v23.16b\n"
+ "ldp x21, x20, [x25], #0x10\n"
+ "ldr s16, [x21, x11]\n"
+ "mov v26.16b, v23.16b\n"
+ "mov v27.16b, v23.16b\n"
+ "ldr s17, [x20, x11]\n"
+ "ldp x21, x20, [x25], #0x10\n"
+ "mov v28.16b, v23.16b\n"
+ "mov v29.16b, v23.16b\n"
+ "ldr s18, [x21, x11]\n"
+ "ldr s19, [x20, x11]\n"
+ "mov v30.16b, v23.16b\n"
+ "mov v31.16b, v23.16b\n"
+ "ldp x21, x20, [x25], #0x10\n"
+ "ldr s20, [x21, x11]\n"
+ "usubl v0.8h, v0.8b, v5.8b\n"
+ "usubl v14.8h, v14.8b, v6.8b\n"
+ "ldr s21, [x20, x11]\n"
+ "ldr x20, [x25], #0x8\n"
+ "usubl v15.8h, v15.8b, v6.8b\n"
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "ldr s22, [x20, x11]\n"
+ "usubl v17.8h, v17.8b, v6.8b\n"
+ "usubl v18.8h, v18.8b, v6.8b\n"
+ "usubl v19.8h, v19.8b, v6.8b\n"
+ "usubl v20.8h, v20.8b, v6.8b\n"
+ "usubl v21.8h, v21.8b, v6.8b\n"
+ "usubl v22.8h, v22.8b, v6.8b\n"
+ "ble 4f\n"
+ "3:" // Channel loop: Planar loop
+ "ldp x23, x22, [x25], #0x10\n"
+ "ldp x21, x20, [x25], #0x10\n"
+ "smlal v23.4s, v14.4h, v0.4h\n"
+ "smlal v24.4s, v15.4h, v0.4h\n"
+ "ldr s14, [x23, x11]\n"
+ "ldr s15, [x22, x11]\n"
+ "smlal v25.4s, v16.4h, v0.4h\n"
+ "smlal v26.4s, v17.4h, v0.4h\n"
+ "ldr s16, [x21, x11]\n"
+ "ldr s17, [x20, x11]\n"
+ "smlal v27.4s, v18.4h, v0.4h\n"
+ "smlal v28.4s, v19.4h, v0.4h\n"
+ "ldp x21, x20, [x25], #0x10\n"
+ "ldr s18, [x21, x11]\n"
+ "smlal v29.4s, v20.4h, v0.4h\n"
+ "smlal v30.4s, v21.4h, v0.4h\n"
+ "ldr s19, [x20, x11]\n"
+ "ldp x21, x20, [x25], #0x10\n"
+ "smlal v31.4s, v22.4h, v0.4h\n"
+ "subs x24, x24, #0x1\n"
+ "ldr s0, [%x[params]], #0x4\n"
+ "ldr s20, [x21, x11]\n"
+ "usubl v0.8h, v0.8b, v5.8b\n"
+ "usubl v14.8h, v14.8b, v6.8b\n"
+ "ldr s21, [x20, x11]\n"
+ "ldr x20, [x25], #0x8\n"
+ "usubl v15.8h, v15.8b, v6.8b\n"
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "ldr s22, [x20, x11]\n"
+ "usubl v17.8h, v17.8b, v6.8b\n"
+ "usubl v18.8h, v18.8b, v6.8b\n"
+ "usubl v19.8h, v19.8b, v6.8b\n"
+ "usubl v20.8h, v20.8b, v6.8b\n"
+ "usubl v21.8h, v21.8b, v6.8b\n"
+ "usubl v22.8h, v22.8b, v6.8b\n"
+ "bgt 3b\n"
+ "4:" // Channel loop: Planar tail
+ "smlal v23.4s, v14.4h, v0.4h\n"
+ "smlal v24.4s, v15.4h, v0.4h\n"
+ "smlal v25.4s, v16.4h, v0.4h\n"
+ "smlal v26.4s, v17.4h, v0.4h\n"
+ "smlal v27.4s, v18.4h, v0.4h\n"
+ "smlal v28.4s, v19.4h, v0.4h\n"
+ "smlal v29.4s, v20.4h, v0.4h\n"
+ "smlal v30.4s, v21.4h, v0.4h\n"
+ "smlal v31.4s, v22.4h, v0.4h\n"
+ "cbz %x[rq_mul_ptr], 5f\n"
+ "lsl x20, x11, #0x2\n"
+ "ldr q2, [%x[rq_mul_ptr], x20]\n"
+ "ldr q1, [%x[rq_right_shift_ptr], x20]\n"
+ "cbz %x[rq_left_shift_ptr], 5f\n"
+ "ldr q3, [%x[rq_left_shift_ptr], x20]\n"
+ "5:" // Channel loop: Load quantisation parameters: Done
+ "sshl v23.4s, v23.4s, v3.4s\n"
+ "sshl v24.4s, v24.4s, v3.4s\n"
+ "ldp x28, x27, [%x[outptrs], #0x0]\n"
+ "ldp x26, x25, [%x[outptrs], #0x10]\n"
+ "sshl v25.4s, v25.4s, v3.4s\n"
+ "sqrdmulh v23.4s, v23.4s, v2.4s\n"
+ "ldp x24, x23, [%x[outptrs], #0x20]\n"
+ "ldp x22, x21, [%x[outptrs], #0x30]\n"
+ "sqrdmulh v24.4s, v24.4s, v2.4s\n"
+ "sqrdmulh v25.4s, v25.4s, v2.4s\n"
+ "ldr x20, [%x[outptrs], #0x40]\n"
+ "and v18.16b, v23.16b, v1.16b\n"
+ "and v17.16b, v24.16b, v1.16b\n"
+ "and v16.16b, v25.16b, v1.16b\n"
+ "sshl v26.4s, v26.4s, v3.4s\n"
+ "sshl v27.4s, v27.4s, v3.4s\n"
+ "sshl v28.4s, v28.4s, v3.4s\n"
+ "sshl v29.4s, v29.4s, v3.4s\n"
+ "sshl v30.4s, v30.4s, v3.4s\n"
+ "sshl v31.4s, v31.4s, v3.4s\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqrdmulh v26.4s, v26.4s, v2.4s\n"
+ "sqrdmulh v27.4s, v27.4s, v2.4s\n"
+ "sqrdmulh v28.4s, v28.4s, v2.4s\n"
+ "sqrdmulh v29.4s, v29.4s, v2.4s\n"
+ "sqrdmulh v30.4s, v30.4s, v2.4s\n"
+ "sqrdmulh v31.4s, v31.4s, v2.4s\n"
+ "sqadd v23.4s, v23.4s, v18.4s\n"
+ "sqadd v24.4s, v24.4s, v17.4s\n"
+ "sqadd v25.4s, v25.4s, v16.4s\n"
+ "and v21.16b, v26.16b, v1.16b\n"
+ "and v20.16b, v27.16b, v1.16b\n"
+ "and v19.16b, v28.16b, v1.16b\n"
+ "and v18.16b, v29.16b, v1.16b\n"
+ "and v17.16b, v30.16b, v1.16b\n"
+ "and v16.16b, v31.16b, v1.16b\n"
+ "sshr v21.4s, v21.4s, #0x1f\n"
+ "sshr v20.4s, v20.4s, #0x1f\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqadd v26.4s, v26.4s, v21.4s\n"
+ "sqadd v27.4s, v27.4s, v20.4s\n"
+ "sqadd v28.4s, v28.4s, v19.4s\n"
+ "sqadd v29.4s, v29.4s, v18.4s\n"
+ "sqadd v30.4s, v30.4s, v17.4s\n"
+ "sqadd v31.4s, v31.4s, v16.4s\n"
+ "srshl v23.4s, v23.4s, v1.4s\n"
+ "srshl v24.4s, v24.4s, v1.4s\n"
+ "srshl v25.4s, v25.4s, v1.4s\n"
+ "srshl v26.4s, v26.4s, v1.4s\n"
+ "srshl v27.4s, v27.4s, v1.4s\n"
+ "srshl v28.4s, v28.4s, v1.4s\n"
+ "srshl v29.4s, v29.4s, v1.4s\n"
+ "srshl v30.4s, v30.4s, v1.4s\n"
+ "srshl v31.4s, v31.4s, v1.4s\n"
+ "add v23.4s, v23.4s, v4.4s\n"
+ "add v24.4s, v24.4s, v4.4s\n"
+ "add v25.4s, v25.4s, v4.4s\n"
+ "add v26.4s, v26.4s, v4.4s\n"
+ "add v27.4s, v27.4s, v4.4s\n"
+ "add v28.4s, v28.4s, v4.4s\n"
+ "add v29.4s, v29.4s, v4.4s\n"
+ "add v30.4s, v30.4s, v4.4s\n"
+ "add v31.4s, v31.4s, v4.4s\n"
+ "smax v23.4s, v23.4s, v8.4s\n"
+ "smax v24.4s, v24.4s, v8.4s\n"
+ "smax v25.4s, v25.4s, v8.4s\n"
+ "smax v26.4s, v26.4s, v8.4s\n"
+ "smax v27.4s, v27.4s, v8.4s\n"
+ "smax v28.4s, v28.4s, v8.4s\n"
+ "smax v29.4s, v29.4s, v8.4s\n"
+ "smax v30.4s, v30.4s, v8.4s\n"
+ "smax v31.4s, v31.4s, v8.4s\n"
+ "smin v23.4s, v23.4s, v7.4s\n"
+ "smin v24.4s, v24.4s, v7.4s\n"
+ "smin v25.4s, v25.4s, v7.4s\n"
+ "smin v26.4s, v26.4s, v7.4s\n"
+ "smin v27.4s, v27.4s, v7.4s\n"
+ "smin v28.4s, v28.4s, v7.4s\n"
+ "smin v29.4s, v29.4s, v7.4s\n"
+ "smin v30.4s, v30.4s, v7.4s\n"
+ "smin v31.4s, v31.4s, v7.4s\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "str s23, [x28, x11]\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "str s24, [x27, x11]\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "str s25, [x26, x11]\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "str s26, [x25, x11]\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "str s27, [x24, x11]\n"
+ "str s28, [x23, x11]\n"
+ "str s29, [x22, x11]\n"
+ "str s30, [x21, x11]\n"
+ "str s31, [x20, x11]\n"
+ "add x11, x11, #0x4\n"
+ "cmp x11, x9, LSL #2\n"
+ "blt 1b\n"
+ "6:" // Oddments
+ "tst %x[n_channels], #0x3\n"
+ "beq 24f\n"
+ "movi v23.4s, #0x0\n"
+ "cbz %x[bias], 9f\n"
+ "add x20, %x[bias], x11, LSL #2\n"
+ "tbz %x[n_channels], #1, 7f\n"
+ "ld1 { v23.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 8f\n"
+ "ld1 { v23.s }[2], [x20], #0x4\n"
+ "b 8f\n"
+ "7:" // Oddments: Load bias: Bit 1: Unset
+ "ld1 { v23.s }[0], [x20], #0x4\n"
+ "8:" // Oddments: Load bias: Bit 1: End
+ "9:" // Oddments: Load bias: Done
+ "ldr s0, [%x[params]], #0x4\n"
+ "mov x10, %x[inptrs]\n"
+ "ldp x9, x28, [x10], #0x10\n"
+ "mov v24.16b, v23.16b\n"
+ "ldp x27, x26, [x10], #0x10\n"
+ "ldp x25, x24, [x10], #0x10\n"
+ "mov v25.16b, v23.16b\n"
+ "mov v26.16b, v23.16b\n"
+ "ldp x23, x22, [x10], #0x10\n"
+ "ldr x21, [x10], #0x8\n"
+ "mov v27.16b, v23.16b\n"
+ "mov v28.16b, v23.16b\n"
+ "mov v29.16b, v23.16b\n"
+ "mov v30.16b, v23.16b\n"
+ "add x9, x9, x11\n"
+ "add x28, x28, x11\n"
+ "mov v31.16b, v23.16b\n"
+ "usubl v0.8h, v0.8b, v5.8b\n"
+ "add x27, x27, x11\n"
+ "add x26, x26, x11\n"
+ "add x25, x25, x11\n"
+ "add x24, x24, x11\n"
+ "add x23, x23, x11\n"
+ "add x22, x22, x11\n"
+ "add x21, x21, x11\n"
+ "tbz %x[n_channels], #1, 10f\n"
+ "ldr h14, [x9], #0x2\n"
+ "ldr h15, [x28], #0x2\n"
+ "ldr h16, [x27], #0x2\n"
+ "ldr h17, [x26], #0x2\n"
+ "ldr h18, [x25], #0x2\n"
+ "ldr h19, [x24], #0x2\n"
+ "ldr h20, [x23], #0x2\n"
+ "ldr h21, [x22], #0x2\n"
+ "ldr h22, [x21], #0x2\n"
+ "tbz %x[n_channels], #0, 11f\n"
+ "ld1 { v14.b }[2], [x9], #0x1\n"
+ "ld1 { v15.b }[2], [x28], #0x1\n"
+ "ld1 { v16.b }[2], [x27], #0x1\n"
+ "ld1 { v17.b }[2], [x26], #0x1\n"
+ "ld1 { v18.b }[2], [x25], #0x1\n"
+ "ld1 { v19.b }[2], [x24], #0x1\n"
+ "ld1 { v20.b }[2], [x23], #0x1\n"
+ "ld1 { v21.b }[2], [x22], #0x1\n"
+ "ld1 { v22.b }[2], [x21], #0x1\n"
+ "b 11f\n"
+ "10:" // Oddments: Load: Bit 1: Unset
+ "ldr b14, [x9], #0x1\n"
+ "ldr b15, [x28], #0x1\n"
+ "ldr b16, [x27], #0x1\n"
+ "ldr b17, [x26], #0x1\n"
+ "ldr b18, [x25], #0x1\n"
+ "ldr b19, [x24], #0x1\n"
+ "ldr b20, [x23], #0x1\n"
+ "ldr b21, [x22], #0x1\n"
+ "ldr b22, [x21], #0x1\n"
+ "11:" // Oddments: Load: Bit 1: End
+ "subs x20, %x[n_points], #0x1\n"
+ "usubl v14.8h, v14.8b, v6.8b\n"
+ "usubl v15.8h, v15.8b, v6.8b\n"
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "usubl v17.8h, v17.8b, v6.8b\n"
+ "usubl v18.8h, v18.8b, v6.8b\n"
+ "usubl v19.8h, v19.8b, v6.8b\n"
+ "usubl v20.8h, v20.8b, v6.8b\n"
+ "usubl v21.8h, v21.8b, v6.8b\n"
+ "usubl v22.8h, v22.8b, v6.8b\n"
+ "ble 15f\n"
+ "12:" // Oddments: Planar loop
+ "ldp x9, x28, [x10], #0x10\n"
+ "ldp x27, x26, [x10], #0x10\n"
+ "smlal v23.4s, v14.4h, v0.4h\n"
+ "smlal v24.4s, v15.4h, v0.4h\n"
+ "ldp x25, x24, [x10], #0x10\n"
+ "ldp x23, x22, [x10], #0x10\n"
+ "smlal v25.4s, v16.4h, v0.4h\n"
+ "smlal v26.4s, v17.4h, v0.4h\n"
+ "smlal v27.4s, v18.4h, v0.4h\n"
+ "smlal v28.4s, v19.4h, v0.4h\n"
+ "ldr x21, [x10], #0x8\n"
+ "add x9, x9, x11\n"
+ "smlal v29.4s, v20.4h, v0.4h\n"
+ "smlal v30.4s, v21.4h, v0.4h\n"
+ "add x28, x28, x11\n"
+ "add x27, x27, x11\n"
+ "smlal v31.4s, v22.4h, v0.4h\n"
+ "ldr s0, [%x[params]], #0x4\n"
+ "usubl v0.8h, v0.8b, v5.8b\n"
+ "add x26, x26, x11\n"
+ "add x25, x25, x11\n"
+ "add x24, x24, x11\n"
+ "add x23, x23, x11\n"
+ "add x22, x22, x11\n"
+ "add x21, x21, x11\n"
+ "tbz %x[n_channels], #1, 13f\n"
+ "ldr h14, [x9], #0x2\n"
+ "ldr h15, [x28], #0x2\n"
+ "ldr h16, [x27], #0x2\n"
+ "ldr h17, [x26], #0x2\n"
+ "ldr h18, [x25], #0x2\n"
+ "ldr h19, [x24], #0x2\n"
+ "ldr h20, [x23], #0x2\n"
+ "ldr h21, [x22], #0x2\n"
+ "ldr h22, [x21], #0x2\n"
+ "tbz %x[n_channels], #0, 14f\n"
+ "ld1 { v14.b }[2], [x9], #0x1\n"
+ "ld1 { v15.b }[2], [x28], #0x1\n"
+ "ld1 { v16.b }[2], [x27], #0x1\n"
+ "ld1 { v17.b }[2], [x26], #0x1\n"
+ "ld1 { v18.b }[2], [x25], #0x1\n"
+ "ld1 { v19.b }[2], [x24], #0x1\n"
+ "ld1 { v20.b }[2], [x23], #0x1\n"
+ "ld1 { v21.b }[2], [x22], #0x1\n"
+ "ld1 { v22.b }[2], [x21], #0x1\n"
+ "b 14f\n"
+ "13:" // Oddments: Planar loop: Load: Bit 1: Unset
+ "ldr b14, [x9], #0x1\n"
+ "ldr b15, [x28], #0x1\n"
+ "ldr b16, [x27], #0x1\n"
+ "ldr b17, [x26], #0x1\n"
+ "ldr b18, [x25], #0x1\n"
+ "ldr b19, [x24], #0x1\n"
+ "ldr b20, [x23], #0x1\n"
+ "ldr b21, [x22], #0x1\n"
+ "ldr b22, [x21], #0x1\n"
+ "14:" // Oddments: Planar loop: Load: Bit 1: End
+ "subs x20, x20, #0x1\n"
+ "usubl v14.8h, v14.8b, v6.8b\n"
+ "usubl v15.8h, v15.8b, v6.8b\n"
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "usubl v17.8h, v17.8b, v6.8b\n"
+ "usubl v18.8h, v18.8b, v6.8b\n"
+ "usubl v19.8h, v19.8b, v6.8b\n"
+ "usubl v20.8h, v20.8b, v6.8b\n"
+ "usubl v21.8h, v21.8b, v6.8b\n"
+ "usubl v22.8h, v22.8b, v6.8b\n"
+ "bgt 12b\n"
+ "15:" // Oddments: Planar tail
+ "smlal v23.4s, v14.4h, v0.4h\n"
+ "smlal v24.4s, v15.4h, v0.4h\n"
+ "smlal v25.4s, v16.4h, v0.4h\n"
+ "smlal v26.4s, v17.4h, v0.4h\n"
+ "smlal v27.4s, v18.4h, v0.4h\n"
+ "smlal v28.4s, v19.4h, v0.4h\n"
+ "smlal v29.4s, v20.4h, v0.4h\n"
+ "smlal v30.4s, v21.4h, v0.4h\n"
+ "smlal v31.4s, v22.4h, v0.4h\n"
+ "cbz %x[rq_mul_ptr], 21f\n"
+ "add x22, %x[rq_mul_ptr], x11, LSL #2\n"
+ "add x21, %x[rq_right_shift_ptr], x11, LSL #2\n"
+ "add x20, %x[rq_left_shift_ptr], x11, LSL #2\n"
+ "tbz %x[n_channels], #1, 18f\n"
+ "ld1 { v2.d }[0], [x22], #0x8\n"
+ "ld1 { v1.d }[0], [x21], #0x8\n"
+ "cbz %x[rq_left_shift_ptr], 16f\n"
+ "ld1 { v3.d }[0], [x20], #0x8\n"
+ "16:" // Oddments: Load quantisation parameters: Bit 1: Load left shift: Done
+ "tbz %x[n_channels], #0, 20f\n"
+ "ld1 { v2.s }[2], [x22], #0x4\n"
+ "ld1 { v1.s }[2], [x21], #0x4\n"
+ "cbz %x[rq_left_shift_ptr], 17f\n"
+ "ld1 { v3.s }[2], [x20], #0x4\n"
+ "17:" // Oddments: Load quantisation parameters: Bit 1: Bit 0: Load left shift: Done
+ "b 20f\n"
+ "18:" // Oddments: Load quantisation parameters: Bit 1: Unset
+ "ld1 { v2.s }[0], [x22], #0x4\n"
+ "ld1 { v1.s }[0], [x21], #0x4\n"
+ "cbz %x[rq_left_shift_ptr], 19f\n"
+ "ld1 { v3.s }[0], [x20], #0x4\n"
+ "19:" // Oddments: Load quantisation parameters: Bit 1: Unset: Bit 0: Load left shift: Done
+ "20:" // Oddments: Load quantisation parameters: Bit 1: End
+ "21:" // Oddments: Load quantisation parameters: Done
+ "sshl v23.4s, v23.4s, v3.4s\n"
+ "sshl v24.4s, v24.4s, v3.4s\n"
+ "ldp x28, x27, [%x[outptrs], #0x0]\n"
+ "ldp x26, x25, [%x[outptrs], #0x10]\n"
+ "sshl v25.4s, v25.4s, v3.4s\n"
+ "sqrdmulh v23.4s, v23.4s, v2.4s\n"
+ "ldp x24, x23, [%x[outptrs], #0x20]\n"
+ "ldp x22, x21, [%x[outptrs], #0x30]\n"
+ "sqrdmulh v24.4s, v24.4s, v2.4s\n"
+ "sqrdmulh v25.4s, v25.4s, v2.4s\n"
+ "ldr x20, [%x[outptrs], #0x40]\n"
+ "add x28, x28, x11\n"
+ "and v18.16b, v23.16b, v1.16b\n"
+ "and v17.16b, v24.16b, v1.16b\n"
+ "add x27, x27, x11\n"
+ "add x26, x26, x11\n"
+ "and v16.16b, v25.16b, v1.16b\n"
+ "sshl v26.4s, v26.4s, v3.4s\n"
+ "add x25, x25, x11\n"
+ "add x24, x24, x11\n"
+ "sshl v27.4s, v27.4s, v3.4s\n"
+ "sshl v28.4s, v28.4s, v3.4s\n"
+ "add x23, x23, x11\n"
+ "add x22, x22, x11\n"
+ "sshl v29.4s, v29.4s, v3.4s\n"
+ "sshl v30.4s, v30.4s, v3.4s\n"
+ "add x21, x21, x11\n"
+ "add x20, x20, x11\n"
+ "sshl v31.4s, v31.4s, v3.4s\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqrdmulh v26.4s, v26.4s, v2.4s\n"
+ "sqrdmulh v27.4s, v27.4s, v2.4s\n"
+ "sqrdmulh v28.4s, v28.4s, v2.4s\n"
+ "sqrdmulh v29.4s, v29.4s, v2.4s\n"
+ "sqrdmulh v30.4s, v30.4s, v2.4s\n"
+ "sqrdmulh v31.4s, v31.4s, v2.4s\n"
+ "sqadd v23.4s, v23.4s, v18.4s\n"
+ "sqadd v24.4s, v24.4s, v17.4s\n"
+ "sqadd v25.4s, v25.4s, v16.4s\n"
+ "and v21.16b, v26.16b, v1.16b\n"
+ "and v20.16b, v27.16b, v1.16b\n"
+ "and v19.16b, v28.16b, v1.16b\n"
+ "and v18.16b, v29.16b, v1.16b\n"
+ "and v17.16b, v30.16b, v1.16b\n"
+ "and v16.16b, v31.16b, v1.16b\n"
+ "sshr v21.4s, v21.4s, #0x1f\n"
+ "sshr v20.4s, v20.4s, #0x1f\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqadd v26.4s, v26.4s, v21.4s\n"
+ "sqadd v27.4s, v27.4s, v20.4s\n"
+ "sqadd v28.4s, v28.4s, v19.4s\n"
+ "sqadd v29.4s, v29.4s, v18.4s\n"
+ "sqadd v30.4s, v30.4s, v17.4s\n"
+ "sqadd v31.4s, v31.4s, v16.4s\n"
+ "srshl v23.4s, v23.4s, v1.4s\n"
+ "srshl v24.4s, v24.4s, v1.4s\n"
+ "srshl v25.4s, v25.4s, v1.4s\n"
+ "srshl v26.4s, v26.4s, v1.4s\n"
+ "srshl v27.4s, v27.4s, v1.4s\n"
+ "srshl v28.4s, v28.4s, v1.4s\n"
+ "srshl v29.4s, v29.4s, v1.4s\n"
+ "srshl v30.4s, v30.4s, v1.4s\n"
+ "srshl v31.4s, v31.4s, v1.4s\n"
+ "add v23.4s, v23.4s, v4.4s\n"
+ "add v24.4s, v24.4s, v4.4s\n"
+ "add v25.4s, v25.4s, v4.4s\n"
+ "add v26.4s, v26.4s, v4.4s\n"
+ "add v27.4s, v27.4s, v4.4s\n"
+ "add v28.4s, v28.4s, v4.4s\n"
+ "add v29.4s, v29.4s, v4.4s\n"
+ "add v30.4s, v30.4s, v4.4s\n"
+ "add v31.4s, v31.4s, v4.4s\n"
+ "smax v23.4s, v23.4s, v8.4s\n"
+ "smax v24.4s, v24.4s, v8.4s\n"
+ "smax v25.4s, v25.4s, v8.4s\n"
+ "smax v26.4s, v26.4s, v8.4s\n"
+ "smax v27.4s, v27.4s, v8.4s\n"
+ "smax v28.4s, v28.4s, v8.4s\n"
+ "smax v29.4s, v29.4s, v8.4s\n"
+ "smax v30.4s, v30.4s, v8.4s\n"
+ "smax v31.4s, v31.4s, v8.4s\n"
+ "smin v23.4s, v23.4s, v7.4s\n"
+ "smin v24.4s, v24.4s, v7.4s\n"
+ "smin v25.4s, v25.4s, v7.4s\n"
+ "smin v26.4s, v26.4s, v7.4s\n"
+ "smin v27.4s, v27.4s, v7.4s\n"
+ "smin v28.4s, v28.4s, v7.4s\n"
+ "smin v29.4s, v29.4s, v7.4s\n"
+ "smin v30.4s, v30.4s, v7.4s\n"
+ "smin v31.4s, v31.4s, v7.4s\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "tbz %x[n_channels], #1, 22f\n"
+ "st1 { v23.h }[0], [x28], #0x2\n"
+ "st1 { v24.h }[0], [x27], #0x2\n"
+ "st1 { v25.h }[0], [x26], #0x2\n"
+ "st1 { v26.h }[0], [x25], #0x2\n"
+ "st1 { v27.h }[0], [x24], #0x2\n"
+ "st1 { v28.h }[0], [x23], #0x2\n"
+ "st1 { v29.h }[0], [x22], #0x2\n"
+ "st1 { v30.h }[0], [x21], #0x2\n"
+ "st1 { v31.h }[0], [x20], #0x2\n"
+ "tbz %x[n_channels], #0, 23f\n"
+ "st1 { v23.b }[2], [x28], #0x1\n"
+ "st1 { v24.b }[2], [x27], #0x1\n"
+ "st1 { v25.b }[2], [x26], #0x1\n"
+ "st1 { v26.b }[2], [x25], #0x1\n"
+ "st1 { v27.b }[2], [x24], #0x1\n"
+ "st1 { v28.b }[2], [x23], #0x1\n"
+ "st1 { v29.b }[2], [x22], #0x1\n"
+ "st1 { v30.b }[2], [x21], #0x1\n"
+ "st1 { v31.b }[2], [x20], #0x1\n"
+ "b 23f\n"
+ "22:" // Oddments: Store: Bit 1: Unset
+ "st1 { v23.b }[0], [x28], #0x1\n"
+ "st1 { v24.b }[0], [x27], #0x1\n"
+ "st1 { v25.b }[0], [x26], #0x1\n"
+ "st1 { v26.b }[0], [x25], #0x1\n"
+ "st1 { v27.b }[0], [x24], #0x1\n"
+ "st1 { v28.b }[0], [x23], #0x1\n"
+ "st1 { v29.b }[0], [x22], #0x1\n"
+ "st1 { v30.b }[0], [x21], #0x1\n"
+ "st1 { v31.b }[0], [x20], #0x1\n"
+ "23:" // Oddments: Store: Bit 1: End
+ "24:" // End
+ : [params] "+&r" (params)
+ : [bias] "r" (qp.bias), [inptrs] "r" (inptrs), [n_channels] "r" ((uint64_t) n_channels), [n_points] "r" ((uint64_t) n_points), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [offsetof_Requantize32_per_layer_left_shift] "I" (offsetof(arm_gemm::Requantize32, per_layer_left_shift)), [offsetof_Requantize32_per_layer_mul] "I" (offsetof(arm_gemm::Requantize32, per_layer_mul)), [offsetof_Requantize32_per_layer_right_shift] "I" (offsetof(arm_gemm::Requantize32, per_layer_right_shift)), [outptrs] "r" (outptrs), [qp] "r" (&qp), [rq_left_shift_ptr] "r" (qp.per_channel_left_shifts), [rq_mul_ptr] "r" (qp.per_channel_muls), [rq_right_shift_ptr] "r" (qp.per_channel_right_shifts)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst.hpp
new file mode 100644
index 0000000000..76965606f7
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst.hpp
@@ -0,0 +1,61 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst_impl(const uint8_t *const *const, uint8_t *const *const, const void *, unsigned int, const arm_gemm::Requantize32&);
+
+struct a64_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst : DepthfirstMultiplierStrategy<uint8_t, uint8_t, uint8_t, int32_t>
+{
+ using Parent = DepthfirstMultiplierStrategy<uint8_t, uint8_t, uint8_t, int32_t>;
+ constexpr static unsigned int kernel_rows = 3;
+ constexpr static unsigned int kernel_cols = 3;
+
+ constexpr static unsigned int stride_rows = 2;
+ constexpr static unsigned int stride_cols = 2;
+
+ a64_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst(const CPUInfo *)
+ : Parent(2, 4, kernel_rows, kernel_cols, stride_rows, stride_cols)
+ {
+ }
+
+ arm_gemm::VLType get_vl_type() const override { return arm_gemm::VLType::None; }
+
+ Parent::KernelType kernel = a64_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst_impl;
+ Parent::KernelType get_kernel(void) const override { return kernel; }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst/generic.cpp
new file mode 100644
index 0000000000..d69f391514
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst/generic.cpp
@@ -0,0 +1,519 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+
+#include "arm_gemm.hpp"
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst_impl(
+ const uint8_t *const *const inptrs,
+ uint8_t *const *const outptrs,
+ const void *params,
+ unsigned int n_output_channels,
+ const arm_gemm::Requantize32& qp
+)
+{
+ __asm__ __volatile__(
+ "ldr q11, [%x[params], #0x0]\n"
+ "ldr q5, [%x[params], #0x10]\n"
+ "movi v8.16b, #0x1\n"
+ "ushr v8.4s, v8.4s, #0x8\n"
+ "ldr q6, [%x[params], #0x20]\n"
+ "ldr q7, [%x[params], #0x30]\n"
+ "movi v24.4s, #0x0\n"
+ "movi v25.4s, #0x0\n"
+ "ldr x20, [%x[inptrs], #0x8]\n"
+ "ld1 { v1.16b }, [x20]\n"
+ "mov v28.16b, v1.16b\n"
+ "mov v23.16b, v1.16b\n"
+ "ldr x20, [%x[inptrs], #0x10]\n"
+ "ld1 { v2.16b }, [x20]\n"
+ "mov v30.16b, v1.16b\n"
+ "mov v21.16b, v2.16b\n"
+ "ldr x20, [%x[inptrs], #0x20]\n"
+ "ld1 { v4.16b }, [x20]\n"
+ "mov v20.16b, v2.16b\n"
+ "mov v29.16b, v2.16b\n"
+ "ldr x20, [%x[inptrs], #0x0]\n"
+ "ld1 { v0.16b }, [x20]\n"
+ "mov v9.16b, v4.16b\n"
+ "mov v22.16b, v4.16b\n"
+ "ldr x20, [%x[inptrs], #0x18]\n"
+ "ld1 { v3.16b }, [x20]\n"
+ "mov v31.16b, v4.16b\n"
+ "ext v28.16b, v28.16b, v28.16b, #0x2\n"
+ "ext v23.16b, v23.16b, v23.16b, #0x4\n"
+ "ext v30.16b, v30.16b, v30.16b, #0x6\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_b_offset]\n"
+ "ld1r { v12.4s }, [x20]\n"
+ "ext v21.16b, v21.16b, v21.16b, #0x2\n"
+ "ext v20.16b, v20.16b, v20.16b, #0x4\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_c_offset]\n"
+ "ld1r { v14.4s }, [x20]\n"
+ "ext v29.16b, v29.16b, v29.16b, #0x6\n"
+ "ext v9.16b, v9.16b, v9.16b, #0x2\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_minval]\n"
+ "ld1r { v13.4s }, [x20]\n"
+ "ext v22.16b, v22.16b, v22.16b, #0x4\n"
+ "ext v31.16b, v31.16b, v31.16b, #0x6\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_maxval]\n"
+ "ld1r { v15.4s }, [x20]\n"
+ "mov v27.16b, v0.16b\n"
+ "mov v19.16b, v0.16b\n"
+ "cmp %x[n_channels], #0x4\n"
+ "mov x9, #0x0\n"
+ "mov v18.16b, v0.16b\n"
+ "mov v26.16b, v3.16b\n"
+ "mov x28, #0x0\n"
+ "ldp x27, x26, [%x[outptrs], #0x0]\n"
+ "mov v17.16b, v3.16b\n"
+ "mov v16.16b, v3.16b\n"
+ "ldp x25, x24, [%x[outptrs], #0x10]\n"
+ "ldp x23, x22, [%x[outptrs], #0x20]\n"
+ "ext v27.16b, v27.16b, v27.16b, #0x2\n"
+ "ext v19.16b, v19.16b, v19.16b, #0x4\n"
+ "ldp x21, x20, [%x[outptrs], #0x30]\n"
+ "add %x[params], %x[params], #0x40\n"
+ "ext v18.16b, v18.16b, v18.16b, #0x6\n"
+ "zip1 v1.4s, v1.4s, v23.4s\n"
+ "zip1 v28.4s, v28.4s, v30.4s\n"
+ "zip1 v2.4s, v2.4s, v20.4s\n"
+ "zip1 v21.4s, v21.4s, v29.4s\n"
+ "ext v26.16b, v26.16b, v26.16b, #0x2\n"
+ "ext v17.16b, v17.16b, v17.16b, #0x4\n"
+ "ext v16.16b, v16.16b, v16.16b, #0x6\n"
+ "zip1 v4.4s, v4.4s, v22.4s\n"
+ "zip1 v9.4s, v9.4s, v31.4s\n"
+ "zip1 v0.4s, v0.4s, v19.4s\n"
+ "zip1 v27.4s, v27.4s, v18.4s\n"
+ "zip1 v1.4s, v1.4s, v28.4s\n"
+ "zip1 v2.4s, v2.4s, v21.4s\n"
+ ".inst 0x6f81e118 // udot v24.4s, v8.16b, v1.4b[0]\n"
+ "zip1 v3.4s, v3.4s, v17.4s\n"
+ "zip1 v26.4s, v26.4s, v16.4s\n"
+ ".inst 0x6fa1e119 // udot v25.4s, v8.16b, v1.4b[1]\n"
+ "zip1 v4.4s, v4.4s, v9.4s\n"
+ "movi v23.4s, #0x0\n"
+ ".inst 0x6f81e917 // udot v23.4s, v8.16b, v1.4b[2]\n"
+ "movi v22.4s, #0x0\n"
+ "movi v21.4s, #0x0\n"
+ ".inst 0x6fa1e916 // udot v22.4s, v8.16b, v1.4b[3]\n"
+ "movi v19.4s, #0x0\n"
+ "movi v9.4s, #0x0\n"
+ ".inst 0x6f82e115 // udot v21.4s, v8.16b, v2.4b[0]\n"
+ "movi v10.4s, #0x0\n"
+ "movi v20.4s, #0x0\n"
+ ".inst 0x6fa2e113 // udot v19.4s, v8.16b, v2.4b[1]\n"
+ "movi v18.4s, #0x0\n"
+ "movi v17.4s, #0x0\n"
+ ".inst 0x6f82e909 // udot v9.4s, v8.16b, v2.4b[2]\n"
+ "movi v16.4s, #0x0\n"
+ "zip1 v0.4s, v0.4s, v27.4s\n"
+ ".inst 0x6fa2e90a // udot v10.4s, v8.16b, v2.4b[3]\n"
+ "zip1 v3.4s, v3.4s, v26.4s\n"
+ ".inst 0x6f84e114 // udot v20.4s, v8.16b, v4.4b[0]\n"
+ ".inst 0x6fa4e112 // udot v18.4s, v8.16b, v4.4b[1]\n"
+ ".inst 0x6f84e911 // udot v17.4s, v8.16b, v4.4b[2]\n"
+ ".inst 0x6fa4e910 // udot v16.4s, v8.16b, v4.4b[3]\n"
+ "movi v31.4s, #0x0\n"
+ "movi v30.4s, #0x0\n"
+ "movi v26.4s, #0x0\n"
+ ".inst 0x6f80e11f // udot v31.4s, v8.16b, v0.4b[0]\n"
+ "movi v27.4s, #0x0\n"
+ "movi v28.4s, #0x0\n"
+ ".inst 0x6fa0e11e // udot v30.4s, v8.16b, v0.4b[1]\n"
+ "movi v29.4s, #0x0\n"
+ ".inst 0x6f80e91a // udot v26.4s, v8.16b, v0.4b[2]\n"
+ ".inst 0x6fa0e91b // udot v27.4s, v8.16b, v0.4b[3]\n"
+ ".inst 0x6f83e11c // udot v28.4s, v8.16b, v3.4b[0]\n"
+ ".inst 0x6fa3e11d // udot v29.4s, v8.16b, v3.4b[1]\n"
+ "add v24.4s, v24.4s, v21.4s\n"
+ "add v25.4s, v25.4s, v19.4s\n"
+ "add v23.4s, v23.4s, v9.4s\n"
+ "add v22.4s, v22.4s, v10.4s\n"
+ "add v21.4s, v20.4s, v21.4s\n"
+ "movi v20.4s, #0x0\n"
+ ".inst 0x6f83e914 // udot v20.4s, v8.16b, v3.4b[2]\n"
+ "add v19.4s, v18.4s, v19.4s\n"
+ "movi v18.4s, #0x0\n"
+ ".inst 0x6fa3e912 // udot v18.4s, v8.16b, v3.4b[3]\n"
+ "add v17.4s, v17.4s, v9.4s\n"
+ "add v16.4s, v16.4s, v10.4s\n"
+ "add v24.4s, v24.4s, v31.4s\n"
+ "add v25.4s, v25.4s, v30.4s\n"
+ "add v26.4s, v23.4s, v26.4s\n"
+ "add v27.4s, v22.4s, v27.4s\n"
+ "add v28.4s, v21.4s, v28.4s\n"
+ "add v29.4s, v19.4s, v29.4s\n"
+ "add v30.4s, v17.4s, v20.4s\n"
+ "add v31.4s, v16.4s, v18.4s\n"
+ "neg v12.4s, v12.4s\n"
+ "mul v24.4s, v24.4s, v12.4s\n"
+ "mul v25.4s, v25.4s, v12.4s\n"
+ "mul v26.4s, v26.4s, v12.4s\n"
+ "mul v27.4s, v27.4s, v12.4s\n"
+ "mul v28.4s, v28.4s, v12.4s\n"
+ "mul v29.4s, v29.4s, v12.4s\n"
+ "mul v30.4s, v30.4s, v12.4s\n"
+ "mul v31.4s, v31.4s, v12.4s\n"
+ "zip1 v19.4s, v24.4s, v26.4s\n"
+ "zip1 v18.4s, v25.4s, v27.4s\n"
+ "zip1 v17.4s, v28.4s, v30.4s\n"
+ "zip1 v16.4s, v29.4s, v31.4s\n"
+ "zip1 v22.4s, v19.4s, v18.4s\n"
+ "zip1 v23.4s, v17.4s, v16.4s\n"
+ "add v24.4s, v24.4s, v11.4s\n"
+ "add v25.4s, v25.4s, v11.4s\n"
+ "add v26.4s, v26.4s, v11.4s\n"
+ "add v27.4s, v27.4s, v11.4s\n"
+ "add v28.4s, v28.4s, v11.4s\n"
+ "add v29.4s, v29.4s, v11.4s\n"
+ "add v30.4s, v30.4s, v11.4s\n"
+ "add v31.4s, v31.4s, v11.4s\n"
+ "ble 2f\n"
+ "1:" // Loop
+ "ldr q8, [%x[params], #0x0]\n"
+ "ldr q21, [%x[params], #0x10]\n"
+ ".inst 0x6f80e0b8 // udot v24.4s, v5.16b, v0.4b[0]\n"
+ ".inst 0x6fa0e0b9 // udot v25.4s, v5.16b, v0.4b[1]\n"
+ "ldr q20, [%x[params], #0x20]\n"
+ ".inst 0x6f80e8ba // udot v26.4s, v5.16b, v0.4b[2]\n"
+ ".inst 0x6fa0e8bb // udot v27.4s, v5.16b, v0.4b[3]\n"
+ "sub %x[n_channels], %x[n_channels], #0x4\n"
+ ".inst 0x6f81e0d8 // udot v24.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x6fa1e0d9 // udot v25.4s, v6.16b, v1.4b[1]\n"
+ "cmp %x[n_channels], #0x4\n"
+ "add x9, x9, #0x10\n"
+ ".inst 0x6f81e8da // udot v26.4s, v6.16b, v1.4b[2]\n"
+ ".inst 0x6fa1e8db // udot v27.4s, v6.16b, v1.4b[3]\n"
+ ".inst 0x6f82e0bc // udot v28.4s, v5.16b, v2.4b[0]\n"
+ ".inst 0x6fa2e0bd // udot v29.4s, v5.16b, v2.4b[1]\n"
+ ".inst 0x6f82e8be // udot v30.4s, v5.16b, v2.4b[2]\n"
+ ".inst 0x6fa2e8bf // udot v31.4s, v5.16b, v2.4b[3]\n"
+ "ldr q5, [%x[params], #0x30]\n"
+ ".inst 0x6f82e0f8 // udot v24.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x6fa2e0f9 // udot v25.4s, v7.16b, v2.4b[1]\n"
+ "sqrdmulh v24.4s, v24.4s, v8.4s\n"
+ ".inst 0x6f82e8fa // udot v26.4s, v7.16b, v2.4b[2]\n"
+ ".inst 0x6fa2e8fb // udot v27.4s, v7.16b, v2.4b[3]\n"
+ "sqrdmulh v25.4s, v25.4s, v8.4s\n"
+ ".inst 0x6f83e0dc // udot v28.4s, v6.16b, v3.4b[0]\n"
+ ".inst 0x6fa3e0dd // udot v29.4s, v6.16b, v3.4b[1]\n"
+ "sqrdmulh v26.4s, v26.4s, v8.4s\n"
+ ".inst 0x6f83e8de // udot v30.4s, v6.16b, v3.4b[2]\n"
+ ".inst 0x6fa3e8df // udot v31.4s, v6.16b, v3.4b[3]\n"
+ "ldr q6, [%x[params], #0x40]\n"
+ "sqrdmulh v27.4s, v27.4s, v8.4s\n"
+ ".inst 0x6f84e0fc // udot v28.4s, v7.16b, v4.4b[0]\n"
+ ".inst 0x6fa4e0fd // udot v29.4s, v7.16b, v4.4b[1]\n"
+ "and v19.16b, v24.16b, v21.16b\n"
+ ".inst 0x6f84e8fe // udot v30.4s, v7.16b, v4.4b[2]\n"
+ ".inst 0x6fa4e8ff // udot v31.4s, v7.16b, v4.4b[3]\n"
+ "ldr q7, [%x[params], #0x50]\n"
+ "and v18.16b, v25.16b, v21.16b\n"
+ "and v17.16b, v26.16b, v21.16b\n"
+ "and v16.16b, v27.16b, v21.16b\n"
+ "add %x[params], %x[params], #0x60\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqrdmulh v28.4s, v28.4s, v8.4s\n"
+ "sqrdmulh v29.4s, v29.4s, v8.4s\n"
+ "sqrdmulh v30.4s, v30.4s, v8.4s\n"
+ "sqrdmulh v31.4s, v31.4s, v8.4s\n"
+ "sqadd v24.4s, v24.4s, v19.4s\n"
+ "sqadd v25.4s, v25.4s, v18.4s\n"
+ "sqadd v26.4s, v26.4s, v17.4s\n"
+ "sqadd v27.4s, v27.4s, v16.4s\n"
+ "and v19.16b, v28.16b, v21.16b\n"
+ "and v18.16b, v29.16b, v21.16b\n"
+ "and v17.16b, v30.16b, v21.16b\n"
+ "and v16.16b, v31.16b, v21.16b\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqadd v28.4s, v28.4s, v19.4s\n"
+ "sqadd v29.4s, v29.4s, v18.4s\n"
+ "sqadd v30.4s, v30.4s, v17.4s\n"
+ "sqadd v31.4s, v31.4s, v16.4s\n"
+ "srshl v24.4s, v24.4s, v21.4s\n"
+ "srshl v25.4s, v25.4s, v21.4s\n"
+ "srshl v26.4s, v26.4s, v21.4s\n"
+ "srshl v27.4s, v27.4s, v21.4s\n"
+ "srshl v28.4s, v28.4s, v21.4s\n"
+ "srshl v29.4s, v29.4s, v21.4s\n"
+ "srshl v30.4s, v30.4s, v21.4s\n"
+ "srshl v31.4s, v31.4s, v21.4s\n"
+ "add v24.4s, v24.4s, v14.4s\n"
+ "add v25.4s, v25.4s, v14.4s\n"
+ "add v26.4s, v26.4s, v14.4s\n"
+ "add v27.4s, v27.4s, v14.4s\n"
+ "add v28.4s, v28.4s, v14.4s\n"
+ "add v29.4s, v29.4s, v14.4s\n"
+ "add v30.4s, v30.4s, v14.4s\n"
+ "add v31.4s, v31.4s, v14.4s\n"
+ "smin v24.4s, v24.4s, v15.4s\n"
+ "smin v25.4s, v25.4s, v15.4s\n"
+ "smin v26.4s, v26.4s, v15.4s\n"
+ "smin v27.4s, v27.4s, v15.4s\n"
+ "smin v28.4s, v28.4s, v15.4s\n"
+ "smin v29.4s, v29.4s, v15.4s\n"
+ "smin v30.4s, v30.4s, v15.4s\n"
+ "smin v31.4s, v31.4s, v15.4s\n"
+ "smax v24.4s, v24.4s, v13.4s\n"
+ "smax v25.4s, v25.4s, v13.4s\n"
+ "smax v26.4s, v26.4s, v13.4s\n"
+ "smax v27.4s, v27.4s, v13.4s\n"
+ "smax v28.4s, v28.4s, v13.4s\n"
+ "smax v29.4s, v29.4s, v13.4s\n"
+ "smax v30.4s, v30.4s, v13.4s\n"
+ "smax v31.4s, v31.4s, v13.4s\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "str s24, [x27, x28]\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "str s25, [x26, x28]\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "str s26, [x25, x28]\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "str s27, [x24, x28]\n"
+ "str s28, [x23, x28]\n"
+ "dup v24.4s, v22.s[0]\n"
+ "dup v25.4s, v22.s[1]\n"
+ "str s29, [x22, x28]\n"
+ "dup v26.4s, v22.s[2]\n"
+ "dup v27.4s, v22.s[3]\n"
+ "str s30, [x21, x28]\n"
+ "dup v28.4s, v23.s[0]\n"
+ "dup v29.4s, v23.s[1]\n"
+ "str s31, [x20, x28]\n"
+ "dup v30.4s, v23.s[2]\n"
+ "dup v31.4s, v23.s[3]\n"
+ "add x28, x28, #0x4\n"
+ "add v24.4s, v24.4s, v20.4s\n"
+ "add v25.4s, v25.4s, v20.4s\n"
+ "add v26.4s, v26.4s, v20.4s\n"
+ "add v27.4s, v27.4s, v20.4s\n"
+ "add v28.4s, v28.4s, v20.4s\n"
+ "add v29.4s, v29.4s, v20.4s\n"
+ "add v30.4s, v30.4s, v20.4s\n"
+ "add v31.4s, v31.4s, v20.4s\n"
+ "bgt 1b\n"
+ "2:" // Tail
+ "ldr q21, [%x[params], #0x0]\n"
+ "ldr q20, [%x[params], #0x10]\n"
+ ".inst 0x6f80e0b8 // udot v24.4s, v5.16b, v0.4b[0]\n"
+ ".inst 0x6fa0e0b9 // udot v25.4s, v5.16b, v0.4b[1]\n"
+ ".inst 0x6f80e8ba // udot v26.4s, v5.16b, v0.4b[2]\n"
+ ".inst 0x6fa0e8bb // udot v27.4s, v5.16b, v0.4b[3]\n"
+ "cmp %x[n_channels], #0x4\n"
+ "add x27, x27, x28\n"
+ ".inst 0x6f81e0d8 // udot v24.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x6fa1e0d9 // udot v25.4s, v6.16b, v1.4b[1]\n"
+ "add x26, x26, x28\n"
+ "add x25, x25, x28\n"
+ ".inst 0x6f81e8da // udot v26.4s, v6.16b, v1.4b[2]\n"
+ ".inst 0x6fa1e8db // udot v27.4s, v6.16b, v1.4b[3]\n"
+ "add x24, x24, x28\n"
+ "add x23, x23, x28\n"
+ ".inst 0x6f82e0bc // udot v28.4s, v5.16b, v2.4b[0]\n"
+ ".inst 0x6fa2e0bd // udot v29.4s, v5.16b, v2.4b[1]\n"
+ "add x22, x22, x28\n"
+ "add x21, x21, x28\n"
+ ".inst 0x6f82e8be // udot v30.4s, v5.16b, v2.4b[2]\n"
+ ".inst 0x6fa2e8bf // udot v31.4s, v5.16b, v2.4b[3]\n"
+ "add x20, x20, x28\n"
+ "add %x[params], %x[params], #0x20\n"
+ ".inst 0x6f82e0f8 // udot v24.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x6fa2e0f9 // udot v25.4s, v7.16b, v2.4b[1]\n"
+ "sqrdmulh v24.4s, v24.4s, v21.4s\n"
+ ".inst 0x6f82e8fa // udot v26.4s, v7.16b, v2.4b[2]\n"
+ ".inst 0x6fa2e8fb // udot v27.4s, v7.16b, v2.4b[3]\n"
+ "sqrdmulh v25.4s, v25.4s, v21.4s\n"
+ ".inst 0x6f83e0dc // udot v28.4s, v6.16b, v3.4b[0]\n"
+ ".inst 0x6fa3e0dd // udot v29.4s, v6.16b, v3.4b[1]\n"
+ "sqrdmulh v26.4s, v26.4s, v21.4s\n"
+ ".inst 0x6f83e8de // udot v30.4s, v6.16b, v3.4b[2]\n"
+ ".inst 0x6fa3e8df // udot v31.4s, v6.16b, v3.4b[3]\n"
+ "sqrdmulh v27.4s, v27.4s, v21.4s\n"
+ ".inst 0x6f84e0fc // udot v28.4s, v7.16b, v4.4b[0]\n"
+ ".inst 0x6fa4e0fd // udot v29.4s, v7.16b, v4.4b[1]\n"
+ "and v19.16b, v24.16b, v20.16b\n"
+ ".inst 0x6f84e8fe // udot v30.4s, v7.16b, v4.4b[2]\n"
+ ".inst 0x6fa4e8ff // udot v31.4s, v7.16b, v4.4b[3]\n"
+ "and v18.16b, v25.16b, v20.16b\n"
+ "and v17.16b, v26.16b, v20.16b\n"
+ "and v16.16b, v27.16b, v20.16b\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqrdmulh v28.4s, v28.4s, v21.4s\n"
+ "sqrdmulh v29.4s, v29.4s, v21.4s\n"
+ "sqrdmulh v30.4s, v30.4s, v21.4s\n"
+ "sqrdmulh v31.4s, v31.4s, v21.4s\n"
+ "sqadd v24.4s, v24.4s, v19.4s\n"
+ "sqadd v25.4s, v25.4s, v18.4s\n"
+ "sqadd v26.4s, v26.4s, v17.4s\n"
+ "sqadd v27.4s, v27.4s, v16.4s\n"
+ "and v19.16b, v28.16b, v20.16b\n"
+ "and v18.16b, v29.16b, v20.16b\n"
+ "and v17.16b, v30.16b, v20.16b\n"
+ "and v16.16b, v31.16b, v20.16b\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqadd v28.4s, v28.4s, v19.4s\n"
+ "sqadd v29.4s, v29.4s, v18.4s\n"
+ "sqadd v30.4s, v30.4s, v17.4s\n"
+ "sqadd v31.4s, v31.4s, v16.4s\n"
+ "srshl v24.4s, v24.4s, v20.4s\n"
+ "srshl v25.4s, v25.4s, v20.4s\n"
+ "srshl v26.4s, v26.4s, v20.4s\n"
+ "srshl v27.4s, v27.4s, v20.4s\n"
+ "srshl v28.4s, v28.4s, v20.4s\n"
+ "srshl v29.4s, v29.4s, v20.4s\n"
+ "srshl v30.4s, v30.4s, v20.4s\n"
+ "srshl v31.4s, v31.4s, v20.4s\n"
+ "add v24.4s, v24.4s, v14.4s\n"
+ "add v25.4s, v25.4s, v14.4s\n"
+ "add v26.4s, v26.4s, v14.4s\n"
+ "add v27.4s, v27.4s, v14.4s\n"
+ "add v28.4s, v28.4s, v14.4s\n"
+ "add v29.4s, v29.4s, v14.4s\n"
+ "add v30.4s, v30.4s, v14.4s\n"
+ "add v31.4s, v31.4s, v14.4s\n"
+ "smin v24.4s, v24.4s, v15.4s\n"
+ "smin v25.4s, v25.4s, v15.4s\n"
+ "smin v26.4s, v26.4s, v15.4s\n"
+ "smin v27.4s, v27.4s, v15.4s\n"
+ "smin v28.4s, v28.4s, v15.4s\n"
+ "smin v29.4s, v29.4s, v15.4s\n"
+ "smin v30.4s, v30.4s, v15.4s\n"
+ "smin v31.4s, v31.4s, v15.4s\n"
+ "smax v24.4s, v24.4s, v13.4s\n"
+ "smax v25.4s, v25.4s, v13.4s\n"
+ "smax v26.4s, v26.4s, v13.4s\n"
+ "smax v27.4s, v27.4s, v13.4s\n"
+ "smax v28.4s, v28.4s, v13.4s\n"
+ "smax v29.4s, v29.4s, v13.4s\n"
+ "smax v30.4s, v30.4s, v13.4s\n"
+ "smax v31.4s, v31.4s, v13.4s\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "blt 3f\n"
+ "str s24, [x27, #0x0]\n"
+ "str s25, [x26, #0x0]\n"
+ "str s26, [x25, #0x0]\n"
+ "str s27, [x24, #0x0]\n"
+ "str s28, [x23, #0x0]\n"
+ "str s29, [x22, #0x0]\n"
+ "str s30, [x21, #0x0]\n"
+ "str s31, [x20, #0x0]\n"
+ "b 4f\n"
+ "3:" // Tail: Oddments
+ "subs %x[n_channels], %x[n_channels], #0x1\n"
+ "st1 { v24.b }[0], [x27], #0x1\n"
+ "st1 { v25.b }[0], [x26], #0x1\n"
+ "st1 { v26.b }[0], [x25], #0x1\n"
+ "st1 { v27.b }[0], [x24], #0x1\n"
+ "st1 { v28.b }[0], [x23], #0x1\n"
+ "st1 { v29.b }[0], [x22], #0x1\n"
+ "st1 { v30.b }[0], [x21], #0x1\n"
+ "st1 { v31.b }[0], [x20], #0x1\n"
+ "beq 4f\n"
+ "subs %x[n_channels], %x[n_channels], #0x1\n"
+ "st1 { v24.b }[1], [x27], #0x1\n"
+ "st1 { v25.b }[1], [x26], #0x1\n"
+ "st1 { v26.b }[1], [x25], #0x1\n"
+ "st1 { v27.b }[1], [x24], #0x1\n"
+ "st1 { v28.b }[1], [x23], #0x1\n"
+ "st1 { v29.b }[1], [x22], #0x1\n"
+ "st1 { v30.b }[1], [x21], #0x1\n"
+ "st1 { v31.b }[1], [x20], #0x1\n"
+ "beq 4f\n"
+ "subs %x[n_channels], %x[n_channels], #0x1\n"
+ "st1 { v24.b }[2], [x27], #0x1\n"
+ "st1 { v25.b }[2], [x26], #0x1\n"
+ "st1 { v26.b }[2], [x25], #0x1\n"
+ "st1 { v27.b }[2], [x24], #0x1\n"
+ "st1 { v28.b }[2], [x23], #0x1\n"
+ "st1 { v29.b }[2], [x22], #0x1\n"
+ "st1 { v30.b }[2], [x21], #0x1\n"
+ "st1 { v31.b }[2], [x20], #0x1\n"
+ "beq 4f\n"
+ "st1 { v24.b }[3], [x27], #0x1\n"
+ "subs %x[n_channels], %x[n_channels], #0x1\n"
+ "st1 { v25.b }[3], [x26], #0x1\n"
+ "st1 { v26.b }[3], [x25], #0x1\n"
+ "st1 { v27.b }[3], [x24], #0x1\n"
+ "st1 { v28.b }[3], [x23], #0x1\n"
+ "st1 { v29.b }[3], [x22], #0x1\n"
+ "st1 { v30.b }[3], [x21], #0x1\n"
+ "st1 { v31.b }[3], [x20], #0x1\n"
+ "4:" // Tail: End
+ : [n_channels] "+&r" (n_output_channels), [params] "+&r" (params)
+ : [inptrs] "r" (inptrs), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [outptrs] "r" (outptrs), [qp] "r" (&qp)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst.hpp
new file mode 100644
index 0000000000..4485aaa735
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst.hpp
@@ -0,0 +1,61 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst_impl(const uint8_t *const *const, uint8_t *const *const, const void *, unsigned int, const arm_gemm::Requantize32&);
+
+struct a64_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst : DepthfirstMultiplierStrategy<uint8_t, uint8_t, uint8_t, int32_t>
+{
+ using Parent = DepthfirstMultiplierStrategy<uint8_t, uint8_t, uint8_t, int32_t>;
+ constexpr static unsigned int kernel_rows = 5;
+ constexpr static unsigned int kernel_cols = 5;
+
+ constexpr static unsigned int stride_rows = 1;
+ constexpr static unsigned int stride_cols = 1;
+
+ a64_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst(const CPUInfo *)
+ : Parent(4, 2, kernel_rows, kernel_cols, stride_rows, stride_cols)
+ {
+ }
+
+ arm_gemm::VLType get_vl_type() const override { return arm_gemm::VLType::None; }
+
+ Parent::KernelType kernel = a64_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst_impl;
+ Parent::KernelType get_kernel(void) const override { return kernel; }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst/generic.cpp
new file mode 100644
index 0000000000..61cec2b66d
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst/generic.cpp
@@ -0,0 +1,640 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+
+#include "arm_gemm.hpp"
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst_impl(
+ const uint8_t *const *const inptrs,
+ uint8_t *const *const outptrs,
+ const void *params,
+ unsigned int n_output_channels,
+ const arm_gemm::Requantize32& qp
+)
+{
+ __asm__ __volatile__(
+ "ldr q12, [%x[params], #0x0]\n"
+ "ldr q8, [%x[params], #0x10]\n"
+ "movi v30.16b, #0x1\n"
+ "movi v17.4s, #0x0\n"
+ "ldr q9, [%x[params], #0x20]\n"
+ "ldr q10, [%x[params], #0x30]\n"
+ "movi v16.4s, #0x0\n"
+ "movi v25.4s, #0x0\n"
+ "ldr q11, [%x[params], #0x40]\n"
+ "ldr x20, [%x[inptrs], #0x18]\n"
+ "movi v24.4s, #0x0\n"
+ "movi v31.4s, #0x0\n"
+ "ld1 { v3.16b }, [x20]\n"
+ "ldr x20, [%x[inptrs], #0x20]\n"
+ "mov v26.16b, v3.16b\n"
+ "ext v26.16b, v26.16b, v26.16b, #0x1\n"
+ "ld1 { v4.16b }, [x20]\n"
+ "ldr x20, [%x[inptrs], #0x10]\n"
+ "mov v21.16b, v4.16b\n"
+ "ext v21.16b, v21.16b, v21.16b, #0x1\n"
+ "ld1 { v2.16b }, [x20]\n"
+ "ldr x20, [%x[inptrs], #0x8]\n"
+ "mov v27.16b, v2.16b\n"
+ "ext v27.16b, v27.16b, v27.16b, #0x1\n"
+ "ld1 { v1.16b }, [x20]\n"
+ "ldr x20, [%x[inptrs], #0x28]\n"
+ "zip1 v3.2d, v3.2d, v26.2d\n"
+ "zip1 v4.2d, v4.2d, v21.2d\n"
+ "ld1 { v5.16b }, [x20]\n"
+ "ldr x20, [%x[inptrs], #0x30]\n"
+ "mov v26.16b, v1.16b\n"
+ "mov v22.16b, v5.16b\n"
+ "ld1 { v6.16b }, [x20]\n"
+ "ldr x20, [%x[inptrs], #0x38]\n"
+ "mov v19.16b, v6.16b\n"
+ "ext v26.16b, v26.16b, v26.16b, #0x1\n"
+ "ld1 { v7.16b }, [x20]\n"
+ "ldr x20, [%x[inptrs], #0x0]\n"
+ "mov v21.16b, v7.16b\n"
+ "zip1 v2.2d, v2.2d, v27.2d\n"
+ "ld1 { v0.16b }, [x20]\n"
+ "ext v22.16b, v22.16b, v22.16b, #0x1\n"
+ "ext v19.16b, v19.16b, v19.16b, #0x1\n"
+ ".inst 0x6f83e3d1 // udot v17.4s, v30.16b, v3.4b[0]\n"
+ "ext v21.16b, v21.16b, v21.16b, #0x1\n"
+ ".inst 0x6f83ebd0 // udot v16.4s, v30.16b, v3.4b[2]\n"
+ ".inst 0x6f84e3d9 // udot v25.4s, v30.16b, v4.4b[0]\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_b_offset]\n"
+ "ld1r { v23.4s }, [x20]\n"
+ ".inst 0x6f84ebd8 // udot v24.4s, v30.16b, v4.4b[2]\n"
+ "mov v18.16b, v0.16b\n"
+ ".inst 0x6f82e3df // udot v31.4s, v30.16b, v2.4b[0]\n"
+ "movi v29.4s, #0x0\n"
+ "movi v28.4s, #0x1\n"
+ ".inst 0x6f82ebdd // udot v29.4s, v30.16b, v2.4b[2]\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_c_offset]\n"
+ "ld1r { v13.4s }, [x20]\n"
+ "ext v18.16b, v18.16b, v18.16b, #0x1\n"
+ "zip1 v1.2d, v1.2d, v26.2d\n"
+ ".inst 0x6fa3e391 // udot v17.4s, v28.16b, v3.4b[1]\n"
+ "zip1 v5.2d, v5.2d, v22.2d\n"
+ "zip1 v6.2d, v6.2d, v19.2d\n"
+ ".inst 0x6fa3eb90 // udot v16.4s, v28.16b, v3.4b[3]\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_minval]\n"
+ "ld1r { v14.4s }, [x20]\n"
+ "zip1 v7.2d, v7.2d, v21.2d\n"
+ "movi v22.4s, #0x0\n"
+ ".inst 0x6fa4e399 // udot v25.4s, v28.16b, v4.4b[1]\n"
+ "movi v21.4s, #0x0\n"
+ ".inst 0x6fa4eb98 // udot v24.4s, v28.16b, v4.4b[3]\n"
+ ".inst 0x6f81e3d6 // udot v22.4s, v30.16b, v1.4b[0]\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_maxval]\n"
+ "ld1r { v15.4s }, [x20]\n"
+ "movi v26.4s, #0x0\n"
+ "movi v27.4s, #0x0\n"
+ ".inst 0x6f81ebd5 // udot v21.4s, v30.16b, v1.4b[2]\n"
+ "movi v20.4s, #0x0\n"
+ "movi v19.4s, #0x0\n"
+ ".inst 0x6f85e3da // udot v26.4s, v30.16b, v5.4b[0]\n"
+ "cmp %x[n_channels], #0x4\n"
+ "zip1 v0.2d, v0.2d, v18.2d\n"
+ "movi v18.4s, #0x0\n"
+ ".inst 0x6f85ebdb // udot v27.4s, v30.16b, v5.4b[2]\n"
+ "mov x9, #0x0\n"
+ ".inst 0x6f86e3d4 // udot v20.4s, v30.16b, v6.4b[0]\n"
+ ".inst 0x6f86ebd3 // udot v19.4s, v30.16b, v6.4b[2]\n"
+ "add v17.4s, v17.4s, v25.4s\n"
+ "mov x28, #0x0\n"
+ "movi v25.4s, #0x0\n"
+ ".inst 0x6f87e3d2 // udot v18.4s, v30.16b, v7.4b[0]\n"
+ ".inst 0x6f87ebd9 // udot v25.4s, v30.16b, v7.4b[2]\n"
+ "ldp x27, x26, [%x[outptrs], #0x0]\n"
+ ".inst 0x6fa2e39f // udot v31.4s, v28.16b, v2.4b[1]\n"
+ ".inst 0x6fa2eb9d // udot v29.4s, v28.16b, v2.4b[3]\n"
+ "add v16.4s, v16.4s, v24.4s\n"
+ "ldp x25, x24, [%x[outptrs], #0x10]\n"
+ "movi v24.4s, #0x0\n"
+ ".inst 0x6f80e3d8 // udot v24.4s, v30.16b, v0.4b[0]\n"
+ ".inst 0x6fa1e396 // udot v22.4s, v28.16b, v1.4b[1]\n"
+ "ldp x23, x22, [%x[outptrs], #0x20]\n"
+ ".inst 0x6fa1eb95 // udot v21.4s, v28.16b, v1.4b[3]\n"
+ ".inst 0x6fa5e39a // udot v26.4s, v28.16b, v5.4b[1]\n"
+ "add v31.4s, v31.4s, v17.4s\n"
+ "ldp x21, x20, [%x[outptrs], #0x30]\n"
+ ".inst 0x6fa5eb9b // udot v27.4s, v28.16b, v5.4b[3]\n"
+ ".inst 0x6fa6e394 // udot v20.4s, v28.16b, v6.4b[1]\n"
+ "add v29.4s, v29.4s, v16.4s\n"
+ "add %x[params], %x[params], #0x50\n"
+ ".inst 0x6fa6eb93 // udot v19.4s, v28.16b, v6.4b[3]\n"
+ ".inst 0x6fa7e392 // udot v18.4s, v28.16b, v7.4b[1]\n"
+ "add v22.4s, v22.4s, v31.4s\n"
+ ".inst 0x6fa7eb99 // udot v25.4s, v28.16b, v7.4b[3]\n"
+ ".inst 0x6fa0e398 // udot v24.4s, v28.16b, v0.4b[1]\n"
+ "add v21.4s, v21.4s, v29.4s\n"
+ "add v20.4s, v26.4s, v20.4s\n"
+ "add v19.4s, v27.4s, v19.4s\n"
+ "add v18.4s, v18.4s, v17.4s\n"
+ "movi v17.4s, #0x0\n"
+ ".inst 0x6f80ebd1 // udot v17.4s, v30.16b, v0.4b[2]\n"
+ ".inst 0x6fa0eb91 // udot v17.4s, v28.16b, v0.4b[3]\n"
+ "add v16.4s, v25.4s, v16.4s\n"
+ "add v24.4s, v22.4s, v24.4s\n"
+ "add v25.4s, v21.4s, v17.4s\n"
+ "add v26.4s, v26.4s, v22.4s\n"
+ "add v27.4s, v27.4s, v21.4s\n"
+ "add v28.4s, v20.4s, v31.4s\n"
+ "add v29.4s, v19.4s, v29.4s\n"
+ "add v30.4s, v20.4s, v18.4s\n"
+ "add v31.4s, v19.4s, v16.4s\n"
+ "neg v23.4s, v23.4s\n"
+ "mul v24.4s, v24.4s, v23.4s\n"
+ "mul v25.4s, v25.4s, v23.4s\n"
+ "mul v26.4s, v26.4s, v23.4s\n"
+ "mul v27.4s, v27.4s, v23.4s\n"
+ "mul v28.4s, v28.4s, v23.4s\n"
+ "mul v29.4s, v29.4s, v23.4s\n"
+ "mul v30.4s, v30.4s, v23.4s\n"
+ "mul v31.4s, v31.4s, v23.4s\n"
+ "zip1 v19.4s, v24.4s, v26.4s\n"
+ "zip1 v18.4s, v25.4s, v27.4s\n"
+ "zip1 v17.4s, v28.4s, v30.4s\n"
+ "zip1 v16.4s, v29.4s, v31.4s\n"
+ "zip1 v22.4s, v19.4s, v18.4s\n"
+ "zip1 v23.4s, v17.4s, v16.4s\n"
+ "add v24.4s, v24.4s, v12.4s\n"
+ "add v25.4s, v25.4s, v12.4s\n"
+ "add v26.4s, v26.4s, v12.4s\n"
+ "add v27.4s, v27.4s, v12.4s\n"
+ "add v28.4s, v28.4s, v12.4s\n"
+ "add v29.4s, v29.4s, v12.4s\n"
+ "add v30.4s, v30.4s, v12.4s\n"
+ "add v31.4s, v31.4s, v12.4s\n"
+ "ble 2f\n"
+ "1:" // Loop
+ "ldr q12, [%x[params], #0x60]\n"
+ "ldr q21, [%x[params], #0x70]\n"
+ ".inst 0x6f80e118 // udot v24.4s, v8.16b, v0.4b[0]\n"
+ ".inst 0x6f80e919 // udot v25.4s, v8.16b, v0.4b[2]\n"
+ "ldr q20, [%x[params], #0x80]\n"
+ ".inst 0x6f81e11a // udot v26.4s, v8.16b, v1.4b[0]\n"
+ ".inst 0x6f81e91b // udot v27.4s, v8.16b, v1.4b[2]\n"
+ "sub %x[n_channels], %x[n_channels], #0x4\n"
+ ".inst 0x6fa0e138 // udot v24.4s, v9.16b, v0.4b[1]\n"
+ ".inst 0x6fa0e939 // udot v25.4s, v9.16b, v0.4b[3]\n"
+ "cmp %x[n_channels], #0x4\n"
+ "add x9, x9, #0x10\n"
+ ".inst 0x6fa1e13a // udot v26.4s, v9.16b, v1.4b[1]\n"
+ ".inst 0x6fa1e93b // udot v27.4s, v9.16b, v1.4b[3]\n"
+ ".inst 0x6f82e11c // udot v28.4s, v8.16b, v2.4b[0]\n"
+ ".inst 0x6f82e91d // udot v29.4s, v8.16b, v2.4b[2]\n"
+ ".inst 0x6f83e11e // udot v30.4s, v8.16b, v3.4b[0]\n"
+ ".inst 0x6f83e91f // udot v31.4s, v8.16b, v3.4b[2]\n"
+ "ldr q17, [%x[params], #0x0]\n"
+ ".inst 0x6f81e158 // udot v24.4s, v10.16b, v1.4b[0]\n"
+ ".inst 0x6f81e959 // udot v25.4s, v10.16b, v1.4b[2]\n"
+ ".inst 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
+ ".inst 0x6f82e95b // udot v27.4s, v10.16b, v2.4b[2]\n"
+ ".inst 0x6fa2e13c // udot v28.4s, v9.16b, v2.4b[1]\n"
+ ".inst 0x6fa2e93d // udot v29.4s, v9.16b, v2.4b[3]\n"
+ ".inst 0x6fa3e13e // udot v30.4s, v9.16b, v3.4b[1]\n"
+ ".inst 0x6fa3e93f // udot v31.4s, v9.16b, v3.4b[3]\n"
+ "ldr q16, [%x[params], #0x10]\n"
+ ".inst 0x6fa1e178 // udot v24.4s, v11.16b, v1.4b[1]\n"
+ ".inst 0x6fa1e979 // udot v25.4s, v11.16b, v1.4b[3]\n"
+ ".inst 0x6fa2e17a // udot v26.4s, v11.16b, v2.4b[1]\n"
+ ".inst 0x6fa2e97b // udot v27.4s, v11.16b, v2.4b[3]\n"
+ ".inst 0x6f83e15c // udot v28.4s, v10.16b, v3.4b[0]\n"
+ ".inst 0x6f83e95d // udot v29.4s, v10.16b, v3.4b[2]\n"
+ ".inst 0x6f84e15e // udot v30.4s, v10.16b, v4.4b[0]\n"
+ ".inst 0x6f84e95f // udot v31.4s, v10.16b, v4.4b[2]\n"
+ "ldr q19, [%x[params], #0x20]\n"
+ ".inst 0x6f82e238 // udot v24.4s, v17.16b, v2.4b[0]\n"
+ ".inst 0x6f82ea39 // udot v25.4s, v17.16b, v2.4b[2]\n"
+ ".inst 0x6f83e23a // udot v26.4s, v17.16b, v3.4b[0]\n"
+ ".inst 0x6f83ea3b // udot v27.4s, v17.16b, v3.4b[2]\n"
+ ".inst 0x6fa3e17c // udot v28.4s, v11.16b, v3.4b[1]\n"
+ ".inst 0x6fa3e97d // udot v29.4s, v11.16b, v3.4b[3]\n"
+ ".inst 0x6fa4e17e // udot v30.4s, v11.16b, v4.4b[1]\n"
+ ".inst 0x6fa4e97f // udot v31.4s, v11.16b, v4.4b[3]\n"
+ "ldr q18, [%x[params], #0x30]\n"
+ ".inst 0x6fa2e218 // udot v24.4s, v16.16b, v2.4b[1]\n"
+ ".inst 0x6fa2ea19 // udot v25.4s, v16.16b, v2.4b[3]\n"
+ ".inst 0x6fa3e21a // udot v26.4s, v16.16b, v3.4b[1]\n"
+ ".inst 0x6fa3ea1b // udot v27.4s, v16.16b, v3.4b[3]\n"
+ ".inst 0x6f84e23c // udot v28.4s, v17.16b, v4.4b[0]\n"
+ ".inst 0x6f84ea3d // udot v29.4s, v17.16b, v4.4b[2]\n"
+ ".inst 0x6f85e23e // udot v30.4s, v17.16b, v5.4b[0]\n"
+ ".inst 0x6f85ea3f // udot v31.4s, v17.16b, v5.4b[2]\n"
+ "ldr q17, [%x[params], #0x40]\n"
+ ".inst 0x6f83e278 // udot v24.4s, v19.16b, v3.4b[0]\n"
+ ".inst 0x6f83ea79 // udot v25.4s, v19.16b, v3.4b[2]\n"
+ ".inst 0x6f84e27a // udot v26.4s, v19.16b, v4.4b[0]\n"
+ ".inst 0x6f84ea7b // udot v27.4s, v19.16b, v4.4b[2]\n"
+ ".inst 0x6fa4e21c // udot v28.4s, v16.16b, v4.4b[1]\n"
+ ".inst 0x6fa4ea1d // udot v29.4s, v16.16b, v4.4b[3]\n"
+ ".inst 0x6fa5e21e // udot v30.4s, v16.16b, v5.4b[1]\n"
+ ".inst 0x6fa5ea1f // udot v31.4s, v16.16b, v5.4b[3]\n"
+ "ldr q16, [%x[params], #0x50]\n"
+ ".inst 0x6fa3e258 // udot v24.4s, v18.16b, v3.4b[1]\n"
+ ".inst 0x6fa3ea59 // udot v25.4s, v18.16b, v3.4b[3]\n"
+ ".inst 0x6fa4e25a // udot v26.4s, v18.16b, v4.4b[1]\n"
+ ".inst 0x6fa4ea5b // udot v27.4s, v18.16b, v4.4b[3]\n"
+ ".inst 0x6f85e27c // udot v28.4s, v19.16b, v5.4b[0]\n"
+ ".inst 0x6f85ea7d // udot v29.4s, v19.16b, v5.4b[2]\n"
+ ".inst 0x6f86e27e // udot v30.4s, v19.16b, v6.4b[0]\n"
+ ".inst 0x6f86ea7f // udot v31.4s, v19.16b, v6.4b[2]\n"
+ "ldr q10, [%x[params], #0xb0]\n"
+ ".inst 0x6f84e238 // udot v24.4s, v17.16b, v4.4b[0]\n"
+ ".inst 0x6f84ea39 // udot v25.4s, v17.16b, v4.4b[2]\n"
+ ".inst 0x6f85e23a // udot v26.4s, v17.16b, v5.4b[0]\n"
+ ".inst 0x6f85ea3b // udot v27.4s, v17.16b, v5.4b[2]\n"
+ ".inst 0x6fa5e25c // udot v28.4s, v18.16b, v5.4b[1]\n"
+ ".inst 0x6fa5ea5d // udot v29.4s, v18.16b, v5.4b[3]\n"
+ ".inst 0x6fa6e25e // udot v30.4s, v18.16b, v6.4b[1]\n"
+ ".inst 0x6fa6ea5f // udot v31.4s, v18.16b, v6.4b[3]\n"
+ "ldr q11, [%x[params], #0xc0]\n"
+ ".inst 0x6fa4e218 // udot v24.4s, v16.16b, v4.4b[1]\n"
+ ".inst 0x6fa4ea19 // udot v25.4s, v16.16b, v4.4b[3]\n"
+ "sqrdmulh v24.4s, v24.4s, v12.4s\n"
+ ".inst 0x6fa5e21a // udot v26.4s, v16.16b, v5.4b[1]\n"
+ ".inst 0x6fa5ea1b // udot v27.4s, v16.16b, v5.4b[3]\n"
+ "sqrdmulh v25.4s, v25.4s, v12.4s\n"
+ ".inst 0x6f86e23c // udot v28.4s, v17.16b, v6.4b[0]\n"
+ ".inst 0x6f86ea3d // udot v29.4s, v17.16b, v6.4b[2]\n"
+ "sqrdmulh v26.4s, v26.4s, v12.4s\n"
+ ".inst 0x6f87e23e // udot v30.4s, v17.16b, v7.4b[0]\n"
+ ".inst 0x6f87ea3f // udot v31.4s, v17.16b, v7.4b[2]\n"
+ "ldr q8, [%x[params], #0x90]\n"
+ "sqrdmulh v27.4s, v27.4s, v12.4s\n"
+ ".inst 0x6fa6e21c // udot v28.4s, v16.16b, v6.4b[1]\n"
+ ".inst 0x6fa6ea1d // udot v29.4s, v16.16b, v6.4b[3]\n"
+ "and v19.16b, v24.16b, v21.16b\n"
+ ".inst 0x6fa7e21e // udot v30.4s, v16.16b, v7.4b[1]\n"
+ ".inst 0x6fa7ea1f // udot v31.4s, v16.16b, v7.4b[3]\n"
+ "ldr q9, [%x[params], #0xa0]\n"
+ "and v18.16b, v25.16b, v21.16b\n"
+ "and v17.16b, v26.16b, v21.16b\n"
+ "and v16.16b, v27.16b, v21.16b\n"
+ "add %x[params], %x[params], #0xd0\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqrdmulh v28.4s, v28.4s, v12.4s\n"
+ "sqrdmulh v29.4s, v29.4s, v12.4s\n"
+ "sqrdmulh v30.4s, v30.4s, v12.4s\n"
+ "sqrdmulh v31.4s, v31.4s, v12.4s\n"
+ "sqadd v24.4s, v24.4s, v19.4s\n"
+ "sqadd v25.4s, v25.4s, v18.4s\n"
+ "sqadd v26.4s, v26.4s, v17.4s\n"
+ "sqadd v27.4s, v27.4s, v16.4s\n"
+ "and v19.16b, v28.16b, v21.16b\n"
+ "and v18.16b, v29.16b, v21.16b\n"
+ "and v17.16b, v30.16b, v21.16b\n"
+ "and v16.16b, v31.16b, v21.16b\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqadd v28.4s, v28.4s, v19.4s\n"
+ "sqadd v29.4s, v29.4s, v18.4s\n"
+ "sqadd v30.4s, v30.4s, v17.4s\n"
+ "sqadd v31.4s, v31.4s, v16.4s\n"
+ "srshl v24.4s, v24.4s, v21.4s\n"
+ "srshl v25.4s, v25.4s, v21.4s\n"
+ "srshl v26.4s, v26.4s, v21.4s\n"
+ "srshl v27.4s, v27.4s, v21.4s\n"
+ "srshl v28.4s, v28.4s, v21.4s\n"
+ "srshl v29.4s, v29.4s, v21.4s\n"
+ "srshl v30.4s, v30.4s, v21.4s\n"
+ "srshl v31.4s, v31.4s, v21.4s\n"
+ "add v24.4s, v24.4s, v13.4s\n"
+ "add v25.4s, v25.4s, v13.4s\n"
+ "add v26.4s, v26.4s, v13.4s\n"
+ "add v27.4s, v27.4s, v13.4s\n"
+ "add v28.4s, v28.4s, v13.4s\n"
+ "add v29.4s, v29.4s, v13.4s\n"
+ "add v30.4s, v30.4s, v13.4s\n"
+ "add v31.4s, v31.4s, v13.4s\n"
+ "smin v24.4s, v24.4s, v15.4s\n"
+ "smin v25.4s, v25.4s, v15.4s\n"
+ "smin v26.4s, v26.4s, v15.4s\n"
+ "smin v27.4s, v27.4s, v15.4s\n"
+ "smin v28.4s, v28.4s, v15.4s\n"
+ "smin v29.4s, v29.4s, v15.4s\n"
+ "smin v30.4s, v30.4s, v15.4s\n"
+ "smin v31.4s, v31.4s, v15.4s\n"
+ "smax v24.4s, v24.4s, v14.4s\n"
+ "smax v25.4s, v25.4s, v14.4s\n"
+ "smax v26.4s, v26.4s, v14.4s\n"
+ "smax v27.4s, v27.4s, v14.4s\n"
+ "smax v28.4s, v28.4s, v14.4s\n"
+ "smax v29.4s, v29.4s, v14.4s\n"
+ "smax v30.4s, v30.4s, v14.4s\n"
+ "smax v31.4s, v31.4s, v14.4s\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "str s24, [x27, x28]\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "str s25, [x26, x28]\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "str s26, [x25, x28]\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "str s27, [x24, x28]\n"
+ "str s28, [x23, x28]\n"
+ "dup v24.4s, v22.s[0]\n"
+ "dup v25.4s, v22.s[1]\n"
+ "str s29, [x22, x28]\n"
+ "dup v26.4s, v22.s[2]\n"
+ "dup v27.4s, v22.s[3]\n"
+ "str s30, [x21, x28]\n"
+ "dup v28.4s, v23.s[0]\n"
+ "dup v29.4s, v23.s[1]\n"
+ "str s31, [x20, x28]\n"
+ "dup v30.4s, v23.s[2]\n"
+ "dup v31.4s, v23.s[3]\n"
+ "add x28, x28, #0x4\n"
+ "add v24.4s, v24.4s, v20.4s\n"
+ "add v25.4s, v25.4s, v20.4s\n"
+ "add v26.4s, v26.4s, v20.4s\n"
+ "add v27.4s, v27.4s, v20.4s\n"
+ "add v28.4s, v28.4s, v20.4s\n"
+ "add v29.4s, v29.4s, v20.4s\n"
+ "add v30.4s, v30.4s, v20.4s\n"
+ "add v31.4s, v31.4s, v20.4s\n"
+ "bgt 1b\n"
+ "2:" // Tail
+ "ldr q21, [%x[params], #0x60]\n"
+ "ldr q20, [%x[params], #0x70]\n"
+ ".inst 0x6f80e118 // udot v24.4s, v8.16b, v0.4b[0]\n"
+ ".inst 0x6f80e919 // udot v25.4s, v8.16b, v0.4b[2]\n"
+ ".inst 0x6f81e11a // udot v26.4s, v8.16b, v1.4b[0]\n"
+ ".inst 0x6f81e91b // udot v27.4s, v8.16b, v1.4b[2]\n"
+ "cmp %x[n_channels], #0x4\n"
+ "add x27, x27, x28\n"
+ ".inst 0x6fa0e138 // udot v24.4s, v9.16b, v0.4b[1]\n"
+ ".inst 0x6fa0e939 // udot v25.4s, v9.16b, v0.4b[3]\n"
+ "add x26, x26, x28\n"
+ "add x25, x25, x28\n"
+ ".inst 0x6fa1e13a // udot v26.4s, v9.16b, v1.4b[1]\n"
+ ".inst 0x6fa1e93b // udot v27.4s, v9.16b, v1.4b[3]\n"
+ "add x24, x24, x28\n"
+ "add x23, x23, x28\n"
+ ".inst 0x6f82e11c // udot v28.4s, v8.16b, v2.4b[0]\n"
+ ".inst 0x6f82e91d // udot v29.4s, v8.16b, v2.4b[2]\n"
+ "add x22, x22, x28\n"
+ "add x21, x21, x28\n"
+ ".inst 0x6f83e11e // udot v30.4s, v8.16b, v3.4b[0]\n"
+ ".inst 0x6f83e91f // udot v31.4s, v8.16b, v3.4b[2]\n"
+ "ldr q17, [%x[params], #0x0]\n"
+ "add x20, x20, x28\n"
+ ".inst 0x6f81e158 // udot v24.4s, v10.16b, v1.4b[0]\n"
+ ".inst 0x6f81e959 // udot v25.4s, v10.16b, v1.4b[2]\n"
+ ".inst 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
+ ".inst 0x6f82e95b // udot v27.4s, v10.16b, v2.4b[2]\n"
+ ".inst 0x6fa2e13c // udot v28.4s, v9.16b, v2.4b[1]\n"
+ ".inst 0x6fa2e93d // udot v29.4s, v9.16b, v2.4b[3]\n"
+ ".inst 0x6fa3e13e // udot v30.4s, v9.16b, v3.4b[1]\n"
+ ".inst 0x6fa3e93f // udot v31.4s, v9.16b, v3.4b[3]\n"
+ "ldr q16, [%x[params], #0x10]\n"
+ ".inst 0x6fa1e178 // udot v24.4s, v11.16b, v1.4b[1]\n"
+ ".inst 0x6fa1e979 // udot v25.4s, v11.16b, v1.4b[3]\n"
+ ".inst 0x6fa2e17a // udot v26.4s, v11.16b, v2.4b[1]\n"
+ ".inst 0x6fa2e97b // udot v27.4s, v11.16b, v2.4b[3]\n"
+ ".inst 0x6f83e15c // udot v28.4s, v10.16b, v3.4b[0]\n"
+ ".inst 0x6f83e95d // udot v29.4s, v10.16b, v3.4b[2]\n"
+ ".inst 0x6f84e15e // udot v30.4s, v10.16b, v4.4b[0]\n"
+ ".inst 0x6f84e95f // udot v31.4s, v10.16b, v4.4b[2]\n"
+ "ldr q19, [%x[params], #0x20]\n"
+ ".inst 0x6f82e238 // udot v24.4s, v17.16b, v2.4b[0]\n"
+ ".inst 0x6f82ea39 // udot v25.4s, v17.16b, v2.4b[2]\n"
+ ".inst 0x6f83e23a // udot v26.4s, v17.16b, v3.4b[0]\n"
+ ".inst 0x6f83ea3b // udot v27.4s, v17.16b, v3.4b[2]\n"
+ ".inst 0x6fa3e17c // udot v28.4s, v11.16b, v3.4b[1]\n"
+ ".inst 0x6fa3e97d // udot v29.4s, v11.16b, v3.4b[3]\n"
+ ".inst 0x6fa4e17e // udot v30.4s, v11.16b, v4.4b[1]\n"
+ ".inst 0x6fa4e97f // udot v31.4s, v11.16b, v4.4b[3]\n"
+ "ldr q18, [%x[params], #0x30]\n"
+ ".inst 0x6fa2e218 // udot v24.4s, v16.16b, v2.4b[1]\n"
+ ".inst 0x6fa2ea19 // udot v25.4s, v16.16b, v2.4b[3]\n"
+ ".inst 0x6fa3e21a // udot v26.4s, v16.16b, v3.4b[1]\n"
+ ".inst 0x6fa3ea1b // udot v27.4s, v16.16b, v3.4b[3]\n"
+ ".inst 0x6f84e23c // udot v28.4s, v17.16b, v4.4b[0]\n"
+ ".inst 0x6f84ea3d // udot v29.4s, v17.16b, v4.4b[2]\n"
+ ".inst 0x6f85e23e // udot v30.4s, v17.16b, v5.4b[0]\n"
+ ".inst 0x6f85ea3f // udot v31.4s, v17.16b, v5.4b[2]\n"
+ "ldr q17, [%x[params], #0x40]\n"
+ ".inst 0x6f83e278 // udot v24.4s, v19.16b, v3.4b[0]\n"
+ ".inst 0x6f83ea79 // udot v25.4s, v19.16b, v3.4b[2]\n"
+ ".inst 0x6f84e27a // udot v26.4s, v19.16b, v4.4b[0]\n"
+ ".inst 0x6f84ea7b // udot v27.4s, v19.16b, v4.4b[2]\n"
+ ".inst 0x6fa4e21c // udot v28.4s, v16.16b, v4.4b[1]\n"
+ ".inst 0x6fa4ea1d // udot v29.4s, v16.16b, v4.4b[3]\n"
+ ".inst 0x6fa5e21e // udot v30.4s, v16.16b, v5.4b[1]\n"
+ ".inst 0x6fa5ea1f // udot v31.4s, v16.16b, v5.4b[3]\n"
+ "ldr q16, [%x[params], #0x50]\n"
+ "add %x[params], %x[params], #0x80\n"
+ ".inst 0x6fa3e258 // udot v24.4s, v18.16b, v3.4b[1]\n"
+ ".inst 0x6fa3ea59 // udot v25.4s, v18.16b, v3.4b[3]\n"
+ ".inst 0x6fa4e25a // udot v26.4s, v18.16b, v4.4b[1]\n"
+ ".inst 0x6fa4ea5b // udot v27.4s, v18.16b, v4.4b[3]\n"
+ ".inst 0x6f85e27c // udot v28.4s, v19.16b, v5.4b[0]\n"
+ ".inst 0x6f85ea7d // udot v29.4s, v19.16b, v5.4b[2]\n"
+ ".inst 0x6f86e27e // udot v30.4s, v19.16b, v6.4b[0]\n"
+ ".inst 0x6f86ea7f // udot v31.4s, v19.16b, v6.4b[2]\n"
+ ".inst 0x6f84e238 // udot v24.4s, v17.16b, v4.4b[0]\n"
+ ".inst 0x6f84ea39 // udot v25.4s, v17.16b, v4.4b[2]\n"
+ ".inst 0x6f85e23a // udot v26.4s, v17.16b, v5.4b[0]\n"
+ ".inst 0x6f85ea3b // udot v27.4s, v17.16b, v5.4b[2]\n"
+ ".inst 0x6fa5e25c // udot v28.4s, v18.16b, v5.4b[1]\n"
+ ".inst 0x6fa5ea5d // udot v29.4s, v18.16b, v5.4b[3]\n"
+ ".inst 0x6fa6e25e // udot v30.4s, v18.16b, v6.4b[1]\n"
+ ".inst 0x6fa6ea5f // udot v31.4s, v18.16b, v6.4b[3]\n"
+ ".inst 0x6fa4e218 // udot v24.4s, v16.16b, v4.4b[1]\n"
+ ".inst 0x6fa4ea19 // udot v25.4s, v16.16b, v4.4b[3]\n"
+ "sqrdmulh v24.4s, v24.4s, v21.4s\n"
+ ".inst 0x6fa5e21a // udot v26.4s, v16.16b, v5.4b[1]\n"
+ ".inst 0x6fa5ea1b // udot v27.4s, v16.16b, v5.4b[3]\n"
+ "sqrdmulh v25.4s, v25.4s, v21.4s\n"
+ ".inst 0x6f86e23c // udot v28.4s, v17.16b, v6.4b[0]\n"
+ ".inst 0x6f86ea3d // udot v29.4s, v17.16b, v6.4b[2]\n"
+ "sqrdmulh v26.4s, v26.4s, v21.4s\n"
+ ".inst 0x6f87e23e // udot v30.4s, v17.16b, v7.4b[0]\n"
+ ".inst 0x6f87ea3f // udot v31.4s, v17.16b, v7.4b[2]\n"
+ "sqrdmulh v27.4s, v27.4s, v21.4s\n"
+ ".inst 0x6fa6e21c // udot v28.4s, v16.16b, v6.4b[1]\n"
+ ".inst 0x6fa6ea1d // udot v29.4s, v16.16b, v6.4b[3]\n"
+ "and v19.16b, v24.16b, v20.16b\n"
+ ".inst 0x6fa7e21e // udot v30.4s, v16.16b, v7.4b[1]\n"
+ ".inst 0x6fa7ea1f // udot v31.4s, v16.16b, v7.4b[3]\n"
+ "and v18.16b, v25.16b, v20.16b\n"
+ "and v17.16b, v26.16b, v20.16b\n"
+ "and v16.16b, v27.16b, v20.16b\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqrdmulh v28.4s, v28.4s, v21.4s\n"
+ "sqrdmulh v29.4s, v29.4s, v21.4s\n"
+ "sqrdmulh v30.4s, v30.4s, v21.4s\n"
+ "sqrdmulh v31.4s, v31.4s, v21.4s\n"
+ "sqadd v24.4s, v24.4s, v19.4s\n"
+ "sqadd v25.4s, v25.4s, v18.4s\n"
+ "sqadd v26.4s, v26.4s, v17.4s\n"
+ "sqadd v27.4s, v27.4s, v16.4s\n"
+ "and v19.16b, v28.16b, v20.16b\n"
+ "and v18.16b, v29.16b, v20.16b\n"
+ "and v17.16b, v30.16b, v20.16b\n"
+ "and v16.16b, v31.16b, v20.16b\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqadd v28.4s, v28.4s, v19.4s\n"
+ "sqadd v29.4s, v29.4s, v18.4s\n"
+ "sqadd v30.4s, v30.4s, v17.4s\n"
+ "sqadd v31.4s, v31.4s, v16.4s\n"
+ "srshl v24.4s, v24.4s, v20.4s\n"
+ "srshl v25.4s, v25.4s, v20.4s\n"
+ "srshl v26.4s, v26.4s, v20.4s\n"
+ "srshl v27.4s, v27.4s, v20.4s\n"
+ "srshl v28.4s, v28.4s, v20.4s\n"
+ "srshl v29.4s, v29.4s, v20.4s\n"
+ "srshl v30.4s, v30.4s, v20.4s\n"
+ "srshl v31.4s, v31.4s, v20.4s\n"
+ "add v24.4s, v24.4s, v13.4s\n"
+ "add v25.4s, v25.4s, v13.4s\n"
+ "add v26.4s, v26.4s, v13.4s\n"
+ "add v27.4s, v27.4s, v13.4s\n"
+ "add v28.4s, v28.4s, v13.4s\n"
+ "add v29.4s, v29.4s, v13.4s\n"
+ "add v30.4s, v30.4s, v13.4s\n"
+ "add v31.4s, v31.4s, v13.4s\n"
+ "smin v24.4s, v24.4s, v15.4s\n"
+ "smin v25.4s, v25.4s, v15.4s\n"
+ "smin v26.4s, v26.4s, v15.4s\n"
+ "smin v27.4s, v27.4s, v15.4s\n"
+ "smin v28.4s, v28.4s, v15.4s\n"
+ "smin v29.4s, v29.4s, v15.4s\n"
+ "smin v30.4s, v30.4s, v15.4s\n"
+ "smin v31.4s, v31.4s, v15.4s\n"
+ "smax v24.4s, v24.4s, v14.4s\n"
+ "smax v25.4s, v25.4s, v14.4s\n"
+ "smax v26.4s, v26.4s, v14.4s\n"
+ "smax v27.4s, v27.4s, v14.4s\n"
+ "smax v28.4s, v28.4s, v14.4s\n"
+ "smax v29.4s, v29.4s, v14.4s\n"
+ "smax v30.4s, v30.4s, v14.4s\n"
+ "smax v31.4s, v31.4s, v14.4s\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "blt 3f\n"
+ "str s24, [x27, #0x0]\n"
+ "str s25, [x26, #0x0]\n"
+ "str s26, [x25, #0x0]\n"
+ "str s27, [x24, #0x0]\n"
+ "str s28, [x23, #0x0]\n"
+ "str s29, [x22, #0x0]\n"
+ "str s30, [x21, #0x0]\n"
+ "str s31, [x20, #0x0]\n"
+ "b 4f\n"
+ "3:" // Tail: Oddments
+ "subs %x[n_channels], %x[n_channels], #0x1\n"
+ "st1 { v24.b }[0], [x27], #0x1\n"
+ "st1 { v25.b }[0], [x26], #0x1\n"
+ "st1 { v26.b }[0], [x25], #0x1\n"
+ "st1 { v27.b }[0], [x24], #0x1\n"
+ "st1 { v28.b }[0], [x23], #0x1\n"
+ "st1 { v29.b }[0], [x22], #0x1\n"
+ "st1 { v30.b }[0], [x21], #0x1\n"
+ "st1 { v31.b }[0], [x20], #0x1\n"
+ "beq 4f\n"
+ "subs %x[n_channels], %x[n_channels], #0x1\n"
+ "st1 { v24.b }[1], [x27], #0x1\n"
+ "st1 { v25.b }[1], [x26], #0x1\n"
+ "st1 { v26.b }[1], [x25], #0x1\n"
+ "st1 { v27.b }[1], [x24], #0x1\n"
+ "st1 { v28.b }[1], [x23], #0x1\n"
+ "st1 { v29.b }[1], [x22], #0x1\n"
+ "st1 { v30.b }[1], [x21], #0x1\n"
+ "st1 { v31.b }[1], [x20], #0x1\n"
+ "beq 4f\n"
+ "subs %x[n_channels], %x[n_channels], #0x1\n"
+ "st1 { v24.b }[2], [x27], #0x1\n"
+ "st1 { v25.b }[2], [x26], #0x1\n"
+ "st1 { v26.b }[2], [x25], #0x1\n"
+ "st1 { v27.b }[2], [x24], #0x1\n"
+ "st1 { v28.b }[2], [x23], #0x1\n"
+ "st1 { v29.b }[2], [x22], #0x1\n"
+ "st1 { v30.b }[2], [x21], #0x1\n"
+ "st1 { v31.b }[2], [x20], #0x1\n"
+ "beq 4f\n"
+ "st1 { v24.b }[3], [x27], #0x1\n"
+ "subs %x[n_channels], %x[n_channels], #0x1\n"
+ "st1 { v25.b }[3], [x26], #0x1\n"
+ "st1 { v26.b }[3], [x25], #0x1\n"
+ "st1 { v27.b }[3], [x24], #0x1\n"
+ "st1 { v28.b }[3], [x23], #0x1\n"
+ "st1 { v29.b }[3], [x22], #0x1\n"
+ "st1 { v30.b }[3], [x21], #0x1\n"
+ "st1 { v31.b }[3], [x20], #0x1\n"
+ "4:" // Tail: End
+ : [n_channels] "+&r" (n_output_channels), [params] "+&r" (params)
+ : [inptrs] "r" (inptrs), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [outptrs] "r" (outptrs), [qp] "r" (&qp)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp
new file mode 100644
index 0000000000..1f2d211be2
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp
@@ -0,0 +1,52 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+// Kernel entry point (defined in the accompanying generic.cpp): consumes packed
+// uint8 quantized input/weights and writes NHWC uint8 output, applying
+// per-channel multipliers/shifts and the arm_gemm::Requantize32 parameters.
+void a64_u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_impl(const uint8_t *const *const, uint8_t *const *const, const uint8_t *, const int32_t *, const unsigned int, const unsigned int, const int32_t *, const int32_t *, const int32_t *, const arm_gemm::Requantize32&);
+
+// Strategy descriptor binding the hand-written A64 MLA kernel above into the
+// depthfirst-multiplier dispatch framework (uint8 in, uint8 out, int32 accumulators).
+struct a64_u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst : GenericDepthfirstMultiplierKernelStrategy<uint8_t, uint8_t, uint8_t, int32_t>
+{
+  using Parent = GenericDepthfirstMultiplierKernelStrategy<uint8_t, uint8_t, uint8_t, int32_t>;
+  // (2, 8) matches the "output2x8" tile in the kernel name — presumably
+  // 2 output rows x 8 output points per iteration; VLType::None marks this as a
+  // fixed-width NEON (non-SVE) implementation. CPUInfo is accepted for interface
+  // uniformity with other strategies and is unused here.
+  a64_u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst(const CPUInfo *)
+  : Parent(2, 8, arm_gemm::VLType::None)
+  {
+  }
+  // Fixed kernel pointer; get_kernel() is the framework's virtual accessor for it.
+  Parent::KernelType kernel = a64_u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_impl;
+  Parent::KernelType get_kernel(void) const override { return kernel; }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp
new file mode 100644
index 0000000000..0770c126ec
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp
@@ -0,0 +1,1480 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+
+#include "arm_gemm.hpp"
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_impl(
+ const uint8_t *const *const inptrs,
+ uint8_t *const *const outptrs,
+ const uint8_t *weights,
+ const int32_t *bias,
+ const unsigned int kernel_points,
+ const unsigned int n_output_channels,
+ const int32_t *per_channel_left_shifts,
+ const int32_t *per_channel_muls,
+ const int32_t *per_channel_right_shifts,
+ const arm_gemm::Requantize32& qp
+)
+{
+ __asm__ __volatile__(
+ "lsr x10, %x[n_output_channels], #0x2\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_minval]\n"
+ "ld1r { v15.4s }, [x20]\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_maxval]\n"
+ "ld1r { v14.4s }, [x20]\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_a_offset]\n"
+ "ld1r { v13.16b }, [x20]\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_b_offset]\n"
+ "ld1r { v12.16b }, [x20]\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_c_offset]\n"
+ "ld1r { v11.4s }, [x20]\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_per_layer_left_shift]\n"
+ "ld1r { v10.4s }, [x20]\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_per_layer_mul]\n"
+ "ld1r { v9.4s }, [x20]\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_per_layer_right_shift]\n"
+ "ld1r { v8.4s }, [x20]\n"
+ "mov x9, #0x0\n"
+ "cbz x10, 9f\n"
+ "1:" // Output channel loop
+ "movi v31.4s, #0x0\n"
+ "cbz %x[bias], 2f\n"
+ "lsl x20, x9, #0x2\n"
+ "ldr q31, [%x[bias], x20]\n"
+ "2:" // Output channel loop: Load bias: Done
+ "mov v16.16b, v31.16b\n"
+ "mov v17.16b, v31.16b\n"
+ "mov v18.16b, v31.16b\n"
+ "mov v19.16b, v31.16b\n"
+ "mov v20.16b, v31.16b\n"
+ "mov v21.16b, v31.16b\n"
+ "mov v22.16b, v31.16b\n"
+ "mov v23.16b, v31.16b\n"
+ "mov v24.16b, v31.16b\n"
+ "mov v25.16b, v31.16b\n"
+ "mov v26.16b, v31.16b\n"
+ "mov v27.16b, v31.16b\n"
+ "mov v28.16b, v31.16b\n"
+ "mov v29.16b, v31.16b\n"
+ "mov v30.16b, v31.16b\n"
+ "mov v31.16b, v31.16b\n"
+ "cbz %x[rq_mul_ptr], 3f\n"
+ "lsl x20, x9, #0x2\n"
+ "ldr q9, [%x[rq_mul_ptr], x20]\n"
+ "ldr q8, [%x[rq_right_shift_ptr], x20]\n"
+ "cbz %x[rq_left_shift_ptr], 3f\n"
+ "ldr q10, [%x[rq_left_shift_ptr], x20]\n"
+ "3:" // Output channel loop: Load quantization parameters: Done
+ "ldr s5, [%x[weights]], #0x4\n"
+ "mov x22, %x[inptrs]\n"
+ "ldp x21, x20, [x22], #0x10\n"
+ "lsr x23, %x[kernel_points], #0x1\n"
+ "ldr d0, [x21, #0x0]\n"
+ "ldr d4, [x20, #0x0]\n"
+ "usubl v0.8h, v0.8b, v13.8b\n"
+ "usubl v4.8h, v4.8b, v13.8b\n"
+ "usubl v5.8h, v5.8b, v12.8b\n"
+ "cbz x23, 7f\n"
+ "ldr s7, [%x[weights]], #0x4\n"
+ "ldp x21, x20, [x22], #0x10\n"
+ "subs x23, x23, #0x1\n"
+ "usubl v7.8h, v7.8b, v12.8b\n"
+ "ldr d3, [x21, #0x0]\n"
+ "ldr d6, [x20, #0x0]\n"
+ "usubl v3.8h, v3.8b, v13.8b\n"
+ "usubl v6.8h, v6.8b, v13.8b\n"
+ "beq 5f\n"
+ "4:" // Output channel loop: Kernel loop
+ "ldp x21, x20, [x22], #0x10\n"
+ "smlal v16.4s, v5.4h, v0.h[0]\n"
+ "smlal v17.4s, v5.4h, v0.h[1]\n"
+ "subs x23, x23, #0x1\n"
+ "smlal v18.4s, v5.4h, v0.h[2]\n"
+ "smlal v19.4s, v5.4h, v0.h[3]\n"
+ "smlal v20.4s, v5.4h, v0.h[4]\n"
+ "smlal v21.4s, v5.4h, v0.h[5]\n"
+ "smlal v22.4s, v5.4h, v0.h[6]\n"
+ "smlal v23.4s, v5.4h, v0.h[7]\n"
+ "ldr d0, [x21, #0x0]\n"
+ "usubl v0.8h, v0.8b, v13.8b\n"
+ "smlal v24.4s, v5.4h, v4.h[0]\n"
+ "smlal v25.4s, v5.4h, v4.h[1]\n"
+ "smlal v26.4s, v5.4h, v4.h[2]\n"
+ "smlal v27.4s, v5.4h, v4.h[3]\n"
+ "smlal v28.4s, v5.4h, v4.h[4]\n"
+ "smlal v29.4s, v5.4h, v4.h[5]\n"
+ "smlal v30.4s, v5.4h, v4.h[6]\n"
+ "smlal v31.4s, v5.4h, v4.h[7]\n"
+ "ldr d4, [x20, #0x0]\n"
+ "ldr s5, [%x[weights]], #0x4\n"
+ "ldp x21, x20, [x22], #0x10\n"
+ "smlal v16.4s, v7.4h, v3.h[0]\n"
+ "smlal v17.4s, v7.4h, v3.h[1]\n"
+ "usubl v4.8h, v4.8b, v13.8b\n"
+ "smlal v18.4s, v7.4h, v3.h[2]\n"
+ "smlal v19.4s, v7.4h, v3.h[3]\n"
+ "usubl v5.8h, v5.8b, v12.8b\n"
+ "smlal v20.4s, v7.4h, v3.h[4]\n"
+ "smlal v21.4s, v7.4h, v3.h[5]\n"
+ "smlal v22.4s, v7.4h, v3.h[6]\n"
+ "smlal v23.4s, v7.4h, v3.h[7]\n"
+ "ldr d3, [x21, #0x0]\n"
+ "usubl v3.8h, v3.8b, v13.8b\n"
+ "smlal v24.4s, v7.4h, v6.h[0]\n"
+ "smlal v25.4s, v7.4h, v6.h[1]\n"
+ "smlal v26.4s, v7.4h, v6.h[2]\n"
+ "smlal v27.4s, v7.4h, v6.h[3]\n"
+ "smlal v28.4s, v7.4h, v6.h[4]\n"
+ "smlal v29.4s, v7.4h, v6.h[5]\n"
+ "smlal v30.4s, v7.4h, v6.h[6]\n"
+ "smlal v31.4s, v7.4h, v6.h[7]\n"
+ "ldr d6, [x20, #0x0]\n"
+ "ldr s7, [%x[weights]], #0x4\n"
+ "usubl v6.8h, v6.8b, v13.8b\n"
+ "usubl v7.8h, v7.8b, v12.8b\n"
+ "bgt 4b\n"
+ "5:" // Output channel loop: Kernel loop tail
+ "tbnz %x[kernel_points], #0, 6f\n"
+ "smlal v16.4s, v5.4h, v0.h[0]\n"
+ "smlal v17.4s, v5.4h, v0.h[1]\n"
+ "ldr x27, [%x[outptrs], #0x0]\n"
+ "ldr x26, [%x[outptrs], #0x8]\n"
+ "smlal v18.4s, v5.4h, v0.h[2]\n"
+ "smlal v19.4s, v5.4h, v0.h[3]\n"
+ "ldr x25, [%x[outptrs], #0x10]\n"
+ "ldr x24, [%x[outptrs], #0x18]\n"
+ "smlal v16.4s, v7.4h, v3.h[0]\n"
+ "smlal v17.4s, v7.4h, v3.h[1]\n"
+ "sshl v16.4s, v16.4s, v10.4s\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "smlal v18.4s, v7.4h, v3.h[2]\n"
+ "smlal v19.4s, v7.4h, v3.h[3]\n"
+ "sshl v17.4s, v17.4s, v10.4s\n"
+ "ldr x22, [%x[outptrs], #0x28]\n"
+ "smlal v20.4s, v5.4h, v0.h[4]\n"
+ "smlal v21.4s, v5.4h, v0.h[5]\n"
+ "sshl v18.4s, v18.4s, v10.4s\n"
+ "ldr x21, [%x[outptrs], #0x30]\n"
+ "smlal v22.4s, v5.4h, v0.h[6]\n"
+ "smlal v23.4s, v5.4h, v0.h[7]\n"
+ "sshl v19.4s, v19.4s, v10.4s\n"
+ "ldr x20, [%x[outptrs], #0x38]\n"
+ "smlal v24.4s, v5.4h, v4.h[0]\n"
+ "smlal v25.4s, v5.4h, v4.h[1]\n"
+ "sqrdmulh v16.4s, v16.4s, v9.4s\n"
+ "smlal v20.4s, v7.4h, v3.h[4]\n"
+ "smlal v21.4s, v7.4h, v3.h[5]\n"
+ "sqrdmulh v17.4s, v17.4s, v9.4s\n"
+ "smlal v22.4s, v7.4h, v3.h[6]\n"
+ "smlal v23.4s, v7.4h, v3.h[7]\n"
+ "sqrdmulh v18.4s, v18.4s, v9.4s\n"
+ "smlal v24.4s, v7.4h, v6.h[0]\n"
+ "smlal v25.4s, v7.4h, v6.h[1]\n"
+ "sqrdmulh v19.4s, v19.4s, v9.4s\n"
+ "smlal v26.4s, v5.4h, v4.h[2]\n"
+ "smlal v27.4s, v5.4h, v4.h[3]\n"
+ "and v3.16b, v16.16b, v8.16b\n"
+ "smlal v28.4s, v5.4h, v4.h[4]\n"
+ "smlal v29.4s, v5.4h, v4.h[5]\n"
+ "and v2.16b, v17.16b, v8.16b\n"
+ "smlal v30.4s, v5.4h, v4.h[6]\n"
+ "smlal v31.4s, v5.4h, v4.h[7]\n"
+ "and v1.16b, v18.16b, v8.16b\n"
+ "and v0.16b, v19.16b, v8.16b\n"
+ "sshl v20.4s, v20.4s, v10.4s\n"
+ "smlal v26.4s, v7.4h, v6.h[2]\n"
+ "sshl v21.4s, v21.4s, v10.4s\n"
+ "sshl v22.4s, v22.4s, v10.4s\n"
+ "smlal v27.4s, v7.4h, v6.h[3]\n"
+ "sshl v23.4s, v23.4s, v10.4s\n"
+ "sshl v24.4s, v24.4s, v10.4s\n"
+ "smlal v28.4s, v7.4h, v6.h[4]\n"
+ "sshl v25.4s, v25.4s, v10.4s\n"
+ "smlal v29.4s, v7.4h, v6.h[5]\n"
+ "smlal v30.4s, v7.4h, v6.h[6]\n"
+ "smlal v31.4s, v7.4h, v6.h[7]\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "sqrdmulh v20.4s, v20.4s, v9.4s\n"
+ "sqrdmulh v21.4s, v21.4s, v9.4s\n"
+ "sqrdmulh v22.4s, v22.4s, v9.4s\n"
+ "sqrdmulh v23.4s, v23.4s, v9.4s\n"
+ "sqrdmulh v24.4s, v24.4s, v9.4s\n"
+ "sqrdmulh v25.4s, v25.4s, v9.4s\n"
+ "sqadd v16.4s, v16.4s, v3.4s\n"
+ "sqadd v17.4s, v17.4s, v2.4s\n"
+ "sqadd v18.4s, v18.4s, v1.4s\n"
+ "sqadd v19.4s, v19.4s, v0.4s\n"
+ "and v5.16b, v20.16b, v8.16b\n"
+ "and v4.16b, v21.16b, v8.16b\n"
+ "and v3.16b, v22.16b, v8.16b\n"
+ "and v2.16b, v23.16b, v8.16b\n"
+ "and v1.16b, v24.16b, v8.16b\n"
+ "and v0.16b, v25.16b, v8.16b\n"
+ "sshl v26.4s, v26.4s, v10.4s\n"
+ "sshl v27.4s, v27.4s, v10.4s\n"
+ "sshl v28.4s, v28.4s, v10.4s\n"
+ "sshl v29.4s, v29.4s, v10.4s\n"
+ "sshl v30.4s, v30.4s, v10.4s\n"
+ "sshl v31.4s, v31.4s, v10.4s\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "sqrdmulh v26.4s, v26.4s, v9.4s\n"
+ "sqrdmulh v27.4s, v27.4s, v9.4s\n"
+ "sqrdmulh v28.4s, v28.4s, v9.4s\n"
+ "sqrdmulh v29.4s, v29.4s, v9.4s\n"
+ "sqrdmulh v30.4s, v30.4s, v9.4s\n"
+ "sqrdmulh v31.4s, v31.4s, v9.4s\n"
+ "sqadd v20.4s, v20.4s, v5.4s\n"
+ "sqadd v21.4s, v21.4s, v4.4s\n"
+ "sqadd v22.4s, v22.4s, v3.4s\n"
+ "sqadd v23.4s, v23.4s, v2.4s\n"
+ "sqadd v24.4s, v24.4s, v1.4s\n"
+ "sqadd v25.4s, v25.4s, v0.4s\n"
+ "and v5.16b, v26.16b, v8.16b\n"
+ "and v4.16b, v27.16b, v8.16b\n"
+ "and v3.16b, v28.16b, v8.16b\n"
+ "and v2.16b, v29.16b, v8.16b\n"
+ "and v1.16b, v30.16b, v8.16b\n"
+ "and v0.16b, v31.16b, v8.16b\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "srshl v16.4s, v16.4s, v8.4s\n"
+ "srshl v17.4s, v17.4s, v8.4s\n"
+ "srshl v18.4s, v18.4s, v8.4s\n"
+ "srshl v19.4s, v19.4s, v8.4s\n"
+ "srshl v20.4s, v20.4s, v8.4s\n"
+ "srshl v21.4s, v21.4s, v8.4s\n"
+ "srshl v22.4s, v22.4s, v8.4s\n"
+ "srshl v23.4s, v23.4s, v8.4s\n"
+ "sqadd v26.4s, v26.4s, v5.4s\n"
+ "sqadd v27.4s, v27.4s, v4.4s\n"
+ "sqadd v28.4s, v28.4s, v3.4s\n"
+ "sqadd v29.4s, v29.4s, v2.4s\n"
+ "sqadd v30.4s, v30.4s, v1.4s\n"
+ "sqadd v31.4s, v31.4s, v0.4s\n"
+ "add v16.4s, v16.4s, v11.4s\n"
+ "add v17.4s, v17.4s, v11.4s\n"
+ "add v18.4s, v18.4s, v11.4s\n"
+ "add v19.4s, v19.4s, v11.4s\n"
+ "add v20.4s, v20.4s, v11.4s\n"
+ "add v21.4s, v21.4s, v11.4s\n"
+ "add v22.4s, v22.4s, v11.4s\n"
+ "add v23.4s, v23.4s, v11.4s\n"
+ "srshl v24.4s, v24.4s, v8.4s\n"
+ "srshl v25.4s, v25.4s, v8.4s\n"
+ "srshl v26.4s, v26.4s, v8.4s\n"
+ "srshl v27.4s, v27.4s, v8.4s\n"
+ "srshl v28.4s, v28.4s, v8.4s\n"
+ "srshl v29.4s, v29.4s, v8.4s\n"
+ "srshl v30.4s, v30.4s, v8.4s\n"
+ "srshl v31.4s, v31.4s, v8.4s\n"
+ "smin v16.4s, v16.4s, v14.4s\n"
+ "smin v17.4s, v17.4s, v14.4s\n"
+ "smin v18.4s, v18.4s, v14.4s\n"
+ "smin v19.4s, v19.4s, v14.4s\n"
+ "smin v20.4s, v20.4s, v14.4s\n"
+ "smin v21.4s, v21.4s, v14.4s\n"
+ "smin v22.4s, v22.4s, v14.4s\n"
+ "smin v23.4s, v23.4s, v14.4s\n"
+ "add v24.4s, v24.4s, v11.4s\n"
+ "add v25.4s, v25.4s, v11.4s\n"
+ "add v26.4s, v26.4s, v11.4s\n"
+ "add v27.4s, v27.4s, v11.4s\n"
+ "add v28.4s, v28.4s, v11.4s\n"
+ "add v29.4s, v29.4s, v11.4s\n"
+ "add v30.4s, v30.4s, v11.4s\n"
+ "add v31.4s, v31.4s, v11.4s\n"
+ "smax v16.4s, v16.4s, v15.4s\n"
+ "smax v17.4s, v17.4s, v15.4s\n"
+ "smax v18.4s, v18.4s, v15.4s\n"
+ "smax v19.4s, v19.4s, v15.4s\n"
+ "smax v20.4s, v20.4s, v15.4s\n"
+ "smax v21.4s, v21.4s, v15.4s\n"
+ "smax v22.4s, v22.4s, v15.4s\n"
+ "smax v23.4s, v23.4s, v15.4s\n"
+ "smin v24.4s, v24.4s, v14.4s\n"
+ "smin v25.4s, v25.4s, v14.4s\n"
+ "smin v26.4s, v26.4s, v14.4s\n"
+ "smin v27.4s, v27.4s, v14.4s\n"
+ "smin v28.4s, v28.4s, v14.4s\n"
+ "smin v29.4s, v29.4s, v14.4s\n"
+ "smin v30.4s, v30.4s, v14.4s\n"
+ "smin v31.4s, v31.4s, v14.4s\n"
+ "uzp1 v16.16b, v16.16b, v16.16b\n"
+ "uzp1 v17.16b, v17.16b, v17.16b\n"
+ "uzp1 v18.16b, v18.16b, v18.16b\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "smax v24.4s, v24.4s, v15.4s\n"
+ "smax v25.4s, v25.4s, v15.4s\n"
+ "smax v26.4s, v26.4s, v15.4s\n"
+ "smax v27.4s, v27.4s, v15.4s\n"
+ "smax v28.4s, v28.4s, v15.4s\n"
+ "smax v29.4s, v29.4s, v15.4s\n"
+ "smax v30.4s, v30.4s, v15.4s\n"
+ "smax v31.4s, v31.4s, v15.4s\n"
+ "uzp1 v16.16b, v16.16b, v16.16b\n"
+ "str s16, [x27, x9]\n"
+ "ldr x27, [%x[outptrs], #0x40]\n"
+ "uzp1 v17.16b, v17.16b, v17.16b\n"
+ "uzp1 v18.16b, v18.16b, v18.16b\n"
+ "str s17, [x26, x9]\n"
+ "ldr x26, [%x[outptrs], #0x48]\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "str s18, [x25, x9]\n"
+ "ldr x25, [%x[outptrs], #0x50]\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "str s19, [x24, x9]\n"
+ "ldr x24, [%x[outptrs], #0x58]\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "str s20, [x23, x9]\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "str s21, [x22, x9]\n"
+ "ldr x22, [%x[outptrs], #0x68]\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "str s22, [x21, x9]\n"
+ "ldr x21, [%x[outptrs], #0x70]\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "str s23, [x20, x9]\n"
+ "ldr x20, [%x[outptrs], #0x78]\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "str s24, [x27, x9]\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "str s25, [x26, x9]\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "str s26, [x25, x9]\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "str s27, [x24, x9]\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "str s28, [x23, x9]\n"
+ "str s29, [x22, x9]\n"
+ "str s30, [x21, x9]\n"
+ "str s31, [x20, x9]\n"
+ "b 8f\n"
+ "6:" // Output channel loop: Odd tail
+ "ldp x20, x28, [x22], #0x10\n"
+ "smlal v16.4s, v5.4h, v0.h[0]\n"
+ "smlal v17.4s, v5.4h, v0.h[1]\n"
+ "ldr x27, [%x[outptrs], #0x0]\n"
+ "smlal v18.4s, v5.4h, v0.h[2]\n"
+ "smlal v19.4s, v5.4h, v0.h[3]\n"
+ "ldr x26, [%x[outptrs], #0x8]\n"
+ "ldr x25, [%x[outptrs], #0x10]\n"
+ "smlal v20.4s, v5.4h, v0.h[4]\n"
+ "smlal v21.4s, v5.4h, v0.h[5]\n"
+ "ldr x24, [%x[outptrs], #0x18]\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "smlal v22.4s, v5.4h, v0.h[6]\n"
+ "smlal v23.4s, v5.4h, v0.h[7]\n"
+ "ldr d0, [x20, #0x0]\n"
+ "usubl v0.8h, v0.8b, v13.8b\n"
+ "smlal v24.4s, v5.4h, v4.h[0]\n"
+ "smlal v25.4s, v5.4h, v4.h[1]\n"
+ "ldr x22, [%x[outptrs], #0x28]\n"
+ "ldr x21, [%x[outptrs], #0x30]\n"
+ "smlal v26.4s, v5.4h, v4.h[2]\n"
+ "smlal v27.4s, v5.4h, v4.h[3]\n"
+ "ldr x20, [%x[outptrs], #0x38]\n"
+ "smlal v28.4s, v5.4h, v4.h[4]\n"
+ "smlal v29.4s, v5.4h, v4.h[5]\n"
+ "smlal v30.4s, v5.4h, v4.h[6]\n"
+ "smlal v31.4s, v5.4h, v4.h[7]\n"
+ "ldr s5, [%x[weights]], #0x4\n"
+ "ldr d4, [x28, #0x0]\n"
+ "smlal v16.4s, v7.4h, v3.h[0]\n"
+ "smlal v17.4s, v7.4h, v3.h[1]\n"
+ "usubl v5.8h, v5.8b, v12.8b\n"
+ "smlal v18.4s, v7.4h, v3.h[2]\n"
+ "smlal v19.4s, v7.4h, v3.h[3]\n"
+ "usubl v4.8h, v4.8b, v13.8b\n"
+ "smlal v16.4s, v5.4h, v0.h[0]\n"
+ "smlal v17.4s, v5.4h, v0.h[1]\n"
+ "sshl v16.4s, v16.4s, v10.4s\n"
+ "smlal v18.4s, v5.4h, v0.h[2]\n"
+ "smlal v19.4s, v5.4h, v0.h[3]\n"
+ "sshl v17.4s, v17.4s, v10.4s\n"
+ "smlal v20.4s, v7.4h, v3.h[4]\n"
+ "smlal v21.4s, v7.4h, v3.h[5]\n"
+ "sshl v18.4s, v18.4s, v10.4s\n"
+ "smlal v22.4s, v7.4h, v3.h[6]\n"
+ "smlal v23.4s, v7.4h, v3.h[7]\n"
+ "sshl v19.4s, v19.4s, v10.4s\n"
+ "smlal v24.4s, v7.4h, v6.h[0]\n"
+ "smlal v25.4s, v7.4h, v6.h[1]\n"
+ "sqrdmulh v16.4s, v16.4s, v9.4s\n"
+ "smlal v20.4s, v5.4h, v0.h[4]\n"
+ "smlal v21.4s, v5.4h, v0.h[5]\n"
+ "sqrdmulh v17.4s, v17.4s, v9.4s\n"
+ "smlal v22.4s, v5.4h, v0.h[6]\n"
+ "smlal v23.4s, v5.4h, v0.h[7]\n"
+ "sqrdmulh v18.4s, v18.4s, v9.4s\n"
+ "smlal v24.4s, v5.4h, v4.h[0]\n"
+ "smlal v25.4s, v5.4h, v4.h[1]\n"
+ "sqrdmulh v19.4s, v19.4s, v9.4s\n"
+ "smlal v26.4s, v7.4h, v6.h[2]\n"
+ "smlal v27.4s, v7.4h, v6.h[3]\n"
+ "and v3.16b, v16.16b, v8.16b\n"
+ "smlal v28.4s, v7.4h, v6.h[4]\n"
+ "smlal v29.4s, v7.4h, v6.h[5]\n"
+ "and v2.16b, v17.16b, v8.16b\n"
+ "smlal v30.4s, v7.4h, v6.h[6]\n"
+ "smlal v31.4s, v7.4h, v6.h[7]\n"
+ "and v1.16b, v18.16b, v8.16b\n"
+ "and v0.16b, v19.16b, v8.16b\n"
+ "sshl v20.4s, v20.4s, v10.4s\n"
+ "smlal v26.4s, v5.4h, v4.h[2]\n"
+ "sshl v21.4s, v21.4s, v10.4s\n"
+ "sshl v22.4s, v22.4s, v10.4s\n"
+ "smlal v27.4s, v5.4h, v4.h[3]\n"
+ "sshl v23.4s, v23.4s, v10.4s\n"
+ "sshl v24.4s, v24.4s, v10.4s\n"
+ "smlal v28.4s, v5.4h, v4.h[4]\n"
+ "sshl v25.4s, v25.4s, v10.4s\n"
+ "smlal v29.4s, v5.4h, v4.h[5]\n"
+ "smlal v30.4s, v5.4h, v4.h[6]\n"
+ "smlal v31.4s, v5.4h, v4.h[7]\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "sqrdmulh v20.4s, v20.4s, v9.4s\n"
+ "sqrdmulh v21.4s, v21.4s, v9.4s\n"
+ "sqrdmulh v22.4s, v22.4s, v9.4s\n"
+ "sqrdmulh v23.4s, v23.4s, v9.4s\n"
+ "sqrdmulh v24.4s, v24.4s, v9.4s\n"
+ "sqrdmulh v25.4s, v25.4s, v9.4s\n"
+ "sqadd v16.4s, v16.4s, v3.4s\n"
+ "sqadd v17.4s, v17.4s, v2.4s\n"
+ "sqadd v18.4s, v18.4s, v1.4s\n"
+ "sqadd v19.4s, v19.4s, v0.4s\n"
+ "and v5.16b, v20.16b, v8.16b\n"
+ "and v4.16b, v21.16b, v8.16b\n"
+ "and v3.16b, v22.16b, v8.16b\n"
+ "and v2.16b, v23.16b, v8.16b\n"
+ "and v1.16b, v24.16b, v8.16b\n"
+ "and v0.16b, v25.16b, v8.16b\n"
+ "sshl v26.4s, v26.4s, v10.4s\n"
+ "sshl v27.4s, v27.4s, v10.4s\n"
+ "sshl v28.4s, v28.4s, v10.4s\n"
+ "sshl v29.4s, v29.4s, v10.4s\n"
+ "sshl v30.4s, v30.4s, v10.4s\n"
+ "sshl v31.4s, v31.4s, v10.4s\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "sqrdmulh v26.4s, v26.4s, v9.4s\n"
+ "sqrdmulh v27.4s, v27.4s, v9.4s\n"
+ "sqrdmulh v28.4s, v28.4s, v9.4s\n"
+ "sqrdmulh v29.4s, v29.4s, v9.4s\n"
+ "sqrdmulh v30.4s, v30.4s, v9.4s\n"
+ "sqrdmulh v31.4s, v31.4s, v9.4s\n"
+ "sqadd v20.4s, v20.4s, v5.4s\n"
+ "sqadd v21.4s, v21.4s, v4.4s\n"
+ "sqadd v22.4s, v22.4s, v3.4s\n"
+ "sqadd v23.4s, v23.4s, v2.4s\n"
+ "sqadd v24.4s, v24.4s, v1.4s\n"
+ "sqadd v25.4s, v25.4s, v0.4s\n"
+ "and v5.16b, v26.16b, v8.16b\n"
+ "and v4.16b, v27.16b, v8.16b\n"
+ "and v3.16b, v28.16b, v8.16b\n"
+ "and v2.16b, v29.16b, v8.16b\n"
+ "and v1.16b, v30.16b, v8.16b\n"
+ "and v0.16b, v31.16b, v8.16b\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "srshl v16.4s, v16.4s, v8.4s\n"
+ "srshl v17.4s, v17.4s, v8.4s\n"
+ "srshl v18.4s, v18.4s, v8.4s\n"
+ "srshl v19.4s, v19.4s, v8.4s\n"
+ "srshl v20.4s, v20.4s, v8.4s\n"
+ "srshl v21.4s, v21.4s, v8.4s\n"
+ "srshl v22.4s, v22.4s, v8.4s\n"
+ "srshl v23.4s, v23.4s, v8.4s\n"
+ "sqadd v26.4s, v26.4s, v5.4s\n"
+ "sqadd v27.4s, v27.4s, v4.4s\n"
+ "sqadd v28.4s, v28.4s, v3.4s\n"
+ "sqadd v29.4s, v29.4s, v2.4s\n"
+ "sqadd v30.4s, v30.4s, v1.4s\n"
+ "sqadd v31.4s, v31.4s, v0.4s\n"
+ "add v16.4s, v16.4s, v11.4s\n"
+ "add v17.4s, v17.4s, v11.4s\n"
+ "add v18.4s, v18.4s, v11.4s\n"
+ "add v19.4s, v19.4s, v11.4s\n"
+ "add v20.4s, v20.4s, v11.4s\n"
+ "add v21.4s, v21.4s, v11.4s\n"
+ "add v22.4s, v22.4s, v11.4s\n"
+ "add v23.4s, v23.4s, v11.4s\n"
+ "srshl v24.4s, v24.4s, v8.4s\n"
+ "srshl v25.4s, v25.4s, v8.4s\n"
+ "srshl v26.4s, v26.4s, v8.4s\n"
+ "srshl v27.4s, v27.4s, v8.4s\n"
+ "srshl v28.4s, v28.4s, v8.4s\n"
+ "srshl v29.4s, v29.4s, v8.4s\n"
+ "srshl v30.4s, v30.4s, v8.4s\n"
+ "srshl v31.4s, v31.4s, v8.4s\n"
+ "smin v16.4s, v16.4s, v14.4s\n"
+ "smin v17.4s, v17.4s, v14.4s\n"
+ "smin v18.4s, v18.4s, v14.4s\n"
+ "smin v19.4s, v19.4s, v14.4s\n"
+ "smin v20.4s, v20.4s, v14.4s\n"
+ "smin v21.4s, v21.4s, v14.4s\n"
+ "smin v22.4s, v22.4s, v14.4s\n"
+ "smin v23.4s, v23.4s, v14.4s\n"
+ "add v24.4s, v24.4s, v11.4s\n"
+ "add v25.4s, v25.4s, v11.4s\n"
+ "add v26.4s, v26.4s, v11.4s\n"
+ "add v27.4s, v27.4s, v11.4s\n"
+ "add v28.4s, v28.4s, v11.4s\n"
+ "add v29.4s, v29.4s, v11.4s\n"
+ "add v30.4s, v30.4s, v11.4s\n"
+ "add v31.4s, v31.4s, v11.4s\n"
+ "smax v16.4s, v16.4s, v15.4s\n"
+ "smax v17.4s, v17.4s, v15.4s\n"
+ "smax v18.4s, v18.4s, v15.4s\n"
+ "smax v19.4s, v19.4s, v15.4s\n"
+ "smax v20.4s, v20.4s, v15.4s\n"
+ "smax v21.4s, v21.4s, v15.4s\n"
+ "smax v22.4s, v22.4s, v15.4s\n"
+ "smax v23.4s, v23.4s, v15.4s\n"
+ "smin v24.4s, v24.4s, v14.4s\n"
+ "smin v25.4s, v25.4s, v14.4s\n"
+ "smin v26.4s, v26.4s, v14.4s\n"
+ "smin v27.4s, v27.4s, v14.4s\n"
+ "smin v28.4s, v28.4s, v14.4s\n"
+ "smin v29.4s, v29.4s, v14.4s\n"
+ "smin v30.4s, v30.4s, v14.4s\n"
+ "smin v31.4s, v31.4s, v14.4s\n"
+ "uzp1 v16.16b, v16.16b, v16.16b\n"
+ "uzp1 v17.16b, v17.16b, v17.16b\n"
+ "uzp1 v18.16b, v18.16b, v18.16b\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "smax v24.4s, v24.4s, v15.4s\n"
+ "smax v25.4s, v25.4s, v15.4s\n"
+ "smax v26.4s, v26.4s, v15.4s\n"
+ "smax v27.4s, v27.4s, v15.4s\n"
+ "smax v28.4s, v28.4s, v15.4s\n"
+ "smax v29.4s, v29.4s, v15.4s\n"
+ "smax v30.4s, v30.4s, v15.4s\n"
+ "smax v31.4s, v31.4s, v15.4s\n"
+ "uzp1 v16.16b, v16.16b, v16.16b\n"
+ "str s16, [x27, x9]\n"
+ "ldr x27, [%x[outptrs], #0x40]\n"
+ "uzp1 v17.16b, v17.16b, v17.16b\n"
+ "uzp1 v18.16b, v18.16b, v18.16b\n"
+ "str s17, [x26, x9]\n"
+ "ldr x26, [%x[outptrs], #0x48]\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "str s18, [x25, x9]\n"
+ "ldr x25, [%x[outptrs], #0x50]\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "str s19, [x24, x9]\n"
+ "ldr x24, [%x[outptrs], #0x58]\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "str s20, [x23, x9]\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "str s21, [x22, x9]\n"
+ "ldr x22, [%x[outptrs], #0x68]\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "str s22, [x21, x9]\n"
+ "ldr x21, [%x[outptrs], #0x70]\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "str s23, [x20, x9]\n"
+ "ldr x20, [%x[outptrs], #0x78]\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "str s24, [x27, x9]\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "str s25, [x26, x9]\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "str s26, [x25, x9]\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "str s27, [x24, x9]\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "str s28, [x23, x9]\n"
+ "str s29, [x22, x9]\n"
+ "str s30, [x21, x9]\n"
+ "str s31, [x20, x9]\n"
+ "b 8f\n"
+ "7:" // Output channel loop: Single kernel point
+ "smlal v16.4s, v5.4h, v0.h[0]\n"
+ "smlal v17.4s, v5.4h, v0.h[1]\n"
+ "sshl v16.4s, v16.4s, v10.4s\n"
+ "ldr x27, [%x[outptrs], #0x0]\n"
+ "smlal v18.4s, v5.4h, v0.h[2]\n"
+ "smlal v19.4s, v5.4h, v0.h[3]\n"
+ "sshl v17.4s, v17.4s, v10.4s\n"
+ "ldr x26, [%x[outptrs], #0x8]\n"
+ "sshl v18.4s, v18.4s, v10.4s\n"
+ "sshl v19.4s, v19.4s, v10.4s\n"
+ "smlal v20.4s, v5.4h, v0.h[4]\n"
+ "ldr x25, [%x[outptrs], #0x10]\n"
+ "smlal v21.4s, v5.4h, v0.h[5]\n"
+ "smlal v22.4s, v5.4h, v0.h[6]\n"
+ "sqrdmulh v16.4s, v16.4s, v9.4s\n"
+ "ldr x24, [%x[outptrs], #0x18]\n"
+ "smlal v23.4s, v5.4h, v0.h[7]\n"
+ "smlal v24.4s, v5.4h, v4.h[0]\n"
+ "sqrdmulh v17.4s, v17.4s, v9.4s\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "smlal v25.4s, v5.4h, v4.h[1]\n"
+ "sqrdmulh v18.4s, v18.4s, v9.4s\n"
+ "smlal v26.4s, v5.4h, v4.h[2]\n"
+ "ldr x22, [%x[outptrs], #0x28]\n"
+ "sqrdmulh v19.4s, v19.4s, v9.4s\n"
+ "and v3.16b, v16.16b, v8.16b\n"
+ "smlal v27.4s, v5.4h, v4.h[3]\n"
+ "ldr x21, [%x[outptrs], #0x30]\n"
+ "and v2.16b, v17.16b, v8.16b\n"
+ "and v1.16b, v18.16b, v8.16b\n"
+ "smlal v28.4s, v5.4h, v4.h[4]\n"
+ "ldr x20, [%x[outptrs], #0x38]\n"
+ "and v0.16b, v19.16b, v8.16b\n"
+ "sshl v20.4s, v20.4s, v10.4s\n"
+ "smlal v29.4s, v5.4h, v4.h[5]\n"
+ "sshl v21.4s, v21.4s, v10.4s\n"
+ "sshl v22.4s, v22.4s, v10.4s\n"
+ "smlal v30.4s, v5.4h, v4.h[6]\n"
+ "sshl v23.4s, v23.4s, v10.4s\n"
+ "sshl v24.4s, v24.4s, v10.4s\n"
+ "smlal v31.4s, v5.4h, v4.h[7]\n"
+ "sshl v25.4s, v25.4s, v10.4s\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "sqrdmulh v20.4s, v20.4s, v9.4s\n"
+ "sqrdmulh v21.4s, v21.4s, v9.4s\n"
+ "sqrdmulh v22.4s, v22.4s, v9.4s\n"
+ "sqrdmulh v23.4s, v23.4s, v9.4s\n"
+ "sqrdmulh v24.4s, v24.4s, v9.4s\n"
+ "sqrdmulh v25.4s, v25.4s, v9.4s\n"
+ "sqadd v16.4s, v16.4s, v3.4s\n"
+ "sqadd v17.4s, v17.4s, v2.4s\n"
+ "sqadd v18.4s, v18.4s, v1.4s\n"
+ "sqadd v19.4s, v19.4s, v0.4s\n"
+ "and v5.16b, v20.16b, v8.16b\n"
+ "and v4.16b, v21.16b, v8.16b\n"
+ "and v3.16b, v22.16b, v8.16b\n"
+ "and v2.16b, v23.16b, v8.16b\n"
+ "and v1.16b, v24.16b, v8.16b\n"
+ "and v0.16b, v25.16b, v8.16b\n"
+ "sshl v26.4s, v26.4s, v10.4s\n"
+ "sshl v27.4s, v27.4s, v10.4s\n"
+ "sshl v28.4s, v28.4s, v10.4s\n"
+ "sshl v29.4s, v29.4s, v10.4s\n"
+ "sshl v30.4s, v30.4s, v10.4s\n"
+ "sshl v31.4s, v31.4s, v10.4s\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "sqrdmulh v26.4s, v26.4s, v9.4s\n"
+ "sqrdmulh v27.4s, v27.4s, v9.4s\n"
+ "sqrdmulh v28.4s, v28.4s, v9.4s\n"
+ "sqrdmulh v29.4s, v29.4s, v9.4s\n"
+ "sqrdmulh v30.4s, v30.4s, v9.4s\n"
+ "sqrdmulh v31.4s, v31.4s, v9.4s\n"
+ "sqadd v20.4s, v20.4s, v5.4s\n"
+ "sqadd v21.4s, v21.4s, v4.4s\n"
+ "sqadd v22.4s, v22.4s, v3.4s\n"
+ "sqadd v23.4s, v23.4s, v2.4s\n"
+ "sqadd v24.4s, v24.4s, v1.4s\n"
+ "sqadd v25.4s, v25.4s, v0.4s\n"
+ "and v5.16b, v26.16b, v8.16b\n"
+ "and v4.16b, v27.16b, v8.16b\n"
+ "and v3.16b, v28.16b, v8.16b\n"
+ "and v2.16b, v29.16b, v8.16b\n"
+ "and v1.16b, v30.16b, v8.16b\n"
+ "and v0.16b, v31.16b, v8.16b\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "srshl v16.4s, v16.4s, v8.4s\n"
+ "srshl v17.4s, v17.4s, v8.4s\n"
+ "srshl v18.4s, v18.4s, v8.4s\n"
+ "srshl v19.4s, v19.4s, v8.4s\n"
+ "srshl v20.4s, v20.4s, v8.4s\n"
+ "srshl v21.4s, v21.4s, v8.4s\n"
+ "srshl v22.4s, v22.4s, v8.4s\n"
+ "srshl v23.4s, v23.4s, v8.4s\n"
+ "sqadd v26.4s, v26.4s, v5.4s\n"
+ "sqadd v27.4s, v27.4s, v4.4s\n"
+ "sqadd v28.4s, v28.4s, v3.4s\n"
+ "sqadd v29.4s, v29.4s, v2.4s\n"
+ "sqadd v30.4s, v30.4s, v1.4s\n"
+ "sqadd v31.4s, v31.4s, v0.4s\n"
+ "add v16.4s, v16.4s, v11.4s\n"
+ "add v17.4s, v17.4s, v11.4s\n"
+ "add v18.4s, v18.4s, v11.4s\n"
+ "add v19.4s, v19.4s, v11.4s\n"
+ "add v20.4s, v20.4s, v11.4s\n"
+ "add v21.4s, v21.4s, v11.4s\n"
+ "add v22.4s, v22.4s, v11.4s\n"
+ "add v23.4s, v23.4s, v11.4s\n"
+ "srshl v24.4s, v24.4s, v8.4s\n"
+ "srshl v25.4s, v25.4s, v8.4s\n"
+ "srshl v26.4s, v26.4s, v8.4s\n"
+ "srshl v27.4s, v27.4s, v8.4s\n"
+ "srshl v28.4s, v28.4s, v8.4s\n"
+ "srshl v29.4s, v29.4s, v8.4s\n"
+ "srshl v30.4s, v30.4s, v8.4s\n"
+ "srshl v31.4s, v31.4s, v8.4s\n"
+ "smin v16.4s, v16.4s, v14.4s\n"
+ "smin v17.4s, v17.4s, v14.4s\n"
+ "smin v18.4s, v18.4s, v14.4s\n"
+ "smin v19.4s, v19.4s, v14.4s\n"
+ "smin v20.4s, v20.4s, v14.4s\n"
+ "smin v21.4s, v21.4s, v14.4s\n"
+ "smin v22.4s, v22.4s, v14.4s\n"
+ "smin v23.4s, v23.4s, v14.4s\n"
+ "add v24.4s, v24.4s, v11.4s\n"
+ "add v25.4s, v25.4s, v11.4s\n"
+ "add v26.4s, v26.4s, v11.4s\n"
+ "add v27.4s, v27.4s, v11.4s\n"
+ "add v28.4s, v28.4s, v11.4s\n"
+ "add v29.4s, v29.4s, v11.4s\n"
+ "add v30.4s, v30.4s, v11.4s\n"
+ "add v31.4s, v31.4s, v11.4s\n"
+ "smax v16.4s, v16.4s, v15.4s\n"
+ "smax v17.4s, v17.4s, v15.4s\n"
+ "smax v18.4s, v18.4s, v15.4s\n"
+ "smax v19.4s, v19.4s, v15.4s\n"
+ "smax v20.4s, v20.4s, v15.4s\n"
+ "smax v21.4s, v21.4s, v15.4s\n"
+ "smax v22.4s, v22.4s, v15.4s\n"
+ "smax v23.4s, v23.4s, v15.4s\n"
+ "smin v24.4s, v24.4s, v14.4s\n"
+ "smin v25.4s, v25.4s, v14.4s\n"
+ "smin v26.4s, v26.4s, v14.4s\n"
+ "smin v27.4s, v27.4s, v14.4s\n"
+ "smin v28.4s, v28.4s, v14.4s\n"
+ "smin v29.4s, v29.4s, v14.4s\n"
+ "smin v30.4s, v30.4s, v14.4s\n"
+ "smin v31.4s, v31.4s, v14.4s\n"
+ "uzp1 v16.16b, v16.16b, v16.16b\n"
+ "uzp1 v17.16b, v17.16b, v17.16b\n"
+ "uzp1 v18.16b, v18.16b, v18.16b\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "smax v24.4s, v24.4s, v15.4s\n"
+ "smax v25.4s, v25.4s, v15.4s\n"
+ "smax v26.4s, v26.4s, v15.4s\n"
+ "smax v27.4s, v27.4s, v15.4s\n"
+ "smax v28.4s, v28.4s, v15.4s\n"
+ "smax v29.4s, v29.4s, v15.4s\n"
+ "smax v30.4s, v30.4s, v15.4s\n"
+ "smax v31.4s, v31.4s, v15.4s\n"
+ "uzp1 v16.16b, v16.16b, v16.16b\n"
+ "str s16, [x27, x9]\n"
+ "ldr x27, [%x[outptrs], #0x40]\n"
+ "uzp1 v17.16b, v17.16b, v17.16b\n"
+ "uzp1 v18.16b, v18.16b, v18.16b\n"
+ "str s17, [x26, x9]\n"
+ "ldr x26, [%x[outptrs], #0x48]\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "str s18, [x25, x9]\n"
+ "ldr x25, [%x[outptrs], #0x50]\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "str s19, [x24, x9]\n"
+ "ldr x24, [%x[outptrs], #0x58]\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "str s20, [x23, x9]\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "str s21, [x22, x9]\n"
+ "ldr x22, [%x[outptrs], #0x68]\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "str s22, [x21, x9]\n"
+ "ldr x21, [%x[outptrs], #0x70]\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "str s23, [x20, x9]\n"
+ "ldr x20, [%x[outptrs], #0x78]\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "str s24, [x27, x9]\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "str s25, [x26, x9]\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "str s26, [x25, x9]\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "str s27, [x24, x9]\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "str s28, [x23, x9]\n"
+ "str s29, [x22, x9]\n"
+ "str s30, [x21, x9]\n"
+ "str s31, [x20, x9]\n"
+ "8:" // Output channel loop: Done
+ "add x9, x9, #0x4\n"
+ "cmp x9, x10, LSL #2\n"
+ "blt 1b\n"
+ "tst %x[n_output_channels], #0x3\n"
+ "beq 26f\n"
+ "9:" // Output channel oddments
+ "movi v31.4s, #0x0\n"
+ "cbz %x[bias], 12f\n"
+ "add x20, %x[bias], x9, LSL #2\n"
+ "tbz %x[n_output_channels], #1, 10f\n"
+ "ld1 { v31.d }[0], [x20], #0x8\n"
+ "tbz %x[n_output_channels], #0, 11f\n"
+ "ld1 { v31.s }[2], [x20]\n"
+ "b 11f\n"
+ "10:" // Output channel oddments: Load bias: Bit 1: Unset
+ "ld1 { v31.s }[0], [x20]\n"
+ "11:" // Output channel oddments: Load bias: Bit 1: End
+ "12:" // Output channel oddments: Load bias: Done
+ "mov v16.16b, v31.16b\n"
+ "mov v17.16b, v31.16b\n"
+ "mov v18.16b, v31.16b\n"
+ "mov v19.16b, v31.16b\n"
+ "mov v20.16b, v31.16b\n"
+ "mov v21.16b, v31.16b\n"
+ "mov v22.16b, v31.16b\n"
+ "mov v23.16b, v31.16b\n"
+ "mov v24.16b, v31.16b\n"
+ "mov v25.16b, v31.16b\n"
+ "mov v26.16b, v31.16b\n"
+ "mov v27.16b, v31.16b\n"
+ "mov v28.16b, v31.16b\n"
+ "mov v29.16b, v31.16b\n"
+ "mov v30.16b, v31.16b\n"
+ "mov v31.16b, v31.16b\n"
+ "cbz %x[rq_mul_ptr], 18f\n"
+ "add x22, %x[rq_mul_ptr], x9, LSL #2\n"
+ "add x21, %x[rq_right_shift_ptr], x9, LSL #2\n"
+ "add x20, %x[rq_left_shift_ptr], x9, LSL #2\n"
+ "cbz %x[rq_left_shift_ptr], 15f\n"
+ "tbz %x[n_output_channels], #1, 13f\n"
+ "ld1 { v9.d }[0], [x22], #0x8\n"
+ "ld1 { v8.d }[0], [x21], #0x8\n"
+ "ld1 { v10.d }[0], [x20], #0x8\n"
+ "tbz %x[n_output_channels], #0, 14f\n"
+ "ld1 { v9.s }[2], [x22], #0x4\n"
+ "ld1 { v8.s }[2], [x21], #0x4\n"
+ "ld1 { v10.s }[2], [x20], #0x4\n"
+ "b 14f\n"
+ "13:" // Output channel oddments: Load quantization parameters: With left shift: Bit 1: Unset
+ "ld1 { v9.s }[0], [x22], #0x4\n"
+ "ld1 { v8.s }[0], [x21], #0x4\n"
+ "ld1 { v10.s }[0], [x20], #0x4\n"
+ "14:" // Output channel oddments: Load quantization parameters: With left shift: Bit 1: End
+ "b 18f\n"
+ "15:" // Output channel oddments: Load quantization parameters: No left shift
+ "tbz %x[n_output_channels], #1, 16f\n"
+ "ld1 { v9.d }[0], [x22], #0x8\n"
+ "ld1 { v8.d }[0], [x21], #0x8\n"
+ "tbz %x[n_output_channels], #0, 17f\n"
+ "ld1 { v9.s }[2], [x22], #0x4\n"
+ "ld1 { v8.s }[2], [x21], #0x4\n"
+ "b 17f\n"
+ "16:" // Output channel oddments: Load quantization parameters: No left shift: Bit 1: Unset
+ "ld1 { v9.s }[0], [x22], #0x4\n"
+ "ld1 { v8.s }[0], [x21], #0x4\n"
+ "17:" // Output channel oddments: Load quantization parameters: No left shift: Bit 1: End
+ "18:" // Output channel oddments: Load quantization parameters: Done
+ "ldr s5, [%x[weights]], #0x4\n"
+ "mov x22, %x[inptrs]\n"
+ "ldp x21, x20, [x22], #0x10\n"
+ "lsr x23, %x[kernel_points], #0x1\n"
+ "ldr d0, [x21, #0x0]\n"
+ "ldr d4, [x20, #0x0]\n"
+ "usubl v0.8h, v0.8b, v13.8b\n"
+ "usubl v4.8h, v4.8b, v13.8b\n"
+ "usubl v5.8h, v5.8b, v12.8b\n"
+ "cbz x23, 22f\n"
+ "ldr s7, [%x[weights]], #0x4\n"
+ "ldp x21, x20, [x22], #0x10\n"
+ "subs x23, x23, #0x1\n"
+ "usubl v7.8h, v7.8b, v12.8b\n"
+ "ldr d3, [x21, #0x0]\n"
+ "ldr d6, [x20, #0x0]\n"
+ "usubl v3.8h, v3.8b, v13.8b\n"
+ "usubl v6.8h, v6.8b, v13.8b\n"
+ "beq 20f\n"
+ "19:" // Output channel oddments: Kernel loop
+ "ldp x21, x20, [x22], #0x10\n"
+ "smlal v16.4s, v5.4h, v0.h[0]\n"
+ "smlal v17.4s, v5.4h, v0.h[1]\n"
+ "subs x23, x23, #0x1\n"
+ "smlal v18.4s, v5.4h, v0.h[2]\n"
+ "smlal v19.4s, v5.4h, v0.h[3]\n"
+ "smlal v20.4s, v5.4h, v0.h[4]\n"
+ "smlal v21.4s, v5.4h, v0.h[5]\n"
+ "smlal v22.4s, v5.4h, v0.h[6]\n"
+ "smlal v23.4s, v5.4h, v0.h[7]\n"
+ "ldr d0, [x21, #0x0]\n"
+ "usubl v0.8h, v0.8b, v13.8b\n"
+ "smlal v24.4s, v5.4h, v4.h[0]\n"
+ "smlal v25.4s, v5.4h, v4.h[1]\n"
+ "smlal v26.4s, v5.4h, v4.h[2]\n"
+ "smlal v27.4s, v5.4h, v4.h[3]\n"
+ "smlal v28.4s, v5.4h, v4.h[4]\n"
+ "smlal v29.4s, v5.4h, v4.h[5]\n"
+ "smlal v30.4s, v5.4h, v4.h[6]\n"
+ "smlal v31.4s, v5.4h, v4.h[7]\n"
+ "ldr d4, [x20, #0x0]\n"
+ "ldr s5, [%x[weights]], #0x4\n"
+ "ldp x21, x20, [x22], #0x10\n"
+ "smlal v16.4s, v7.4h, v3.h[0]\n"
+ "smlal v17.4s, v7.4h, v3.h[1]\n"
+ "usubl v4.8h, v4.8b, v13.8b\n"
+ "smlal v18.4s, v7.4h, v3.h[2]\n"
+ "smlal v19.4s, v7.4h, v3.h[3]\n"
+ "usubl v5.8h, v5.8b, v12.8b\n"
+ "smlal v20.4s, v7.4h, v3.h[4]\n"
+ "smlal v21.4s, v7.4h, v3.h[5]\n"
+ "smlal v22.4s, v7.4h, v3.h[6]\n"
+ "smlal v23.4s, v7.4h, v3.h[7]\n"
+ "ldr d3, [x21, #0x0]\n"
+ "usubl v3.8h, v3.8b, v13.8b\n"
+ "smlal v24.4s, v7.4h, v6.h[0]\n"
+ "smlal v25.4s, v7.4h, v6.h[1]\n"
+ "smlal v26.4s, v7.4h, v6.h[2]\n"
+ "smlal v27.4s, v7.4h, v6.h[3]\n"
+ "smlal v28.4s, v7.4h, v6.h[4]\n"
+ "smlal v29.4s, v7.4h, v6.h[5]\n"
+ "smlal v30.4s, v7.4h, v6.h[6]\n"
+ "smlal v31.4s, v7.4h, v6.h[7]\n"
+ "ldr d6, [x20, #0x0]\n"
+ "ldr s7, [%x[weights]], #0x4\n"
+ "usubl v6.8h, v6.8b, v13.8b\n"
+ "usubl v7.8h, v7.8b, v12.8b\n"
+ "bgt 19b\n"
+ "20:" // Output channel oddments: Kernel loop tail
+ "tbnz %x[kernel_points], #0, 21f\n"
+ "smlal v16.4s, v5.4h, v0.h[0]\n"
+ "smlal v17.4s, v5.4h, v0.h[1]\n"
+ "smlal v18.4s, v5.4h, v0.h[2]\n"
+ "smlal v19.4s, v5.4h, v0.h[3]\n"
+ "smlal v20.4s, v5.4h, v0.h[4]\n"
+ "smlal v21.4s, v5.4h, v0.h[5]\n"
+ "smlal v22.4s, v5.4h, v0.h[6]\n"
+ "smlal v23.4s, v5.4h, v0.h[7]\n"
+ "smlal v24.4s, v5.4h, v4.h[0]\n"
+ "smlal v25.4s, v5.4h, v4.h[1]\n"
+ "smlal v26.4s, v5.4h, v4.h[2]\n"
+ "smlal v27.4s, v5.4h, v4.h[3]\n"
+ "smlal v28.4s, v5.4h, v4.h[4]\n"
+ "smlal v29.4s, v5.4h, v4.h[5]\n"
+ "smlal v30.4s, v5.4h, v4.h[6]\n"
+ "smlal v31.4s, v5.4h, v4.h[7]\n"
+ "smlal v16.4s, v7.4h, v3.h[0]\n"
+ "smlal v17.4s, v7.4h, v3.h[1]\n"
+ "smlal v18.4s, v7.4h, v3.h[2]\n"
+ "smlal v19.4s, v7.4h, v3.h[3]\n"
+ "smlal v20.4s, v7.4h, v3.h[4]\n"
+ "smlal v21.4s, v7.4h, v3.h[5]\n"
+ "smlal v22.4s, v7.4h, v3.h[6]\n"
+ "smlal v23.4s, v7.4h, v3.h[7]\n"
+ "smlal v24.4s, v7.4h, v6.h[0]\n"
+ "smlal v25.4s, v7.4h, v6.h[1]\n"
+ "smlal v26.4s, v7.4h, v6.h[2]\n"
+ "smlal v27.4s, v7.4h, v6.h[3]\n"
+ "smlal v28.4s, v7.4h, v6.h[4]\n"
+ "smlal v29.4s, v7.4h, v6.h[5]\n"
+ "smlal v30.4s, v7.4h, v6.h[6]\n"
+ "smlal v31.4s, v7.4h, v6.h[7]\n"
+ "b 23f\n"
+ "21:" // Output channel oddments: Odd tail
+ "ldp x21, x20, [x22], #0x10\n"
+ "smlal v16.4s, v5.4h, v0.h[0]\n"
+ "smlal v17.4s, v5.4h, v0.h[1]\n"
+ "smlal v18.4s, v5.4h, v0.h[2]\n"
+ "smlal v19.4s, v5.4h, v0.h[3]\n"
+ "smlal v20.4s, v5.4h, v0.h[4]\n"
+ "smlal v21.4s, v5.4h, v0.h[5]\n"
+ "smlal v22.4s, v5.4h, v0.h[6]\n"
+ "smlal v23.4s, v5.4h, v0.h[7]\n"
+ "ldr d2, [x21, #0x0]\n"
+ "usubl v2.8h, v2.8b, v13.8b\n"
+ "smlal v24.4s, v5.4h, v4.h[0]\n"
+ "smlal v25.4s, v5.4h, v4.h[1]\n"
+ "smlal v26.4s, v5.4h, v4.h[2]\n"
+ "smlal v27.4s, v5.4h, v4.h[3]\n"
+ "smlal v28.4s, v5.4h, v4.h[4]\n"
+ "smlal v29.4s, v5.4h, v4.h[5]\n"
+ "smlal v30.4s, v5.4h, v4.h[6]\n"
+ "smlal v31.4s, v5.4h, v4.h[7]\n"
+ "ldr d1, [x20, #0x0]\n"
+ "ldr s0, [%x[weights]], #0x4\n"
+ "smlal v16.4s, v7.4h, v3.h[0]\n"
+ "smlal v17.4s, v7.4h, v3.h[1]\n"
+ "usubl v1.8h, v1.8b, v13.8b\n"
+ "smlal v18.4s, v7.4h, v3.h[2]\n"
+ "smlal v19.4s, v7.4h, v3.h[3]\n"
+ "usubl v0.8h, v0.8b, v12.8b\n"
+ "smlal v20.4s, v7.4h, v3.h[4]\n"
+ "smlal v21.4s, v7.4h, v3.h[5]\n"
+ "smlal v22.4s, v7.4h, v3.h[6]\n"
+ "smlal v23.4s, v7.4h, v3.h[7]\n"
+ "smlal v24.4s, v7.4h, v6.h[0]\n"
+ "smlal v25.4s, v7.4h, v6.h[1]\n"
+ "smlal v26.4s, v7.4h, v6.h[2]\n"
+ "smlal v27.4s, v7.4h, v6.h[3]\n"
+ "smlal v28.4s, v7.4h, v6.h[4]\n"
+ "smlal v29.4s, v7.4h, v6.h[5]\n"
+ "smlal v30.4s, v7.4h, v6.h[6]\n"
+ "smlal v31.4s, v7.4h, v6.h[7]\n"
+ "smlal v16.4s, v0.4h, v2.h[0]\n"
+ "smlal v17.4s, v0.4h, v2.h[1]\n"
+ "smlal v18.4s, v0.4h, v2.h[2]\n"
+ "smlal v19.4s, v0.4h, v2.h[3]\n"
+ "smlal v20.4s, v0.4h, v2.h[4]\n"
+ "smlal v21.4s, v0.4h, v2.h[5]\n"
+ "smlal v22.4s, v0.4h, v2.h[6]\n"
+ "smlal v23.4s, v0.4h, v2.h[7]\n"
+ "smlal v24.4s, v0.4h, v1.h[0]\n"
+ "smlal v25.4s, v0.4h, v1.h[1]\n"
+ "smlal v26.4s, v0.4h, v1.h[2]\n"
+ "smlal v27.4s, v0.4h, v1.h[3]\n"
+ "smlal v28.4s, v0.4h, v1.h[4]\n"
+ "smlal v29.4s, v0.4h, v1.h[5]\n"
+ "smlal v30.4s, v0.4h, v1.h[6]\n"
+ "smlal v31.4s, v0.4h, v1.h[7]\n"
+ "b 23f\n"
+ "22:" // Output channel oddments: Single kernel point
+ "smlal v16.4s, v5.4h, v0.h[0]\n"
+ "smlal v17.4s, v5.4h, v0.h[1]\n"
+ "smlal v18.4s, v5.4h, v0.h[2]\n"
+ "smlal v19.4s, v5.4h, v0.h[3]\n"
+ "smlal v20.4s, v5.4h, v0.h[4]\n"
+ "smlal v21.4s, v5.4h, v0.h[5]\n"
+ "smlal v22.4s, v5.4h, v0.h[6]\n"
+ "smlal v23.4s, v5.4h, v0.h[7]\n"
+ "smlal v24.4s, v5.4h, v4.h[0]\n"
+ "smlal v25.4s, v5.4h, v4.h[1]\n"
+ "smlal v26.4s, v5.4h, v4.h[2]\n"
+ "smlal v27.4s, v5.4h, v4.h[3]\n"
+ "smlal v28.4s, v5.4h, v4.h[4]\n"
+ "smlal v29.4s, v5.4h, v4.h[5]\n"
+ "smlal v30.4s, v5.4h, v4.h[6]\n"
+ "smlal v31.4s, v5.4h, v4.h[7]\n"
+ "23:" // Output channel oddments: Done
+ "sshl v16.4s, v16.4s, v10.4s\n"
+ "sshl v17.4s, v17.4s, v10.4s\n"
+ "sshl v18.4s, v18.4s, v10.4s\n"
+ "sshl v19.4s, v19.4s, v10.4s\n"
+ "sqrdmulh v16.4s, v16.4s, v9.4s\n"
+ "sqrdmulh v17.4s, v17.4s, v9.4s\n"
+ "sqrdmulh v18.4s, v18.4s, v9.4s\n"
+ "sqrdmulh v19.4s, v19.4s, v9.4s\n"
+ "and v3.16b, v16.16b, v8.16b\n"
+ "and v2.16b, v17.16b, v8.16b\n"
+ "and v1.16b, v18.16b, v8.16b\n"
+ "and v0.16b, v19.16b, v8.16b\n"
+ "sshl v20.4s, v20.4s, v10.4s\n"
+ "sshl v21.4s, v21.4s, v10.4s\n"
+ "sshl v22.4s, v22.4s, v10.4s\n"
+ "sshl v23.4s, v23.4s, v10.4s\n"
+ "sshl v24.4s, v24.4s, v10.4s\n"
+ "sshl v25.4s, v25.4s, v10.4s\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "sqrdmulh v20.4s, v20.4s, v9.4s\n"
+ "sqrdmulh v21.4s, v21.4s, v9.4s\n"
+ "sqrdmulh v22.4s, v22.4s, v9.4s\n"
+ "sqrdmulh v23.4s, v23.4s, v9.4s\n"
+ "sqrdmulh v24.4s, v24.4s, v9.4s\n"
+ "sqrdmulh v25.4s, v25.4s, v9.4s\n"
+ "sqadd v16.4s, v16.4s, v3.4s\n"
+ "sqadd v17.4s, v17.4s, v2.4s\n"
+ "sqadd v18.4s, v18.4s, v1.4s\n"
+ "sqadd v19.4s, v19.4s, v0.4s\n"
+ "and v5.16b, v20.16b, v8.16b\n"
+ "and v4.16b, v21.16b, v8.16b\n"
+ "and v3.16b, v22.16b, v8.16b\n"
+ "and v2.16b, v23.16b, v8.16b\n"
+ "and v1.16b, v24.16b, v8.16b\n"
+ "and v0.16b, v25.16b, v8.16b\n"
+ "sshl v26.4s, v26.4s, v10.4s\n"
+ "sshl v27.4s, v27.4s, v10.4s\n"
+ "sshl v28.4s, v28.4s, v10.4s\n"
+ "sshl v29.4s, v29.4s, v10.4s\n"
+ "sshl v30.4s, v30.4s, v10.4s\n"
+ "sshl v31.4s, v31.4s, v10.4s\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "sqrdmulh v26.4s, v26.4s, v9.4s\n"
+ "sqrdmulh v27.4s, v27.4s, v9.4s\n"
+ "sqrdmulh v28.4s, v28.4s, v9.4s\n"
+ "sqrdmulh v29.4s, v29.4s, v9.4s\n"
+ "sqrdmulh v30.4s, v30.4s, v9.4s\n"
+ "sqrdmulh v31.4s, v31.4s, v9.4s\n"
+ "sqadd v20.4s, v20.4s, v5.4s\n"
+ "sqadd v21.4s, v21.4s, v4.4s\n"
+ "sqadd v22.4s, v22.4s, v3.4s\n"
+ "sqadd v23.4s, v23.4s, v2.4s\n"
+ "sqadd v24.4s, v24.4s, v1.4s\n"
+ "sqadd v25.4s, v25.4s, v0.4s\n"
+ "and v5.16b, v26.16b, v8.16b\n"
+ "and v4.16b, v27.16b, v8.16b\n"
+ "and v3.16b, v28.16b, v8.16b\n"
+ "and v2.16b, v29.16b, v8.16b\n"
+ "and v1.16b, v30.16b, v8.16b\n"
+ "and v0.16b, v31.16b, v8.16b\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "sqadd v26.4s, v26.4s, v5.4s\n"
+ "sqadd v27.4s, v27.4s, v4.4s\n"
+ "sqadd v28.4s, v28.4s, v3.4s\n"
+ "sqadd v29.4s, v29.4s, v2.4s\n"
+ "sqadd v30.4s, v30.4s, v1.4s\n"
+ "sqadd v31.4s, v31.4s, v0.4s\n"
+ "srshl v16.4s, v16.4s, v8.4s\n"
+ "srshl v17.4s, v17.4s, v8.4s\n"
+ "srshl v18.4s, v18.4s, v8.4s\n"
+ "srshl v19.4s, v19.4s, v8.4s\n"
+ "srshl v20.4s, v20.4s, v8.4s\n"
+ "srshl v21.4s, v21.4s, v8.4s\n"
+ "srshl v22.4s, v22.4s, v8.4s\n"
+ "srshl v23.4s, v23.4s, v8.4s\n"
+ "srshl v24.4s, v24.4s, v8.4s\n"
+ "srshl v25.4s, v25.4s, v8.4s\n"
+ "srshl v26.4s, v26.4s, v8.4s\n"
+ "srshl v27.4s, v27.4s, v8.4s\n"
+ "srshl v28.4s, v28.4s, v8.4s\n"
+ "srshl v29.4s, v29.4s, v8.4s\n"
+ "srshl v30.4s, v30.4s, v8.4s\n"
+ "srshl v31.4s, v31.4s, v8.4s\n"
+ "add v16.4s, v16.4s, v11.4s\n"
+ "add v17.4s, v17.4s, v11.4s\n"
+ "add v18.4s, v18.4s, v11.4s\n"
+ "add v19.4s, v19.4s, v11.4s\n"
+ "add v20.4s, v20.4s, v11.4s\n"
+ "add v21.4s, v21.4s, v11.4s\n"
+ "add v22.4s, v22.4s, v11.4s\n"
+ "add v23.4s, v23.4s, v11.4s\n"
+ "add v24.4s, v24.4s, v11.4s\n"
+ "add v25.4s, v25.4s, v11.4s\n"
+ "add v26.4s, v26.4s, v11.4s\n"
+ "add v27.4s, v27.4s, v11.4s\n"
+ "add v28.4s, v28.4s, v11.4s\n"
+ "add v29.4s, v29.4s, v11.4s\n"
+ "add v30.4s, v30.4s, v11.4s\n"
+ "add v31.4s, v31.4s, v11.4s\n"
+ "smin v16.4s, v16.4s, v14.4s\n"
+ "smin v17.4s, v17.4s, v14.4s\n"
+ "smin v18.4s, v18.4s, v14.4s\n"
+ "smin v19.4s, v19.4s, v14.4s\n"
+ "smin v20.4s, v20.4s, v14.4s\n"
+ "smin v21.4s, v21.4s, v14.4s\n"
+ "smin v22.4s, v22.4s, v14.4s\n"
+ "smin v23.4s, v23.4s, v14.4s\n"
+ "smin v24.4s, v24.4s, v14.4s\n"
+ "smin v25.4s, v25.4s, v14.4s\n"
+ "smin v26.4s, v26.4s, v14.4s\n"
+ "smin v27.4s, v27.4s, v14.4s\n"
+ "smin v28.4s, v28.4s, v14.4s\n"
+ "smin v29.4s, v29.4s, v14.4s\n"
+ "smin v30.4s, v30.4s, v14.4s\n"
+ "smin v31.4s, v31.4s, v14.4s\n"
+ "smax v16.4s, v16.4s, v15.4s\n"
+ "smax v17.4s, v17.4s, v15.4s\n"
+ "smax v18.4s, v18.4s, v15.4s\n"
+ "smax v19.4s, v19.4s, v15.4s\n"
+ "smax v20.4s, v20.4s, v15.4s\n"
+ "smax v21.4s, v21.4s, v15.4s\n"
+ "smax v22.4s, v22.4s, v15.4s\n"
+ "smax v23.4s, v23.4s, v15.4s\n"
+ "smax v24.4s, v24.4s, v15.4s\n"
+ "smax v25.4s, v25.4s, v15.4s\n"
+ "smax v26.4s, v26.4s, v15.4s\n"
+ "smax v27.4s, v27.4s, v15.4s\n"
+ "smax v28.4s, v28.4s, v15.4s\n"
+ "smax v29.4s, v29.4s, v15.4s\n"
+ "smax v30.4s, v30.4s, v15.4s\n"
+ "smax v31.4s, v31.4s, v15.4s\n"
+ "uzp1 v16.16b, v16.16b, v16.16b\n"
+ "uzp1 v17.16b, v17.16b, v17.16b\n"
+ "uzp1 v18.16b, v18.16b, v18.16b\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "uzp1 v16.16b, v16.16b, v16.16b\n"
+ "uzp1 v17.16b, v17.16b, v17.16b\n"
+ "uzp1 v18.16b, v18.16b, v18.16b\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "tbz %x[n_output_channels], #1, 24f\n"
+ "ldr x27, [%x[outptrs], #0x0]\n"
+ "ldr x26, [%x[outptrs], #0x8]\n"
+ "add x27, x27, x9\n"
+ "add x26, x26, x9\n"
+ "ldr x25, [%x[outptrs], #0x10]\n"
+ "ldr x24, [%x[outptrs], #0x18]\n"
+ "add x25, x25, x9\n"
+ "add x24, x24, x9\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "ldr x22, [%x[outptrs], #0x28]\n"
+ "add x23, x23, x9\n"
+ "add x22, x22, x9\n"
+ "ldr x21, [%x[outptrs], #0x30]\n"
+ "ldr x20, [%x[outptrs], #0x38]\n"
+ "add x21, x21, x9\n"
+ "add x20, x20, x9\n"
+ "st1 { v16.h }[0], [x27]\n"
+ "ldr x27, [%x[outptrs], #0x40]\n"
+ "add x27, x27, x9\n"
+ "st1 { v17.h }[0], [x26]\n"
+ "ldr x26, [%x[outptrs], #0x48]\n"
+ "add x26, x26, x9\n"
+ "st1 { v18.h }[0], [x25]\n"
+ "ldr x25, [%x[outptrs], #0x50]\n"
+ "add x25, x25, x9\n"
+ "st1 { v19.h }[0], [x24]\n"
+ "ldr x24, [%x[outptrs], #0x58]\n"
+ "add x24, x24, x9\n"
+ "st1 { v20.h }[0], [x23]\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
+ "add x23, x23, x9\n"
+ "st1 { v21.h }[0], [x22]\n"
+ "ldr x22, [%x[outptrs], #0x68]\n"
+ "add x22, x22, x9\n"
+ "st1 { v22.h }[0], [x21]\n"
+ "ldr x21, [%x[outptrs], #0x70]\n"
+ "add x21, x21, x9\n"
+ "st1 { v23.h }[0], [x20]\n"
+ "ldr x20, [%x[outptrs], #0x78]\n"
+ "add x20, x20, x9\n"
+ "add x9, x9, #0x2\n"
+ "st1 { v24.h }[0], [x27]\n"
+ "st1 { v25.h }[0], [x26]\n"
+ "st1 { v26.h }[0], [x25]\n"
+ "st1 { v27.h }[0], [x24]\n"
+ "st1 { v28.h }[0], [x23]\n"
+ "st1 { v29.h }[0], [x22]\n"
+ "st1 { v30.h }[0], [x21]\n"
+ "st1 { v31.h }[0], [x20]\n"
+ "tbz %x[n_output_channels], #0, 25f\n"
+ "ldr x27, [%x[outptrs], #0x0]\n"
+ "ldr x26, [%x[outptrs], #0x8]\n"
+ "add x27, x27, x9\n"
+ "add x26, x26, x9\n"
+ "ldr x25, [%x[outptrs], #0x10]\n"
+ "ldr x24, [%x[outptrs], #0x18]\n"
+ "add x25, x25, x9\n"
+ "add x24, x24, x9\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "ldr x22, [%x[outptrs], #0x28]\n"
+ "add x23, x23, x9\n"
+ "add x22, x22, x9\n"
+ "ldr x21, [%x[outptrs], #0x30]\n"
+ "ldr x20, [%x[outptrs], #0x38]\n"
+ "add x21, x21, x9\n"
+ "add x20, x20, x9\n"
+ "st1 { v16.b }[2], [x27]\n"
+ "ldr x27, [%x[outptrs], #0x40]\n"
+ "add x27, x27, x9\n"
+ "st1 { v17.b }[2], [x26]\n"
+ "ldr x26, [%x[outptrs], #0x48]\n"
+ "add x26, x26, x9\n"
+ "st1 { v18.b }[2], [x25]\n"
+ "ldr x25, [%x[outptrs], #0x50]\n"
+ "add x25, x25, x9\n"
+ "st1 { v19.b }[2], [x24]\n"
+ "ldr x24, [%x[outptrs], #0x58]\n"
+ "add x24, x24, x9\n"
+ "st1 { v20.b }[2], [x23]\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
+ "add x23, x23, x9\n"
+ "st1 { v21.b }[2], [x22]\n"
+ "ldr x22, [%x[outptrs], #0x68]\n"
+ "add x22, x22, x9\n"
+ "st1 { v22.b }[2], [x21]\n"
+ "ldr x21, [%x[outptrs], #0x70]\n"
+ "add x21, x21, x9\n"
+ "st1 { v23.b }[2], [x20]\n"
+ "ldr x20, [%x[outptrs], #0x78]\n"
+ "add x20, x20, x9\n"
+ "st1 { v24.b }[2], [x27]\n"
+ "st1 { v25.b }[2], [x26]\n"
+ "st1 { v26.b }[2], [x25]\n"
+ "st1 { v27.b }[2], [x24]\n"
+ "st1 { v28.b }[2], [x23]\n"
+ "st1 { v29.b }[2], [x22]\n"
+ "st1 { v30.b }[2], [x21]\n"
+ "st1 { v31.b }[2], [x20]\n"
+ "b 25f\n"
+ "24:" // Output channel oddments: Done: Store: Bit 1: Unset
+ "ldr x27, [%x[outptrs], #0x0]\n"
+ "ldr x26, [%x[outptrs], #0x8]\n"
+ "add x27, x27, x9\n"
+ "add x26, x26, x9\n"
+ "ldr x25, [%x[outptrs], #0x10]\n"
+ "ldr x24, [%x[outptrs], #0x18]\n"
+ "add x25, x25, x9\n"
+ "add x24, x24, x9\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "ldr x22, [%x[outptrs], #0x28]\n"
+ "add x23, x23, x9\n"
+ "add x22, x22, x9\n"
+ "ldr x21, [%x[outptrs], #0x30]\n"
+ "ldr x20, [%x[outptrs], #0x38]\n"
+ "add x21, x21, x9\n"
+ "add x20, x20, x9\n"
+ "st1 { v16.b }[0], [x27]\n"
+ "ldr x27, [%x[outptrs], #0x40]\n"
+ "add x27, x27, x9\n"
+ "st1 { v17.b }[0], [x26]\n"
+ "ldr x26, [%x[outptrs], #0x48]\n"
+ "add x26, x26, x9\n"
+ "st1 { v18.b }[0], [x25]\n"
+ "ldr x25, [%x[outptrs], #0x50]\n"
+ "add x25, x25, x9\n"
+ "st1 { v19.b }[0], [x24]\n"
+ "ldr x24, [%x[outptrs], #0x58]\n"
+ "add x24, x24, x9\n"
+ "st1 { v20.b }[0], [x23]\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
+ "add x23, x23, x9\n"
+ "st1 { v21.b }[0], [x22]\n"
+ "ldr x22, [%x[outptrs], #0x68]\n"
+ "add x22, x22, x9\n"
+ "st1 { v22.b }[0], [x21]\n"
+ "ldr x21, [%x[outptrs], #0x70]\n"
+ "add x21, x21, x9\n"
+ "st1 { v23.b }[0], [x20]\n"
+ "ldr x20, [%x[outptrs], #0x78]\n"
+ "add x20, x20, x9\n"
+ "st1 { v24.b }[0], [x27]\n"
+ "st1 { v25.b }[0], [x26]\n"
+ "st1 { v26.b }[0], [x25]\n"
+ "st1 { v27.b }[0], [x24]\n"
+ "st1 { v28.b }[0], [x23]\n"
+ "st1 { v29.b }[0], [x22]\n"
+ "st1 { v30.b }[0], [x21]\n"
+ "st1 { v31.b }[0], [x20]\n"
+ "25:" // Output channel oddments: Done: Store: Bit 1: End
+ "26:" // Done
+ : [weights] "+&r" (weights)
+ : [bias] "r" (bias), [inptrs] "r" (inptrs), [kernel_points] "r" ((uint64_t) kernel_points), [n_output_channels] "r" ((uint64_t) n_output_channels), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [offsetof_Requantize32_per_layer_left_shift] "I" (offsetof(arm_gemm::Requantize32, per_layer_left_shift)), [offsetof_Requantize32_per_layer_mul] "I" (offsetof(arm_gemm::Requantize32, per_layer_mul)), [offsetof_Requantize32_per_layer_right_shift] "I" (offsetof(arm_gemm::Requantize32, per_layer_right_shift)), [outptrs] "r" (outptrs), [qp] "r" (&qp), [rq_left_shift_ptr] "r" (per_channel_left_shifts), [rq_mul_ptr] "r" (per_channel_muls), [rq_right_shift_ptr] "r" (per_channel_right_shifts)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8qa_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8qa_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
new file mode 100644
index 0000000000..db73c88187
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8qa_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2022-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "utils.hpp"
+#include "src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_u8qa_nhwc_3x3_s1_output2x2_mla_depthfirst_impl(
+  const unsigned int,  // n_channels: number of channels to process
+  const uint8_t *const *const,  // inptrs: array of input row pointers
+  const uint8_t *const,  // weights: packed kernel weights
+  const int32_t *const,  // bias: per-channel bias, may be null
+  const arm_gemm::Requantize32 &,  // qp: requantization parameters (offsets, muls, shifts)
+  const int32_t *const,  // requant_muls: per-channel requantize multipliers
+  const int32_t *const,  // requant_shifts: per-channel requantize shifts
+  uint8_t *const *const  // outptrs: array of output pointers
+);
+
+// Depthwise-depthfirst strategy binding the hand-written AArch64 NEON kernel
+// (u8 asymmetric-quantized, NHWC, 3x3 kernel, stride 1, 2x2 output tile)
+// into the generic depthwise framework.
+class a64_u8qa_nhwc_3x3_s1_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy<uint8_t, uint8_t, uint8_t, int32_t>
+{
+  using Parent = DepthwiseDepthfirstStrategy<uint8_t, uint8_t, uint8_t, int32_t>;
+
+  public:
+  constexpr static unsigned int kernel_rows = 3;  // 3x3 convolution window
+  constexpr static unsigned int kernel_cols = 3;
+
+  constexpr static unsigned int stride_rows = 1;  // unit stride in both dimensions
+  constexpr static unsigned int stride_cols = 1;
+
+  a64_u8qa_nhwc_3x3_s1_output2x2_mla_depthfirst(const CPUInfo *) : Parent(2, 2, 3, 3, 1, 1) {}  // Parent(out_rows, out_cols, kern_rows, kern_cols, stride_rows, stride_cols)
+
+  arm_gemm::VLType get_vl_type(void) const override { return arm_gemm::VLType::None; }  // fixed-width NEON kernel (not vector-length agnostic)
+
+  Parent::KernelType kernel = a64_u8qa_nhwc_3x3_s1_output2x2_mla_depthfirst_impl;
+  Parent::KernelType get_kernel(void) const override { return kernel; }
+  unsigned int get_accumulator_depth_vl(void) const override { return 2; }  // accumulate over two vector-lengths of channels per pass -- presumably matches the asm register blocking; confirm against the kernel
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8qa_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8qa_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp
new file mode 100644
index 0000000000..d1872c90f8
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8qa_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp
@@ -0,0 +1,1164 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_gemm.hpp"
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_u8qa_nhwc_3x3_s1_output2x2_mla_depthfirst_impl(
+ const unsigned int n_channels,
+ const uint8_t *const *const inptrs,
+ const uint8_t *const weights,
+ const int32_t *const bias,
+ const arm_gemm::Requantize32 &qp,
+ const int32_t *const requant_muls,
+ const int32_t *const requant_shifts,
+ uint8_t *const *const outptrs
+)
+{
+ struct Params
+ {
+ long unsigned int n_channels;
+ const void *weights;
+ const int32_t *bias;
+ const arm_gemm::Requantize32 *requant;
+ const int32_t *const requant_muls;
+ const int32_t *const requant_shifts;
+ uint8_t *const *const outptrs;
+ const uint8_t *inptrs[16];
+
+ Params(
+ long unsigned int n_channels,
+ const uint8_t *const *inptrs_raw,
+ const void *const weights,
+ const int32_t *const bias,
+ const arm_gemm::Requantize32 &qp,
+ const int32_t *const requant_muls,
+ const int32_t *const requant_shifts,
+ uint8_t *const *outptrs
+ ) : n_channels(n_channels), weights(weights), bias(bias),
+ requant(&qp), requant_muls(requant_muls),
+ requant_shifts(requant_shifts), outptrs(outptrs)
+ {
+ inptrs[0] = inptrs_raw[5];
+ inptrs[1] = inptrs_raw[0];
+ inptrs[2] = inptrs_raw[3];
+ inptrs[3] = inptrs_raw[6];
+ inptrs[4] = inptrs_raw[9];
+ inptrs[5] = inptrs_raw[12];
+ inptrs[6] = inptrs_raw[15];
+ inptrs[7] = inptrs_raw[1];
+ inptrs[8] = inptrs_raw[2];
+ inptrs[9] = inptrs_raw[10];
+ inptrs[10] = inptrs_raw[4];
+ inptrs[11] = inptrs_raw[7];
+ inptrs[12] = inptrs_raw[8];
+ inptrs[13] = inptrs_raw[11];
+ inptrs[14] = inptrs_raw[13];
+ inptrs[15] = inptrs_raw[14];
+
+ }
+ };
+
+ const Params params(n_channels, inptrs, weights, bias, qp,
+ requant_muls, requant_shifts, outptrs);
+
+ __asm__ __volatile__(
+ "ldr x16, [%x[params], %[offsetof_Params_n_channels]]\n"
+ "ldr x23, [%x[params], %[offsetof_Params_requant]]\n"
+ "lsr x15, x16, #0x3\n"
+ "add x20, x23, %[offsetof_Requantize32_b_offset]\n"
+ "ld1r { v18.16b }, [x20]\n"
+ "ldr x22, [%x[params], %[offsetof_Params_outptrs]]\n"
+ "add x21, x23, %[offsetof_Requantize32_c_offset]\n"
+ "add x20, x23, %[offsetof_Requantize32_minval]\n"
+ "ld1r { v5.8h }, [x21]\n"
+ "ld1r { v14.8h }, [x20]\n"
+ "add x20, x23, %[offsetof_Requantize32_maxval]\n"
+ "mov x14, #0x0\n"
+ "ld1r { v12.8h }, [x20]\n"
+ "mov x13, #0x0\n"
+ "add x12, %x[params], %[offsetof_Params_inptrs]\n"
+ "ldr x11, [%x[params], %[offsetof_Params_weights]]\n"
+ "ldr x10, [%x[params], %[offsetof_Params_requant_muls]]\n"
+ "ldr x9, [%x[params], %[offsetof_Params_requant_shifts]]\n"
+ "ldp x28, x27, [x22, #0x0]\n"
+ "ldp x26, x25, [x22, #0x10]\n"
+ "cbz x15, 3f\n"
+ "ldr d19, [x11, #0x0]\n"
+ "ldr d7, [x11, #0x8]\n"
+ "subs x15, x15, #0x1\n"
+ "usubl v19.8h, v19.8b, v18.8b\n"
+ "ldr d1, [x11, #0x10]\n"
+ "ldr d17, [x11, #0x18]\n"
+ "usubl v7.8h, v7.8b, v18.8b\n"
+ "usubl v1.8h, v1.8b, v18.8b\n"
+ "ldr d8, [x11, #0x20]\n"
+ "ldr d31, [x11, #0x28]\n"
+ "usubl v17.8h, v17.8b, v18.8b\n"
+ "usubl v8.8h, v8.8b, v18.8b\n"
+ "ldr d29, [x11, #0x30]\n"
+ "ldr d16, [x11, #0x38]\n"
+ "usubl v31.8h, v31.8b, v18.8b\n"
+ "usubl v29.8h, v29.8b, v18.8b\n"
+ "ldr d4, [x11, #0x40]\n"
+ "ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "usubl v16.8h, v16.8b, v18.8b\n"
+ "usubl v4.8h, v4.8b, v18.8b\n"
+ "ldr q28, [x20, #0x0]\n"
+ "ldr q9, [x20, #0x10]\n"
+ "add x20, x20, #0x20\n"
+ "str x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldp x23, x22, [x12, #0x0]\n"
+ "ldp x21, x20, [x12, #0x10]\n"
+ "mov v3.16b, v28.16b\n"
+ "mov v30.16b, v9.16b\n"
+ "ldr d23, [x23, x14]\n"
+ "ldr d10, [x22, x14]\n"
+ "mov v0.16b, v28.16b\n"
+ "mov v22.16b, v9.16b\n"
+ "ldr d11, [x21, x14]\n"
+ "ldr d13, [x20, x14]\n"
+ "mov v6.16b, v28.16b\n"
+ "mov v2.16b, v9.16b\n"
+ "ldr x20, [x12, #0x20]\n"
+ "ldr d27, [x20, x14]\n"
+ "ushll v23.8h, v23.8b, #0x0\n"
+ "ushll v10.8h, v10.8b, #0x0\n"
+ "ushll v11.8h, v11.8b, #0x0\n"
+ "ushll v13.8h, v13.8b, #0x0\n"
+ "ushll v27.8h, v27.8b, #0x0\n"
+ "beq 2f\n"
+ "1:" // Loop
+ "ldr q24, [x10, #0x0]\n"
+ "ldr q25, [x9, #0x0]\n"
+ "smlal v28.4s, v23.4h, v8.4h\n"
+ "smlal2 v9.4s, v23.8h, v8.8h\n"
+ "ldr q20, [x10, #0x10]\n"
+ "ldr q26, [x9, #0x10]\n"
+ "smlal v28.4s, v10.4h, v19.4h\n"
+ "smlal v3.4s, v23.4h, v17.4h\n"
+ "ldr x20, [x12, #0x28]\n"
+ "ldr d21, [x20, x14]\n"
+ "smlal v0.4s, v23.4h, v7.4h\n"
+ "smlal v6.4s, v23.4h, v19.4h\n"
+ "smlal2 v9.4s, v10.8h, v19.8h\n"
+ "ldr x20, [x12, #0x38]\n"
+ "ldr d10, [x20, x14]\n"
+ "smlal v28.4s, v13.4h, v31.4h\n"
+ "smlal2 v30.4s, v23.8h, v17.8h\n"
+ "smlal2 v22.4s, v23.8h, v7.8h\n"
+ "ldr x20, [x12, #0x30]\n"
+ "ldr d15, [x20, x14]\n"
+ "smlal2 v2.4s, v23.8h, v19.8h\n"
+ "smlal v3.4s, v11.4h, v1.4h\n"
+ "ushll v21.8h, v21.8b, #0x0\n"
+ "ldr x20, [x12, #0x40]\n"
+ "ldr d23, [x20, x14]\n"
+ "smlal v0.4s, v13.4h, v1.4h\n"
+ "smlal v6.4s, v13.4h, v7.4h\n"
+ "ushll v10.8h, v10.8b, #0x0\n"
+ "smlal2 v9.4s, v13.8h, v31.8h\n"
+ "smlal v28.4s, v27.4h, v16.4h\n"
+ "ldr x20, [x12, #0x48]\n"
+ "ushll v15.8h, v15.8b, #0x0\n"
+ "smlal2 v30.4s, v11.8h, v1.8h\n"
+ "ldr d11, [x20, x14]\n"
+ "smlal2 v22.4s, v13.8h, v1.8h\n"
+ "ushll v23.8h, v23.8b, #0x0\n"
+ "smlal2 v2.4s, v13.8h, v7.8h\n"
+ "smlal v3.4s, v13.4h, v8.4h\n"
+ "ldr x21, [x12, #0x50]\n"
+ "ldr x20, [x12, #0x58]\n"
+ "smlal v0.4s, v21.4h, v29.4h\n"
+ "smlal v6.4s, v27.4h, v17.4h\n"
+ "ushll v11.8h, v11.8b, #0x0\n"
+ "ldr x24, [x12, #0x60]\n"
+ "smlal2 v9.4s, v27.8h, v16.8h\n"
+ "smlal v28.4s, v10.4h, v7.4h\n"
+ "ldr x23, [x12, #0x68]\n"
+ "ldr x22, [x12, #0x70]\n"
+ "smlal2 v30.4s, v13.8h, v8.8h\n"
+ "ldr d13, [x21, x14]\n"
+ "smlal2 v22.4s, v21.8h, v29.8h\n"
+ "ldr d21, [x20, x14]\n"
+ "smlal2 v2.4s, v27.8h, v17.8h\n"
+ "smlal v3.4s, v27.4h, v29.4h\n"
+ "ushll v13.8h, v13.8b, #0x0\n"
+ "ldr x21, [x12, #0x78]\n"
+ "smlal v0.4s, v27.4h, v8.4h\n"
+ "smlal v6.4s, v15.4h, v4.4h\n"
+ "ushll v21.8h, v21.8b, #0x0\n"
+ "ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "smlal2 v9.4s, v10.8h, v7.8h\n"
+ "smlal v28.4s, v23.4h, v1.4h\n"
+ "add x11, x11, #0x48\n"
+ "subs x15, x15, #0x1\n"
+ "smlal2 v30.4s, v27.8h, v29.8h\n"
+ "smlal2 v22.4s, v27.8h, v8.8h\n"
+ "ldr d27, [x24, x14]\n"
+ "ushll v27.8h, v27.8b, #0x0\n"
+ "smlal2 v2.4s, v15.8h, v4.8h\n"
+ "ldr d15, [x23, x14]\n"
+ "smlal v3.4s, v10.4h, v19.4h\n"
+ "ushll v15.8h, v15.8b, #0x0\n"
+ "smlal v0.4s, v11.4h, v31.4h\n"
+ "smlal v6.4s, v11.4h, v8.4h\n"
+ "add x10, x10, #0x20\n"
+ "add x9, x9, #0x20\n"
+ "smlal2 v9.4s, v23.8h, v1.8h\n"
+ "smlal v28.4s, v11.4h, v4.4h\n"
+ "smlal2 v30.4s, v10.8h, v19.8h\n"
+ "ldr d10, [x22, x14]\n"
+ "smlal2 v22.4s, v11.8h, v31.8h\n"
+ "ushll v10.8h, v10.8b, #0x0\n"
+ "smlal2 v2.4s, v11.8h, v8.8h\n"
+ "ldr d8, [x21, x14]\n"
+ "smlal v3.4s, v23.4h, v7.4h\n"
+ "ushll v8.8h, v8.8b, #0x0\n"
+ "smlal v0.4s, v13.4h, v19.4h\n"
+ "smlal v6.4s, v21.4h, v1.4h\n"
+ "add x14, x14, #0x8\n"
+ "smlal2 v9.4s, v11.8h, v4.8h\n"
+ "smlal v28.4s, v13.4h, v17.4h\n"
+ "smlal2 v30.4s, v23.8h, v7.8h\n"
+ "smlal2 v22.4s, v13.8h, v19.8h\n"
+ "smlal2 v2.4s, v21.8h, v1.8h\n"
+ "smlal v3.4s, v11.4h, v16.4h\n"
+ "smlal v0.4s, v27.4h, v17.4h\n"
+ "smlal v6.4s, v15.4h, v31.4h\n"
+ "smlal2 v9.4s, v13.8h, v17.8h\n"
+ "smlal v28.4s, v27.4h, v29.4h\n"
+ "sqrdmulh v28.4s, v28.4s, v24.4s\n"
+ "smlal2 v30.4s, v11.8h, v16.8h\n"
+ "smlal2 v22.4s, v27.8h, v17.8h\n"
+ "and v17.16b, v28.16b, v25.16b\n"
+ "smlal2 v2.4s, v15.8h, v31.8h\n"
+ "smlal v3.4s, v21.4h, v31.4h\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "smlal v0.4s, v10.4h, v16.4h\n"
+ "smlal v6.4s, v10.4h, v29.4h\n"
+ "sqadd v28.4s, v28.4s, v17.4s\n"
+ "smlal2 v9.4s, v27.8h, v29.8h\n"
+ "smlal2 v30.4s, v21.8h, v31.8h\n"
+ "sqrdmulh v9.4s, v9.4s, v20.4s\n"
+ "smlal2 v22.4s, v10.8h, v16.8h\n"
+ "smlal2 v2.4s, v10.8h, v29.8h\n"
+ "and v23.16b, v9.16b, v26.16b\n"
+ "smlal v3.4s, v15.4h, v4.4h\n"
+ "smlal v0.4s, v8.4h, v4.4h\n"
+ "sqrdmulh v3.4s, v3.4s, v24.4s\n"
+ "smlal v6.4s, v8.4h, v16.4h\n"
+ "smlal2 v30.4s, v15.8h, v4.8h\n"
+ "sqrdmulh v0.4s, v0.4s, v24.4s\n"
+ "smlal2 v22.4s, v8.8h, v4.8h\n"
+ "smlal2 v2.4s, v8.8h, v16.8h\n"
+ "sqrdmulh v6.4s, v6.4s, v24.4s\n"
+ "sshr v23.4s, v23.4s, #0x1f\n"
+ "and v8.16b, v3.16b, v25.16b\n"
+ "sqrdmulh v30.4s, v30.4s, v20.4s\n"
+ "and v11.16b, v0.16b, v25.16b\n"
+ "sqrdmulh v22.4s, v22.4s, v20.4s\n"
+ "and v29.16b, v6.16b, v25.16b\n"
+ "sqrdmulh v2.4s, v2.4s, v20.4s\n"
+ "sqadd v9.4s, v9.4s, v23.4s\n"
+ "sshr v8.4s, v8.4s, #0x1f\n"
+ "and v13.16b, v30.16b, v26.16b\n"
+ "sshr v11.4s, v11.4s, #0x1f\n"
+ "and v21.16b, v22.16b, v26.16b\n"
+ "sshr v29.4s, v29.4s, #0x1f\n"
+ "and v23.16b, v2.16b, v26.16b\n"
+ "sqadd v3.4s, v3.4s, v8.4s\n"
+ "sshr v13.4s, v13.4s, #0x1f\n"
+ "sqadd v0.4s, v0.4s, v11.4s\n"
+ "sshr v21.4s, v21.4s, #0x1f\n"
+ "sqadd v6.4s, v6.4s, v29.4s\n"
+ "sshr v23.4s, v23.4s, #0x1f\n"
+ "srshl v28.4s, v28.4s, v25.4s\n"
+ "srshl v3.4s, v3.4s, v25.4s\n"
+ "sqadd v30.4s, v30.4s, v13.4s\n"
+ "srshl v0.4s, v0.4s, v25.4s\n"
+ "sqadd v22.4s, v22.4s, v21.4s\n"
+ "srshl v6.4s, v6.4s, v25.4s\n"
+ "sqadd v2.4s, v2.4s, v23.4s\n"
+ "srshl v9.4s, v9.4s, v26.4s\n"
+ "sqxtn v28.4h, v28.4s\n"
+ "srshl v30.4s, v30.4s, v26.4s\n"
+ "sqxtn v3.4h, v3.4s\n"
+ "srshl v22.4s, v22.4s, v26.4s\n"
+ "sqxtn v0.4h, v0.4s\n"
+ "srshl v2.4s, v2.4s, v26.4s\n"
+ "sqxtn v6.4h, v6.4s\n"
+ "sqxtn2 v28.8h, v9.4s\n"
+ "sqxtn2 v3.8h, v30.4s\n"
+ "sqxtn2 v0.8h, v22.4s\n"
+ "sqxtn2 v6.8h, v2.4s\n"
+ "sqadd v28.8h, v28.8h, v5.8h\n"
+ "sqadd v3.8h, v3.8h, v5.8h\n"
+ "sqadd v0.8h, v0.8h, v5.8h\n"
+ "sqadd v6.8h, v6.8h, v5.8h\n"
+ "smax v28.8h, v28.8h, v14.8h\n"
+ "smax v3.8h, v3.8h, v14.8h\n"
+ "smax v0.8h, v0.8h, v14.8h\n"
+ "smax v6.8h, v6.8h, v14.8h\n"
+ "smin v28.8h, v28.8h, v12.8h\n"
+ "smin v3.8h, v3.8h, v12.8h\n"
+ "smin v0.8h, v0.8h, v12.8h\n"
+ "smin v6.8h, v6.8h, v12.8h\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "str d28, [x28, x13]\n"
+ "uzp1 v3.16b, v3.16b, v3.16b\n"
+ "uzp1 v0.16b, v0.16b, v0.16b\n"
+ "str d3, [x27, x13]\n"
+ "uzp1 v6.16b, v6.16b, v6.16b\n"
+ "str d0, [x26, x13]\n"
+ "str d6, [x25, x13]\n"
+ "ldr q28, [x20, #0x0]\n"
+ "ldr q9, [x20, #0x10]\n"
+ "add x20, x20, #0x20\n"
+ "ldr d19, [x11, #0x0]\n"
+ "ldr d7, [x11, #0x8]\n"
+ "add x13, x13, #0x8\n"
+ "str x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldr d1, [x11, #0x10]\n"
+ "ldr d17, [x11, #0x18]\n"
+ "mov v3.16b, v28.16b\n"
+ "mov v30.16b, v9.16b\n"
+ "ldr d8, [x11, #0x20]\n"
+ "ldr d31, [x11, #0x28]\n"
+ "mov v0.16b, v28.16b\n"
+ "mov v22.16b, v9.16b\n"
+ "ldr d29, [x11, #0x30]\n"
+ "ldr d16, [x11, #0x38]\n"
+ "mov v6.16b, v28.16b\n"
+ "mov v2.16b, v9.16b\n"
+ "ldr d4, [x11, #0x40]\n"
+ "ldp x23, x22, [x12, #0x0]\n"
+ "usubl v19.8h, v19.8b, v18.8b\n"
+ "usubl v7.8h, v7.8b, v18.8b\n"
+ "ldp x21, x20, [x12, #0x10]\n"
+ "ldr d23, [x23, x14]\n"
+ "usubl v1.8h, v1.8b, v18.8b\n"
+ "usubl v17.8h, v17.8b, v18.8b\n"
+ "ldr d10, [x22, x14]\n"
+ "ldr d11, [x21, x14]\n"
+ "usubl v8.8h, v8.8b, v18.8b\n"
+ "usubl v31.8h, v31.8b, v18.8b\n"
+ "ldr d13, [x20, x14]\n"
+ "ldr x20, [x12, #0x20]\n"
+ "usubl v29.8h, v29.8b, v18.8b\n"
+ "usubl v16.8h, v16.8b, v18.8b\n"
+ "ldr d27, [x20, x14]\n"
+ "usubl v4.8h, v4.8b, v18.8b\n"
+ "ushll v23.8h, v23.8b, #0x0\n"
+ "ushll v10.8h, v10.8b, #0x0\n"
+ "ushll v11.8h, v11.8b, #0x0\n"
+ "ushll v13.8h, v13.8b, #0x0\n"
+ "ushll v27.8h, v27.8b, #0x0\n"
+ "bgt 1b\n"
+ "2:" // Tail
+ "ldr q26, [x10, #0x0]\n"
+ "ldr q25, [x9, #0x0]\n"
+ "smlal v28.4s, v23.4h, v8.4h\n"
+ "smlal2 v9.4s, v23.8h, v8.8h\n"
+ "ldr q24, [x10, #0x10]\n"
+ "ldr q20, [x9, #0x10]\n"
+ "smlal v28.4s, v10.4h, v19.4h\n"
+ "smlal v3.4s, v23.4h, v17.4h\n"
+ "ldr x20, [x12, #0x28]\n"
+ "ldr d21, [x20, x14]\n"
+ "smlal v0.4s, v23.4h, v7.4h\n"
+ "smlal v6.4s, v23.4h, v19.4h\n"
+ "smlal2 v9.4s, v10.8h, v19.8h\n"
+ "ldr x20, [x12, #0x38]\n"
+ "ldr d15, [x20, x14]\n"
+ "smlal v28.4s, v13.4h, v31.4h\n"
+ "smlal2 v30.4s, v23.8h, v17.8h\n"
+ "smlal2 v22.4s, v23.8h, v7.8h\n"
+ "ldr x20, [x12, #0x30]\n"
+ "ldr d10, [x20, x14]\n"
+ "smlal2 v2.4s, v23.8h, v19.8h\n"
+ "smlal v3.4s, v11.4h, v1.4h\n"
+ "ushll v21.8h, v21.8b, #0x0\n"
+ "ldr x20, [x12, #0x40]\n"
+ "ldr d23, [x20, x14]\n"
+ "smlal v0.4s, v13.4h, v1.4h\n"
+ "smlal v6.4s, v13.4h, v7.4h\n"
+ "ushll v15.8h, v15.8b, #0x0\n"
+ "smlal2 v9.4s, v13.8h, v31.8h\n"
+ "smlal v28.4s, v27.4h, v16.4h\n"
+ "ldr x20, [x12, #0x48]\n"
+ "ushll v10.8h, v10.8b, #0x0\n"
+ "smlal2 v30.4s, v11.8h, v1.8h\n"
+ "ldr d11, [x20, x14]\n"
+ "smlal2 v22.4s, v13.8h, v1.8h\n"
+ "ushll v23.8h, v23.8b, #0x0\n"
+ "smlal2 v2.4s, v13.8h, v7.8h\n"
+ "smlal v3.4s, v13.4h, v8.4h\n"
+ "ldr x24, [x12, #0x50]\n"
+ "ldr x20, [x12, #0x58]\n"
+ "smlal v0.4s, v21.4h, v29.4h\n"
+ "smlal v6.4s, v27.4h, v17.4h\n"
+ "ushll v11.8h, v11.8b, #0x0\n"
+ "ldr x23, [x12, #0x60]\n"
+ "smlal2 v9.4s, v27.8h, v16.8h\n"
+ "smlal v28.4s, v15.4h, v7.4h\n"
+ "ldr x22, [x12, #0x68]\n"
+ "ldr x21, [x12, #0x70]\n"
+ "smlal2 v30.4s, v13.8h, v8.8h\n"
+ "ldr d13, [x24, x14]\n"
+ "smlal2 v22.4s, v21.8h, v29.8h\n"
+ "ldr d21, [x20, x14]\n"
+ "smlal2 v2.4s, v27.8h, v17.8h\n"
+ "smlal v3.4s, v27.4h, v29.4h\n"
+ "ushll v13.8h, v13.8b, #0x0\n"
+ "ldr x20, [x12, #0x78]\n"
+ "smlal v0.4s, v27.4h, v8.4h\n"
+ "smlal v6.4s, v10.4h, v4.4h\n"
+ "ushll v21.8h, v21.8b, #0x0\n"
+ "tst x16, #0x7\n"
+ "smlal2 v9.4s, v15.8h, v7.8h\n"
+ "smlal v28.4s, v23.4h, v1.4h\n"
+ "add x10, x10, #0x20\n"
+ "add x9, x9, #0x20\n"
+ "smlal2 v30.4s, v27.8h, v29.8h\n"
+ "smlal2 v22.4s, v27.8h, v8.8h\n"
+ "ldr d27, [x23, x14]\n"
+ "ushll v27.8h, v27.8b, #0x0\n"
+ "smlal2 v2.4s, v10.8h, v4.8h\n"
+ "ldr d10, [x22, x14]\n"
+ "smlal v3.4s, v15.4h, v19.4h\n"
+ "ushll v10.8h, v10.8b, #0x0\n"
+ "smlal v0.4s, v11.4h, v31.4h\n"
+ "smlal v6.4s, v11.4h, v8.4h\n"
+ "smlal2 v9.4s, v23.8h, v1.8h\n"
+ "smlal v28.4s, v11.4h, v4.4h\n"
+ "smlal2 v30.4s, v15.8h, v19.8h\n"
+ "ldr d15, [x21, x14]\n"
+ "smlal2 v22.4s, v11.8h, v31.8h\n"
+ "ushll v15.8h, v15.8b, #0x0\n"
+ "smlal2 v2.4s, v11.8h, v8.8h\n"
+ "ldr d8, [x20, x14]\n"
+ "smlal v3.4s, v23.4h, v7.4h\n"
+ "ushll v8.8h, v8.8b, #0x0\n"
+ "smlal v0.4s, v13.4h, v19.4h\n"
+ "smlal v6.4s, v21.4h, v1.4h\n"
+ "add x14, x14, #0x8\n"
+ "smlal2 v9.4s, v11.8h, v4.8h\n"
+ "smlal v28.4s, v13.4h, v17.4h\n"
+ "smlal2 v30.4s, v23.8h, v7.8h\n"
+ "smlal2 v22.4s, v13.8h, v19.8h\n"
+ "smlal2 v2.4s, v21.8h, v1.8h\n"
+ "smlal v3.4s, v11.4h, v16.4h\n"
+ "smlal v0.4s, v27.4h, v17.4h\n"
+ "smlal v6.4s, v10.4h, v31.4h\n"
+ "smlal2 v9.4s, v13.8h, v17.8h\n"
+ "smlal v28.4s, v27.4h, v29.4h\n"
+ "sqrdmulh v28.4s, v28.4s, v26.4s\n"
+ "smlal2 v30.4s, v11.8h, v16.8h\n"
+ "smlal2 v22.4s, v27.8h, v17.8h\n"
+ "and v1.16b, v28.16b, v25.16b\n"
+ "smlal2 v2.4s, v10.8h, v31.8h\n"
+ "smlal v3.4s, v21.4h, v31.4h\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "smlal v0.4s, v15.4h, v16.4h\n"
+ "smlal v6.4s, v15.4h, v29.4h\n"
+ "sqadd v28.4s, v28.4s, v1.4s\n"
+ "smlal2 v9.4s, v27.8h, v29.8h\n"
+ "smlal2 v30.4s, v21.8h, v31.8h\n"
+ "sqrdmulh v9.4s, v9.4s, v24.4s\n"
+ "smlal2 v22.4s, v15.8h, v16.8h\n"
+ "smlal2 v2.4s, v15.8h, v29.8h\n"
+ "and v27.16b, v9.16b, v20.16b\n"
+ "smlal v3.4s, v10.4h, v4.4h\n"
+ "smlal v0.4s, v8.4h, v4.4h\n"
+ "sqrdmulh v3.4s, v3.4s, v26.4s\n"
+ "smlal v6.4s, v8.4h, v16.4h\n"
+ "smlal2 v30.4s, v10.8h, v4.8h\n"
+ "sqrdmulh v0.4s, v0.4s, v26.4s\n"
+ "smlal2 v22.4s, v8.8h, v4.8h\n"
+ "smlal2 v2.4s, v8.8h, v16.8h\n"
+ "sqrdmulh v6.4s, v6.4s, v26.4s\n"
+ "sshr v27.4s, v27.4s, #0x1f\n"
+ "and v16.16b, v3.16b, v25.16b\n"
+ "sqrdmulh v30.4s, v30.4s, v24.4s\n"
+ "and v4.16b, v0.16b, v25.16b\n"
+ "sqrdmulh v22.4s, v22.4s, v24.4s\n"
+ "and v17.16b, v6.16b, v25.16b\n"
+ "sqrdmulh v2.4s, v2.4s, v24.4s\n"
+ "sqadd v9.4s, v9.4s, v27.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "and v8.16b, v30.16b, v20.16b\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "and v26.16b, v22.16b, v20.16b\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "and v11.16b, v2.16b, v20.16b\n"
+ "sqadd v3.4s, v3.4s, v16.4s\n"
+ "sshr v8.4s, v8.4s, #0x1f\n"
+ "sqadd v0.4s, v0.4s, v4.4s\n"
+ "sshr v26.4s, v26.4s, #0x1f\n"
+ "sqadd v6.4s, v6.4s, v17.4s\n"
+ "sshr v11.4s, v11.4s, #0x1f\n"
+ "srshl v28.4s, v28.4s, v25.4s\n"
+ "srshl v3.4s, v3.4s, v25.4s\n"
+ "sqadd v30.4s, v30.4s, v8.4s\n"
+ "srshl v0.4s, v0.4s, v25.4s\n"
+ "sqadd v22.4s, v22.4s, v26.4s\n"
+ "srshl v6.4s, v6.4s, v25.4s\n"
+ "sqadd v2.4s, v2.4s, v11.4s\n"
+ "srshl v9.4s, v9.4s, v20.4s\n"
+ "sqxtn v28.4h, v28.4s\n"
+ "srshl v30.4s, v30.4s, v20.4s\n"
+ "sqxtn v3.4h, v3.4s\n"
+ "srshl v22.4s, v22.4s, v20.4s\n"
+ "sqxtn v0.4h, v0.4s\n"
+ "srshl v2.4s, v2.4s, v20.4s\n"
+ "sqxtn v6.4h, v6.4s\n"
+ "sqxtn2 v28.8h, v9.4s\n"
+ "sqxtn2 v3.8h, v30.4s\n"
+ "sqxtn2 v0.8h, v22.4s\n"
+ "sqxtn2 v6.8h, v2.4s\n"
+ "sqadd v28.8h, v28.8h, v5.8h\n"
+ "sqadd v3.8h, v3.8h, v5.8h\n"
+ "sqadd v0.8h, v0.8h, v5.8h\n"
+ "sqadd v6.8h, v6.8h, v5.8h\n"
+ "smax v28.8h, v28.8h, v14.8h\n"
+ "smax v3.8h, v3.8h, v14.8h\n"
+ "smax v0.8h, v0.8h, v14.8h\n"
+ "smax v6.8h, v6.8h, v14.8h\n"
+ "smin v28.8h, v28.8h, v12.8h\n"
+ "smin v3.8h, v3.8h, v12.8h\n"
+ "smin v0.8h, v0.8h, v12.8h\n"
+ "smin v6.8h, v6.8h, v12.8h\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "str d28, [x28, x13]\n"
+ "uzp1 v3.16b, v3.16b, v3.16b\n"
+ "uzp1 v0.16b, v0.16b, v0.16b\n"
+ "str d3, [x27, x13]\n"
+ "uzp1 v6.16b, v6.16b, v6.16b\n"
+ "str d0, [x26, x13]\n"
+ "str d6, [x25, x13]\n"
+ "add x13, x13, #0x8\n"
+ "beq 64f\n"
+ "add x11, x11, #0x48\n"
+ "3:" // Oddments
+ "ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "tbz x16, #2, 5f\n"
+ "ld1 { v28.4s }, [x20], #0x10\n"
+ "tbz x16, #1, 4f\n"
+ "ld1 { v9.d }[0], [x20], #0x8\n"
+ "tbz x16, #0, 7f\n"
+ "ld1 { v9.s }[2], [x20]\n"
+ "b 7f\n"
+ "4:" // Oddments: Load bias: Bit 2: Bit 1: Unset
+ "tbz x16, #0, 7f\n"
+ "ld1 { v9.s }[0], [x20]\n"
+ "b 7f\n"
+ "5:" // Oddments: Load bias: Bit 2: Unset
+ "tbz x16, #1, 6f\n"
+ "ld1 { v28.d }[0], [x20], #0x8\n"
+ "tbz x16, #0, 7f\n"
+ "ld1 { v28.s }[2], [x20]\n"
+ "b 7f\n"
+ "6:" // Oddments: Load bias: Bit 2: Unset: Bit 1: Unset
+ "tbz x16, #0, 7f\n"
+ "ld1 { v28.s }[0], [x20]\n"
+ "7:" // Oddments: Load bias: Bit 2: End
+ "ldr d19, [x11, #0x0]\n"
+ "ldr d7, [x11, #0x8]\n"
+ "mov v3.16b, v28.16b\n"
+ "mov v30.16b, v9.16b\n"
+ "ldr d1, [x11, #0x10]\n"
+ "ldr d17, [x11, #0x18]\n"
+ "mov v0.16b, v28.16b\n"
+ "mov v22.16b, v9.16b\n"
+ "ldr d8, [x11, #0x20]\n"
+ "ldr d31, [x11, #0x28]\n"
+ "mov v6.16b, v28.16b\n"
+ "mov v2.16b, v9.16b\n"
+ "ldr d29, [x11, #0x30]\n"
+ "ldr d16, [x11, #0x38]\n"
+ "usubl v19.8h, v19.8b, v18.8b\n"
+ "usubl v7.8h, v7.8b, v18.8b\n"
+ "ldr d4, [x11, #0x40]\n"
+ "ldp x24, x23, [x12, #0x0]\n"
+ "usubl v1.8h, v1.8b, v18.8b\n"
+ "usubl v17.8h, v17.8b, v18.8b\n"
+ "ldp x22, x21, [x12, #0x10]\n"
+ "ldr x20, [x12, #0x20]\n"
+ "usubl v8.8h, v8.8b, v18.8b\n"
+ "usubl v31.8h, v31.8b, v18.8b\n"
+ "usubl v29.8h, v29.8b, v18.8b\n"
+ "usubl v16.8h, v16.8b, v18.8b\n"
+ "usubl v4.8h, v4.8b, v18.8b\n"
+ "add x24, x24, x14\n"
+ "add x23, x23, x14\n"
+ "add x22, x22, x14\n"
+ "add x21, x21, x14\n"
+ "add x20, x20, x14\n"
+ "tbz x16, #2, 9f\n"
+ "ld1 { v23.s }[0], [x24], #0x4\n"
+ "ld1 { v10.s }[0], [x23], #0x4\n"
+ "ld1 { v11.s }[0], [x22], #0x4\n"
+ "ld1 { v13.s }[0], [x21], #0x4\n"
+ "ld1 { v27.s }[0], [x20], #0x4\n"
+ "tbz x16, #1, 8f\n"
+ "ld1 { v23.h }[2], [x24], #0x2\n"
+ "ld1 { v10.h }[2], [x23], #0x2\n"
+ "ld1 { v11.h }[2], [x22], #0x2\n"
+ "ld1 { v13.h }[2], [x21], #0x2\n"
+ "ld1 { v27.h }[2], [x20], #0x2\n"
+ "tbz x16, #0, 11f\n"
+ "ld1 { v23.b }[6], [x24]\n"
+ "ld1 { v10.b }[6], [x23]\n"
+ "ld1 { v11.b }[6], [x22]\n"
+ "ld1 { v13.b }[6], [x21]\n"
+ "ld1 { v27.b }[6], [x20]\n"
+ "b 11f\n"
+ "8:" // Oddments: Initial loads: Bit 2: Bit 1: Unset
+ "tbz x16, #0, 11f\n"
+ "ld1 { v23.b }[4], [x24]\n"
+ "ld1 { v10.b }[4], [x23]\n"
+ "ld1 { v11.b }[4], [x22]\n"
+ "ld1 { v13.b }[4], [x21]\n"
+ "ld1 { v27.b }[4], [x20]\n"
+ "b 11f\n"
+ "9:" // Oddments: Initial loads: Bit 2: Unset
+ "tbz x16, #1, 10f\n"
+ "ld1 { v23.h }[0], [x24], #0x2\n"
+ "ld1 { v10.h }[0], [x23], #0x2\n"
+ "ld1 { v11.h }[0], [x22], #0x2\n"
+ "ld1 { v13.h }[0], [x21], #0x2\n"
+ "ld1 { v27.h }[0], [x20], #0x2\n"
+ "tbz x16, #0, 11f\n"
+ "ld1 { v23.b }[2], [x24]\n"
+ "ld1 { v10.b }[2], [x23]\n"
+ "ld1 { v11.b }[2], [x22]\n"
+ "ld1 { v13.b }[2], [x21]\n"
+ "ld1 { v27.b }[2], [x20]\n"
+ "b 11f\n"
+ "10:" // Oddments: Initial loads: Bit 2: Unset: Bit 1: Unset
+ "tbz x16, #0, 11f\n"
+ "ld1 { v23.b }[0], [x24]\n"
+ "ld1 { v10.b }[0], [x23]\n"
+ "ld1 { v11.b }[0], [x22]\n"
+ "ld1 { v13.b }[0], [x21]\n"
+ "ld1 { v27.b }[0], [x20]\n"
+ "11:" // Oddments: Initial loads: Bit 2: End
+ "ushll v23.8h, v23.8b, #0x0\n"
+ "smlal v28.4s, v23.4h, v8.4h\n"
+ "smlal2 v9.4s, v23.8h, v8.8h\n"
+ "ldr x20, [x12, #0x28]\n"
+ "smlal v3.4s, v23.4h, v17.4h\n"
+ "smlal2 v30.4s, v23.8h, v17.8h\n"
+ "ushll v10.8h, v10.8b, #0x0\n"
+ "ushll v11.8h, v11.8b, #0x0\n"
+ "smlal v0.4s, v23.4h, v7.4h\n"
+ "smlal2 v22.4s, v23.8h, v7.8h\n"
+ "add x20, x20, x14\n"
+ "smlal v6.4s, v23.4h, v19.4h\n"
+ "smlal2 v2.4s, v23.8h, v19.8h\n"
+ "ushll v13.8h, v13.8b, #0x0\n"
+ "smlal v28.4s, v10.4h, v19.4h\n"
+ "smlal2 v9.4s, v10.8h, v19.8h\n"
+ "ushll v27.8h, v27.8b, #0x0\n"
+ "smlal v3.4s, v11.4h, v1.4h\n"
+ "smlal2 v30.4s, v11.8h, v1.8h\n"
+ "smlal v28.4s, v13.4h, v31.4h\n"
+ "smlal2 v9.4s, v13.8h, v31.8h\n"
+ "smlal v3.4s, v13.4h, v8.4h\n"
+ "smlal2 v30.4s, v13.8h, v8.8h\n"
+ "smlal v0.4s, v13.4h, v1.4h\n"
+ "smlal2 v22.4s, v13.8h, v1.8h\n"
+ "smlal v6.4s, v13.4h, v7.4h\n"
+ "smlal2 v2.4s, v13.8h, v7.8h\n"
+ "tbz x16, #2, 13f\n"
+ "ld1 { v26.s }[0], [x20], #0x4\n"
+ "tbz x16, #1, 12f\n"
+ "ld1 { v26.h }[2], [x20], #0x2\n"
+ "tbz x16, #0, 15f\n"
+ "ld1 { v26.b }[6], [x20]\n"
+ "b 15f\n"
+ "12:" // Oddments: Load (3, 0): Bit 2: Bit 1: Unset
+ "tbz x16, #0, 15f\n"
+ "ld1 { v26.b }[4], [x20]\n"
+ "b 15f\n"
+ "13:" // Oddments: Load (3, 0): Bit 2: Unset
+ "tbz x16, #1, 14f\n"
+ "ld1 { v26.h }[0], [x20], #0x2\n"
+ "tbz x16, #0, 15f\n"
+ "ld1 { v26.b }[2], [x20]\n"
+ "b 15f\n"
+ "14:" // Oddments: Load (3, 0): Bit 2: Unset: Bit 1: Unset
+ "tbz x16, #0, 15f\n"
+ "ld1 { v26.b }[0], [x20]\n"
+ "15:" // Oddments: Load (3, 0): Bit 2: End
+ "ushll v26.8h, v26.8b, #0x0\n"
+ "smlal v0.4s, v26.4h, v29.4h\n"
+ "smlal2 v22.4s, v26.8h, v29.8h\n"
+ "ldr x20, [x12, #0x30]\n"
+ "smlal v28.4s, v27.4h, v16.4h\n"
+ "smlal2 v9.4s, v27.8h, v16.8h\n"
+ "add x20, x20, x14\n"
+ "smlal v3.4s, v27.4h, v29.4h\n"
+ "smlal2 v30.4s, v27.8h, v29.8h\n"
+ "smlal v0.4s, v27.4h, v8.4h\n"
+ "smlal2 v22.4s, v27.8h, v8.8h\n"
+ "smlal v6.4s, v27.4h, v17.4h\n"
+ "smlal2 v2.4s, v27.8h, v17.8h\n"
+ "tbz x16, #2, 17f\n"
+ "ld1 { v23.s }[0], [x20], #0x4\n"
+ "tbz x16, #1, 16f\n"
+ "ld1 { v23.h }[2], [x20], #0x2\n"
+ "tbz x16, #0, 19f\n"
+ "ld1 { v23.b }[6], [x20]\n"
+ "b 19f\n"
+ "16:" // Oddments: Load (3, 3): Bit 2: Bit 1: Unset
+ "tbz x16, #0, 19f\n"
+ "ld1 { v23.b }[4], [x20]\n"
+ "b 19f\n"
+ "17:" // Oddments: Load (3, 3): Bit 2: Unset
+ "tbz x16, #1, 18f\n"
+ "ld1 { v23.h }[0], [x20], #0x2\n"
+ "tbz x16, #0, 19f\n"
+ "ld1 { v23.b }[2], [x20]\n"
+ "b 19f\n"
+ "18:" // Oddments: Load (3, 3): Bit 2: Unset: Bit 1: Unset
+ "tbz x16, #0, 19f\n"
+ "ld1 { v23.b }[0], [x20]\n"
+ "19:" // Oddments: Load (3, 3): Bit 2: End
+ "ushll v23.8h, v23.8b, #0x0\n"
+ "ldr x20, [x12, #0x38]\n"
+ "smlal v6.4s, v23.4h, v4.4h\n"
+ "smlal2 v2.4s, v23.8h, v4.8h\n"
+ "add x20, x20, x14\n"
+ "tbz x16, #2, 21f\n"
+ "ld1 { v21.s }[0], [x20], #0x4\n"
+ "tbz x16, #1, 20f\n"
+ "ld1 { v21.h }[2], [x20], #0x2\n"
+ "tbz x16, #0, 23f\n"
+ "ld1 { v21.b }[6], [x20]\n"
+ "b 23f\n"
+ "20:" // Oddments: Load (0, 1): Bit 2: Bit 1: Unset
+ "tbz x16, #0, 23f\n"
+ "ld1 { v21.b }[4], [x20]\n"
+ "b 23f\n"
+ "21:" // Oddments: Load (0, 1): Bit 2: Unset
+ "tbz x16, #1, 22f\n"
+ "ld1 { v21.h }[0], [x20], #0x2\n"
+ "tbz x16, #0, 23f\n"
+ "ld1 { v21.b }[2], [x20]\n"
+ "b 23f\n"
+ "22:" // Oddments: Load (0, 1): Bit 2: Unset: Bit 1: Unset
+ "tbz x16, #0, 23f\n"
+ "ld1 { v21.b }[0], [x20]\n"
+ "23:" // Oddments: Load (0, 1): Bit 2: End
+ "ushll v21.8h, v21.8b, #0x0\n"
+ "ldr x20, [x12, #0x40]\n"
+ "smlal v28.4s, v21.4h, v7.4h\n"
+ "smlal2 v9.4s, v21.8h, v7.8h\n"
+ "smlal v3.4s, v21.4h, v19.4h\n"
+ "smlal2 v30.4s, v21.8h, v19.8h\n"
+ "add x20, x20, x14\n"
+ "tbz x16, #2, 25f\n"
+ "ld1 { v18.s }[0], [x20], #0x4\n"
+ "tbz x16, #1, 24f\n"
+ "ld1 { v18.h }[2], [x20], #0x2\n"
+ "tbz x16, #0, 27f\n"
+ "ld1 { v18.b }[6], [x20]\n"
+ "b 27f\n"
+ "24:" // Oddments: Load (0, 2): Bit 2: Bit 1: Unset
+ "tbz x16, #0, 27f\n"
+ "ld1 { v18.b }[4], [x20]\n"
+ "b 27f\n"
+ "25:" // Oddments: Load (0, 2): Bit 2: Unset
+ "tbz x16, #1, 26f\n"
+ "ld1 { v18.h }[0], [x20], #0x2\n"
+ "tbz x16, #0, 27f\n"
+ "ld1 { v18.b }[2], [x20]\n"
+ "b 27f\n"
+ "26:" // Oddments: Load (0, 2): Bit 2: Unset: Bit 1: Unset
+ "tbz x16, #0, 27f\n"
+ "ld1 { v18.b }[0], [x20]\n"
+ "27:" // Oddments: Load (0, 2): Bit 2: End
+ "ushll v18.8h, v18.8b, #0x0\n"
+ "ldr x20, [x12, #0x48]\n"
+ "smlal v28.4s, v18.4h, v1.4h\n"
+ "smlal2 v9.4s, v18.8h, v1.8h\n"
+ "smlal v3.4s, v18.4h, v7.4h\n"
+ "smlal2 v30.4s, v18.8h, v7.8h\n"
+ "add x20, x20, x14\n"
+ "tbz x16, #2, 29f\n"
+ "ld1 { v15.s }[0], [x20], #0x4\n"
+ "tbz x16, #1, 28f\n"
+ "ld1 { v15.h }[2], [x20], #0x2\n"
+ "tbz x16, #0, 31f\n"
+ "ld1 { v15.b }[6], [x20]\n"
+ "b 31f\n"
+ "28:" // Oddments: Load (2, 2): Bit 2: Bit 1: Unset
+ "tbz x16, #0, 31f\n"
+ "ld1 { v15.b }[4], [x20]\n"
+ "b 31f\n"
+ "29:" // Oddments: Load (2, 2): Bit 2: Unset
+ "tbz x16, #1, 30f\n"
+ "ld1 { v15.h }[0], [x20], #0x2\n"
+ "tbz x16, #0, 31f\n"
+ "ld1 { v15.b }[2], [x20]\n"
+ "b 31f\n"
+ "30:" // Oddments: Load (2, 2): Bit 2: Unset: Bit 1: Unset
+ "tbz x16, #0, 31f\n"
+ "ld1 { v15.b }[0], [x20]\n"
+ "31:" // Oddments: Load (2, 2): Bit 2: End
+ "ushll v15.8h, v15.8b, #0x0\n"
+ "ldr x20, [x12, #0x50]\n"
+ "smlal v28.4s, v15.4h, v4.4h\n"
+ "smlal2 v9.4s, v15.8h, v4.8h\n"
+ "smlal v3.4s, v15.4h, v16.4h\n"
+ "smlal2 v30.4s, v15.8h, v16.8h\n"
+ "add x20, x20, x14\n"
+ "smlal v0.4s, v15.4h, v31.4h\n"
+ "smlal2 v22.4s, v15.8h, v31.8h\n"
+ "smlal v6.4s, v15.4h, v8.4h\n"
+ "smlal2 v2.4s, v15.8h, v8.8h\n"
+ "tbz x16, #2, 33f\n"
+ "ld1 { v20.s }[0], [x20], #0x4\n"
+ "tbz x16, #1, 32f\n"
+ "ld1 { v20.h }[2], [x20], #0x2\n"
+ "tbz x16, #0, 35f\n"
+ "ld1 { v20.b }[6], [x20]\n"
+ "b 35f\n"
+ "32:" // Oddments: Load (1, 0): Bit 2: Bit 1: Unset
+ "tbz x16, #0, 35f\n"
+ "ld1 { v20.b }[4], [x20]\n"
+ "b 35f\n"
+ "33:" // Oddments: Load (1, 0): Bit 2: Unset
+ "tbz x16, #1, 34f\n"
+ "ld1 { v20.h }[0], [x20], #0x2\n"
+ "tbz x16, #0, 35f\n"
+ "ld1 { v20.b }[2], [x20]\n"
+ "b 35f\n"
+ "34:" // Oddments: Load (1, 0): Bit 2: Unset: Bit 1: Unset
+ "tbz x16, #0, 35f\n"
+ "ld1 { v20.b }[0], [x20]\n"
+ "35:" // Oddments: Load (1, 0): Bit 2: End
+ "ushll v20.8h, v20.8b, #0x0\n"
+ "ldr x20, [x12, #0x58]\n"
+ "smlal v28.4s, v20.4h, v17.4h\n"
+ "smlal2 v9.4s, v20.8h, v17.8h\n"
+ "smlal v0.4s, v20.4h, v19.4h\n"
+ "smlal2 v22.4s, v20.8h, v19.8h\n"
+ "add x20, x20, x14\n"
+ "tbz x16, #2, 37f\n"
+ "ld1 { v11.s }[0], [x20], #0x4\n"
+ "tbz x16, #1, 36f\n"
+ "ld1 { v11.h }[2], [x20], #0x2\n"
+ "tbz x16, #0, 39f\n"
+ "ld1 { v11.b }[6], [x20]\n"
+ "b 39f\n"
+ "36:" // Oddments: Load (1, 3): Bit 2: Bit 1: Unset
+ "tbz x16, #0, 39f\n"
+ "ld1 { v11.b }[4], [x20]\n"
+ "b 39f\n"
+ "37:" // Oddments: Load (1, 3): Bit 2: Unset
+ "tbz x16, #1, 38f\n"
+ "ld1 { v11.h }[0], [x20], #0x2\n"
+ "tbz x16, #0, 39f\n"
+ "ld1 { v11.b }[2], [x20]\n"
+ "b 39f\n"
+ "38:" // Oddments: Load (1, 3): Bit 2: Unset: Bit 1: Unset
+ "tbz x16, #0, 39f\n"
+ "ld1 { v11.b }[0], [x20]\n"
+ "39:" // Oddments: Load (1, 3): Bit 2: End
+ "ushll v11.8h, v11.8b, #0x0\n"
+ "ldr x20, [x12, #0x60]\n"
+ "smlal v3.4s, v11.4h, v31.4h\n"
+ "smlal2 v30.4s, v11.8h, v31.8h\n"
+ "smlal v6.4s, v11.4h, v1.4h\n"
+ "smlal2 v2.4s, v11.8h, v1.8h\n"
+ "add x20, x20, x14\n"
+ "tbz x16, #2, 41f\n"
+ "ld1 { v23.s }[0], [x20], #0x4\n"
+ "tbz x16, #1, 40f\n"
+ "ld1 { v23.h }[2], [x20], #0x2\n"
+ "tbz x16, #0, 43f\n"
+ "ld1 { v23.b }[6], [x20]\n"
+ "b 43f\n"
+ "40:" // Oddments: Load (2, 0): Bit 2: Bit 1: Unset
+ "tbz x16, #0, 43f\n"
+ "ld1 { v23.b }[4], [x20]\n"
+ "b 43f\n"
+ "41:" // Oddments: Load (2, 0): Bit 2: Unset
+ "tbz x16, #1, 42f\n"
+ "ld1 { v23.h }[0], [x20], #0x2\n"
+ "tbz x16, #0, 43f\n"
+ "ld1 { v23.b }[2], [x20]\n"
+ "b 43f\n"
+ "42:" // Oddments: Load (2, 0): Bit 2: Unset: Bit 1: Unset
+ "tbz x16, #0, 43f\n"
+ "ld1 { v23.b }[0], [x20]\n"
+ "43:" // Oddments: Load (2, 0): Bit 2: End
+ "ushll v23.8h, v23.8b, #0x0\n"
+ "ldr x20, [x12, #0x68]\n"
+ "smlal v28.4s, v23.4h, v29.4h\n"
+ "smlal2 v9.4s, v23.8h, v29.8h\n"
+ "smlal v0.4s, v23.4h, v17.4h\n"
+ "smlal2 v22.4s, v23.8h, v17.8h\n"
+ "add x20, x20, x14\n"
+ "tbz x16, #2, 45f\n"
+ "ld1 { v20.s }[0], [x20], #0x4\n"
+ "tbz x16, #1, 44f\n"
+ "ld1 { v20.h }[2], [x20], #0x2\n"
+ "tbz x16, #0, 47f\n"
+ "ld1 { v20.b }[6], [x20]\n"
+ "b 47f\n"
+ "44:" // Oddments: Load (2, 3): Bit 2: Bit 1: Unset
+ "tbz x16, #0, 47f\n"
+ "ld1 { v20.b }[4], [x20]\n"
+ "b 47f\n"
+ "45:" // Oddments: Load (2, 3): Bit 2: Unset
+ "tbz x16, #1, 46f\n"
+ "ld1 { v20.h }[0], [x20], #0x2\n"
+ "tbz x16, #0, 47f\n"
+ "ld1 { v20.b }[2], [x20]\n"
+ "b 47f\n"
+ "46:" // Oddments: Load (2, 3): Bit 2: Unset: Bit 1: Unset
+ "tbz x16, #0, 47f\n"
+ "ld1 { v20.b }[0], [x20]\n"
+ "47:" // Oddments: Load (2, 3): Bit 2: End
+ "ushll v20.8h, v20.8b, #0x0\n"
+ "ldr x20, [x12, #0x70]\n"
+ "smlal v3.4s, v20.4h, v4.4h\n"
+ "smlal2 v30.4s, v20.8h, v4.8h\n"
+ "smlal v6.4s, v20.4h, v31.4h\n"
+ "smlal2 v2.4s, v20.8h, v31.8h\n"
+ "add x20, x20, x14\n"
+ "tbz x16, #2, 49f\n"
+ "ld1 { v8.s }[0], [x20], #0x4\n"
+ "tbz x16, #1, 48f\n"
+ "ld1 { v8.h }[2], [x20], #0x2\n"
+ "tbz x16, #0, 51f\n"
+ "ld1 { v8.b }[6], [x20]\n"
+ "b 51f\n"
+ "48:" // Oddments: Load (3, 1): Bit 2: Bit 1: Unset
+ "tbz x16, #0, 51f\n"
+ "ld1 { v8.b }[4], [x20]\n"
+ "b 51f\n"
+ "49:" // Oddments: Load (3, 1): Bit 2: Unset
+ "tbz x16, #1, 50f\n"
+ "ld1 { v8.h }[0], [x20], #0x2\n"
+ "tbz x16, #0, 51f\n"
+ "ld1 { v8.b }[2], [x20]\n"
+ "b 51f\n"
+ "50:" // Oddments: Load (3, 1): Bit 2: Unset: Bit 1: Unset
+ "tbz x16, #0, 51f\n"
+ "ld1 { v8.b }[0], [x20]\n"
+ "51:" // Oddments: Load (3, 1): Bit 2: End
+ "ushll v8.8h, v8.8b, #0x0\n"
+ "ldr x20, [x12, #0x78]\n"
+ "smlal v0.4s, v8.4h, v16.4h\n"
+ "smlal2 v22.4s, v8.8h, v16.8h\n"
+ "smlal v6.4s, v8.4h, v29.4h\n"
+ "smlal2 v2.4s, v8.8h, v29.8h\n"
+ "add x20, x20, x14\n"
+ "tbz x16, #2, 53f\n"
+ "ld1 { v8.s }[0], [x20], #0x4\n"
+ "tbz x16, #1, 52f\n"
+ "ld1 { v8.h }[2], [x20], #0x2\n"
+ "tbz x16, #0, 55f\n"
+ "ld1 { v8.b }[6], [x20]\n"
+ "b 55f\n"
+ "52:" // Oddments: Load (3, 2): Bit 2: Bit 1: Unset
+ "tbz x16, #0, 55f\n"
+ "ld1 { v8.b }[4], [x20]\n"
+ "b 55f\n"
+ "53:" // Oddments: Load (3, 2): Bit 2: Unset
+ "tbz x16, #1, 54f\n"
+ "ld1 { v8.h }[0], [x20], #0x2\n"
+ "tbz x16, #0, 55f\n"
+ "ld1 { v8.b }[2], [x20]\n"
+ "b 55f\n"
+ "54:" // Oddments: Load (3, 2): Bit 2: Unset: Bit 1: Unset
+ "tbz x16, #0, 55f\n"
+ "ld1 { v8.b }[0], [x20]\n"
+ "55:" // Oddments: Load (3, 2): Bit 2: End
+ "ushll v8.8h, v8.8b, #0x0\n"
+ "smlal v0.4s, v8.4h, v4.4h\n"
+ "smlal2 v22.4s, v8.8h, v4.8h\n"
+ "smlal v6.4s, v8.4h, v16.4h\n"
+ "smlal2 v2.4s, v8.8h, v16.8h\n"
+ "tbz x16, #2, 57f\n"
+ "ld1 { v7.4s }, [x10], #0x10\n"
+ "ld1 { v23.4s }, [x9], #0x10\n"
+ "tbz x16, #1, 56f\n"
+ "ld1 { v11.d }[0], [x10], #0x8\n"
+ "ld1 { v27.d }[0], [x9], #0x8\n"
+ "tbz x16, #0, 59f\n"
+ "ld1 { v11.s }[2], [x10]\n"
+ "ld1 { v27.s }[2], [x9]\n"
+ "b 59f\n"
+ "56:" // Oddments: Load requant params: Bit 2: Bit 1: Unset
+ "tbz x16, #0, 59f\n"
+ "ld1 { v11.s }[0], [x10]\n"
+ "ld1 { v27.s }[0], [x9]\n"
+ "b 59f\n"
+ "57:" // Oddments: Load requant params: Bit 2: Unset
+ "tbz x16, #1, 58f\n"
+ "ld1 { v7.d }[0], [x10], #0x8\n"
+ "ld1 { v23.d }[0], [x9], #0x8\n"
+ "tbz x16, #0, 59f\n"
+ "ld1 { v7.s }[2], [x10]\n"
+ "ld1 { v23.s }[2], [x9]\n"
+ "b 59f\n"
+ "58:" // Oddments: Load requant params: Bit 2: Unset: Bit 1: Unset
+ "tbz x16, #0, 59f\n"
+ "ld1 { v7.s }[0], [x10]\n"
+ "ld1 { v23.s }[0], [x9]\n"
+ "59:" // Oddments: Load requant params: Bit 2: End
+ "sqrdmulh v28.4s, v28.4s, v7.4s\n"
+ "and v20.16b, v28.16b, v23.16b\n"
+ "add x28, x28, x13\n"
+ "add x27, x27, x13\n"
+ "sqrdmulh v9.4s, v9.4s, v11.4s\n"
+ "sshr v20.4s, v20.4s, #0x1f\n"
+ "add x26, x26, x13\n"
+ "add x25, x25, x13\n"
+ "and v4.16b, v9.16b, v27.16b\n"
+ "sqrdmulh v3.4s, v3.4s, v7.4s\n"
+ "sqrdmulh v0.4s, v0.4s, v7.4s\n"
+ "sqrdmulh v6.4s, v6.4s, v7.4s\n"
+ "sqadd v28.4s, v28.4s, v20.4s\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "and v19.16b, v3.16b, v23.16b\n"
+ "sqrdmulh v30.4s, v30.4s, v11.4s\n"
+ "and v29.16b, v0.16b, v23.16b\n"
+ "sqrdmulh v22.4s, v22.4s, v11.4s\n"
+ "and v26.16b, v6.16b, v23.16b\n"
+ "sqrdmulh v2.4s, v2.4s, v11.4s\n"
+ "sqadd v9.4s, v9.4s, v4.4s\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "and v17.16b, v30.16b, v27.16b\n"
+ "sshr v29.4s, v29.4s, #0x1f\n"
+ "and v8.16b, v22.16b, v27.16b\n"
+ "sshr v26.4s, v26.4s, #0x1f\n"
+ "and v13.16b, v2.16b, v27.16b\n"
+ "sqadd v3.4s, v3.4s, v19.4s\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sqadd v0.4s, v0.4s, v29.4s\n"
+ "sshr v8.4s, v8.4s, #0x1f\n"
+ "sqadd v6.4s, v6.4s, v26.4s\n"
+ "sshr v13.4s, v13.4s, #0x1f\n"
+ "srshl v28.4s, v28.4s, v23.4s\n"
+ "srshl v3.4s, v3.4s, v23.4s\n"
+ "sqadd v30.4s, v30.4s, v17.4s\n"
+ "srshl v0.4s, v0.4s, v23.4s\n"
+ "sqadd v22.4s, v22.4s, v8.4s\n"
+ "srshl v6.4s, v6.4s, v23.4s\n"
+ "sqadd v2.4s, v2.4s, v13.4s\n"
+ "srshl v9.4s, v9.4s, v27.4s\n"
+ "sqxtn v28.4h, v28.4s\n"
+ "srshl v30.4s, v30.4s, v27.4s\n"
+ "sqxtn v3.4h, v3.4s\n"
+ "srshl v22.4s, v22.4s, v27.4s\n"
+ "sqxtn v0.4h, v0.4s\n"
+ "srshl v2.4s, v2.4s, v27.4s\n"
+ "sqxtn v6.4h, v6.4s\n"
+ "sqxtn2 v28.8h, v9.4s\n"
+ "sqxtn2 v3.8h, v30.4s\n"
+ "sqxtn2 v0.8h, v22.4s\n"
+ "sqxtn2 v6.8h, v2.4s\n"
+ "sqadd v28.8h, v28.8h, v5.8h\n"
+ "sqadd v3.8h, v3.8h, v5.8h\n"
+ "sqadd v0.8h, v0.8h, v5.8h\n"
+ "sqadd v6.8h, v6.8h, v5.8h\n"
+ "smax v28.8h, v28.8h, v14.8h\n"
+ "smax v3.8h, v3.8h, v14.8h\n"
+ "smax v0.8h, v0.8h, v14.8h\n"
+ "smax v6.8h, v6.8h, v14.8h\n"
+ "smin v28.8h, v28.8h, v12.8h\n"
+ "smin v3.8h, v3.8h, v12.8h\n"
+ "smin v0.8h, v0.8h, v12.8h\n"
+ "smin v6.8h, v6.8h, v12.8h\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "uzp1 v3.16b, v3.16b, v3.16b\n"
+ "uzp1 v0.16b, v0.16b, v0.16b\n"
+ "uzp1 v6.16b, v6.16b, v6.16b\n"
+ "tbz x16, #2, 61f\n"
+ "st1 { v28.s }[0], [x28], #0x4\n"
+ "st1 { v3.s }[0], [x27], #0x4\n"
+ "st1 { v0.s }[0], [x26], #0x4\n"
+ "st1 { v6.s }[0], [x25], #0x4\n"
+ "tbz x16, #1, 60f\n"
+ "st1 { v28.h }[2], [x28], #0x2\n"
+ "st1 { v3.h }[2], [x27], #0x2\n"
+ "st1 { v0.h }[2], [x26], #0x2\n"
+ "st1 { v6.h }[2], [x25], #0x2\n"
+ "tbz x16, #0, 63f\n"
+ "st1 { v28.b }[6], [x28], #0x1\n"
+ "st1 { v3.b }[6], [x27], #0x1\n"
+ "st1 { v0.b }[6], [x26], #0x1\n"
+ "st1 { v6.b }[6], [x25], #0x1\n"
+ "b 63f\n"
+ "60:" // Oddments: Bit 2: Bit 1: Unset
+ "tbz x16, #0, 63f\n"
+ "st1 { v28.b }[4], [x28], #0x1\n"
+ "st1 { v3.b }[4], [x27], #0x1\n"
+ "st1 { v0.b }[4], [x26], #0x1\n"
+ "st1 { v6.b }[4], [x25], #0x1\n"
+ "b 63f\n"
+ "61:" // Oddments: Bit 2: Unset
+ "tbz x16, #1, 62f\n"
+ "st1 { v28.h }[0], [x28], #0x2\n"
+ "st1 { v3.h }[0], [x27], #0x2\n"
+ "st1 { v0.h }[0], [x26], #0x2\n"
+ "st1 { v6.h }[0], [x25], #0x2\n"
+ "tbz x16, #0, 63f\n"
+ "st1 { v28.b }[2], [x28], #0x1\n"
+ "st1 { v3.b }[2], [x27], #0x1\n"
+ "st1 { v0.b }[2], [x26], #0x1\n"
+ "st1 { v6.b }[2], [x25], #0x1\n"
+ "b 63f\n"
+ "62:" // Oddments: Bit 2: Unset: Bit 1: Unset
+ "tbz x16, #0, 63f\n"
+ "st1 { v28.b }[0], [x28], #0x1\n"
+ "st1 { v3.b }[0], [x27], #0x1\n"
+ "st1 { v0.b }[0], [x26], #0x1\n"
+ "st1 { v6.b }[0], [x25], #0x1\n"
+ "63:" // Oddments: Bit 2: End
+ "64:" // End
+ :
+ : [offsetof_Params_bias] "I" (offsetof(Params, bias)), [offsetof_Params_inptrs] "I" (offsetof(Params, inptrs)), [offsetof_Params_n_channels] "I" (offsetof(Params, n_channels)), [offsetof_Params_outptrs] "I" (offsetof(Params, outptrs)), [offsetof_Params_requant] "I" (offsetof(Params, requant)), [offsetof_Params_requant_muls] "I" (offsetof(Params, requant_muls)), [offsetof_Params_requant_shifts] "I" (offsetof(Params, requant_shifts)), [offsetof_Params_weights] "I" (offsetof(Params, weights)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [params] "r" (&params)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8qa_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8qa_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
new file mode 100644
index 0000000000..9b646bc4f6
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8qa_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2022-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "utils.hpp"
+#include "src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_u8qa_nhwc_3x3_s2_output2x2_mla_depthfirst_impl( // kernel entry point; parameter names below match the definition in the sibling generic.cpp
+  const unsigned int, // n_channels
+  const uint8_t *const *const, // inptrs: array of input row pointers
+  const uint8_t *const, // weights
+  const int32_t *const, // bias
+  const arm_gemm::Requantize32 &, // qp: requantization parameters (offsets, min/max)
+  const int32_t *const, // requant_muls: per-channel requantize multipliers
+  const int32_t *const, // requant_shifts: per-channel requantize shifts
+  uint8_t *const *const // outptrs: array of output row pointers
+);
+
+class a64_u8qa_nhwc_3x3_s2_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy<uint8_t, uint8_t, uint8_t, int32_t> // strategy wrapper: u8 in/weights/out, s32 accumulators
+{
+  using Parent = DepthwiseDepthfirstStrategy<uint8_t, uint8_t, uint8_t, int32_t>;
+
+  public:
+  constexpr static unsigned int kernel_rows = 3; // 3x3 filter window
+  constexpr static unsigned int kernel_cols = 3;
+
+  constexpr static unsigned int stride_rows = 2; // stride-2 in both dimensions
+  constexpr static unsigned int stride_cols = 2;
+
+  a64_u8qa_nhwc_3x3_s2_output2x2_mla_depthfirst(const CPUInfo *) : Parent(2, 2, 3, 3, 2, 2) {} // presumably (out_rows, out_cols, kern_rows, kern_cols, stride_rows, stride_cols) — confirm against Parent ctor
+
+  arm_gemm::VLType get_vl_type(void) const override { return arm_gemm::VLType::None; } // fixed-length (non-SVE) vectors
+
+  Parent::KernelType kernel = a64_u8qa_nhwc_3x3_s2_output2x2_mla_depthfirst_impl; // bound to the hand-written asm kernel
+  Parent::KernelType get_kernel(void) const override { return kernel; }
+  unsigned int get_accumulator_depth_vl(void) const override { return 2; } // accumulates two vector-lengths of channels per pass
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8qa_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8qa_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp
new file mode 100644
index 0000000000..6cb10a7bb2
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8qa_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp
@@ -0,0 +1,1395 @@
+/*
+ * Copyright (c) 2022-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_gemm.hpp"
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_u8qa_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(
+ const unsigned int n_channels,
+ const uint8_t *const *const inptrs,
+ const uint8_t *const weights,
+ const int32_t *const bias,
+ const arm_gemm::Requantize32 &qp,
+ const int32_t *const requant_muls,
+ const int32_t *const requant_shifts,
+ uint8_t *const *const outptrs
+)
+{
+ struct Params
+ {
+ uint64_t n_channels;
+ const void *weights;
+ const int32_t *bias;
+ const arm_gemm::Requantize32 *requant;
+ const int32_t *const requant_muls;
+ const int32_t *const requant_shifts;
+ uint8_t *const *const outptrs;
+ const uint8_t *inptrs[25];
+
+ Params(
+ long unsigned int n_channels,
+ const uint8_t *const *inptrs_raw,
+ const void *const weights,
+ const int32_t *const bias,
+ const arm_gemm::Requantize32 &qp,
+ const int32_t *const requant_muls,
+ const int32_t *const requant_shifts,
+ uint8_t *const *outptrs
+ ) : n_channels(n_channels), weights(weights), bias(bias),
+ requant(&qp), requant_muls(requant_muls),
+ requant_shifts(requant_shifts), outptrs(outptrs)
+ {
+ inptrs[0] = inptrs_raw[12];
+ inptrs[1] = inptrs_raw[0];
+ inptrs[2] = inptrs_raw[1];
+ inptrs[3] = inptrs_raw[3];
+ inptrs[4] = inptrs_raw[4];
+ inptrs[5] = inptrs_raw[5];
+ inptrs[6] = inptrs_raw[6];
+ inptrs[7] = inptrs_raw[2];
+ inptrs[8] = inptrs_raw[8];
+ inptrs[9] = inptrs_raw[9];
+ inptrs[10] = inptrs_raw[7];
+ inptrs[11] = inptrs_raw[15];
+ inptrs[12] = inptrs_raw[10];
+ inptrs[13] = inptrs_raw[16];
+ inptrs[14] = inptrs_raw[11];
+ inptrs[15] = inptrs_raw[18];
+ inptrs[16] = inptrs_raw[13];
+ inptrs[17] = inptrs_raw[19];
+ inptrs[18] = inptrs_raw[20];
+ inptrs[19] = inptrs_raw[14];
+ inptrs[20] = inptrs_raw[21];
+ inptrs[21] = inptrs_raw[17];
+ inptrs[22] = inptrs_raw[23];
+ inptrs[23] = inptrs_raw[22];
+ inptrs[24] = inptrs_raw[24];
+
+ }
+ };
+
+ const Params params(n_channels, inptrs, weights, bias, qp,
+ requant_muls, requant_shifts, outptrs);
+
+ __asm__ __volatile__(
+ "ldr x7, [%x[params], %[offsetof_Params_n_channels]]\n"
+ "ldr x23, [%x[params], %[offsetof_Params_requant]]\n"
+ "lsr x8, x7, #0x3\n"
+ "add x20, x23, %[offsetof_Requantize32_b_offset]\n"
+ "ld1r { v6.16b }, [x20]\n"
+ "ldr x22, [%x[params], %[offsetof_Params_outptrs]]\n"
+ "add x21, x23, %[offsetof_Requantize32_c_offset]\n"
+ "add x20, x23, %[offsetof_Requantize32_minval]\n"
+ "ld1r { v22.8h }, [x21]\n"
+ "ld1r { v13.8h }, [x20]\n"
+ "add x20, x23, %[offsetof_Requantize32_maxval]\n"
+ "mov x17, #0x0\n"
+ "ld1r { v5.8h }, [x20]\n"
+ "mov x16, #0x0\n"
+ "add x15, %x[params], %[offsetof_Params_inptrs]\n"
+ "ldr x14, [%x[params], %[offsetof_Params_weights]]\n"
+ "ldr x13, [%x[params], %[offsetof_Params_requant_muls]]\n"
+ "ldr x12, [%x[params], %[offsetof_Params_requant_shifts]]\n"
+ "ldp x11, x10, [x22, #0x0]\n"
+ "ldp x9, x28, [x22, #0x10]\n"
+ "cbz x8, 3f\n"
+ "ldr d12, [x14, #0x0]\n"
+ "ldr d11, [x14, #0x8]\n"
+ "subs x8, x8, #0x1\n"
+ "usubl v12.8h, v12.8b, v6.8b\n"
+ "ldr d25, [x14, #0x10]\n"
+ "ldr d24, [x14, #0x18]\n"
+ "usubl v11.8h, v11.8b, v6.8b\n"
+ "usubl v25.8h, v25.8b, v6.8b\n"
+ "ldr d23, [x14, #0x20]\n"
+ "ldr d7, [x14, #0x28]\n"
+ "usubl v24.8h, v24.8b, v6.8b\n"
+ "usubl v23.8h, v23.8b, v6.8b\n"
+ "ldr d3, [x14, #0x30]\n"
+ "ldr d9, [x14, #0x38]\n"
+ "usubl v7.8h, v7.8b, v6.8b\n"
+ "usubl v3.8h, v3.8b, v6.8b\n"
+ "ldr d30, [x14, #0x40]\n"
+ "ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "usubl v9.8h, v9.8b, v6.8b\n"
+ "usubl v30.8h, v30.8b, v6.8b\n"
+ "ldr q8, [x20, #0x0]\n"
+ "ldr q2, [x20, #0x10]\n"
+ "add x20, x20, #0x20\n"
+ "str x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldp x27, x26, [x15, #0x0]\n"
+ "ldp x25, x24, [x15, #0x10]\n"
+ "mov v21.16b, v8.16b\n"
+ "mov v4.16b, v2.16b\n"
+ "ldp x23, x22, [x15, #0x20]\n"
+ "ldp x21, x20, [x15, #0x30]\n"
+ "mov v20.16b, v8.16b\n"
+ "mov v1.16b, v2.16b\n"
+ "ldr d26, [x27, x17]\n"
+ "ldr d18, [x26, x17]\n"
+ "mov v16.16b, v8.16b\n"
+ "mov v14.16b, v2.16b\n"
+ "ldr d10, [x25, x17]\n"
+ "ldr d27, [x24, x17]\n"
+ "ushll v26.8h, v26.8b, #0x0\n"
+ "ushll v18.8h, v18.8b, #0x0\n"
+ "ldr d17, [x23, x17]\n"
+ "ldr d19, [x22, x17]\n"
+ "ushll v10.8h, v10.8b, #0x0\n"
+ "ushll v27.8h, v27.8b, #0x0\n"
+ "ldr d15, [x21, x17]\n"
+ "ldr d28, [x20, x17]\n"
+ "ushll v17.8h, v17.8b, #0x0\n"
+ "ushll v19.8h, v19.8b, #0x0\n"
+ "ushll v15.8h, v15.8b, #0x0\n"
+ "ushll v28.8h, v28.8b, #0x0\n"
+ "beq 2f\n"
+ "1:" // Loop
+ "ldr q31, [x13, #0x0]\n"
+ "ldr q0, [x12, #0x0]\n"
+ "smlal v8.4s, v26.4h, v30.4h\n"
+ "smlal2 v2.4s, v26.8h, v30.8h\n"
+ "ldr q29, [x13, #0x10]\n"
+ "ldr x21, [x15, #0x58]\n"
+ "smlal v8.4s, v18.4h, v12.4h\n"
+ "smlal v21.4s, v26.4h, v3.4h\n"
+ "ldr x20, [x15, #0x78]\n"
+ "ldr x25, [x15, #0x60]\n"
+ "smlal v20.4s, v26.4h, v25.4h\n"
+ "smlal v16.4s, v26.4h, v12.4h\n"
+ "ldr x24, [x15, #0x80]\n"
+ "smlal2 v2.4s, v18.8h, v12.8h\n"
+ "ldr d18, [x21, x17]\n"
+ "ushll v18.8h, v18.8b, #0x0\n"
+ "smlal v8.4s, v10.4h, v11.4h\n"
+ "smlal2 v4.4s, v26.8h, v3.8h\n"
+ "ldr x23, [x15, #0x68]\n"
+ "ldr x22, [x15, #0x88]\n"
+ "smlal2 v1.4s, v26.8h, v25.8h\n"
+ "smlal2 v14.4s, v26.8h, v12.8h\n"
+ "ldr d26, [x20, x17]\n"
+ "ushll v26.8h, v26.8b, #0x0\n"
+ "smlal v21.4s, v27.4h, v11.4h\n"
+ "smlal v20.4s, v18.4h, v24.4h\n"
+ "ldr x21, [x15, #0x40]\n"
+ "ldr x20, [x15, #0x70]\n"
+ "smlal v16.4s, v26.4h, v23.4h\n"
+ "smlal2 v2.4s, v10.8h, v11.8h\n"
+ "ldr d10, [x25, x17]\n"
+ "ushll v10.8h, v10.8b, #0x0\n"
+ "smlal v8.4s, v19.4h, v24.4h\n"
+ "smlal2 v4.4s, v27.8h, v11.8h\n"
+ "ldr d27, [x24, x17]\n"
+ "ushll v27.8h, v27.8b, #0x0\n"
+ "smlal2 v1.4s, v18.8h, v24.8h\n"
+ "ldr d18, [x23, x17]\n"
+ "smlal2 v14.4s, v26.8h, v23.8h\n"
+ "ldr d26, [x22, x17]\n"
+ "ldr x24, [x15, #0x98]\n"
+ "smlal v21.4s, v17.4h, v25.4h\n"
+ "smlal v20.4s, v10.4h, v12.4h\n"
+ "ldr x23, [x15, #0x50]\n"
+ "smlal v16.4s, v27.4h, v11.4h\n"
+ "smlal2 v2.4s, v19.8h, v24.8h\n"
+ "ldr d19, [x21, x17]\n"
+ "ushll v18.8h, v18.8b, #0x0\n"
+ "ushll v26.8h, v26.8b, #0x0\n"
+ "smlal v8.4s, v15.4h, v23.4h\n"
+ "smlal2 v4.4s, v17.8h, v25.8h\n"
+ "ldr d17, [x20, x17]\n"
+ "ldr x22, [x15, #0x48]\n"
+ "smlal2 v1.4s, v10.8h, v12.8h\n"
+ "smlal2 v14.4s, v27.8h, v11.8h\n"
+ "ldr x21, [x15, #0x90]\n"
+ "ldr x20, [x15, #0xa8]\n"
+ "smlal v21.4s, v28.4h, v12.4h\n"
+ "smlal v20.4s, v18.4h, v23.4h\n"
+ "ushll v19.8h, v19.8b, #0x0\n"
+ "smlal v16.4s, v26.4h, v7.4h\n"
+ "smlal2 v2.4s, v15.8h, v23.8h\n"
+ "ldr d15, [x24, x17]\n"
+ "ushll v17.8h, v17.8b, #0x0\n"
+ "ushll v15.8h, v15.8b, #0x0\n"
+ "smlal v8.4s, v28.4h, v25.4h\n"
+ "smlal2 v4.4s, v28.8h, v12.8h\n"
+ "ldr d12, [x23, x17]\n"
+ "ushll v12.8h, v12.8b, #0x0\n"
+ "smlal2 v1.4s, v18.8h, v23.8h\n"
+ "ldr d18, [x22, x17]\n"
+ "smlal2 v14.4s, v26.8h, v7.8h\n"
+ "ldr d26, [x21, x17]\n"
+ "ldr x22, [x15, #0xa0]\n"
+ "smlal v21.4s, v19.4h, v23.4h\n"
+ "smlal v20.4s, v17.4h, v11.4h\n"
+ "ldr x21, [x15, #0xb0]\n"
+ "smlal v16.4s, v15.4h, v25.4h\n"
+ "smlal2 v2.4s, v28.8h, v25.8h\n"
+ "ldr d28, [x20, x17]\n"
+ "ushll v18.8h, v18.8b, #0x0\n"
+ "ushll v26.8h, v26.8b, #0x0\n"
+ "smlal v8.4s, v12.4h, v7.4h\n"
+ "ldr x20, [x15, #0xb8]\n"
+ "ushll v28.8h, v28.8b, #0x0\n"
+ "smlal2 v4.4s, v19.8h, v23.8h\n"
+ "ldr d23, [x22, x17]\n"
+ "ldr d19, [x21, x17]\n"
+ "smlal2 v1.4s, v17.8h, v11.8h\n"
+ "ldr d11, [x20, x17]\n"
+ "smlal2 v14.4s, v15.8h, v25.8h\n"
+ "ldr q25, [x12, #0x10]\n"
+ "smlal v21.4s, v18.4h, v7.4h\n"
+ "smlal v20.4s, v26.4h, v3.4h\n"
+ "ushll v23.8h, v23.8b, #0x0\n"
+ "ldr x21, [x15, #0xc0]\n"
+ "smlal v16.4s, v28.4h, v24.4h\n"
+ "smlal2 v2.4s, v12.8h, v7.8h\n"
+ "ushll v19.8h, v19.8b, #0x0\n"
+ "ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "smlal v8.4s, v10.4h, v3.4h\n"
+ "smlal2 v4.4s, v18.8h, v7.8h\n"
+ "ldr d18, [x21, x17]\n"
+ "ushll v11.8h, v11.8b, #0x0\n"
+ "smlal2 v1.4s, v26.8h, v3.8h\n"
+ "smlal2 v14.4s, v28.8h, v24.8h\n"
+ "ushll v18.8h, v18.8b, #0x0\n"
+ "add x14, x14, #0x48\n"
+ "smlal v21.4s, v12.4h, v24.4h\n"
+ "smlal v20.4s, v23.4h, v9.4h\n"
+ "add x17, x17, #0x8\n"
+ "subs x8, x8, #0x1\n"
+ "smlal v16.4s, v19.4h, v9.4h\n"
+ "smlal2 v2.4s, v10.8h, v3.8h\n"
+ "add x13, x13, #0x20\n"
+ "add x12, x12, #0x20\n"
+ "smlal v8.4s, v17.4h, v9.4h\n"
+ "smlal2 v4.4s, v12.8h, v24.8h\n"
+ "sqrdmulh v8.4s, v8.4s, v31.4s\n"
+ "smlal2 v1.4s, v23.8h, v9.8h\n"
+ "smlal2 v14.4s, v19.8h, v9.8h\n"
+ "and v10.16b, v8.16b, v0.16b\n"
+ "smlal v21.4s, v27.4h, v9.4h\n"
+ "smlal v20.4s, v28.4h, v7.4h\n"
+ "sshr v10.4s, v10.4s, #0x1f\n"
+ "smlal v16.4s, v11.4h, v3.4h\n"
+ "smlal2 v2.4s, v17.8h, v9.8h\n"
+ "sqrdmulh v2.4s, v2.4s, v29.4s\n"
+ "smlal2 v4.4s, v27.8h, v9.8h\n"
+ "smlal2 v1.4s, v28.8h, v7.8h\n"
+ "and v12.16b, v2.16b, v25.16b\n"
+ "smlal2 v14.4s, v11.8h, v3.8h\n"
+ "smlal v21.4s, v15.4h, v30.4h\n"
+ "sqrdmulh v21.4s, v21.4s, v31.4s\n"
+ "smlal v20.4s, v11.4h, v30.4h\n"
+ "smlal v16.4s, v18.4h, v30.4h\n"
+ "sqrdmulh v20.4s, v20.4s, v31.4s\n"
+ "smlal2 v4.4s, v15.8h, v30.8h\n"
+ "smlal2 v1.4s, v11.8h, v30.8h\n"
+ "sqrdmulh v16.4s, v16.4s, v31.4s\n"
+ "smlal2 v14.4s, v18.8h, v30.8h\n"
+ "sqadd v8.4s, v8.4s, v10.4s\n"
+ "sshr v12.4s, v12.4s, #0x1f\n"
+ "and v27.16b, v21.16b, v0.16b\n"
+ "sqrdmulh v4.4s, v4.4s, v29.4s\n"
+ "and v24.16b, v20.16b, v0.16b\n"
+ "sqrdmulh v1.4s, v1.4s, v29.4s\n"
+ "and v19.16b, v16.16b, v0.16b\n"
+ "sqrdmulh v14.4s, v14.4s, v29.4s\n"
+ "sqadd v2.4s, v2.4s, v12.4s\n"
+ "sshr v27.4s, v27.4s, #0x1f\n"
+ "and v18.16b, v4.16b, v25.16b\n"
+ "sshr v24.4s, v24.4s, #0x1f\n"
+ "and v17.16b, v1.16b, v25.16b\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "and v15.16b, v14.16b, v25.16b\n"
+ "sqadd v21.4s, v21.4s, v27.4s\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sqadd v20.4s, v20.4s, v24.4s\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sqadd v16.4s, v16.4s, v19.4s\n"
+ "sshr v15.4s, v15.4s, #0x1f\n"
+ "srshl v8.4s, v8.4s, v0.4s\n"
+ "srshl v21.4s, v21.4s, v0.4s\n"
+ "sqadd v4.4s, v4.4s, v18.4s\n"
+ "srshl v20.4s, v20.4s, v0.4s\n"
+ "sqadd v1.4s, v1.4s, v17.4s\n"
+ "srshl v16.4s, v16.4s, v0.4s\n"
+ "sqadd v14.4s, v14.4s, v15.4s\n"
+ "srshl v2.4s, v2.4s, v25.4s\n"
+ "sqxtn v8.4h, v8.4s\n"
+ "srshl v4.4s, v4.4s, v25.4s\n"
+ "sqxtn v21.4h, v21.4s\n"
+ "srshl v1.4s, v1.4s, v25.4s\n"
+ "sqxtn v20.4h, v20.4s\n"
+ "srshl v14.4s, v14.4s, v25.4s\n"
+ "sqxtn v16.4h, v16.4s\n"
+ "sqxtn2 v8.8h, v2.4s\n"
+ "sqxtn2 v21.8h, v4.4s\n"
+ "sqxtn2 v20.8h, v1.4s\n"
+ "sqxtn2 v16.8h, v14.4s\n"
+ "sqadd v8.8h, v8.8h, v22.8h\n"
+ "sqadd v21.8h, v21.8h, v22.8h\n"
+ "sqadd v20.8h, v20.8h, v22.8h\n"
+ "sqadd v16.8h, v16.8h, v22.8h\n"
+ "smax v8.8h, v8.8h, v13.8h\n"
+ "smax v21.8h, v21.8h, v13.8h\n"
+ "smax v20.8h, v20.8h, v13.8h\n"
+ "smax v16.8h, v16.8h, v13.8h\n"
+ "smin v8.8h, v8.8h, v5.8h\n"
+ "smin v21.8h, v21.8h, v5.8h\n"
+ "smin v20.8h, v20.8h, v5.8h\n"
+ "smin v16.8h, v16.8h, v5.8h\n"
+ "uzp1 v8.16b, v8.16b, v8.16b\n"
+ "str d8, [x11, x16]\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "str d21, [x10, x16]\n"
+ "uzp1 v16.16b, v16.16b, v16.16b\n"
+ "str d20, [x9, x16]\n"
+ "str d16, [x28, x16]\n"
+ "ldr q8, [x20, #0x0]\n"
+ "ldr q2, [x20, #0x10]\n"
+ "add x20, x20, #0x20\n"
+ "ldr d12, [x14, #0x0]\n"
+ "ldr d11, [x14, #0x8]\n"
+ "add x16, x16, #0x8\n"
+ "str x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldr d25, [x14, #0x10]\n"
+ "ldr d24, [x14, #0x18]\n"
+ "mov v21.16b, v8.16b\n"
+ "mov v4.16b, v2.16b\n"
+ "ldr d23, [x14, #0x20]\n"
+ "ldr d7, [x14, #0x28]\n"
+ "mov v20.16b, v8.16b\n"
+ "mov v1.16b, v2.16b\n"
+ "ldr d3, [x14, #0x30]\n"
+ "ldr d9, [x14, #0x38]\n"
+ "mov v16.16b, v8.16b\n"
+ "mov v14.16b, v2.16b\n"
+ "ldr d30, [x14, #0x40]\n"
+ "ldp x27, x26, [x15, #0x0]\n"
+ "usubl v12.8h, v12.8b, v6.8b\n"
+ "usubl v11.8h, v11.8b, v6.8b\n"
+ "ldp x25, x24, [x15, #0x10]\n"
+ "ldp x23, x22, [x15, #0x20]\n"
+ "usubl v25.8h, v25.8b, v6.8b\n"
+ "usubl v24.8h, v24.8b, v6.8b\n"
+ "ldp x21, x20, [x15, #0x30]\n"
+ "ldr d26, [x27, x17]\n"
+ "usubl v23.8h, v23.8b, v6.8b\n"
+ "usubl v7.8h, v7.8b, v6.8b\n"
+ "ldr d18, [x26, x17]\n"
+ "ldr d10, [x25, x17]\n"
+ "usubl v3.8h, v3.8b, v6.8b\n"
+ "usubl v9.8h, v9.8b, v6.8b\n"
+ "ldr d27, [x24, x17]\n"
+ "ldr d17, [x23, x17]\n"
+ "usubl v30.8h, v30.8b, v6.8b\n"
+ "ushll v26.8h, v26.8b, #0x0\n"
+ "ldr d19, [x22, x17]\n"
+ "ldr d15, [x21, x17]\n"
+ "ushll v18.8h, v18.8b, #0x0\n"
+ "ushll v10.8h, v10.8b, #0x0\n"
+ "ldr d28, [x20, x17]\n"
+ "ushll v27.8h, v27.8b, #0x0\n"
+ "ushll v17.8h, v17.8b, #0x0\n"
+ "ushll v19.8h, v19.8b, #0x0\n"
+ "ushll v15.8h, v15.8b, #0x0\n"
+ "ushll v28.8h, v28.8b, #0x0\n"
+ "bgt 1b\n"
+ "2:" // Tail
+ "ldr q0, [x13, #0x0]\n"
+ "ldr q31, [x12, #0x0]\n"
+ "smlal v8.4s, v26.4h, v30.4h\n"
+ "smlal2 v2.4s, v26.8h, v30.8h\n"
+ "ldr q29, [x13, #0x10]\n"
+ "ldr x21, [x15, #0x58]\n"
+ "smlal v8.4s, v18.4h, v12.4h\n"
+ "smlal v21.4s, v26.4h, v3.4h\n"
+ "ldr x20, [x15, #0x78]\n"
+ "ldr x25, [x15, #0x60]\n"
+ "smlal v20.4s, v26.4h, v25.4h\n"
+ "smlal v16.4s, v26.4h, v12.4h\n"
+ "ldr x24, [x15, #0x80]\n"
+ "smlal2 v2.4s, v18.8h, v12.8h\n"
+ "ldr d18, [x21, x17]\n"
+ "ushll v18.8h, v18.8b, #0x0\n"
+ "smlal v8.4s, v10.4h, v11.4h\n"
+ "smlal2 v4.4s, v26.8h, v3.8h\n"
+ "ldr x23, [x15, #0x68]\n"
+ "ldr x22, [x15, #0x88]\n"
+ "smlal2 v1.4s, v26.8h, v25.8h\n"
+ "smlal2 v14.4s, v26.8h, v12.8h\n"
+ "ldr d26, [x20, x17]\n"
+ "ushll v26.8h, v26.8b, #0x0\n"
+ "smlal v21.4s, v27.4h, v11.4h\n"
+ "smlal v20.4s, v18.4h, v24.4h\n"
+ "ldr x21, [x15, #0x40]\n"
+ "ldr x20, [x15, #0x70]\n"
+ "smlal v16.4s, v26.4h, v23.4h\n"
+ "smlal2 v2.4s, v10.8h, v11.8h\n"
+ "ldr d10, [x25, x17]\n"
+ "ushll v10.8h, v10.8b, #0x0\n"
+ "smlal v8.4s, v19.4h, v24.4h\n"
+ "smlal2 v4.4s, v27.8h, v11.8h\n"
+ "ldr d27, [x24, x17]\n"
+ "ushll v27.8h, v27.8b, #0x0\n"
+ "smlal2 v1.4s, v18.8h, v24.8h\n"
+ "ldr d18, [x23, x17]\n"
+ "smlal2 v14.4s, v26.8h, v23.8h\n"
+ "ldr d26, [x22, x17]\n"
+ "ldr x24, [x15, #0x98]\n"
+ "smlal v21.4s, v17.4h, v25.4h\n"
+ "smlal v20.4s, v10.4h, v12.4h\n"
+ "ldr x23, [x15, #0x50]\n"
+ "smlal v16.4s, v27.4h, v11.4h\n"
+ "smlal2 v2.4s, v19.8h, v24.8h\n"
+ "ldr d19, [x21, x17]\n"
+ "ushll v18.8h, v18.8b, #0x0\n"
+ "ushll v26.8h, v26.8b, #0x0\n"
+ "smlal v8.4s, v15.4h, v23.4h\n"
+ "smlal2 v4.4s, v17.8h, v25.8h\n"
+ "ldr d17, [x20, x17]\n"
+ "ldr x22, [x15, #0x48]\n"
+ "smlal2 v1.4s, v10.8h, v12.8h\n"
+ "smlal2 v14.4s, v27.8h, v11.8h\n"
+ "ldr x21, [x15, #0x90]\n"
+ "ldr x20, [x15, #0xa8]\n"
+ "smlal v21.4s, v28.4h, v12.4h\n"
+ "smlal v20.4s, v18.4h, v23.4h\n"
+ "ushll v19.8h, v19.8b, #0x0\n"
+ "smlal v16.4s, v26.4h, v7.4h\n"
+ "smlal2 v2.4s, v15.8h, v23.8h\n"
+ "ldr d15, [x24, x17]\n"
+ "ushll v17.8h, v17.8b, #0x0\n"
+ "ushll v15.8h, v15.8b, #0x0\n"
+ "smlal v8.4s, v28.4h, v25.4h\n"
+ "smlal2 v4.4s, v28.8h, v12.8h\n"
+ "ldr d12, [x23, x17]\n"
+ "ushll v12.8h, v12.8b, #0x0\n"
+ "smlal2 v1.4s, v18.8h, v23.8h\n"
+ "ldr d18, [x22, x17]\n"
+ "smlal2 v14.4s, v26.8h, v7.8h\n"
+ "ldr d26, [x21, x17]\n"
+ "ldr x22, [x15, #0xa0]\n"
+ "smlal v21.4s, v19.4h, v23.4h\n"
+ "smlal v20.4s, v17.4h, v11.4h\n"
+ "ldr x21, [x15, #0xb0]\n"
+ "smlal v16.4s, v15.4h, v25.4h\n"
+ "smlal2 v2.4s, v28.8h, v25.8h\n"
+ "ldr d28, [x20, x17]\n"
+ "ushll v18.8h, v18.8b, #0x0\n"
+ "ushll v26.8h, v26.8b, #0x0\n"
+ "smlal v8.4s, v12.4h, v7.4h\n"
+ "ldr x20, [x15, #0xb8]\n"
+ "ushll v28.8h, v28.8b, #0x0\n"
+ "smlal2 v4.4s, v19.8h, v23.8h\n"
+ "ldr d23, [x22, x17]\n"
+ "ldr d19, [x21, x17]\n"
+ "smlal2 v1.4s, v17.8h, v11.8h\n"
+ "ldr d11, [x20, x17]\n"
+ "smlal2 v14.4s, v15.8h, v25.8h\n"
+ "ldr q25, [x12, #0x10]\n"
+ "smlal v21.4s, v18.4h, v7.4h\n"
+ "smlal v20.4s, v26.4h, v3.4h\n"
+ "ushll v23.8h, v23.8b, #0x0\n"
+ "ldr x20, [x15, #0xc0]\n"
+ "smlal v16.4s, v28.4h, v24.4h\n"
+ "smlal2 v2.4s, v12.8h, v7.8h\n"
+ "ushll v19.8h, v19.8b, #0x0\n"
+ "tst x7, #0x7\n"
+ "smlal v8.4s, v10.4h, v3.4h\n"
+ "smlal2 v4.4s, v18.8h, v7.8h\n"
+ "ldr d18, [x20, x17]\n"
+ "ushll v11.8h, v11.8b, #0x0\n"
+ "smlal2 v1.4s, v26.8h, v3.8h\n"
+ "smlal2 v14.4s, v28.8h, v24.8h\n"
+ "ushll v18.8h, v18.8b, #0x0\n"
+ "add x17, x17, #0x8\n"
+ "smlal v21.4s, v12.4h, v24.4h\n"
+ "smlal v20.4s, v23.4h, v9.4h\n"
+ "add x13, x13, #0x20\n"
+ "add x12, x12, #0x20\n"
+ "smlal v16.4s, v19.4h, v9.4h\n"
+ "smlal2 v2.4s, v10.8h, v3.8h\n"
+ "smlal v8.4s, v17.4h, v9.4h\n"
+ "smlal2 v4.4s, v12.8h, v24.8h\n"
+ "sqrdmulh v8.4s, v8.4s, v0.4s\n"
+ "smlal2 v1.4s, v23.8h, v9.8h\n"
+ "smlal2 v14.4s, v19.8h, v9.8h\n"
+ "and v23.16b, v8.16b, v31.16b\n"
+ "smlal v21.4s, v27.4h, v9.4h\n"
+ "smlal v20.4s, v28.4h, v7.4h\n"
+ "sshr v23.4s, v23.4s, #0x1f\n"
+ "smlal v16.4s, v11.4h, v3.4h\n"
+ "smlal2 v2.4s, v17.8h, v9.8h\n"
+ "sqrdmulh v2.4s, v2.4s, v29.4s\n"
+ "smlal2 v4.4s, v27.8h, v9.8h\n"
+ "smlal2 v1.4s, v28.8h, v7.8h\n"
+ "and v7.16b, v2.16b, v25.16b\n"
+ "smlal2 v14.4s, v11.8h, v3.8h\n"
+ "smlal v21.4s, v15.4h, v30.4h\n"
+ "sqrdmulh v21.4s, v21.4s, v0.4s\n"
+ "smlal v20.4s, v11.4h, v30.4h\n"
+ "smlal v16.4s, v18.4h, v30.4h\n"
+ "sqrdmulh v20.4s, v20.4s, v0.4s\n"
+ "smlal2 v4.4s, v15.8h, v30.8h\n"
+ "smlal2 v1.4s, v11.8h, v30.8h\n"
+ "sqrdmulh v16.4s, v16.4s, v0.4s\n"
+ "smlal2 v14.4s, v18.8h, v30.8h\n"
+ "sqadd v8.4s, v8.4s, v23.4s\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
+ "and v23.16b, v21.16b, v31.16b\n"
+ "sqrdmulh v4.4s, v4.4s, v29.4s\n"
+ "and v24.16b, v20.16b, v31.16b\n"
+ "sqrdmulh v1.4s, v1.4s, v29.4s\n"
+ "and v19.16b, v16.16b, v31.16b\n"
+ "sqrdmulh v14.4s, v14.4s, v29.4s\n"
+ "sqadd v2.4s, v2.4s, v7.4s\n"
+ "sshr v23.4s, v23.4s, #0x1f\n"
+ "and v18.16b, v4.16b, v25.16b\n"
+ "sshr v24.4s, v24.4s, #0x1f\n"
+ "and v17.16b, v1.16b, v25.16b\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "and v15.16b, v14.16b, v25.16b\n"
+ "sqadd v21.4s, v21.4s, v23.4s\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sqadd v20.4s, v20.4s, v24.4s\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sqadd v16.4s, v16.4s, v19.4s\n"
+ "sshr v15.4s, v15.4s, #0x1f\n"
+ "srshl v8.4s, v8.4s, v31.4s\n"
+ "srshl v21.4s, v21.4s, v31.4s\n"
+ "sqadd v4.4s, v4.4s, v18.4s\n"
+ "srshl v20.4s, v20.4s, v31.4s\n"
+ "sqadd v1.4s, v1.4s, v17.4s\n"
+ "srshl v16.4s, v16.4s, v31.4s\n"
+ "sqadd v14.4s, v14.4s, v15.4s\n"
+ "srshl v2.4s, v2.4s, v25.4s\n"
+ "sqxtn v8.4h, v8.4s\n"
+ "srshl v4.4s, v4.4s, v25.4s\n"
+ "sqxtn v21.4h, v21.4s\n"
+ "srshl v1.4s, v1.4s, v25.4s\n"
+ "sqxtn v20.4h, v20.4s\n"
+ "srshl v14.4s, v14.4s, v25.4s\n"
+ "sqxtn v16.4h, v16.4s\n"
+ "sqxtn2 v8.8h, v2.4s\n"
+ "sqxtn2 v21.8h, v4.4s\n"
+ "sqxtn2 v20.8h, v1.4s\n"
+ "sqxtn2 v16.8h, v14.4s\n"
+ "sqadd v8.8h, v8.8h, v22.8h\n"
+ "sqadd v21.8h, v21.8h, v22.8h\n"
+ "sqadd v20.8h, v20.8h, v22.8h\n"
+ "sqadd v16.8h, v16.8h, v22.8h\n"
+ "smax v8.8h, v8.8h, v13.8h\n"
+ "smax v21.8h, v21.8h, v13.8h\n"
+ "smax v20.8h, v20.8h, v13.8h\n"
+ "smax v16.8h, v16.8h, v13.8h\n"
+ "smin v8.8h, v8.8h, v5.8h\n"
+ "smin v21.8h, v21.8h, v5.8h\n"
+ "smin v20.8h, v20.8h, v5.8h\n"
+ "smin v16.8h, v16.8h, v5.8h\n"
+ "uzp1 v8.16b, v8.16b, v8.16b\n"
+ "str d8, [x11, x16]\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "str d21, [x10, x16]\n"
+ "uzp1 v16.16b, v16.16b, v16.16b\n"
+ "str d20, [x9, x16]\n"
+ "str d16, [x28, x16]\n"
+ "add x16, x16, #0x8\n"
+ "beq 88f\n"
+ "add x14, x14, #0x48\n"
+ "3:" // Oddments
+ "ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "tbz x7, #2, 5f\n"
+ "ld1 { v8.4s }, [x20], #0x10\n"
+ "tbz x7, #1, 4f\n"
+ "ld1 { v2.d }[0], [x20], #0x8\n"
+ "tbz x7, #0, 7f\n"
+ "ld1 { v2.s }[2], [x20]\n"
+ "b 7f\n"
+ "4:" // Oddments: Load bias: Bit 2: Bit 1: Unset
+ "tbz x7, #0, 7f\n"
+ "ld1 { v2.s }[0], [x20]\n"
+ "b 7f\n"
+ "5:" // Oddments: Load bias: Bit 2: Unset
+ "tbz x7, #1, 6f\n"
+ "ld1 { v8.d }[0], [x20], #0x8\n"
+ "tbz x7, #0, 7f\n"
+ "ld1 { v8.s }[2], [x20]\n"
+ "b 7f\n"
+ "6:" // Oddments: Load bias: Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 7f\n"
+ "ld1 { v8.s }[0], [x20]\n"
+ "7:" // Oddments: Load bias: Bit 2: End
+ "ldr d12, [x14, #0x0]\n"
+ "ldr d11, [x14, #0x8]\n"
+ "mov v21.16b, v8.16b\n"
+ "mov v4.16b, v2.16b\n"
+ "ldr d25, [x14, #0x10]\n"
+ "ldr d24, [x14, #0x18]\n"
+ "mov v20.16b, v8.16b\n"
+ "mov v1.16b, v2.16b\n"
+ "ldr d23, [x14, #0x20]\n"
+ "ldr d7, [x14, #0x28]\n"
+ "mov v16.16b, v8.16b\n"
+ "mov v14.16b, v2.16b\n"
+ "ldr d3, [x14, #0x30]\n"
+ "ldr d9, [x14, #0x38]\n"
+ "usubl v12.8h, v12.8b, v6.8b\n"
+ "usubl v11.8h, v11.8b, v6.8b\n"
+ "ldr d30, [x14, #0x40]\n"
+ "ldp x27, x26, [x15, #0x0]\n"
+ "usubl v25.8h, v25.8b, v6.8b\n"
+ "usubl v24.8h, v24.8b, v6.8b\n"
+ "ldp x25, x24, [x15, #0x10]\n"
+ "ldp x23, x22, [x15, #0x20]\n"
+ "usubl v23.8h, v23.8b, v6.8b\n"
+ "usubl v7.8h, v7.8b, v6.8b\n"
+ "ldp x21, x20, [x15, #0x30]\n"
+ "usubl v3.8h, v3.8b, v6.8b\n"
+ "usubl v9.8h, v9.8b, v6.8b\n"
+ "usubl v30.8h, v30.8b, v6.8b\n"
+ "add x27, x27, x17\n"
+ "add x26, x26, x17\n"
+ "add x25, x25, x17\n"
+ "add x24, x24, x17\n"
+ "add x23, x23, x17\n"
+ "add x22, x22, x17\n"
+ "add x21, x21, x17\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 9f\n"
+ "ld1 { v26.s }[0], [x27], #0x4\n"
+ "ld1 { v18.s }[0], [x26], #0x4\n"
+ "ld1 { v10.s }[0], [x25], #0x4\n"
+ "ld1 { v27.s }[0], [x24], #0x4\n"
+ "ld1 { v17.s }[0], [x23], #0x4\n"
+ "ld1 { v19.s }[0], [x22], #0x4\n"
+ "ld1 { v15.s }[0], [x21], #0x4\n"
+ "ld1 { v28.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 8f\n"
+ "ld1 { v26.h }[2], [x27], #0x2\n"
+ "ld1 { v18.h }[2], [x26], #0x2\n"
+ "ld1 { v10.h }[2], [x25], #0x2\n"
+ "ld1 { v27.h }[2], [x24], #0x2\n"
+ "ld1 { v17.h }[2], [x23], #0x2\n"
+ "ld1 { v19.h }[2], [x22], #0x2\n"
+ "ld1 { v15.h }[2], [x21], #0x2\n"
+ "ld1 { v28.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 11f\n"
+ "ld1 { v26.b }[6], [x27]\n"
+ "ld1 { v18.b }[6], [x26]\n"
+ "ld1 { v10.b }[6], [x25]\n"
+ "ld1 { v27.b }[6], [x24]\n"
+ "ld1 { v17.b }[6], [x23]\n"
+ "ld1 { v19.b }[6], [x22]\n"
+ "ld1 { v15.b }[6], [x21]\n"
+ "ld1 { v28.b }[6], [x20]\n"
+ "b 11f\n"
+ "8:" // Oddments: Initial loads: Bit 2: Bit 1: Unset
+ "tbz x7, #0, 11f\n"
+ "ld1 { v26.b }[4], [x27]\n"
+ "ld1 { v18.b }[4], [x26]\n"
+ "ld1 { v10.b }[4], [x25]\n"
+ "ld1 { v27.b }[4], [x24]\n"
+ "ld1 { v17.b }[4], [x23]\n"
+ "ld1 { v19.b }[4], [x22]\n"
+ "ld1 { v15.b }[4], [x21]\n"
+ "ld1 { v28.b }[4], [x20]\n"
+ "b 11f\n"
+ "9:" // Oddments: Initial loads: Bit 2: Unset
+ "tbz x7, #1, 10f\n"
+ "ld1 { v26.h }[0], [x27], #0x2\n"
+ "ld1 { v18.h }[0], [x26], #0x2\n"
+ "ld1 { v10.h }[0], [x25], #0x2\n"
+ "ld1 { v27.h }[0], [x24], #0x2\n"
+ "ld1 { v17.h }[0], [x23], #0x2\n"
+ "ld1 { v19.h }[0], [x22], #0x2\n"
+ "ld1 { v15.h }[0], [x21], #0x2\n"
+ "ld1 { v28.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 11f\n"
+ "ld1 { v26.b }[2], [x27]\n"
+ "ld1 { v18.b }[2], [x26]\n"
+ "ld1 { v10.b }[2], [x25]\n"
+ "ld1 { v27.b }[2], [x24]\n"
+ "ld1 { v17.b }[2], [x23]\n"
+ "ld1 { v19.b }[2], [x22]\n"
+ "ld1 { v15.b }[2], [x21]\n"
+ "ld1 { v28.b }[2], [x20]\n"
+ "b 11f\n"
+ "10:" // Oddments: Initial loads: Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 11f\n"
+ "ld1 { v26.b }[0], [x27]\n"
+ "ld1 { v18.b }[0], [x26]\n"
+ "ld1 { v10.b }[0], [x25]\n"
+ "ld1 { v27.b }[0], [x24]\n"
+ "ld1 { v17.b }[0], [x23]\n"
+ "ld1 { v19.b }[0], [x22]\n"
+ "ld1 { v15.b }[0], [x21]\n"
+ "ld1 { v28.b }[0], [x20]\n"
+ "11:" // Oddments: Initial loads: Bit 2: End
+ "ushll v26.8h, v26.8b, #0x0\n"
+ "smlal v8.4s, v26.4h, v30.4h\n"
+ "smlal2 v2.4s, v26.8h, v30.8h\n"
+ "ldr x20, [x15, #0x40]\n"
+ "ushll v18.8h, v18.8b, #0x0\n"
+ "smlal v8.4s, v18.4h, v12.4h\n"
+ "smlal2 v2.4s, v18.8h, v12.8h\n"
+ "ushll v10.8h, v10.8b, #0x0\n"
+ "smlal v21.4s, v26.4h, v3.4h\n"
+ "smlal2 v4.4s, v26.8h, v3.8h\n"
+ "add x20, x20, x17\n"
+ "smlal v8.4s, v10.4h, v11.4h\n"
+ "smlal2 v2.4s, v10.8h, v11.8h\n"
+ "ushll v27.8h, v27.8b, #0x0\n"
+ "ushll v19.8h, v19.8b, #0x0\n"
+ "smlal v21.4s, v27.4h, v11.4h\n"
+ "smlal2 v4.4s, v27.8h, v11.8h\n"
+ "smlal v8.4s, v19.4h, v24.4h\n"
+ "smlal2 v2.4s, v19.8h, v24.8h\n"
+ "ushll v17.8h, v17.8b, #0x0\n"
+ "ushll v15.8h, v15.8b, #0x0\n"
+ "smlal v21.4s, v17.4h, v25.4h\n"
+ "smlal2 v4.4s, v17.8h, v25.8h\n"
+ "smlal v8.4s, v15.4h, v23.4h\n"
+ "smlal2 v2.4s, v15.8h, v23.8h\n"
+ "ushll v28.8h, v28.8b, #0x0\n"
+ "smlal v20.4s, v26.4h, v25.4h\n"
+ "smlal2 v1.4s, v26.8h, v25.8h\n"
+ "smlal v16.4s, v26.4h, v12.4h\n"
+ "smlal2 v14.4s, v26.8h, v12.8h\n"
+ "smlal v8.4s, v28.4h, v25.4h\n"
+ "smlal2 v2.4s, v28.8h, v25.8h\n"
+ "smlal v21.4s, v28.4h, v12.4h\n"
+ "smlal2 v4.4s, v28.8h, v12.8h\n"
+ "tbz x7, #2, 13f\n"
+ "ld1 { v31.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 12f\n"
+ "ld1 { v31.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 15f\n"
+ "ld1 { v31.b }[6], [x20]\n"
+ "b 15f\n"
+ "12:" // Oddments: Load (1, 3): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 15f\n"
+ "ld1 { v31.b }[4], [x20]\n"
+ "b 15f\n"
+ "13:" // Oddments: Load (1, 3): Bit 2: Unset
+ "tbz x7, #1, 14f\n"
+ "ld1 { v31.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 15f\n"
+ "ld1 { v31.b }[2], [x20]\n"
+ "b 15f\n"
+ "14:" // Oddments: Load (1, 3): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 15f\n"
+ "ld1 { v31.b }[0], [x20]\n"
+ "15:" // Oddments: Load (1, 3): Bit 2: End
+ "ushll v31.8h, v31.8b, #0x0\n"
+ "ldr x20, [x15, #0x48]\n"
+ "smlal v21.4s, v31.4h, v23.4h\n"
+ "smlal2 v4.4s, v31.8h, v23.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 17f\n"
+ "ld1 { v28.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 16f\n"
+ "ld1 { v28.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 19f\n"
+ "ld1 { v28.b }[6], [x20]\n"
+ "b 19f\n"
+ "16:" // Oddments: Load (1, 4): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 19f\n"
+ "ld1 { v28.b }[4], [x20]\n"
+ "b 19f\n"
+ "17:" // Oddments: Load (1, 4): Bit 2: Unset
+ "tbz x7, #1, 18f\n"
+ "ld1 { v28.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 19f\n"
+ "ld1 { v28.b }[2], [x20]\n"
+ "b 19f\n"
+ "18:" // Oddments: Load (1, 4): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 19f\n"
+ "ld1 { v28.b }[0], [x20]\n"
+ "19:" // Oddments: Load (1, 4): Bit 2: End
+ "ushll v28.8h, v28.8b, #0x0\n"
+ "ldr x20, [x15, #0x50]\n"
+ "smlal v21.4s, v28.4h, v7.4h\n"
+ "smlal2 v4.4s, v28.8h, v7.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 21f\n"
+ "ld1 { v27.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 20f\n"
+ "ld1 { v27.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 23f\n"
+ "ld1 { v27.b }[6], [x20]\n"
+ "b 23f\n"
+ "20:" // Oddments: Load (1, 2): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 23f\n"
+ "ld1 { v27.b }[4], [x20]\n"
+ "b 23f\n"
+ "21:" // Oddments: Load (1, 2): Bit 2: Unset
+ "tbz x7, #1, 22f\n"
+ "ld1 { v27.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 23f\n"
+ "ld1 { v27.b }[2], [x20]\n"
+ "b 23f\n"
+ "22:" // Oddments: Load (1, 2): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 23f\n"
+ "ld1 { v27.b }[0], [x20]\n"
+ "23:" // Oddments: Load (1, 2): Bit 2: End
+ "ushll v27.8h, v27.8b, #0x0\n"
+ "ldr x20, [x15, #0x58]\n"
+ "smlal v8.4s, v27.4h, v7.4h\n"
+ "smlal2 v2.4s, v27.8h, v7.8h\n"
+ "smlal v21.4s, v27.4h, v24.4h\n"
+ "smlal2 v4.4s, v27.8h, v24.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 25f\n"
+ "ld1 { v0.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 24f\n"
+ "ld1 { v0.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 27f\n"
+ "ld1 { v0.b }[6], [x20]\n"
+ "b 27f\n"
+ "24:" // Oddments: Load (3, 0): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 27f\n"
+ "ld1 { v0.b }[4], [x20]\n"
+ "b 27f\n"
+ "25:" // Oddments: Load (3, 0): Bit 2: Unset
+ "tbz x7, #1, 26f\n"
+ "ld1 { v0.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 27f\n"
+ "ld1 { v0.b }[2], [x20]\n"
+ "b 27f\n"
+ "26:" // Oddments: Load (3, 0): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 27f\n"
+ "ld1 { v0.b }[0], [x20]\n"
+ "27:" // Oddments: Load (3, 0): Bit 2: End
+ "ushll v0.8h, v0.8b, #0x0\n"
+ "ldr x20, [x15, #0x60]\n"
+ "smlal v20.4s, v0.4h, v24.4h\n"
+ "smlal2 v1.4s, v0.8h, v24.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 29f\n"
+ "ld1 { v15.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 28f\n"
+ "ld1 { v15.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 31f\n"
+ "ld1 { v15.b }[6], [x20]\n"
+ "b 31f\n"
+ "28:" // Oddments: Load (2, 0): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 31f\n"
+ "ld1 { v15.b }[4], [x20]\n"
+ "b 31f\n"
+ "29:" // Oddments: Load (2, 0): Bit 2: Unset
+ "tbz x7, #1, 30f\n"
+ "ld1 { v15.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 31f\n"
+ "ld1 { v15.b }[2], [x20]\n"
+ "b 31f\n"
+ "30:" // Oddments: Load (2, 0): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 31f\n"
+ "ld1 { v15.b }[0], [x20]\n"
+ "31:" // Oddments: Load (2, 0): Bit 2: End
+ "ushll v15.8h, v15.8b, #0x0\n"
+ "ldr x20, [x15, #0x68]\n"
+ "smlal v8.4s, v15.4h, v3.4h\n"
+ "smlal2 v2.4s, v15.8h, v3.8h\n"
+ "smlal v20.4s, v15.4h, v12.4h\n"
+ "smlal2 v1.4s, v15.8h, v12.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 33f\n"
+ "ld1 { v0.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 32f\n"
+ "ld1 { v0.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 35f\n"
+ "ld1 { v0.b }[6], [x20]\n"
+ "b 35f\n"
+ "32:" // Oddments: Load (3, 1): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 35f\n"
+ "ld1 { v0.b }[4], [x20]\n"
+ "b 35f\n"
+ "33:" // Oddments: Load (3, 1): Bit 2: Unset
+ "tbz x7, #1, 34f\n"
+ "ld1 { v0.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 35f\n"
+ "ld1 { v0.b }[2], [x20]\n"
+ "b 35f\n"
+ "34:" // Oddments: Load (3, 1): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 35f\n"
+ "ld1 { v0.b }[0], [x20]\n"
+ "35:" // Oddments: Load (3, 1): Bit 2: End
+ "ushll v0.8h, v0.8b, #0x0\n"
+ "ldr x20, [x15, #0x70]\n"
+ "smlal v20.4s, v0.4h, v23.4h\n"
+ "smlal2 v1.4s, v0.8h, v23.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 37f\n"
+ "ld1 { v6.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 36f\n"
+ "ld1 { v6.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 39f\n"
+ "ld1 { v6.b }[6], [x20]\n"
+ "b 39f\n"
+ "36:" // Oddments: Load (2, 1): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 39f\n"
+ "ld1 { v6.b }[4], [x20]\n"
+ "b 39f\n"
+ "37:" // Oddments: Load (2, 1): Bit 2: Unset
+ "tbz x7, #1, 38f\n"
+ "ld1 { v6.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 39f\n"
+ "ld1 { v6.b }[2], [x20]\n"
+ "b 39f\n"
+ "38:" // Oddments: Load (2, 1): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 39f\n"
+ "ld1 { v6.b }[0], [x20]\n"
+ "39:" // Oddments: Load (2, 1): Bit 2: End
+ "ushll v6.8h, v6.8b, #0x0\n"
+ "ldr x20, [x15, #0x78]\n"
+ "smlal v8.4s, v6.4h, v9.4h\n"
+ "smlal2 v2.4s, v6.8h, v9.8h\n"
+ "smlal v20.4s, v6.4h, v11.4h\n"
+ "smlal2 v1.4s, v6.8h, v11.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 41f\n"
+ "ld1 { v27.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 40f\n"
+ "ld1 { v27.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 43f\n"
+ "ld1 { v27.b }[6], [x20]\n"
+ "b 43f\n"
+ "40:" // Oddments: Load (3, 3): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 43f\n"
+ "ld1 { v27.b }[4], [x20]\n"
+ "b 43f\n"
+ "41:" // Oddments: Load (3, 3): Bit 2: Unset
+ "tbz x7, #1, 42f\n"
+ "ld1 { v27.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 43f\n"
+ "ld1 { v27.b }[2], [x20]\n"
+ "b 43f\n"
+ "42:" // Oddments: Load (3, 3): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 43f\n"
+ "ld1 { v27.b }[0], [x20]\n"
+ "43:" // Oddments: Load (3, 3): Bit 2: End
+ "ushll v27.8h, v27.8b, #0x0\n"
+ "ldr x20, [x15, #0x80]\n"
+ "smlal v16.4s, v27.4h, v23.4h\n"
+ "smlal2 v14.4s, v27.8h, v23.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 45f\n"
+ "ld1 { v10.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 44f\n"
+ "ld1 { v10.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 47f\n"
+ "ld1 { v10.b }[6], [x20]\n"
+ "b 47f\n"
+ "44:" // Oddments: Load (2, 3): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 47f\n"
+ "ld1 { v10.b }[4], [x20]\n"
+ "b 47f\n"
+ "45:" // Oddments: Load (2, 3): Bit 2: Unset
+ "tbz x7, #1, 46f\n"
+ "ld1 { v10.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 47f\n"
+ "ld1 { v10.b }[2], [x20]\n"
+ "b 47f\n"
+ "46:" // Oddments: Load (2, 3): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 47f\n"
+ "ld1 { v10.b }[0], [x20]\n"
+ "47:" // Oddments: Load (2, 3): Bit 2: End
+ "ushll v10.8h, v10.8b, #0x0\n"
+ "ldr x20, [x15, #0x88]\n"
+ "smlal v21.4s, v10.4h, v9.4h\n"
+ "smlal2 v4.4s, v10.8h, v9.8h\n"
+ "smlal v16.4s, v10.4h, v11.4h\n"
+ "smlal2 v14.4s, v10.8h, v11.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 49f\n"
+ "ld1 { v28.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 48f\n"
+ "ld1 { v28.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 51f\n"
+ "ld1 { v28.b }[6], [x20]\n"
+ "b 51f\n"
+ "48:" // Oddments: Load (3, 4): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 51f\n"
+ "ld1 { v28.b }[4], [x20]\n"
+ "b 51f\n"
+ "49:" // Oddments: Load (3, 4): Bit 2: Unset
+ "tbz x7, #1, 50f\n"
+ "ld1 { v28.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 51f\n"
+ "ld1 { v28.b }[2], [x20]\n"
+ "b 51f\n"
+ "50:" // Oddments: Load (3, 4): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 51f\n"
+ "ld1 { v28.b }[0], [x20]\n"
+ "51:" // Oddments: Load (3, 4): Bit 2: End
+ "ushll v28.8h, v28.8b, #0x0\n"
+ "ldr x20, [x15, #0x90]\n"
+ "smlal v16.4s, v28.4h, v7.4h\n"
+ "smlal2 v14.4s, v28.8h, v7.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 53f\n"
+ "ld1 { v15.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 52f\n"
+ "ld1 { v15.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 55f\n"
+ "ld1 { v15.b }[6], [x20]\n"
+ "b 55f\n"
+ "52:" // Oddments: Load (4, 0): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 55f\n"
+ "ld1 { v15.b }[4], [x20]\n"
+ "b 55f\n"
+ "53:" // Oddments: Load (4, 0): Bit 2: Unset
+ "tbz x7, #1, 54f\n"
+ "ld1 { v15.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 55f\n"
+ "ld1 { v15.b }[2], [x20]\n"
+ "b 55f\n"
+ "54:" // Oddments: Load (4, 0): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 55f\n"
+ "ld1 { v15.b }[0], [x20]\n"
+ "55:" // Oddments: Load (4, 0): Bit 2: End
+ "ushll v15.8h, v15.8b, #0x0\n"
+ "ldr x20, [x15, #0x98]\n"
+ "smlal v20.4s, v15.4h, v3.4h\n"
+ "smlal2 v1.4s, v15.8h, v3.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 57f\n"
+ "ld1 { v6.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 56f\n"
+ "ld1 { v6.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 59f\n"
+ "ld1 { v6.b }[6], [x20]\n"
+ "b 59f\n"
+ "56:" // Oddments: Load (2, 4): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 59f\n"
+ "ld1 { v6.b }[4], [x20]\n"
+ "b 59f\n"
+ "57:" // Oddments: Load (2, 4): Bit 2: Unset
+ "tbz x7, #1, 58f\n"
+ "ld1 { v6.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 59f\n"
+ "ld1 { v6.b }[2], [x20]\n"
+ "b 59f\n"
+ "58:" // Oddments: Load (2, 4): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 59f\n"
+ "ld1 { v6.b }[0], [x20]\n"
+ "59:" // Oddments: Load (2, 4): Bit 2: End
+ "ushll v6.8h, v6.8b, #0x0\n"
+ "ldr x20, [x15, #0xa0]\n"
+ "smlal v21.4s, v6.4h, v30.4h\n"
+ "smlal2 v4.4s, v6.8h, v30.8h\n"
+ "smlal v16.4s, v6.4h, v25.4h\n"
+ "smlal2 v14.4s, v6.8h, v25.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 61f\n"
+ "ld1 { v23.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 60f\n"
+ "ld1 { v23.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 63f\n"
+ "ld1 { v23.b }[6], [x20]\n"
+ "b 63f\n"
+ "60:" // Oddments: Load (4, 1): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 63f\n"
+ "ld1 { v23.b }[4], [x20]\n"
+ "b 63f\n"
+ "61:" // Oddments: Load (4, 1): Bit 2: Unset
+ "tbz x7, #1, 62f\n"
+ "ld1 { v23.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 63f\n"
+ "ld1 { v23.b }[2], [x20]\n"
+ "b 63f\n"
+ "62:" // Oddments: Load (4, 1): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 63f\n"
+ "ld1 { v23.b }[0], [x20]\n"
+ "63:" // Oddments: Load (4, 1): Bit 2: End
+ "ushll v23.8h, v23.8b, #0x0\n"
+ "ldr x20, [x15, #0xa8]\n"
+ "smlal v20.4s, v23.4h, v9.4h\n"
+ "smlal2 v1.4s, v23.8h, v9.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 65f\n"
+ "ld1 { v12.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 64f\n"
+ "ld1 { v12.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 67f\n"
+ "ld1 { v12.b }[6], [x20]\n"
+ "b 67f\n"
+ "64:" // Oddments: Load (3, 2): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 67f\n"
+ "ld1 { v12.b }[4], [x20]\n"
+ "b 67f\n"
+ "65:" // Oddments: Load (3, 2): Bit 2: Unset
+ "tbz x7, #1, 66f\n"
+ "ld1 { v12.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 67f\n"
+ "ld1 { v12.b }[2], [x20]\n"
+ "b 67f\n"
+ "66:" // Oddments: Load (3, 2): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 67f\n"
+ "ld1 { v12.b }[0], [x20]\n"
+ "67:" // Oddments: Load (3, 2): Bit 2: End
+ "ushll v12.8h, v12.8b, #0x0\n"
+ "ldr x20, [x15, #0xb0]\n"
+ "smlal v20.4s, v12.4h, v7.4h\n"
+ "smlal2 v1.4s, v12.8h, v7.8h\n"
+ "smlal v16.4s, v12.4h, v24.4h\n"
+ "smlal2 v14.4s, v12.8h, v24.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 69f\n"
+ "ld1 { v10.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 68f\n"
+ "ld1 { v10.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 71f\n"
+ "ld1 { v10.b }[6], [x20]\n"
+ "b 71f\n"
+ "68:" // Oddments: Load (4, 3): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 71f\n"
+ "ld1 { v10.b }[4], [x20]\n"
+ "b 71f\n"
+ "69:" // Oddments: Load (4, 3): Bit 2: Unset
+ "tbz x7, #1, 70f\n"
+ "ld1 { v10.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 71f\n"
+ "ld1 { v10.b }[2], [x20]\n"
+ "b 71f\n"
+ "70:" // Oddments: Load (4, 3): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 71f\n"
+ "ld1 { v10.b }[0], [x20]\n"
+ "71:" // Oddments: Load (4, 3): Bit 2: End
+ "ushll v10.8h, v10.8b, #0x0\n"
+ "ldr x20, [x15, #0xb8]\n"
+ "smlal v16.4s, v10.4h, v9.4h\n"
+ "smlal2 v14.4s, v10.8h, v9.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 73f\n"
+ "ld1 { v15.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 72f\n"
+ "ld1 { v15.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 75f\n"
+ "ld1 { v15.b }[6], [x20]\n"
+ "b 75f\n"
+ "72:" // Oddments: Load (4, 2): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 75f\n"
+ "ld1 { v15.b }[4], [x20]\n"
+ "b 75f\n"
+ "73:" // Oddments: Load (4, 2): Bit 2: Unset
+ "tbz x7, #1, 74f\n"
+ "ld1 { v15.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 75f\n"
+ "ld1 { v15.b }[2], [x20]\n"
+ "b 75f\n"
+ "74:" // Oddments: Load (4, 2): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 75f\n"
+ "ld1 { v15.b }[0], [x20]\n"
+ "75:" // Oddments: Load (4, 2): Bit 2: End
+ "ushll v15.8h, v15.8b, #0x0\n"
+ "ldr x20, [x15, #0xc0]\n"
+ "smlal v20.4s, v15.4h, v30.4h\n"
+ "smlal2 v1.4s, v15.8h, v30.8h\n"
+ "smlal v16.4s, v15.4h, v3.4h\n"
+ "smlal2 v14.4s, v15.8h, v3.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 77f\n"
+ "ld1 { v28.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 76f\n"
+ "ld1 { v28.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 79f\n"
+ "ld1 { v28.b }[6], [x20]\n"
+ "b 79f\n"
+ "76:" // Oddments: Load (4, 4): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 79f\n"
+ "ld1 { v28.b }[4], [x20]\n"
+ "b 79f\n"
+ "77:" // Oddments: Load (4, 4): Bit 2: Unset
+ "tbz x7, #1, 78f\n"
+ "ld1 { v28.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 79f\n"
+ "ld1 { v28.b }[2], [x20]\n"
+ "b 79f\n"
+ "78:" // Oddments: Load (4, 4): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 79f\n"
+ "ld1 { v28.b }[0], [x20]\n"
+ "79:" // Oddments: Load (4, 4): Bit 2: End
+ "ushll v28.8h, v28.8b, #0x0\n"
+ "smlal v16.4s, v28.4h, v30.4h\n"
+ "smlal2 v14.4s, v28.8h, v30.8h\n"
+ "tbz x7, #2, 81f\n"
+ "ld1 { v19.4s }, [x13], #0x10\n"
+ "ld1 { v23.4s }, [x12], #0x10\n"
+ "tbz x7, #1, 80f\n"
+ "ld1 { v18.d }[0], [x13], #0x8\n"
+ "ld1 { v24.d }[0], [x12], #0x8\n"
+ "tbz x7, #0, 83f\n"
+ "ld1 { v18.s }[2], [x13]\n"
+ "ld1 { v24.s }[2], [x12]\n"
+ "b 83f\n"
+ "80:" // Oddments: Load requant params: Bit 2: Bit 1: Unset
+ "tbz x7, #0, 83f\n"
+ "ld1 { v18.s }[0], [x13]\n"
+ "ld1 { v24.s }[0], [x12]\n"
+ "b 83f\n"
+ "81:" // Oddments: Load requant params: Bit 2: Unset
+ "tbz x7, #1, 82f\n"
+ "ld1 { v19.d }[0], [x13], #0x8\n"
+ "ld1 { v23.d }[0], [x12], #0x8\n"
+ "tbz x7, #0, 83f\n"
+ "ld1 { v19.s }[2], [x13]\n"
+ "ld1 { v23.s }[2], [x12]\n"
+ "b 83f\n"
+ "82:" // Oddments: Load requant params: Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 83f\n"
+ "ld1 { v19.s }[0], [x13]\n"
+ "ld1 { v23.s }[0], [x12]\n"
+ "83:" // Oddments: Load requant params: Bit 2: End
+ "sqrdmulh v8.4s, v8.4s, v19.4s\n"
+ "and v17.16b, v8.16b, v23.16b\n"
+ "add x11, x11, x16\n"
+ "add x10, x10, x16\n"
+ "sqrdmulh v2.4s, v2.4s, v18.4s\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "add x9, x9, x16\n"
+ "add x28, x28, x16\n"
+ "and v11.16b, v2.16b, v24.16b\n"
+ "sqrdmulh v21.4s, v21.4s, v19.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v19.4s\n"
+ "sqrdmulh v16.4s, v16.4s, v19.4s\n"
+ "sqadd v8.4s, v8.4s, v17.4s\n"
+ "sshr v11.4s, v11.4s, #0x1f\n"
+ "and v28.16b, v21.16b, v23.16b\n"
+ "sqrdmulh v4.4s, v4.4s, v18.4s\n"
+ "and v17.16b, v20.16b, v23.16b\n"
+ "sqrdmulh v1.4s, v1.4s, v18.4s\n"
+ "and v19.16b, v16.16b, v23.16b\n"
+ "sqrdmulh v14.4s, v14.4s, v18.4s\n"
+ "sqadd v2.4s, v2.4s, v11.4s\n"
+ "sshr v28.4s, v28.4s, #0x1f\n"
+ "and v18.16b, v4.16b, v24.16b\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "and v12.16b, v1.16b, v24.16b\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "and v25.16b, v14.16b, v24.16b\n"
+ "sqadd v21.4s, v21.4s, v28.4s\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sqadd v20.4s, v20.4s, v17.4s\n"
+ "sshr v12.4s, v12.4s, #0x1f\n"
+ "sqadd v16.4s, v16.4s, v19.4s\n"
+ "sshr v25.4s, v25.4s, #0x1f\n"
+ "srshl v8.4s, v8.4s, v23.4s\n"
+ "srshl v21.4s, v21.4s, v23.4s\n"
+ "sqadd v4.4s, v4.4s, v18.4s\n"
+ "srshl v20.4s, v20.4s, v23.4s\n"
+ "sqadd v1.4s, v1.4s, v12.4s\n"
+ "srshl v16.4s, v16.4s, v23.4s\n"
+ "sqadd v14.4s, v14.4s, v25.4s\n"
+ "srshl v2.4s, v2.4s, v24.4s\n"
+ "sqxtn v8.4h, v8.4s\n"
+ "srshl v4.4s, v4.4s, v24.4s\n"
+ "sqxtn v21.4h, v21.4s\n"
+ "srshl v1.4s, v1.4s, v24.4s\n"
+ "sqxtn v20.4h, v20.4s\n"
+ "srshl v14.4s, v14.4s, v24.4s\n"
+ "sqxtn v16.4h, v16.4s\n"
+ "sqxtn2 v8.8h, v2.4s\n"
+ "sqxtn2 v21.8h, v4.4s\n"
+ "sqxtn2 v20.8h, v1.4s\n"
+ "sqxtn2 v16.8h, v14.4s\n"
+ "sqadd v8.8h, v8.8h, v22.8h\n"
+ "sqadd v21.8h, v21.8h, v22.8h\n"
+ "sqadd v20.8h, v20.8h, v22.8h\n"
+ "sqadd v16.8h, v16.8h, v22.8h\n"
+ "smax v8.8h, v8.8h, v13.8h\n"
+ "smax v21.8h, v21.8h, v13.8h\n"
+ "smax v20.8h, v20.8h, v13.8h\n"
+ "smax v16.8h, v16.8h, v13.8h\n"
+ "smin v8.8h, v8.8h, v5.8h\n"
+ "smin v21.8h, v21.8h, v5.8h\n"
+ "smin v20.8h, v20.8h, v5.8h\n"
+ "smin v16.8h, v16.8h, v5.8h\n"
+ "uzp1 v8.16b, v8.16b, v8.16b\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "uzp1 v16.16b, v16.16b, v16.16b\n"
+ "tbz x7, #2, 85f\n"
+ "st1 { v8.s }[0], [x11], #0x4\n"
+ "st1 { v21.s }[0], [x10], #0x4\n"
+ "st1 { v20.s }[0], [x9], #0x4\n"
+ "st1 { v16.s }[0], [x28], #0x4\n"
+ "tbz x7, #1, 84f\n"
+ "st1 { v8.h }[2], [x11], #0x2\n"
+ "st1 { v21.h }[2], [x10], #0x2\n"
+ "st1 { v20.h }[2], [x9], #0x2\n"
+ "st1 { v16.h }[2], [x28], #0x2\n"
+ "tbz x7, #0, 87f\n"
+ "st1 { v8.b }[6], [x11], #0x1\n"
+ "st1 { v21.b }[6], [x10], #0x1\n"
+ "st1 { v20.b }[6], [x9], #0x1\n"
+ "st1 { v16.b }[6], [x28], #0x1\n"
+ "b 87f\n"
+ "84:" // Oddments: Bit 2: Bit 1: Unset
+ "tbz x7, #0, 87f\n"
+ "st1 { v8.b }[4], [x11], #0x1\n"
+ "st1 { v21.b }[4], [x10], #0x1\n"
+ "st1 { v20.b }[4], [x9], #0x1\n"
+ "st1 { v16.b }[4], [x28], #0x1\n"
+ "b 87f\n"
+ "85:" // Oddments: Bit 2: Unset
+ "tbz x7, #1, 86f\n"
+ "st1 { v8.h }[0], [x11], #0x2\n"
+ "st1 { v21.h }[0], [x10], #0x2\n"
+ "st1 { v20.h }[0], [x9], #0x2\n"
+ "st1 { v16.h }[0], [x28], #0x2\n"
+ "tbz x7, #0, 87f\n"
+ "st1 { v8.b }[2], [x11], #0x1\n"
+ "st1 { v21.b }[2], [x10], #0x1\n"
+ "st1 { v20.b }[2], [x9], #0x1\n"
+ "st1 { v16.b }[2], [x28], #0x1\n"
+ "b 87f\n"
+ "86:" // Oddments: Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 87f\n"
+ "st1 { v8.b }[0], [x11], #0x1\n"
+ "st1 { v21.b }[0], [x10], #0x1\n"
+ "st1 { v20.b }[0], [x9], #0x1\n"
+ "st1 { v16.b }[0], [x28], #0x1\n"
+ "87:" // Oddments: Bit 2: End
+ "88:" // End
+ :
+ : [offsetof_Params_bias] "I" (offsetof(Params, bias)), [offsetof_Params_inptrs] "I" (offsetof(Params, inptrs)), [offsetof_Params_n_channels] "I" (offsetof(Params, n_channels)), [offsetof_Params_outptrs] "I" (offsetof(Params, outptrs)), [offsetof_Params_requant] "I" (offsetof(Params, requant)), [offsetof_Params_requant_muls] "I" (offsetof(Params, requant_muls)), [offsetof_Params_requant_shifts] "I" (offsetof(Params, requant_shifts)), [offsetof_Params_weights] "I" (offsetof(Params, weights)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [params] "r" (&params)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8qa_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8qa_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
new file mode 100644
index 0000000000..39601fd8e4
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8qa_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2022-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "utils.hpp"
+#include "src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_u8qa_nhwc_5x5_s1_output2x2_mla_depthfirst_impl(
+ const unsigned int,
+ const uint8_t *const *const,
+ const uint8_t *const,
+ const int32_t *const,
+ const arm_gemm::Requantize32 &,
+ const int32_t *const,
+ const int32_t *const,
+ uint8_t *const *const
+);
+
+// Strategy wrapper that plugs the hand-written AArch64 assembly kernel
+// (a64_u8qa_nhwc_5x5_s1_output2x2_mla_depthfirst_impl) into the generic
+// depthfirst depthwise-convolution framework.  Template arguments: uint8
+// input, uint8 weights, uint8 output, int32 accumulators.
+class a64_u8qa_nhwc_5x5_s1_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy<uint8_t, uint8_t, uint8_t, int32_t>
+{
+  using Parent = DepthwiseDepthfirstStrategy<uint8_t, uint8_t, uint8_t, int32_t>;
+
+  public:
+  // 5x5 convolution window.
+  constexpr static unsigned int kernel_rows = 5;
+  constexpr static unsigned int kernel_cols = 5;
+
+  // Unit stride in both spatial dimensions.
+  constexpr static unsigned int stride_rows = 1;
+  constexpr static unsigned int stride_cols = 1;
+
+  // Parent(output_rows, output_cols, kernel_rows, kernel_cols,
+  //        stride_rows, stride_cols): each kernel call produces a 2x2
+  // output tile.  The CPUInfo argument is accepted but unused here.
+  a64_u8qa_nhwc_5x5_s1_output2x2_mla_depthfirst(const CPUInfo *) : Parent(2, 2, 5, 5, 1, 1) {}
+
+  // Fixed-width NEON implementation — no scalable (SVE) vector length.
+  arm_gemm::VLType get_vl_type(void) const override { return arm_gemm::VLType::None; }
+
+  Parent::KernelType kernel = a64_u8qa_nhwc_5x5_s1_output2x2_mla_depthfirst_impl;
+  Parent::KernelType get_kernel(void) const override { return kernel; }
+  // Accumulator depth reported to the framework, in vector lengths;
+  // presumably matches the implementation's per-iteration channel count —
+  // TODO(review): confirm against the impl's main loop.
+  unsigned int get_accumulator_depth_vl(void) const override { return 2; }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8qa_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8qa_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp
new file mode 100644
index 0000000000..9316732632
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8qa_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp
@@ -0,0 +1,2185 @@
+/*
+ * Copyright (c) 2022-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_gemm.hpp"
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_u8qa_nhwc_5x5_s1_output2x2_mla_depthfirst_impl(
+ const unsigned int n_channels,
+ const uint8_t *const *const inptrs,
+ const uint8_t *const weights,
+ const int32_t *const bias,
+ const arm_gemm::Requantize32 &qp,
+ const int32_t *const requant_muls,
+ const int32_t *const requant_shifts,
+ uint8_t *const *const outptrs
+)
+{
+  // Parameter block marshalled for the inline-assembly kernel below, which
+  // reads every field at a fixed byte offset (see the offsetof_Params_*
+  // operands passed to the asm).  Do not reorder or resize these members.
+  struct Params
+  {
+    uint64_t n_channels;                    // number of channels to process
+    const void *weights;                    // packed kernel weights
+    const int32_t *bias;                    // per-channel bias values
+    const arm_gemm::Requantize32 *requant;  // offsets and clamp bounds
+    const int32_t *const requant_muls;      // per-channel requantise multipliers
+    const int32_t *const requant_shifts;    // per-channel requantise shifts
+    uint8_t *const *const outptrs;          // output row pointers
+    const uint8_t *inptrs[36];              // input pointers, permuted (see ctor)
+
+    Params(
+      long unsigned int n_channels,
+      const uint8_t *const *inptrs_raw,
+      const void *const weights,
+      const int32_t *const bias,
+      const arm_gemm::Requantize32 &qp,
+      const int32_t *const requant_muls,
+      const int32_t *const requant_shifts,
+      uint8_t *const *outptrs
+    ) : n_channels(n_channels), weights(weights), bias(bias),
+        requant(&qp), requant_muls(requant_muls),
+        requant_shifts(requant_shifts), outptrs(outptrs)
+    {
+      // Permute the first 14 input pointers into the order in which the
+      // assembly kernel consumes them (it indexes this array at fixed
+      // offsets from x6); entries 14..35 are copied through unchanged.
+      inptrs[0] = inptrs_raw[0];
+      inptrs[1] = inptrs_raw[1];
+      inptrs[2] = inptrs_raw[6];
+      inptrs[3] = inptrs_raw[7];
+      inptrs[4] = inptrs_raw[2];
+      inptrs[5] = inptrs_raw[8];
+      inptrs[6] = inptrs_raw[3];
+      inptrs[7] = inptrs_raw[4];
+      inptrs[8] = inptrs_raw[11];
+      inptrs[9] = inptrs_raw[12];
+      inptrs[10] = inptrs_raw[9];
+      inptrs[11] = inptrs_raw[10];
+      inptrs[12] = inptrs_raw[5];
+      inptrs[13] = inptrs_raw[13];
+      inptrs[14] = inptrs_raw[14];
+      inptrs[15] = inptrs_raw[15];
+      inptrs[16] = inptrs_raw[16];
+      inptrs[17] = inptrs_raw[17];
+      inptrs[18] = inptrs_raw[18];
+      inptrs[19] = inptrs_raw[19];
+      inptrs[20] = inptrs_raw[20];
+      inptrs[21] = inptrs_raw[21];
+      inptrs[22] = inptrs_raw[22];
+      inptrs[23] = inptrs_raw[23];
+      inptrs[24] = inptrs_raw[24];
+      inptrs[25] = inptrs_raw[25];
+      inptrs[26] = inptrs_raw[26];
+      inptrs[27] = inptrs_raw[27];
+      inptrs[28] = inptrs_raw[28];
+      inptrs[29] = inptrs_raw[29];
+      inptrs[30] = inptrs_raw[30];
+      inptrs[31] = inptrs_raw[31];
+      inptrs[32] = inptrs_raw[32];
+      inptrs[33] = inptrs_raw[33];
+      inptrs[34] = inptrs_raw[34];
+      inptrs[35] = inptrs_raw[35];
+
+    }
+  };
+
+ const Params params(n_channels, inptrs, weights, bias, qp,
+ requant_muls, requant_shifts, outptrs);
+
+ __asm__ __volatile__(
+ "ldr x2, [%x[params], %[offsetof_Params_n_channels]]\n"
+ "ldr x23, [%x[params], %[offsetof_Params_requant]]\n"
+ "lsr x3, x2, #0x3\n"
+ "add x20, x23, %[offsetof_Requantize32_b_offset]\n"
+ "ld1r { v2.16b }, [x20]\n"
+ "ldr x22, [%x[params], %[offsetof_Params_outptrs]]\n"
+ "add x21, x23, %[offsetof_Requantize32_c_offset]\n"
+ "add x20, x23, %[offsetof_Requantize32_minval]\n"
+ "ld1r { v25.8h }, [x21]\n"
+ "ld1r { v12.8h }, [x20]\n"
+ "add x20, x23, %[offsetof_Requantize32_maxval]\n"
+ "mov x4, #0x0\n"
+ "ld1r { v26.8h }, [x20]\n"
+ "mov x5, #0x0\n"
+ "add x6, %x[params], %[offsetof_Params_inptrs]\n"
+ "ldr x7, [%x[params], %[offsetof_Params_weights]]\n"
+ "ldr x8, [%x[params], %[offsetof_Params_requant_muls]]\n"
+ "ldr x17, [%x[params], %[offsetof_Params_requant_shifts]]\n"
+ "ldp x16, x15, [x22, #0x0]\n"
+ "ldp x14, x13, [x22, #0x10]\n"
+ "cbz x3, 3f\n"
+ "ldr d21, [x7, #0x0]\n"
+ "ldr d15, [x7, #0x8]\n"
+ "subs x3, x3, #0x1\n"
+ "usubl v21.8h, v21.8b, v2.8b\n"
+ "ldr d29, [x7, #0x10]\n"
+ "ldr d18, [x7, #0x18]\n"
+ "usubl v15.8h, v15.8b, v2.8b\n"
+ "usubl v29.8h, v29.8b, v2.8b\n"
+ "ldr d3, [x7, #0x20]\n"
+ "ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "usubl v18.8h, v18.8b, v2.8b\n"
+ "usubl v3.8h, v3.8b, v2.8b\n"
+ "ldr q13, [x20, #0x0]\n"
+ "ldr q24, [x20, #0x10]\n"
+ "add x20, x20, #0x20\n"
+ "str x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldp x9, x28, [x6, #0x0]\n"
+ "ldp x27, x26, [x6, #0x10]\n"
+ "mov v7.16b, v13.16b\n"
+ "mov v14.16b, v24.16b\n"
+ "ldp x25, x24, [x6, #0x20]\n"
+ "ldp x23, x22, [x6, #0x30]\n"
+ "mov v27.16b, v13.16b\n"
+ "mov v22.16b, v24.16b\n"
+ "ldp x21, x20, [x6, #0x40]\n"
+ "ldr d10, [x9, x4]\n"
+ "mov v8.16b, v13.16b\n"
+ "mov v17.16b, v24.16b\n"
+ "ldr d16, [x28, x4]\n"
+ "ldr d23, [x27, x4]\n"
+ "ushll v10.8h, v10.8b, #0x0\n"
+ "ushll v16.8h, v16.8b, #0x0\n"
+ "ldr d30, [x26, x4]\n"
+ "ldr d4, [x25, x4]\n"
+ "ushll v23.8h, v23.8b, #0x0\n"
+ "ushll v30.8h, v30.8b, #0x0\n"
+ "ldr d28, [x24, x4]\n"
+ "ldr d31, [x23, x4]\n"
+ "ushll v4.8h, v4.8b, #0x0\n"
+ "ushll v28.8h, v28.8b, #0x0\n"
+ "ldr d1, [x22, x4]\n"
+ "ldr d9, [x21, x4]\n"
+ "ushll v31.8h, v31.8b, #0x0\n"
+ "ushll v1.8h, v1.8b, #0x0\n"
+ "ldr d11, [x20, x4]\n"
+ "ushll v9.8h, v9.8b, #0x0\n"
+ "ushll v11.8h, v11.8b, #0x0\n"
+ "beq 2f\n"
+ "1:" // Loop
+ "ldr d5, [x7, #0x28]\n"
+ "ldr d6, [x7, #0x30]\n"
+ "smlal v13.4s, v10.4h, v21.4h\n"
+ "smlal2 v24.4s, v10.8h, v21.8h\n"
+ "ldr d19, [x7, #0x38]\n"
+ "ldr d0, [x7, #0x40]\n"
+ "smlal v13.4s, v16.4h, v15.4h\n"
+ "smlal v7.4s, v16.4h, v21.4h\n"
+ "ldr d10, [x7, #0x48]\n"
+ "ldr d20, [x7, #0x50]\n"
+ "smlal v27.4s, v23.4h, v21.4h\n"
+ "smlal v8.4s, v30.4h, v21.4h\n"
+ "ldr x21, [x6, #0x50]\n"
+ "smlal2 v24.4s, v16.8h, v15.8h\n"
+ "smlal v13.4s, v4.4h, v29.4h\n"
+ "ldr x20, [x6, #0x58]\n"
+ "smlal2 v14.4s, v16.8h, v21.8h\n"
+ "ldr d16, [x21, x4]\n"
+ "smlal2 v22.4s, v23.8h, v21.8h\n"
+ "ushll v16.8h, v16.8b, #0x0\n"
+ "smlal2 v17.4s, v30.8h, v21.8h\n"
+ "ldr d21, [x20, x4]\n"
+ "smlal v7.4s, v4.4h, v15.4h\n"
+ "ldr x22, [x6, #0x60]\n"
+ "smlal v27.4s, v30.4h, v15.4h\n"
+ "smlal v8.4s, v28.4h, v15.4h\n"
+ "ushll v21.8h, v21.8b, #0x0\n"
+ "ldr x20, [x6, #0x68]\n"
+ "smlal2 v24.4s, v4.8h, v29.8h\n"
+ "smlal v13.4s, v31.4h, v18.4h\n"
+ "usubl v5.8h, v5.8b, v2.8b\n"
+ "ldr x21, [x6, #0x70]\n"
+ "smlal2 v14.4s, v4.8h, v15.8h\n"
+ "ldr d4, [x22, x4]\n"
+ "smlal2 v22.4s, v30.8h, v15.8h\n"
+ "ushll v4.8h, v4.8b, #0x0\n"
+ "smlal2 v17.4s, v28.8h, v15.8h\n"
+ "ldr d15, [x20, x4]\n"
+ "smlal v7.4s, v31.4h, v29.4h\n"
+ "usubl v6.8h, v6.8b, v2.8b\n"
+ "smlal v27.4s, v28.4h, v29.4h\n"
+ "smlal v8.4s, v16.4h, v29.4h\n"
+ "ushll v15.8h, v15.8b, #0x0\n"
+ "ldr x20, [x6, #0x78]\n"
+ "smlal2 v24.4s, v31.8h, v18.8h\n"
+ "smlal v13.4s, v1.4h, v3.4h\n"
+ "usubl v19.8h, v19.8b, v2.8b\n"
+ "ldr x22, [x6, #0x80]\n"
+ "smlal2 v14.4s, v31.8h, v29.8h\n"
+ "ldr d31, [x21, x4]\n"
+ "smlal2 v22.4s, v28.8h, v29.8h\n"
+ "ushll v31.8h, v31.8b, #0x0\n"
+ "smlal2 v17.4s, v16.8h, v29.8h\n"
+ "ldr d29, [x20, x4]\n"
+ "smlal v7.4s, v1.4h, v18.4h\n"
+ "usubl v0.8h, v0.8b, v2.8b\n"
+ "smlal v27.4s, v16.4h, v18.4h\n"
+ "smlal v8.4s, v21.4h, v18.4h\n"
+ "ushll v29.8h, v29.8b, #0x0\n"
+ "ldr x20, [x6, #0x88]\n"
+ "smlal2 v24.4s, v1.8h, v3.8h\n"
+ "smlal v13.4s, v23.4h, v5.4h\n"
+ "usubl v10.8h, v10.8b, v2.8b\n"
+ "ldr x21, [x6, #0x90]\n"
+ "smlal2 v14.4s, v1.8h, v18.8h\n"
+ "ldr d1, [x22, x4]\n"
+ "smlal2 v22.4s, v16.8h, v18.8h\n"
+ "ushll v1.8h, v1.8b, #0x0\n"
+ "smlal2 v17.4s, v21.8h, v18.8h\n"
+ "ldr d18, [x20, x4]\n"
+ "smlal v7.4s, v4.4h, v3.4h\n"
+ "usubl v20.8h, v20.8b, v2.8b\n"
+ "smlal v27.4s, v21.4h, v3.4h\n"
+ "smlal v8.4s, v9.4h, v3.4h\n"
+ "ldr x20, [x6, #0x98]\n"
+ "ushll v18.8h, v18.8b, #0x0\n"
+ "smlal2 v24.4s, v23.8h, v5.8h\n"
+ "ldr d23, [x7, #0x58]\n"
+ "smlal v13.4s, v30.4h, v6.4h\n"
+ "usubl v23.8h, v23.8b, v2.8b\n"
+ "smlal2 v14.4s, v4.8h, v3.8h\n"
+ "ldr d4, [x21, x4]\n"
+ "smlal2 v22.4s, v21.8h, v3.8h\n"
+ "ldr x23, [x6, #0xa0]\n"
+ "smlal2 v17.4s, v9.8h, v3.8h\n"
+ "ldr d3, [x20, x4]\n"
+ "smlal v7.4s, v30.4h, v5.4h\n"
+ "ushll v4.8h, v4.8b, #0x0\n"
+ "smlal v27.4s, v11.4h, v5.4h\n"
+ "smlal v8.4s, v15.4h, v5.4h\n"
+ "ushll v3.8h, v3.8b, #0x0\n"
+ "ldr x22, [x6, #0xa8]\n"
+ "smlal2 v24.4s, v30.8h, v6.8h\n"
+ "smlal v13.4s, v28.4h, v19.4h\n"
+ "ldr x21, [x6, #0xb0]\n"
+ "ldr x20, [x6, #0xb8]\n"
+ "smlal2 v14.4s, v30.8h, v5.8h\n"
+ "ldr d30, [x7, #0x60]\n"
+ "smlal2 v22.4s, v11.8h, v5.8h\n"
+ "usubl v30.8h, v30.8b, v2.8b\n"
+ "smlal2 v17.4s, v15.8h, v5.8h\n"
+ "ldr d5, [x23, x4]\n"
+ "smlal v7.4s, v28.4h, v6.4h\n"
+ "ushll v5.8h, v5.8b, #0x0\n"
+ "smlal v27.4s, v15.4h, v6.4h\n"
+ "smlal v8.4s, v31.4h, v6.4h\n"
+ "ldr x12, [x6, #0xc0]\n"
+ "ldr x11, [x6, #0xc8]\n"
+ "smlal2 v24.4s, v28.8h, v19.8h\n"
+ "smlal v13.4s, v16.4h, v0.4h\n"
+ "ldr x10, [x6, #0xd0]\n"
+ "ldr x9, [x6, #0xd8]\n"
+ "smlal2 v14.4s, v28.8h, v6.8h\n"
+ "ldr d28, [x7, #0x68]\n"
+ "smlal2 v22.4s, v15.8h, v6.8h\n"
+ "usubl v28.8h, v28.8b, v2.8b\n"
+ "smlal2 v17.4s, v31.8h, v6.8h\n"
+ "ldr d6, [x22, x4]\n"
+ "smlal v7.4s, v16.4h, v19.4h\n"
+ "ushll v6.8h, v6.8b, #0x0\n"
+ "smlal v27.4s, v31.4h, v19.4h\n"
+ "smlal v8.4s, v29.4h, v19.4h\n"
+ "ldr x28, [x6, #0xe0]\n"
+ "ldr x27, [x6, #0xe8]\n"
+ "smlal2 v24.4s, v16.8h, v0.8h\n"
+ "smlal v13.4s, v21.4h, v10.4h\n"
+ "ldr x26, [x6, #0xf0]\n"
+ "ldr x25, [x6, #0xf8]\n"
+ "smlal2 v14.4s, v16.8h, v19.8h\n"
+ "ldr d16, [x7, #0x70]\n"
+ "smlal2 v22.4s, v31.8h, v19.8h\n"
+ "usubl v16.8h, v16.8b, v2.8b\n"
+ "smlal2 v17.4s, v29.8h, v19.8h\n"
+ "ldr d19, [x21, x4]\n"
+ "smlal v7.4s, v21.4h, v0.4h\n"
+ "ushll v19.8h, v19.8b, #0x0\n"
+ "smlal v27.4s, v29.4h, v0.4h\n"
+ "smlal v8.4s, v1.4h, v0.4h\n"
+ "ldr x24, [x6, #0x100]\n"
+ "ldr x23, [x6, #0x108]\n"
+ "smlal2 v24.4s, v21.8h, v10.8h\n"
+ "smlal v13.4s, v11.4h, v20.4h\n"
+ "ldr x22, [x6, #0x110]\n"
+ "ldr x21, [x6, #0x118]\n"
+ "smlal2 v14.4s, v21.8h, v0.8h\n"
+ "ldr d21, [x7, #0x78]\n"
+ "smlal2 v22.4s, v29.8h, v0.8h\n"
+ "usubl v21.8h, v21.8b, v2.8b\n"
+ "smlal2 v17.4s, v1.8h, v0.8h\n"
+ "ldr d0, [x20, x4]\n"
+ "smlal v7.4s, v9.4h, v10.4h\n"
+ "ushll v0.8h, v0.8b, #0x0\n"
+ "smlal v27.4s, v1.4h, v10.4h\n"
+ "smlal v8.4s, v18.4h, v10.4h\n"
+ "ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "subs x3, x3, #0x1\n"
+ "smlal2 v24.4s, v11.8h, v20.8h\n"
+ "ldr d11, [x7, #0x80]\n"
+ "smlal v13.4s, v15.4h, v23.4h\n"
+ "usubl v11.8h, v11.8b, v2.8b\n"
+ "smlal2 v14.4s, v9.8h, v10.8h\n"
+ "ldr d9, [x12, x4]\n"
+ "smlal2 v22.4s, v1.8h, v10.8h\n"
+ "ushll v9.8h, v9.8b, #0x0\n"
+ "smlal2 v17.4s, v18.8h, v10.8h\n"
+ "ldr d10, [x11, x4]\n"
+ "smlal v7.4s, v15.4h, v20.4h\n"
+ "ushll v10.8h, v10.8b, #0x0\n"
+ "smlal v27.4s, v4.4h, v20.4h\n"
+ "smlal v8.4s, v3.4h, v20.4h\n"
+ "smlal2 v24.4s, v15.8h, v23.8h\n"
+ "smlal v13.4s, v31.4h, v30.4h\n"
+ "smlal2 v14.4s, v15.8h, v20.8h\n"
+ "ldr d15, [x7, #0x88]\n"
+ "smlal2 v22.4s, v4.8h, v20.8h\n"
+ "usubl v15.8h, v15.8b, v2.8b\n"
+ "smlal2 v17.4s, v3.8h, v20.8h\n"
+ "ldr d20, [x10, x4]\n"
+ "smlal v7.4s, v31.4h, v23.4h\n"
+ "ushll v20.8h, v20.8b, #0x0\n"
+ "smlal v27.4s, v3.4h, v23.4h\n"
+ "smlal v8.4s, v5.4h, v23.4h\n"
+ "smlal2 v24.4s, v31.8h, v30.8h\n"
+ "smlal v13.4s, v29.4h, v28.4h\n"
+ "smlal2 v14.4s, v31.8h, v23.8h\n"
+ "ldr d31, [x7, #0x90]\n"
+ "smlal2 v22.4s, v3.8h, v23.8h\n"
+ "usubl v31.8h, v31.8b, v2.8b\n"
+ "smlal2 v17.4s, v5.8h, v23.8h\n"
+ "ldr d23, [x9, x4]\n"
+ "smlal v7.4s, v29.4h, v30.4h\n"
+ "ushll v23.8h, v23.8b, #0x0\n"
+ "smlal v27.4s, v5.4h, v30.4h\n"
+ "smlal v8.4s, v6.4h, v30.4h\n"
+ "smlal2 v24.4s, v29.8h, v28.8h\n"
+ "smlal v13.4s, v1.4h, v16.4h\n"
+ "smlal2 v14.4s, v29.8h, v30.8h\n"
+ "ldr d29, [x7, #0x98]\n"
+ "smlal2 v22.4s, v5.8h, v30.8h\n"
+ "usubl v29.8h, v29.8b, v2.8b\n"
+ "smlal2 v17.4s, v6.8h, v30.8h\n"
+ "ldr d30, [x28, x4]\n"
+ "smlal v7.4s, v1.4h, v28.4h\n"
+ "ushll v30.8h, v30.8b, #0x0\n"
+ "smlal v27.4s, v6.4h, v28.4h\n"
+ "smlal v8.4s, v19.4h, v28.4h\n"
+ "smlal2 v24.4s, v1.8h, v16.8h\n"
+ "smlal v13.4s, v4.4h, v21.4h\n"
+ "smlal2 v14.4s, v1.8h, v28.8h\n"
+ "ldr d1, [x7, #0xa0]\n"
+ "smlal2 v22.4s, v6.8h, v28.8h\n"
+ "usubl v1.8h, v1.8b, v2.8b\n"
+ "smlal2 v17.4s, v19.8h, v28.8h\n"
+ "ldr d28, [x27, x4]\n"
+ "smlal v7.4s, v18.4h, v16.4h\n"
+ "ushll v28.8h, v28.8b, #0x0\n"
+ "smlal v27.4s, v19.4h, v16.4h\n"
+ "smlal v8.4s, v0.4h, v16.4h\n"
+ "smlal2 v24.4s, v4.8h, v21.8h\n"
+ "ldr d4, [x7, #0xa8]\n"
+ "smlal v13.4s, v3.4h, v11.4h\n"
+ "usubl v4.8h, v4.8b, v2.8b\n"
+ "smlal2 v14.4s, v18.8h, v16.8h\n"
+ "ldr d18, [x26, x4]\n"
+ "smlal2 v22.4s, v19.8h, v16.8h\n"
+ "ushll v18.8h, v18.8b, #0x0\n"
+ "smlal2 v17.4s, v0.8h, v16.8h\n"
+ "ldr d16, [x25, x4]\n"
+ "smlal v7.4s, v3.4h, v21.4h\n"
+ "ushll v16.8h, v16.8b, #0x0\n"
+ "smlal v27.4s, v9.4h, v21.4h\n"
+ "smlal v8.4s, v10.4h, v21.4h\n"
+ "smlal2 v24.4s, v3.8h, v11.8h\n"
+ "smlal v13.4s, v5.4h, v15.4h\n"
+ "smlal2 v14.4s, v3.8h, v21.8h\n"
+ "ldr d3, [x7, #0xb0]\n"
+ "smlal2 v22.4s, v9.8h, v21.8h\n"
+ "usubl v3.8h, v3.8b, v2.8b\n"
+ "smlal2 v17.4s, v10.8h, v21.8h\n"
+ "ldr d21, [x24, x4]\n"
+ "smlal v7.4s, v5.4h, v11.4h\n"
+ "ushll v21.8h, v21.8b, #0x0\n"
+ "smlal v27.4s, v10.4h, v11.4h\n"
+ "smlal v8.4s, v20.4h, v11.4h\n"
+ "smlal2 v24.4s, v5.8h, v15.8h\n"
+ "smlal v13.4s, v6.4h, v31.4h\n"
+ "smlal2 v14.4s, v5.8h, v11.8h\n"
+ "ldr d5, [x7, #0xb8]\n"
+ "smlal2 v22.4s, v10.8h, v11.8h\n"
+ "usubl v5.8h, v5.8b, v2.8b\n"
+ "smlal2 v17.4s, v20.8h, v11.8h\n"
+ "ldr d11, [x23, x4]\n"
+ "smlal v7.4s, v6.4h, v15.4h\n"
+ "ushll v11.8h, v11.8b, #0x0\n"
+ "smlal v27.4s, v20.4h, v15.4h\n"
+ "smlal v8.4s, v23.4h, v15.4h\n"
+ "smlal2 v24.4s, v6.8h, v31.8h\n"
+ "smlal v13.4s, v19.4h, v29.4h\n"
+ "smlal2 v14.4s, v6.8h, v15.8h\n"
+ "ldr d6, [x7, #0xc0]\n"
+ "smlal2 v22.4s, v20.8h, v15.8h\n"
+ "usubl v6.8h, v6.8b, v2.8b\n"
+ "smlal2 v17.4s, v23.8h, v15.8h\n"
+ "ldr d15, [x22, x4]\n"
+ "smlal v7.4s, v19.4h, v31.4h\n"
+ "ushll v15.8h, v15.8b, #0x0\n"
+ "smlal v27.4s, v23.4h, v31.4h\n"
+ "smlal v8.4s, v30.4h, v31.4h\n"
+ "add x7, x7, #0xc8\n"
+ "smlal2 v24.4s, v19.8h, v29.8h\n"
+ "smlal v13.4s, v9.4h, v1.4h\n"
+ "smlal2 v14.4s, v19.8h, v31.8h\n"
+ "ldr d19, [x21, x4]\n"
+ "smlal2 v22.4s, v23.8h, v31.8h\n"
+ "ushll v19.8h, v19.8b, #0x0\n"
+ "smlal2 v17.4s, v30.8h, v31.8h\n"
+ "ldr q31, [x8, #0x0]\n"
+ "smlal v7.4s, v0.4h, v29.4h\n"
+ "add x4, x4, #0x8\n"
+ "smlal v27.4s, v30.4h, v29.4h\n"
+ "smlal v8.4s, v28.4h, v29.4h\n"
+ "smlal2 v24.4s, v9.8h, v1.8h\n"
+ "ldr q9, [x17, #0x0]\n"
+ "smlal v13.4s, v10.4h, v4.4h\n"
+ "smlal2 v14.4s, v0.8h, v29.8h\n"
+ "ldr q0, [x8, #0x10]\n"
+ "smlal2 v22.4s, v30.8h, v29.8h\n"
+ "add x8, x8, #0x20\n"
+ "smlal2 v17.4s, v28.8h, v29.8h\n"
+ "ldr q29, [x17, #0x10]\n"
+ "smlal v7.4s, v10.4h, v1.4h\n"
+ "add x17, x17, #0x20\n"
+ "smlal v27.4s, v18.4h, v1.4h\n"
+ "smlal v8.4s, v16.4h, v1.4h\n"
+ "smlal2 v24.4s, v10.8h, v4.8h\n"
+ "smlal v13.4s, v20.4h, v3.4h\n"
+ "smlal2 v14.4s, v10.8h, v1.8h\n"
+ "smlal2 v22.4s, v18.8h, v1.8h\n"
+ "smlal2 v17.4s, v16.8h, v1.8h\n"
+ "smlal v7.4s, v20.4h, v4.4h\n"
+ "smlal v27.4s, v16.4h, v4.4h\n"
+ "smlal v8.4s, v21.4h, v4.4h\n"
+ "smlal2 v24.4s, v20.8h, v3.8h\n"
+ "smlal v13.4s, v23.4h, v5.4h\n"
+ "smlal2 v14.4s, v20.8h, v4.8h\n"
+ "smlal2 v22.4s, v16.8h, v4.8h\n"
+ "smlal2 v17.4s, v21.8h, v4.8h\n"
+ "smlal v7.4s, v23.4h, v3.4h\n"
+ "smlal v27.4s, v21.4h, v3.4h\n"
+ "smlal v8.4s, v11.4h, v3.4h\n"
+ "smlal2 v24.4s, v23.8h, v5.8h\n"
+ "smlal v13.4s, v30.4h, v6.4h\n"
+ "sqrdmulh v13.4s, v13.4s, v31.4s\n"
+ "smlal2 v14.4s, v23.8h, v3.8h\n"
+ "smlal2 v22.4s, v21.8h, v3.8h\n"
+ "and v23.16b, v13.16b, v9.16b\n"
+ "smlal2 v17.4s, v11.8h, v3.8h\n"
+ "smlal v7.4s, v30.4h, v5.4h\n"
+ "sshr v23.4s, v23.4s, #0x1f\n"
+ "smlal v27.4s, v11.4h, v5.4h\n"
+ "smlal v8.4s, v15.4h, v5.4h\n"
+ "sqadd v13.4s, v13.4s, v23.4s\n"
+ "smlal2 v24.4s, v30.8h, v6.8h\n"
+ "smlal2 v14.4s, v30.8h, v5.8h\n"
+ "sqrdmulh v24.4s, v24.4s, v0.4s\n"
+ "smlal2 v22.4s, v11.8h, v5.8h\n"
+ "smlal2 v17.4s, v15.8h, v5.8h\n"
+ "and v10.16b, v24.16b, v29.16b\n"
+ "smlal v7.4s, v28.4h, v6.4h\n"
+ "smlal v27.4s, v15.4h, v6.4h\n"
+ "sqrdmulh v7.4s, v7.4s, v31.4s\n"
+ "smlal v8.4s, v19.4h, v6.4h\n"
+ "smlal2 v14.4s, v28.8h, v6.8h\n"
+ "sqrdmulh v27.4s, v27.4s, v31.4s\n"
+ "smlal2 v22.4s, v15.8h, v6.8h\n"
+ "smlal2 v17.4s, v19.8h, v6.8h\n"
+ "sqrdmulh v8.4s, v8.4s, v31.4s\n"
+ "sshr v10.4s, v10.4s, #0x1f\n"
+ "and v28.16b, v7.16b, v9.16b\n"
+ "sqrdmulh v14.4s, v14.4s, v0.4s\n"
+ "and v20.16b, v27.16b, v9.16b\n"
+ "sqrdmulh v22.4s, v22.4s, v0.4s\n"
+ "and v23.16b, v8.16b, v9.16b\n"
+ "sqrdmulh v17.4s, v17.4s, v0.4s\n"
+ "sqadd v24.4s, v24.4s, v10.4s\n"
+ "sshr v28.4s, v28.4s, #0x1f\n"
+ "and v18.16b, v14.16b, v29.16b\n"
+ "sshr v20.4s, v20.4s, #0x1f\n"
+ "and v30.16b, v22.16b, v29.16b\n"
+ "sshr v23.4s, v23.4s, #0x1f\n"
+ "and v5.16b, v17.16b, v29.16b\n"
+ "sqadd v7.4s, v7.4s, v28.4s\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sqadd v27.4s, v27.4s, v20.4s\n"
+ "sshr v30.4s, v30.4s, #0x1f\n"
+ "sqadd v8.4s, v8.4s, v23.4s\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "srshl v13.4s, v13.4s, v9.4s\n"
+ "srshl v7.4s, v7.4s, v9.4s\n"
+ "sqadd v14.4s, v14.4s, v18.4s\n"
+ "srshl v27.4s, v27.4s, v9.4s\n"
+ "sqadd v22.4s, v22.4s, v30.4s\n"
+ "srshl v8.4s, v8.4s, v9.4s\n"
+ "sqadd v17.4s, v17.4s, v5.4s\n"
+ "srshl v24.4s, v24.4s, v29.4s\n"
+ "sqxtn v13.4h, v13.4s\n"
+ "srshl v14.4s, v14.4s, v29.4s\n"
+ "sqxtn v7.4h, v7.4s\n"
+ "srshl v22.4s, v22.4s, v29.4s\n"
+ "sqxtn v27.4h, v27.4s\n"
+ "srshl v17.4s, v17.4s, v29.4s\n"
+ "sqxtn v8.4h, v8.4s\n"
+ "sqxtn2 v13.8h, v24.4s\n"
+ "sqxtn2 v7.8h, v14.4s\n"
+ "sqxtn2 v27.8h, v22.4s\n"
+ "sqxtn2 v8.8h, v17.4s\n"
+ "sqadd v13.8h, v13.8h, v25.8h\n"
+ "sqadd v7.8h, v7.8h, v25.8h\n"
+ "sqadd v27.8h, v27.8h, v25.8h\n"
+ "sqadd v8.8h, v8.8h, v25.8h\n"
+ "smax v13.8h, v13.8h, v12.8h\n"
+ "smax v7.8h, v7.8h, v12.8h\n"
+ "smax v27.8h, v27.8h, v12.8h\n"
+ "smax v8.8h, v8.8h, v12.8h\n"
+ "smin v13.8h, v13.8h, v26.8h\n"
+ "smin v7.8h, v7.8h, v26.8h\n"
+ "smin v27.8h, v27.8h, v26.8h\n"
+ "smin v8.8h, v8.8h, v26.8h\n"
+ "uzp1 v13.16b, v13.16b, v13.16b\n"
+ "str d13, [x16, x5]\n"
+ "uzp1 v7.16b, v7.16b, v7.16b\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "str d7, [x15, x5]\n"
+ "uzp1 v8.16b, v8.16b, v8.16b\n"
+ "str d27, [x14, x5]\n"
+ "str d8, [x13, x5]\n"
+ "ldr q13, [x20, #0x0]\n"
+ "ldr q24, [x20, #0x10]\n"
+ "add x20, x20, #0x20\n"
+ "ldr d21, [x7, #0x0]\n"
+ "ldr d15, [x7, #0x8]\n"
+ "add x5, x5, #0x8\n"
+ "str x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldr d29, [x7, #0x10]\n"
+ "ldr d18, [x7, #0x18]\n"
+ "mov v7.16b, v13.16b\n"
+ "mov v14.16b, v24.16b\n"
+ "ldr d3, [x7, #0x20]\n"
+ "ldp x9, x28, [x6, #0x0]\n"
+ "mov v27.16b, v13.16b\n"
+ "mov v22.16b, v24.16b\n"
+ "ldp x27, x26, [x6, #0x10]\n"
+ "ldp x25, x24, [x6, #0x20]\n"
+ "mov v8.16b, v13.16b\n"
+ "mov v17.16b, v24.16b\n"
+ "ldp x23, x22, [x6, #0x30]\n"
+ "ldp x21, x20, [x6, #0x40]\n"
+ "usubl v21.8h, v21.8b, v2.8b\n"
+ "usubl v15.8h, v15.8b, v2.8b\n"
+ "ldr d10, [x9, x4]\n"
+ "ldr d16, [x28, x4]\n"
+ "usubl v29.8h, v29.8b, v2.8b\n"
+ "usubl v18.8h, v18.8b, v2.8b\n"
+ "ldr d23, [x27, x4]\n"
+ "ldr d30, [x26, x4]\n"
+ "usubl v3.8h, v3.8b, v2.8b\n"
+ "ushll v10.8h, v10.8b, #0x0\n"
+ "ldr d4, [x25, x4]\n"
+ "ldr d28, [x24, x4]\n"
+ "ushll v16.8h, v16.8b, #0x0\n"
+ "ushll v23.8h, v23.8b, #0x0\n"
+ "ldr d31, [x23, x4]\n"
+ "ldr d1, [x22, x4]\n"
+ "ushll v30.8h, v30.8b, #0x0\n"
+ "ushll v4.8h, v4.8b, #0x0\n"
+ "ldr d9, [x21, x4]\n"
+ "ldr d11, [x20, x4]\n"
+ "ushll v28.8h, v28.8b, #0x0\n"
+ "ushll v31.8h, v31.8b, #0x0\n"
+ "ushll v1.8h, v1.8b, #0x0\n"
+ "ushll v9.8h, v9.8b, #0x0\n"
+ "ushll v11.8h, v11.8b, #0x0\n"
+ "bgt 1b\n"
+ "2:" // Tail
+ "ldr d0, [x7, #0x28]\n"
+ "ldr d20, [x7, #0x30]\n"
+ "smlal v13.4s, v10.4h, v21.4h\n"
+ "smlal2 v24.4s, v10.8h, v21.8h\n"
+ "ldr d6, [x7, #0x38]\n"
+ "ldr d19, [x7, #0x40]\n"
+ "smlal v13.4s, v16.4h, v15.4h\n"
+ "smlal v7.4s, v16.4h, v21.4h\n"
+ "ldr d10, [x7, #0x48]\n"
+ "ldr d5, [x7, #0x50]\n"
+ "smlal v27.4s, v23.4h, v21.4h\n"
+ "smlal v8.4s, v30.4h, v21.4h\n"
+ "ldr x21, [x6, #0x50]\n"
+ "smlal2 v24.4s, v16.8h, v15.8h\n"
+ "smlal v13.4s, v4.4h, v29.4h\n"
+ "ldr x20, [x6, #0x58]\n"
+ "smlal2 v14.4s, v16.8h, v21.8h\n"
+ "ldr d16, [x21, x4]\n"
+ "smlal2 v22.4s, v23.8h, v21.8h\n"
+ "ushll v16.8h, v16.8b, #0x0\n"
+ "smlal2 v17.4s, v30.8h, v21.8h\n"
+ "ldr d21, [x20, x4]\n"
+ "smlal v7.4s, v4.4h, v15.4h\n"
+ "ldr x22, [x6, #0x60]\n"
+ "smlal v27.4s, v30.4h, v15.4h\n"
+ "smlal v8.4s, v28.4h, v15.4h\n"
+ "ushll v21.8h, v21.8b, #0x0\n"
+ "ldr x20, [x6, #0x68]\n"
+ "smlal2 v24.4s, v4.8h, v29.8h\n"
+ "smlal v13.4s, v31.4h, v18.4h\n"
+ "usubl v0.8h, v0.8b, v2.8b\n"
+ "ldr x21, [x6, #0x70]\n"
+ "smlal2 v14.4s, v4.8h, v15.8h\n"
+ "ldr d4, [x22, x4]\n"
+ "smlal2 v22.4s, v30.8h, v15.8h\n"
+ "ushll v4.8h, v4.8b, #0x0\n"
+ "smlal2 v17.4s, v28.8h, v15.8h\n"
+ "ldr d15, [x20, x4]\n"
+ "smlal v7.4s, v31.4h, v29.4h\n"
+ "usubl v20.8h, v20.8b, v2.8b\n"
+ "smlal v27.4s, v28.4h, v29.4h\n"
+ "smlal v8.4s, v16.4h, v29.4h\n"
+ "ushll v15.8h, v15.8b, #0x0\n"
+ "ldr x20, [x6, #0x78]\n"
+ "smlal2 v24.4s, v31.8h, v18.8h\n"
+ "smlal v13.4s, v1.4h, v3.4h\n"
+ "usubl v6.8h, v6.8b, v2.8b\n"
+ "ldr x22, [x6, #0x80]\n"
+ "smlal2 v14.4s, v31.8h, v29.8h\n"
+ "ldr d31, [x21, x4]\n"
+ "smlal2 v22.4s, v28.8h, v29.8h\n"
+ "ushll v31.8h, v31.8b, #0x0\n"
+ "smlal2 v17.4s, v16.8h, v29.8h\n"
+ "ldr d29, [x20, x4]\n"
+ "smlal v7.4s, v1.4h, v18.4h\n"
+ "usubl v19.8h, v19.8b, v2.8b\n"
+ "smlal v27.4s, v16.4h, v18.4h\n"
+ "smlal v8.4s, v21.4h, v18.4h\n"
+ "ushll v29.8h, v29.8b, #0x0\n"
+ "ldr x20, [x6, #0x88]\n"
+ "smlal2 v24.4s, v1.8h, v3.8h\n"
+ "smlal v13.4s, v23.4h, v0.4h\n"
+ "usubl v10.8h, v10.8b, v2.8b\n"
+ "ldr x21, [x6, #0x90]\n"
+ "smlal2 v14.4s, v1.8h, v18.8h\n"
+ "ldr d1, [x22, x4]\n"
+ "smlal2 v22.4s, v16.8h, v18.8h\n"
+ "ushll v1.8h, v1.8b, #0x0\n"
+ "smlal2 v17.4s, v21.8h, v18.8h\n"
+ "ldr d18, [x20, x4]\n"
+ "smlal v7.4s, v4.4h, v3.4h\n"
+ "usubl v5.8h, v5.8b, v2.8b\n"
+ "smlal v27.4s, v21.4h, v3.4h\n"
+ "smlal v8.4s, v9.4h, v3.4h\n"
+ "ldr x20, [x6, #0x98]\n"
+ "ushll v18.8h, v18.8b, #0x0\n"
+ "smlal2 v24.4s, v23.8h, v0.8h\n"
+ "ldr d23, [x7, #0x58]\n"
+ "smlal v13.4s, v30.4h, v20.4h\n"
+ "usubl v23.8h, v23.8b, v2.8b\n"
+ "smlal2 v14.4s, v4.8h, v3.8h\n"
+ "ldr d4, [x21, x4]\n"
+ "smlal2 v22.4s, v21.8h, v3.8h\n"
+ "ldr x22, [x6, #0xa0]\n"
+ "smlal2 v17.4s, v9.8h, v3.8h\n"
+ "ldr d3, [x20, x4]\n"
+ "smlal v7.4s, v30.4h, v0.4h\n"
+ "ushll v4.8h, v4.8b, #0x0\n"
+ "smlal v27.4s, v11.4h, v0.4h\n"
+ "smlal v8.4s, v15.4h, v0.4h\n"
+ "ushll v3.8h, v3.8b, #0x0\n"
+ "ldr x21, [x6, #0xa8]\n"
+ "smlal2 v24.4s, v30.8h, v20.8h\n"
+ "smlal v13.4s, v28.4h, v6.4h\n"
+ "ldr x20, [x6, #0xb0]\n"
+ "ldr x12, [x6, #0xb8]\n"
+ "smlal2 v14.4s, v30.8h, v0.8h\n"
+ "ldr d30, [x7, #0x60]\n"
+ "smlal2 v22.4s, v11.8h, v0.8h\n"
+ "usubl v30.8h, v30.8b, v2.8b\n"
+ "smlal2 v17.4s, v15.8h, v0.8h\n"
+ "ldr d0, [x22, x4]\n"
+ "smlal v7.4s, v28.4h, v20.4h\n"
+ "ushll v0.8h, v0.8b, #0x0\n"
+ "smlal v27.4s, v15.4h, v20.4h\n"
+ "smlal v8.4s, v31.4h, v20.4h\n"
+ "ldr x11, [x6, #0xc0]\n"
+ "ldr x10, [x6, #0xc8]\n"
+ "smlal2 v24.4s, v28.8h, v6.8h\n"
+ "smlal v13.4s, v16.4h, v19.4h\n"
+ "ldr x9, [x6, #0xd0]\n"
+ "ldr x28, [x6, #0xd8]\n"
+ "smlal2 v14.4s, v28.8h, v20.8h\n"
+ "ldr d28, [x7, #0x68]\n"
+ "smlal2 v22.4s, v15.8h, v20.8h\n"
+ "usubl v28.8h, v28.8b, v2.8b\n"
+ "smlal2 v17.4s, v31.8h, v20.8h\n"
+ "ldr d20, [x21, x4]\n"
+ "smlal v7.4s, v16.4h, v6.4h\n"
+ "ushll v20.8h, v20.8b, #0x0\n"
+ "smlal v27.4s, v31.4h, v6.4h\n"
+ "smlal v8.4s, v29.4h, v6.4h\n"
+ "ldr x27, [x6, #0xe0]\n"
+ "ldr x26, [x6, #0xe8]\n"
+ "smlal2 v24.4s, v16.8h, v19.8h\n"
+ "smlal v13.4s, v21.4h, v10.4h\n"
+ "ldr x25, [x6, #0xf0]\n"
+ "ldr x24, [x6, #0xf8]\n"
+ "smlal2 v14.4s, v16.8h, v6.8h\n"
+ "ldr d16, [x7, #0x70]\n"
+ "smlal2 v22.4s, v31.8h, v6.8h\n"
+ "usubl v16.8h, v16.8b, v2.8b\n"
+ "smlal2 v17.4s, v29.8h, v6.8h\n"
+ "ldr d6, [x20, x4]\n"
+ "smlal v7.4s, v21.4h, v19.4h\n"
+ "ushll v6.8h, v6.8b, #0x0\n"
+ "smlal v27.4s, v29.4h, v19.4h\n"
+ "smlal v8.4s, v1.4h, v19.4h\n"
+ "ldr x23, [x6, #0x100]\n"
+ "ldr x22, [x6, #0x108]\n"
+ "smlal2 v24.4s, v21.8h, v10.8h\n"
+ "smlal v13.4s, v11.4h, v5.4h\n"
+ "ldr x21, [x6, #0x110]\n"
+ "ldr x20, [x6, #0x118]\n"
+ "smlal2 v14.4s, v21.8h, v19.8h\n"
+ "ldr d21, [x7, #0x78]\n"
+ "smlal2 v22.4s, v29.8h, v19.8h\n"
+ "usubl v21.8h, v21.8b, v2.8b\n"
+ "smlal2 v17.4s, v1.8h, v19.8h\n"
+ "ldr d19, [x12, x4]\n"
+ "smlal v7.4s, v9.4h, v10.4h\n"
+ "ushll v19.8h, v19.8b, #0x0\n"
+ "smlal v27.4s, v1.4h, v10.4h\n"
+ "smlal v8.4s, v18.4h, v10.4h\n"
+ "tst x2, #0x7\n"
+ "smlal2 v24.4s, v11.8h, v5.8h\n"
+ "ldr d11, [x7, #0x80]\n"
+ "smlal v13.4s, v15.4h, v23.4h\n"
+ "usubl v11.8h, v11.8b, v2.8b\n"
+ "smlal2 v14.4s, v9.8h, v10.8h\n"
+ "ldr d9, [x11, x4]\n"
+ "smlal2 v22.4s, v1.8h, v10.8h\n"
+ "ushll v9.8h, v9.8b, #0x0\n"
+ "smlal2 v17.4s, v18.8h, v10.8h\n"
+ "ldr d10, [x10, x4]\n"
+ "smlal v7.4s, v15.4h, v5.4h\n"
+ "ushll v10.8h, v10.8b, #0x0\n"
+ "smlal v27.4s, v4.4h, v5.4h\n"
+ "smlal v8.4s, v3.4h, v5.4h\n"
+ "smlal2 v24.4s, v15.8h, v23.8h\n"
+ "smlal v13.4s, v31.4h, v30.4h\n"
+ "smlal2 v14.4s, v15.8h, v5.8h\n"
+ "ldr d15, [x7, #0x88]\n"
+ "smlal2 v22.4s, v4.8h, v5.8h\n"
+ "usubl v15.8h, v15.8b, v2.8b\n"
+ "smlal2 v17.4s, v3.8h, v5.8h\n"
+ "ldr d5, [x9, x4]\n"
+ "smlal v7.4s, v31.4h, v23.4h\n"
+ "ushll v5.8h, v5.8b, #0x0\n"
+ "smlal v27.4s, v3.4h, v23.4h\n"
+ "smlal v8.4s, v0.4h, v23.4h\n"
+ "smlal2 v24.4s, v31.8h, v30.8h\n"
+ "smlal v13.4s, v29.4h, v28.4h\n"
+ "smlal2 v14.4s, v31.8h, v23.8h\n"
+ "ldr d31, [x7, #0x90]\n"
+ "smlal2 v22.4s, v3.8h, v23.8h\n"
+ "usubl v31.8h, v31.8b, v2.8b\n"
+ "smlal2 v17.4s, v0.8h, v23.8h\n"
+ "ldr d23, [x28, x4]\n"
+ "smlal v7.4s, v29.4h, v30.4h\n"
+ "ushll v23.8h, v23.8b, #0x0\n"
+ "smlal v27.4s, v0.4h, v30.4h\n"
+ "smlal v8.4s, v20.4h, v30.4h\n"
+ "smlal2 v24.4s, v29.8h, v28.8h\n"
+ "smlal v13.4s, v1.4h, v16.4h\n"
+ "smlal2 v14.4s, v29.8h, v30.8h\n"
+ "ldr d29, [x7, #0x98]\n"
+ "smlal2 v22.4s, v0.8h, v30.8h\n"
+ "usubl v29.8h, v29.8b, v2.8b\n"
+ "smlal2 v17.4s, v20.8h, v30.8h\n"
+ "ldr d30, [x27, x4]\n"
+ "smlal v7.4s, v1.4h, v28.4h\n"
+ "ushll v30.8h, v30.8b, #0x0\n"
+ "smlal v27.4s, v20.4h, v28.4h\n"
+ "smlal v8.4s, v6.4h, v28.4h\n"
+ "smlal2 v24.4s, v1.8h, v16.8h\n"
+ "smlal v13.4s, v4.4h, v21.4h\n"
+ "smlal2 v14.4s, v1.8h, v28.8h\n"
+ "ldr d1, [x7, #0xa0]\n"
+ "smlal2 v22.4s, v20.8h, v28.8h\n"
+ "usubl v1.8h, v1.8b, v2.8b\n"
+ "smlal2 v17.4s, v6.8h, v28.8h\n"
+ "ldr d28, [x26, x4]\n"
+ "smlal v7.4s, v18.4h, v16.4h\n"
+ "ushll v28.8h, v28.8b, #0x0\n"
+ "smlal v27.4s, v6.4h, v16.4h\n"
+ "smlal v8.4s, v19.4h, v16.4h\n"
+ "smlal2 v24.4s, v4.8h, v21.8h\n"
+ "ldr d4, [x7, #0xa8]\n"
+ "smlal v13.4s, v3.4h, v11.4h\n"
+ "usubl v4.8h, v4.8b, v2.8b\n"
+ "smlal2 v14.4s, v18.8h, v16.8h\n"
+ "ldr d18, [x25, x4]\n"
+ "smlal2 v22.4s, v6.8h, v16.8h\n"
+ "ushll v18.8h, v18.8b, #0x0\n"
+ "smlal2 v17.4s, v19.8h, v16.8h\n"
+ "ldr d16, [x24, x4]\n"
+ "smlal v7.4s, v3.4h, v21.4h\n"
+ "ushll v16.8h, v16.8b, #0x0\n"
+ "smlal v27.4s, v9.4h, v21.4h\n"
+ "smlal v8.4s, v10.4h, v21.4h\n"
+ "smlal2 v24.4s, v3.8h, v11.8h\n"
+ "smlal v13.4s, v0.4h, v15.4h\n"
+ "smlal2 v14.4s, v3.8h, v21.8h\n"
+ "ldr d3, [x7, #0xb0]\n"
+ "smlal2 v22.4s, v9.8h, v21.8h\n"
+ "usubl v3.8h, v3.8b, v2.8b\n"
+ "smlal2 v17.4s, v10.8h, v21.8h\n"
+ "ldr d21, [x23, x4]\n"
+ "smlal v7.4s, v0.4h, v11.4h\n"
+ "ushll v21.8h, v21.8b, #0x0\n"
+ "smlal v27.4s, v10.4h, v11.4h\n"
+ "smlal v8.4s, v5.4h, v11.4h\n"
+ "smlal2 v24.4s, v0.8h, v15.8h\n"
+ "smlal v13.4s, v20.4h, v31.4h\n"
+ "smlal2 v14.4s, v0.8h, v11.8h\n"
+ "ldr d0, [x7, #0xb8]\n"
+ "smlal2 v22.4s, v10.8h, v11.8h\n"
+ "usubl v0.8h, v0.8b, v2.8b\n"
+ "smlal2 v17.4s, v5.8h, v11.8h\n"
+ "ldr d11, [x22, x4]\n"
+ "smlal v7.4s, v20.4h, v15.4h\n"
+ "ushll v11.8h, v11.8b, #0x0\n"
+ "smlal v27.4s, v5.4h, v15.4h\n"
+ "smlal v8.4s, v23.4h, v15.4h\n"
+ "smlal2 v24.4s, v20.8h, v31.8h\n"
+ "smlal v13.4s, v6.4h, v29.4h\n"
+ "smlal2 v14.4s, v20.8h, v15.8h\n"
+ "ldr d20, [x7, #0xc0]\n"
+ "smlal2 v22.4s, v5.8h, v15.8h\n"
+ "usubl v20.8h, v20.8b, v2.8b\n"
+ "smlal2 v17.4s, v23.8h, v15.8h\n"
+ "ldr d15, [x21, x4]\n"
+ "smlal v7.4s, v6.4h, v31.4h\n"
+ "ushll v15.8h, v15.8b, #0x0\n"
+ "smlal v27.4s, v23.4h, v31.4h\n"
+ "smlal v8.4s, v30.4h, v31.4h\n"
+ "smlal2 v24.4s, v6.8h, v29.8h\n"
+ "smlal v13.4s, v9.4h, v1.4h\n"
+ "smlal2 v14.4s, v6.8h, v31.8h\n"
+ "ldr d6, [x20, x4]\n"
+ "smlal2 v22.4s, v23.8h, v31.8h\n"
+ "ushll v6.8h, v6.8b, #0x0\n"
+ "smlal2 v17.4s, v30.8h, v31.8h\n"
+ "ldr q31, [x8, #0x0]\n"
+ "smlal v7.4s, v19.4h, v29.4h\n"
+ "add x4, x4, #0x8\n"
+ "smlal v27.4s, v30.4h, v29.4h\n"
+ "smlal v8.4s, v28.4h, v29.4h\n"
+ "smlal2 v24.4s, v9.8h, v1.8h\n"
+ "ldr q9, [x17, #0x0]\n"
+ "smlal v13.4s, v10.4h, v4.4h\n"
+ "smlal2 v14.4s, v19.8h, v29.8h\n"
+ "ldr q19, [x8, #0x10]\n"
+ "smlal2 v22.4s, v30.8h, v29.8h\n"
+ "add x8, x8, #0x20\n"
+ "smlal2 v17.4s, v28.8h, v29.8h\n"
+ "ldr q29, [x17, #0x10]\n"
+ "smlal v7.4s, v10.4h, v1.4h\n"
+ "add x17, x17, #0x20\n"
+ "smlal v27.4s, v18.4h, v1.4h\n"
+ "smlal v8.4s, v16.4h, v1.4h\n"
+ "smlal2 v24.4s, v10.8h, v4.8h\n"
+ "smlal v13.4s, v5.4h, v3.4h\n"
+ "smlal2 v14.4s, v10.8h, v1.8h\n"
+ "smlal2 v22.4s, v18.8h, v1.8h\n"
+ "smlal2 v17.4s, v16.8h, v1.8h\n"
+ "smlal v7.4s, v5.4h, v4.4h\n"
+ "smlal v27.4s, v16.4h, v4.4h\n"
+ "smlal v8.4s, v21.4h, v4.4h\n"
+ "smlal2 v24.4s, v5.8h, v3.8h\n"
+ "smlal v13.4s, v23.4h, v0.4h\n"
+ "smlal2 v14.4s, v5.8h, v4.8h\n"
+ "smlal2 v22.4s, v16.8h, v4.8h\n"
+ "smlal2 v17.4s, v21.8h, v4.8h\n"
+ "smlal v7.4s, v23.4h, v3.4h\n"
+ "smlal v27.4s, v21.4h, v3.4h\n"
+ "smlal v8.4s, v11.4h, v3.4h\n"
+ "smlal2 v24.4s, v23.8h, v0.8h\n"
+ "smlal v13.4s, v30.4h, v20.4h\n"
+ "sqrdmulh v13.4s, v13.4s, v31.4s\n"
+ "smlal2 v14.4s, v23.8h, v3.8h\n"
+ "smlal2 v22.4s, v21.8h, v3.8h\n"
+ "and v21.16b, v13.16b, v9.16b\n"
+ "smlal2 v17.4s, v11.8h, v3.8h\n"
+ "smlal v7.4s, v30.4h, v0.4h\n"
+ "sshr v21.4s, v21.4s, #0x1f\n"
+ "smlal v27.4s, v11.4h, v0.4h\n"
+ "smlal v8.4s, v15.4h, v0.4h\n"
+ "sqadd v13.4s, v13.4s, v21.4s\n"
+ "smlal2 v24.4s, v30.8h, v20.8h\n"
+ "smlal2 v14.4s, v30.8h, v0.8h\n"
+ "sqrdmulh v24.4s, v24.4s, v19.4s\n"
+ "smlal2 v22.4s, v11.8h, v0.8h\n"
+ "smlal2 v17.4s, v15.8h, v0.8h\n"
+ "and v16.16b, v24.16b, v29.16b\n"
+ "smlal v7.4s, v28.4h, v20.4h\n"
+ "smlal v27.4s, v15.4h, v20.4h\n"
+ "sqrdmulh v7.4s, v7.4s, v31.4s\n"
+ "smlal v8.4s, v6.4h, v20.4h\n"
+ "smlal2 v14.4s, v28.8h, v20.8h\n"
+ "sqrdmulh v27.4s, v27.4s, v31.4s\n"
+ "smlal2 v22.4s, v15.8h, v20.8h\n"
+ "smlal2 v17.4s, v6.8h, v20.8h\n"
+ "sqrdmulh v8.4s, v8.4s, v31.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "and v23.16b, v7.16b, v9.16b\n"
+ "sqrdmulh v14.4s, v14.4s, v19.4s\n"
+ "and v20.16b, v27.16b, v9.16b\n"
+ "sqrdmulh v22.4s, v22.4s, v19.4s\n"
+ "and v3.16b, v8.16b, v9.16b\n"
+ "sqrdmulh v17.4s, v17.4s, v19.4s\n"
+ "sqadd v24.4s, v24.4s, v16.4s\n"
+ "sshr v23.4s, v23.4s, #0x1f\n"
+ "and v18.16b, v14.16b, v29.16b\n"
+ "sshr v20.4s, v20.4s, #0x1f\n"
+ "and v19.16b, v22.16b, v29.16b\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "and v30.16b, v17.16b, v29.16b\n"
+ "sqadd v7.4s, v7.4s, v23.4s\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sqadd v27.4s, v27.4s, v20.4s\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "sqadd v8.4s, v8.4s, v3.4s\n"
+ "sshr v30.4s, v30.4s, #0x1f\n"
+ "srshl v13.4s, v13.4s, v9.4s\n"
+ "srshl v7.4s, v7.4s, v9.4s\n"
+ "sqadd v14.4s, v14.4s, v18.4s\n"
+ "srshl v27.4s, v27.4s, v9.4s\n"
+ "sqadd v22.4s, v22.4s, v19.4s\n"
+ "srshl v8.4s, v8.4s, v9.4s\n"
+ "sqadd v17.4s, v17.4s, v30.4s\n"
+ "srshl v24.4s, v24.4s, v29.4s\n"
+ "sqxtn v13.4h, v13.4s\n"
+ "srshl v14.4s, v14.4s, v29.4s\n"
+ "sqxtn v7.4h, v7.4s\n"
+ "srshl v22.4s, v22.4s, v29.4s\n"
+ "sqxtn v27.4h, v27.4s\n"
+ "srshl v17.4s, v17.4s, v29.4s\n"
+ "sqxtn v8.4h, v8.4s\n"
+ "sqxtn2 v13.8h, v24.4s\n"
+ "sqxtn2 v7.8h, v14.4s\n"
+ "sqxtn2 v27.8h, v22.4s\n"
+ "sqxtn2 v8.8h, v17.4s\n"
+ "sqadd v13.8h, v13.8h, v25.8h\n"
+ "sqadd v7.8h, v7.8h, v25.8h\n"
+ "sqadd v27.8h, v27.8h, v25.8h\n"
+ "sqadd v8.8h, v8.8h, v25.8h\n"
+ "smax v13.8h, v13.8h, v12.8h\n"
+ "smax v7.8h, v7.8h, v12.8h\n"
+ "smax v27.8h, v27.8h, v12.8h\n"
+ "smax v8.8h, v8.8h, v12.8h\n"
+ "smin v13.8h, v13.8h, v26.8h\n"
+ "smin v7.8h, v7.8h, v26.8h\n"
+ "smin v27.8h, v27.8h, v26.8h\n"
+ "smin v8.8h, v8.8h, v26.8h\n"
+ "uzp1 v13.16b, v13.16b, v13.16b\n"
+ "str d13, [x16, x5]\n"
+ "uzp1 v7.16b, v7.16b, v7.16b\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "str d7, [x15, x5]\n"
+ "uzp1 v8.16b, v8.16b, v8.16b\n"
+ "str d27, [x14, x5]\n"
+ "str d8, [x13, x5]\n"
+ "add x5, x5, #0x8\n"
+ "beq 124f\n"
+ "add x7, x7, #0xc8\n"
+ "3:" // Oddments
+ "ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "tbz x2, #2, 5f\n"
+ "ld1 { v13.4s }, [x20], #0x10\n"
+ "tbz x2, #1, 4f\n"
+ "ld1 { v24.d }[0], [x20], #0x8\n"
+ "tbz x2, #0, 7f\n"
+ "ld1 { v24.s }[2], [x20]\n"
+ "b 7f\n"
+ "4:" // Oddments: Load bias: Bit 2: Bit 1: Unset
+ "tbz x2, #0, 7f\n"
+ "ld1 { v24.s }[0], [x20]\n"
+ "b 7f\n"
+ "5:" // Oddments: Load bias: Bit 2: Unset
+ "tbz x2, #1, 6f\n"
+ "ld1 { v13.d }[0], [x20], #0x8\n"
+ "tbz x2, #0, 7f\n"
+ "ld1 { v13.s }[2], [x20]\n"
+ "b 7f\n"
+ "6:" // Oddments: Load bias: Bit 2: Unset: Bit 1: Unset
+ "tbz x2, #0, 7f\n"
+ "ld1 { v13.s }[0], [x20]\n"
+ "7:" // Oddments: Load bias: Bit 2: End
+ "ldr d21, [x7, #0x0]\n"
+ "ldr d15, [x7, #0x8]\n"
+ "mov v7.16b, v13.16b\n"
+ "mov v14.16b, v24.16b\n"
+ "ldr d29, [x7, #0x10]\n"
+ "ldr d18, [x7, #0x18]\n"
+ "mov v27.16b, v13.16b\n"
+ "mov v22.16b, v24.16b\n"
+ "ldr d3, [x7, #0x20]\n"
+ "ldp x9, x28, [x6, #0x0]\n"
+ "mov v8.16b, v13.16b\n"
+ "mov v17.16b, v24.16b\n"
+ "ldp x27, x26, [x6, #0x10]\n"
+ "ldp x25, x24, [x6, #0x20]\n"
+ "usubl v21.8h, v21.8b, v2.8b\n"
+ "usubl v15.8h, v15.8b, v2.8b\n"
+ "ldp x23, x22, [x6, #0x30]\n"
+ "ldp x21, x20, [x6, #0x40]\n"
+ "usubl v29.8h, v29.8b, v2.8b\n"
+ "usubl v18.8h, v18.8b, v2.8b\n"
+ "usubl v3.8h, v3.8b, v2.8b\n"
+ "add x9, x9, x4\n"
+ "add x28, x28, x4\n"
+ "add x27, x27, x4\n"
+ "add x26, x26, x4\n"
+ "add x25, x25, x4\n"
+ "add x24, x24, x4\n"
+ "add x23, x23, x4\n"
+ "add x22, x22, x4\n"
+ "add x21, x21, x4\n"
+ "add x20, x20, x4\n"
+ "tbz x2, #2, 9f\n"
+ "ld1 { v10.s }[0], [x9], #0x4\n"
+ "ld1 { v16.s }[0], [x28], #0x4\n"
+ "ld1 { v23.s }[0], [x27], #0x4\n"
+ "ld1 { v30.s }[0], [x26], #0x4\n"
+ "ld1 { v4.s }[0], [x25], #0x4\n"
+ "ld1 { v28.s }[0], [x24], #0x4\n"
+ "ld1 { v31.s }[0], [x23], #0x4\n"
+ "ld1 { v1.s }[0], [x22], #0x4\n"
+ "ld1 { v9.s }[0], [x21], #0x4\n"
+ "ld1 { v11.s }[0], [x20], #0x4\n"
+ "tbz x2, #1, 8f\n"
+ "ld1 { v10.h }[2], [x9], #0x2\n"
+ "ld1 { v16.h }[2], [x28], #0x2\n"
+ "ld1 { v23.h }[2], [x27], #0x2\n"
+ "ld1 { v30.h }[2], [x26], #0x2\n"
+ "ld1 { v4.h }[2], [x25], #0x2\n"
+ "ld1 { v28.h }[2], [x24], #0x2\n"
+ "ld1 { v31.h }[2], [x23], #0x2\n"
+ "ld1 { v1.h }[2], [x22], #0x2\n"
+ "ld1 { v9.h }[2], [x21], #0x2\n"
+ "ld1 { v11.h }[2], [x20], #0x2\n"
+ "tbz x2, #0, 11f\n"
+ "ld1 { v10.b }[6], [x9]\n"
+ "ld1 { v16.b }[6], [x28]\n"
+ "ld1 { v23.b }[6], [x27]\n"
+ "ld1 { v30.b }[6], [x26]\n"
+ "ld1 { v4.b }[6], [x25]\n"
+ "ld1 { v28.b }[6], [x24]\n"
+ "ld1 { v31.b }[6], [x23]\n"
+ "ld1 { v1.b }[6], [x22]\n"
+ "ld1 { v9.b }[6], [x21]\n"
+ "ld1 { v11.b }[6], [x20]\n"
+ "b 11f\n"
+ "8:" // Oddments: Initial loads: Bit 2: Bit 1: Unset
+ "tbz x2, #0, 11f\n"
+ "ld1 { v10.b }[4], [x9]\n"
+ "ld1 { v16.b }[4], [x28]\n"
+ "ld1 { v23.b }[4], [x27]\n"
+ "ld1 { v30.b }[4], [x26]\n"
+ "ld1 { v4.b }[4], [x25]\n"
+ "ld1 { v28.b }[4], [x24]\n"
+ "ld1 { v31.b }[4], [x23]\n"
+ "ld1 { v1.b }[4], [x22]\n"
+ "ld1 { v9.b }[4], [x21]\n"
+ "ld1 { v11.b }[4], [x20]\n"
+ "b 11f\n"
+ "9:" // Oddments: Initial loads: Bit 2: Unset
+ "tbz x2, #1, 10f\n"
+ "ld1 { v10.h }[0], [x9], #0x2\n"
+ "ld1 { v16.h }[0], [x28], #0x2\n"
+ "ld1 { v23.h }[0], [x27], #0x2\n"
+ "ld1 { v30.h }[0], [x26], #0x2\n"
+ "ld1 { v4.h }[0], [x25], #0x2\n"
+ "ld1 { v28.h }[0], [x24], #0x2\n"
+ "ld1 { v31.h }[0], [x23], #0x2\n"
+ "ld1 { v1.h }[0], [x22], #0x2\n"
+ "ld1 { v9.h }[0], [x21], #0x2\n"
+ "ld1 { v11.h }[0], [x20], #0x2\n"
+ "tbz x2, #0, 11f\n"
+ "ld1 { v10.b }[2], [x9]\n"
+ "ld1 { v16.b }[2], [x28]\n"
+ "ld1 { v23.b }[2], [x27]\n"
+ "ld1 { v30.b }[2], [x26]\n"
+ "ld1 { v4.b }[2], [x25]\n"
+ "ld1 { v28.b }[2], [x24]\n"
+ "ld1 { v31.b }[2], [x23]\n"
+ "ld1 { v1.b }[2], [x22]\n"
+ "ld1 { v9.b }[2], [x21]\n"
+ "ld1 { v11.b }[2], [x20]\n"
+ "b 11f\n"
+ "10:" // Oddments: Initial loads: Bit 2: Unset: Bit 1: Unset
+ "tbz x2, #0, 11f\n"
+ "ld1 { v10.b }[0], [x9]\n"
+ "ld1 { v16.b }[0], [x28]\n"
+ "ld1 { v23.b }[0], [x27]\n"
+ "ld1 { v30.b }[0], [x26]\n"
+ "ld1 { v4.b }[0], [x25]\n"
+ "ld1 { v28.b }[0], [x24]\n"
+ "ld1 { v31.b }[0], [x23]\n"
+ "ld1 { v1.b }[0], [x22]\n"
+ "ld1 { v9.b }[0], [x21]\n"
+ "ld1 { v11.b }[0], [x20]\n"
+ "11:" // Oddments: Initial loads: Bit 2: End
+ "ushll v10.8h, v10.8b, #0x0\n"
+ "ushll v16.8h, v16.8b, #0x0\n"
+ "smlal v13.4s, v10.4h, v21.4h\n"
+ "ldr x20, [x6, #0x50]\n"
+ "ushll v23.8h, v23.8b, #0x0\n"
+ "smlal2 v24.4s, v10.8h, v21.8h\n"
+ "smlal v7.4s, v16.4h, v21.4h\n"
+ "smlal2 v14.4s, v16.8h, v21.8h\n"
+ "smlal v27.4s, v23.4h, v21.4h\n"
+ "ushll v30.8h, v30.8b, #0x0\n"
+ "add x20, x20, x4\n"
+ "smlal2 v22.4s, v23.8h, v21.8h\n"
+ "ushll v4.8h, v4.8b, #0x0\n"
+ "smlal v8.4s, v30.4h, v21.4h\n"
+ "smlal2 v17.4s, v30.8h, v21.8h\n"
+ "smlal v13.4s, v16.4h, v15.4h\n"
+ "ushll v28.8h, v28.8b, #0x0\n"
+ "smlal2 v24.4s, v16.8h, v15.8h\n"
+ "smlal v7.4s, v4.4h, v15.4h\n"
+ "ushll v31.8h, v31.8b, #0x0\n"
+ "smlal2 v14.4s, v4.8h, v15.8h\n"
+ "smlal v27.4s, v30.4h, v15.4h\n"
+ "ushll v1.8h, v1.8b, #0x0\n"
+ "smlal2 v22.4s, v30.8h, v15.8h\n"
+ "ushll v9.8h, v9.8b, #0x0\n"
+ "smlal v8.4s, v28.4h, v15.4h\n"
+ "ushll v11.8h, v11.8b, #0x0\n"
+ "smlal2 v17.4s, v28.8h, v15.8h\n"
+ "smlal v13.4s, v4.4h, v29.4h\n"
+ "smlal2 v24.4s, v4.8h, v29.8h\n"
+ "smlal v7.4s, v31.4h, v29.4h\n"
+ "smlal2 v14.4s, v31.8h, v29.8h\n"
+ "smlal v27.4s, v28.4h, v29.4h\n"
+ "smlal2 v22.4s, v28.8h, v29.8h\n"
+ "tbz x2, #2, 13f\n"
+ "ld1 { v5.s }[0], [x20], #0x4\n"
+ "tbz x2, #1, 12f\n"
+ "ld1 { v5.h }[2], [x20], #0x2\n"
+ "tbz x2, #0, 15f\n"
+ "ld1 { v5.b }[6], [x20]\n"
+ "b 15f\n"
+ "12:" // Oddments: Load (1, 3): Bit 2: Bit 1: Unset
+ "tbz x2, #0, 15f\n"
+ "ld1 { v5.b }[4], [x20]\n"
+ "b 15f\n"
+ "13:" // Oddments: Load (1, 3): Bit 2: Unset
+ "tbz x2, #1, 14f\n"
+ "ld1 { v5.h }[0], [x20], #0x2\n"
+ "tbz x2, #0, 15f\n"
+ "ld1 { v5.b }[2], [x20]\n"
+ "b 15f\n"
+ "14:" // Oddments: Load (1, 3): Bit 2: Unset: Bit 1: Unset
+ "tbz x2, #0, 15f\n"
+ "ld1 { v5.b }[0], [x20]\n"
+ "15:" // Oddments: Load (1, 3): Bit 2: End
+ "ushll v5.8h, v5.8b, #0x0\n"
+ "ldr x20, [x6, #0x58]\n"
+ "smlal v8.4s, v5.4h, v29.4h\n"
+ "smlal2 v17.4s, v5.8h, v29.8h\n"
+ "smlal v13.4s, v31.4h, v18.4h\n"
+ "smlal2 v24.4s, v31.8h, v18.8h\n"
+ "add x20, x20, x4\n"
+ "smlal v7.4s, v1.4h, v18.4h\n"
+ "smlal2 v14.4s, v1.8h, v18.8h\n"
+ "smlal v27.4s, v5.4h, v18.4h\n"
+ "smlal2 v22.4s, v5.8h, v18.8h\n"
+ "tbz x2, #2, 17f\n"
+ "ld1 { v10.s }[0], [x20], #0x4\n"
+ "tbz x2, #1, 16f\n"
+ "ld1 { v10.h }[2], [x20], #0x2\n"
+ "tbz x2, #0, 19f\n"
+ "ld1 { v10.b }[6], [x20]\n"
+ "b 19f\n"
+ "16:" // Oddments: Load (1, 4): Bit 2: Bit 1: Unset
+ "tbz x2, #0, 19f\n"
+ "ld1 { v10.b }[4], [x20]\n"
+ "b 19f\n"
+ "17:" // Oddments: Load (1, 4): Bit 2: Unset
+ "tbz x2, #1, 18f\n"
+ "ld1 { v10.h }[0], [x20], #0x2\n"
+ "tbz x2, #0, 19f\n"
+ "ld1 { v10.b }[2], [x20]\n"
+ "b 19f\n"
+ "18:" // Oddments: Load (1, 4): Bit 2: Unset: Bit 1: Unset
+ "tbz x2, #0, 19f\n"
+ "ld1 { v10.b }[0], [x20]\n"
+ "19:" // Oddments: Load (1, 4): Bit 2: End
+ "ushll v10.8h, v10.8b, #0x0\n"
+ "ldr x20, [x6, #0x60]\n"
+ "smlal v8.4s, v10.4h, v18.4h\n"
+ "smlal2 v17.4s, v10.8h, v18.8h\n"
+ "smlal v13.4s, v1.4h, v3.4h\n"
+ "smlal2 v24.4s, v1.8h, v3.8h\n"
+ "add x20, x20, x4\n"
+ "tbz x2, #2, 21f\n"
+ "ld1 { v15.s }[0], [x20], #0x4\n"
+ "tbz x2, #1, 20f\n"
+ "ld1 { v15.h }[2], [x20], #0x2\n"
+ "tbz x2, #0, 23f\n"
+ "ld1 { v15.b }[6], [x20]\n"
+ "b 23f\n"
+ "20:" // Oddments: Load (0, 5): Bit 2: Bit 1: Unset
+ "tbz x2, #0, 23f\n"
+ "ld1 { v15.b }[4], [x20]\n"
+ "b 23f\n"
+ "21:" // Oddments: Load (0, 5): Bit 2: Unset
+ "tbz x2, #1, 22f\n"
+ "ld1 { v15.h }[0], [x20], #0x2\n"
+ "tbz x2, #0, 23f\n"
+ "ld1 { v15.b }[2], [x20]\n"
+ "b 23f\n"
+ "22:" // Oddments: Load (0, 5): Bit 2: Unset: Bit 1: Unset
+ "tbz x2, #0, 23f\n"
+ "ld1 { v15.b }[0], [x20]\n"
+ "23:" // Oddments: Load (0, 5): Bit 2: End
+ "ldr d6, [x7, #0x28]\n"
+ "ushll v15.8h, v15.8b, #0x0\n"
+ "smlal v7.4s, v15.4h, v3.4h\n"
+ "smlal2 v14.4s, v15.8h, v3.8h\n"
+ "smlal v27.4s, v10.4h, v3.4h\n"
+ "smlal2 v22.4s, v10.8h, v3.8h\n"
+ "usubl v6.8h, v6.8b, v2.8b\n"
+ "ldr x20, [x6, #0x68]\n"
+ "smlal v8.4s, v9.4h, v3.4h\n"
+ "smlal2 v17.4s, v9.8h, v3.8h\n"
+ "add x20, x20, x4\n"
+ "smlal v13.4s, v23.4h, v6.4h\n"
+ "smlal2 v24.4s, v23.8h, v6.8h\n"
+ "smlal v7.4s, v30.4h, v6.4h\n"
+ "smlal2 v14.4s, v30.8h, v6.8h\n"
+ "smlal v27.4s, v11.4h, v6.4h\n"
+ "smlal2 v22.4s, v11.8h, v6.8h\n"
+ "tbz x2, #2, 25f\n"
+ "ld1 { v20.s }[0], [x20], #0x4\n"
+ "tbz x2, #1, 24f\n"
+ "ld1 { v20.h }[2], [x20], #0x2\n"
+ "tbz x2, #0, 27f\n"
+ "ld1 { v20.b }[6], [x20]\n"
+ "b 27f\n"
+ "24:" // Oddments: Load (2, 1): Bit 2: Bit 1: Unset
+ "tbz x2, #0, 27f\n"
+ "ld1 { v20.b }[4], [x20]\n"
+ "b 27f\n"
+ "25:" // Oddments: Load (2, 1): Bit 2: Unset
+ "tbz x2, #1, 26f\n"
+ "ld1 { v20.h }[0], [x20], #0x2\n"
+ "tbz x2, #0, 27f\n"
+ "ld1 { v20.b }[2], [x20]\n"
+ "b 27f\n"
+ "26:" // Oddments: Load (2, 1): Bit 2: Unset: Bit 1: Unset
+ "tbz x2, #0, 27f\n"
+ "ld1 { v20.b }[0], [x20]\n"
+ "27:" // Oddments: Load (2, 1): Bit 2: End
+ "ldr d4, [x7, #0x30]\n"
+ "ushll v20.8h, v20.8b, #0x0\n"
+ "usubl v4.8h, v4.8b, v2.8b\n"
+ "ldr x20, [x6, #0x70]\n"
+ "smlal v8.4s, v20.4h, v6.4h\n"
+ "smlal2 v17.4s, v20.8h, v6.8h\n"
+ "add x20, x20, x4\n"
+ "smlal v13.4s, v30.4h, v4.4h\n"
+ "smlal2 v24.4s, v30.8h, v4.8h\n"
+ "smlal v7.4s, v28.4h, v4.4h\n"
+ "smlal2 v14.4s, v28.8h, v4.8h\n"
+ "smlal v27.4s, v20.4h, v4.4h\n"
+ "smlal2 v22.4s, v20.8h, v4.8h\n"
+ "tbz x2, #2, 29f\n"
+ "ld1 { v23.s }[0], [x20], #0x4\n"
+ "tbz x2, #1, 28f\n"
+ "ld1 { v23.h }[2], [x20], #0x2\n"
+ "tbz x2, #0, 31f\n"
+ "ld1 { v23.b }[6], [x20]\n"
+ "b 31f\n"
+ "28:" // Oddments: Load (2, 2): Bit 2: Bit 1: Unset
+ "tbz x2, #0, 31f\n"
+ "ld1 { v23.b }[4], [x20]\n"
+ "b 31f\n"
+ "29:" // Oddments: Load (2, 2): Bit 2: Unset
+ "tbz x2, #1, 30f\n"
+ "ld1 { v23.h }[0], [x20], #0x2\n"
+ "tbz x2, #0, 31f\n"
+ "ld1 { v23.b }[2], [x20]\n"
+ "b 31f\n"
+ "30:" // Oddments: Load (2, 2): Bit 2: Unset: Bit 1: Unset
+ "tbz x2, #0, 31f\n"
+ "ld1 { v23.b }[0], [x20]\n"
+ "31:" // Oddments: Load (2, 2): Bit 2: End
+ "ldr d30, [x7, #0x38]\n"
+ "ushll v23.8h, v23.8b, #0x0\n"
+ "usubl v30.8h, v30.8b, v2.8b\n"
+ "ldr x20, [x6, #0x78]\n"
+ "smlal v8.4s, v23.4h, v4.4h\n"
+ "smlal2 v17.4s, v23.8h, v4.8h\n"
+ "add x20, x20, x4\n"
+ "smlal v13.4s, v28.4h, v30.4h\n"
+ "smlal2 v24.4s, v28.8h, v30.8h\n"
+ "smlal v7.4s, v5.4h, v30.4h\n"
+ "smlal2 v14.4s, v5.8h, v30.8h\n"
+ "smlal v27.4s, v23.4h, v30.4h\n"
+ "smlal2 v22.4s, v23.8h, v30.8h\n"
+ "tbz x2, #2, 33f\n"
+ "ld1 { v3.s }[0], [x20], #0x4\n"
+ "tbz x2, #1, 32f\n"
+ "ld1 { v3.h }[2], [x20], #0x2\n"
+ "tbz x2, #0, 35f\n"
+ "ld1 { v3.b }[6], [x20]\n"
+ "b 35f\n"
+ "32:" // Oddments: Load (2, 3): Bit 2: Bit 1: Unset
+ "tbz x2, #0, 35f\n"
+ "ld1 { v3.b }[4], [x20]\n"
+ "b 35f\n"
+ "33:" // Oddments: Load (2, 3): Bit 2: Unset
+ "tbz x2, #1, 34f\n"
+ "ld1 { v3.h }[0], [x20], #0x2\n"
+ "tbz x2, #0, 35f\n"
+ "ld1 { v3.b }[2], [x20]\n"
+ "b 35f\n"
+ "34:" // Oddments: Load (2, 3): Bit 2: Unset: Bit 1: Unset
+ "tbz x2, #0, 35f\n"
+ "ld1 { v3.b }[0], [x20]\n"
+ "35:" // Oddments: Load (2, 3): Bit 2: End
+ "ldr d16, [x7, #0x40]\n"
+ "ushll v3.8h, v3.8b, #0x0\n"
+ "usubl v16.8h, v16.8b, v2.8b\n"
+ "ldr x20, [x6, #0x80]\n"
+ "smlal v8.4s, v3.4h, v30.4h\n"
+ "smlal2 v17.4s, v3.8h, v30.8h\n"
+ "add x20, x20, x4\n"
+ "smlal v13.4s, v5.4h, v16.4h\n"
+ "smlal2 v24.4s, v5.8h, v16.8h\n"
+ "smlal v7.4s, v10.4h, v16.4h\n"
+ "smlal2 v14.4s, v10.8h, v16.8h\n"
+ "smlal v27.4s, v3.4h, v16.4h\n"
+ "smlal2 v22.4s, v3.8h, v16.8h\n"
+ "tbz x2, #2, 37f\n"
+ "ld1 { v6.s }[0], [x20], #0x4\n"
+ "tbz x2, #1, 36f\n"
+ "ld1 { v6.h }[2], [x20], #0x2\n"
+ "tbz x2, #0, 39f\n"
+ "ld1 { v6.b }[6], [x20]\n"
+ "b 39f\n"
+ "36:" // Oddments: Load (2, 4): Bit 2: Bit 1: Unset
+ "tbz x2, #0, 39f\n"
+ "ld1 { v6.b }[4], [x20]\n"
+ "b 39f\n"
+ "37:" // Oddments: Load (2, 4): Bit 2: Unset
+ "tbz x2, #1, 38f\n"
+ "ld1 { v6.h }[0], [x20], #0x2\n"
+ "tbz x2, #0, 39f\n"
+ "ld1 { v6.b }[2], [x20]\n"
+ "b 39f\n"
+ "38:" // Oddments: Load (2, 4): Bit 2: Unset: Bit 1: Unset
+ "tbz x2, #0, 39f\n"
+ "ld1 { v6.b }[0], [x20]\n"
+ "39:" // Oddments: Load (2, 4): Bit 2: End
+ "ldr d1, [x7, #0x48]\n"
+ "ushll v6.8h, v6.8b, #0x0\n"
+ "usubl v1.8h, v1.8b, v2.8b\n"
+ "ldr x20, [x6, #0x88]\n"
+ "smlal v8.4s, v6.4h, v16.4h\n"
+ "smlal2 v17.4s, v6.8h, v16.8h\n"
+ "add x20, x20, x4\n"
+ "smlal v13.4s, v10.4h, v1.4h\n"
+ "smlal2 v24.4s, v10.8h, v1.8h\n"
+ "smlal v7.4s, v9.4h, v1.4h\n"
+ "smlal2 v14.4s, v9.8h, v1.8h\n"
+ "smlal v27.4s, v6.4h, v1.4h\n"
+ "smlal2 v22.4s, v6.8h, v1.8h\n"
+ "tbz x2, #2, 41f\n"
+ "ld1 { v18.s }[0], [x20], #0x4\n"
+ "tbz x2, #1, 40f\n"
+ "ld1 { v18.h }[2], [x20], #0x2\n"
+ "tbz x2, #0, 43f\n"
+ "ld1 { v18.b }[6], [x20]\n"
+ "b 43f\n"
+ "40:" // Oddments: Load (2, 5): Bit 2: Bit 1: Unset
+ "tbz x2, #0, 43f\n"
+ "ld1 { v18.b }[4], [x20]\n"
+ "b 43f\n"
+ "41:" // Oddments: Load (2, 5): Bit 2: Unset
+ "tbz x2, #1, 42f\n"
+ "ld1 { v18.h }[0], [x20], #0x2\n"
+ "tbz x2, #0, 43f\n"
+ "ld1 { v18.b }[2], [x20]\n"
+ "b 43f\n"
+ "42:" // Oddments: Load (2, 5): Bit 2: Unset: Bit 1: Unset
+ "tbz x2, #0, 43f\n"
+ "ld1 { v18.b }[0], [x20]\n"
+ "43:" // Oddments: Load (2, 5): Bit 2: End
+ "ldr d28, [x7, #0x50]\n"
+ "ushll v18.8h, v18.8b, #0x0\n"
+ "usubl v28.8h, v28.8b, v2.8b\n"
+ "ldr x20, [x6, #0x90]\n"
+ "smlal v8.4s, v18.4h, v1.4h\n"
+ "smlal2 v17.4s, v18.8h, v1.8h\n"
+ "add x20, x20, x4\n"
+ "smlal v13.4s, v11.4h, v28.4h\n"
+ "smlal2 v24.4s, v11.8h, v28.8h\n"
+ "smlal v7.4s, v20.4h, v28.4h\n"
+ "smlal2 v14.4s, v20.8h, v28.8h\n"
+ "tbz x2, #2, 45f\n"
+ "ld1 { v30.s }[0], [x20], #0x4\n"
+ "tbz x2, #1, 44f\n"
+ "ld1 { v30.h }[2], [x20], #0x2\n"
+ "tbz x2, #0, 47f\n"
+ "ld1 { v30.b }[6], [x20]\n"
+ "b 47f\n"
+ "44:" // Oddments: Load (3, 0): Bit 2: Bit 1: Unset
+ "tbz x2, #0, 47f\n"
+ "ld1 { v30.b }[4], [x20]\n"
+ "b 47f\n"
+ "45:" // Oddments: Load (3, 0): Bit 2: Unset
+ "tbz x2, #1, 46f\n"
+ "ld1 { v30.h }[0], [x20], #0x2\n"
+ "tbz x2, #0, 47f\n"
+ "ld1 { v30.b }[2], [x20]\n"
+ "b 47f\n"
+ "46:" // Oddments: Load (3, 0): Bit 2: Unset: Bit 1: Unset
+ "tbz x2, #0, 47f\n"
+ "ld1 { v30.b }[0], [x20]\n"
+ "47:" // Oddments: Load (3, 0): Bit 2: End
+ "ushll v30.8h, v30.8b, #0x0\n"
+ "ldr x20, [x6, #0x98]\n"
+ "smlal v27.4s, v30.4h, v28.4h\n"
+ "smlal2 v22.4s, v30.8h, v28.8h\n"
+ "add x20, x20, x4\n"
+ "tbz x2, #2, 49f\n"
+ "ld1 { v19.s }[0], [x20], #0x4\n"
+ "tbz x2, #1, 48f\n"
+ "ld1 { v19.h }[2], [x20], #0x2\n"
+ "tbz x2, #0, 51f\n"
+ "ld1 { v19.b }[6], [x20]\n"
+ "b 51f\n"
+ "48:" // Oddments: Load (3, 1): Bit 2: Bit 1: Unset
+ "tbz x2, #0, 51f\n"
+ "ld1 { v19.b }[4], [x20]\n"
+ "b 51f\n"
+ "49:" // Oddments: Load (3, 1): Bit 2: Unset
+ "tbz x2, #1, 50f\n"
+ "ld1 { v19.h }[0], [x20], #0x2\n"
+ "tbz x2, #0, 51f\n"
+ "ld1 { v19.b }[2], [x20]\n"
+ "b 51f\n"
+ "50:" // Oddments: Load (3, 1): Bit 2: Unset: Bit 1: Unset
+ "tbz x2, #0, 51f\n"
+ "ld1 { v19.b }[0], [x20]\n"
+ "51:" // Oddments: Load (3, 1): Bit 2: End
+ "ldr d0, [x7, #0x58]\n"
+ "ushll v19.8h, v19.8b, #0x0\n"
+ "usubl v0.8h, v0.8b, v2.8b\n"
+ "ldr x20, [x6, #0xa0]\n"
+ "smlal v8.4s, v19.4h, v28.4h\n"
+ "smlal2 v17.4s, v19.8h, v28.8h\n"
+ "add x20, x20, x4\n"
+ "smlal v13.4s, v20.4h, v0.4h\n"
+ "smlal2 v24.4s, v20.8h, v0.8h\n"
+ "smlal v7.4s, v23.4h, v0.4h\n"
+ "smlal2 v14.4s, v23.8h, v0.8h\n"
+ "smlal v27.4s, v19.4h, v0.4h\n"
+ "smlal2 v22.4s, v19.8h, v0.8h\n"
+ "tbz x2, #2, 53f\n"
+ "ld1 { v9.s }[0], [x20], #0x4\n"
+ "tbz x2, #1, 52f\n"
+ "ld1 { v9.h }[2], [x20], #0x2\n"
+ "tbz x2, #0, 55f\n"
+ "ld1 { v9.b }[6], [x20]\n"
+ "b 55f\n"
+ "52:" // Oddments: Load (3, 2): Bit 2: Bit 1: Unset
+ "tbz x2, #0, 55f\n"
+ "ld1 { v9.b }[4], [x20]\n"
+ "b 55f\n"
+ "53:" // Oddments: Load (3, 2): Bit 2: Unset
+ "tbz x2, #1, 54f\n"
+ "ld1 { v9.h }[0], [x20], #0x2\n"
+ "tbz x2, #0, 55f\n"
+ "ld1 { v9.b }[2], [x20]\n"
+ "b 55f\n"
+ "54:" // Oddments: Load (3, 2): Bit 2: Unset: Bit 1: Unset
+ "tbz x2, #0, 55f\n"
+ "ld1 { v9.b }[0], [x20]\n"
+ "55:" // Oddments: Load (3, 2): Bit 2: End
+ "ldr d10, [x7, #0x60]\n"
+ "ushll v9.8h, v9.8b, #0x0\n"
+ "usubl v10.8h, v10.8b, v2.8b\n"
+ "ldr x20, [x6, #0xa8]\n"
+ "smlal v8.4s, v9.4h, v0.4h\n"
+ "smlal2 v17.4s, v9.8h, v0.8h\n"
+ "add x20, x20, x4\n"
+ "smlal v13.4s, v23.4h, v10.4h\n"
+ "smlal2 v24.4s, v23.8h, v10.8h\n"
+ "smlal v7.4s, v3.4h, v10.4h\n"
+ "smlal2 v14.4s, v3.8h, v10.8h\n"
+ "smlal v27.4s, v9.4h, v10.4h\n"
+ "smlal2 v22.4s, v9.8h, v10.8h\n"
+ "tbz x2, #2, 57f\n"
+ "ld1 { v20.s }[0], [x20], #0x4\n"
+ "tbz x2, #1, 56f\n"
+ "ld1 { v20.h }[2], [x20], #0x2\n"
+ "tbz x2, #0, 59f\n"
+ "ld1 { v20.b }[6], [x20]\n"
+ "b 59f\n"
+ "56:" // Oddments: Load (3, 3): Bit 2: Bit 1: Unset
+ "tbz x2, #0, 59f\n"
+ "ld1 { v20.b }[4], [x20]\n"
+ "b 59f\n"
+ "57:" // Oddments: Load (3, 3): Bit 2: Unset
+ "tbz x2, #1, 58f\n"
+ "ld1 { v20.h }[0], [x20], #0x2\n"
+ "tbz x2, #0, 59f\n"
+ "ld1 { v20.b }[2], [x20]\n"
+ "b 59f\n"
+ "58:" // Oddments: Load (3, 3): Bit 2: Unset: Bit 1: Unset
+ "tbz x2, #0, 59f\n"
+ "ld1 { v20.b }[0], [x20]\n"
+ "59:" // Oddments: Load (3, 3): Bit 2: End
+ "ldr d28, [x7, #0x68]\n"
+ "ushll v20.8h, v20.8b, #0x0\n"
+ "usubl v28.8h, v28.8b, v2.8b\n"
+ "ldr x20, [x6, #0xb0]\n"
+ "smlal v8.4s, v20.4h, v10.4h\n"
+ "smlal2 v17.4s, v20.8h, v10.8h\n"
+ "add x20, x20, x4\n"
+ "smlal v13.4s, v3.4h, v28.4h\n"
+ "smlal2 v24.4s, v3.8h, v28.8h\n"
+ "smlal v7.4s, v6.4h, v28.4h\n"
+ "smlal2 v14.4s, v6.8h, v28.8h\n"
+ "smlal v27.4s, v20.4h, v28.4h\n"
+ "smlal2 v22.4s, v20.8h, v28.8h\n"
+ "tbz x2, #2, 61f\n"
+ "ld1 { v5.s }[0], [x20], #0x4\n"
+ "tbz x2, #1, 60f\n"
+ "ld1 { v5.h }[2], [x20], #0x2\n"
+ "tbz x2, #0, 63f\n"
+ "ld1 { v5.b }[6], [x20]\n"
+ "b 63f\n"
+ "60:" // Oddments: Load (3, 4): Bit 2: Bit 1: Unset
+ "tbz x2, #0, 63f\n"
+ "ld1 { v5.b }[4], [x20]\n"
+ "b 63f\n"
+ "61:" // Oddments: Load (3, 4): Bit 2: Unset
+ "tbz x2, #1, 62f\n"
+ "ld1 { v5.h }[0], [x20], #0x2\n"
+ "tbz x2, #0, 63f\n"
+ "ld1 { v5.b }[2], [x20]\n"
+ "b 63f\n"
+ "62:" // Oddments: Load (3, 4): Bit 2: Unset: Bit 1: Unset
+ "tbz x2, #0, 63f\n"
+ "ld1 { v5.b }[0], [x20]\n"
+ "63:" // Oddments: Load (3, 4): Bit 2: End
+ "ldr d23, [x7, #0x70]\n"
+ "ushll v5.8h, v5.8b, #0x0\n"
+ "usubl v23.8h, v23.8b, v2.8b\n"
+ "ldr x20, [x6, #0xb8]\n"
+ "smlal v8.4s, v5.4h, v28.4h\n"
+ "smlal2 v17.4s, v5.8h, v28.8h\n"
+ "add x20, x20, x4\n"
+ "smlal v13.4s, v6.4h, v23.4h\n"
+ "smlal2 v24.4s, v6.8h, v23.8h\n"
+ "smlal v7.4s, v18.4h, v23.4h\n"
+ "smlal2 v14.4s, v18.8h, v23.8h\n"
+ "smlal v27.4s, v5.4h, v23.4h\n"
+ "smlal2 v22.4s, v5.8h, v23.8h\n"
+ "tbz x2, #2, 65f\n"
+ "ld1 { v29.s }[0], [x20], #0x4\n"
+ "tbz x2, #1, 64f\n"
+ "ld1 { v29.h }[2], [x20], #0x2\n"
+ "tbz x2, #0, 67f\n"
+ "ld1 { v29.b }[6], [x20]\n"
+ "b 67f\n"
+ "64:" // Oddments: Load (3, 5): Bit 2: Bit 1: Unset
+ "tbz x2, #0, 67f\n"
+ "ld1 { v29.b }[4], [x20]\n"
+ "b 67f\n"
+ "65:" // Oddments: Load (3, 5): Bit 2: Unset
+ "tbz x2, #1, 66f\n"
+ "ld1 { v29.h }[0], [x20], #0x2\n"
+ "tbz x2, #0, 67f\n"
+ "ld1 { v29.b }[2], [x20]\n"
+ "b 67f\n"
+ "66:" // Oddments: Load (3, 5): Bit 2: Unset: Bit 1: Unset
+ "tbz x2, #0, 67f\n"
+ "ld1 { v29.b }[0], [x20]\n"
+ "67:" // Oddments: Load (3, 5): Bit 2: End
+ "ldr d4, [x7, #0x78]\n"
+ "ushll v29.8h, v29.8b, #0x0\n"
+ "usubl v4.8h, v4.8b, v2.8b\n"
+ "ldr x20, [x6, #0xc0]\n"
+ "smlal v8.4s, v29.4h, v23.4h\n"
+ "smlal2 v17.4s, v29.8h, v23.8h\n"
+ "add x20, x20, x4\n"
+ "smlal v13.4s, v30.4h, v4.4h\n"
+ "smlal2 v24.4s, v30.8h, v4.8h\n"
+ "smlal v7.4s, v19.4h, v4.4h\n"
+ "smlal2 v14.4s, v19.8h, v4.8h\n"
+ "tbz x2, #2, 69f\n"
+ "ld1 { v18.s }[0], [x20], #0x4\n"
+ "tbz x2, #1, 68f\n"
+ "ld1 { v18.h }[2], [x20], #0x2\n"
+ "tbz x2, #0, 71f\n"
+ "ld1 { v18.b }[6], [x20]\n"
+ "b 71f\n"
+ "68:" // Oddments: Load (4, 0): Bit 2: Bit 1: Unset
+ "tbz x2, #0, 71f\n"
+ "ld1 { v18.b }[4], [x20]\n"
+ "b 71f\n"
+ "69:" // Oddments: Load (4, 0): Bit 2: Unset
+ "tbz x2, #1, 70f\n"
+ "ld1 { v18.h }[0], [x20], #0x2\n"
+ "tbz x2, #0, 71f\n"
+ "ld1 { v18.b }[2], [x20]\n"
+ "b 71f\n"
+ "70:" // Oddments: Load (4, 0): Bit 2: Unset: Bit 1: Unset
+ "tbz x2, #0, 71f\n"
+ "ld1 { v18.b }[0], [x20]\n"
+ "71:" // Oddments: Load (4, 0): Bit 2: End
+ "ushll v18.8h, v18.8b, #0x0\n"
+ "ldr x20, [x6, #0xc8]\n"
+ "smlal v27.4s, v18.4h, v4.4h\n"
+ "smlal2 v22.4s, v18.8h, v4.8h\n"
+ "add x20, x20, x4\n"
+ "tbz x2, #2, 73f\n"
+ "ld1 { v1.s }[0], [x20], #0x4\n"
+ "tbz x2, #1, 72f\n"
+ "ld1 { v1.h }[2], [x20], #0x2\n"
+ "tbz x2, #0, 75f\n"
+ "ld1 { v1.b }[6], [x20]\n"
+ "b 75f\n"
+ "72:" // Oddments: Load (4, 1): Bit 2: Bit 1: Unset
+ "tbz x2, #0, 75f\n"
+ "ld1 { v1.b }[4], [x20]\n"
+ "b 75f\n"
+ "73:" // Oddments: Load (4, 1): Bit 2: Unset
+ "tbz x2, #1, 74f\n"
+ "ld1 { v1.h }[0], [x20], #0x2\n"
+ "tbz x2, #0, 75f\n"
+ "ld1 { v1.b }[2], [x20]\n"
+ "b 75f\n"
+ "74:" // Oddments: Load (4, 1): Bit 2: Unset: Bit 1: Unset
+ "tbz x2, #0, 75f\n"
+ "ld1 { v1.b }[0], [x20]\n"
+ "75:" // Oddments: Load (4, 1): Bit 2: End
+ "ldr d23, [x7, #0x80]\n"
+ "ushll v1.8h, v1.8b, #0x0\n"
+ "usubl v23.8h, v23.8b, v2.8b\n"
+ "ldr x20, [x6, #0xd0]\n"
+ "smlal v8.4s, v1.4h, v4.4h\n"
+ "smlal2 v17.4s, v1.8h, v4.8h\n"
+ "add x20, x20, x4\n"
+ "smlal v13.4s, v19.4h, v23.4h\n"
+ "smlal2 v24.4s, v19.8h, v23.8h\n"
+ "smlal v7.4s, v9.4h, v23.4h\n"
+ "smlal2 v14.4s, v9.8h, v23.8h\n"
+ "smlal v27.4s, v1.4h, v23.4h\n"
+ "smlal2 v22.4s, v1.8h, v23.8h\n"
+ "tbz x2, #2, 77f\n"
+ "ld1 { v4.s }[0], [x20], #0x4\n"
+ "tbz x2, #1, 76f\n"
+ "ld1 { v4.h }[2], [x20], #0x2\n"
+ "tbz x2, #0, 79f\n"
+ "ld1 { v4.b }[6], [x20]\n"
+ "b 79f\n"
+ "76:" // Oddments: Load (4, 2): Bit 2: Bit 1: Unset
+ "tbz x2, #0, 79f\n"
+ "ld1 { v4.b }[4], [x20]\n"
+ "b 79f\n"
+ "77:" // Oddments: Load (4, 2): Bit 2: Unset
+ "tbz x2, #1, 78f\n"
+ "ld1 { v4.h }[0], [x20], #0x2\n"
+ "tbz x2, #0, 79f\n"
+ "ld1 { v4.b }[2], [x20]\n"
+ "b 79f\n"
+ "78:" // Oddments: Load (4, 2): Bit 2: Unset: Bit 1: Unset
+ "tbz x2, #0, 79f\n"
+ "ld1 { v4.b }[0], [x20]\n"
+ "79:" // Oddments: Load (4, 2): Bit 2: End
+ "ldr d30, [x7, #0x88]\n"
+ "ushll v4.8h, v4.8b, #0x0\n"
+ "usubl v30.8h, v30.8b, v2.8b\n"
+ "ldr x20, [x6, #0xd8]\n"
+ "smlal v8.4s, v4.4h, v23.4h\n"
+ "smlal2 v17.4s, v4.8h, v23.8h\n"
+ "add x20, x20, x4\n"
+ "smlal v13.4s, v9.4h, v30.4h\n"
+ "smlal2 v24.4s, v9.8h, v30.8h\n"
+ "smlal v7.4s, v20.4h, v30.4h\n"
+ "smlal2 v14.4s, v20.8h, v30.8h\n"
+ "smlal v27.4s, v4.4h, v30.4h\n"
+ "smlal2 v22.4s, v4.8h, v30.8h\n"
+ "tbz x2, #2, 81f\n"
+ "ld1 { v21.s }[0], [x20], #0x4\n"
+ "tbz x2, #1, 80f\n"
+ "ld1 { v21.h }[2], [x20], #0x2\n"
+ "tbz x2, #0, 83f\n"
+ "ld1 { v21.b }[6], [x20]\n"
+ "b 83f\n"
+ "80:" // Oddments: Load (4, 3): Bit 2: Bit 1: Unset
+ "tbz x2, #0, 83f\n"
+ "ld1 { v21.b }[4], [x20]\n"
+ "b 83f\n"
+ "81:" // Oddments: Load (4, 3): Bit 2: Unset
+ "tbz x2, #1, 82f\n"
+ "ld1 { v21.h }[0], [x20], #0x2\n"
+ "tbz x2, #0, 83f\n"
+ "ld1 { v21.b }[2], [x20]\n"
+ "b 83f\n"
+ "82:" // Oddments: Load (4, 3): Bit 2: Unset: Bit 1: Unset
+ "tbz x2, #0, 83f\n"
+ "ld1 { v21.b }[0], [x20]\n"
+ "83:" // Oddments: Load (4, 3): Bit 2: End
+ "ldr d3, [x7, #0x90]\n"
+ "ushll v21.8h, v21.8b, #0x0\n"
+ "usubl v3.8h, v3.8b, v2.8b\n"
+ "ldr x20, [x6, #0xe0]\n"
+ "smlal v8.4s, v21.4h, v30.4h\n"
+ "smlal2 v17.4s, v21.8h, v30.8h\n"
+ "add x20, x20, x4\n"
+ "smlal v13.4s, v20.4h, v3.4h\n"
+ "smlal2 v24.4s, v20.8h, v3.8h\n"
+ "smlal v7.4s, v5.4h, v3.4h\n"
+ "smlal2 v14.4s, v5.8h, v3.8h\n"
+ "smlal v27.4s, v21.4h, v3.4h\n"
+ "smlal2 v22.4s, v21.8h, v3.8h\n"
+ "tbz x2, #2, 85f\n"
+ "ld1 { v30.s }[0], [x20], #0x4\n"
+ "tbz x2, #1, 84f\n"
+ "ld1 { v30.h }[2], [x20], #0x2\n"
+ "tbz x2, #0, 87f\n"
+ "ld1 { v30.b }[6], [x20]\n"
+ "b 87f\n"
+ "84:" // Oddments: Load (4, 4): Bit 2: Bit 1: Unset
+ "tbz x2, #0, 87f\n"
+ "ld1 { v30.b }[4], [x20]\n"
+ "b 87f\n"
+ "85:" // Oddments: Load (4, 4): Bit 2: Unset
+ "tbz x2, #1, 86f\n"
+ "ld1 { v30.h }[0], [x20], #0x2\n"
+ "tbz x2, #0, 87f\n"
+ "ld1 { v30.b }[2], [x20]\n"
+ "b 87f\n"
+ "86:" // Oddments: Load (4, 4): Bit 2: Unset: Bit 1: Unset
+ "tbz x2, #0, 87f\n"
+ "ld1 { v30.b }[0], [x20]\n"
+ "87:" // Oddments: Load (4, 4): Bit 2: End
+ "ldr d19, [x7, #0x98]\n"
+ "ushll v30.8h, v30.8b, #0x0\n"
+ "usubl v19.8h, v19.8b, v2.8b\n"
+ "ldr x20, [x6, #0xe8]\n"
+ "smlal v8.4s, v30.4h, v3.4h\n"
+ "smlal2 v17.4s, v30.8h, v3.8h\n"
+ "add x20, x20, x4\n"
+ "smlal v13.4s, v5.4h, v19.4h\n"
+ "smlal2 v24.4s, v5.8h, v19.8h\n"
+ "smlal v7.4s, v29.4h, v19.4h\n"
+ "smlal2 v14.4s, v29.8h, v19.8h\n"
+ "smlal v27.4s, v30.4h, v19.4h\n"
+ "smlal2 v22.4s, v30.8h, v19.8h\n"
+ "tbz x2, #2, 89f\n"
+ "ld1 { v20.s }[0], [x20], #0x4\n"
+ "tbz x2, #1, 88f\n"
+ "ld1 { v20.h }[2], [x20], #0x2\n"
+ "tbz x2, #0, 91f\n"
+ "ld1 { v20.b }[6], [x20]\n"
+ "b 91f\n"
+ "88:" // Oddments: Load (4, 5): Bit 2: Bit 1: Unset
+ "tbz x2, #0, 91f\n"
+ "ld1 { v20.b }[4], [x20]\n"
+ "b 91f\n"
+ "89:" // Oddments: Load (4, 5): Bit 2: Unset
+ "tbz x2, #1, 90f\n"
+ "ld1 { v20.h }[0], [x20], #0x2\n"
+ "tbz x2, #0, 91f\n"
+ "ld1 { v20.b }[2], [x20]\n"
+ "b 91f\n"
+ "90:" // Oddments: Load (4, 5): Bit 2: Unset: Bit 1: Unset
+ "tbz x2, #0, 91f\n"
+ "ld1 { v20.b }[0], [x20]\n"
+ "91:" // Oddments: Load (4, 5): Bit 2: End
+ "ldr d23, [x7, #0xa0]\n"
+ "ushll v20.8h, v20.8b, #0x0\n"
+ "usubl v23.8h, v23.8b, v2.8b\n"
+ "ldr x20, [x6, #0xf0]\n"
+ "smlal v8.4s, v20.4h, v19.4h\n"
+ "smlal2 v17.4s, v20.8h, v19.8h\n"
+ "add x20, x20, x4\n"
+ "smlal v13.4s, v18.4h, v23.4h\n"
+ "smlal2 v24.4s, v18.8h, v23.8h\n"
+ "smlal v7.4s, v1.4h, v23.4h\n"
+ "smlal2 v14.4s, v1.8h, v23.8h\n"
+ "tbz x2, #2, 93f\n"
+ "ld1 { v10.s }[0], [x20], #0x4\n"
+ "tbz x2, #1, 92f\n"
+ "ld1 { v10.h }[2], [x20], #0x2\n"
+ "tbz x2, #0, 95f\n"
+ "ld1 { v10.b }[6], [x20]\n"
+ "b 95f\n"
+ "92:" // Oddments: Load (5, 0): Bit 2: Bit 1: Unset
+ "tbz x2, #0, 95f\n"
+ "ld1 { v10.b }[4], [x20]\n"
+ "b 95f\n"
+ "93:" // Oddments: Load (5, 0): Bit 2: Unset
+ "tbz x2, #1, 94f\n"
+ "ld1 { v10.h }[0], [x20], #0x2\n"
+ "tbz x2, #0, 95f\n"
+ "ld1 { v10.b }[2], [x20]\n"
+ "b 95f\n"
+ "94:" // Oddments: Load (5, 0): Bit 2: Unset: Bit 1: Unset
+ "tbz x2, #0, 95f\n"
+ "ld1 { v10.b }[0], [x20]\n"
+ "95:" // Oddments: Load (5, 0): Bit 2: End
+ "ushll v10.8h, v10.8b, #0x0\n"
+ "ldr x20, [x6, #0xf8]\n"
+ "smlal v27.4s, v10.4h, v23.4h\n"
+ "smlal2 v22.4s, v10.8h, v23.8h\n"
+ "add x20, x20, x4\n"
+ "tbz x2, #2, 97f\n"
+ "ld1 { v18.s }[0], [x20], #0x4\n"
+ "tbz x2, #1, 96f\n"
+ "ld1 { v18.h }[2], [x20], #0x2\n"
+ "tbz x2, #0, 99f\n"
+ "ld1 { v18.b }[6], [x20]\n"
+ "b 99f\n"
+ "96:" // Oddments: Load (5, 1): Bit 2: Bit 1: Unset
+ "tbz x2, #0, 99f\n"
+ "ld1 { v18.b }[4], [x20]\n"
+ "b 99f\n"
+ "97:" // Oddments: Load (5, 1): Bit 2: Unset
+ "tbz x2, #1, 98f\n"
+ "ld1 { v18.h }[0], [x20], #0x2\n"
+ "tbz x2, #0, 99f\n"
+ "ld1 { v18.b }[2], [x20]\n"
+ "b 99f\n"
+ "98:" // Oddments: Load (5, 1): Bit 2: Unset: Bit 1: Unset
+ "tbz x2, #0, 99f\n"
+ "ld1 { v18.b }[0], [x20]\n"
+ "99:" // Oddments: Load (5, 1): Bit 2: End
+ "ldr d5, [x7, #0xa8]\n"
+ "ushll v18.8h, v18.8b, #0x0\n"
+ "usubl v5.8h, v5.8b, v2.8b\n"
+ "ldr x20, [x6, #0x100]\n"
+ "smlal v8.4s, v18.4h, v23.4h\n"
+ "smlal2 v17.4s, v18.8h, v23.8h\n"
+ "add x20, x20, x4\n"
+ "smlal v13.4s, v1.4h, v5.4h\n"
+ "smlal2 v24.4s, v1.8h, v5.8h\n"
+ "smlal v7.4s, v4.4h, v5.4h\n"
+ "smlal2 v14.4s, v4.8h, v5.8h\n"
+ "smlal v27.4s, v18.4h, v5.4h\n"
+ "smlal2 v22.4s, v18.8h, v5.8h\n"
+ "tbz x2, #2, 101f\n"
+ "ld1 { v9.s }[0], [x20], #0x4\n"
+ "tbz x2, #1, 100f\n"
+ "ld1 { v9.h }[2], [x20], #0x2\n"
+ "tbz x2, #0, 103f\n"
+ "ld1 { v9.b }[6], [x20]\n"
+ "b 103f\n"
+ "100:" // Oddments: Load (5, 2): Bit 2: Bit 1: Unset
+ "tbz x2, #0, 103f\n"
+ "ld1 { v9.b }[4], [x20]\n"
+ "b 103f\n"
+ "101:" // Oddments: Load (5, 2): Bit 2: Unset
+ "tbz x2, #1, 102f\n"
+ "ld1 { v9.h }[0], [x20], #0x2\n"
+ "tbz x2, #0, 103f\n"
+ "ld1 { v9.b }[2], [x20]\n"
+ "b 103f\n"
+ "102:" // Oddments: Load (5, 2): Bit 2: Unset: Bit 1: Unset
+ "tbz x2, #0, 103f\n"
+ "ld1 { v9.b }[0], [x20]\n"
+ "103:" // Oddments: Load (5, 2): Bit 2: End
+ "ldr d18, [x7, #0xb0]\n"
+ "ushll v9.8h, v9.8b, #0x0\n"
+ "usubl v18.8h, v18.8b, v2.8b\n"
+ "ldr x20, [x6, #0x108]\n"
+ "smlal v8.4s, v9.4h, v5.4h\n"
+ "smlal2 v17.4s, v9.8h, v5.8h\n"
+ "add x20, x20, x4\n"
+ "smlal v13.4s, v4.4h, v18.4h\n"
+ "smlal2 v24.4s, v4.8h, v18.8h\n"
+ "smlal v7.4s, v21.4h, v18.4h\n"
+ "smlal2 v14.4s, v21.8h, v18.8h\n"
+ "smlal v27.4s, v9.4h, v18.4h\n"
+ "smlal2 v22.4s, v9.8h, v18.8h\n"
+ "tbz x2, #2, 105f\n"
+ "ld1 { v5.s }[0], [x20], #0x4\n"
+ "tbz x2, #1, 104f\n"
+ "ld1 { v5.h }[2], [x20], #0x2\n"
+ "tbz x2, #0, 107f\n"
+ "ld1 { v5.b }[6], [x20]\n"
+ "b 107f\n"
+ "104:" // Oddments: Load (5, 3): Bit 2: Bit 1: Unset
+ "tbz x2, #0, 107f\n"
+ "ld1 { v5.b }[4], [x20]\n"
+ "b 107f\n"
+ "105:" // Oddments: Load (5, 3): Bit 2: Unset
+ "tbz x2, #1, 106f\n"
+ "ld1 { v5.h }[0], [x20], #0x2\n"
+ "tbz x2, #0, 107f\n"
+ "ld1 { v5.b }[2], [x20]\n"
+ "b 107f\n"
+ "106:" // Oddments: Load (5, 3): Bit 2: Unset: Bit 1: Unset
+ "tbz x2, #0, 107f\n"
+ "ld1 { v5.b }[0], [x20]\n"
+ "107:" // Oddments: Load (5, 3): Bit 2: End
+ "ldr d11, [x7, #0xb8]\n"
+ "ushll v5.8h, v5.8b, #0x0\n"
+ "usubl v11.8h, v11.8b, v2.8b\n"
+ "ldr x20, [x6, #0x110]\n"
+ "smlal v8.4s, v5.4h, v18.4h\n"
+ "smlal2 v17.4s, v5.8h, v18.8h\n"
+ "add x20, x20, x4\n"
+ "smlal v13.4s, v21.4h, v11.4h\n"
+ "smlal2 v24.4s, v21.8h, v11.8h\n"
+ "smlal v7.4s, v30.4h, v11.4h\n"
+ "smlal2 v14.4s, v30.8h, v11.8h\n"
+ "smlal v27.4s, v5.4h, v11.4h\n"
+ "smlal2 v22.4s, v5.8h, v11.8h\n"
+ "tbz x2, #2, 109f\n"
+ "ld1 { v18.s }[0], [x20], #0x4\n"
+ "tbz x2, #1, 108f\n"
+ "ld1 { v18.h }[2], [x20], #0x2\n"
+ "tbz x2, #0, 111f\n"
+ "ld1 { v18.b }[6], [x20]\n"
+ "b 111f\n"
+ "108:" // Oddments: Load (5, 4): Bit 2: Bit 1: Unset
+ "tbz x2, #0, 111f\n"
+ "ld1 { v18.b }[4], [x20]\n"
+ "b 111f\n"
+ "109:" // Oddments: Load (5, 4): Bit 2: Unset
+ "tbz x2, #1, 110f\n"
+ "ld1 { v18.h }[0], [x20], #0x2\n"
+ "tbz x2, #0, 111f\n"
+ "ld1 { v18.b }[2], [x20]\n"
+ "b 111f\n"
+ "110:" // Oddments: Load (5, 4): Bit 2: Unset: Bit 1: Unset
+ "tbz x2, #0, 111f\n"
+ "ld1 { v18.b }[0], [x20]\n"
+ "111:" // Oddments: Load (5, 4): Bit 2: End
+ "ldr d16, [x7, #0xc0]\n"
+ "ushll v18.8h, v18.8b, #0x0\n"
+ "usubl v16.8h, v16.8b, v2.8b\n"
+ "ldr x20, [x6, #0x118]\n"
+ "smlal v8.4s, v18.4h, v11.4h\n"
+ "smlal2 v17.4s, v18.8h, v11.8h\n"
+ "add x20, x20, x4\n"
+ "smlal v13.4s, v30.4h, v16.4h\n"
+ "smlal2 v24.4s, v30.8h, v16.8h\n"
+ "smlal v7.4s, v20.4h, v16.4h\n"
+ "smlal2 v14.4s, v20.8h, v16.8h\n"
+ "smlal v27.4s, v18.4h, v16.4h\n"
+ "smlal2 v22.4s, v18.8h, v16.8h\n"
+ "tbz x2, #2, 113f\n"
+ "ld1 { v21.s }[0], [x20], #0x4\n"
+ "tbz x2, #1, 112f\n"
+ "ld1 { v21.h }[2], [x20], #0x2\n"
+ "tbz x2, #0, 115f\n"
+ "ld1 { v21.b }[6], [x20]\n"
+ "b 115f\n"
+ "112:" // Oddments: Load (5, 5): Bit 2: Bit 1: Unset
+ "tbz x2, #0, 115f\n"
+ "ld1 { v21.b }[4], [x20]\n"
+ "b 115f\n"
+ "113:" // Oddments: Load (5, 5): Bit 2: Unset
+ "tbz x2, #1, 114f\n"
+ "ld1 { v21.h }[0], [x20], #0x2\n"
+ "tbz x2, #0, 115f\n"
+ "ld1 { v21.b }[2], [x20]\n"
+ "b 115f\n"
+ "114:" // Oddments: Load (5, 5): Bit 2: Unset: Bit 1: Unset
+ "tbz x2, #0, 115f\n"
+ "ld1 { v21.b }[0], [x20]\n"
+ "115:" // Oddments: Load (5, 5): Bit 2: End
+ "ushll v21.8h, v21.8b, #0x0\n"
+ "smlal v8.4s, v21.4h, v16.4h\n"
+ "smlal2 v17.4s, v21.8h, v16.8h\n"
+ "tbz x2, #2, 117f\n"
+ "ld1 { v16.4s }, [x8], #0x10\n"
+ "ld1 { v21.4s }, [x17], #0x10\n"
+ "tbz x2, #1, 116f\n"
+ "ld1 { v18.d }[0], [x8], #0x8\n"
+ "ld1 { v0.d }[0], [x17], #0x8\n"
+ "tbz x2, #0, 119f\n"
+ "ld1 { v18.s }[2], [x8]\n"
+ "ld1 { v0.s }[2], [x17]\n"
+ "b 119f\n"
+ "116:" // Oddments: Load requant params: Bit 2: Bit 1: Unset
+ "tbz x2, #0, 119f\n"
+ "ld1 { v18.s }[0], [x8]\n"
+ "ld1 { v0.s }[0], [x17]\n"
+ "b 119f\n"
+ "117:" // Oddments: Load requant params: Bit 2: Unset
+ "tbz x2, #1, 118f\n"
+ "ld1 { v16.d }[0], [x8], #0x8\n"
+ "ld1 { v21.d }[0], [x17], #0x8\n"
+ "tbz x2, #0, 119f\n"
+ "ld1 { v16.s }[2], [x8]\n"
+ "ld1 { v21.s }[2], [x17]\n"
+ "b 119f\n"
+ "118:" // Oddments: Load requant params: Bit 2: Unset: Bit 1: Unset
+ "tbz x2, #0, 119f\n"
+ "ld1 { v16.s }[0], [x8]\n"
+ "ld1 { v21.s }[0], [x17]\n"
+ "119:" // Oddments: Load requant params: Bit 2: End
+ "sqrdmulh v13.4s, v13.4s, v16.4s\n"
+ "and v5.16b, v13.16b, v21.16b\n"
+ "add x16, x16, x5\n"
+ "add x15, x15, x5\n"
+ "sqrdmulh v24.4s, v24.4s, v18.4s\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "add x14, x14, x5\n"
+ "add x13, x13, x5\n"
+ "and v2.16b, v24.16b, v0.16b\n"
+ "sqrdmulh v7.4s, v7.4s, v16.4s\n"
+ "sqrdmulh v27.4s, v27.4s, v16.4s\n"
+ "sqrdmulh v8.4s, v8.4s, v16.4s\n"
+ "sqadd v13.4s, v13.4s, v5.4s\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "and v23.16b, v7.16b, v21.16b\n"
+ "sqrdmulh v14.4s, v14.4s, v18.4s\n"
+ "and v20.16b, v27.16b, v21.16b\n"
+ "sqrdmulh v22.4s, v22.4s, v18.4s\n"
+ "and v31.16b, v8.16b, v21.16b\n"
+ "sqrdmulh v17.4s, v17.4s, v18.4s\n"
+ "sqadd v24.4s, v24.4s, v2.4s\n"
+ "sshr v23.4s, v23.4s, #0x1f\n"
+ "and v18.16b, v14.16b, v0.16b\n"
+ "sshr v20.4s, v20.4s, #0x1f\n"
+ "and v11.16b, v22.16b, v0.16b\n"
+ "sshr v31.4s, v31.4s, #0x1f\n"
+ "and v10.16b, v17.16b, v0.16b\n"
+ "sqadd v7.4s, v7.4s, v23.4s\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sqadd v27.4s, v27.4s, v20.4s\n"
+ "sshr v11.4s, v11.4s, #0x1f\n"
+ "sqadd v8.4s, v8.4s, v31.4s\n"
+ "sshr v10.4s, v10.4s, #0x1f\n"
+ "srshl v13.4s, v13.4s, v21.4s\n"
+ "srshl v7.4s, v7.4s, v21.4s\n"
+ "sqadd v14.4s, v14.4s, v18.4s\n"
+ "srshl v27.4s, v27.4s, v21.4s\n"
+ "sqadd v22.4s, v22.4s, v11.4s\n"
+ "srshl v8.4s, v8.4s, v21.4s\n"
+ "sqadd v17.4s, v17.4s, v10.4s\n"
+ "srshl v24.4s, v24.4s, v0.4s\n"
+ "sqxtn v13.4h, v13.4s\n"
+ "srshl v14.4s, v14.4s, v0.4s\n"
+ "sqxtn v7.4h, v7.4s\n"
+ "srshl v22.4s, v22.4s, v0.4s\n"
+ "sqxtn v27.4h, v27.4s\n"
+ "srshl v17.4s, v17.4s, v0.4s\n"
+ "sqxtn v8.4h, v8.4s\n"
+ "sqxtn2 v13.8h, v24.4s\n"
+ "sqxtn2 v7.8h, v14.4s\n"
+ "sqxtn2 v27.8h, v22.4s\n"
+ "sqxtn2 v8.8h, v17.4s\n"
+ "sqadd v13.8h, v13.8h, v25.8h\n"
+ "sqadd v7.8h, v7.8h, v25.8h\n"
+ "sqadd v27.8h, v27.8h, v25.8h\n"
+ "sqadd v8.8h, v8.8h, v25.8h\n"
+ "smax v13.8h, v13.8h, v12.8h\n"
+ "smax v7.8h, v7.8h, v12.8h\n"
+ "smax v27.8h, v27.8h, v12.8h\n"
+ "smax v8.8h, v8.8h, v12.8h\n"
+ "smin v13.8h, v13.8h, v26.8h\n"
+ "smin v7.8h, v7.8h, v26.8h\n"
+ "smin v27.8h, v27.8h, v26.8h\n"
+ "smin v8.8h, v8.8h, v26.8h\n"
+ "uzp1 v13.16b, v13.16b, v13.16b\n"
+ "uzp1 v7.16b, v7.16b, v7.16b\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "uzp1 v8.16b, v8.16b, v8.16b\n"
+ "tbz x2, #2, 121f\n"
+ "st1 { v13.s }[0], [x16], #0x4\n"
+ "st1 { v7.s }[0], [x15], #0x4\n"
+ "st1 { v27.s }[0], [x14], #0x4\n"
+ "st1 { v8.s }[0], [x13], #0x4\n"
+ "tbz x2, #1, 120f\n"
+ "st1 { v13.h }[2], [x16], #0x2\n"
+ "st1 { v7.h }[2], [x15], #0x2\n"
+ "st1 { v27.h }[2], [x14], #0x2\n"
+ "st1 { v8.h }[2], [x13], #0x2\n"
+ "tbz x2, #0, 123f\n"
+ "st1 { v13.b }[6], [x16], #0x1\n"
+ "st1 { v7.b }[6], [x15], #0x1\n"
+ "st1 { v27.b }[6], [x14], #0x1\n"
+ "st1 { v8.b }[6], [x13], #0x1\n"
+ "b 123f\n"
+ "120:" // Oddments: Bit 2: Bit 1: Unset
+ "tbz x2, #0, 123f\n"
+ "st1 { v13.b }[4], [x16], #0x1\n"
+ "st1 { v7.b }[4], [x15], #0x1\n"
+ "st1 { v27.b }[4], [x14], #0x1\n"
+ "st1 { v8.b }[4], [x13], #0x1\n"
+ "b 123f\n"
+ "121:" // Oddments: Bit 2: Unset
+ "tbz x2, #1, 122f\n"
+ "st1 { v13.h }[0], [x16], #0x2\n"
+ "st1 { v7.h }[0], [x15], #0x2\n"
+ "st1 { v27.h }[0], [x14], #0x2\n"
+ "st1 { v8.h }[0], [x13], #0x2\n"
+ "tbz x2, #0, 123f\n"
+ "st1 { v13.b }[2], [x16], #0x1\n"
+ "st1 { v7.b }[2], [x15], #0x1\n"
+ "st1 { v27.b }[2], [x14], #0x1\n"
+ "st1 { v8.b }[2], [x13], #0x1\n"
+ "b 123f\n"
+ "122:" // Oddments: Bit 2: Unset: Bit 1: Unset
+ "tbz x2, #0, 123f\n"
+ "st1 { v13.b }[0], [x16], #0x1\n"
+ "st1 { v7.b }[0], [x15], #0x1\n"
+ "st1 { v27.b }[0], [x14], #0x1\n"
+ "st1 { v8.b }[0], [x13], #0x1\n"
+ "123:" // Oddments: Bit 2: End
+ "124:" // End
+ :
+ : [offsetof_Params_bias] "I" (offsetof(Params, bias)), [offsetof_Params_inptrs] "I" (offsetof(Params, inptrs)), [offsetof_Params_n_channels] "I" (offsetof(Params, n_channels)), [offsetof_Params_outptrs] "I" (offsetof(Params, outptrs)), [offsetof_Params_requant] "I" (offsetof(Params, requant)), [offsetof_Params_requant_muls] "I" (offsetof(Params, requant_muls)), [offsetof_Params_requant_shifts] "I" (offsetof(Params, requant_shifts)), [offsetof_Params_weights] "I" (offsetof(Params, weights)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [params] "r" (&params)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
new file mode 100644
index 0000000000..1666c17ca0
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2021-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "utils.hpp"
+#include "src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+// Entry point of the hand-written AArch64 assembly kernel (defined in
+// a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp).
+// Arguments, in order: channel count, input-row pointer array, packed s8
+// weights, i32 bias, requantisation parameters (offsets / clamp limits),
+// requantisation multipliers, requantisation shifts, output-row pointers.
+void a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl(
+  const unsigned int,
+  const uint8_t *const *const,
+  const int8_t *const,
+  const int32_t *const,
+  const arm_gemm::Requantize32 &,
+  const int32_t *const,
+  const int32_t *const,
+  uint8_t *const *const
+);
+
+// Strategy descriptor for the depthwise kernel whose name encodes its shape:
+// u8 input x s8 weights -> u8 output with i32 accumulators (template
+// arguments below), NHWC layout, 3x3 kernel, stride 1, 2x2 output tile,
+// plain multiply-accumulate ("mla"), depth-first traversal.
+class a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy<uint8_t, int8_t, uint8_t, int32_t>
+{
+  using Parent = DepthwiseDepthfirstStrategy<uint8_t, int8_t, uint8_t, int32_t>;
+
+  public:
+  // Kernel footprint and strides, matching the "3x3_s1" in the class name.
+  constexpr static unsigned int kernel_rows = 3;
+  constexpr static unsigned int kernel_cols = 3;
+
+  constexpr static unsigned int stride_rows = 1;
+  constexpr static unsigned int stride_cols = 1;
+
+  // The CPUInfo argument is unused here. Parent is constructed with
+  // (2, 2, 3, 3, 1, 1) -- consistent with the 2x2 output tile, 3x3 kernel
+  // and unit strides above; presumably (output_rows, output_cols,
+  // kernel_rows, kernel_cols, stride_rows, stride_cols) -- confirm against
+  // DepthwiseDepthfirstStrategy's constructor.
+  a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst(const CPUInfo *) : Parent(2, 2, 3, 3, 1, 1) {}
+
+  // Fixed-width implementation: no scalable (SVE) vector length.
+  arm_gemm::VLType get_vl_type(void) const override { return arm_gemm::VLType::None; }
+
+  Parent::KernelType kernel = a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl;
+  Parent::KernelType get_kernel(void) const override { return kernel; }
+  // NOTE(review): reports an accumulator depth of 2 vector lengths --
+  // presumably consumed by the driver for working-space sizing; confirm.
+  unsigned int get_accumulator_depth_vl(void) const override { return 2; }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp
new file mode 100644
index 0000000000..f1c1b2315c
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp
@@ -0,0 +1,1166 @@
+/*
+ * Copyright (c) 2021-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_gemm.hpp"
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl(
+ const unsigned int n_channels,
+ const uint8_t *const *const inptrs,
+ const int8_t *const weights,
+ const int32_t *const bias,
+ const arm_gemm::Requantize32 &qp,
+ const int32_t *const requant_muls,
+ const int32_t *const requant_shifts,
+ uint8_t *const *const outptrs
+)
+{
+  // Scalar-argument block read by the inline assembly below via the
+  // offsetof_Params_* asm operands: packing every argument into one struct
+  // lets the asm address each field at a fixed offset from the single
+  // [params] base register.
+  struct Params
+  {
+    uint64_t n_channels;                    // number of channels to process
+    const void *weights;                    // packed s8 weights
+    const int32_t *bias;                    // per-channel i32 bias (accumulator init)
+    const arm_gemm::Requantize32 *requant;  // offsets and clamp limits for requantisation
+    const int32_t *const requant_muls;      // requantisation multipliers
+    const int32_t *const requant_shifts;    // requantisation shifts
+    uint8_t *const *const outptrs;          // output-row pointers
+    const uint8_t *inptrs[16];              // input pointers, permuted (see ctor)
+
+    Params(
+      long unsigned int n_channels,
+      const uint8_t *const *inptrs_raw,
+      const void *const weights,
+      const int32_t *const bias,
+      const arm_gemm::Requantize32 &qp,
+      const int32_t *const requant_muls,
+      const int32_t *const requant_shifts,
+      uint8_t *const *outptrs
+    ) : n_channels(n_channels), weights(weights), bias(bias),
+        requant(&qp), requant_muls(requant_muls),
+        requant_shifts(requant_shifts), outptrs(outptrs)
+    {
+      // Permute the caller's 16 input pointers into the order in which the
+      // assembly consumes them: the asm loads entries of this array at
+      // hard-coded offsets, so the mapping is baked in here rather than in
+      // the asm itself.
+      inptrs[0] = inptrs_raw[5];
+      inptrs[1] = inptrs_raw[0];
+      inptrs[2] = inptrs_raw[3];
+      inptrs[3] = inptrs_raw[6];
+      inptrs[4] = inptrs_raw[9];
+      inptrs[5] = inptrs_raw[12];
+      inptrs[6] = inptrs_raw[15];
+      inptrs[7] = inptrs_raw[1];
+      inptrs[8] = inptrs_raw[2];
+      inptrs[9] = inptrs_raw[10];
+      inptrs[10] = inptrs_raw[4];
+      inptrs[11] = inptrs_raw[7];
+      inptrs[12] = inptrs_raw[8];
+      inptrs[13] = inptrs_raw[11];
+      inptrs[14] = inptrs_raw[13];
+      inptrs[15] = inptrs_raw[14];
+
+    }
+  };
+
+ const Params params(n_channels, inptrs, weights, bias, qp,
+ requant_muls, requant_shifts, outptrs);
+
+ __asm__ __volatile__(
+ "ldr x7, [%x[params], %[offsetof_Params_n_channels]]\n"
+ "ldr x23, [%x[params], %[offsetof_Params_requant]]\n"
+ "lsr x8, x7, #0x3\n"
+ "add x20, x23, %[offsetof_Requantize32_a_offset]\n"
+ "ld1r { v14.16b }, [x20]\n"
+ "ldr x22, [%x[params], %[offsetof_Params_outptrs]]\n"
+ "add x21, x23, %[offsetof_Requantize32_b_offset]\n"
+ "add x20, x23, %[offsetof_Requantize32_c_offset]\n"
+ "ld1r { v19.16b }, [x21]\n"
+ "ld1r { v13.8h }, [x20]\n"
+ "add x21, x23, %[offsetof_Requantize32_minval]\n"
+ "add x20, x23, %[offsetof_Requantize32_maxval]\n"
+ "ld1r { v29.8h }, [x21]\n"
+ "ld1r { v12.8h }, [x20]\n"
+ "mov x17, #0x0\n"
+ "mov x16, #0x0\n"
+ "add x15, %x[params], %[offsetof_Params_inptrs]\n"
+ "ldr x14, [%x[params], %[offsetof_Params_weights]]\n"
+ "ldr x13, [%x[params], %[offsetof_Params_requant_muls]]\n"
+ "ldr x12, [%x[params], %[offsetof_Params_requant_shifts]]\n"
+ "ldp x11, x10, [x22, #0x0]\n"
+ "ldp x9, x28, [x22, #0x10]\n"
+ "cbz x8, 3f\n"
+ "ldr d23, [x14, #0x0]\n"
+ "ldr d16, [x14, #0x8]\n"
+ "subs x8, x8, #0x1\n"
+ "ssubl v23.8h, v23.8b, v19.8b\n"
+ "ldr d1, [x14, #0x10]\n"
+ "ldr d5, [x14, #0x18]\n"
+ "ssubl v16.8h, v16.8b, v19.8b\n"
+ "ssubl v1.8h, v1.8b, v19.8b\n"
+ "ldr d26, [x14, #0x20]\n"
+ "ldr d18, [x14, #0x28]\n"
+ "ssubl v5.8h, v5.8b, v19.8b\n"
+ "ssubl v26.8h, v26.8b, v19.8b\n"
+ "ldr d31, [x14, #0x30]\n"
+ "ldr d25, [x14, #0x38]\n"
+ "ssubl v18.8h, v18.8b, v19.8b\n"
+ "ssubl v31.8h, v31.8b, v19.8b\n"
+ "ldr d20, [x14, #0x40]\n"
+ "ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "ssubl v25.8h, v25.8b, v19.8b\n"
+ "ssubl v20.8h, v20.8b, v19.8b\n"
+ "ldr q9, [x20, #0x0]\n"
+ "ldr q24, [x20, #0x10]\n"
+ "add x20, x20, #0x20\n"
+ "str x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldp x23, x22, [x15, #0x0]\n"
+ "ldp x21, x20, [x15, #0x10]\n"
+ "mov v7.16b, v9.16b\n"
+ "mov v0.16b, v24.16b\n"
+ "ldr d22, [x23, x17]\n"
+ "ldr d4, [x22, x17]\n"
+ "mov v2.16b, v9.16b\n"
+ "mov v30.16b, v24.16b\n"
+ "ldr d8, [x21, x17]\n"
+ "ldr d27, [x20, x17]\n"
+ "mov v10.16b, v9.16b\n"
+ "mov v6.16b, v24.16b\n"
+ "ldr x20, [x15, #0x20]\n"
+ "ldr d15, [x20, x17]\n"
+ "usubl v22.8h, v22.8b, v14.8b\n"
+ "usubl v4.8h, v4.8b, v14.8b\n"
+ "usubl v8.8h, v8.8b, v14.8b\n"
+ "usubl v27.8h, v27.8b, v14.8b\n"
+ "usubl v15.8h, v15.8b, v14.8b\n"
+ "beq 2f\n"
+ "1:" // Loop
+ "ldr q3, [x13, #0x0]\n"
+ "ldr q17, [x12, #0x0]\n"
+ "smlal v9.4s, v22.4h, v26.4h\n"
+ "smlal2 v24.4s, v22.8h, v26.8h\n"
+ "ldr q21, [x13, #0x10]\n"
+ "ldr q28, [x12, #0x10]\n"
+ "smlal v9.4s, v4.4h, v23.4h\n"
+ "smlal v7.4s, v22.4h, v5.4h\n"
+ "ldr x20, [x15, #0x28]\n"
+ "ldr d11, [x20, x17]\n"
+ "smlal v2.4s, v22.4h, v16.4h\n"
+ "smlal v10.4s, v22.4h, v23.4h\n"
+ "smlal2 v24.4s, v4.8h, v23.8h\n"
+ "ldr x20, [x15, #0x38]\n"
+ "ldr d4, [x20, x17]\n"
+ "smlal v9.4s, v27.4h, v18.4h\n"
+ "smlal2 v0.4s, v22.8h, v5.8h\n"
+ "smlal2 v30.4s, v22.8h, v16.8h\n"
+ "ldr x20, [x15, #0x30]\n"
+ "usubl v11.8h, v11.8b, v14.8b\n"
+ "smlal2 v6.4s, v22.8h, v23.8h\n"
+ "ldr d22, [x20, x17]\n"
+ "smlal v7.4s, v8.4h, v1.4h\n"
+ "ldr x20, [x15, #0x40]\n"
+ "smlal v2.4s, v27.4h, v1.4h\n"
+ "smlal v10.4s, v27.4h, v16.4h\n"
+ "usubl v4.8h, v4.8b, v14.8b\n"
+ "ldr x27, [x15, #0x48]\n"
+ "smlal2 v24.4s, v27.8h, v18.8h\n"
+ "smlal v9.4s, v15.4h, v25.4h\n"
+ "usubl v22.8h, v22.8b, v14.8b\n"
+ "ldr x26, [x15, #0x50]\n"
+ "smlal2 v0.4s, v8.8h, v1.8h\n"
+ "ldr d8, [x20, x17]\n"
+ "smlal2 v30.4s, v27.8h, v1.8h\n"
+ "usubl v8.8h, v8.8b, v14.8b\n"
+ "smlal2 v6.4s, v27.8h, v16.8h\n"
+ "smlal v7.4s, v27.4h, v26.4h\n"
+ "ldr x25, [x15, #0x58]\n"
+ "ldr x24, [x15, #0x60]\n"
+ "smlal v2.4s, v11.4h, v31.4h\n"
+ "smlal v10.4s, v15.4h, v5.4h\n"
+ "ldr x23, [x15, #0x68]\n"
+ "ldr x22, [x15, #0x70]\n"
+ "smlal2 v24.4s, v15.8h, v25.8h\n"
+ "smlal v9.4s, v4.4h, v16.4h\n"
+ "ldr x21, [x15, #0x78]\n"
+ "ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "smlal2 v0.4s, v27.8h, v26.8h\n"
+ "ldr d27, [x27, x17]\n"
+ "smlal2 v30.4s, v11.8h, v31.8h\n"
+ "ldr d11, [x26, x17]\n"
+ "smlal2 v6.4s, v15.8h, v5.8h\n"
+ "smlal v7.4s, v15.4h, v31.4h\n"
+ "usubl v27.8h, v27.8b, v14.8b\n"
+ "add x14, x14, #0x48\n"
+ "smlal v2.4s, v15.4h, v26.4h\n"
+ "smlal v10.4s, v22.4h, v20.4h\n"
+ "usubl v11.8h, v11.8b, v14.8b\n"
+ "subs x8, x8, #0x1\n"
+ "smlal2 v24.4s, v4.8h, v16.8h\n"
+ "smlal v9.4s, v8.4h, v1.4h\n"
+ "add x13, x13, #0x20\n"
+ "add x12, x12, #0x20\n"
+ "smlal2 v0.4s, v15.8h, v31.8h\n"
+ "smlal2 v30.4s, v15.8h, v26.8h\n"
+ "ldr d15, [x25, x17]\n"
+ "usubl v15.8h, v15.8b, v14.8b\n"
+ "smlal2 v6.4s, v22.8h, v20.8h\n"
+ "ldr d22, [x24, x17]\n"
+ "smlal v7.4s, v4.4h, v23.4h\n"
+ "usubl v22.8h, v22.8b, v14.8b\n"
+ "smlal v2.4s, v27.4h, v18.4h\n"
+ "smlal v10.4s, v27.4h, v26.4h\n"
+ "smlal2 v24.4s, v8.8h, v1.8h\n"
+ "smlal v9.4s, v27.4h, v20.4h\n"
+ "smlal2 v0.4s, v4.8h, v23.8h\n"
+ "ldr d4, [x23, x17]\n"
+ "smlal2 v30.4s, v27.8h, v18.8h\n"
+ "usubl v4.8h, v4.8b, v14.8b\n"
+ "smlal2 v6.4s, v27.8h, v26.8h\n"
+ "ldr d26, [x22, x17]\n"
+ "smlal v7.4s, v8.4h, v16.4h\n"
+ "usubl v26.8h, v26.8b, v14.8b\n"
+ "smlal v2.4s, v11.4h, v23.4h\n"
+ "smlal v10.4s, v15.4h, v1.4h\n"
+ "smlal2 v24.4s, v27.8h, v20.8h\n"
+ "smlal v9.4s, v11.4h, v5.4h\n"
+ "smlal2 v0.4s, v8.8h, v16.8h\n"
+ "ldr d8, [x21, x17]\n"
+ "smlal2 v30.4s, v11.8h, v23.8h\n"
+ "usubl v8.8h, v8.8b, v14.8b\n"
+ "smlal2 v6.4s, v15.8h, v1.8h\n"
+ "smlal v7.4s, v27.4h, v25.4h\n"
+ "add x17, x17, #0x8\n"
+ "smlal v2.4s, v22.4h, v5.4h\n"
+ "smlal v10.4s, v4.4h, v18.4h\n"
+ "smlal2 v24.4s, v11.8h, v5.8h\n"
+ "smlal v9.4s, v22.4h, v31.4h\n"
+ "sqrdmulh v9.4s, v9.4s, v3.4s\n"
+ "smlal2 v0.4s, v27.8h, v25.8h\n"
+ "smlal2 v30.4s, v22.8h, v5.8h\n"
+ "and v27.16b, v9.16b, v17.16b\n"
+ "smlal2 v6.4s, v4.8h, v18.8h\n"
+ "smlal v7.4s, v15.4h, v18.4h\n"
+ "sshr v27.4s, v27.4s, #0x1f\n"
+ "smlal v2.4s, v26.4h, v25.4h\n"
+ "smlal v10.4s, v26.4h, v31.4h\n"
+ "sqadd v9.4s, v9.4s, v27.4s\n"
+ "smlal2 v24.4s, v22.8h, v31.8h\n"
+ "smlal2 v0.4s, v15.8h, v18.8h\n"
+ "sqrdmulh v24.4s, v24.4s, v21.4s\n"
+ "smlal2 v30.4s, v26.8h, v25.8h\n"
+ "smlal2 v6.4s, v26.8h, v31.8h\n"
+ "and v31.16b, v24.16b, v28.16b\n"
+ "smlal v7.4s, v4.4h, v20.4h\n"
+ "smlal v2.4s, v8.4h, v20.4h\n"
+ "sqrdmulh v7.4s, v7.4s, v3.4s\n"
+ "smlal v10.4s, v8.4h, v25.4h\n"
+ "smlal2 v0.4s, v4.8h, v20.8h\n"
+ "sqrdmulh v2.4s, v2.4s, v3.4s\n"
+ "smlal2 v30.4s, v8.8h, v20.8h\n"
+ "smlal2 v6.4s, v8.8h, v25.8h\n"
+ "sqrdmulh v10.4s, v10.4s, v3.4s\n"
+ "sshr v31.4s, v31.4s, #0x1f\n"
+ "and v22.16b, v7.16b, v17.16b\n"
+ "sqrdmulh v0.4s, v0.4s, v21.4s\n"
+ "and v3.16b, v2.16b, v17.16b\n"
+ "sqrdmulh v30.4s, v30.4s, v21.4s\n"
+ "and v11.16b, v10.16b, v17.16b\n"
+ "sqrdmulh v6.4s, v6.4s, v21.4s\n"
+ "sqadd v24.4s, v24.4s, v31.4s\n"
+ "sshr v22.4s, v22.4s, #0x1f\n"
+ "and v20.16b, v0.16b, v28.16b\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "and v31.16b, v30.16b, v28.16b\n"
+ "sshr v11.4s, v11.4s, #0x1f\n"
+ "and v18.16b, v6.16b, v28.16b\n"
+ "sqadd v7.4s, v7.4s, v22.4s\n"
+ "sshr v20.4s, v20.4s, #0x1f\n"
+ "sqadd v2.4s, v2.4s, v3.4s\n"
+ "sshr v31.4s, v31.4s, #0x1f\n"
+ "sqadd v10.4s, v10.4s, v11.4s\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "srshl v9.4s, v9.4s, v17.4s\n"
+ "srshl v7.4s, v7.4s, v17.4s\n"
+ "sqadd v0.4s, v0.4s, v20.4s\n"
+ "srshl v2.4s, v2.4s, v17.4s\n"
+ "sqadd v30.4s, v30.4s, v31.4s\n"
+ "srshl v10.4s, v10.4s, v17.4s\n"
+ "sqadd v6.4s, v6.4s, v18.4s\n"
+ "srshl v24.4s, v24.4s, v28.4s\n"
+ "sqxtn v9.4h, v9.4s\n"
+ "srshl v0.4s, v0.4s, v28.4s\n"
+ "sqxtn v7.4h, v7.4s\n"
+ "srshl v30.4s, v30.4s, v28.4s\n"
+ "sqxtn v2.4h, v2.4s\n"
+ "srshl v6.4s, v6.4s, v28.4s\n"
+ "sqxtn v10.4h, v10.4s\n"
+ "sqxtn2 v9.8h, v24.4s\n"
+ "sqxtn2 v7.8h, v0.4s\n"
+ "sqxtn2 v2.8h, v30.4s\n"
+ "sqxtn2 v10.8h, v6.4s\n"
+ "sqadd v9.8h, v9.8h, v13.8h\n"
+ "sqadd v7.8h, v7.8h, v13.8h\n"
+ "sqadd v2.8h, v2.8h, v13.8h\n"
+ "sqadd v10.8h, v10.8h, v13.8h\n"
+ "smax v9.8h, v9.8h, v29.8h\n"
+ "smax v7.8h, v7.8h, v29.8h\n"
+ "smax v2.8h, v2.8h, v29.8h\n"
+ "smax v10.8h, v10.8h, v29.8h\n"
+ "smin v9.8h, v9.8h, v12.8h\n"
+ "smin v7.8h, v7.8h, v12.8h\n"
+ "smin v2.8h, v2.8h, v12.8h\n"
+ "smin v10.8h, v10.8h, v12.8h\n"
+ "uzp1 v9.16b, v9.16b, v9.16b\n"
+ "str d9, [x11, x16]\n"
+ "uzp1 v7.16b, v7.16b, v7.16b\n"
+ "uzp1 v2.16b, v2.16b, v2.16b\n"
+ "str d7, [x10, x16]\n"
+ "uzp1 v10.16b, v10.16b, v10.16b\n"
+ "str d2, [x9, x16]\n"
+ "str d10, [x28, x16]\n"
+ "ldr q9, [x20, #0x0]\n"
+ "ldr q24, [x20, #0x10]\n"
+ "add x20, x20, #0x20\n"
+ "ldr d23, [x14, #0x0]\n"
+ "ldr d16, [x14, #0x8]\n"
+ "add x16, x16, #0x8\n"
+ "str x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldr d1, [x14, #0x10]\n"
+ "ldr d5, [x14, #0x18]\n"
+ "mov v7.16b, v9.16b\n"
+ "mov v0.16b, v24.16b\n"
+ "ldr d26, [x14, #0x20]\n"
+ "ldr d18, [x14, #0x28]\n"
+ "mov v2.16b, v9.16b\n"
+ "mov v30.16b, v24.16b\n"
+ "ldr d31, [x14, #0x30]\n"
+ "ldr d25, [x14, #0x38]\n"
+ "mov v10.16b, v9.16b\n"
+ "mov v6.16b, v24.16b\n"
+ "ldr d20, [x14, #0x40]\n"
+ "ldp x23, x22, [x15, #0x0]\n"
+ "ssubl v23.8h, v23.8b, v19.8b\n"
+ "ssubl v16.8h, v16.8b, v19.8b\n"
+ "ldp x21, x20, [x15, #0x10]\n"
+ "ldr d22, [x23, x17]\n"
+ "ssubl v1.8h, v1.8b, v19.8b\n"
+ "ssubl v5.8h, v5.8b, v19.8b\n"
+ "ldr d4, [x22, x17]\n"
+ "ldr d8, [x21, x17]\n"
+ "ssubl v26.8h, v26.8b, v19.8b\n"
+ "ssubl v18.8h, v18.8b, v19.8b\n"
+ "ldr d27, [x20, x17]\n"
+ "ldr x20, [x15, #0x20]\n"
+ "ssubl v31.8h, v31.8b, v19.8b\n"
+ "ssubl v25.8h, v25.8b, v19.8b\n"
+ "ldr d15, [x20, x17]\n"
+ "ssubl v20.8h, v20.8b, v19.8b\n"
+ "usubl v22.8h, v22.8b, v14.8b\n"
+ "usubl v4.8h, v4.8b, v14.8b\n"
+ "usubl v8.8h, v8.8b, v14.8b\n"
+ "usubl v27.8h, v27.8b, v14.8b\n"
+ "usubl v15.8h, v15.8b, v14.8b\n"
+ "bgt 1b\n"
+ "2:" // Tail
+ "ldr q28, [x13, #0x0]\n"
+ "ldr q17, [x12, #0x0]\n"
+ "smlal v9.4s, v22.4h, v26.4h\n"
+ "smlal2 v24.4s, v22.8h, v26.8h\n"
+ "ldr q21, [x13, #0x10]\n"
+ "ldr q3, [x12, #0x10]\n"
+ "smlal v9.4s, v4.4h, v23.4h\n"
+ "smlal v7.4s, v22.4h, v5.4h\n"
+ "ldr x20, [x15, #0x28]\n"
+ "ldr d11, [x20, x17]\n"
+ "smlal v2.4s, v22.4h, v16.4h\n"
+ "smlal v10.4s, v22.4h, v23.4h\n"
+ "smlal2 v24.4s, v4.8h, v23.8h\n"
+ "ldr x20, [x15, #0x38]\n"
+ "ldr d4, [x20, x17]\n"
+ "smlal v9.4s, v27.4h, v18.4h\n"
+ "smlal2 v0.4s, v22.8h, v5.8h\n"
+ "smlal2 v30.4s, v22.8h, v16.8h\n"
+ "ldr x20, [x15, #0x30]\n"
+ "usubl v11.8h, v11.8b, v14.8b\n"
+ "smlal2 v6.4s, v22.8h, v23.8h\n"
+ "ldr d22, [x20, x17]\n"
+ "smlal v7.4s, v8.4h, v1.4h\n"
+ "ldr x20, [x15, #0x40]\n"
+ "smlal v2.4s, v27.4h, v1.4h\n"
+ "smlal v10.4s, v27.4h, v16.4h\n"
+ "usubl v4.8h, v4.8b, v14.8b\n"
+ "ldr x26, [x15, #0x48]\n"
+ "smlal2 v24.4s, v27.8h, v18.8h\n"
+ "smlal v9.4s, v15.4h, v25.4h\n"
+ "usubl v22.8h, v22.8b, v14.8b\n"
+ "ldr x25, [x15, #0x50]\n"
+ "smlal2 v0.4s, v8.8h, v1.8h\n"
+ "ldr d8, [x20, x17]\n"
+ "smlal2 v30.4s, v27.8h, v1.8h\n"
+ "usubl v8.8h, v8.8b, v14.8b\n"
+ "smlal2 v6.4s, v27.8h, v16.8h\n"
+ "smlal v7.4s, v27.4h, v26.4h\n"
+ "ldr x24, [x15, #0x58]\n"
+ "ldr x23, [x15, #0x60]\n"
+ "smlal v2.4s, v11.4h, v31.4h\n"
+ "smlal v10.4s, v15.4h, v5.4h\n"
+ "ldr x22, [x15, #0x68]\n"
+ "ldr x21, [x15, #0x70]\n"
+ "smlal2 v24.4s, v15.8h, v25.8h\n"
+ "smlal v9.4s, v4.4h, v16.4h\n"
+ "ldr x20, [x15, #0x78]\n"
+ "tst x7, #0x7\n"
+ "smlal2 v0.4s, v27.8h, v26.8h\n"
+ "ldr d27, [x26, x17]\n"
+ "smlal2 v30.4s, v11.8h, v31.8h\n"
+ "ldr d11, [x25, x17]\n"
+ "smlal2 v6.4s, v15.8h, v5.8h\n"
+ "smlal v7.4s, v15.4h, v31.4h\n"
+ "usubl v27.8h, v27.8b, v14.8b\n"
+ "add x13, x13, #0x20\n"
+ "smlal v2.4s, v15.4h, v26.4h\n"
+ "smlal v10.4s, v22.4h, v20.4h\n"
+ "usubl v11.8h, v11.8b, v14.8b\n"
+ "add x12, x12, #0x20\n"
+ "smlal2 v24.4s, v4.8h, v16.8h\n"
+ "smlal v9.4s, v8.4h, v1.4h\n"
+ "smlal2 v0.4s, v15.8h, v31.8h\n"
+ "smlal2 v30.4s, v15.8h, v26.8h\n"
+ "ldr d15, [x24, x17]\n"
+ "usubl v15.8h, v15.8b, v14.8b\n"
+ "smlal2 v6.4s, v22.8h, v20.8h\n"
+ "ldr d22, [x23, x17]\n"
+ "smlal v7.4s, v4.4h, v23.4h\n"
+ "usubl v22.8h, v22.8b, v14.8b\n"
+ "smlal v2.4s, v27.4h, v18.4h\n"
+ "smlal v10.4s, v27.4h, v26.4h\n"
+ "smlal2 v24.4s, v8.8h, v1.8h\n"
+ "smlal v9.4s, v27.4h, v20.4h\n"
+ "smlal2 v0.4s, v4.8h, v23.8h\n"
+ "ldr d4, [x22, x17]\n"
+ "smlal2 v30.4s, v27.8h, v18.8h\n"
+ "usubl v4.8h, v4.8b, v14.8b\n"
+ "smlal2 v6.4s, v27.8h, v26.8h\n"
+ "ldr d26, [x21, x17]\n"
+ "smlal v7.4s, v8.4h, v16.4h\n"
+ "usubl v26.8h, v26.8b, v14.8b\n"
+ "smlal v2.4s, v11.4h, v23.4h\n"
+ "smlal v10.4s, v15.4h, v1.4h\n"
+ "smlal2 v24.4s, v27.8h, v20.8h\n"
+ "smlal v9.4s, v11.4h, v5.4h\n"
+ "smlal2 v0.4s, v8.8h, v16.8h\n"
+ "ldr d16, [x20, x17]\n"
+ "smlal2 v30.4s, v11.8h, v23.8h\n"
+ "usubl v16.8h, v16.8b, v14.8b\n"
+ "smlal2 v6.4s, v15.8h, v1.8h\n"
+ "smlal v7.4s, v27.4h, v25.4h\n"
+ "add x17, x17, #0x8\n"
+ "smlal v2.4s, v22.4h, v5.4h\n"
+ "smlal v10.4s, v4.4h, v18.4h\n"
+ "smlal2 v24.4s, v11.8h, v5.8h\n"
+ "smlal v9.4s, v22.4h, v31.4h\n"
+ "sqrdmulh v9.4s, v9.4s, v28.4s\n"
+ "smlal2 v0.4s, v27.8h, v25.8h\n"
+ "smlal2 v30.4s, v22.8h, v5.8h\n"
+ "and v1.16b, v9.16b, v17.16b\n"
+ "smlal2 v6.4s, v4.8h, v18.8h\n"
+ "smlal v7.4s, v15.4h, v18.4h\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "smlal v2.4s, v26.4h, v25.4h\n"
+ "smlal v10.4s, v26.4h, v31.4h\n"
+ "sqadd v9.4s, v9.4s, v1.4s\n"
+ "smlal2 v24.4s, v22.8h, v31.8h\n"
+ "smlal2 v0.4s, v15.8h, v18.8h\n"
+ "sqrdmulh v24.4s, v24.4s, v21.4s\n"
+ "smlal2 v30.4s, v26.8h, v25.8h\n"
+ "smlal2 v6.4s, v26.8h, v31.8h\n"
+ "and v31.16b, v24.16b, v3.16b\n"
+ "smlal v7.4s, v4.4h, v20.4h\n"
+ "smlal v2.4s, v16.4h, v20.4h\n"
+ "sqrdmulh v7.4s, v7.4s, v28.4s\n"
+ "smlal v10.4s, v16.4h, v25.4h\n"
+ "smlal2 v0.4s, v4.8h, v20.8h\n"
+ "sqrdmulh v2.4s, v2.4s, v28.4s\n"
+ "smlal2 v30.4s, v16.8h, v20.8h\n"
+ "smlal2 v6.4s, v16.8h, v25.8h\n"
+ "sqrdmulh v10.4s, v10.4s, v28.4s\n"
+ "sshr v31.4s, v31.4s, #0x1f\n"
+ "and v22.16b, v7.16b, v17.16b\n"
+ "sqrdmulh v0.4s, v0.4s, v21.4s\n"
+ "and v15.16b, v2.16b, v17.16b\n"
+ "sqrdmulh v30.4s, v30.4s, v21.4s\n"
+ "and v11.16b, v10.16b, v17.16b\n"
+ "sqrdmulh v6.4s, v6.4s, v21.4s\n"
+ "sqadd v24.4s, v24.4s, v31.4s\n"
+ "sshr v22.4s, v22.4s, #0x1f\n"
+ "and v18.16b, v0.16b, v3.16b\n"
+ "sshr v15.4s, v15.4s, #0x1f\n"
+ "and v23.16b, v30.16b, v3.16b\n"
+ "sshr v11.4s, v11.4s, #0x1f\n"
+ "and v21.16b, v6.16b, v3.16b\n"
+ "sqadd v7.4s, v7.4s, v22.4s\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sqadd v2.4s, v2.4s, v15.4s\n"
+ "sshr v23.4s, v23.4s, #0x1f\n"
+ "sqadd v10.4s, v10.4s, v11.4s\n"
+ "sshr v21.4s, v21.4s, #0x1f\n"
+ "srshl v9.4s, v9.4s, v17.4s\n"
+ "srshl v7.4s, v7.4s, v17.4s\n"
+ "sqadd v0.4s, v0.4s, v18.4s\n"
+ "srshl v2.4s, v2.4s, v17.4s\n"
+ "sqadd v30.4s, v30.4s, v23.4s\n"
+ "srshl v10.4s, v10.4s, v17.4s\n"
+ "sqadd v6.4s, v6.4s, v21.4s\n"
+ "srshl v24.4s, v24.4s, v3.4s\n"
+ "sqxtn v9.4h, v9.4s\n"
+ "srshl v0.4s, v0.4s, v3.4s\n"
+ "sqxtn v7.4h, v7.4s\n"
+ "srshl v30.4s, v30.4s, v3.4s\n"
+ "sqxtn v2.4h, v2.4s\n"
+ "srshl v6.4s, v6.4s, v3.4s\n"
+ "sqxtn v10.4h, v10.4s\n"
+ "sqxtn2 v9.8h, v24.4s\n"
+ "sqxtn2 v7.8h, v0.4s\n"
+ "sqxtn2 v2.8h, v30.4s\n"
+ "sqxtn2 v10.8h, v6.4s\n"
+ "sqadd v9.8h, v9.8h, v13.8h\n"
+ "sqadd v7.8h, v7.8h, v13.8h\n"
+ "sqadd v2.8h, v2.8h, v13.8h\n"
+ "sqadd v10.8h, v10.8h, v13.8h\n"
+ "smax v9.8h, v9.8h, v29.8h\n"
+ "smax v7.8h, v7.8h, v29.8h\n"
+ "smax v2.8h, v2.8h, v29.8h\n"
+ "smax v10.8h, v10.8h, v29.8h\n"
+ "smin v9.8h, v9.8h, v12.8h\n"
+ "smin v7.8h, v7.8h, v12.8h\n"
+ "smin v2.8h, v2.8h, v12.8h\n"
+ "smin v10.8h, v10.8h, v12.8h\n"
+ "uzp1 v9.16b, v9.16b, v9.16b\n"
+ "str d9, [x11, x16]\n"
+ "uzp1 v7.16b, v7.16b, v7.16b\n"
+ "uzp1 v2.16b, v2.16b, v2.16b\n"
+ "str d7, [x10, x16]\n"
+ "uzp1 v10.16b, v10.16b, v10.16b\n"
+ "str d2, [x9, x16]\n"
+ "str d10, [x28, x16]\n"
+ "add x16, x16, #0x8\n"
+ "beq 64f\n"
+ "add x14, x14, #0x48\n"
+ "3:" // Oddments
+ "ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "tbz x7, #2, 5f\n"
+ "ld1 { v9.4s }, [x20], #0x10\n"
+ "tbz x7, #1, 4f\n"
+ "ld1 { v24.d }[0], [x20], #0x8\n"
+ "tbz x7, #0, 7f\n"
+ "ld1 { v24.s }[2], [x20]\n"
+ "b 7f\n"
+ "4:" // Oddments: Load bias: Bit 2: Bit 1: Unset
+ "tbz x7, #0, 7f\n"
+ "ld1 { v24.s }[0], [x20]\n"
+ "b 7f\n"
+ "5:" // Oddments: Load bias: Bit 2: Unset
+ "tbz x7, #1, 6f\n"
+ "ld1 { v9.d }[0], [x20], #0x8\n"
+ "tbz x7, #0, 7f\n"
+ "ld1 { v9.s }[2], [x20]\n"
+ "b 7f\n"
+ "6:" // Oddments: Load bias: Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 7f\n"
+ "ld1 { v9.s }[0], [x20]\n"
+ "7:" // Oddments: Load bias: Bit 2: End
+ "ldr d23, [x14, #0x0]\n"
+ "ldr d16, [x14, #0x8]\n"
+ "mov v7.16b, v9.16b\n"
+ "mov v0.16b, v24.16b\n"
+ "ldr d1, [x14, #0x10]\n"
+ "ldr d5, [x14, #0x18]\n"
+ "mov v2.16b, v9.16b\n"
+ "mov v30.16b, v24.16b\n"
+ "ldr d26, [x14, #0x20]\n"
+ "ldr d18, [x14, #0x28]\n"
+ "mov v10.16b, v9.16b\n"
+ "mov v6.16b, v24.16b\n"
+ "ldr d31, [x14, #0x30]\n"
+ "ldr d25, [x14, #0x38]\n"
+ "ssubl v23.8h, v23.8b, v19.8b\n"
+ "ssubl v16.8h, v16.8b, v19.8b\n"
+ "ldr d20, [x14, #0x40]\n"
+ "ldp x24, x23, [x15, #0x0]\n"
+ "ssubl v1.8h, v1.8b, v19.8b\n"
+ "ssubl v5.8h, v5.8b, v19.8b\n"
+ "ldp x22, x21, [x15, #0x10]\n"
+ "ldr x20, [x15, #0x20]\n"
+ "ssubl v26.8h, v26.8b, v19.8b\n"
+ "ssubl v18.8h, v18.8b, v19.8b\n"
+ "ssubl v31.8h, v31.8b, v19.8b\n"
+ "ssubl v25.8h, v25.8b, v19.8b\n"
+ "ssubl v20.8h, v20.8b, v19.8b\n"
+ "add x24, x24, x17\n"
+ "add x23, x23, x17\n"
+ "add x22, x22, x17\n"
+ "add x21, x21, x17\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 9f\n"
+ "ld1 { v22.s }[0], [x24], #0x4\n"
+ "ld1 { v4.s }[0], [x23], #0x4\n"
+ "ld1 { v8.s }[0], [x22], #0x4\n"
+ "ld1 { v27.s }[0], [x21], #0x4\n"
+ "ld1 { v15.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 8f\n"
+ "ld1 { v22.h }[2], [x24], #0x2\n"
+ "ld1 { v4.h }[2], [x23], #0x2\n"
+ "ld1 { v8.h }[2], [x22], #0x2\n"
+ "ld1 { v27.h }[2], [x21], #0x2\n"
+ "ld1 { v15.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 11f\n"
+ "ld1 { v22.b }[6], [x24]\n"
+ "ld1 { v4.b }[6], [x23]\n"
+ "ld1 { v8.b }[6], [x22]\n"
+ "ld1 { v27.b }[6], [x21]\n"
+ "ld1 { v15.b }[6], [x20]\n"
+ "b 11f\n"
+ "8:" // Oddments: Initial loads: Bit 2: Bit 1: Unset
+ "tbz x7, #0, 11f\n"
+ "ld1 { v22.b }[4], [x24]\n"
+ "ld1 { v4.b }[4], [x23]\n"
+ "ld1 { v8.b }[4], [x22]\n"
+ "ld1 { v27.b }[4], [x21]\n"
+ "ld1 { v15.b }[4], [x20]\n"
+ "b 11f\n"
+ "9:" // Oddments: Initial loads: Bit 2: Unset
+ "tbz x7, #1, 10f\n"
+ "ld1 { v22.h }[0], [x24], #0x2\n"
+ "ld1 { v4.h }[0], [x23], #0x2\n"
+ "ld1 { v8.h }[0], [x22], #0x2\n"
+ "ld1 { v27.h }[0], [x21], #0x2\n"
+ "ld1 { v15.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 11f\n"
+ "ld1 { v22.b }[2], [x24]\n"
+ "ld1 { v4.b }[2], [x23]\n"
+ "ld1 { v8.b }[2], [x22]\n"
+ "ld1 { v27.b }[2], [x21]\n"
+ "ld1 { v15.b }[2], [x20]\n"
+ "b 11f\n"
+ "10:" // Oddments: Initial loads: Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 11f\n"
+ "ld1 { v22.b }[0], [x24]\n"
+ "ld1 { v4.b }[0], [x23]\n"
+ "ld1 { v8.b }[0], [x22]\n"
+ "ld1 { v27.b }[0], [x21]\n"
+ "ld1 { v15.b }[0], [x20]\n"
+ "11:" // Oddments: Initial loads: Bit 2: End
+ "usubl v22.8h, v22.8b, v14.8b\n"
+ "smlal v9.4s, v22.4h, v26.4h\n"
+ "smlal2 v24.4s, v22.8h, v26.8h\n"
+ "ldr x20, [x15, #0x28]\n"
+ "smlal v7.4s, v22.4h, v5.4h\n"
+ "smlal2 v0.4s, v22.8h, v5.8h\n"
+ "usubl v4.8h, v4.8b, v14.8b\n"
+ "usubl v8.8h, v8.8b, v14.8b\n"
+ "smlal v2.4s, v22.4h, v16.4h\n"
+ "smlal2 v30.4s, v22.8h, v16.8h\n"
+ "add x20, x20, x17\n"
+ "smlal v10.4s, v22.4h, v23.4h\n"
+ "smlal2 v6.4s, v22.8h, v23.8h\n"
+ "usubl v27.8h, v27.8b, v14.8b\n"
+ "smlal v9.4s, v4.4h, v23.4h\n"
+ "smlal2 v24.4s, v4.8h, v23.8h\n"
+ "usubl v15.8h, v15.8b, v14.8b\n"
+ "smlal v7.4s, v8.4h, v1.4h\n"
+ "smlal2 v0.4s, v8.8h, v1.8h\n"
+ "smlal v9.4s, v27.4h, v18.4h\n"
+ "smlal2 v24.4s, v27.8h, v18.8h\n"
+ "smlal v7.4s, v27.4h, v26.4h\n"
+ "smlal2 v0.4s, v27.8h, v26.8h\n"
+ "smlal v2.4s, v27.4h, v1.4h\n"
+ "smlal2 v30.4s, v27.8h, v1.8h\n"
+ "smlal v10.4s, v27.4h, v16.4h\n"
+ "smlal2 v6.4s, v27.8h, v16.8h\n"
+ "tbz x7, #2, 13f\n"
+ "ld1 { v21.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 12f\n"
+ "ld1 { v21.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 15f\n"
+ "ld1 { v21.b }[6], [x20]\n"
+ "b 15f\n"
+ "12:" // Oddments: Load (3, 0): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 15f\n"
+ "ld1 { v21.b }[4], [x20]\n"
+ "b 15f\n"
+ "13:" // Oddments: Load (3, 0): Bit 2: Unset
+ "tbz x7, #1, 14f\n"
+ "ld1 { v21.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 15f\n"
+ "ld1 { v21.b }[2], [x20]\n"
+ "b 15f\n"
+ "14:" // Oddments: Load (3, 0): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 15f\n"
+ "ld1 { v21.b }[0], [x20]\n"
+ "15:" // Oddments: Load (3, 0): Bit 2: End
+ "usubl v21.8h, v21.8b, v14.8b\n"
+ "smlal v2.4s, v21.4h, v31.4h\n"
+ "smlal2 v30.4s, v21.8h, v31.8h\n"
+ "ldr x20, [x15, #0x30]\n"
+ "smlal v9.4s, v15.4h, v25.4h\n"
+ "smlal2 v24.4s, v15.8h, v25.8h\n"
+ "add x20, x20, x17\n"
+ "smlal v7.4s, v15.4h, v31.4h\n"
+ "smlal2 v0.4s, v15.8h, v31.8h\n"
+ "smlal v2.4s, v15.4h, v26.4h\n"
+ "smlal2 v30.4s, v15.8h, v26.8h\n"
+ "smlal v10.4s, v15.4h, v5.4h\n"
+ "smlal2 v6.4s, v15.8h, v5.8h\n"
+ "tbz x7, #2, 17f\n"
+ "ld1 { v28.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 16f\n"
+ "ld1 { v28.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 19f\n"
+ "ld1 { v28.b }[6], [x20]\n"
+ "b 19f\n"
+ "16:" // Oddments: Load (3, 3): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 19f\n"
+ "ld1 { v28.b }[4], [x20]\n"
+ "b 19f\n"
+ "17:" // Oddments: Load (3, 3): Bit 2: Unset
+ "tbz x7, #1, 18f\n"
+ "ld1 { v28.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 19f\n"
+ "ld1 { v28.b }[2], [x20]\n"
+ "b 19f\n"
+ "18:" // Oddments: Load (3, 3): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 19f\n"
+ "ld1 { v28.b }[0], [x20]\n"
+ "19:" // Oddments: Load (3, 3): Bit 2: End
+ "usubl v28.8h, v28.8b, v14.8b\n"
+ "ldr x20, [x15, #0x38]\n"
+ "smlal v10.4s, v28.4h, v20.4h\n"
+ "smlal2 v6.4s, v28.8h, v20.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 21f\n"
+ "ld1 { v22.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 20f\n"
+ "ld1 { v22.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 23f\n"
+ "ld1 { v22.b }[6], [x20]\n"
+ "b 23f\n"
+ "20:" // Oddments: Load (0, 1): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 23f\n"
+ "ld1 { v22.b }[4], [x20]\n"
+ "b 23f\n"
+ "21:" // Oddments: Load (0, 1): Bit 2: Unset
+ "tbz x7, #1, 22f\n"
+ "ld1 { v22.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 23f\n"
+ "ld1 { v22.b }[2], [x20]\n"
+ "b 23f\n"
+ "22:" // Oddments: Load (0, 1): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 23f\n"
+ "ld1 { v22.b }[0], [x20]\n"
+ "23:" // Oddments: Load (0, 1): Bit 2: End
+ "usubl v22.8h, v22.8b, v14.8b\n"
+ "ldr x20, [x15, #0x40]\n"
+ "smlal v9.4s, v22.4h, v16.4h\n"
+ "smlal2 v24.4s, v22.8h, v16.8h\n"
+ "smlal v7.4s, v22.4h, v23.4h\n"
+ "smlal2 v0.4s, v22.8h, v23.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 25f\n"
+ "ld1 { v21.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 24f\n"
+ "ld1 { v21.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 27f\n"
+ "ld1 { v21.b }[6], [x20]\n"
+ "b 27f\n"
+ "24:" // Oddments: Load (0, 2): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 27f\n"
+ "ld1 { v21.b }[4], [x20]\n"
+ "b 27f\n"
+ "25:" // Oddments: Load (0, 2): Bit 2: Unset
+ "tbz x7, #1, 26f\n"
+ "ld1 { v21.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 27f\n"
+ "ld1 { v21.b }[2], [x20]\n"
+ "b 27f\n"
+ "26:" // Oddments: Load (0, 2): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 27f\n"
+ "ld1 { v21.b }[0], [x20]\n"
+ "27:" // Oddments: Load (0, 2): Bit 2: End
+ "usubl v21.8h, v21.8b, v14.8b\n"
+ "ldr x20, [x15, #0x48]\n"
+ "smlal v9.4s, v21.4h, v1.4h\n"
+ "smlal2 v24.4s, v21.8h, v1.8h\n"
+ "smlal v7.4s, v21.4h, v16.4h\n"
+ "smlal2 v0.4s, v21.8h, v16.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 29f\n"
+ "ld1 { v28.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 28f\n"
+ "ld1 { v28.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 31f\n"
+ "ld1 { v28.b }[6], [x20]\n"
+ "b 31f\n"
+ "28:" // Oddments: Load (2, 2): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 31f\n"
+ "ld1 { v28.b }[4], [x20]\n"
+ "b 31f\n"
+ "29:" // Oddments: Load (2, 2): Bit 2: Unset
+ "tbz x7, #1, 30f\n"
+ "ld1 { v28.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 31f\n"
+ "ld1 { v28.b }[2], [x20]\n"
+ "b 31f\n"
+ "30:" // Oddments: Load (2, 2): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 31f\n"
+ "ld1 { v28.b }[0], [x20]\n"
+ "31:" // Oddments: Load (2, 2): Bit 2: End
+ "usubl v28.8h, v28.8b, v14.8b\n"
+ "ldr x20, [x15, #0x50]\n"
+ "smlal v9.4s, v28.4h, v20.4h\n"
+ "smlal2 v24.4s, v28.8h, v20.8h\n"
+ "smlal v7.4s, v28.4h, v25.4h\n"
+ "smlal2 v0.4s, v28.8h, v25.8h\n"
+ "add x20, x20, x17\n"
+ "smlal v2.4s, v28.4h, v18.4h\n"
+ "smlal2 v30.4s, v28.8h, v18.8h\n"
+ "smlal v10.4s, v28.4h, v26.4h\n"
+ "smlal2 v6.4s, v28.8h, v26.8h\n"
+ "tbz x7, #2, 33f\n"
+ "ld1 { v8.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 32f\n"
+ "ld1 { v8.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 35f\n"
+ "ld1 { v8.b }[6], [x20]\n"
+ "b 35f\n"
+ "32:" // Oddments: Load (1, 0): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 35f\n"
+ "ld1 { v8.b }[4], [x20]\n"
+ "b 35f\n"
+ "33:" // Oddments: Load (1, 0): Bit 2: Unset
+ "tbz x7, #1, 34f\n"
+ "ld1 { v8.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 35f\n"
+ "ld1 { v8.b }[2], [x20]\n"
+ "b 35f\n"
+ "34:" // Oddments: Load (1, 0): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 35f\n"
+ "ld1 { v8.b }[0], [x20]\n"
+ "35:" // Oddments: Load (1, 0): Bit 2: End
+ "usubl v8.8h, v8.8b, v14.8b\n"
+ "ldr x20, [x15, #0x58]\n"
+ "smlal v9.4s, v8.4h, v5.4h\n"
+ "smlal2 v24.4s, v8.8h, v5.8h\n"
+ "smlal v2.4s, v8.4h, v23.4h\n"
+ "smlal2 v30.4s, v8.8h, v23.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 37f\n"
+ "ld1 { v8.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 36f\n"
+ "ld1 { v8.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 39f\n"
+ "ld1 { v8.b }[6], [x20]\n"
+ "b 39f\n"
+ "36:" // Oddments: Load (1, 3): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 39f\n"
+ "ld1 { v8.b }[4], [x20]\n"
+ "b 39f\n"
+ "37:" // Oddments: Load (1, 3): Bit 2: Unset
+ "tbz x7, #1, 38f\n"
+ "ld1 { v8.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 39f\n"
+ "ld1 { v8.b }[2], [x20]\n"
+ "b 39f\n"
+ "38:" // Oddments: Load (1, 3): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 39f\n"
+ "ld1 { v8.b }[0], [x20]\n"
+ "39:" // Oddments: Load (1, 3): Bit 2: End
+ "usubl v8.8h, v8.8b, v14.8b\n"
+ "ldr x20, [x15, #0x60]\n"
+ "smlal v7.4s, v8.4h, v18.4h\n"
+ "smlal2 v0.4s, v8.8h, v18.8h\n"
+ "smlal v10.4s, v8.4h, v1.4h\n"
+ "smlal2 v6.4s, v8.8h, v1.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 41f\n"
+ "ld1 { v17.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 40f\n"
+ "ld1 { v17.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 43f\n"
+ "ld1 { v17.b }[6], [x20]\n"
+ "b 43f\n"
+ "40:" // Oddments: Load (2, 0): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 43f\n"
+ "ld1 { v17.b }[4], [x20]\n"
+ "b 43f\n"
+ "41:" // Oddments: Load (2, 0): Bit 2: Unset
+ "tbz x7, #1, 42f\n"
+ "ld1 { v17.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 43f\n"
+ "ld1 { v17.b }[2], [x20]\n"
+ "b 43f\n"
+ "42:" // Oddments: Load (2, 0): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 43f\n"
+ "ld1 { v17.b }[0], [x20]\n"
+ "43:" // Oddments: Load (2, 0): Bit 2: End
+ "usubl v17.8h, v17.8b, v14.8b\n"
+ "ldr x20, [x15, #0x68]\n"
+ "smlal v9.4s, v17.4h, v31.4h\n"
+ "smlal2 v24.4s, v17.8h, v31.8h\n"
+ "smlal v2.4s, v17.4h, v5.4h\n"
+ "smlal2 v30.4s, v17.8h, v5.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 45f\n"
+ "ld1 { v23.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 44f\n"
+ "ld1 { v23.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 47f\n"
+ "ld1 { v23.b }[6], [x20]\n"
+ "b 47f\n"
+ "44:" // Oddments: Load (2, 3): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 47f\n"
+ "ld1 { v23.b }[4], [x20]\n"
+ "b 47f\n"
+ "45:" // Oddments: Load (2, 3): Bit 2: Unset
+ "tbz x7, #1, 46f\n"
+ "ld1 { v23.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 47f\n"
+ "ld1 { v23.b }[2], [x20]\n"
+ "b 47f\n"
+ "46:" // Oddments: Load (2, 3): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 47f\n"
+ "ld1 { v23.b }[0], [x20]\n"
+ "47:" // Oddments: Load (2, 3): Bit 2: End
+ "usubl v23.8h, v23.8b, v14.8b\n"
+ "ldr x20, [x15, #0x70]\n"
+ "smlal v7.4s, v23.4h, v20.4h\n"
+ "smlal2 v0.4s, v23.8h, v20.8h\n"
+ "smlal v10.4s, v23.4h, v18.4h\n"
+ "smlal2 v6.4s, v23.8h, v18.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 49f\n"
+ "ld1 { v5.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 48f\n"
+ "ld1 { v5.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 51f\n"
+ "ld1 { v5.b }[6], [x20]\n"
+ "b 51f\n"
+ "48:" // Oddments: Load (3, 1): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 51f\n"
+ "ld1 { v5.b }[4], [x20]\n"
+ "b 51f\n"
+ "49:" // Oddments: Load (3, 1): Bit 2: Unset
+ "tbz x7, #1, 50f\n"
+ "ld1 { v5.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 51f\n"
+ "ld1 { v5.b }[2], [x20]\n"
+ "b 51f\n"
+ "50:" // Oddments: Load (3, 1): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 51f\n"
+ "ld1 { v5.b }[0], [x20]\n"
+ "51:" // Oddments: Load (3, 1): Bit 2: End
+ "usubl v5.8h, v5.8b, v14.8b\n"
+ "ldr x20, [x15, #0x78]\n"
+ "smlal v2.4s, v5.4h, v25.4h\n"
+ "smlal2 v30.4s, v5.8h, v25.8h\n"
+ "smlal v10.4s, v5.4h, v31.4h\n"
+ "smlal2 v6.4s, v5.8h, v31.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 53f\n"
+ "ld1 { v23.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 52f\n"
+ "ld1 { v23.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 55f\n"
+ "ld1 { v23.b }[6], [x20]\n"
+ "b 55f\n"
+ "52:" // Oddments: Load (3, 2): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 55f\n"
+ "ld1 { v23.b }[4], [x20]\n"
+ "b 55f\n"
+ "53:" // Oddments: Load (3, 2): Bit 2: Unset
+ "tbz x7, #1, 54f\n"
+ "ld1 { v23.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 55f\n"
+ "ld1 { v23.b }[2], [x20]\n"
+ "b 55f\n"
+ "54:" // Oddments: Load (3, 2): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 55f\n"
+ "ld1 { v23.b }[0], [x20]\n"
+ "55:" // Oddments: Load (3, 2): Bit 2: End
+ "usubl v23.8h, v23.8b, v14.8b\n"
+ "smlal v2.4s, v23.4h, v20.4h\n"
+ "smlal2 v30.4s, v23.8h, v20.8h\n"
+ "smlal v10.4s, v23.4h, v25.4h\n"
+ "smlal2 v6.4s, v23.8h, v25.8h\n"
+ "tbz x7, #2, 57f\n"
+ "ld1 { v15.4s }, [x13], #0x10\n"
+ "ld1 { v19.4s }, [x12], #0x10\n"
+ "tbz x7, #1, 56f\n"
+ "ld1 { v18.d }[0], [x13], #0x8\n"
+ "ld1 { v22.d }[0], [x12], #0x8\n"
+ "tbz x7, #0, 59f\n"
+ "ld1 { v18.s }[2], [x13]\n"
+ "ld1 { v22.s }[2], [x12]\n"
+ "b 59f\n"
+ "56:" // Oddments: Load requant params: Bit 2: Bit 1: Unset
+ "tbz x7, #0, 59f\n"
+ "ld1 { v18.s }[0], [x13]\n"
+ "ld1 { v22.s }[0], [x12]\n"
+ "b 59f\n"
+ "57:" // Oddments: Load requant params: Bit 2: Unset
+ "tbz x7, #1, 58f\n"
+ "ld1 { v15.d }[0], [x13], #0x8\n"
+ "ld1 { v19.d }[0], [x12], #0x8\n"
+ "tbz x7, #0, 59f\n"
+ "ld1 { v15.s }[2], [x13]\n"
+ "ld1 { v19.s }[2], [x12]\n"
+ "b 59f\n"
+ "58:" // Oddments: Load requant params: Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 59f\n"
+ "ld1 { v15.s }[0], [x13]\n"
+ "ld1 { v19.s }[0], [x12]\n"
+ "59:" // Oddments: Load requant params: Bit 2: End
+ "sqrdmulh v9.4s, v9.4s, v15.4s\n"
+ "and v17.16b, v9.16b, v19.16b\n"
+ "add x11, x11, x16\n"
+ "add x10, x10, x16\n"
+ "sqrdmulh v24.4s, v24.4s, v18.4s\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "add x9, x9, x16\n"
+ "add x28, x28, x16\n"
+ "and v20.16b, v24.16b, v22.16b\n"
+ "sqrdmulh v7.4s, v7.4s, v15.4s\n"
+ "sqrdmulh v2.4s, v2.4s, v15.4s\n"
+ "sqrdmulh v10.4s, v10.4s, v15.4s\n"
+ "sqadd v9.4s, v9.4s, v17.4s\n"
+ "sshr v20.4s, v20.4s, #0x1f\n"
+ "and v21.16b, v7.16b, v19.16b\n"
+ "sqrdmulh v0.4s, v0.4s, v18.4s\n"
+ "and v15.16b, v2.16b, v19.16b\n"
+ "sqrdmulh v30.4s, v30.4s, v18.4s\n"
+ "and v23.16b, v10.16b, v19.16b\n"
+ "sqrdmulh v6.4s, v6.4s, v18.4s\n"
+ "sqadd v24.4s, v24.4s, v20.4s\n"
+ "sshr v21.4s, v21.4s, #0x1f\n"
+ "and v18.16b, v0.16b, v22.16b\n"
+ "sshr v15.4s, v15.4s, #0x1f\n"
+ "and v17.16b, v30.16b, v22.16b\n"
+ "sshr v23.4s, v23.4s, #0x1f\n"
+ "and v28.16b, v6.16b, v22.16b\n"
+ "sqadd v7.4s, v7.4s, v21.4s\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sqadd v2.4s, v2.4s, v15.4s\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sqadd v10.4s, v10.4s, v23.4s\n"
+ "sshr v28.4s, v28.4s, #0x1f\n"
+ "srshl v9.4s, v9.4s, v19.4s\n"
+ "srshl v7.4s, v7.4s, v19.4s\n"
+ "sqadd v0.4s, v0.4s, v18.4s\n"
+ "srshl v2.4s, v2.4s, v19.4s\n"
+ "sqadd v30.4s, v30.4s, v17.4s\n"
+ "srshl v10.4s, v10.4s, v19.4s\n"
+ "sqadd v6.4s, v6.4s, v28.4s\n"
+ "srshl v24.4s, v24.4s, v22.4s\n"
+ "sqxtn v9.4h, v9.4s\n"
+ "srshl v0.4s, v0.4s, v22.4s\n"
+ "sqxtn v7.4h, v7.4s\n"
+ "srshl v30.4s, v30.4s, v22.4s\n"
+ "sqxtn v2.4h, v2.4s\n"
+ "srshl v6.4s, v6.4s, v22.4s\n"
+ "sqxtn v10.4h, v10.4s\n"
+ "sqxtn2 v9.8h, v24.4s\n"
+ "sqxtn2 v7.8h, v0.4s\n"
+ "sqxtn2 v2.8h, v30.4s\n"
+ "sqxtn2 v10.8h, v6.4s\n"
+ "sqadd v9.8h, v9.8h, v13.8h\n"
+ "sqadd v7.8h, v7.8h, v13.8h\n"
+ "sqadd v2.8h, v2.8h, v13.8h\n"
+ "sqadd v10.8h, v10.8h, v13.8h\n"
+ "smax v9.8h, v9.8h, v29.8h\n"
+ "smax v7.8h, v7.8h, v29.8h\n"
+ "smax v2.8h, v2.8h, v29.8h\n"
+ "smax v10.8h, v10.8h, v29.8h\n"
+ "smin v9.8h, v9.8h, v12.8h\n"
+ "smin v7.8h, v7.8h, v12.8h\n"
+ "smin v2.8h, v2.8h, v12.8h\n"
+ "smin v10.8h, v10.8h, v12.8h\n"
+ "uzp1 v9.16b, v9.16b, v9.16b\n"
+ "uzp1 v7.16b, v7.16b, v7.16b\n"
+ "uzp1 v2.16b, v2.16b, v2.16b\n"
+ "uzp1 v10.16b, v10.16b, v10.16b\n"
+ "tbz x7, #2, 61f\n"
+ "st1 { v9.s }[0], [x11], #0x4\n"
+ "st1 { v7.s }[0], [x10], #0x4\n"
+ "st1 { v2.s }[0], [x9], #0x4\n"
+ "st1 { v10.s }[0], [x28], #0x4\n"
+ "tbz x7, #1, 60f\n"
+ "st1 { v9.h }[2], [x11], #0x2\n"
+ "st1 { v7.h }[2], [x10], #0x2\n"
+ "st1 { v2.h }[2], [x9], #0x2\n"
+ "st1 { v10.h }[2], [x28], #0x2\n"
+ "tbz x7, #0, 63f\n"
+ "st1 { v9.b }[6], [x11], #0x1\n"
+ "st1 { v7.b }[6], [x10], #0x1\n"
+ "st1 { v2.b }[6], [x9], #0x1\n"
+ "st1 { v10.b }[6], [x28], #0x1\n"
+ "b 63f\n"
+ "60:" // Oddments: Bit 2: Bit 1: Unset
+ "tbz x7, #0, 63f\n"
+ "st1 { v9.b }[4], [x11], #0x1\n"
+ "st1 { v7.b }[4], [x10], #0x1\n"
+ "st1 { v2.b }[4], [x9], #0x1\n"
+ "st1 { v10.b }[4], [x28], #0x1\n"
+ "b 63f\n"
+ "61:" // Oddments: Bit 2: Unset
+ "tbz x7, #1, 62f\n"
+ "st1 { v9.h }[0], [x11], #0x2\n"
+ "st1 { v7.h }[0], [x10], #0x2\n"
+ "st1 { v2.h }[0], [x9], #0x2\n"
+ "st1 { v10.h }[0], [x28], #0x2\n"
+ "tbz x7, #0, 63f\n"
+ "st1 { v9.b }[2], [x11], #0x1\n"
+ "st1 { v7.b }[2], [x10], #0x1\n"
+ "st1 { v2.b }[2], [x9], #0x1\n"
+ "st1 { v10.b }[2], [x28], #0x1\n"
+ "b 63f\n"
+ "62:" // Oddments: Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 63f\n"
+ "st1 { v9.b }[0], [x11], #0x1\n"
+ "st1 { v7.b }[0], [x10], #0x1\n"
+ "st1 { v2.b }[0], [x9], #0x1\n"
+ "st1 { v10.b }[0], [x28], #0x1\n"
+ "63:" // Oddments: Bit 2: End
+ "64:" // End
+ :
+ : [offsetof_Params_bias] "I" (offsetof(Params, bias)), [offsetof_Params_inptrs] "I" (offsetof(Params, inptrs)), [offsetof_Params_n_channels] "I" (offsetof(Params, n_channels)), [offsetof_Params_outptrs] "I" (offsetof(Params, outptrs)), [offsetof_Params_requant] "I" (offsetof(Params, requant)), [offsetof_Params_requant_muls] "I" (offsetof(Params, requant_muls)), [offsetof_Params_requant_shifts] "I" (offsetof(Params, requant_shifts)), [offsetof_Params_weights] "I" (offsetof(Params, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [params] "r" (&params)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
new file mode 100644
index 0000000000..7c05b36f36
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) 2021-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "utils.hpp"
+#include "src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+
+
+void a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(  // defined in the matching generic.cpp
+  const unsigned int,                // n_channels
+  const uint8_t *const *const,       // inptrs: pointers to the input patch rows/cols
+  const int8_t *const,               // weights (signed 8-bit)
+  const int32_t *const,              // bias
+  const arm_gemm::Requantize32 &,    // qp: requantization parameters (offsets, min/max)
+  const int32_t *const,              // requant_muls
+  const int32_t *const,              // requant_shifts
+  uint8_t *const *const              // outptrs: pointers to the 2x2 output tile rows
+);
+
+class a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy<uint8_t, int8_t, uint8_t, int32_t>  // u8 input / s8 weights / u8 output, i32 accumulators
+{
+  using Parent = DepthwiseDepthfirstStrategy<uint8_t, int8_t, uint8_t, int32_t>;  // shorthand for the strategy base
+
+  public:
+  constexpr static unsigned int kernel_rows = 3;  // 3x3 filter
+  constexpr static unsigned int kernel_cols = 3;
+
+  constexpr static unsigned int stride_rows = 2;  // stride 2 in both dimensions
+  constexpr static unsigned int stride_cols = 2;
+
+  a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst(const CPUInfo *) : Parent(2, 2, 3, 3, 2, 2) {}  // presumably (out rows, out cols, kernel rows, kernel cols, stride rows, stride cols) -- confirm against Parent's ctor
+
+  arm_gemm::VLType get_vl_type(void) const override { return arm_gemm::VLType::None; }  // plain NEON, no scalable vectors
+
+  Parent::KernelType kernel = a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl;  // the assembly routine declared above
+  Parent::KernelType get_kernel(void) const override { return kernel; }
+  unsigned int get_accumulator_depth_vl(void) const override { return 2; }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp
new file mode 100644
index 0000000000..e9db8e1322
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp
@@ -0,0 +1,1397 @@
+/*
+ * Copyright (c) 2021-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_gemm.hpp"
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(
+ const unsigned int n_channels,
+ const uint8_t *const *const inptrs,
+ const int8_t *const weights,
+ const int32_t *const bias,
+ const arm_gemm::Requantize32 &qp,
+ const int32_t *const requant_muls,
+ const int32_t *const requant_shifts,
+ uint8_t *const *const outptrs
+)
+{
+  struct Params  // argument block the inline asm below reads via the offsetof_Params_* constants
+  {
+    uint64_t n_channels;
+    const void *weights;
+    const int32_t *bias;
+    const arm_gemm::Requantize32 *requant;  // asm reads a_offset/b_offset/c_offset/minval/maxval from here
+    const int32_t *const requant_muls;
+    const int32_t *const requant_shifts;
+    uint8_t *const *const outptrs;
+    const uint8_t *inptrs[25];  // input pointers, permuted from the caller's order in the ctor
+
+    Params(
+      long unsigned int n_channels,
+      const uint8_t *const *inptrs_raw,
+      const void *const weights,
+      const int32_t *const bias,
+      const arm_gemm::Requantize32 &qp,
+      const int32_t *const requant_muls,
+      const int32_t *const requant_shifts,
+      uint8_t *const *outptrs
+    ) : n_channels(n_channels), weights(weights), bias(bias),
+      requant(&qp), requant_muls(requant_muls),
+      requant_shifts(requant_shifts), outptrs(outptrs)
+    {
+      inptrs[0] = inptrs_raw[12];  // kernel-specific permutation: re-order the 25 input pointers into the asm's traversal order
+      inptrs[1] = inptrs_raw[0];
+      inptrs[2] = inptrs_raw[1];
+      inptrs[3] = inptrs_raw[3];
+      inptrs[4] = inptrs_raw[4];
+      inptrs[5] = inptrs_raw[5];
+      inptrs[6] = inptrs_raw[6];
+      inptrs[7] = inptrs_raw[2];
+      inptrs[8] = inptrs_raw[8];
+      inptrs[9] = inptrs_raw[9];
+      inptrs[10] = inptrs_raw[7];
+      inptrs[11] = inptrs_raw[15];
+      inptrs[12] = inptrs_raw[10];
+      inptrs[13] = inptrs_raw[16];
+      inptrs[14] = inptrs_raw[11];
+      inptrs[15] = inptrs_raw[18];
+      inptrs[16] = inptrs_raw[13];
+      inptrs[17] = inptrs_raw[19];
+      inptrs[18] = inptrs_raw[20];
+      inptrs[19] = inptrs_raw[14];
+      inptrs[20] = inptrs_raw[21];
+      inptrs[21] = inptrs_raw[17];
+      inptrs[22] = inptrs_raw[23];
+      inptrs[23] = inptrs_raw[22];
+      inptrs[24] = inptrs_raw[24];
+
+    }
+  };
+
+ const Params params(n_channels, inptrs, weights, bias, qp,
+ requant_muls, requant_shifts, outptrs);
+
+ __asm__ __volatile__(
+ "ldr x7, [%x[params], %[offsetof_Params_n_channels]]\n"
+ "ldr x23, [%x[params], %[offsetof_Params_requant]]\n"
+ "lsr x8, x7, #0x3\n"
+ "add x20, x23, %[offsetof_Requantize32_a_offset]\n"
+ "ld1r { v6.16b }, [x20]\n"
+ "ldr x22, [%x[params], %[offsetof_Params_outptrs]]\n"
+ "add x21, x23, %[offsetof_Requantize32_b_offset]\n"
+ "add x20, x23, %[offsetof_Requantize32_c_offset]\n"
+ "ld1r { v15.16b }, [x21]\n"
+ "ld1r { v13.8h }, [x20]\n"
+ "add x21, x23, %[offsetof_Requantize32_minval]\n"
+ "add x20, x23, %[offsetof_Requantize32_maxval]\n"
+ "ld1r { v17.8h }, [x21]\n"
+ "ld1r { v24.8h }, [x20]\n"
+ "mov x17, #0x0\n"
+ "mov x16, #0x0\n"
+ "add x15, %x[params], %[offsetof_Params_inptrs]\n"
+ "ldr x14, [%x[params], %[offsetof_Params_weights]]\n"
+ "ldr x13, [%x[params], %[offsetof_Params_requant_muls]]\n"
+ "ldr x12, [%x[params], %[offsetof_Params_requant_shifts]]\n"
+ "ldp x11, x10, [x22, #0x0]\n"
+ "ldp x9, x28, [x22, #0x10]\n"
+ "cbz x8, 3f\n"
+ "ldr d11, [x14, #0x0]\n"
+ "ldr d22, [x14, #0x8]\n"
+ "subs x8, x8, #0x1\n"
+ "ssubl v11.8h, v11.8b, v15.8b\n"
+ "ldr d14, [x14, #0x10]\n"
+ "ldr d28, [x14, #0x18]\n"
+ "ssubl v22.8h, v22.8b, v15.8b\n"
+ "ssubl v14.8h, v14.8b, v15.8b\n"
+ "ldr d18, [x14, #0x20]\n"
+ "ldr d9, [x14, #0x28]\n"
+ "ssubl v28.8h, v28.8b, v15.8b\n"
+ "ssubl v18.8h, v18.8b, v15.8b\n"
+ "ldr d26, [x14, #0x30]\n"
+ "ldr d7, [x14, #0x38]\n"
+ "ssubl v9.8h, v9.8b, v15.8b\n"
+ "ssubl v26.8h, v26.8b, v15.8b\n"
+ "ldr d4, [x14, #0x40]\n"
+ "ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "ssubl v7.8h, v7.8b, v15.8b\n"
+ "ssubl v4.8h, v4.8b, v15.8b\n"
+ "ldr q5, [x20, #0x0]\n"
+ "ldr q3, [x20, #0x10]\n"
+ "add x20, x20, #0x20\n"
+ "str x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldp x27, x26, [x15, #0x0]\n"
+ "ldp x25, x24, [x15, #0x10]\n"
+ "mov v21.16b, v5.16b\n"
+ "mov v8.16b, v3.16b\n"
+ "ldp x23, x22, [x15, #0x20]\n"
+ "ldp x21, x20, [x15, #0x30]\n"
+ "mov v20.16b, v5.16b\n"
+ "mov v0.16b, v3.16b\n"
+ "ldr d25, [x27, x17]\n"
+ "ldr d27, [x26, x17]\n"
+ "mov v19.16b, v5.16b\n"
+ "mov v31.16b, v3.16b\n"
+ "ldr d1, [x25, x17]\n"
+ "ldr d2, [x24, x17]\n"
+ "usubl v25.8h, v25.8b, v6.8b\n"
+ "usubl v27.8h, v27.8b, v6.8b\n"
+ "ldr d12, [x23, x17]\n"
+ "ldr d16, [x22, x17]\n"
+ "usubl v1.8h, v1.8b, v6.8b\n"
+ "usubl v2.8h, v2.8b, v6.8b\n"
+ "ldr d23, [x21, x17]\n"
+ "ldr d10, [x20, x17]\n"
+ "usubl v12.8h, v12.8b, v6.8b\n"
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "usubl v23.8h, v23.8b, v6.8b\n"
+ "usubl v10.8h, v10.8b, v6.8b\n"
+ "beq 2f\n"
+ "1:" // Loop
+ "ldr q30, [x13, #0x0]\n"
+ "ldr q29, [x12, #0x0]\n"
+ "smlal v5.4s, v25.4h, v4.4h\n"
+ "smlal2 v3.4s, v25.8h, v4.8h\n"
+ "ldr x21, [x15, #0x58]\n"
+ "ldr x20, [x15, #0x78]\n"
+ "smlal v5.4s, v27.4h, v11.4h\n"
+ "smlal v21.4s, v25.4h, v26.4h\n"
+ "ldr x25, [x15, #0x60]\n"
+ "ldr x24, [x15, #0x80]\n"
+ "smlal v20.4s, v25.4h, v14.4h\n"
+ "smlal v19.4s, v25.4h, v11.4h\n"
+ "smlal2 v3.4s, v27.8h, v11.8h\n"
+ "ldr d27, [x21, x17]\n"
+ "usubl v27.8h, v27.8b, v6.8b\n"
+ "smlal v5.4s, v1.4h, v22.4h\n"
+ "smlal2 v8.4s, v25.8h, v26.8h\n"
+ "smlal2 v0.4s, v25.8h, v14.8h\n"
+ "ldr x23, [x15, #0x68]\n"
+ "ldr x22, [x15, #0x88]\n"
+ "smlal2 v31.4s, v25.8h, v11.8h\n"
+ "ldr d25, [x20, x17]\n"
+ "usubl v25.8h, v25.8b, v6.8b\n"
+ "smlal v21.4s, v2.4h, v22.4h\n"
+ "smlal v20.4s, v27.4h, v28.4h\n"
+ "smlal v19.4s, v25.4h, v18.4h\n"
+ "ldr x21, [x15, #0x40]\n"
+ "ldr x20, [x15, #0x70]\n"
+ "smlal2 v3.4s, v1.8h, v22.8h\n"
+ "ldr d1, [x25, x17]\n"
+ "usubl v1.8h, v1.8b, v6.8b\n"
+ "smlal v5.4s, v16.4h, v28.4h\n"
+ "smlal2 v8.4s, v2.8h, v22.8h\n"
+ "ldr d2, [x24, x17]\n"
+ "usubl v2.8h, v2.8b, v6.8b\n"
+ "smlal2 v0.4s, v27.8h, v28.8h\n"
+ "ldr d27, [x23, x17]\n"
+ "smlal2 v31.4s, v25.8h, v18.8h\n"
+ "ldr d25, [x22, x17]\n"
+ "smlal v21.4s, v12.4h, v14.4h\n"
+ "ldr x25, [x15, #0x98]\n"
+ "smlal v20.4s, v1.4h, v11.4h\n"
+ "smlal v19.4s, v2.4h, v22.4h\n"
+ "ldr x24, [x15, #0x50]\n"
+ "smlal2 v3.4s, v16.8h, v28.8h\n"
+ "ldr d16, [x21, x17]\n"
+ "usubl v27.8h, v27.8b, v6.8b\n"
+ "smlal v5.4s, v23.4h, v18.4h\n"
+ "usubl v25.8h, v25.8b, v6.8b\n"
+ "smlal2 v8.4s, v12.8h, v14.8h\n"
+ "ldr d12, [x20, x17]\n"
+ "ldr x23, [x15, #0x48]\n"
+ "smlal2 v0.4s, v1.8h, v11.8h\n"
+ "smlal2 v31.4s, v2.8h, v22.8h\n"
+ "ldr x21, [x15, #0x90]\n"
+ "ldr x20, [x15, #0xa8]\n"
+ "smlal v21.4s, v10.4h, v11.4h\n"
+ "smlal v20.4s, v27.4h, v18.4h\n"
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x22, [x15, #0xa0]\n"
+ "smlal v19.4s, v25.4h, v9.4h\n"
+ "smlal2 v3.4s, v23.8h, v18.8h\n"
+ "ldr d23, [x25, x17]\n"
+ "usubl v12.8h, v12.8b, v6.8b\n"
+ "usubl v23.8h, v23.8b, v6.8b\n"
+ "smlal v5.4s, v10.4h, v14.4h\n"
+ "smlal2 v8.4s, v10.8h, v11.8h\n"
+ "ldr d11, [x24, x17]\n"
+ "usubl v11.8h, v11.8b, v6.8b\n"
+ "smlal2 v0.4s, v27.8h, v18.8h\n"
+ "ldr d27, [x23, x17]\n"
+ "smlal2 v31.4s, v25.8h, v9.8h\n"
+ "ldr d25, [x21, x17]\n"
+ "ldr x21, [x15, #0xb0]\n"
+ "smlal v21.4s, v16.4h, v18.4h\n"
+ "smlal v20.4s, v12.4h, v22.4h\n"
+ "smlal v19.4s, v23.4h, v14.4h\n"
+ "smlal2 v3.4s, v10.8h, v14.8h\n"
+ "ldr d10, [x20, x17]\n"
+ "usubl v27.8h, v27.8b, v6.8b\n"
+ "usubl v25.8h, v25.8b, v6.8b\n"
+ "usubl v10.8h, v10.8b, v6.8b\n"
+ "smlal v5.4s, v11.4h, v9.4h\n"
+ "ldr x20, [x15, #0xb8]\n"
+ "smlal2 v8.4s, v16.8h, v18.8h\n"
+ "ldr d18, [x22, x17]\n"
+ "ldr d16, [x21, x17]\n"
+ "smlal2 v0.4s, v12.8h, v22.8h\n"
+ "ldr d22, [x20, x17]\n"
+ "smlal2 v31.4s, v23.8h, v14.8h\n"
+ "ldr q14, [x13, #0x10]\n"
+ "smlal v21.4s, v27.4h, v9.4h\n"
+ "smlal v20.4s, v25.4h, v26.4h\n"
+ "smlal v19.4s, v10.4h, v28.4h\n"
+ "usubl v18.8h, v18.8b, v6.8b\n"
+ "ldr x21, [x15, #0xc0]\n"
+ "smlal2 v3.4s, v11.8h, v9.8h\n"
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "smlal v5.4s, v1.4h, v26.4h\n"
+ "ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "smlal2 v8.4s, v27.8h, v9.8h\n"
+ "ldr d27, [x21, x17]\n"
+ "smlal2 v0.4s, v25.8h, v26.8h\n"
+ "ldr q25, [x12, #0x10]\n"
+ "smlal2 v31.4s, v10.8h, v28.8h\n"
+ "smlal v21.4s, v11.4h, v28.4h\n"
+ "usubl v22.8h, v22.8b, v6.8b\n"
+ "add x14, x14, #0x48\n"
+ "smlal v20.4s, v18.4h, v7.4h\n"
+ "smlal v19.4s, v16.4h, v7.4h\n"
+ "usubl v27.8h, v27.8b, v6.8b\n"
+ "add x17, x17, #0x8\n"
+ "smlal2 v3.4s, v1.8h, v26.8h\n"
+ "smlal v5.4s, v12.4h, v7.4h\n"
+ "sqrdmulh v5.4s, v5.4s, v30.4s\n"
+ "subs x8, x8, #0x1\n"
+ "smlal2 v8.4s, v11.8h, v28.8h\n"
+ "smlal2 v0.4s, v18.8h, v7.8h\n"
+ "and v28.16b, v5.16b, v29.16b\n"
+ "add x13, x13, #0x20\n"
+ "smlal2 v31.4s, v16.8h, v7.8h\n"
+ "smlal v21.4s, v2.4h, v7.4h\n"
+ "sshr v28.4s, v28.4s, #0x1f\n"
+ "add x12, x12, #0x20\n"
+ "smlal v20.4s, v10.4h, v9.4h\n"
+ "smlal v19.4s, v22.4h, v26.4h\n"
+ "sqadd v5.4s, v5.4s, v28.4s\n"
+ "smlal2 v3.4s, v12.8h, v7.8h\n"
+ "smlal2 v8.4s, v2.8h, v7.8h\n"
+ "sqrdmulh v3.4s, v3.4s, v14.4s\n"
+ "smlal2 v0.4s, v10.8h, v9.8h\n"
+ "smlal2 v31.4s, v22.8h, v26.8h\n"
+ "and v16.16b, v3.16b, v25.16b\n"
+ "smlal v21.4s, v23.4h, v4.4h\n"
+ "smlal v20.4s, v22.4h, v4.4h\n"
+ "sqrdmulh v21.4s, v21.4s, v30.4s\n"
+ "smlal v19.4s, v27.4h, v4.4h\n"
+ "smlal2 v8.4s, v23.8h, v4.8h\n"
+ "sqrdmulh v20.4s, v20.4s, v30.4s\n"
+ "smlal2 v0.4s, v22.8h, v4.8h\n"
+ "smlal2 v31.4s, v27.8h, v4.8h\n"
+ "sqrdmulh v19.4s, v19.4s, v30.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "and v12.16b, v21.16b, v29.16b\n"
+ "sqrdmulh v8.4s, v8.4s, v14.4s\n"
+ "and v23.16b, v20.16b, v29.16b\n"
+ "sqrdmulh v0.4s, v0.4s, v14.4s\n"
+ "and v9.16b, v19.16b, v29.16b\n"
+ "sqrdmulh v31.4s, v31.4s, v14.4s\n"
+ "sqadd v3.4s, v3.4s, v16.4s\n"
+ "sshr v12.4s, v12.4s, #0x1f\n"
+ "and v18.16b, v8.16b, v25.16b\n"
+ "sshr v23.4s, v23.4s, #0x1f\n"
+ "and v22.16b, v0.16b, v25.16b\n"
+ "sshr v9.4s, v9.4s, #0x1f\n"
+ "and v16.16b, v31.16b, v25.16b\n"
+ "sqadd v21.4s, v21.4s, v12.4s\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sqadd v20.4s, v20.4s, v23.4s\n"
+ "sshr v22.4s, v22.4s, #0x1f\n"
+ "sqadd v19.4s, v19.4s, v9.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "srshl v5.4s, v5.4s, v29.4s\n"
+ "srshl v21.4s, v21.4s, v29.4s\n"
+ "sqadd v8.4s, v8.4s, v18.4s\n"
+ "srshl v20.4s, v20.4s, v29.4s\n"
+ "sqadd v0.4s, v0.4s, v22.4s\n"
+ "srshl v19.4s, v19.4s, v29.4s\n"
+ "sqadd v31.4s, v31.4s, v16.4s\n"
+ "srshl v3.4s, v3.4s, v25.4s\n"
+ "sqxtn v5.4h, v5.4s\n"
+ "srshl v8.4s, v8.4s, v25.4s\n"
+ "sqxtn v21.4h, v21.4s\n"
+ "srshl v0.4s, v0.4s, v25.4s\n"
+ "sqxtn v20.4h, v20.4s\n"
+ "srshl v31.4s, v31.4s, v25.4s\n"
+ "sqxtn v19.4h, v19.4s\n"
+ "sqxtn2 v5.8h, v3.4s\n"
+ "sqxtn2 v21.8h, v8.4s\n"
+ "sqxtn2 v20.8h, v0.4s\n"
+ "sqxtn2 v19.8h, v31.4s\n"
+ "sqadd v5.8h, v5.8h, v13.8h\n"
+ "sqadd v21.8h, v21.8h, v13.8h\n"
+ "sqadd v20.8h, v20.8h, v13.8h\n"
+ "sqadd v19.8h, v19.8h, v13.8h\n"
+ "smax v5.8h, v5.8h, v17.8h\n"
+ "smax v21.8h, v21.8h, v17.8h\n"
+ "smax v20.8h, v20.8h, v17.8h\n"
+ "smax v19.8h, v19.8h, v17.8h\n"
+ "smin v5.8h, v5.8h, v24.8h\n"
+ "smin v21.8h, v21.8h, v24.8h\n"
+ "smin v20.8h, v20.8h, v24.8h\n"
+ "smin v19.8h, v19.8h, v24.8h\n"
+ "uzp1 v5.16b, v5.16b, v5.16b\n"
+ "str d5, [x11, x16]\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "str d21, [x10, x16]\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "str d20, [x9, x16]\n"
+ "str d19, [x28, x16]\n"
+ "ldr q5, [x20, #0x0]\n"
+ "ldr q3, [x20, #0x10]\n"
+ "add x20, x20, #0x20\n"
+ "ldr d11, [x14, #0x0]\n"
+ "ldr d22, [x14, #0x8]\n"
+ "add x16, x16, #0x8\n"
+ "str x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldr d14, [x14, #0x10]\n"
+ "ldr d28, [x14, #0x18]\n"
+ "mov v21.16b, v5.16b\n"
+ "mov v8.16b, v3.16b\n"
+ "ldr d18, [x14, #0x20]\n"
+ "ldr d9, [x14, #0x28]\n"
+ "mov v20.16b, v5.16b\n"
+ "mov v0.16b, v3.16b\n"
+ "ldr d26, [x14, #0x30]\n"
+ "ldr d7, [x14, #0x38]\n"
+ "mov v19.16b, v5.16b\n"
+ "mov v31.16b, v3.16b\n"
+ "ldr d4, [x14, #0x40]\n"
+ "ldp x27, x26, [x15, #0x0]\n"
+ "ssubl v11.8h, v11.8b, v15.8b\n"
+ "ssubl v22.8h, v22.8b, v15.8b\n"
+ "ldp x25, x24, [x15, #0x10]\n"
+ "ldp x23, x22, [x15, #0x20]\n"
+ "ssubl v14.8h, v14.8b, v15.8b\n"
+ "ssubl v28.8h, v28.8b, v15.8b\n"
+ "ldp x21, x20, [x15, #0x30]\n"
+ "ldr d25, [x27, x17]\n"
+ "ssubl v18.8h, v18.8b, v15.8b\n"
+ "ssubl v9.8h, v9.8b, v15.8b\n"
+ "ldr d27, [x26, x17]\n"
+ "ldr d1, [x25, x17]\n"
+ "ssubl v26.8h, v26.8b, v15.8b\n"
+ "ssubl v7.8h, v7.8b, v15.8b\n"
+ "ldr d2, [x24, x17]\n"
+ "ldr d12, [x23, x17]\n"
+ "ssubl v4.8h, v4.8b, v15.8b\n"
+ "usubl v25.8h, v25.8b, v6.8b\n"
+ "ldr d16, [x22, x17]\n"
+ "ldr d23, [x21, x17]\n"
+ "usubl v27.8h, v27.8b, v6.8b\n"
+ "usubl v1.8h, v1.8b, v6.8b\n"
+ "ldr d10, [x20, x17]\n"
+ "usubl v2.8h, v2.8b, v6.8b\n"
+ "usubl v12.8h, v12.8b, v6.8b\n"
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "usubl v23.8h, v23.8b, v6.8b\n"
+ "usubl v10.8h, v10.8b, v6.8b\n"
+ "bgt 1b\n"
+ "2:" // Tail
+ "ldr q29, [x13, #0x0]\n"
+ "ldr q30, [x12, #0x0]\n"
+ "smlal v5.4s, v25.4h, v4.4h\n"
+ "smlal2 v3.4s, v25.8h, v4.8h\n"
+ "ldr x21, [x15, #0x58]\n"
+ "ldr x20, [x15, #0x78]\n"
+ "smlal v5.4s, v27.4h, v11.4h\n"
+ "smlal v21.4s, v25.4h, v26.4h\n"
+ "ldr x25, [x15, #0x60]\n"
+ "ldr x24, [x15, #0x80]\n"
+ "smlal v20.4s, v25.4h, v14.4h\n"
+ "smlal v19.4s, v25.4h, v11.4h\n"
+ "smlal2 v3.4s, v27.8h, v11.8h\n"
+ "ldr d27, [x21, x17]\n"
+ "usubl v27.8h, v27.8b, v6.8b\n"
+ "smlal v5.4s, v1.4h, v22.4h\n"
+ "smlal2 v8.4s, v25.8h, v26.8h\n"
+ "smlal2 v0.4s, v25.8h, v14.8h\n"
+ "ldr x23, [x15, #0x68]\n"
+ "ldr x22, [x15, #0x88]\n"
+ "smlal2 v31.4s, v25.8h, v11.8h\n"
+ "ldr d25, [x20, x17]\n"
+ "usubl v25.8h, v25.8b, v6.8b\n"
+ "smlal v21.4s, v2.4h, v22.4h\n"
+ "smlal v20.4s, v27.4h, v28.4h\n"
+ "smlal v19.4s, v25.4h, v18.4h\n"
+ "ldr x21, [x15, #0x40]\n"
+ "ldr x20, [x15, #0x70]\n"
+ "smlal2 v3.4s, v1.8h, v22.8h\n"
+ "ldr d1, [x25, x17]\n"
+ "usubl v1.8h, v1.8b, v6.8b\n"
+ "smlal v5.4s, v16.4h, v28.4h\n"
+ "smlal2 v8.4s, v2.8h, v22.8h\n"
+ "ldr d2, [x24, x17]\n"
+ "usubl v2.8h, v2.8b, v6.8b\n"
+ "smlal2 v0.4s, v27.8h, v28.8h\n"
+ "ldr d27, [x23, x17]\n"
+ "smlal2 v31.4s, v25.8h, v18.8h\n"
+ "ldr d25, [x22, x17]\n"
+ "smlal v21.4s, v12.4h, v14.4h\n"
+ "ldr x25, [x15, #0x98]\n"
+ "smlal v20.4s, v1.4h, v11.4h\n"
+ "smlal v19.4s, v2.4h, v22.4h\n"
+ "ldr x24, [x15, #0x50]\n"
+ "smlal2 v3.4s, v16.8h, v28.8h\n"
+ "ldr d16, [x21, x17]\n"
+ "usubl v27.8h, v27.8b, v6.8b\n"
+ "smlal v5.4s, v23.4h, v18.4h\n"
+ "usubl v25.8h, v25.8b, v6.8b\n"
+ "smlal2 v8.4s, v12.8h, v14.8h\n"
+ "ldr d12, [x20, x17]\n"
+ "ldr x23, [x15, #0x48]\n"
+ "smlal2 v0.4s, v1.8h, v11.8h\n"
+ "smlal2 v31.4s, v2.8h, v22.8h\n"
+ "ldr x21, [x15, #0x90]\n"
+ "ldr x20, [x15, #0xa8]\n"
+ "smlal v21.4s, v10.4h, v11.4h\n"
+ "smlal v20.4s, v27.4h, v18.4h\n"
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x22, [x15, #0xa0]\n"
+ "smlal v19.4s, v25.4h, v9.4h\n"
+ "smlal2 v3.4s, v23.8h, v18.8h\n"
+ "ldr d23, [x25, x17]\n"
+ "usubl v12.8h, v12.8b, v6.8b\n"
+ "usubl v23.8h, v23.8b, v6.8b\n"
+ "smlal v5.4s, v10.4h, v14.4h\n"
+ "smlal2 v8.4s, v10.8h, v11.8h\n"
+ "ldr d11, [x24, x17]\n"
+ "usubl v11.8h, v11.8b, v6.8b\n"
+ "smlal2 v0.4s, v27.8h, v18.8h\n"
+ "ldr d27, [x23, x17]\n"
+ "smlal2 v31.4s, v25.8h, v9.8h\n"
+ "ldr d25, [x21, x17]\n"
+ "ldr x21, [x15, #0xb0]\n"
+ "smlal v21.4s, v16.4h, v18.4h\n"
+ "smlal v20.4s, v12.4h, v22.4h\n"
+ "smlal v19.4s, v23.4h, v14.4h\n"
+ "smlal2 v3.4s, v10.8h, v14.8h\n"
+ "ldr d10, [x20, x17]\n"
+ "usubl v27.8h, v27.8b, v6.8b\n"
+ "usubl v25.8h, v25.8b, v6.8b\n"
+ "usubl v10.8h, v10.8b, v6.8b\n"
+ "smlal v5.4s, v11.4h, v9.4h\n"
+ "ldr x20, [x15, #0xb8]\n"
+ "smlal2 v8.4s, v16.8h, v18.8h\n"
+ "ldr d16, [x22, x17]\n"
+ "ldr d18, [x21, x17]\n"
+ "smlal2 v0.4s, v12.8h, v22.8h\n"
+ "ldr d22, [x20, x17]\n"
+ "smlal2 v31.4s, v23.8h, v14.8h\n"
+ "ldr q14, [x13, #0x10]\n"
+ "smlal v21.4s, v27.4h, v9.4h\n"
+ "smlal v20.4s, v25.4h, v26.4h\n"
+ "smlal v19.4s, v10.4h, v28.4h\n"
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x20, [x15, #0xc0]\n"
+ "smlal2 v3.4s, v11.8h, v9.8h\n"
+ "usubl v18.8h, v18.8b, v6.8b\n"
+ "smlal v5.4s, v1.4h, v26.4h\n"
+ "tst x7, #0x7\n"
+ "smlal2 v8.4s, v27.8h, v9.8h\n"
+ "ldr d27, [x20, x17]\n"
+ "smlal2 v0.4s, v25.8h, v26.8h\n"
+ "ldr q25, [x12, #0x10]\n"
+ "smlal2 v31.4s, v10.8h, v28.8h\n"
+ "smlal v21.4s, v11.4h, v28.4h\n"
+ "usubl v22.8h, v22.8b, v6.8b\n"
+ "add x17, x17, #0x8\n"
+ "smlal v20.4s, v16.4h, v7.4h\n"
+ "smlal v19.4s, v18.4h, v7.4h\n"
+ "usubl v27.8h, v27.8b, v6.8b\n"
+ "add x13, x13, #0x20\n"
+ "smlal2 v3.4s, v1.8h, v26.8h\n"
+ "smlal v5.4s, v12.4h, v7.4h\n"
+ "sqrdmulh v5.4s, v5.4s, v29.4s\n"
+ "add x12, x12, #0x20\n"
+ "smlal2 v8.4s, v11.8h, v28.8h\n"
+ "smlal2 v0.4s, v16.8h, v7.8h\n"
+ "and v16.16b, v5.16b, v30.16b\n"
+ "smlal2 v31.4s, v18.8h, v7.8h\n"
+ "smlal v21.4s, v2.4h, v7.4h\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "smlal v20.4s, v10.4h, v9.4h\n"
+ "smlal v19.4s, v22.4h, v26.4h\n"
+ "sqadd v5.4s, v5.4s, v16.4s\n"
+ "smlal2 v3.4s, v12.8h, v7.8h\n"
+ "smlal2 v8.4s, v2.8h, v7.8h\n"
+ "sqrdmulh v3.4s, v3.4s, v14.4s\n"
+ "smlal2 v0.4s, v10.8h, v9.8h\n"
+ "smlal2 v31.4s, v22.8h, v26.8h\n"
+ "and v16.16b, v3.16b, v25.16b\n"
+ "smlal v21.4s, v23.4h, v4.4h\n"
+ "smlal v20.4s, v22.4h, v4.4h\n"
+ "sqrdmulh v21.4s, v21.4s, v29.4s\n"
+ "smlal v19.4s, v27.4h, v4.4h\n"
+ "smlal2 v8.4s, v23.8h, v4.8h\n"
+ "sqrdmulh v20.4s, v20.4s, v29.4s\n"
+ "smlal2 v0.4s, v22.8h, v4.8h\n"
+ "smlal2 v31.4s, v27.8h, v4.8h\n"
+ "sqrdmulh v19.4s, v19.4s, v29.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "and v23.16b, v21.16b, v30.16b\n"
+ "sqrdmulh v8.4s, v8.4s, v14.4s\n"
+ "and v27.16b, v20.16b, v30.16b\n"
+ "sqrdmulh v0.4s, v0.4s, v14.4s\n"
+ "and v22.16b, v19.16b, v30.16b\n"
+ "sqrdmulh v31.4s, v31.4s, v14.4s\n"
+ "sqadd v3.4s, v3.4s, v16.4s\n"
+ "sshr v23.4s, v23.4s, #0x1f\n"
+ "and v14.16b, v8.16b, v25.16b\n"
+ "sshr v27.4s, v27.4s, #0x1f\n"
+ "and v18.16b, v0.16b, v25.16b\n"
+ "sshr v22.4s, v22.4s, #0x1f\n"
+ "and v16.16b, v31.16b, v25.16b\n"
+ "sqadd v21.4s, v21.4s, v23.4s\n"
+ "sshr v14.4s, v14.4s, #0x1f\n"
+ "sqadd v20.4s, v20.4s, v27.4s\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sqadd v19.4s, v19.4s, v22.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "srshl v5.4s, v5.4s, v30.4s\n"
+ "srshl v21.4s, v21.4s, v30.4s\n"
+ "sqadd v8.4s, v8.4s, v14.4s\n"
+ "srshl v20.4s, v20.4s, v30.4s\n"
+ "sqadd v0.4s, v0.4s, v18.4s\n"
+ "srshl v19.4s, v19.4s, v30.4s\n"
+ "sqadd v31.4s, v31.4s, v16.4s\n"
+ "srshl v3.4s, v3.4s, v25.4s\n"
+ "sqxtn v5.4h, v5.4s\n"
+ "srshl v8.4s, v8.4s, v25.4s\n"
+ "sqxtn v21.4h, v21.4s\n"
+ "srshl v0.4s, v0.4s, v25.4s\n"
+ "sqxtn v20.4h, v20.4s\n"
+ "srshl v31.4s, v31.4s, v25.4s\n"
+ "sqxtn v19.4h, v19.4s\n"
+ "sqxtn2 v5.8h, v3.4s\n"
+ "sqxtn2 v21.8h, v8.4s\n"
+ "sqxtn2 v20.8h, v0.4s\n"
+ "sqxtn2 v19.8h, v31.4s\n"
+ "sqadd v5.8h, v5.8h, v13.8h\n"
+ "sqadd v21.8h, v21.8h, v13.8h\n"
+ "sqadd v20.8h, v20.8h, v13.8h\n"
+ "sqadd v19.8h, v19.8h, v13.8h\n"
+ "smax v5.8h, v5.8h, v17.8h\n"
+ "smax v21.8h, v21.8h, v17.8h\n"
+ "smax v20.8h, v20.8h, v17.8h\n"
+ "smax v19.8h, v19.8h, v17.8h\n"
+ "smin v5.8h, v5.8h, v24.8h\n"
+ "smin v21.8h, v21.8h, v24.8h\n"
+ "smin v20.8h, v20.8h, v24.8h\n"
+ "smin v19.8h, v19.8h, v24.8h\n"
+ "uzp1 v5.16b, v5.16b, v5.16b\n"
+ "str d5, [x11, x16]\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "str d21, [x10, x16]\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "str d20, [x9, x16]\n"
+ "str d19, [x28, x16]\n"
+ "add x16, x16, #0x8\n"
+ "beq 88f\n"
+ "add x14, x14, #0x48\n"
+ "3:" // Oddments
+ "ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "tbz x7, #2, 5f\n"
+ "ld1 { v5.4s }, [x20], #0x10\n"
+ "tbz x7, #1, 4f\n"
+ "ld1 { v3.d }[0], [x20], #0x8\n"
+ "tbz x7, #0, 7f\n"
+ "ld1 { v3.s }[2], [x20]\n"
+ "b 7f\n"
+ "4:" // Oddments: Load bias: Bit 2: Bit 1: Unset
+ "tbz x7, #0, 7f\n"
+ "ld1 { v3.s }[0], [x20]\n"
+ "b 7f\n"
+ "5:" // Oddments: Load bias: Bit 2: Unset
+ "tbz x7, #1, 6f\n"
+ "ld1 { v5.d }[0], [x20], #0x8\n"
+ "tbz x7, #0, 7f\n"
+ "ld1 { v5.s }[2], [x20]\n"
+ "b 7f\n"
+ "6:" // Oddments: Load bias: Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 7f\n"
+ "ld1 { v5.s }[0], [x20]\n"
+ "7:" // Oddments: Load bias: Bit 2: End
+ "ldr d11, [x14, #0x0]\n"
+ "ldr d22, [x14, #0x8]\n"
+ "mov v21.16b, v5.16b\n"
+ "mov v8.16b, v3.16b\n"
+ "ldr d14, [x14, #0x10]\n"
+ "ldr d28, [x14, #0x18]\n"
+ "mov v20.16b, v5.16b\n"
+ "mov v0.16b, v3.16b\n"
+ "ldr d18, [x14, #0x20]\n"
+ "ldr d9, [x14, #0x28]\n"
+ "mov v19.16b, v5.16b\n"
+ "mov v31.16b, v3.16b\n"
+ "ldr d26, [x14, #0x30]\n"
+ "ldr d7, [x14, #0x38]\n"
+ "ssubl v11.8h, v11.8b, v15.8b\n"
+ "ssubl v22.8h, v22.8b, v15.8b\n"
+ "ldr d4, [x14, #0x40]\n"
+ "ldp x27, x26, [x15, #0x0]\n"
+ "ssubl v14.8h, v14.8b, v15.8b\n"
+ "ssubl v28.8h, v28.8b, v15.8b\n"
+ "ldp x25, x24, [x15, #0x10]\n"
+ "ldp x23, x22, [x15, #0x20]\n"
+ "ssubl v18.8h, v18.8b, v15.8b\n"
+ "ssubl v9.8h, v9.8b, v15.8b\n"
+ "ldp x21, x20, [x15, #0x30]\n"
+ "ssubl v26.8h, v26.8b, v15.8b\n"
+ "ssubl v7.8h, v7.8b, v15.8b\n"
+ "ssubl v4.8h, v4.8b, v15.8b\n"
+ "add x27, x27, x17\n"
+ "add x26, x26, x17\n"
+ "add x25, x25, x17\n"
+ "add x24, x24, x17\n"
+ "add x23, x23, x17\n"
+ "add x22, x22, x17\n"
+ "add x21, x21, x17\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 9f\n"
+ "ld1 { v25.s }[0], [x27], #0x4\n"
+ "ld1 { v27.s }[0], [x26], #0x4\n"
+ "ld1 { v1.s }[0], [x25], #0x4\n"
+ "ld1 { v2.s }[0], [x24], #0x4\n"
+ "ld1 { v12.s }[0], [x23], #0x4\n"
+ "ld1 { v16.s }[0], [x22], #0x4\n"
+ "ld1 { v23.s }[0], [x21], #0x4\n"
+ "ld1 { v10.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 8f\n"
+ "ld1 { v25.h }[2], [x27], #0x2\n"
+ "ld1 { v27.h }[2], [x26], #0x2\n"
+ "ld1 { v1.h }[2], [x25], #0x2\n"
+ "ld1 { v2.h }[2], [x24], #0x2\n"
+ "ld1 { v12.h }[2], [x23], #0x2\n"
+ "ld1 { v16.h }[2], [x22], #0x2\n"
+ "ld1 { v23.h }[2], [x21], #0x2\n"
+ "ld1 { v10.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 11f\n"
+ "ld1 { v25.b }[6], [x27]\n"
+ "ld1 { v27.b }[6], [x26]\n"
+ "ld1 { v1.b }[6], [x25]\n"
+ "ld1 { v2.b }[6], [x24]\n"
+ "ld1 { v12.b }[6], [x23]\n"
+ "ld1 { v16.b }[6], [x22]\n"
+ "ld1 { v23.b }[6], [x21]\n"
+ "ld1 { v10.b }[6], [x20]\n"
+ "b 11f\n"
+ "8:" // Oddments: Initial loads: Bit 2: Bit 1: Unset
+ "tbz x7, #0, 11f\n"
+ "ld1 { v25.b }[4], [x27]\n"
+ "ld1 { v27.b }[4], [x26]\n"
+ "ld1 { v1.b }[4], [x25]\n"
+ "ld1 { v2.b }[4], [x24]\n"
+ "ld1 { v12.b }[4], [x23]\n"
+ "ld1 { v16.b }[4], [x22]\n"
+ "ld1 { v23.b }[4], [x21]\n"
+ "ld1 { v10.b }[4], [x20]\n"
+ "b 11f\n"
+ "9:" // Oddments: Initial loads: Bit 2: Unset
+ "tbz x7, #1, 10f\n"
+ "ld1 { v25.h }[0], [x27], #0x2\n"
+ "ld1 { v27.h }[0], [x26], #0x2\n"
+ "ld1 { v1.h }[0], [x25], #0x2\n"
+ "ld1 { v2.h }[0], [x24], #0x2\n"
+ "ld1 { v12.h }[0], [x23], #0x2\n"
+ "ld1 { v16.h }[0], [x22], #0x2\n"
+ "ld1 { v23.h }[0], [x21], #0x2\n"
+ "ld1 { v10.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 11f\n"
+ "ld1 { v25.b }[2], [x27]\n"
+ "ld1 { v27.b }[2], [x26]\n"
+ "ld1 { v1.b }[2], [x25]\n"
+ "ld1 { v2.b }[2], [x24]\n"
+ "ld1 { v12.b }[2], [x23]\n"
+ "ld1 { v16.b }[2], [x22]\n"
+ "ld1 { v23.b }[2], [x21]\n"
+ "ld1 { v10.b }[2], [x20]\n"
+ "b 11f\n"
+ "10:" // Oddments: Initial loads: Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 11f\n"
+ "ld1 { v25.b }[0], [x27]\n"
+ "ld1 { v27.b }[0], [x26]\n"
+ "ld1 { v1.b }[0], [x25]\n"
+ "ld1 { v2.b }[0], [x24]\n"
+ "ld1 { v12.b }[0], [x23]\n"
+ "ld1 { v16.b }[0], [x22]\n"
+ "ld1 { v23.b }[0], [x21]\n"
+ "ld1 { v10.b }[0], [x20]\n"
+ "11:" // Oddments: Initial loads: Bit 2: End
+ "usubl v25.8h, v25.8b, v6.8b\n"
+ "smlal v5.4s, v25.4h, v4.4h\n"
+ "smlal2 v3.4s, v25.8h, v4.8h\n"
+ "ldr x20, [x15, #0x40]\n"
+ "usubl v27.8h, v27.8b, v6.8b\n"
+ "smlal v5.4s, v27.4h, v11.4h\n"
+ "smlal2 v3.4s, v27.8h, v11.8h\n"
+ "usubl v1.8h, v1.8b, v6.8b\n"
+ "smlal v21.4s, v25.4h, v26.4h\n"
+ "smlal2 v8.4s, v25.8h, v26.8h\n"
+ "add x20, x20, x17\n"
+ "smlal v5.4s, v1.4h, v22.4h\n"
+ "smlal2 v3.4s, v1.8h, v22.8h\n"
+ "usubl v2.8h, v2.8b, v6.8b\n"
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "smlal v21.4s, v2.4h, v22.4h\n"
+ "smlal2 v8.4s, v2.8h, v22.8h\n"
+ "smlal v5.4s, v16.4h, v28.4h\n"
+ "smlal2 v3.4s, v16.8h, v28.8h\n"
+ "usubl v12.8h, v12.8b, v6.8b\n"
+ "usubl v23.8h, v23.8b, v6.8b\n"
+ "smlal v21.4s, v12.4h, v14.4h\n"
+ "smlal2 v8.4s, v12.8h, v14.8h\n"
+ "smlal v5.4s, v23.4h, v18.4h\n"
+ "smlal2 v3.4s, v23.8h, v18.8h\n"
+ "usubl v10.8h, v10.8b, v6.8b\n"
+ "smlal v20.4s, v25.4h, v14.4h\n"
+ "smlal2 v0.4s, v25.8h, v14.8h\n"
+ "smlal v19.4s, v25.4h, v11.4h\n"
+ "smlal2 v31.4s, v25.8h, v11.8h\n"
+ "smlal v5.4s, v10.4h, v14.4h\n"
+ "smlal2 v3.4s, v10.8h, v14.8h\n"
+ "smlal v21.4s, v10.4h, v11.4h\n"
+ "smlal2 v8.4s, v10.8h, v11.8h\n"
+ "tbz x7, #2, 13f\n"
+ "ld1 { v15.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 12f\n"
+ "ld1 { v15.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 15f\n"
+ "ld1 { v15.b }[6], [x20]\n"
+ "b 15f\n"
+ "12:" // Oddments: Load (1, 3): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 15f\n"
+ "ld1 { v15.b }[4], [x20]\n"
+ "b 15f\n"
+ "13:" // Oddments: Load (1, 3): Bit 2: Unset
+ "tbz x7, #1, 14f\n"
+ "ld1 { v15.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 15f\n"
+ "ld1 { v15.b }[2], [x20]\n"
+ "b 15f\n"
+ "14:" // Oddments: Load (1, 3): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 15f\n"
+ "ld1 { v15.b }[0], [x20]\n"
+ "15:" // Oddments: Load (1, 3): Bit 2: End
+ "usubl v15.8h, v15.8b, v6.8b\n"
+ "ldr x20, [x15, #0x48]\n"
+ "smlal v21.4s, v15.4h, v18.4h\n"
+ "smlal2 v8.4s, v15.8h, v18.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 17f\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 16f\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 19f\n"
+ "ld1 { v16.b }[6], [x20]\n"
+ "b 19f\n"
+ "16:" // Oddments: Load (1, 4): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 19f\n"
+ "ld1 { v16.b }[4], [x20]\n"
+ "b 19f\n"
+ "17:" // Oddments: Load (1, 4): Bit 2: Unset
+ "tbz x7, #1, 18f\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 19f\n"
+ "ld1 { v16.b }[2], [x20]\n"
+ "b 19f\n"
+ "18:" // Oddments: Load (1, 4): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 19f\n"
+ "ld1 { v16.b }[0], [x20]\n"
+ "19:" // Oddments: Load (1, 4): Bit 2: End
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x20, [x15, #0x50]\n"
+ "smlal v21.4s, v16.4h, v9.4h\n"
+ "smlal2 v8.4s, v16.8h, v9.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 21f\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 20f\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 23f\n"
+ "ld1 { v16.b }[6], [x20]\n"
+ "b 23f\n"
+ "20:" // Oddments: Load (1, 2): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 23f\n"
+ "ld1 { v16.b }[4], [x20]\n"
+ "b 23f\n"
+ "21:" // Oddments: Load (1, 2): Bit 2: Unset
+ "tbz x7, #1, 22f\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 23f\n"
+ "ld1 { v16.b }[2], [x20]\n"
+ "b 23f\n"
+ "22:" // Oddments: Load (1, 2): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 23f\n"
+ "ld1 { v16.b }[0], [x20]\n"
+ "23:" // Oddments: Load (1, 2): Bit 2: End
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x20, [x15, #0x58]\n"
+ "smlal v5.4s, v16.4h, v9.4h\n"
+ "smlal2 v3.4s, v16.8h, v9.8h\n"
+ "smlal v21.4s, v16.4h, v28.4h\n"
+ "smlal2 v8.4s, v16.8h, v28.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 25f\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 24f\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 27f\n"
+ "ld1 { v16.b }[6], [x20]\n"
+ "b 27f\n"
+ "24:" // Oddments: Load (3, 0): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 27f\n"
+ "ld1 { v16.b }[4], [x20]\n"
+ "b 27f\n"
+ "25:" // Oddments: Load (3, 0): Bit 2: Unset
+ "tbz x7, #1, 26f\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 27f\n"
+ "ld1 { v16.b }[2], [x20]\n"
+ "b 27f\n"
+ "26:" // Oddments: Load (3, 0): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 27f\n"
+ "ld1 { v16.b }[0], [x20]\n"
+ "27:" // Oddments: Load (3, 0): Bit 2: End
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x20, [x15, #0x60]\n"
+ "smlal v20.4s, v16.4h, v28.4h\n"
+ "smlal2 v0.4s, v16.8h, v28.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 29f\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 28f\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 31f\n"
+ "ld1 { v16.b }[6], [x20]\n"
+ "b 31f\n"
+ "28:" // Oddments: Load (2, 0): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 31f\n"
+ "ld1 { v16.b }[4], [x20]\n"
+ "b 31f\n"
+ "29:" // Oddments: Load (2, 0): Bit 2: Unset
+ "tbz x7, #1, 30f\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 31f\n"
+ "ld1 { v16.b }[2], [x20]\n"
+ "b 31f\n"
+ "30:" // Oddments: Load (2, 0): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 31f\n"
+ "ld1 { v16.b }[0], [x20]\n"
+ "31:" // Oddments: Load (2, 0): Bit 2: End
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x20, [x15, #0x68]\n"
+ "smlal v5.4s, v16.4h, v26.4h\n"
+ "smlal2 v3.4s, v16.8h, v26.8h\n"
+ "smlal v20.4s, v16.4h, v11.4h\n"
+ "smlal2 v0.4s, v16.8h, v11.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 33f\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 32f\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 35f\n"
+ "ld1 { v16.b }[6], [x20]\n"
+ "b 35f\n"
+ "32:" // Oddments: Load (3, 1): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 35f\n"
+ "ld1 { v16.b }[4], [x20]\n"
+ "b 35f\n"
+ "33:" // Oddments: Load (3, 1): Bit 2: Unset
+ "tbz x7, #1, 34f\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 35f\n"
+ "ld1 { v16.b }[2], [x20]\n"
+ "b 35f\n"
+ "34:" // Oddments: Load (3, 1): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 35f\n"
+ "ld1 { v16.b }[0], [x20]\n"
+ "35:" // Oddments: Load (3, 1): Bit 2: End
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x20, [x15, #0x70]\n"
+ "smlal v20.4s, v16.4h, v18.4h\n"
+ "smlal2 v0.4s, v16.8h, v18.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 37f\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 36f\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 39f\n"
+ "ld1 { v16.b }[6], [x20]\n"
+ "b 39f\n"
+ "36:" // Oddments: Load (2, 1): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 39f\n"
+ "ld1 { v16.b }[4], [x20]\n"
+ "b 39f\n"
+ "37:" // Oddments: Load (2, 1): Bit 2: Unset
+ "tbz x7, #1, 38f\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 39f\n"
+ "ld1 { v16.b }[2], [x20]\n"
+ "b 39f\n"
+ "38:" // Oddments: Load (2, 1): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 39f\n"
+ "ld1 { v16.b }[0], [x20]\n"
+ "39:" // Oddments: Load (2, 1): Bit 2: End
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x20, [x15, #0x78]\n"
+ "smlal v5.4s, v16.4h, v7.4h\n"
+ "smlal2 v3.4s, v16.8h, v7.8h\n"
+ "smlal v20.4s, v16.4h, v22.4h\n"
+ "smlal2 v0.4s, v16.8h, v22.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 41f\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 40f\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 43f\n"
+ "ld1 { v16.b }[6], [x20]\n"
+ "b 43f\n"
+ "40:" // Oddments: Load (3, 3): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 43f\n"
+ "ld1 { v16.b }[4], [x20]\n"
+ "b 43f\n"
+ "41:" // Oddments: Load (3, 3): Bit 2: Unset
+ "tbz x7, #1, 42f\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 43f\n"
+ "ld1 { v16.b }[2], [x20]\n"
+ "b 43f\n"
+ "42:" // Oddments: Load (3, 3): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 43f\n"
+ "ld1 { v16.b }[0], [x20]\n"
+ "43:" // Oddments: Load (3, 3): Bit 2: End
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x20, [x15, #0x80]\n"
+ "smlal v19.4s, v16.4h, v18.4h\n"
+ "smlal2 v31.4s, v16.8h, v18.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 45f\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 44f\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 47f\n"
+ "ld1 { v16.b }[6], [x20]\n"
+ "b 47f\n"
+ "44:" // Oddments: Load (2, 3): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 47f\n"
+ "ld1 { v16.b }[4], [x20]\n"
+ "b 47f\n"
+ "45:" // Oddments: Load (2, 3): Bit 2: Unset
+ "tbz x7, #1, 46f\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 47f\n"
+ "ld1 { v16.b }[2], [x20]\n"
+ "b 47f\n"
+ "46:" // Oddments: Load (2, 3): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 47f\n"
+ "ld1 { v16.b }[0], [x20]\n"
+ "47:" // Oddments: Load (2, 3): Bit 2: End
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x20, [x15, #0x88]\n"
+ "smlal v21.4s, v16.4h, v7.4h\n"
+ "smlal2 v8.4s, v16.8h, v7.8h\n"
+ "smlal v19.4s, v16.4h, v22.4h\n"
+ "smlal2 v31.4s, v16.8h, v22.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 49f\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 48f\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 51f\n"
+ "ld1 { v16.b }[6], [x20]\n"
+ "b 51f\n"
+ "48:" // Oddments: Load (3, 4): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 51f\n"
+ "ld1 { v16.b }[4], [x20]\n"
+ "b 51f\n"
+ "49:" // Oddments: Load (3, 4): Bit 2: Unset
+ "tbz x7, #1, 50f\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 51f\n"
+ "ld1 { v16.b }[2], [x20]\n"
+ "b 51f\n"
+ "50:" // Oddments: Load (3, 4): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 51f\n"
+ "ld1 { v16.b }[0], [x20]\n"
+ "51:" // Oddments: Load (3, 4): Bit 2: End
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x20, [x15, #0x90]\n"
+ "smlal v19.4s, v16.4h, v9.4h\n"
+ "smlal2 v31.4s, v16.8h, v9.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 53f\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 52f\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 55f\n"
+ "ld1 { v16.b }[6], [x20]\n"
+ "b 55f\n"
+ "52:" // Oddments: Load (4, 0): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 55f\n"
+ "ld1 { v16.b }[4], [x20]\n"
+ "b 55f\n"
+ "53:" // Oddments: Load (4, 0): Bit 2: Unset
+ "tbz x7, #1, 54f\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 55f\n"
+ "ld1 { v16.b }[2], [x20]\n"
+ "b 55f\n"
+ "54:" // Oddments: Load (4, 0): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 55f\n"
+ "ld1 { v16.b }[0], [x20]\n"
+ "55:" // Oddments: Load (4, 0): Bit 2: End
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x20, [x15, #0x98]\n"
+ "smlal v20.4s, v16.4h, v26.4h\n"
+ "smlal2 v0.4s, v16.8h, v26.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 57f\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 56f\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 59f\n"
+ "ld1 { v16.b }[6], [x20]\n"
+ "b 59f\n"
+ "56:" // Oddments: Load (2, 4): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 59f\n"
+ "ld1 { v16.b }[4], [x20]\n"
+ "b 59f\n"
+ "57:" // Oddments: Load (2, 4): Bit 2: Unset
+ "tbz x7, #1, 58f\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 59f\n"
+ "ld1 { v16.b }[2], [x20]\n"
+ "b 59f\n"
+ "58:" // Oddments: Load (2, 4): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 59f\n"
+ "ld1 { v16.b }[0], [x20]\n"
+ "59:" // Oddments: Load (2, 4): Bit 2: End
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x20, [x15, #0xa0]\n"
+ "smlal v21.4s, v16.4h, v4.4h\n"
+ "smlal2 v8.4s, v16.8h, v4.8h\n"
+ "smlal v19.4s, v16.4h, v14.4h\n"
+ "smlal2 v31.4s, v16.8h, v14.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 61f\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 60f\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 63f\n"
+ "ld1 { v16.b }[6], [x20]\n"
+ "b 63f\n"
+ "60:" // Oddments: Load (4, 1): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 63f\n"
+ "ld1 { v16.b }[4], [x20]\n"
+ "b 63f\n"
+ "61:" // Oddments: Load (4, 1): Bit 2: Unset
+ "tbz x7, #1, 62f\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 63f\n"
+ "ld1 { v16.b }[2], [x20]\n"
+ "b 63f\n"
+ "62:" // Oddments: Load (4, 1): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 63f\n"
+ "ld1 { v16.b }[0], [x20]\n"
+ "63:" // Oddments: Load (4, 1): Bit 2: End
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x20, [x15, #0xa8]\n"
+ "smlal v20.4s, v16.4h, v7.4h\n"
+ "smlal2 v0.4s, v16.8h, v7.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 65f\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 64f\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 67f\n"
+ "ld1 { v16.b }[6], [x20]\n"
+ "b 67f\n"
+ "64:" // Oddments: Load (3, 2): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 67f\n"
+ "ld1 { v16.b }[4], [x20]\n"
+ "b 67f\n"
+ "65:" // Oddments: Load (3, 2): Bit 2: Unset
+ "tbz x7, #1, 66f\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 67f\n"
+ "ld1 { v16.b }[2], [x20]\n"
+ "b 67f\n"
+ "66:" // Oddments: Load (3, 2): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 67f\n"
+ "ld1 { v16.b }[0], [x20]\n"
+ "67:" // Oddments: Load (3, 2): Bit 2: End
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x20, [x15, #0xb0]\n"
+ "smlal v20.4s, v16.4h, v9.4h\n"
+ "smlal2 v0.4s, v16.8h, v9.8h\n"
+ "smlal v19.4s, v16.4h, v28.4h\n"
+ "smlal2 v31.4s, v16.8h, v28.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 69f\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 68f\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 71f\n"
+ "ld1 { v16.b }[6], [x20]\n"
+ "b 71f\n"
+ "68:" // Oddments: Load (4, 3): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 71f\n"
+ "ld1 { v16.b }[4], [x20]\n"
+ "b 71f\n"
+ "69:" // Oddments: Load (4, 3): Bit 2: Unset
+ "tbz x7, #1, 70f\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 71f\n"
+ "ld1 { v16.b }[2], [x20]\n"
+ "b 71f\n"
+ "70:" // Oddments: Load (4, 3): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 71f\n"
+ "ld1 { v16.b }[0], [x20]\n"
+ "71:" // Oddments: Load (4, 3): Bit 2: End
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x20, [x15, #0xb8]\n"
+ "smlal v19.4s, v16.4h, v7.4h\n"
+ "smlal2 v31.4s, v16.8h, v7.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 73f\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 72f\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 75f\n"
+ "ld1 { v16.b }[6], [x20]\n"
+ "b 75f\n"
+ "72:" // Oddments: Load (4, 2): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 75f\n"
+ "ld1 { v16.b }[4], [x20]\n"
+ "b 75f\n"
+ "73:" // Oddments: Load (4, 2): Bit 2: Unset
+ "tbz x7, #1, 74f\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 75f\n"
+ "ld1 { v16.b }[2], [x20]\n"
+ "b 75f\n"
+ "74:" // Oddments: Load (4, 2): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 75f\n"
+ "ld1 { v16.b }[0], [x20]\n"
+ "75:" // Oddments: Load (4, 2): Bit 2: End
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x20, [x15, #0xc0]\n"
+ "smlal v20.4s, v16.4h, v4.4h\n"
+ "smlal2 v0.4s, v16.8h, v4.8h\n"
+ "smlal v19.4s, v16.4h, v26.4h\n"
+ "smlal2 v31.4s, v16.8h, v26.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 77f\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 76f\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 79f\n"
+ "ld1 { v16.b }[6], [x20]\n"
+ "b 79f\n"
+ "76:" // Oddments: Load (4, 4): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 79f\n"
+ "ld1 { v16.b }[4], [x20]\n"
+ "b 79f\n"
+ "77:" // Oddments: Load (4, 4): Bit 2: Unset
+ "tbz x7, #1, 78f\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 79f\n"
+ "ld1 { v16.b }[2], [x20]\n"
+ "b 79f\n"
+ "78:" // Oddments: Load (4, 4): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 79f\n"
+ "ld1 { v16.b }[0], [x20]\n"
+ "79:" // Oddments: Load (4, 4): Bit 2: End
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "smlal v19.4s, v16.4h, v4.4h\n"
+ "smlal2 v31.4s, v16.8h, v4.8h\n"
+ "tbz x7, #2, 81f\n"
+ "ld1 { v14.4s }, [x13], #0x10\n"
+ "ld1 { v25.4s }, [x12], #0x10\n"
+ "tbz x7, #1, 80f\n"
+ "ld1 { v18.d }[0], [x13], #0x8\n"
+ "ld1 { v12.d }[0], [x12], #0x8\n"
+ "tbz x7, #0, 83f\n"
+ "ld1 { v18.s }[2], [x13]\n"
+ "ld1 { v12.s }[2], [x12]\n"
+ "b 83f\n"
+ "80:" // Oddments: Load requant params: Bit 2: Bit 1: Unset
+ "tbz x7, #0, 83f\n"
+ "ld1 { v18.s }[0], [x13]\n"
+ "ld1 { v12.s }[0], [x12]\n"
+ "b 83f\n"
+ "81:" // Oddments: Load requant params: Bit 2: Unset
+ "tbz x7, #1, 82f\n"
+ "ld1 { v14.d }[0], [x13], #0x8\n"
+ "ld1 { v25.d }[0], [x12], #0x8\n"
+ "tbz x7, #0, 83f\n"
+ "ld1 { v14.s }[2], [x13]\n"
+ "ld1 { v25.s }[2], [x12]\n"
+ "b 83f\n"
+ "82:" // Oddments: Load requant params: Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 83f\n"
+ "ld1 { v14.s }[0], [x13]\n"
+ "ld1 { v25.s }[0], [x12]\n"
+ "83:" // Oddments: Load requant params: Bit 2: End
+ "sqrdmulh v5.4s, v5.4s, v14.4s\n"
+ "and v28.16b, v5.16b, v25.16b\n"
+ "add x11, x11, x16\n"
+ "add x10, x10, x16\n"
+ "sqrdmulh v3.4s, v3.4s, v18.4s\n"
+ "sshr v28.4s, v28.4s, #0x1f\n"
+ "add x9, x9, x16\n"
+ "add x28, x28, x16\n"
+ "and v16.16b, v3.16b, v12.16b\n"
+ "sqrdmulh v21.4s, v21.4s, v14.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v14.4s\n"
+ "sqrdmulh v19.4s, v19.4s, v14.4s\n"
+ "sqadd v5.4s, v5.4s, v28.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "and v14.16b, v21.16b, v25.16b\n"
+ "sqrdmulh v8.4s, v8.4s, v18.4s\n"
+ "and v6.16b, v20.16b, v25.16b\n"
+ "sqrdmulh v0.4s, v0.4s, v18.4s\n"
+ "and v4.16b, v19.16b, v25.16b\n"
+ "sqrdmulh v31.4s, v31.4s, v18.4s\n"
+ "sqadd v3.4s, v3.4s, v16.4s\n"
+ "sshr v14.4s, v14.4s, #0x1f\n"
+ "and v18.16b, v8.16b, v12.16b\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
+ "and v7.16b, v0.16b, v12.16b\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "and v16.16b, v31.16b, v12.16b\n"
+ "sqadd v21.4s, v21.4s, v14.4s\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sqadd v20.4s, v20.4s, v6.4s\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
+ "sqadd v19.4s, v19.4s, v4.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "srshl v5.4s, v5.4s, v25.4s\n"
+ "srshl v21.4s, v21.4s, v25.4s\n"
+ "sqadd v8.4s, v8.4s, v18.4s\n"
+ "srshl v20.4s, v20.4s, v25.4s\n"
+ "sqadd v0.4s, v0.4s, v7.4s\n"
+ "srshl v19.4s, v19.4s, v25.4s\n"
+ "sqadd v31.4s, v31.4s, v16.4s\n"
+ "srshl v3.4s, v3.4s, v12.4s\n"
+ "sqxtn v5.4h, v5.4s\n"
+ "srshl v8.4s, v8.4s, v12.4s\n"
+ "sqxtn v21.4h, v21.4s\n"
+ "srshl v0.4s, v0.4s, v12.4s\n"
+ "sqxtn v20.4h, v20.4s\n"
+ "srshl v31.4s, v31.4s, v12.4s\n"
+ "sqxtn v19.4h, v19.4s\n"
+ "sqxtn2 v5.8h, v3.4s\n"
+ "sqxtn2 v21.8h, v8.4s\n"
+ "sqxtn2 v20.8h, v0.4s\n"
+ "sqxtn2 v19.8h, v31.4s\n"
+ "sqadd v5.8h, v5.8h, v13.8h\n"
+ "sqadd v21.8h, v21.8h, v13.8h\n"
+ "sqadd v20.8h, v20.8h, v13.8h\n"
+ "sqadd v19.8h, v19.8h, v13.8h\n"
+ "smax v5.8h, v5.8h, v17.8h\n"
+ "smax v21.8h, v21.8h, v17.8h\n"
+ "smax v20.8h, v20.8h, v17.8h\n"
+ "smax v19.8h, v19.8h, v17.8h\n"
+ "smin v5.8h, v5.8h, v24.8h\n"
+ "smin v21.8h, v21.8h, v24.8h\n"
+ "smin v20.8h, v20.8h, v24.8h\n"
+ "smin v19.8h, v19.8h, v24.8h\n"
+ "uzp1 v5.16b, v5.16b, v5.16b\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "tbz x7, #2, 85f\n"
+ "st1 { v5.s }[0], [x11], #0x4\n"
+ "st1 { v21.s }[0], [x10], #0x4\n"
+ "st1 { v20.s }[0], [x9], #0x4\n"
+ "st1 { v19.s }[0], [x28], #0x4\n"
+ "tbz x7, #1, 84f\n"
+ "st1 { v5.h }[2], [x11], #0x2\n"
+ "st1 { v21.h }[2], [x10], #0x2\n"
+ "st1 { v20.h }[2], [x9], #0x2\n"
+ "st1 { v19.h }[2], [x28], #0x2\n"
+ "tbz x7, #0, 87f\n"
+ "st1 { v5.b }[6], [x11], #0x1\n"
+ "st1 { v21.b }[6], [x10], #0x1\n"
+ "st1 { v20.b }[6], [x9], #0x1\n"
+ "st1 { v19.b }[6], [x28], #0x1\n"
+ "b 87f\n"
+ "84:" // Oddments: Bit 2: Bit 1: Unset
+ "tbz x7, #0, 87f\n"
+ "st1 { v5.b }[4], [x11], #0x1\n"
+ "st1 { v21.b }[4], [x10], #0x1\n"
+ "st1 { v20.b }[4], [x9], #0x1\n"
+ "st1 { v19.b }[4], [x28], #0x1\n"
+ "b 87f\n"
+ "85:" // Oddments: Bit 2: Unset
+ "tbz x7, #1, 86f\n"
+ "st1 { v5.h }[0], [x11], #0x2\n"
+ "st1 { v21.h }[0], [x10], #0x2\n"
+ "st1 { v20.h }[0], [x9], #0x2\n"
+ "st1 { v19.h }[0], [x28], #0x2\n"
+ "tbz x7, #0, 87f\n"
+ "st1 { v5.b }[2], [x11], #0x1\n"
+ "st1 { v21.b }[2], [x10], #0x1\n"
+ "st1 { v20.b }[2], [x9], #0x1\n"
+ "st1 { v19.b }[2], [x28], #0x1\n"
+ "b 87f\n"
+ "86:" // Oddments: Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 87f\n"
+ "st1 { v5.b }[0], [x11], #0x1\n"
+ "st1 { v21.b }[0], [x10], #0x1\n"
+ "st1 { v20.b }[0], [x9], #0x1\n"
+ "st1 { v19.b }[0], [x28], #0x1\n"
+ "87:" // Oddments: Bit 2: End
+ "88:" // End
+ :
+ : [offsetof_Params_bias] "I" (offsetof(Params, bias)), [offsetof_Params_inptrs] "I" (offsetof(Params, inptrs)), [offsetof_Params_n_channels] "I" (offsetof(Params, n_channels)), [offsetof_Params_outptrs] "I" (offsetof(Params, outptrs)), [offsetof_Params_requant] "I" (offsetof(Params, requant)), [offsetof_Params_requant_muls] "I" (offsetof(Params, requant_muls)), [offsetof_Params_requant_shifts] "I" (offsetof(Params, requant_shifts)), [offsetof_Params_weights] "I" (offsetof(Params, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [params] "r" (&params)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
new file mode 100644
index 0000000000..5d53b17e53
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2021-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "utils.hpp"
+#include "src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+// Kernel entry point: uint8 activations x int8 weights -> requantized uint8
+// output; 5x5 window, stride 1, one 2x2 output tile per invocation. The
+// hand-written assembly body lives in the matching generic.cpp.
+// Parameter meanings follow the named definition in generic.cpp:
+void a64_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl(
+  const unsigned int,                // n_channels
+  const uint8_t *const *const,       // inptrs: array of input row pointers
+  const int8_t *const,               // weights (packed)
+  const int32_t *const,              // bias (per channel)
+  const arm_gemm::Requantize32 &,    // qp: requantization parameters
+  const int32_t *const,              // requant_muls (per channel)
+  const int32_t *const,              // requant_shifts (per channel)
+  uint8_t *const *const              // outptrs: array of output row pointers
+);
+
+// Depthfirst depthwise strategy wrapper for the u8/s8->u8 quantized 5x5,
+// stride-1 NHWC kernel above. Template arguments of the parent are
+// <input type, weight type, output type, accumulator type>.
+class a64_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy<uint8_t, int8_t, uint8_t, int32_t>
+{
+  using Parent = DepthwiseDepthfirstStrategy<uint8_t, int8_t, uint8_t, int32_t>;
+
+  public:
+  constexpr static unsigned int kernel_rows = 5;
+  constexpr static unsigned int kernel_cols = 5;
+
+  constexpr static unsigned int stride_rows = 1;
+  constexpr static unsigned int stride_cols = 1;
+
+  // Constructor arguments appear to be (output rows, output cols, kernel
+  // rows, kernel cols, stride rows, stride cols) — matches the constants
+  // above and the 2x2 output tile in the kernel name; confirm against the
+  // Parent declaration.
+  a64_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst(const CPUInfo *) : Parent(2, 2, 5, 5, 1, 1) {}
+
+  // Fixed-width NEON implementation: no scalable (SVE) vector length.
+  arm_gemm::VLType get_vl_type(void) const override { return arm_gemm::VLType::None; }
+
+  Parent::KernelType kernel = a64_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl;
+  Parent::KernelType get_kernel(void) const override { return kernel; }
+  unsigned int get_accumulator_depth_vl(void) const override { return 2; }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp
new file mode 100644
index 0000000000..df955206e2
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp
@@ -0,0 +1,2187 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_gemm.hpp"
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl(
+ const unsigned int n_channels,
+ const uint8_t *const *const inptrs,
+ const int8_t *const weights,
+ const int32_t *const bias,
+ const arm_gemm::Requantize32 &qp,
+ const int32_t *const requant_muls,
+ const int32_t *const requant_shifts,
+ uint8_t *const *const outptrs
+)
+{
+ // All kernel arguments marshalled into one contiguous struct so the inline
+ // assembly below can address every field as a fixed offset from a single
+ // base register (%x[params] with the offsetof_Params_* constants).
+ struct Params
+ {
+ long unsigned int n_channels;
+ const void *weights;
+ const int32_t *bias;
+ const arm_gemm::Requantize32 *requant;
+ const int32_t *const requant_muls;
+ const int32_t *const requant_shifts;
+ uint8_t *const *const outptrs;
+ // Input pointers, re-ordered at construction (see ctor body below).
+ const uint8_t *inptrs[36];
+
+ Params(
+ long unsigned int n_channels,
+ const uint8_t *const *inptrs_raw,
+ const void *const weights,
+ const int32_t *const bias,
+ const arm_gemm::Requantize32 &qp,
+ const int32_t *const requant_muls,
+ const int32_t *const requant_shifts,
+ uint8_t *const *outptrs
+ ) : n_channels(n_channels), weights(weights), bias(bias),
+ requant(&qp), requant_muls(requant_muls),
+ requant_shifts(requant_shifts), outptrs(outptrs)
+ {
+ // The first 14 raw input pointers are permuted (0,1,6,7,2,8,3,4,11,12,
+ // 9,10,5,13); entries 14..35 are copied straight through. Presumably
+ // this matches the order in which the assembly loads them -- the
+ // permutation is fixed by the generated kernel, do not "simplify" it.
+ inptrs[0] = inptrs_raw[0];
+ inptrs[1] = inptrs_raw[1];
+ inptrs[2] = inptrs_raw[6];
+ inptrs[3] = inptrs_raw[7];
+ inptrs[4] = inptrs_raw[2];
+ inptrs[5] = inptrs_raw[8];
+ inptrs[6] = inptrs_raw[3];
+ inptrs[7] = inptrs_raw[4];
+ inptrs[8] = inptrs_raw[11];
+ inptrs[9] = inptrs_raw[12];
+ inptrs[10] = inptrs_raw[9];
+ inptrs[11] = inptrs_raw[10];
+ inptrs[12] = inptrs_raw[5];
+ inptrs[13] = inptrs_raw[13];
+ inptrs[14] = inptrs_raw[14];
+ inptrs[15] = inptrs_raw[15];
+ inptrs[16] = inptrs_raw[16];
+ inptrs[17] = inptrs_raw[17];
+ inptrs[18] = inptrs_raw[18];
+ inptrs[19] = inptrs_raw[19];
+ inptrs[20] = inptrs_raw[20];
+ inptrs[21] = inptrs_raw[21];
+ inptrs[22] = inptrs_raw[22];
+ inptrs[23] = inptrs_raw[23];
+ inptrs[24] = inptrs_raw[24];
+ inptrs[25] = inptrs_raw[25];
+ inptrs[26] = inptrs_raw[26];
+ inptrs[27] = inptrs_raw[27];
+ inptrs[28] = inptrs_raw[28];
+ inptrs[29] = inptrs_raw[29];
+ inptrs[30] = inptrs_raw[30];
+ inptrs[31] = inptrs_raw[31];
+ inptrs[32] = inptrs_raw[32];
+ inptrs[33] = inptrs_raw[33];
+ inptrs[34] = inptrs_raw[34];
+ inptrs[35] = inptrs_raw[35];
+
+ }
+ };
+
+ const Params params(n_channels, inptrs, weights, bias, qp,
+ requant_muls, requant_shifts, outptrs);
+
+ __asm__ __volatile__(
+ "ldr x1, [%x[params], %[offsetof_Params_n_channels]]\n"
+ "ldr x23, [%x[params], %[offsetof_Params_requant]]\n"
+ "lsr x2, x1, #0x3\n"
+ "add x20, x23, %[offsetof_Requantize32_a_offset]\n"
+ "ld1r { v18.16b }, [x20]\n"
+ "ldr x22, [%x[params], %[offsetof_Params_outptrs]]\n"
+ "add x21, x23, %[offsetof_Requantize32_b_offset]\n"
+ "add x20, x23, %[offsetof_Requantize32_c_offset]\n"
+ "ld1r { v13.16b }, [x21]\n"
+ "ld1r { v26.8h }, [x20]\n"
+ "add x21, x23, %[offsetof_Requantize32_minval]\n"
+ "add x20, x23, %[offsetof_Requantize32_maxval]\n"
+ "ld1r { v11.8h }, [x21]\n"
+ "ld1r { v0.8h }, [x20]\n"
+ "mov x3, #0x0\n"
+ "mov x4, #0x0\n"
+ "add x5, %x[params], %[offsetof_Params_inptrs]\n"
+ "ldr x6, [%x[params], %[offsetof_Params_weights]]\n"
+ "ldr x7, [%x[params], %[offsetof_Params_requant_muls]]\n"
+ "ldr x8, [%x[params], %[offsetof_Params_requant_shifts]]\n"
+ "ldp x17, x16, [x22, #0x0]\n"
+ "ldp x15, x14, [x22, #0x10]\n"
+ "cbz x2, 3f\n"
+ "ldr d6, [x6, #0x0]\n"
+ "ldr d14, [x6, #0x8]\n"
+ "subs x2, x2, #0x1\n"
+ "ssubl v6.8h, v6.8b, v13.8b\n"
+ "ldr d10, [x6, #0x10]\n"
+ "ldr d21, [x6, #0x18]\n"
+ "ssubl v14.8h, v14.8b, v13.8b\n"
+ "ssubl v10.8h, v10.8b, v13.8b\n"
+ "ldr d12, [x6, #0x20]\n"
+ "ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "ssubl v21.8h, v21.8b, v13.8b\n"
+ "ssubl v12.8h, v12.8b, v13.8b\n"
+ "ldr q7, [x20, #0x0]\n"
+ "ldr q15, [x20, #0x10]\n"
+ "add x20, x20, #0x20\n"
+ "str x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldp x9, x28, [x5, #0x0]\n"
+ "ldp x27, x26, [x5, #0x10]\n"
+ "mov v20.16b, v7.16b\n"
+ "mov v5.16b, v15.16b\n"
+ "ldp x25, x24, [x5, #0x20]\n"
+ "ldp x23, x22, [x5, #0x30]\n"
+ "mov v24.16b, v7.16b\n"
+ "mov v22.16b, v15.16b\n"
+ "ldp x21, x20, [x5, #0x40]\n"
+ "ldr d31, [x9, x3]\n"
+ "mov v23.16b, v7.16b\n"
+ "mov v19.16b, v15.16b\n"
+ "ldr d17, [x28, x3]\n"
+ "ldr d30, [x27, x3]\n"
+ "usubl v31.8h, v31.8b, v18.8b\n"
+ "usubl v17.8h, v17.8b, v18.8b\n"
+ "ldr d16, [x26, x3]\n"
+ "ldr d3, [x25, x3]\n"
+ "usubl v30.8h, v30.8b, v18.8b\n"
+ "usubl v16.8h, v16.8b, v18.8b\n"
+ "ldr d4, [x24, x3]\n"
+ "ldr d25, [x23, x3]\n"
+ "usubl v3.8h, v3.8b, v18.8b\n"
+ "usubl v4.8h, v4.8b, v18.8b\n"
+ "ldr d9, [x22, x3]\n"
+ "ldr d29, [x21, x3]\n"
+ "usubl v25.8h, v25.8b, v18.8b\n"
+ "usubl v9.8h, v9.8b, v18.8b\n"
+ "ldr d28, [x20, x3]\n"
+ "usubl v29.8h, v29.8b, v18.8b\n"
+ "usubl v28.8h, v28.8b, v18.8b\n"
+ "beq 2f\n"
+ "1:" // Loop
+ "ldr d2, [x6, #0x28]\n"
+ "ldr d27, [x6, #0x30]\n"
+ "smlal v7.4s, v31.4h, v6.4h\n"
+ "smlal2 v15.4s, v31.8h, v6.8h\n"
+ "ldr d1, [x6, #0x38]\n"
+ "ldr d31, [x6, #0x40]\n"
+ "smlal v7.4s, v17.4h, v14.4h\n"
+ "smlal v20.4s, v17.4h, v6.4h\n"
+ "ldr d8, [x6, #0x48]\n"
+ "ldr x22, [x5, #0x50]\n"
+ "smlal v24.4s, v30.4h, v6.4h\n"
+ "smlal v23.4s, v16.4h, v6.4h\n"
+ "smlal2 v15.4s, v17.8h, v14.8h\n"
+ "smlal v7.4s, v3.4h, v10.4h\n"
+ "ldr x20, [x5, #0x58]\n"
+ "ldr x21, [x5, #0x60]\n"
+ "smlal2 v5.4s, v17.8h, v6.8h\n"
+ "ldr d17, [x22, x3]\n"
+ "smlal2 v22.4s, v30.8h, v6.8h\n"
+ "usubl v17.8h, v17.8b, v18.8b\n"
+ "smlal2 v19.4s, v16.8h, v6.8h\n"
+ "ldr d6, [x20, x3]\n"
+ "smlal v20.4s, v3.4h, v14.4h\n"
+ "usubl v6.8h, v6.8b, v18.8b\n"
+ "smlal v24.4s, v16.4h, v14.4h\n"
+ "smlal v23.4s, v4.4h, v14.4h\n"
+ "ssubl v2.8h, v2.8b, v13.8b\n"
+ "ldr x20, [x5, #0x68]\n"
+ "smlal2 v15.4s, v3.8h, v10.8h\n"
+ "smlal v7.4s, v25.4h, v21.4h\n"
+ "ssubl v27.8h, v27.8b, v13.8b\n"
+ "ldr x22, [x5, #0x70]\n"
+ "smlal2 v5.4s, v3.8h, v14.8h\n"
+ "ldr d3, [x21, x3]\n"
+ "smlal2 v22.4s, v16.8h, v14.8h\n"
+ "usubl v3.8h, v3.8b, v18.8b\n"
+ "smlal2 v19.4s, v4.8h, v14.8h\n"
+ "ldr d14, [x20, x3]\n"
+ "smlal v20.4s, v25.4h, v10.4h\n"
+ "usubl v14.8h, v14.8b, v18.8b\n"
+ "smlal v24.4s, v4.4h, v10.4h\n"
+ "smlal v23.4s, v17.4h, v10.4h\n"
+ "ssubl v1.8h, v1.8b, v13.8b\n"
+ "ldr x20, [x5, #0x78]\n"
+ "smlal2 v15.4s, v25.8h, v21.8h\n"
+ "smlal v7.4s, v9.4h, v12.4h\n"
+ "ssubl v31.8h, v31.8b, v13.8b\n"
+ "ldr x21, [x5, #0x80]\n"
+ "smlal2 v5.4s, v25.8h, v10.8h\n"
+ "ldr d25, [x22, x3]\n"
+ "smlal2 v22.4s, v4.8h, v10.8h\n"
+ "usubl v25.8h, v25.8b, v18.8b\n"
+ "smlal2 v19.4s, v17.8h, v10.8h\n"
+ "ldr d10, [x20, x3]\n"
+ "smlal v20.4s, v9.4h, v21.4h\n"
+ "usubl v10.8h, v10.8b, v18.8b\n"
+ "smlal v24.4s, v17.4h, v21.4h\n"
+ "smlal v23.4s, v6.4h, v21.4h\n"
+ "ssubl v8.8h, v8.8b, v13.8b\n"
+ "ldr x24, [x5, #0x88]\n"
+ "smlal2 v15.4s, v9.8h, v12.8h\n"
+ "smlal v7.4s, v30.4h, v2.4h\n"
+ "ldr x20, [x5, #0x90]\n"
+ "ldr x23, [x5, #0x98]\n"
+ "smlal2 v5.4s, v9.8h, v21.8h\n"
+ "ldr d9, [x21, x3]\n"
+ "smlal2 v22.4s, v17.8h, v21.8h\n"
+ "usubl v9.8h, v9.8b, v18.8b\n"
+ "smlal2 v19.4s, v6.8h, v21.8h\n"
+ "ldr d21, [x6, #0x50]\n"
+ "smlal v20.4s, v3.4h, v12.4h\n"
+ "ssubl v21.8h, v21.8b, v13.8b\n"
+ "smlal v24.4s, v6.4h, v12.4h\n"
+ "smlal v23.4s, v29.4h, v12.4h\n"
+ "ldr x22, [x5, #0xa0]\n"
+ "ldr x21, [x5, #0xa8]\n"
+ "smlal2 v15.4s, v30.8h, v2.8h\n"
+ "ldr d30, [x24, x3]\n"
+ "smlal v7.4s, v16.4h, v27.4h\n"
+ "usubl v30.8h, v30.8b, v18.8b\n"
+ "smlal2 v5.4s, v3.8h, v12.8h\n"
+ "ldr d3, [x6, #0x58]\n"
+ "smlal2 v22.4s, v6.8h, v12.8h\n"
+ "ssubl v3.8h, v3.8b, v13.8b\n"
+ "smlal2 v19.4s, v29.8h, v12.8h\n"
+ "ldr d12, [x20, x3]\n"
+ "smlal v20.4s, v16.4h, v2.4h\n"
+ "usubl v12.8h, v12.8b, v18.8b\n"
+ "smlal v24.4s, v28.4h, v2.4h\n"
+ "smlal v23.4s, v14.4h, v2.4h\n"
+ "ldr x20, [x5, #0xb0]\n"
+ "ldr x13, [x5, #0xb8]\n"
+ "smlal2 v15.4s, v16.8h, v27.8h\n"
+ "smlal v7.4s, v4.4h, v1.4h\n"
+ "ldr x12, [x5, #0xc0]\n"
+ "ldr x11, [x5, #0xc8]\n"
+ "smlal2 v5.4s, v16.8h, v2.8h\n"
+ "ldr d16, [x23, x3]\n"
+ "smlal2 v22.4s, v28.8h, v2.8h\n"
+ "usubl v16.8h, v16.8b, v18.8b\n"
+ "smlal2 v19.4s, v14.8h, v2.8h\n"
+ "ldr d2, [x6, #0x60]\n"
+ "smlal v20.4s, v4.4h, v27.4h\n"
+ "ssubl v2.8h, v2.8b, v13.8b\n"
+ "smlal v24.4s, v14.4h, v27.4h\n"
+ "smlal v23.4s, v25.4h, v27.4h\n"
+ "ldr x10, [x5, #0xd0]\n"
+ "ldr x9, [x5, #0xd8]\n"
+ "smlal2 v15.4s, v4.8h, v1.8h\n"
+ "smlal v7.4s, v17.4h, v31.4h\n"
+ "ldr x28, [x5, #0xe0]\n"
+ "ldr x27, [x5, #0xe8]\n"
+ "smlal2 v5.4s, v4.8h, v27.8h\n"
+ "ldr d4, [x22, x3]\n"
+ "smlal2 v22.4s, v14.8h, v27.8h\n"
+ "usubl v4.8h, v4.8b, v18.8b\n"
+ "smlal2 v19.4s, v25.8h, v27.8h\n"
+ "ldr d27, [x6, #0x68]\n"
+ "smlal v20.4s, v17.4h, v1.4h\n"
+ "ssubl v27.8h, v27.8b, v13.8b\n"
+ "smlal v24.4s, v25.4h, v1.4h\n"
+ "smlal v23.4s, v10.4h, v1.4h\n"
+ "ldr x26, [x5, #0xf0]\n"
+ "ldr x25, [x5, #0xf8]\n"
+ "smlal2 v15.4s, v17.8h, v31.8h\n"
+ "smlal v7.4s, v6.4h, v8.4h\n"
+ "ldr x24, [x5, #0x100]\n"
+ "ldr x23, [x5, #0x108]\n"
+ "smlal2 v5.4s, v17.8h, v1.8h\n"
+ "ldr d17, [x21, x3]\n"
+ "smlal2 v22.4s, v25.8h, v1.8h\n"
+ "usubl v17.8h, v17.8b, v18.8b\n"
+ "smlal2 v19.4s, v10.8h, v1.8h\n"
+ "ldr d1, [x6, #0x70]\n"
+ "smlal v20.4s, v6.4h, v31.4h\n"
+ "ssubl v1.8h, v1.8b, v13.8b\n"
+ "smlal v24.4s, v10.4h, v31.4h\n"
+ "smlal v23.4s, v9.4h, v31.4h\n"
+ "ldr x22, [x5, #0x110]\n"
+ "ldr x21, [x5, #0x118]\n"
+ "smlal2 v15.4s, v6.8h, v8.8h\n"
+ "smlal v7.4s, v28.4h, v21.4h\n"
+ "subs x2, x2, #0x1\n"
+ "smlal2 v5.4s, v6.8h, v31.8h\n"
+ "ldr d6, [x20, x3]\n"
+ "smlal2 v22.4s, v10.8h, v31.8h\n"
+ "usubl v6.8h, v6.8b, v18.8b\n"
+ "smlal2 v19.4s, v9.8h, v31.8h\n"
+ "ldr d31, [x6, #0x78]\n"
+ "smlal v20.4s, v29.4h, v8.4h\n"
+ "ssubl v31.8h, v31.8b, v13.8b\n"
+ "smlal v24.4s, v9.4h, v8.4h\n"
+ "smlal v23.4s, v30.4h, v8.4h\n"
+ "ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "smlal2 v15.4s, v28.8h, v21.8h\n"
+ "ldr d28, [x13, x3]\n"
+ "smlal v7.4s, v14.4h, v3.4h\n"
+ "usubl v28.8h, v28.8b, v18.8b\n"
+ "smlal2 v5.4s, v29.8h, v8.8h\n"
+ "ldr d29, [x6, #0x80]\n"
+ "smlal2 v22.4s, v9.8h, v8.8h\n"
+ "ssubl v29.8h, v29.8b, v13.8b\n"
+ "smlal2 v19.4s, v30.8h, v8.8h\n"
+ "ldr d8, [x12, x3]\n"
+ "smlal v20.4s, v14.4h, v21.4h\n"
+ "usubl v8.8h, v8.8b, v18.8b\n"
+ "smlal v24.4s, v12.4h, v21.4h\n"
+ "smlal v23.4s, v16.4h, v21.4h\n"
+ "smlal2 v15.4s, v14.8h, v3.8h\n"
+ "smlal v7.4s, v25.4h, v2.4h\n"
+ "smlal2 v5.4s, v14.8h, v21.8h\n"
+ "ldr d14, [x11, x3]\n"
+ "smlal2 v22.4s, v12.8h, v21.8h\n"
+ "usubl v14.8h, v14.8b, v18.8b\n"
+ "smlal2 v19.4s, v16.8h, v21.8h\n"
+ "ldr d21, [x6, #0x88]\n"
+ "smlal v20.4s, v25.4h, v3.4h\n"
+ "ssubl v21.8h, v21.8b, v13.8b\n"
+ "smlal v24.4s, v16.4h, v3.4h\n"
+ "smlal v23.4s, v4.4h, v3.4h\n"
+ "smlal2 v15.4s, v25.8h, v2.8h\n"
+ "smlal v7.4s, v10.4h, v27.4h\n"
+ "smlal2 v5.4s, v25.8h, v3.8h\n"
+ "ldr d25, [x10, x3]\n"
+ "smlal2 v22.4s, v16.8h, v3.8h\n"
+ "usubl v25.8h, v25.8b, v18.8b\n"
+ "smlal2 v19.4s, v4.8h, v3.8h\n"
+ "ldr d3, [x6, #0x90]\n"
+ "smlal v20.4s, v10.4h, v2.4h\n"
+ "ssubl v3.8h, v3.8b, v13.8b\n"
+ "smlal v24.4s, v4.4h, v2.4h\n"
+ "smlal v23.4s, v17.4h, v2.4h\n"
+ "smlal2 v15.4s, v10.8h, v27.8h\n"
+ "smlal v7.4s, v9.4h, v1.4h\n"
+ "smlal2 v5.4s, v10.8h, v2.8h\n"
+ "ldr d10, [x9, x3]\n"
+ "smlal2 v22.4s, v4.8h, v2.8h\n"
+ "usubl v10.8h, v10.8b, v18.8b\n"
+ "smlal2 v19.4s, v17.8h, v2.8h\n"
+ "ldr d2, [x6, #0x98]\n"
+ "smlal v20.4s, v9.4h, v27.4h\n"
+ "ssubl v2.8h, v2.8b, v13.8b\n"
+ "smlal v24.4s, v17.4h, v27.4h\n"
+ "smlal v23.4s, v6.4h, v27.4h\n"
+ "smlal2 v15.4s, v9.8h, v1.8h\n"
+ "smlal v7.4s, v12.4h, v31.4h\n"
+ "smlal2 v5.4s, v9.8h, v27.8h\n"
+ "ldr d9, [x28, x3]\n"
+ "smlal2 v22.4s, v17.8h, v27.8h\n"
+ "usubl v9.8h, v9.8b, v18.8b\n"
+ "smlal2 v19.4s, v6.8h, v27.8h\n"
+ "ldr d27, [x6, #0xa0]\n"
+ "smlal v20.4s, v30.4h, v1.4h\n"
+ "ssubl v27.8h, v27.8b, v13.8b\n"
+ "smlal v24.4s, v6.4h, v1.4h\n"
+ "smlal v23.4s, v28.4h, v1.4h\n"
+ "smlal2 v15.4s, v12.8h, v31.8h\n"
+ "ldr d12, [x27, x3]\n"
+ "smlal v7.4s, v16.4h, v29.4h\n"
+ "usubl v12.8h, v12.8b, v18.8b\n"
+ "smlal2 v5.4s, v30.8h, v1.8h\n"
+ "ldr d30, [x6, #0xa8]\n"
+ "smlal2 v22.4s, v6.8h, v1.8h\n"
+ "ssubl v30.8h, v30.8b, v13.8b\n"
+ "smlal2 v19.4s, v28.8h, v1.8h\n"
+ "ldr d1, [x26, x3]\n"
+ "smlal v20.4s, v16.4h, v31.4h\n"
+ "usubl v1.8h, v1.8b, v18.8b\n"
+ "smlal v24.4s, v8.4h, v31.4h\n"
+ "smlal v23.4s, v14.4h, v31.4h\n"
+ "smlal2 v15.4s, v16.8h, v29.8h\n"
+ "smlal v7.4s, v4.4h, v21.4h\n"
+ "smlal2 v5.4s, v16.8h, v31.8h\n"
+ "ldr d16, [x25, x3]\n"
+ "smlal2 v22.4s, v8.8h, v31.8h\n"
+ "usubl v16.8h, v16.8b, v18.8b\n"
+ "smlal2 v19.4s, v14.8h, v31.8h\n"
+ "ldr d31, [x6, #0xb0]\n"
+ "smlal v20.4s, v4.4h, v29.4h\n"
+ "ssubl v31.8h, v31.8b, v13.8b\n"
+ "smlal v24.4s, v14.4h, v29.4h\n"
+ "smlal v23.4s, v25.4h, v29.4h\n"
+ "smlal2 v15.4s, v4.8h, v21.8h\n"
+ "smlal v7.4s, v17.4h, v3.4h\n"
+ "smlal2 v5.4s, v4.8h, v29.8h\n"
+ "ldr d4, [x24, x3]\n"
+ "smlal2 v22.4s, v14.8h, v29.8h\n"
+ "usubl v4.8h, v4.8b, v18.8b\n"
+ "smlal2 v19.4s, v25.8h, v29.8h\n"
+ "ldr d29, [x6, #0xb8]\n"
+ "smlal v20.4s, v17.4h, v21.4h\n"
+ "ssubl v29.8h, v29.8b, v13.8b\n"
+ "smlal v24.4s, v25.4h, v21.4h\n"
+ "smlal v23.4s, v10.4h, v21.4h\n"
+ "smlal2 v15.4s, v17.8h, v3.8h\n"
+ "smlal v7.4s, v6.4h, v2.4h\n"
+ "smlal2 v5.4s, v17.8h, v21.8h\n"
+ "ldr d17, [x23, x3]\n"
+ "smlal2 v22.4s, v25.8h, v21.8h\n"
+ "usubl v17.8h, v17.8b, v18.8b\n"
+ "smlal2 v19.4s, v10.8h, v21.8h\n"
+ "ldr d21, [x6, #0xc0]\n"
+ "smlal v20.4s, v6.4h, v3.4h\n"
+ "ssubl v21.8h, v21.8b, v13.8b\n"
+ "smlal v24.4s, v10.4h, v3.4h\n"
+ "smlal v23.4s, v9.4h, v3.4h\n"
+ "add x6, x6, #0xc8\n"
+ "smlal2 v15.4s, v6.8h, v2.8h\n"
+ "smlal v7.4s, v8.4h, v27.4h\n"
+ "smlal2 v5.4s, v6.8h, v3.8h\n"
+ "ldr d6, [x22, x3]\n"
+ "smlal2 v22.4s, v10.8h, v3.8h\n"
+ "usubl v6.8h, v6.8b, v18.8b\n"
+ "smlal2 v19.4s, v9.8h, v3.8h\n"
+ "ldr d3, [x21, x3]\n"
+ "smlal v20.4s, v28.4h, v2.4h\n"
+ "usubl v3.8h, v3.8b, v18.8b\n"
+ "smlal v24.4s, v9.4h, v2.4h\n"
+ "smlal v23.4s, v12.4h, v2.4h\n"
+ "add x3, x3, #0x8\n"
+ "smlal2 v15.4s, v8.8h, v27.8h\n"
+ "ldr q8, [x7, #0x0]\n"
+ "smlal v7.4s, v14.4h, v30.4h\n"
+ "smlal2 v5.4s, v28.8h, v2.8h\n"
+ "ldr q28, [x8, #0x0]\n"
+ "smlal2 v22.4s, v9.8h, v2.8h\n"
+ "smlal2 v19.4s, v12.8h, v2.8h\n"
+ "ldr q2, [x7, #0x10]\n"
+ "smlal v20.4s, v14.4h, v27.4h\n"
+ "add x7, x7, #0x20\n"
+ "smlal v24.4s, v1.4h, v27.4h\n"
+ "smlal v23.4s, v16.4h, v27.4h\n"
+ "smlal2 v15.4s, v14.8h, v30.8h\n"
+ "smlal v7.4s, v25.4h, v31.4h\n"
+ "smlal2 v5.4s, v14.8h, v27.8h\n"
+ "ldr q14, [x8, #0x10]\n"
+ "smlal2 v22.4s, v1.8h, v27.8h\n"
+ "add x8, x8, #0x20\n"
+ "smlal2 v19.4s, v16.8h, v27.8h\n"
+ "smlal v20.4s, v25.4h, v30.4h\n"
+ "smlal v24.4s, v16.4h, v30.4h\n"
+ "smlal v23.4s, v4.4h, v30.4h\n"
+ "smlal2 v15.4s, v25.8h, v31.8h\n"
+ "smlal v7.4s, v10.4h, v29.4h\n"
+ "smlal2 v5.4s, v25.8h, v30.8h\n"
+ "smlal2 v22.4s, v16.8h, v30.8h\n"
+ "smlal2 v19.4s, v4.8h, v30.8h\n"
+ "smlal v20.4s, v10.4h, v31.4h\n"
+ "smlal v24.4s, v4.4h, v31.4h\n"
+ "smlal v23.4s, v17.4h, v31.4h\n"
+ "smlal2 v15.4s, v10.8h, v29.8h\n"
+ "smlal v7.4s, v9.4h, v21.4h\n"
+ "sqrdmulh v7.4s, v7.4s, v8.4s\n"
+ "smlal2 v5.4s, v10.8h, v31.8h\n"
+ "smlal2 v22.4s, v4.8h, v31.8h\n"
+ "and v27.16b, v7.16b, v28.16b\n"
+ "smlal2 v19.4s, v17.8h, v31.8h\n"
+ "smlal v20.4s, v9.4h, v29.4h\n"
+ "sshr v27.4s, v27.4s, #0x1f\n"
+ "smlal v24.4s, v17.4h, v29.4h\n"
+ "smlal v23.4s, v6.4h, v29.4h\n"
+ "sqadd v7.4s, v7.4s, v27.4s\n"
+ "smlal2 v15.4s, v9.8h, v21.8h\n"
+ "smlal2 v5.4s, v9.8h, v29.8h\n"
+ "sqrdmulh v15.4s, v15.4s, v2.4s\n"
+ "smlal2 v22.4s, v17.8h, v29.8h\n"
+ "smlal2 v19.4s, v6.8h, v29.8h\n"
+ "and v9.16b, v15.16b, v14.16b\n"
+ "smlal v20.4s, v12.4h, v21.4h\n"
+ "smlal v24.4s, v6.4h, v21.4h\n"
+ "sqrdmulh v20.4s, v20.4s, v8.4s\n"
+ "smlal v23.4s, v3.4h, v21.4h\n"
+ "smlal2 v5.4s, v12.8h, v21.8h\n"
+ "sqrdmulh v24.4s, v24.4s, v8.4s\n"
+ "smlal2 v22.4s, v6.8h, v21.8h\n"
+ "smlal2 v19.4s, v3.8h, v21.8h\n"
+ "sqrdmulh v23.4s, v23.4s, v8.4s\n"
+ "sshr v9.4s, v9.4s, #0x1f\n"
+ "and v25.16b, v20.16b, v28.16b\n"
+ "sqrdmulh v5.4s, v5.4s, v2.4s\n"
+ "and v10.16b, v24.16b, v28.16b\n"
+ "sqrdmulh v22.4s, v22.4s, v2.4s\n"
+ "and v21.16b, v23.16b, v28.16b\n"
+ "sqrdmulh v19.4s, v19.4s, v2.4s\n"
+ "sqadd v15.4s, v15.4s, v9.4s\n"
+ "sshr v25.4s, v25.4s, #0x1f\n"
+ "and v9.16b, v5.16b, v14.16b\n"
+ "sshr v10.4s, v10.4s, #0x1f\n"
+ "and v12.16b, v22.16b, v14.16b\n"
+ "sshr v21.4s, v21.4s, #0x1f\n"
+ "and v17.16b, v19.16b, v14.16b\n"
+ "sqadd v20.4s, v20.4s, v25.4s\n"
+ "sshr v9.4s, v9.4s, #0x1f\n"
+ "sqadd v24.4s, v24.4s, v10.4s\n"
+ "sshr v12.4s, v12.4s, #0x1f\n"
+ "sqadd v23.4s, v23.4s, v21.4s\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "srshl v7.4s, v7.4s, v28.4s\n"
+ "srshl v20.4s, v20.4s, v28.4s\n"
+ "sqadd v5.4s, v5.4s, v9.4s\n"
+ "srshl v24.4s, v24.4s, v28.4s\n"
+ "sqadd v22.4s, v22.4s, v12.4s\n"
+ "srshl v23.4s, v23.4s, v28.4s\n"
+ "sqadd v19.4s, v19.4s, v17.4s\n"
+ "srshl v15.4s, v15.4s, v14.4s\n"
+ "sqxtn v7.4h, v7.4s\n"
+ "srshl v5.4s, v5.4s, v14.4s\n"
+ "sqxtn v20.4h, v20.4s\n"
+ "srshl v22.4s, v22.4s, v14.4s\n"
+ "sqxtn v24.4h, v24.4s\n"
+ "srshl v19.4s, v19.4s, v14.4s\n"
+ "sqxtn v23.4h, v23.4s\n"
+ "sqxtn2 v7.8h, v15.4s\n"
+ "sqxtn2 v20.8h, v5.4s\n"
+ "sqxtn2 v24.8h, v22.4s\n"
+ "sqxtn2 v23.8h, v19.4s\n"
+ "sqadd v7.8h, v7.8h, v26.8h\n"
+ "sqadd v20.8h, v20.8h, v26.8h\n"
+ "sqadd v24.8h, v24.8h, v26.8h\n"
+ "sqadd v23.8h, v23.8h, v26.8h\n"
+ "smax v7.8h, v7.8h, v11.8h\n"
+ "smax v20.8h, v20.8h, v11.8h\n"
+ "smax v24.8h, v24.8h, v11.8h\n"
+ "smax v23.8h, v23.8h, v11.8h\n"
+ "smin v7.8h, v7.8h, v0.8h\n"
+ "smin v20.8h, v20.8h, v0.8h\n"
+ "smin v24.8h, v24.8h, v0.8h\n"
+ "smin v23.8h, v23.8h, v0.8h\n"
+ "uzp1 v7.16b, v7.16b, v7.16b\n"
+ "str d7, [x17, x4]\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "str d20, [x16, x4]\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "str d24, [x15, x4]\n"
+ "str d23, [x14, x4]\n"
+ "ldr q7, [x20, #0x0]\n"
+ "ldr q15, [x20, #0x10]\n"
+ "add x20, x20, #0x20\n"
+ "ldr d6, [x6, #0x0]\n"
+ "ldr d14, [x6, #0x8]\n"
+ "add x4, x4, #0x8\n"
+ "str x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldr d10, [x6, #0x10]\n"
+ "ldr d21, [x6, #0x18]\n"
+ "mov v20.16b, v7.16b\n"
+ "mov v5.16b, v15.16b\n"
+ "ldr d12, [x6, #0x20]\n"
+ "ldp x9, x28, [x5, #0x0]\n"
+ "mov v24.16b, v7.16b\n"
+ "mov v22.16b, v15.16b\n"
+ "ldp x27, x26, [x5, #0x10]\n"
+ "ldp x25, x24, [x5, #0x20]\n"
+ "mov v23.16b, v7.16b\n"
+ "mov v19.16b, v15.16b\n"
+ "ldp x23, x22, [x5, #0x30]\n"
+ "ldp x21, x20, [x5, #0x40]\n"
+ "ssubl v6.8h, v6.8b, v13.8b\n"
+ "ssubl v14.8h, v14.8b, v13.8b\n"
+ "ldr d31, [x9, x3]\n"
+ "ldr d17, [x28, x3]\n"
+ "ssubl v10.8h, v10.8b, v13.8b\n"
+ "ssubl v21.8h, v21.8b, v13.8b\n"
+ "ldr d30, [x27, x3]\n"
+ "ldr d16, [x26, x3]\n"
+ "ssubl v12.8h, v12.8b, v13.8b\n"
+ "usubl v31.8h, v31.8b, v18.8b\n"
+ "ldr d3, [x25, x3]\n"
+ "ldr d4, [x24, x3]\n"
+ "usubl v17.8h, v17.8b, v18.8b\n"
+ "usubl v30.8h, v30.8b, v18.8b\n"
+ "ldr d25, [x23, x3]\n"
+ "ldr d9, [x22, x3]\n"
+ "usubl v16.8h, v16.8b, v18.8b\n"
+ "usubl v3.8h, v3.8b, v18.8b\n"
+ "ldr d29, [x21, x3]\n"
+ "ldr d28, [x20, x3]\n"
+ "usubl v4.8h, v4.8b, v18.8b\n"
+ "usubl v25.8h, v25.8b, v18.8b\n"
+ "usubl v9.8h, v9.8b, v18.8b\n"
+ "usubl v29.8h, v29.8b, v18.8b\n"
+ "usubl v28.8h, v28.8b, v18.8b\n"
+ "bgt 1b\n"
+ "2:" // Tail
+ "ldr d27, [x6, #0x28]\n"
+ "ldr d1, [x6, #0x30]\n"
+ "smlal v7.4s, v31.4h, v6.4h\n"
+ "smlal2 v15.4s, v31.8h, v6.8h\n"
+ "ldr d2, [x6, #0x38]\n"
+ "ldr d31, [x6, #0x40]\n"
+ "smlal v7.4s, v17.4h, v14.4h\n"
+ "smlal v20.4s, v17.4h, v6.4h\n"
+ "ldr d8, [x6, #0x48]\n"
+ "ldr x22, [x5, #0x50]\n"
+ "smlal v24.4s, v30.4h, v6.4h\n"
+ "smlal v23.4s, v16.4h, v6.4h\n"
+ "smlal2 v15.4s, v17.8h, v14.8h\n"
+ "smlal v7.4s, v3.4h, v10.4h\n"
+ "ldr x20, [x5, #0x58]\n"
+ "ldr x21, [x5, #0x60]\n"
+ "smlal2 v5.4s, v17.8h, v6.8h\n"
+ "ldr d17, [x22, x3]\n"
+ "smlal2 v22.4s, v30.8h, v6.8h\n"
+ "usubl v17.8h, v17.8b, v18.8b\n"
+ "smlal2 v19.4s, v16.8h, v6.8h\n"
+ "ldr d6, [x20, x3]\n"
+ "smlal v20.4s, v3.4h, v14.4h\n"
+ "usubl v6.8h, v6.8b, v18.8b\n"
+ "smlal v24.4s, v16.4h, v14.4h\n"
+ "smlal v23.4s, v4.4h, v14.4h\n"
+ "ssubl v27.8h, v27.8b, v13.8b\n"
+ "ldr x20, [x5, #0x68]\n"
+ "smlal2 v15.4s, v3.8h, v10.8h\n"
+ "smlal v7.4s, v25.4h, v21.4h\n"
+ "ssubl v1.8h, v1.8b, v13.8b\n"
+ "ldr x22, [x5, #0x70]\n"
+ "smlal2 v5.4s, v3.8h, v14.8h\n"
+ "ldr d3, [x21, x3]\n"
+ "smlal2 v22.4s, v16.8h, v14.8h\n"
+ "usubl v3.8h, v3.8b, v18.8b\n"
+ "smlal2 v19.4s, v4.8h, v14.8h\n"
+ "ldr d14, [x20, x3]\n"
+ "smlal v20.4s, v25.4h, v10.4h\n"
+ "usubl v14.8h, v14.8b, v18.8b\n"
+ "smlal v24.4s, v4.4h, v10.4h\n"
+ "smlal v23.4s, v17.4h, v10.4h\n"
+ "ssubl v2.8h, v2.8b, v13.8b\n"
+ "ldr x21, [x5, #0x78]\n"
+ "smlal2 v15.4s, v25.8h, v21.8h\n"
+ "smlal v7.4s, v9.4h, v12.4h\n"
+ "ssubl v31.8h, v31.8b, v13.8b\n"
+ "ldr x20, [x5, #0x80]\n"
+ "smlal2 v5.4s, v25.8h, v10.8h\n"
+ "ldr d25, [x22, x3]\n"
+ "smlal2 v22.4s, v4.8h, v10.8h\n"
+ "usubl v25.8h, v25.8b, v18.8b\n"
+ "smlal2 v19.4s, v17.8h, v10.8h\n"
+ "ldr d10, [x21, x3]\n"
+ "smlal v20.4s, v9.4h, v21.4h\n"
+ "usubl v10.8h, v10.8b, v18.8b\n"
+ "smlal v24.4s, v17.4h, v21.4h\n"
+ "smlal v23.4s, v6.4h, v21.4h\n"
+ "ssubl v8.8h, v8.8b, v13.8b\n"
+ "ldr x24, [x5, #0x88]\n"
+ "smlal2 v15.4s, v9.8h, v12.8h\n"
+ "smlal v7.4s, v30.4h, v27.4h\n"
+ "ldr x23, [x5, #0x90]\n"
+ "ldr x22, [x5, #0x98]\n"
+ "smlal2 v5.4s, v9.8h, v21.8h\n"
+ "ldr d9, [x20, x3]\n"
+ "smlal2 v22.4s, v17.8h, v21.8h\n"
+ "usubl v9.8h, v9.8b, v18.8b\n"
+ "smlal2 v19.4s, v6.8h, v21.8h\n"
+ "ldr d21, [x6, #0x50]\n"
+ "smlal v20.4s, v3.4h, v12.4h\n"
+ "ssubl v21.8h, v21.8b, v13.8b\n"
+ "smlal v24.4s, v6.4h, v12.4h\n"
+ "smlal v23.4s, v29.4h, v12.4h\n"
+ "ldr x21, [x5, #0xa0]\n"
+ "ldr x20, [x5, #0xa8]\n"
+ "smlal2 v15.4s, v30.8h, v27.8h\n"
+ "ldr d30, [x24, x3]\n"
+ "smlal v7.4s, v16.4h, v1.4h\n"
+ "usubl v30.8h, v30.8b, v18.8b\n"
+ "smlal2 v5.4s, v3.8h, v12.8h\n"
+ "ldr d3, [x6, #0x58]\n"
+ "smlal2 v22.4s, v6.8h, v12.8h\n"
+ "ssubl v3.8h, v3.8b, v13.8b\n"
+ "smlal2 v19.4s, v29.8h, v12.8h\n"
+ "ldr d12, [x23, x3]\n"
+ "smlal v20.4s, v16.4h, v27.4h\n"
+ "usubl v12.8h, v12.8b, v18.8b\n"
+ "smlal v24.4s, v28.4h, v27.4h\n"
+ "smlal v23.4s, v14.4h, v27.4h\n"
+ "ldr x13, [x5, #0xb0]\n"
+ "ldr x12, [x5, #0xb8]\n"
+ "smlal2 v15.4s, v16.8h, v1.8h\n"
+ "smlal v7.4s, v4.4h, v2.4h\n"
+ "ldr x11, [x5, #0xc0]\n"
+ "ldr x10, [x5, #0xc8]\n"
+ "smlal2 v5.4s, v16.8h, v27.8h\n"
+ "ldr d16, [x22, x3]\n"
+ "smlal2 v22.4s, v28.8h, v27.8h\n"
+ "usubl v16.8h, v16.8b, v18.8b\n"
+ "smlal2 v19.4s, v14.8h, v27.8h\n"
+ "ldr d27, [x6, #0x60]\n"
+ "smlal v20.4s, v4.4h, v1.4h\n"
+ "ssubl v27.8h, v27.8b, v13.8b\n"
+ "smlal v24.4s, v14.4h, v1.4h\n"
+ "smlal v23.4s, v25.4h, v1.4h\n"
+ "ldr x9, [x5, #0xd0]\n"
+ "ldr x28, [x5, #0xd8]\n"
+ "smlal2 v15.4s, v4.8h, v2.8h\n"
+ "smlal v7.4s, v17.4h, v31.4h\n"
+ "ldr x27, [x5, #0xe0]\n"
+ "ldr x26, [x5, #0xe8]\n"
+ "smlal2 v5.4s, v4.8h, v1.8h\n"
+ "ldr d4, [x21, x3]\n"
+ "smlal2 v22.4s, v14.8h, v1.8h\n"
+ "usubl v4.8h, v4.8b, v18.8b\n"
+ "smlal2 v19.4s, v25.8h, v1.8h\n"
+ "ldr d1, [x6, #0x68]\n"
+ "smlal v20.4s, v17.4h, v2.4h\n"
+ "ssubl v1.8h, v1.8b, v13.8b\n"
+ "smlal v24.4s, v25.4h, v2.4h\n"
+ "smlal v23.4s, v10.4h, v2.4h\n"
+ "ldr x25, [x5, #0xf0]\n"
+ "ldr x24, [x5, #0xf8]\n"
+ "smlal2 v15.4s, v17.8h, v31.8h\n"
+ "smlal v7.4s, v6.4h, v8.4h\n"
+ "ldr x23, [x5, #0x100]\n"
+ "ldr x22, [x5, #0x108]\n"
+ "smlal2 v5.4s, v17.8h, v2.8h\n"
+ "ldr d17, [x20, x3]\n"
+ "smlal2 v22.4s, v25.8h, v2.8h\n"
+ "usubl v17.8h, v17.8b, v18.8b\n"
+ "smlal2 v19.4s, v10.8h, v2.8h\n"
+ "ldr d2, [x6, #0x70]\n"
+ "smlal v20.4s, v6.4h, v31.4h\n"
+ "ssubl v2.8h, v2.8b, v13.8b\n"
+ "smlal v24.4s, v10.4h, v31.4h\n"
+ "smlal v23.4s, v9.4h, v31.4h\n"
+ "ldr x21, [x5, #0x110]\n"
+ "ldr x20, [x5, #0x118]\n"
+ "smlal2 v15.4s, v6.8h, v8.8h\n"
+ "smlal v7.4s, v28.4h, v21.4h\n"
+ "tst x1, #0x7\n"
+ "smlal2 v5.4s, v6.8h, v31.8h\n"
+ "ldr d6, [x13, x3]\n"
+ "smlal2 v22.4s, v10.8h, v31.8h\n"
+ "usubl v6.8h, v6.8b, v18.8b\n"
+ "smlal2 v19.4s, v9.8h, v31.8h\n"
+ "ldr d31, [x6, #0x78]\n"
+ "smlal v20.4s, v29.4h, v8.4h\n"
+ "ssubl v31.8h, v31.8b, v13.8b\n"
+ "smlal v24.4s, v9.4h, v8.4h\n"
+ "smlal v23.4s, v30.4h, v8.4h\n"
+ "smlal2 v15.4s, v28.8h, v21.8h\n"
+ "ldr d28, [x12, x3]\n"
+ "smlal v7.4s, v14.4h, v3.4h\n"
+ "usubl v28.8h, v28.8b, v18.8b\n"
+ "smlal2 v5.4s, v29.8h, v8.8h\n"
+ "ldr d29, [x6, #0x80]\n"
+ "smlal2 v22.4s, v9.8h, v8.8h\n"
+ "ssubl v29.8h, v29.8b, v13.8b\n"
+ "smlal2 v19.4s, v30.8h, v8.8h\n"
+ "ldr d8, [x11, x3]\n"
+ "smlal v20.4s, v14.4h, v21.4h\n"
+ "usubl v8.8h, v8.8b, v18.8b\n"
+ "smlal v24.4s, v12.4h, v21.4h\n"
+ "smlal v23.4s, v16.4h, v21.4h\n"
+ "smlal2 v15.4s, v14.8h, v3.8h\n"
+ "smlal v7.4s, v25.4h, v27.4h\n"
+ "smlal2 v5.4s, v14.8h, v21.8h\n"
+ "ldr d14, [x10, x3]\n"
+ "smlal2 v22.4s, v12.8h, v21.8h\n"
+ "usubl v14.8h, v14.8b, v18.8b\n"
+ "smlal2 v19.4s, v16.8h, v21.8h\n"
+ "ldr d21, [x6, #0x88]\n"
+ "smlal v20.4s, v25.4h, v3.4h\n"
+ "ssubl v21.8h, v21.8b, v13.8b\n"
+ "smlal v24.4s, v16.4h, v3.4h\n"
+ "smlal v23.4s, v4.4h, v3.4h\n"
+ "smlal2 v15.4s, v25.8h, v27.8h\n"
+ "smlal v7.4s, v10.4h, v1.4h\n"
+ "smlal2 v5.4s, v25.8h, v3.8h\n"
+ "ldr d25, [x9, x3]\n"
+ "smlal2 v22.4s, v16.8h, v3.8h\n"
+ "usubl v25.8h, v25.8b, v18.8b\n"
+ "smlal2 v19.4s, v4.8h, v3.8h\n"
+ "ldr d3, [x6, #0x90]\n"
+ "smlal v20.4s, v10.4h, v27.4h\n"
+ "ssubl v3.8h, v3.8b, v13.8b\n"
+ "smlal v24.4s, v4.4h, v27.4h\n"
+ "smlal v23.4s, v17.4h, v27.4h\n"
+ "smlal2 v15.4s, v10.8h, v1.8h\n"
+ "smlal v7.4s, v9.4h, v2.4h\n"
+ "smlal2 v5.4s, v10.8h, v27.8h\n"
+ "ldr d10, [x28, x3]\n"
+ "smlal2 v22.4s, v4.8h, v27.8h\n"
+ "usubl v10.8h, v10.8b, v18.8b\n"
+ "smlal2 v19.4s, v17.8h, v27.8h\n"
+ "ldr d27, [x6, #0x98]\n"
+ "smlal v20.4s, v9.4h, v1.4h\n"
+ "ssubl v27.8h, v27.8b, v13.8b\n"
+ "smlal v24.4s, v17.4h, v1.4h\n"
+ "smlal v23.4s, v6.4h, v1.4h\n"
+ "smlal2 v15.4s, v9.8h, v2.8h\n"
+ "smlal v7.4s, v12.4h, v31.4h\n"
+ "smlal2 v5.4s, v9.8h, v1.8h\n"
+ "ldr d9, [x27, x3]\n"
+ "smlal2 v22.4s, v17.8h, v1.8h\n"
+ "usubl v9.8h, v9.8b, v18.8b\n"
+ "smlal2 v19.4s, v6.8h, v1.8h\n"
+ "ldr d1, [x6, #0xa0]\n"
+ "smlal v20.4s, v30.4h, v2.4h\n"
+ "ssubl v1.8h, v1.8b, v13.8b\n"
+ "smlal v24.4s, v6.4h, v2.4h\n"
+ "smlal v23.4s, v28.4h, v2.4h\n"
+ "smlal2 v15.4s, v12.8h, v31.8h\n"
+ "ldr d12, [x26, x3]\n"
+ "smlal v7.4s, v16.4h, v29.4h\n"
+ "usubl v12.8h, v12.8b, v18.8b\n"
+ "smlal2 v5.4s, v30.8h, v2.8h\n"
+ "ldr d30, [x6, #0xa8]\n"
+ "smlal2 v22.4s, v6.8h, v2.8h\n"
+ "ssubl v30.8h, v30.8b, v13.8b\n"
+ "smlal2 v19.4s, v28.8h, v2.8h\n"
+ "ldr d2, [x25, x3]\n"
+ "smlal v20.4s, v16.4h, v31.4h\n"
+ "usubl v2.8h, v2.8b, v18.8b\n"
+ "smlal v24.4s, v8.4h, v31.4h\n"
+ "smlal v23.4s, v14.4h, v31.4h\n"
+ "smlal2 v15.4s, v16.8h, v29.8h\n"
+ "smlal v7.4s, v4.4h, v21.4h\n"
+ "smlal2 v5.4s, v16.8h, v31.8h\n"
+ "ldr d16, [x24, x3]\n"
+ "smlal2 v22.4s, v8.8h, v31.8h\n"
+ "usubl v16.8h, v16.8b, v18.8b\n"
+ "smlal2 v19.4s, v14.8h, v31.8h\n"
+ "ldr d31, [x6, #0xb0]\n"
+ "smlal v20.4s, v4.4h, v29.4h\n"
+ "ssubl v31.8h, v31.8b, v13.8b\n"
+ "smlal v24.4s, v14.4h, v29.4h\n"
+ "smlal v23.4s, v25.4h, v29.4h\n"
+ "smlal2 v15.4s, v4.8h, v21.8h\n"
+ "smlal v7.4s, v17.4h, v3.4h\n"
+ "smlal2 v5.4s, v4.8h, v29.8h\n"
+ "ldr d4, [x23, x3]\n"
+ "smlal2 v22.4s, v14.8h, v29.8h\n"
+ "usubl v4.8h, v4.8b, v18.8b\n"
+ "smlal2 v19.4s, v25.8h, v29.8h\n"
+ "ldr d29, [x6, #0xb8]\n"
+ "smlal v20.4s, v17.4h, v21.4h\n"
+ "ssubl v29.8h, v29.8b, v13.8b\n"
+ "smlal v24.4s, v25.4h, v21.4h\n"
+ "smlal v23.4s, v10.4h, v21.4h\n"
+ "smlal2 v15.4s, v17.8h, v3.8h\n"
+ "smlal v7.4s, v6.4h, v27.4h\n"
+ "smlal2 v5.4s, v17.8h, v21.8h\n"
+ "ldr d17, [x22, x3]\n"
+ "smlal2 v22.4s, v25.8h, v21.8h\n"
+ "usubl v17.8h, v17.8b, v18.8b\n"
+ "smlal2 v19.4s, v10.8h, v21.8h\n"
+ "ldr d21, [x6, #0xc0]\n"
+ "smlal v20.4s, v6.4h, v3.4h\n"
+ "ssubl v21.8h, v21.8b, v13.8b\n"
+ "smlal v24.4s, v10.4h, v3.4h\n"
+ "smlal v23.4s, v9.4h, v3.4h\n"
+ "smlal2 v15.4s, v6.8h, v27.8h\n"
+ "smlal v7.4s, v8.4h, v1.4h\n"
+ "smlal2 v5.4s, v6.8h, v3.8h\n"
+ "ldr d6, [x21, x3]\n"
+ "smlal2 v22.4s, v10.8h, v3.8h\n"
+ "usubl v6.8h, v6.8b, v18.8b\n"
+ "smlal2 v19.4s, v9.8h, v3.8h\n"
+ "ldr d3, [x20, x3]\n"
+ "smlal v20.4s, v28.4h, v27.4h\n"
+ "usubl v3.8h, v3.8b, v18.8b\n"
+ "smlal v24.4s, v9.4h, v27.4h\n"
+ "smlal v23.4s, v12.4h, v27.4h\n"
+ "add x3, x3, #0x8\n"
+ "smlal2 v15.4s, v8.8h, v1.8h\n"
+ "ldr q8, [x7, #0x0]\n"
+ "smlal v7.4s, v14.4h, v30.4h\n"
+ "smlal2 v5.4s, v28.8h, v27.8h\n"
+ "ldr q28, [x8, #0x0]\n"
+ "smlal2 v22.4s, v9.8h, v27.8h\n"
+ "smlal2 v19.4s, v12.8h, v27.8h\n"
+ "ldr q27, [x7, #0x10]\n"
+ "smlal v20.4s, v14.4h, v1.4h\n"
+ "add x7, x7, #0x20\n"
+ "smlal v24.4s, v2.4h, v1.4h\n"
+ "smlal v23.4s, v16.4h, v1.4h\n"
+ "smlal2 v15.4s, v14.8h, v30.8h\n"
+ "smlal v7.4s, v25.4h, v31.4h\n"
+ "smlal2 v5.4s, v14.8h, v1.8h\n"
+ "ldr q14, [x8, #0x10]\n"
+ "smlal2 v22.4s, v2.8h, v1.8h\n"
+ "add x8, x8, #0x20\n"
+ "smlal2 v19.4s, v16.8h, v1.8h\n"
+ "smlal v20.4s, v25.4h, v30.4h\n"
+ "smlal v24.4s, v16.4h, v30.4h\n"
+ "smlal v23.4s, v4.4h, v30.4h\n"
+ "smlal2 v15.4s, v25.8h, v31.8h\n"
+ "smlal v7.4s, v10.4h, v29.4h\n"
+ "smlal2 v5.4s, v25.8h, v30.8h\n"
+ "smlal2 v22.4s, v16.8h, v30.8h\n"
+ "smlal2 v19.4s, v4.8h, v30.8h\n"
+ "smlal v20.4s, v10.4h, v31.4h\n"
+ "smlal v24.4s, v4.4h, v31.4h\n"
+ "smlal v23.4s, v17.4h, v31.4h\n"
+ "smlal2 v15.4s, v10.8h, v29.8h\n"
+ "smlal v7.4s, v9.4h, v21.4h\n"
+ "sqrdmulh v7.4s, v7.4s, v8.4s\n"
+ "smlal2 v5.4s, v10.8h, v31.8h\n"
+ "smlal2 v22.4s, v4.8h, v31.8h\n"
+ "and v4.16b, v7.16b, v28.16b\n"
+ "smlal2 v19.4s, v17.8h, v31.8h\n"
+ "smlal v20.4s, v9.4h, v29.4h\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "smlal v24.4s, v17.4h, v29.4h\n"
+ "smlal v23.4s, v6.4h, v29.4h\n"
+ "sqadd v7.4s, v7.4s, v4.4s\n"
+ "smlal2 v15.4s, v9.8h, v21.8h\n"
+ "smlal2 v5.4s, v9.8h, v29.8h\n"
+ "sqrdmulh v15.4s, v15.4s, v27.4s\n"
+ "smlal2 v22.4s, v17.8h, v29.8h\n"
+ "smlal2 v19.4s, v6.8h, v29.8h\n"
+ "and v30.16b, v15.16b, v14.16b\n"
+ "smlal v20.4s, v12.4h, v21.4h\n"
+ "smlal v24.4s, v6.4h, v21.4h\n"
+ "sqrdmulh v20.4s, v20.4s, v8.4s\n"
+ "smlal v23.4s, v3.4h, v21.4h\n"
+ "smlal2 v5.4s, v12.8h, v21.8h\n"
+ "sqrdmulh v24.4s, v24.4s, v8.4s\n"
+ "smlal2 v22.4s, v6.8h, v21.8h\n"
+ "smlal2 v19.4s, v3.8h, v21.8h\n"
+ "sqrdmulh v23.4s, v23.4s, v8.4s\n"
+ "sshr v30.4s, v30.4s, #0x1f\n"
+ "and v3.16b, v20.16b, v28.16b\n"
+ "sqrdmulh v5.4s, v5.4s, v27.4s\n"
+ "and v25.16b, v24.16b, v28.16b\n"
+ "sqrdmulh v22.4s, v22.4s, v27.4s\n"
+ "and v16.16b, v23.16b, v28.16b\n"
+ "sqrdmulh v19.4s, v19.4s, v27.4s\n"
+ "sqadd v15.4s, v15.4s, v30.4s\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "and v4.16b, v5.16b, v14.16b\n"
+ "sshr v25.4s, v25.4s, #0x1f\n"
+ "and v10.16b, v22.16b, v14.16b\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "and v12.16b, v19.16b, v14.16b\n"
+ "sqadd v20.4s, v20.4s, v3.4s\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sqadd v24.4s, v24.4s, v25.4s\n"
+ "sshr v10.4s, v10.4s, #0x1f\n"
+ "sqadd v23.4s, v23.4s, v16.4s\n"
+ "sshr v12.4s, v12.4s, #0x1f\n"
+ "srshl v7.4s, v7.4s, v28.4s\n"
+ "srshl v20.4s, v20.4s, v28.4s\n"
+ "sqadd v5.4s, v5.4s, v4.4s\n"
+ "srshl v24.4s, v24.4s, v28.4s\n"
+ "sqadd v22.4s, v22.4s, v10.4s\n"
+ "srshl v23.4s, v23.4s, v28.4s\n"
+ "sqadd v19.4s, v19.4s, v12.4s\n"
+ "srshl v15.4s, v15.4s, v14.4s\n"
+ "sqxtn v7.4h, v7.4s\n"
+ "srshl v5.4s, v5.4s, v14.4s\n"
+ "sqxtn v20.4h, v20.4s\n"
+ "srshl v22.4s, v22.4s, v14.4s\n"
+ "sqxtn v24.4h, v24.4s\n"
+ "srshl v19.4s, v19.4s, v14.4s\n"
+ "sqxtn v23.4h, v23.4s\n"
+ "sqxtn2 v7.8h, v15.4s\n"
+ "sqxtn2 v20.8h, v5.4s\n"
+ "sqxtn2 v24.8h, v22.4s\n"
+ "sqxtn2 v23.8h, v19.4s\n"
+ "sqadd v7.8h, v7.8h, v26.8h\n"
+ "sqadd v20.8h, v20.8h, v26.8h\n"
+ "sqadd v24.8h, v24.8h, v26.8h\n"
+ "sqadd v23.8h, v23.8h, v26.8h\n"
+ "smax v7.8h, v7.8h, v11.8h\n"
+ "smax v20.8h, v20.8h, v11.8h\n"
+ "smax v24.8h, v24.8h, v11.8h\n"
+ "smax v23.8h, v23.8h, v11.8h\n"
+ "smin v7.8h, v7.8h, v0.8h\n"
+ "smin v20.8h, v20.8h, v0.8h\n"
+ "smin v24.8h, v24.8h, v0.8h\n"
+ "smin v23.8h, v23.8h, v0.8h\n"
+ "uzp1 v7.16b, v7.16b, v7.16b\n"
+ "str d7, [x17, x4]\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "str d20, [x16, x4]\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "str d24, [x15, x4]\n"
+ "str d23, [x14, x4]\n"
+ "add x4, x4, #0x8\n"
+ "beq 124f\n"
+ "add x6, x6, #0xc8\n"
+ "3:" // Oddments
+ "ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "tbz x1, #2, 5f\n"
+ "ld1 { v7.4s }, [x20], #0x10\n"
+ "tbz x1, #1, 4f\n"
+ "ld1 { v15.d }[0], [x20], #0x8\n"
+ "tbz x1, #0, 7f\n"
+ "ld1 { v15.s }[2], [x20]\n"
+ "b 7f\n"
+ "4:" // Oddments: Load bias: Bit 2: Bit 1: Unset
+ "tbz x1, #0, 7f\n"
+ "ld1 { v15.s }[0], [x20]\n"
+ "b 7f\n"
+ "5:" // Oddments: Load bias: Bit 2: Unset
+ "tbz x1, #1, 6f\n"
+ "ld1 { v7.d }[0], [x20], #0x8\n"
+ "tbz x1, #0, 7f\n"
+ "ld1 { v7.s }[2], [x20]\n"
+ "b 7f\n"
+ "6:" // Oddments: Load bias: Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 7f\n"
+ "ld1 { v7.s }[0], [x20]\n"
+ "7:" // Oddments: Load bias: Bit 2: End
+ "ldr d6, [x6, #0x0]\n"
+ "ldr d14, [x6, #0x8]\n"
+ "mov v20.16b, v7.16b\n"
+ "mov v5.16b, v15.16b\n"
+ "ldr d10, [x6, #0x10]\n"
+ "ldr d21, [x6, #0x18]\n"
+ "mov v24.16b, v7.16b\n"
+ "mov v22.16b, v15.16b\n"
+ "ldr d12, [x6, #0x20]\n"
+ "ldp x9, x28, [x5, #0x0]\n"
+ "mov v23.16b, v7.16b\n"
+ "mov v19.16b, v15.16b\n"
+ "ldp x27, x26, [x5, #0x10]\n"
+ "ldp x25, x24, [x5, #0x20]\n"
+ "ssubl v6.8h, v6.8b, v13.8b\n"
+ "ssubl v14.8h, v14.8b, v13.8b\n"
+ "ldp x23, x22, [x5, #0x30]\n"
+ "ldp x21, x20, [x5, #0x40]\n"
+ "ssubl v10.8h, v10.8b, v13.8b\n"
+ "ssubl v21.8h, v21.8b, v13.8b\n"
+ "ssubl v12.8h, v12.8b, v13.8b\n"
+ "add x9, x9, x3\n"
+ "add x28, x28, x3\n"
+ "add x27, x27, x3\n"
+ "add x26, x26, x3\n"
+ "add x25, x25, x3\n"
+ "add x24, x24, x3\n"
+ "add x23, x23, x3\n"
+ "add x22, x22, x3\n"
+ "add x21, x21, x3\n"
+ "add x20, x20, x3\n"
+ "tbz x1, #2, 9f\n"
+ "ld1 { v31.s }[0], [x9], #0x4\n"
+ "ld1 { v17.s }[0], [x28], #0x4\n"
+ "ld1 { v30.s }[0], [x27], #0x4\n"
+ "ld1 { v16.s }[0], [x26], #0x4\n"
+ "ld1 { v3.s }[0], [x25], #0x4\n"
+ "ld1 { v4.s }[0], [x24], #0x4\n"
+ "ld1 { v25.s }[0], [x23], #0x4\n"
+ "ld1 { v9.s }[0], [x22], #0x4\n"
+ "ld1 { v29.s }[0], [x21], #0x4\n"
+ "ld1 { v28.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 8f\n"
+ "ld1 { v31.h }[2], [x9], #0x2\n"
+ "ld1 { v17.h }[2], [x28], #0x2\n"
+ "ld1 { v30.h }[2], [x27], #0x2\n"
+ "ld1 { v16.h }[2], [x26], #0x2\n"
+ "ld1 { v3.h }[2], [x25], #0x2\n"
+ "ld1 { v4.h }[2], [x24], #0x2\n"
+ "ld1 { v25.h }[2], [x23], #0x2\n"
+ "ld1 { v9.h }[2], [x22], #0x2\n"
+ "ld1 { v29.h }[2], [x21], #0x2\n"
+ "ld1 { v28.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 11f\n"
+ "ld1 { v31.b }[6], [x9]\n"
+ "ld1 { v17.b }[6], [x28]\n"
+ "ld1 { v30.b }[6], [x27]\n"
+ "ld1 { v16.b }[6], [x26]\n"
+ "ld1 { v3.b }[6], [x25]\n"
+ "ld1 { v4.b }[6], [x24]\n"
+ "ld1 { v25.b }[6], [x23]\n"
+ "ld1 { v9.b }[6], [x22]\n"
+ "ld1 { v29.b }[6], [x21]\n"
+ "ld1 { v28.b }[6], [x20]\n"
+ "b 11f\n"
+ "8:" // Oddments: Initial loads: Bit 2: Bit 1: Unset
+ "tbz x1, #0, 11f\n"
+ "ld1 { v31.b }[4], [x9]\n"
+ "ld1 { v17.b }[4], [x28]\n"
+ "ld1 { v30.b }[4], [x27]\n"
+ "ld1 { v16.b }[4], [x26]\n"
+ "ld1 { v3.b }[4], [x25]\n"
+ "ld1 { v4.b }[4], [x24]\n"
+ "ld1 { v25.b }[4], [x23]\n"
+ "ld1 { v9.b }[4], [x22]\n"
+ "ld1 { v29.b }[4], [x21]\n"
+ "ld1 { v28.b }[4], [x20]\n"
+ "b 11f\n"
+ "9:" // Oddments: Initial loads: Bit 2: Unset
+ "tbz x1, #1, 10f\n"
+ "ld1 { v31.h }[0], [x9], #0x2\n"
+ "ld1 { v17.h }[0], [x28], #0x2\n"
+ "ld1 { v30.h }[0], [x27], #0x2\n"
+ "ld1 { v16.h }[0], [x26], #0x2\n"
+ "ld1 { v3.h }[0], [x25], #0x2\n"
+ "ld1 { v4.h }[0], [x24], #0x2\n"
+ "ld1 { v25.h }[0], [x23], #0x2\n"
+ "ld1 { v9.h }[0], [x22], #0x2\n"
+ "ld1 { v29.h }[0], [x21], #0x2\n"
+ "ld1 { v28.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 11f\n"
+ "ld1 { v31.b }[2], [x9]\n"
+ "ld1 { v17.b }[2], [x28]\n"
+ "ld1 { v30.b }[2], [x27]\n"
+ "ld1 { v16.b }[2], [x26]\n"
+ "ld1 { v3.b }[2], [x25]\n"
+ "ld1 { v4.b }[2], [x24]\n"
+ "ld1 { v25.b }[2], [x23]\n"
+ "ld1 { v9.b }[2], [x22]\n"
+ "ld1 { v29.b }[2], [x21]\n"
+ "ld1 { v28.b }[2], [x20]\n"
+ "b 11f\n"
+ "10:" // Oddments: Initial loads: Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 11f\n"
+ "ld1 { v31.b }[0], [x9]\n"
+ "ld1 { v17.b }[0], [x28]\n"
+ "ld1 { v30.b }[0], [x27]\n"
+ "ld1 { v16.b }[0], [x26]\n"
+ "ld1 { v3.b }[0], [x25]\n"
+ "ld1 { v4.b }[0], [x24]\n"
+ "ld1 { v25.b }[0], [x23]\n"
+ "ld1 { v9.b }[0], [x22]\n"
+ "ld1 { v29.b }[0], [x21]\n"
+ "ld1 { v28.b }[0], [x20]\n"
+ "11:" // Oddments: Initial loads: Bit 2: End
+ "usubl v31.8h, v31.8b, v18.8b\n"
+ "usubl v17.8h, v17.8b, v18.8b\n"
+ "smlal v7.4s, v31.4h, v6.4h\n"
+ "ldr x20, [x5, #0x50]\n"
+ "usubl v30.8h, v30.8b, v18.8b\n"
+ "smlal2 v15.4s, v31.8h, v6.8h\n"
+ "smlal v20.4s, v17.4h, v6.4h\n"
+ "smlal2 v5.4s, v17.8h, v6.8h\n"
+ "smlal v24.4s, v30.4h, v6.4h\n"
+ "usubl v16.8h, v16.8b, v18.8b\n"
+ "add x20, x20, x3\n"
+ "smlal2 v22.4s, v30.8h, v6.8h\n"
+ "usubl v3.8h, v3.8b, v18.8b\n"
+ "smlal v23.4s, v16.4h, v6.4h\n"
+ "smlal2 v19.4s, v16.8h, v6.8h\n"
+ "smlal v7.4s, v17.4h, v14.4h\n"
+ "usubl v4.8h, v4.8b, v18.8b\n"
+ "smlal2 v15.4s, v17.8h, v14.8h\n"
+ "smlal v20.4s, v3.4h, v14.4h\n"
+ "usubl v25.8h, v25.8b, v18.8b\n"
+ "smlal2 v5.4s, v3.8h, v14.8h\n"
+ "smlal v24.4s, v16.4h, v14.4h\n"
+ "usubl v9.8h, v9.8b, v18.8b\n"
+ "smlal2 v22.4s, v16.8h, v14.8h\n"
+ "usubl v29.8h, v29.8b, v18.8b\n"
+ "smlal v23.4s, v4.4h, v14.4h\n"
+ "usubl v28.8h, v28.8b, v18.8b\n"
+ "smlal2 v19.4s, v4.8h, v14.8h\n"
+ "smlal v7.4s, v3.4h, v10.4h\n"
+ "smlal2 v15.4s, v3.8h, v10.8h\n"
+ "smlal v20.4s, v25.4h, v10.4h\n"
+ "smlal2 v5.4s, v25.8h, v10.8h\n"
+ "smlal v24.4s, v4.4h, v10.4h\n"
+ "smlal2 v22.4s, v4.8h, v10.8h\n"
+ "tbz x1, #2, 13f\n"
+ "ld1 { v27.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 12f\n"
+ "ld1 { v27.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 15f\n"
+ "ld1 { v27.b }[6], [x20]\n"
+ "b 15f\n"
+ "12:" // Oddments: Load (1, 3): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 15f\n"
+ "ld1 { v27.b }[4], [x20]\n"
+ "b 15f\n"
+ "13:" // Oddments: Load (1, 3): Bit 2: Unset
+ "tbz x1, #1, 14f\n"
+ "ld1 { v27.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 15f\n"
+ "ld1 { v27.b }[2], [x20]\n"
+ "b 15f\n"
+ "14:" // Oddments: Load (1, 3): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 15f\n"
+ "ld1 { v27.b }[0], [x20]\n"
+ "15:" // Oddments: Load (1, 3): Bit 2: End
+ "usubl v27.8h, v27.8b, v18.8b\n"
+ "ldr x20, [x5, #0x58]\n"
+ "smlal v23.4s, v27.4h, v10.4h\n"
+ "smlal2 v19.4s, v27.8h, v10.8h\n"
+ "smlal v7.4s, v25.4h, v21.4h\n"
+ "smlal2 v15.4s, v25.8h, v21.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v20.4s, v9.4h, v21.4h\n"
+ "smlal2 v5.4s, v9.8h, v21.8h\n"
+ "smlal v24.4s, v27.4h, v21.4h\n"
+ "smlal2 v22.4s, v27.8h, v21.8h\n"
+ "tbz x1, #2, 17f\n"
+ "ld1 { v6.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 16f\n"
+ "ld1 { v6.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 19f\n"
+ "ld1 { v6.b }[6], [x20]\n"
+ "b 19f\n"
+ "16:" // Oddments: Load (1, 4): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 19f\n"
+ "ld1 { v6.b }[4], [x20]\n"
+ "b 19f\n"
+ "17:" // Oddments: Load (1, 4): Bit 2: Unset
+ "tbz x1, #1, 18f\n"
+ "ld1 { v6.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 19f\n"
+ "ld1 { v6.b }[2], [x20]\n"
+ "b 19f\n"
+ "18:" // Oddments: Load (1, 4): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 19f\n"
+ "ld1 { v6.b }[0], [x20]\n"
+ "19:" // Oddments: Load (1, 4): Bit 2: End
+ "usubl v6.8h, v6.8b, v18.8b\n"
+ "ldr x20, [x5, #0x60]\n"
+ "smlal v23.4s, v6.4h, v21.4h\n"
+ "smlal2 v19.4s, v6.8h, v21.8h\n"
+ "smlal v7.4s, v9.4h, v12.4h\n"
+ "smlal2 v15.4s, v9.8h, v12.8h\n"
+ "add x20, x20, x3\n"
+ "tbz x1, #2, 21f\n"
+ "ld1 { v9.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 20f\n"
+ "ld1 { v9.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 23f\n"
+ "ld1 { v9.b }[6], [x20]\n"
+ "b 23f\n"
+ "20:" // Oddments: Load (0, 5): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 23f\n"
+ "ld1 { v9.b }[4], [x20]\n"
+ "b 23f\n"
+ "21:" // Oddments: Load (0, 5): Bit 2: Unset
+ "tbz x1, #1, 22f\n"
+ "ld1 { v9.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 23f\n"
+ "ld1 { v9.b }[2], [x20]\n"
+ "b 23f\n"
+ "22:" // Oddments: Load (0, 5): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 23f\n"
+ "ld1 { v9.b }[0], [x20]\n"
+ "23:" // Oddments: Load (0, 5): Bit 2: End
+ "ldr d14, [x6, #0x28]\n"
+ "usubl v9.8h, v9.8b, v18.8b\n"
+ "smlal v20.4s, v9.4h, v12.4h\n"
+ "smlal2 v5.4s, v9.8h, v12.8h\n"
+ "smlal v24.4s, v6.4h, v12.4h\n"
+ "smlal2 v22.4s, v6.8h, v12.8h\n"
+ "ssubl v14.8h, v14.8b, v13.8b\n"
+ "ldr x20, [x5, #0x68]\n"
+ "smlal v23.4s, v29.4h, v12.4h\n"
+ "smlal2 v19.4s, v29.8h, v12.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v30.4h, v14.4h\n"
+ "smlal2 v15.4s, v30.8h, v14.8h\n"
+ "smlal v20.4s, v16.4h, v14.4h\n"
+ "smlal2 v5.4s, v16.8h, v14.8h\n"
+ "smlal v24.4s, v28.4h, v14.4h\n"
+ "smlal2 v22.4s, v28.8h, v14.8h\n"
+ "tbz x1, #2, 25f\n"
+ "ld1 { v25.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 24f\n"
+ "ld1 { v25.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 27f\n"
+ "ld1 { v25.b }[6], [x20]\n"
+ "b 27f\n"
+ "24:" // Oddments: Load (2, 1): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 27f\n"
+ "ld1 { v25.b }[4], [x20]\n"
+ "b 27f\n"
+ "25:" // Oddments: Load (2, 1): Bit 2: Unset
+ "tbz x1, #1, 26f\n"
+ "ld1 { v25.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 27f\n"
+ "ld1 { v25.b }[2], [x20]\n"
+ "b 27f\n"
+ "26:" // Oddments: Load (2, 1): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 27f\n"
+ "ld1 { v25.b }[0], [x20]\n"
+ "27:" // Oddments: Load (2, 1): Bit 2: End
+ "ldr d21, [x6, #0x30]\n"
+ "usubl v25.8h, v25.8b, v18.8b\n"
+ "ssubl v21.8h, v21.8b, v13.8b\n"
+ "ldr x20, [x5, #0x70]\n"
+ "smlal v23.4s, v25.4h, v14.4h\n"
+ "smlal2 v19.4s, v25.8h, v14.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v16.4h, v21.4h\n"
+ "smlal2 v15.4s, v16.8h, v21.8h\n"
+ "smlal v20.4s, v4.4h, v21.4h\n"
+ "smlal2 v5.4s, v4.8h, v21.8h\n"
+ "smlal v24.4s, v25.4h, v21.4h\n"
+ "smlal2 v22.4s, v25.8h, v21.8h\n"
+ "tbz x1, #2, 29f\n"
+ "ld1 { v10.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 28f\n"
+ "ld1 { v10.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 31f\n"
+ "ld1 { v10.b }[6], [x20]\n"
+ "b 31f\n"
+ "28:" // Oddments: Load (2, 2): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 31f\n"
+ "ld1 { v10.b }[4], [x20]\n"
+ "b 31f\n"
+ "29:" // Oddments: Load (2, 2): Bit 2: Unset
+ "tbz x1, #1, 30f\n"
+ "ld1 { v10.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 31f\n"
+ "ld1 { v10.b }[2], [x20]\n"
+ "b 31f\n"
+ "30:" // Oddments: Load (2, 2): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 31f\n"
+ "ld1 { v10.b }[0], [x20]\n"
+ "31:" // Oddments: Load (2, 2): Bit 2: End
+ "ldr d9, [x6, #0x38]\n"
+ "usubl v10.8h, v10.8b, v18.8b\n"
+ "ssubl v9.8h, v9.8b, v13.8b\n"
+ "ldr x20, [x5, #0x78]\n"
+ "smlal v23.4s, v10.4h, v21.4h\n"
+ "smlal2 v19.4s, v10.8h, v21.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v4.4h, v9.4h\n"
+ "smlal2 v15.4s, v4.8h, v9.8h\n"
+ "smlal v20.4s, v27.4h, v9.4h\n"
+ "smlal2 v5.4s, v27.8h, v9.8h\n"
+ "smlal v24.4s, v10.4h, v9.4h\n"
+ "smlal2 v22.4s, v10.8h, v9.8h\n"
+ "tbz x1, #2, 33f\n"
+ "ld1 { v12.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 32f\n"
+ "ld1 { v12.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 35f\n"
+ "ld1 { v12.b }[6], [x20]\n"
+ "b 35f\n"
+ "32:" // Oddments: Load (2, 3): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 35f\n"
+ "ld1 { v12.b }[4], [x20]\n"
+ "b 35f\n"
+ "33:" // Oddments: Load (2, 3): Bit 2: Unset
+ "tbz x1, #1, 34f\n"
+ "ld1 { v12.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 35f\n"
+ "ld1 { v12.b }[2], [x20]\n"
+ "b 35f\n"
+ "34:" // Oddments: Load (2, 3): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 35f\n"
+ "ld1 { v12.b }[0], [x20]\n"
+ "35:" // Oddments: Load (2, 3): Bit 2: End
+ "ldr d31, [x6, #0x40]\n"
+ "usubl v12.8h, v12.8b, v18.8b\n"
+ "ssubl v31.8h, v31.8b, v13.8b\n"
+ "ldr x20, [x5, #0x80]\n"
+ "smlal v23.4s, v12.4h, v9.4h\n"
+ "smlal2 v19.4s, v12.8h, v9.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v27.4h, v31.4h\n"
+ "smlal2 v15.4s, v27.8h, v31.8h\n"
+ "smlal v20.4s, v6.4h, v31.4h\n"
+ "smlal2 v5.4s, v6.8h, v31.8h\n"
+ "smlal v24.4s, v12.4h, v31.4h\n"
+ "smlal2 v22.4s, v12.8h, v31.8h\n"
+ "tbz x1, #2, 37f\n"
+ "ld1 { v8.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 36f\n"
+ "ld1 { v8.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 39f\n"
+ "ld1 { v8.b }[6], [x20]\n"
+ "b 39f\n"
+ "36:" // Oddments: Load (2, 4): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 39f\n"
+ "ld1 { v8.b }[4], [x20]\n"
+ "b 39f\n"
+ "37:" // Oddments: Load (2, 4): Bit 2: Unset
+ "tbz x1, #1, 38f\n"
+ "ld1 { v8.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 39f\n"
+ "ld1 { v8.b }[2], [x20]\n"
+ "b 39f\n"
+ "38:" // Oddments: Load (2, 4): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 39f\n"
+ "ld1 { v8.b }[0], [x20]\n"
+ "39:" // Oddments: Load (2, 4): Bit 2: End
+ "ldr d16, [x6, #0x48]\n"
+ "usubl v8.8h, v8.8b, v18.8b\n"
+ "ssubl v16.8h, v16.8b, v13.8b\n"
+ "ldr x20, [x5, #0x88]\n"
+ "smlal v23.4s, v8.4h, v31.4h\n"
+ "smlal2 v19.4s, v8.8h, v31.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v6.4h, v16.4h\n"
+ "smlal2 v15.4s, v6.8h, v16.8h\n"
+ "smlal v20.4s, v29.4h, v16.4h\n"
+ "smlal2 v5.4s, v29.8h, v16.8h\n"
+ "smlal v24.4s, v8.4h, v16.4h\n"
+ "smlal2 v22.4s, v8.8h, v16.8h\n"
+ "tbz x1, #2, 41f\n"
+ "ld1 { v27.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 40f\n"
+ "ld1 { v27.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 43f\n"
+ "ld1 { v27.b }[6], [x20]\n"
+ "b 43f\n"
+ "40:" // Oddments: Load (2, 5): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 43f\n"
+ "ld1 { v27.b }[4], [x20]\n"
+ "b 43f\n"
+ "41:" // Oddments: Load (2, 5): Bit 2: Unset
+ "tbz x1, #1, 42f\n"
+ "ld1 { v27.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 43f\n"
+ "ld1 { v27.b }[2], [x20]\n"
+ "b 43f\n"
+ "42:" // Oddments: Load (2, 5): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 43f\n"
+ "ld1 { v27.b }[0], [x20]\n"
+ "43:" // Oddments: Load (2, 5): Bit 2: End
+ "ldr d21, [x6, #0x50]\n"
+ "usubl v27.8h, v27.8b, v18.8b\n"
+ "ssubl v21.8h, v21.8b, v13.8b\n"
+ "ldr x20, [x5, #0x90]\n"
+ "smlal v23.4s, v27.4h, v16.4h\n"
+ "smlal2 v19.4s, v27.8h, v16.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v28.4h, v21.4h\n"
+ "smlal2 v15.4s, v28.8h, v21.8h\n"
+ "smlal v20.4s, v25.4h, v21.4h\n"
+ "smlal2 v5.4s, v25.8h, v21.8h\n"
+ "tbz x1, #2, 45f\n"
+ "ld1 { v31.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 44f\n"
+ "ld1 { v31.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 47f\n"
+ "ld1 { v31.b }[6], [x20]\n"
+ "b 47f\n"
+ "44:" // Oddments: Load (3, 0): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 47f\n"
+ "ld1 { v31.b }[4], [x20]\n"
+ "b 47f\n"
+ "45:" // Oddments: Load (3, 0): Bit 2: Unset
+ "tbz x1, #1, 46f\n"
+ "ld1 { v31.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 47f\n"
+ "ld1 { v31.b }[2], [x20]\n"
+ "b 47f\n"
+ "46:" // Oddments: Load (3, 0): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 47f\n"
+ "ld1 { v31.b }[0], [x20]\n"
+ "47:" // Oddments: Load (3, 0): Bit 2: End
+ "usubl v31.8h, v31.8b, v18.8b\n"
+ "ldr x20, [x5, #0x98]\n"
+ "smlal v24.4s, v31.4h, v21.4h\n"
+ "smlal2 v22.4s, v31.8h, v21.8h\n"
+ "add x20, x20, x3\n"
+ "tbz x1, #2, 49f\n"
+ "ld1 { v28.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 48f\n"
+ "ld1 { v28.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 51f\n"
+ "ld1 { v28.b }[6], [x20]\n"
+ "b 51f\n"
+ "48:" // Oddments: Load (3, 1): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 51f\n"
+ "ld1 { v28.b }[4], [x20]\n"
+ "b 51f\n"
+ "49:" // Oddments: Load (3, 1): Bit 2: Unset
+ "tbz x1, #1, 50f\n"
+ "ld1 { v28.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 51f\n"
+ "ld1 { v28.b }[2], [x20]\n"
+ "b 51f\n"
+ "50:" // Oddments: Load (3, 1): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 51f\n"
+ "ld1 { v28.b }[0], [x20]\n"
+ "51:" // Oddments: Load (3, 1): Bit 2: End
+ "ldr d2, [x6, #0x58]\n"
+ "usubl v28.8h, v28.8b, v18.8b\n"
+ "ssubl v2.8h, v2.8b, v13.8b\n"
+ "ldr x20, [x5, #0xa0]\n"
+ "smlal v23.4s, v28.4h, v21.4h\n"
+ "smlal2 v19.4s, v28.8h, v21.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v25.4h, v2.4h\n"
+ "smlal2 v15.4s, v25.8h, v2.8h\n"
+ "smlal v20.4s, v10.4h, v2.4h\n"
+ "smlal2 v5.4s, v10.8h, v2.8h\n"
+ "smlal v24.4s, v28.4h, v2.4h\n"
+ "smlal2 v22.4s, v28.8h, v2.8h\n"
+ "tbz x1, #2, 53f\n"
+ "ld1 { v21.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 52f\n"
+ "ld1 { v21.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 55f\n"
+ "ld1 { v21.b }[6], [x20]\n"
+ "b 55f\n"
+ "52:" // Oddments: Load (3, 2): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 55f\n"
+ "ld1 { v21.b }[4], [x20]\n"
+ "b 55f\n"
+ "53:" // Oddments: Load (3, 2): Bit 2: Unset
+ "tbz x1, #1, 54f\n"
+ "ld1 { v21.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 55f\n"
+ "ld1 { v21.b }[2], [x20]\n"
+ "b 55f\n"
+ "54:" // Oddments: Load (3, 2): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 55f\n"
+ "ld1 { v21.b }[0], [x20]\n"
+ "55:" // Oddments: Load (3, 2): Bit 2: End
+ "ldr d25, [x6, #0x60]\n"
+ "usubl v21.8h, v21.8b, v18.8b\n"
+ "ssubl v25.8h, v25.8b, v13.8b\n"
+ "ldr x20, [x5, #0xa8]\n"
+ "smlal v23.4s, v21.4h, v2.4h\n"
+ "smlal2 v19.4s, v21.8h, v2.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v10.4h, v25.4h\n"
+ "smlal2 v15.4s, v10.8h, v25.8h\n"
+ "smlal v20.4s, v12.4h, v25.4h\n"
+ "smlal2 v5.4s, v12.8h, v25.8h\n"
+ "smlal v24.4s, v21.4h, v25.4h\n"
+ "smlal2 v22.4s, v21.8h, v25.8h\n"
+ "tbz x1, #2, 57f\n"
+ "ld1 { v9.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 56f\n"
+ "ld1 { v9.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 59f\n"
+ "ld1 { v9.b }[6], [x20]\n"
+ "b 59f\n"
+ "56:" // Oddments: Load (3, 3): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 59f\n"
+ "ld1 { v9.b }[4], [x20]\n"
+ "b 59f\n"
+ "57:" // Oddments: Load (3, 3): Bit 2: Unset
+ "tbz x1, #1, 58f\n"
+ "ld1 { v9.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 59f\n"
+ "ld1 { v9.b }[2], [x20]\n"
+ "b 59f\n"
+ "58:" // Oddments: Load (3, 3): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 59f\n"
+ "ld1 { v9.b }[0], [x20]\n"
+ "59:" // Oddments: Load (3, 3): Bit 2: End
+ "ldr d1, [x6, #0x68]\n"
+ "usubl v9.8h, v9.8b, v18.8b\n"
+ "ssubl v1.8h, v1.8b, v13.8b\n"
+ "ldr x20, [x5, #0xb0]\n"
+ "smlal v23.4s, v9.4h, v25.4h\n"
+ "smlal2 v19.4s, v9.8h, v25.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v12.4h, v1.4h\n"
+ "smlal2 v15.4s, v12.8h, v1.8h\n"
+ "smlal v20.4s, v8.4h, v1.4h\n"
+ "smlal2 v5.4s, v8.8h, v1.8h\n"
+ "smlal v24.4s, v9.4h, v1.4h\n"
+ "smlal2 v22.4s, v9.8h, v1.8h\n"
+ "tbz x1, #2, 61f\n"
+ "ld1 { v3.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 60f\n"
+ "ld1 { v3.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 63f\n"
+ "ld1 { v3.b }[6], [x20]\n"
+ "b 63f\n"
+ "60:" // Oddments: Load (3, 4): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 63f\n"
+ "ld1 { v3.b }[4], [x20]\n"
+ "b 63f\n"
+ "61:" // Oddments: Load (3, 4): Bit 2: Unset
+ "tbz x1, #1, 62f\n"
+ "ld1 { v3.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 63f\n"
+ "ld1 { v3.b }[2], [x20]\n"
+ "b 63f\n"
+ "62:" // Oddments: Load (3, 4): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 63f\n"
+ "ld1 { v3.b }[0], [x20]\n"
+ "63:" // Oddments: Load (3, 4): Bit 2: End
+ "ldr d16, [x6, #0x70]\n"
+ "usubl v3.8h, v3.8b, v18.8b\n"
+ "ssubl v16.8h, v16.8b, v13.8b\n"
+ "ldr x20, [x5, #0xb8]\n"
+ "smlal v23.4s, v3.4h, v1.4h\n"
+ "smlal2 v19.4s, v3.8h, v1.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v8.4h, v16.4h\n"
+ "smlal2 v15.4s, v8.8h, v16.8h\n"
+ "smlal v20.4s, v27.4h, v16.4h\n"
+ "smlal2 v5.4s, v27.8h, v16.8h\n"
+ "smlal v24.4s, v3.4h, v16.4h\n"
+ "smlal2 v22.4s, v3.8h, v16.8h\n"
+ "tbz x1, #2, 65f\n"
+ "ld1 { v14.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 64f\n"
+ "ld1 { v14.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 67f\n"
+ "ld1 { v14.b }[6], [x20]\n"
+ "b 67f\n"
+ "64:" // Oddments: Load (3, 5): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 67f\n"
+ "ld1 { v14.b }[4], [x20]\n"
+ "b 67f\n"
+ "65:" // Oddments: Load (3, 5): Bit 2: Unset
+ "tbz x1, #1, 66f\n"
+ "ld1 { v14.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 67f\n"
+ "ld1 { v14.b }[2], [x20]\n"
+ "b 67f\n"
+ "66:" // Oddments: Load (3, 5): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 67f\n"
+ "ld1 { v14.b }[0], [x20]\n"
+ "67:" // Oddments: Load (3, 5): Bit 2: End
+ "ldr d17, [x6, #0x78]\n"
+ "usubl v14.8h, v14.8b, v18.8b\n"
+ "ssubl v17.8h, v17.8b, v13.8b\n"
+ "ldr x20, [x5, #0xc0]\n"
+ "smlal v23.4s, v14.4h, v16.4h\n"
+ "smlal2 v19.4s, v14.8h, v16.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v31.4h, v17.4h\n"
+ "smlal2 v15.4s, v31.8h, v17.8h\n"
+ "smlal v20.4s, v28.4h, v17.4h\n"
+ "smlal2 v5.4s, v28.8h, v17.8h\n"
+ "tbz x1, #2, 69f\n"
+ "ld1 { v1.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 68f\n"
+ "ld1 { v1.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 71f\n"
+ "ld1 { v1.b }[6], [x20]\n"
+ "b 71f\n"
+ "68:" // Oddments: Load (4, 0): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 71f\n"
+ "ld1 { v1.b }[4], [x20]\n"
+ "b 71f\n"
+ "69:" // Oddments: Load (4, 0): Bit 2: Unset
+ "tbz x1, #1, 70f\n"
+ "ld1 { v1.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 71f\n"
+ "ld1 { v1.b }[2], [x20]\n"
+ "b 71f\n"
+ "70:" // Oddments: Load (4, 0): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 71f\n"
+ "ld1 { v1.b }[0], [x20]\n"
+ "71:" // Oddments: Load (4, 0): Bit 2: End
+ "usubl v1.8h, v1.8b, v18.8b\n"
+ "ldr x20, [x5, #0xc8]\n"
+ "smlal v24.4s, v1.4h, v17.4h\n"
+ "smlal2 v22.4s, v1.8h, v17.8h\n"
+ "add x20, x20, x3\n"
+ "tbz x1, #2, 73f\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 72f\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 75f\n"
+ "ld1 { v16.b }[6], [x20]\n"
+ "b 75f\n"
+ "72:" // Oddments: Load (4, 1): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 75f\n"
+ "ld1 { v16.b }[4], [x20]\n"
+ "b 75f\n"
+ "73:" // Oddments: Load (4, 1): Bit 2: Unset
+ "tbz x1, #1, 74f\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 75f\n"
+ "ld1 { v16.b }[2], [x20]\n"
+ "b 75f\n"
+ "74:" // Oddments: Load (4, 1): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 75f\n"
+ "ld1 { v16.b }[0], [x20]\n"
+ "75:" // Oddments: Load (4, 1): Bit 2: End
+ "ldr d29, [x6, #0x80]\n"
+ "usubl v16.8h, v16.8b, v18.8b\n"
+ "ssubl v29.8h, v29.8b, v13.8b\n"
+ "ldr x20, [x5, #0xd0]\n"
+ "smlal v23.4s, v16.4h, v17.4h\n"
+ "smlal2 v19.4s, v16.8h, v17.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v28.4h, v29.4h\n"
+ "smlal2 v15.4s, v28.8h, v29.8h\n"
+ "smlal v20.4s, v21.4h, v29.4h\n"
+ "smlal2 v5.4s, v21.8h, v29.8h\n"
+ "smlal v24.4s, v16.4h, v29.4h\n"
+ "smlal2 v22.4s, v16.8h, v29.8h\n"
+ "tbz x1, #2, 77f\n"
+ "ld1 { v30.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 76f\n"
+ "ld1 { v30.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 79f\n"
+ "ld1 { v30.b }[6], [x20]\n"
+ "b 79f\n"
+ "76:" // Oddments: Load (4, 2): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 79f\n"
+ "ld1 { v30.b }[4], [x20]\n"
+ "b 79f\n"
+ "77:" // Oddments: Load (4, 2): Bit 2: Unset
+ "tbz x1, #1, 78f\n"
+ "ld1 { v30.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 79f\n"
+ "ld1 { v30.b }[2], [x20]\n"
+ "b 79f\n"
+ "78:" // Oddments: Load (4, 2): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 79f\n"
+ "ld1 { v30.b }[0], [x20]\n"
+ "79:" // Oddments: Load (4, 2): Bit 2: End
+ "ldr d12, [x6, #0x88]\n"
+ "usubl v30.8h, v30.8b, v18.8b\n"
+ "ssubl v12.8h, v12.8b, v13.8b\n"
+ "ldr x20, [x5, #0xd8]\n"
+ "smlal v23.4s, v30.4h, v29.4h\n"
+ "smlal2 v19.4s, v30.8h, v29.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v21.4h, v12.4h\n"
+ "smlal2 v15.4s, v21.8h, v12.8h\n"
+ "smlal v20.4s, v9.4h, v12.4h\n"
+ "smlal2 v5.4s, v9.8h, v12.8h\n"
+ "smlal v24.4s, v30.4h, v12.4h\n"
+ "smlal2 v22.4s, v30.8h, v12.8h\n"
+ "tbz x1, #2, 81f\n"
+ "ld1 { v29.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 80f\n"
+ "ld1 { v29.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 83f\n"
+ "ld1 { v29.b }[6], [x20]\n"
+ "b 83f\n"
+ "80:" // Oddments: Load (4, 3): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 83f\n"
+ "ld1 { v29.b }[4], [x20]\n"
+ "b 83f\n"
+ "81:" // Oddments: Load (4, 3): Bit 2: Unset
+ "tbz x1, #1, 82f\n"
+ "ld1 { v29.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 83f\n"
+ "ld1 { v29.b }[2], [x20]\n"
+ "b 83f\n"
+ "82:" // Oddments: Load (4, 3): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 83f\n"
+ "ld1 { v29.b }[0], [x20]\n"
+ "83:" // Oddments: Load (4, 3): Bit 2: End
+ "ldr d21, [x6, #0x90]\n"
+ "usubl v29.8h, v29.8b, v18.8b\n"
+ "ssubl v21.8h, v21.8b, v13.8b\n"
+ "ldr x20, [x5, #0xe0]\n"
+ "smlal v23.4s, v29.4h, v12.4h\n"
+ "smlal2 v19.4s, v29.8h, v12.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v9.4h, v21.4h\n"
+ "smlal2 v15.4s, v9.8h, v21.8h\n"
+ "smlal v20.4s, v3.4h, v21.4h\n"
+ "smlal2 v5.4s, v3.8h, v21.8h\n"
+ "smlal v24.4s, v29.4h, v21.4h\n"
+ "smlal2 v22.4s, v29.8h, v21.8h\n"
+ "tbz x1, #2, 85f\n"
+ "ld1 { v25.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 84f\n"
+ "ld1 { v25.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 87f\n"
+ "ld1 { v25.b }[6], [x20]\n"
+ "b 87f\n"
+ "84:" // Oddments: Load (4, 4): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 87f\n"
+ "ld1 { v25.b }[4], [x20]\n"
+ "b 87f\n"
+ "85:" // Oddments: Load (4, 4): Bit 2: Unset
+ "tbz x1, #1, 86f\n"
+ "ld1 { v25.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 87f\n"
+ "ld1 { v25.b }[2], [x20]\n"
+ "b 87f\n"
+ "86:" // Oddments: Load (4, 4): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 87f\n"
+ "ld1 { v25.b }[0], [x20]\n"
+ "87:" // Oddments: Load (4, 4): Bit 2: End
+ "ldr d8, [x6, #0x98]\n"
+ "usubl v25.8h, v25.8b, v18.8b\n"
+ "ssubl v8.8h, v8.8b, v13.8b\n"
+ "ldr x20, [x5, #0xe8]\n"
+ "smlal v23.4s, v25.4h, v21.4h\n"
+ "smlal2 v19.4s, v25.8h, v21.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v3.4h, v8.4h\n"
+ "smlal2 v15.4s, v3.8h, v8.8h\n"
+ "smlal v20.4s, v14.4h, v8.4h\n"
+ "smlal2 v5.4s, v14.8h, v8.8h\n"
+ "smlal v24.4s, v25.4h, v8.4h\n"
+ "smlal2 v22.4s, v25.8h, v8.8h\n"
+ "tbz x1, #2, 89f\n"
+ "ld1 { v21.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 88f\n"
+ "ld1 { v21.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 91f\n"
+ "ld1 { v21.b }[6], [x20]\n"
+ "b 91f\n"
+ "88:" // Oddments: Load (4, 5): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 91f\n"
+ "ld1 { v21.b }[4], [x20]\n"
+ "b 91f\n"
+ "89:" // Oddments: Load (4, 5): Bit 2: Unset
+ "tbz x1, #1, 90f\n"
+ "ld1 { v21.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 91f\n"
+ "ld1 { v21.b }[2], [x20]\n"
+ "b 91f\n"
+ "90:" // Oddments: Load (4, 5): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 91f\n"
+ "ld1 { v21.b }[0], [x20]\n"
+ "91:" // Oddments: Load (4, 5): Bit 2: End
+ "ldr d9, [x6, #0xa0]\n"
+ "usubl v21.8h, v21.8b, v18.8b\n"
+ "ssubl v9.8h, v9.8b, v13.8b\n"
+ "ldr x20, [x5, #0xf0]\n"
+ "smlal v23.4s, v21.4h, v8.4h\n"
+ "smlal2 v19.4s, v21.8h, v8.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v1.4h, v9.4h\n"
+ "smlal2 v15.4s, v1.8h, v9.8h\n"
+ "smlal v20.4s, v16.4h, v9.4h\n"
+ "smlal2 v5.4s, v16.8h, v9.8h\n"
+ "tbz x1, #2, 93f\n"
+ "ld1 { v12.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 92f\n"
+ "ld1 { v12.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 95f\n"
+ "ld1 { v12.b }[6], [x20]\n"
+ "b 95f\n"
+ "92:" // Oddments: Load (5, 0): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 95f\n"
+ "ld1 { v12.b }[4], [x20]\n"
+ "b 95f\n"
+ "93:" // Oddments: Load (5, 0): Bit 2: Unset
+ "tbz x1, #1, 94f\n"
+ "ld1 { v12.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 95f\n"
+ "ld1 { v12.b }[2], [x20]\n"
+ "b 95f\n"
+ "94:" // Oddments: Load (5, 0): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 95f\n"
+ "ld1 { v12.b }[0], [x20]\n"
+ "95:" // Oddments: Load (5, 0): Bit 2: End
+ "usubl v12.8h, v12.8b, v18.8b\n"
+ "ldr x20, [x5, #0xf8]\n"
+ "smlal v24.4s, v12.4h, v9.4h\n"
+ "smlal2 v22.4s, v12.8h, v9.8h\n"
+ "add x20, x20, x3\n"
+ "tbz x1, #2, 97f\n"
+ "ld1 { v10.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 96f\n"
+ "ld1 { v10.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 99f\n"
+ "ld1 { v10.b }[6], [x20]\n"
+ "b 99f\n"
+ "96:" // Oddments: Load (5, 1): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 99f\n"
+ "ld1 { v10.b }[4], [x20]\n"
+ "b 99f\n"
+ "97:" // Oddments: Load (5, 1): Bit 2: Unset
+ "tbz x1, #1, 98f\n"
+ "ld1 { v10.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 99f\n"
+ "ld1 { v10.b }[2], [x20]\n"
+ "b 99f\n"
+ "98:" // Oddments: Load (5, 1): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 99f\n"
+ "ld1 { v10.b }[0], [x20]\n"
+ "99:" // Oddments: Load (5, 1): Bit 2: End
+ "ldr d12, [x6, #0xa8]\n"
+ "usubl v10.8h, v10.8b, v18.8b\n"
+ "ssubl v12.8h, v12.8b, v13.8b\n"
+ "ldr x20, [x5, #0x100]\n"
+ "smlal v23.4s, v10.4h, v9.4h\n"
+ "smlal2 v19.4s, v10.8h, v9.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v16.4h, v12.4h\n"
+ "smlal2 v15.4s, v16.8h, v12.8h\n"
+ "smlal v20.4s, v30.4h, v12.4h\n"
+ "smlal2 v5.4s, v30.8h, v12.8h\n"
+ "smlal v24.4s, v10.4h, v12.4h\n"
+ "smlal2 v22.4s, v10.8h, v12.8h\n"
+ "tbz x1, #2, 101f\n"
+ "ld1 { v9.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 100f\n"
+ "ld1 { v9.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 103f\n"
+ "ld1 { v9.b }[6], [x20]\n"
+ "b 103f\n"
+ "100:" // Oddments: Load (5, 2): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 103f\n"
+ "ld1 { v9.b }[4], [x20]\n"
+ "b 103f\n"
+ "101:" // Oddments: Load (5, 2): Bit 2: Unset
+ "tbz x1, #1, 102f\n"
+ "ld1 { v9.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 103f\n"
+ "ld1 { v9.b }[2], [x20]\n"
+ "b 103f\n"
+ "102:" // Oddments: Load (5, 2): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 103f\n"
+ "ld1 { v9.b }[0], [x20]\n"
+ "103:" // Oddments: Load (5, 2): Bit 2: End
+ "ldr d28, [x6, #0xb0]\n"
+ "usubl v9.8h, v9.8b, v18.8b\n"
+ "ssubl v28.8h, v28.8b, v13.8b\n"
+ "ldr x20, [x5, #0x108]\n"
+ "smlal v23.4s, v9.4h, v12.4h\n"
+ "smlal2 v19.4s, v9.8h, v12.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v30.4h, v28.4h\n"
+ "smlal2 v15.4s, v30.8h, v28.8h\n"
+ "smlal v20.4s, v29.4h, v28.4h\n"
+ "smlal2 v5.4s, v29.8h, v28.8h\n"
+ "smlal v24.4s, v9.4h, v28.4h\n"
+ "smlal2 v22.4s, v9.8h, v28.8h\n"
+ "tbz x1, #2, 105f\n"
+ "ld1 { v2.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 104f\n"
+ "ld1 { v2.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 107f\n"
+ "ld1 { v2.b }[6], [x20]\n"
+ "b 107f\n"
+ "104:" // Oddments: Load (5, 3): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 107f\n"
+ "ld1 { v2.b }[4], [x20]\n"
+ "b 107f\n"
+ "105:" // Oddments: Load (5, 3): Bit 2: Unset
+ "tbz x1, #1, 106f\n"
+ "ld1 { v2.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 107f\n"
+ "ld1 { v2.b }[2], [x20]\n"
+ "b 107f\n"
+ "106:" // Oddments: Load (5, 3): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 107f\n"
+ "ld1 { v2.b }[0], [x20]\n"
+ "107:" // Oddments: Load (5, 3): Bit 2: End
+ "ldr d30, [x6, #0xb8]\n"
+ "usubl v2.8h, v2.8b, v18.8b\n"
+ "ssubl v30.8h, v30.8b, v13.8b\n"
+ "ldr x20, [x5, #0x110]\n"
+ "smlal v23.4s, v2.4h, v28.4h\n"
+ "smlal2 v19.4s, v2.8h, v28.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v29.4h, v30.4h\n"
+ "smlal2 v15.4s, v29.8h, v30.8h\n"
+ "smlal v20.4s, v25.4h, v30.4h\n"
+ "smlal2 v5.4s, v25.8h, v30.8h\n"
+ "smlal v24.4s, v2.4h, v30.4h\n"
+ "smlal2 v22.4s, v2.8h, v30.8h\n"
+ "tbz x1, #2, 109f\n"
+ "ld1 { v27.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 108f\n"
+ "ld1 { v27.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 111f\n"
+ "ld1 { v27.b }[6], [x20]\n"
+ "b 111f\n"
+ "108:" // Oddments: Load (5, 4): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 111f\n"
+ "ld1 { v27.b }[4], [x20]\n"
+ "b 111f\n"
+ "109:" // Oddments: Load (5, 4): Bit 2: Unset
+ "tbz x1, #1, 110f\n"
+ "ld1 { v27.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 111f\n"
+ "ld1 { v27.b }[2], [x20]\n"
+ "b 111f\n"
+ "110:" // Oddments: Load (5, 4): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 111f\n"
+ "ld1 { v27.b }[0], [x20]\n"
+ "111:" // Oddments: Load (5, 4): Bit 2: End
+ "ldr d8, [x6, #0xc0]\n"
+ "usubl v27.8h, v27.8b, v18.8b\n"
+ "ssubl v8.8h, v8.8b, v13.8b\n"
+ "ldr x20, [x5, #0x118]\n"
+ "smlal v23.4s, v27.4h, v30.4h\n"
+ "smlal2 v19.4s, v27.8h, v30.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v25.4h, v8.4h\n"
+ "smlal2 v15.4s, v25.8h, v8.8h\n"
+ "smlal v20.4s, v21.4h, v8.4h\n"
+ "smlal2 v5.4s, v21.8h, v8.8h\n"
+ "smlal v24.4s, v27.4h, v8.4h\n"
+ "smlal2 v22.4s, v27.8h, v8.8h\n"
+ "tbz x1, #2, 113f\n"
+ "ld1 { v9.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 112f\n"
+ "ld1 { v9.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 115f\n"
+ "ld1 { v9.b }[6], [x20]\n"
+ "b 115f\n"
+ "112:" // Oddments: Load (5, 5): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 115f\n"
+ "ld1 { v9.b }[4], [x20]\n"
+ "b 115f\n"
+ "113:" // Oddments: Load (5, 5): Bit 2: Unset
+ "tbz x1, #1, 114f\n"
+ "ld1 { v9.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 115f\n"
+ "ld1 { v9.b }[2], [x20]\n"
+ "b 115f\n"
+ "114:" // Oddments: Load (5, 5): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 115f\n"
+ "ld1 { v9.b }[0], [x20]\n"
+ "115:" // Oddments: Load (5, 5): Bit 2: End
+ "usubl v9.8h, v9.8b, v18.8b\n"
+ "smlal v23.4s, v9.4h, v8.4h\n"
+ "smlal2 v19.4s, v9.8h, v8.8h\n"
+ "tbz x1, #2, 117f\n"
+ "ld1 { v30.4s }, [x7], #0x10\n"
+ "ld1 { v12.4s }, [x8], #0x10\n"
+ "tbz x1, #1, 116f\n"
+ "ld1 { v14.d }[0], [x7], #0x8\n"
+ "ld1 { v27.d }[0], [x8], #0x8\n"
+ "tbz x1, #0, 119f\n"
+ "ld1 { v14.s }[2], [x7]\n"
+ "ld1 { v27.s }[2], [x8]\n"
+ "b 119f\n"
+ "116:" // Oddments: Load requant params: Bit 2: Bit 1: Unset
+ "tbz x1, #0, 119f\n"
+ "ld1 { v14.s }[0], [x7]\n"
+ "ld1 { v27.s }[0], [x8]\n"
+ "b 119f\n"
+ "117:" // Oddments: Load requant params: Bit 2: Unset
+ "tbz x1, #1, 118f\n"
+ "ld1 { v30.d }[0], [x7], #0x8\n"
+ "ld1 { v12.d }[0], [x8], #0x8\n"
+ "tbz x1, #0, 119f\n"
+ "ld1 { v30.s }[2], [x7]\n"
+ "ld1 { v12.s }[2], [x8]\n"
+ "b 119f\n"
+ "118:" // Oddments: Load requant params: Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 119f\n"
+ "ld1 { v30.s }[0], [x7]\n"
+ "ld1 { v12.s }[0], [x8]\n"
+ "119:" // Oddments: Load requant params: Bit 2: End
+ "sqrdmulh v7.4s, v7.4s, v30.4s\n"
+ "and v16.16b, v7.16b, v12.16b\n"
+ "add x17, x17, x4\n"
+ "add x16, x16, x4\n"
+ "sqrdmulh v15.4s, v15.4s, v14.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "add x15, x15, x4\n"
+ "add x14, x14, x4\n"
+ "and v2.16b, v15.16b, v27.16b\n"
+ "sqrdmulh v20.4s, v20.4s, v30.4s\n"
+ "sqrdmulh v24.4s, v24.4s, v30.4s\n"
+ "sqrdmulh v23.4s, v23.4s, v30.4s\n"
+ "sqadd v7.4s, v7.4s, v16.4s\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "and v21.16b, v20.16b, v12.16b\n"
+ "sqrdmulh v5.4s, v5.4s, v14.4s\n"
+ "and v18.16b, v24.16b, v12.16b\n"
+ "sqrdmulh v22.4s, v22.4s, v14.4s\n"
+ "and v31.16b, v23.16b, v12.16b\n"
+ "sqrdmulh v19.4s, v19.4s, v14.4s\n"
+ "sqadd v15.4s, v15.4s, v2.4s\n"
+ "sshr v21.4s, v21.4s, #0x1f\n"
+ "and v9.16b, v5.16b, v27.16b\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "and v4.16b, v22.16b, v27.16b\n"
+ "sshr v31.4s, v31.4s, #0x1f\n"
+ "and v28.16b, v19.16b, v27.16b\n"
+ "sqadd v20.4s, v20.4s, v21.4s\n"
+ "sshr v9.4s, v9.4s, #0x1f\n"
+ "sqadd v24.4s, v24.4s, v18.4s\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sqadd v23.4s, v23.4s, v31.4s\n"
+ "sshr v28.4s, v28.4s, #0x1f\n"
+ "srshl v7.4s, v7.4s, v12.4s\n"
+ "srshl v20.4s, v20.4s, v12.4s\n"
+ "sqadd v5.4s, v5.4s, v9.4s\n"
+ "srshl v24.4s, v24.4s, v12.4s\n"
+ "sqadd v22.4s, v22.4s, v4.4s\n"
+ "srshl v23.4s, v23.4s, v12.4s\n"
+ "sqadd v19.4s, v19.4s, v28.4s\n"
+ "srshl v15.4s, v15.4s, v27.4s\n"
+ "sqxtn v7.4h, v7.4s\n"
+ "srshl v5.4s, v5.4s, v27.4s\n"
+ "sqxtn v20.4h, v20.4s\n"
+ "srshl v22.4s, v22.4s, v27.4s\n"
+ "sqxtn v24.4h, v24.4s\n"
+ "srshl v19.4s, v19.4s, v27.4s\n"
+ "sqxtn v23.4h, v23.4s\n"
+ "sqxtn2 v7.8h, v15.4s\n"
+ "sqxtn2 v20.8h, v5.4s\n"
+ "sqxtn2 v24.8h, v22.4s\n"
+ "sqxtn2 v23.8h, v19.4s\n"
+ "sqadd v7.8h, v7.8h, v26.8h\n"
+ "sqadd v20.8h, v20.8h, v26.8h\n"
+ "sqadd v24.8h, v24.8h, v26.8h\n"
+ "sqadd v23.8h, v23.8h, v26.8h\n"
+ "smax v7.8h, v7.8h, v11.8h\n"
+ "smax v20.8h, v20.8h, v11.8h\n"
+ "smax v24.8h, v24.8h, v11.8h\n"
+ "smax v23.8h, v23.8h, v11.8h\n"
+ "smin v7.8h, v7.8h, v0.8h\n"
+ "smin v20.8h, v20.8h, v0.8h\n"
+ "smin v24.8h, v24.8h, v0.8h\n"
+ "smin v23.8h, v23.8h, v0.8h\n"
+ "uzp1 v7.16b, v7.16b, v7.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "tbz x1, #2, 121f\n"
+ "st1 { v7.s }[0], [x17], #0x4\n"
+ "st1 { v20.s }[0], [x16], #0x4\n"
+ "st1 { v24.s }[0], [x15], #0x4\n"
+ "st1 { v23.s }[0], [x14], #0x4\n"
+ "tbz x1, #1, 120f\n"
+ "st1 { v7.h }[2], [x17], #0x2\n"
+ "st1 { v20.h }[2], [x16], #0x2\n"
+ "st1 { v24.h }[2], [x15], #0x2\n"
+ "st1 { v23.h }[2], [x14], #0x2\n"
+ "tbz x1, #0, 123f\n"
+ "st1 { v7.b }[6], [x17], #0x1\n"
+ "st1 { v20.b }[6], [x16], #0x1\n"
+ "st1 { v24.b }[6], [x15], #0x1\n"
+ "st1 { v23.b }[6], [x14], #0x1\n"
+ "b 123f\n"
+ "120:" // Oddments: Bit 2: Bit 1: Unset
+ "tbz x1, #0, 123f\n"
+ "st1 { v7.b }[4], [x17], #0x1\n"
+ "st1 { v20.b }[4], [x16], #0x1\n"
+ "st1 { v24.b }[4], [x15], #0x1\n"
+ "st1 { v23.b }[4], [x14], #0x1\n"
+ "b 123f\n"
+ "121:" // Oddments: Bit 2: Unset
+ "tbz x1, #1, 122f\n"
+ "st1 { v7.h }[0], [x17], #0x2\n"
+ "st1 { v20.h }[0], [x16], #0x2\n"
+ "st1 { v24.h }[0], [x15], #0x2\n"
+ "st1 { v23.h }[0], [x14], #0x2\n"
+ "tbz x1, #0, 123f\n"
+ "st1 { v7.b }[2], [x17], #0x1\n"
+ "st1 { v20.b }[2], [x16], #0x1\n"
+ "st1 { v24.b }[2], [x15], #0x1\n"
+ "st1 { v23.b }[2], [x14], #0x1\n"
+ "b 123f\n"
+ "122:" // Oddments: Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 123f\n"
+ "st1 { v7.b }[0], [x17], #0x1\n"
+ "st1 { v20.b }[0], [x16], #0x1\n"
+ "st1 { v24.b }[0], [x15], #0x1\n"
+ "st1 { v23.b }[0], [x14], #0x1\n"
+ "123:" // Oddments: Bit 2: End
+ "124:" // End
+ :
+ : [offsetof_Params_bias] "I" (offsetof(Params, bias)), [offsetof_Params_inptrs] "I" (offsetof(Params, inptrs)), [offsetof_Params_n_channels] "I" (offsetof(Params, n_channels)), [offsetof_Params_outptrs] "I" (offsetof(Params, outptrs)), [offsetof_Params_requant] "I" (offsetof(Params, requant)), [offsetof_Params_requant_muls] "I" (offsetof(Params, requant_muls)), [offsetof_Params_requant_shifts] "I" (offsetof(Params, requant_shifts)), [offsetof_Params_weights] "I" (offsetof(Params, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [params] "r" (&params)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_generic_output9_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_generic_output9_mla_depthfirst.hpp
new file mode 100644
index 0000000000..2c677d2f62
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_generic_output9_mla_depthfirst.hpp
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_u8s8u8q_nhwc_generic_output9_mla_depthfirst_impl(const uint8_t *const *const, uint8_t *const *const, const void *, const arm_gemm::Requantize32&, const unsigned int, const unsigned int);  // (inptrs, outptrs, params, qp, n_points, n_channels) -- assembly kernel defined in generic.cpp
+
+class a64_u8s8u8q_nhwc_generic_output9_mla_depthfirst : public GenericDepthfirstKernelStrategy<uint8_t, int8_t, uint8_t, int32_t>  // u8 input, s8 weights, u8 output, s32 accumulator
+{
+  KernelType kernel = a64_u8s8u8q_nhwc_generic_output9_mla_depthfirst_impl;  // entry point handed back via get_kernel()
+
+  public:
+  a64_u8s8u8q_nhwc_generic_output9_mla_depthfirst(const CPUInfo *) : GenericDepthfirstKernelStrategy<uint8_t, int8_t, uint8_t, int32_t>(9, arm_gemm::VLType::None) {}  // 9 output points per call; no vector-length specialisation
+
+  KernelType get_kernel() const override { return kernel; }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_generic_output9_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_generic_output9_mla_depthfirst/generic.cpp
new file mode 100644
index 0000000000..c2bec4cdab
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_generic_output9_mla_depthfirst/generic.cpp
@@ -0,0 +1,618 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+
+#include "arm_gemm.hpp"
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_u8s8u8q_nhwc_generic_output9_mla_depthfirst_impl(
+  const uint8_t *const *const inptrs,  // input pointers; nine are consumed per kernel point
+  uint8_t *const *const outptrs,  // nine output pointers (one per output point)
+  const void *params,  // packed s8 weights, read 4 bytes (4 channels) per kernel point
+  const arm_gemm::Requantize32& qp,  // quantisation offsets, muls, shifts, bias and clamp bounds
+  const unsigned int n_points,  // number of kernel points to accumulate per output
+  const unsigned int n_channels  // channels; processed 4 at a time, 1-3 leftovers in "Oddments"
+)
+{  // u8(input) x s8(weight) -> u8 quantised generic depthwise kernel, 9 output points per call
+  __asm__ __volatile__(
+    "lsr x9, %x[n_channels], #0x2\n"  // x9 = number of full 4-channel iterations
+    "add x20, %x[qp], %[offsetof_Requantize32_minval]\n"
+    "ld1r { v8.4s }, [x20]\n"  // v8 = broadcast output clamp minimum
+    "add x20, %x[qp], %[offsetof_Requantize32_maxval]\n"
+    "ld1r { v7.4s }, [x20]\n"  // v7 = broadcast output clamp maximum
+    "add x20, %x[qp], %[offsetof_Requantize32_a_offset]\n"
+    "ld1r { v6.16b }, [x20]\n"  // v6 = input (a) offset, subtracted from every input byte
+    "add x20, %x[qp], %[offsetof_Requantize32_b_offset]\n"
+    "ld1r { v5.16b }, [x20]\n"  // v5 = weight (b) offset, subtracted from every weight byte
+    "add x20, %x[qp], %[offsetof_Requantize32_c_offset]\n"
+    "ld1r { v4.4s }, [x20]\n"  // v4 = output (c) offset, added after requantisation
+    "add x20, %x[qp], %[offsetof_Requantize32_per_layer_left_shift]\n"
+    "ld1r { v3.4s }, [x20]\n"  // v3 = per-layer left shift (may be overridden per-channel below)
+    "add x20, %x[qp], %[offsetof_Requantize32_per_layer_mul]\n"
+    "ld1r { v2.4s }, [x20]\n"  // v2 = per-layer fixed-point multiplier
+    "add x20, %x[qp], %[offsetof_Requantize32_per_layer_right_shift]\n"
+    "ld1r { v1.4s }, [x20]\n"  // v1 = per-layer right shift (negative shift for srshl)
+    "mov x11, #0x0\n"  // x11 = channel offset in bytes into every in/out pointer
+    "cbz x9, 6f\n"  // fewer than 4 channels -> straight to oddments
+    "1:" // Channel loop
+    "movi v23.4s, #0x0\n"  // zero accumulator (copied to v24..v31 for the other 8 outputs)
+    "cbz %x[bias], 2f\n"
+    "ldr q23, [%x[bias], x20]\n"  // seed accumulators with 4 channels of s32 bias
+    "lsl x20, x11, #0x2\n"
+    "2:" // Channel loop: Load bias: Done
+    "ldr s0, [%x[params]], #0x4\n"  // first kernel-point weights (4 x s8), params advances
+    "mov x25, %x[inptrs]\n"  // restart input-pointer list for this channel block
+    "ldp x21, x20, [x25], #0x10\n"
+    "subs x24, %x[n_points], #0x1\n"
+    "ldr s14, [x21, x11]\n"
+    "ldr s15, [x20, x11]\n"
+    "mov v24.16b, v23.16b\n"
+    "mov v25.16b, v23.16b\n"
+    "ldp x21, x20, [x25], #0x10\n"
+    "ldr s16, [x21, x11]\n"
+    "mov v26.16b, v23.16b\n"
+    "mov v27.16b, v23.16b\n"
+    "ldr s17, [x20, x11]\n"
+    "ldp x21, x20, [x25], #0x10\n"
+    "mov v28.16b, v23.16b\n"
+    "mov v29.16b, v23.16b\n"
+    "ldr s18, [x21, x11]\n"
+    "ldr s19, [x20, x11]\n"
+    "mov v30.16b, v23.16b\n"
+    "mov v31.16b, v23.16b\n"
+    "ldp x21, x20, [x25], #0x10\n"
+    "ldr s20, [x21, x11]\n"
+    "ssubl v0.8h, v0.8b, v5.8b\n"  // widen weights to s16 minus b_offset
+    "usubl v14.8h, v14.8b, v6.8b\n"  // widen inputs to s16 minus a_offset (likewise below)
+    "ldr s21, [x20, x11]\n"
+    "ldr x20, [x25], #0x8\n"
+    "usubl v15.8h, v15.8b, v6.8b\n"
+    "usubl v16.8h, v16.8b, v6.8b\n"
+    "ldr s22, [x20, x11]\n"
+    "usubl v17.8h, v17.8b, v6.8b\n"
+    "usubl v18.8h, v18.8b, v6.8b\n"
+    "usubl v19.8h, v19.8b, v6.8b\n"
+    "usubl v20.8h, v20.8b, v6.8b\n"
+    "usubl v21.8h, v21.8b, v6.8b\n"
+    "usubl v22.8h, v22.8b, v6.8b\n"
+    "ble 4f\n"  // only one kernel point -> skip loop, go to tail
+    "3:" // Channel loop: Planar loop
+    "ldp x23, x22, [x25], #0x10\n"
+    "ldp x21, x20, [x25], #0x10\n"
+    "smlal v23.4s, v14.4h, v0.4h\n"  // accumulate point k for all 9 outputs while loading point k+1
+    "smlal v24.4s, v15.4h, v0.4h\n"
+    "ldr s14, [x23, x11]\n"
+    "ldr s15, [x22, x11]\n"
+    "smlal v25.4s, v16.4h, v0.4h\n"
+    "smlal v26.4s, v17.4h, v0.4h\n"
+    "ldr s16, [x21, x11]\n"
+    "ldr s17, [x20, x11]\n"
+    "smlal v27.4s, v18.4h, v0.4h\n"
+    "smlal v28.4s, v19.4h, v0.4h\n"
+    "ldp x21, x20, [x25], #0x10\n"
+    "ldr s18, [x21, x11]\n"
+    "smlal v29.4s, v20.4h, v0.4h\n"
+    "smlal v30.4s, v21.4h, v0.4h\n"
+    "ldr s19, [x20, x11]\n"
+    "ldp x21, x20, [x25], #0x10\n"
+    "smlal v31.4s, v22.4h, v0.4h\n"
+    "subs x24, x24, #0x1\n"
+    "ldr s0, [%x[params]], #0x4\n"  // next kernel point's 4 weights
+    "ldr s20, [x21, x11]\n"
+    "ssubl v0.8h, v0.8b, v5.8b\n"
+    "usubl v14.8h, v14.8b, v6.8b\n"
+    "ldr s21, [x20, x11]\n"
+    "ldr x20, [x25], #0x8\n"
+    "usubl v15.8h, v15.8b, v6.8b\n"
+    "usubl v16.8h, v16.8b, v6.8b\n"
+    "ldr s22, [x20, x11]\n"
+    "usubl v17.8h, v17.8b, v6.8b\n"
+    "usubl v18.8h, v18.8b, v6.8b\n"
+    "usubl v19.8h, v19.8b, v6.8b\n"
+    "usubl v20.8h, v20.8b, v6.8b\n"
+    "usubl v21.8h, v21.8b, v6.8b\n"
+    "usubl v22.8h, v22.8b, v6.8b\n"
+    "bgt 3b\n"
+    "4:" // Channel loop: Planar tail
+    "smlal v23.4s, v14.4h, v0.4h\n"  // accumulate the final kernel point
+    "smlal v24.4s, v15.4h, v0.4h\n"
+    "smlal v25.4s, v16.4h, v0.4h\n"
+    "smlal v26.4s, v17.4h, v0.4h\n"
+    "smlal v27.4s, v18.4h, v0.4h\n"
+    "smlal v28.4s, v19.4h, v0.4h\n"
+    "smlal v29.4s, v20.4h, v0.4h\n"
+    "smlal v30.4s, v21.4h, v0.4h\n"
+    "smlal v31.4s, v22.4h, v0.4h\n"
+    "cbz %x[rq_mul_ptr], 5f\n"  // no per-channel params -> keep per-layer v2/v1/v3
+    "lsl x20, x11, #0x2\n"
+    "ldr q2, [%x[rq_mul_ptr], x20]\n"  // override with per-channel mul / shifts
+    "ldr q1, [%x[rq_right_shift_ptr], x20]\n"
+    "cbz %x[rq_left_shift_ptr], 5f\n"
+    "ldr q3, [%x[rq_left_shift_ptr], x20]\n"
+    "5:" // Channel loop: Load quantisation parameters: Done
+    "sshl v23.4s, v23.4s, v3.4s\n"  // apply left shift before the fixed-point multiply
+    "sshl v24.4s, v24.4s, v3.4s\n"
+    "ldp x28, x27, [%x[outptrs], #0x0]\n"
+    "ldp x26, x25, [%x[outptrs], #0x10]\n"
+    "sshl v25.4s, v25.4s, v3.4s\n"
+    "sqrdmulh v23.4s, v23.4s, v2.4s\n"  // saturating rounding doubling multiply-high
+    "ldp x24, x23, [%x[outptrs], #0x20]\n"
+    "ldp x22, x21, [%x[outptrs], #0x30]\n"
+    "sqrdmulh v24.4s, v24.4s, v2.4s\n"
+    "sqrdmulh v25.4s, v25.4s, v2.4s\n"
+    "ldr x20, [%x[outptrs], #0x40]\n"
+    "and v18.16b, v23.16b, v1.16b\n"  // and/sshr/sqadd: rounding correction for the shift
+    "and v17.16b, v24.16b, v1.16b\n"
+    "and v16.16b, v25.16b, v1.16b\n"
+    "sshl v26.4s, v26.4s, v3.4s\n"
+    "sshl v27.4s, v27.4s, v3.4s\n"
+    "sshl v28.4s, v28.4s, v3.4s\n"
+    "sshl v29.4s, v29.4s, v3.4s\n"
+    "sshl v30.4s, v30.4s, v3.4s\n"
+    "sshl v31.4s, v31.4s, v3.4s\n"
+    "sshr v18.4s, v18.4s, #0x1f\n"
+    "sshr v17.4s, v17.4s, #0x1f\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "sqrdmulh v26.4s, v26.4s, v2.4s\n"
+    "sqrdmulh v27.4s, v27.4s, v2.4s\n"
+    "sqrdmulh v28.4s, v28.4s, v2.4s\n"
+    "sqrdmulh v29.4s, v29.4s, v2.4s\n"
+    "sqrdmulh v30.4s, v30.4s, v2.4s\n"
+    "sqrdmulh v31.4s, v31.4s, v2.4s\n"
+    "sqadd v23.4s, v23.4s, v18.4s\n"
+    "sqadd v24.4s, v24.4s, v17.4s\n"
+    "sqadd v25.4s, v25.4s, v16.4s\n"
+    "and v21.16b, v26.16b, v1.16b\n"
+    "and v20.16b, v27.16b, v1.16b\n"
+    "and v19.16b, v28.16b, v1.16b\n"
+    "and v18.16b, v29.16b, v1.16b\n"
+    "and v17.16b, v30.16b, v1.16b\n"
+    "and v16.16b, v31.16b, v1.16b\n"
+    "sshr v21.4s, v21.4s, #0x1f\n"
+    "sshr v20.4s, v20.4s, #0x1f\n"
+    "sshr v19.4s, v19.4s, #0x1f\n"
+    "sshr v18.4s, v18.4s, #0x1f\n"
+    "sshr v17.4s, v17.4s, #0x1f\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "sqadd v26.4s, v26.4s, v21.4s\n"
+    "sqadd v27.4s, v27.4s, v20.4s\n"
+    "sqadd v28.4s, v28.4s, v19.4s\n"
+    "sqadd v29.4s, v29.4s, v18.4s\n"
+    "sqadd v30.4s, v30.4s, v17.4s\n"
+    "sqadd v31.4s, v31.4s, v16.4s\n"
+    "srshl v23.4s, v23.4s, v1.4s\n"  // rounding shift by the (negative) right-shift amounts
+    "srshl v24.4s, v24.4s, v1.4s\n"
+    "srshl v25.4s, v25.4s, v1.4s\n"
+    "srshl v26.4s, v26.4s, v1.4s\n"
+    "srshl v27.4s, v27.4s, v1.4s\n"
+    "srshl v28.4s, v28.4s, v1.4s\n"
+    "srshl v29.4s, v29.4s, v1.4s\n"
+    "srshl v30.4s, v30.4s, v1.4s\n"
+    "srshl v31.4s, v31.4s, v1.4s\n"
+    "add v23.4s, v23.4s, v4.4s\n"  // add output offset
+    "add v24.4s, v24.4s, v4.4s\n"
+    "add v25.4s, v25.4s, v4.4s\n"
+    "add v26.4s, v26.4s, v4.4s\n"
+    "add v27.4s, v27.4s, v4.4s\n"
+    "add v28.4s, v28.4s, v4.4s\n"
+    "add v29.4s, v29.4s, v4.4s\n"
+    "add v30.4s, v30.4s, v4.4s\n"
+    "add v31.4s, v31.4s, v4.4s\n"
+    "smax v23.4s, v23.4s, v8.4s\n"  // clamp to [minval, maxval]
+    "smax v24.4s, v24.4s, v8.4s\n"
+    "smax v25.4s, v25.4s, v8.4s\n"
+    "smax v26.4s, v26.4s, v8.4s\n"
+    "smax v27.4s, v27.4s, v8.4s\n"
+    "smax v28.4s, v28.4s, v8.4s\n"
+    "smax v29.4s, v29.4s, v8.4s\n"
+    "smax v30.4s, v30.4s, v8.4s\n"
+    "smax v31.4s, v31.4s, v8.4s\n"
+    "smin v23.4s, v23.4s, v7.4s\n"
+    "smin v24.4s, v24.4s, v7.4s\n"
+    "smin v25.4s, v25.4s, v7.4s\n"
+    "smin v26.4s, v26.4s, v7.4s\n"
+    "smin v27.4s, v27.4s, v7.4s\n"
+    "smin v28.4s, v28.4s, v7.4s\n"
+    "smin v29.4s, v29.4s, v7.4s\n"
+    "smin v30.4s, v30.4s, v7.4s\n"
+    "smin v31.4s, v31.4s, v7.4s\n"
+    "uzp1 v23.16b, v23.16b, v23.16b\n"  // two uzp1 passes narrow 32-bit lanes to bytes
+    "uzp1 v24.16b, v24.16b, v24.16b\n"
+    "uzp1 v25.16b, v25.16b, v25.16b\n"
+    "uzp1 v26.16b, v26.16b, v26.16b\n"
+    "uzp1 v27.16b, v27.16b, v27.16b\n"
+    "uzp1 v28.16b, v28.16b, v28.16b\n"
+    "uzp1 v29.16b, v29.16b, v29.16b\n"
+    "uzp1 v30.16b, v30.16b, v30.16b\n"
+    "uzp1 v31.16b, v31.16b, v31.16b\n"
+    "uzp1 v23.16b, v23.16b, v23.16b\n"
+    "uzp1 v24.16b, v24.16b, v24.16b\n"
+    "str s23, [x28, x11]\n"  // store 4 u8 results per output pointer
+    "uzp1 v25.16b, v25.16b, v25.16b\n"
+    "uzp1 v26.16b, v26.16b, v26.16b\n"
+    "str s24, [x27, x11]\n"
+    "uzp1 v27.16b, v27.16b, v27.16b\n"
+    "uzp1 v28.16b, v28.16b, v28.16b\n"
+    "str s25, [x26, x11]\n"
+    "uzp1 v29.16b, v29.16b, v29.16b\n"
+    "uzp1 v30.16b, v30.16b, v30.16b\n"
+    "str s26, [x25, x11]\n"
+    "uzp1 v31.16b, v31.16b, v31.16b\n"
+    "str s27, [x24, x11]\n"
+    "str s28, [x23, x11]\n"
+    "str s29, [x22, x11]\n"
+    "str s30, [x21, x11]\n"
+    "str s31, [x20, x11]\n"
+    "add x11, x11, #0x4\n"  // advance channel offset by the 4 channels just written
+    "cmp x11, x9, LSL #2\n"
+    "blt 1b\n"
+    "6:" // Oddments
+    "tst %x[n_channels], #0x3\n"  // 1-3 channels left over?
+    "beq 24f\n"
+    "movi v23.4s, #0x0\n"
+    "cbz %x[bias], 9f\n"
+    "add x20, %x[bias], x11, LSL #2\n"
+    "tbz %x[n_channels], #1, 7f\n"  // element-wise bias load, steered by low bits of n_channels
+    "ld1 { v23.d }[0], [x20], #0x8\n"
+    "tbz %x[n_channels], #0, 8f\n"
+    "ld1 { v23.s }[2], [x20], #0x4\n"
+    "b 8f\n"
+    "7:" // Oddments: Load bias: Bit 1: Unset
+    "ld1 { v23.s }[0], [x20], #0x4\n"
+    "8:" // Oddments: Load bias: Bit 1: End
+    "9:" // Oddments: Load bias: Done
+    "ldr s0, [%x[params]], #0x4\n"
+    "mov x10, %x[inptrs]\n"
+    "ldp x9, x28, [x10], #0x10\n"
+    "mov v24.16b, v23.16b\n"
+    "ldp x27, x26, [x10], #0x10\n"
+    "ldp x25, x24, [x10], #0x10\n"
+    "mov v25.16b, v23.16b\n"
+    "mov v26.16b, v23.16b\n"
+    "ldp x23, x22, [x10], #0x10\n"
+    "ldr x21, [x10], #0x8\n"
+    "mov v27.16b, v23.16b\n"
+    "mov v28.16b, v23.16b\n"
+    "mov v29.16b, v23.16b\n"
+    "mov v30.16b, v23.16b\n"
+    "add x9, x9, x11\n"
+    "add x28, x28, x11\n"
+    "mov v31.16b, v23.16b\n"
+    "ssubl v0.8h, v0.8b, v5.8b\n"
+    "add x27, x27, x11\n"
+    "add x26, x26, x11\n"
+    "add x25, x25, x11\n"
+    "add x24, x24, x11\n"
+    "add x23, x23, x11\n"
+    "add x22, x22, x11\n"
+    "add x21, x21, x11\n"
+    "tbz %x[n_channels], #1, 10f\n"
+    "ldr h14, [x9], #0x2\n"
+    "ldr h15, [x28], #0x2\n"
+    "ldr h16, [x27], #0x2\n"
+    "ldr h17, [x26], #0x2\n"
+    "ldr h18, [x25], #0x2\n"
+    "ldr h19, [x24], #0x2\n"
+    "ldr h20, [x23], #0x2\n"
+    "ldr h21, [x22], #0x2\n"
+    "ldr h22, [x21], #0x2\n"
+    "tbz %x[n_channels], #0, 11f\n"
+    "ld1 { v14.b }[2], [x9], #0x1\n"
+    "ld1 { v15.b }[2], [x28], #0x1\n"
+    "ld1 { v16.b }[2], [x27], #0x1\n"
+    "ld1 { v17.b }[2], [x26], #0x1\n"
+    "ld1 { v18.b }[2], [x25], #0x1\n"
+    "ld1 { v19.b }[2], [x24], #0x1\n"
+    "ld1 { v20.b }[2], [x23], #0x1\n"
+    "ld1 { v21.b }[2], [x22], #0x1\n"
+    "ld1 { v22.b }[2], [x21], #0x1\n"
+    "b 11f\n"
+    "10:" // Oddments: Load: Bit 1: Unset
+    "ldr b14, [x9], #0x1\n"
+    "ldr b15, [x28], #0x1\n"
+    "ldr b16, [x27], #0x1\n"
+    "ldr b17, [x26], #0x1\n"
+    "ldr b18, [x25], #0x1\n"
+    "ldr b19, [x24], #0x1\n"
+    "ldr b20, [x23], #0x1\n"
+    "ldr b21, [x22], #0x1\n"
+    "ldr b22, [x21], #0x1\n"
+    "11:" // Oddments: Load: Bit 1: End
+    "subs x20, %x[n_points], #0x1\n"
+    "usubl v14.8h, v14.8b, v6.8b\n"
+    "usubl v15.8h, v15.8b, v6.8b\n"
+    "usubl v16.8h, v16.8b, v6.8b\n"
+    "usubl v17.8h, v17.8b, v6.8b\n"
+    "usubl v18.8h, v18.8b, v6.8b\n"
+    "usubl v19.8h, v19.8b, v6.8b\n"
+    "usubl v20.8h, v20.8b, v6.8b\n"
+    "usubl v21.8h, v21.8b, v6.8b\n"
+    "usubl v22.8h, v22.8b, v6.8b\n"
+    "ble 15f\n"
+    "12:" // Oddments: Planar loop
+    "ldp x9, x28, [x10], #0x10\n"
+    "ldp x27, x26, [x10], #0x10\n"
+    "smlal v23.4s, v14.4h, v0.4h\n"
+    "smlal v24.4s, v15.4h, v0.4h\n"
+    "ldp x25, x24, [x10], #0x10\n"
+    "ldp x23, x22, [x10], #0x10\n"
+    "smlal v25.4s, v16.4h, v0.4h\n"
+    "smlal v26.4s, v17.4h, v0.4h\n"
+    "smlal v27.4s, v18.4h, v0.4h\n"
+    "smlal v28.4s, v19.4h, v0.4h\n"
+    "ldr x21, [x10], #0x8\n"
+    "add x9, x9, x11\n"
+    "smlal v29.4s, v20.4h, v0.4h\n"
+    "smlal v30.4s, v21.4h, v0.4h\n"
+    "add x28, x28, x11\n"
+    "add x27, x27, x11\n"
+    "smlal v31.4s, v22.4h, v0.4h\n"
+    "ldr s0, [%x[params]], #0x4\n"
+    "ssubl v0.8h, v0.8b, v5.8b\n"
+    "add x26, x26, x11\n"
+    "add x25, x25, x11\n"
+    "add x24, x24, x11\n"
+    "add x23, x23, x11\n"
+    "add x22, x22, x11\n"
+    "add x21, x21, x11\n"
+    "tbz %x[n_channels], #1, 13f\n"
+    "ldr h14, [x9], #0x2\n"
+    "ldr h15, [x28], #0x2\n"
+    "ldr h16, [x27], #0x2\n"
+    "ldr h17, [x26], #0x2\n"
+    "ldr h18, [x25], #0x2\n"
+    "ldr h19, [x24], #0x2\n"
+    "ldr h20, [x23], #0x2\n"
+    "ldr h21, [x22], #0x2\n"
+    "ldr h22, [x21], #0x2\n"
+    "tbz %x[n_channels], #0, 14f\n"
+    "ld1 { v14.b }[2], [x9], #0x1\n"
+    "ld1 { v15.b }[2], [x28], #0x1\n"
+    "ld1 { v16.b }[2], [x27], #0x1\n"
+    "ld1 { v17.b }[2], [x26], #0x1\n"
+    "ld1 { v18.b }[2], [x25], #0x1\n"
+    "ld1 { v19.b }[2], [x24], #0x1\n"
+    "ld1 { v20.b }[2], [x23], #0x1\n"
+    "ld1 { v21.b }[2], [x22], #0x1\n"
+    "ld1 { v22.b }[2], [x21], #0x1\n"
+    "b 14f\n"
+    "13:" // Oddments: Planar loop: Load: Bit 1: Unset
+    "ldr b14, [x9], #0x1\n"
+    "ldr b15, [x28], #0x1\n"
+    "ldr b16, [x27], #0x1\n"
+    "ldr b17, [x26], #0x1\n"
+    "ldr b18, [x25], #0x1\n"
+    "ldr b19, [x24], #0x1\n"
+    "ldr b20, [x23], #0x1\n"
+    "ldr b21, [x22], #0x1\n"
+    "ldr b22, [x21], #0x1\n"
+    "14:" // Oddments: Planar loop: Load: Bit 1: End
+    "subs x20, x20, #0x1\n"
+    "usubl v14.8h, v14.8b, v6.8b\n"
+    "usubl v15.8h, v15.8b, v6.8b\n"
+    "usubl v16.8h, v16.8b, v6.8b\n"
+    "usubl v17.8h, v17.8b, v6.8b\n"
+    "usubl v18.8h, v18.8b, v6.8b\n"
+    "usubl v19.8h, v19.8b, v6.8b\n"
+    "usubl v20.8h, v20.8b, v6.8b\n"
+    "usubl v21.8h, v21.8b, v6.8b\n"
+    "usubl v22.8h, v22.8b, v6.8b\n"
+    "bgt 12b\n"
+    "15:" // Oddments: Planar tail
+    "smlal v23.4s, v14.4h, v0.4h\n"
+    "smlal v24.4s, v15.4h, v0.4h\n"
+    "smlal v25.4s, v16.4h, v0.4h\n"
+    "smlal v26.4s, v17.4h, v0.4h\n"
+    "smlal v27.4s, v18.4h, v0.4h\n"
+    "smlal v28.4s, v19.4h, v0.4h\n"
+    "smlal v29.4s, v20.4h, v0.4h\n"
+    "smlal v30.4s, v21.4h, v0.4h\n"
+    "smlal v31.4s, v22.4h, v0.4h\n"
+    "cbz %x[rq_mul_ptr], 21f\n"  // element-wise per-channel requant param loads for the tail
+    "add x22, %x[rq_mul_ptr], x11, LSL #2\n"
+    "add x21, %x[rq_right_shift_ptr], x11, LSL #2\n"
+    "add x20, %x[rq_left_shift_ptr], x11, LSL #2\n"
+    "tbz %x[n_channels], #1, 18f\n"
+    "ld1 { v2.d }[0], [x22], #0x8\n"
+    "ld1 { v1.d }[0], [x21], #0x8\n"
+    "cbz %x[rq_left_shift_ptr], 16f\n"
+    "ld1 { v3.d }[0], [x20], #0x8\n"
+    "16:" // Oddments: Load quantisation parameters: Bit 1: Load left shift: Done
+    "tbz %x[n_channels], #0, 20f\n"
+    "ld1 { v2.s }[2], [x22], #0x4\n"
+    "ld1 { v1.s }[2], [x21], #0x4\n"
+    "cbz %x[rq_left_shift_ptr], 17f\n"
+    "ld1 { v3.s }[2], [x20], #0x4\n"
+    "17:" // Oddments: Load quantisation parameters: Bit 1: Bit 0: Load left shift: Done
+    "b 20f\n"
+    "18:" // Oddments: Load quantisation parameters: Bit 1: Unset
+    "ld1 { v2.s }[0], [x22], #0x4\n"
+    "ld1 { v1.s }[0], [x21], #0x4\n"
+    "cbz %x[rq_left_shift_ptr], 19f\n"
+    "ld1 { v3.s }[0], [x20], #0x4\n"
+    "19:" // Oddments: Load quantisation parameters: Bit 1: Unset: Bit 0: Load left shift: Done
+    "20:" // Oddments: Load quantisation parameters: Bit 1: End
+    "21:" // Oddments: Load quantisation parameters: Done
+    "sshl v23.4s, v23.4s, v3.4s\n"  // same requantisation sequence as the main loop
+    "sshl v24.4s, v24.4s, v3.4s\n"
+    "ldp x28, x27, [%x[outptrs], #0x0]\n"
+    "ldp x26, x25, [%x[outptrs], #0x10]\n"
+    "sshl v25.4s, v25.4s, v3.4s\n"
+    "sqrdmulh v23.4s, v23.4s, v2.4s\n"
+    "ldp x24, x23, [%x[outptrs], #0x20]\n"
+    "ldp x22, x21, [%x[outptrs], #0x30]\n"
+    "sqrdmulh v24.4s, v24.4s, v2.4s\n"
+    "sqrdmulh v25.4s, v25.4s, v2.4s\n"
+    "ldr x20, [%x[outptrs], #0x40]\n"
+    "add x28, x28, x11\n"
+    "and v18.16b, v23.16b, v1.16b\n"
+    "and v17.16b, v24.16b, v1.16b\n"
+    "add x27, x27, x11\n"
+    "add x26, x26, x11\n"
+    "and v16.16b, v25.16b, v1.16b\n"
+    "sshl v26.4s, v26.4s, v3.4s\n"
+    "add x25, x25, x11\n"
+    "add x24, x24, x11\n"
+    "sshl v27.4s, v27.4s, v3.4s\n"
+    "sshl v28.4s, v28.4s, v3.4s\n"
+    "add x23, x23, x11\n"
+    "add x22, x22, x11\n"
+    "sshl v29.4s, v29.4s, v3.4s\n"
+    "sshl v30.4s, v30.4s, v3.4s\n"
+    "add x21, x21, x11\n"
+    "add x20, x20, x11\n"
+    "sshl v31.4s, v31.4s, v3.4s\n"
+    "sshr v18.4s, v18.4s, #0x1f\n"
+    "sshr v17.4s, v17.4s, #0x1f\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "sqrdmulh v26.4s, v26.4s, v2.4s\n"
+    "sqrdmulh v27.4s, v27.4s, v2.4s\n"
+    "sqrdmulh v28.4s, v28.4s, v2.4s\n"
+    "sqrdmulh v29.4s, v29.4s, v2.4s\n"
+    "sqrdmulh v30.4s, v30.4s, v2.4s\n"
+    "sqrdmulh v31.4s, v31.4s, v2.4s\n"
+    "sqadd v23.4s, v23.4s, v18.4s\n"
+    "sqadd v24.4s, v24.4s, v17.4s\n"
+    "sqadd v25.4s, v25.4s, v16.4s\n"
+    "and v21.16b, v26.16b, v1.16b\n"
+    "and v20.16b, v27.16b, v1.16b\n"
+    "and v19.16b, v28.16b, v1.16b\n"
+    "and v18.16b, v29.16b, v1.16b\n"
+    "and v17.16b, v30.16b, v1.16b\n"
+    "and v16.16b, v31.16b, v1.16b\n"
+    "sshr v21.4s, v21.4s, #0x1f\n"
+    "sshr v20.4s, v20.4s, #0x1f\n"
+    "sshr v19.4s, v19.4s, #0x1f\n"
+    "sshr v18.4s, v18.4s, #0x1f\n"
+    "sshr v17.4s, v17.4s, #0x1f\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "sqadd v26.4s, v26.4s, v21.4s\n"
+    "sqadd v27.4s, v27.4s, v20.4s\n"
+    "sqadd v28.4s, v28.4s, v19.4s\n"
+    "sqadd v29.4s, v29.4s, v18.4s\n"
+    "sqadd v30.4s, v30.4s, v17.4s\n"
+    "sqadd v31.4s, v31.4s, v16.4s\n"
+    "srshl v23.4s, v23.4s, v1.4s\n"
+    "srshl v24.4s, v24.4s, v1.4s\n"
+    "srshl v25.4s, v25.4s, v1.4s\n"
+    "srshl v26.4s, v26.4s, v1.4s\n"
+    "srshl v27.4s, v27.4s, v1.4s\n"
+    "srshl v28.4s, v28.4s, v1.4s\n"
+    "srshl v29.4s, v29.4s, v1.4s\n"
+    "srshl v30.4s, v30.4s, v1.4s\n"
+    "srshl v31.4s, v31.4s, v1.4s\n"
+    "add v23.4s, v23.4s, v4.4s\n"
+    "add v24.4s, v24.4s, v4.4s\n"
+    "add v25.4s, v25.4s, v4.4s\n"
+    "add v26.4s, v26.4s, v4.4s\n"
+    "add v27.4s, v27.4s, v4.4s\n"
+    "add v28.4s, v28.4s, v4.4s\n"
+    "add v29.4s, v29.4s, v4.4s\n"
+    "add v30.4s, v30.4s, v4.4s\n"
+    "add v31.4s, v31.4s, v4.4s\n"
+    "smax v23.4s, v23.4s, v8.4s\n"
+    "smax v24.4s, v24.4s, v8.4s\n"
+    "smax v25.4s, v25.4s, v8.4s\n"
+    "smax v26.4s, v26.4s, v8.4s\n"
+    "smax v27.4s, v27.4s, v8.4s\n"
+    "smax v28.4s, v28.4s, v8.4s\n"
+    "smax v29.4s, v29.4s, v8.4s\n"
+    "smax v30.4s, v30.4s, v8.4s\n"
+    "smax v31.4s, v31.4s, v8.4s\n"
+    "smin v23.4s, v23.4s, v7.4s\n"
+    "smin v24.4s, v24.4s, v7.4s\n"
+    "smin v25.4s, v25.4s, v7.4s\n"
+    "smin v26.4s, v26.4s, v7.4s\n"
+    "smin v27.4s, v27.4s, v7.4s\n"
+    "smin v28.4s, v28.4s, v7.4s\n"
+    "smin v29.4s, v29.4s, v7.4s\n"
+    "smin v30.4s, v30.4s, v7.4s\n"
+    "smin v31.4s, v31.4s, v7.4s\n"
+    "uzp1 v23.16b, v23.16b, v23.16b\n"
+    "uzp1 v24.16b, v24.16b, v24.16b\n"
+    "uzp1 v25.16b, v25.16b, v25.16b\n"
+    "uzp1 v26.16b, v26.16b, v26.16b\n"
+    "uzp1 v27.16b, v27.16b, v27.16b\n"
+    "uzp1 v28.16b, v28.16b, v28.16b\n"
+    "uzp1 v29.16b, v29.16b, v29.16b\n"
+    "uzp1 v30.16b, v30.16b, v30.16b\n"
+    "uzp1 v31.16b, v31.16b, v31.16b\n"
+    "uzp1 v23.16b, v23.16b, v23.16b\n"
+    "uzp1 v24.16b, v24.16b, v24.16b\n"
+    "uzp1 v25.16b, v25.16b, v25.16b\n"
+    "uzp1 v26.16b, v26.16b, v26.16b\n"
+    "uzp1 v27.16b, v27.16b, v27.16b\n"
+    "uzp1 v28.16b, v28.16b, v28.16b\n"
+    "uzp1 v29.16b, v29.16b, v29.16b\n"
+    "uzp1 v30.16b, v30.16b, v30.16b\n"
+    "uzp1 v31.16b, v31.16b, v31.16b\n"
+    "tbz %x[n_channels], #1, 22f\n"  // store the 1-3 remaining bytes per output
+    "st1 { v23.h }[0], [x28], #0x2\n"
+    "st1 { v24.h }[0], [x27], #0x2\n"
+    "st1 { v25.h }[0], [x26], #0x2\n"
+    "st1 { v26.h }[0], [x25], #0x2\n"
+    "st1 { v27.h }[0], [x24], #0x2\n"
+    "st1 { v28.h }[0], [x23], #0x2\n"
+    "st1 { v29.h }[0], [x22], #0x2\n"
+    "st1 { v30.h }[0], [x21], #0x2\n"
+    "st1 { v31.h }[0], [x20], #0x2\n"
+    "tbz %x[n_channels], #0, 23f\n"
+    "st1 { v23.b }[2], [x28], #0x1\n"
+    "st1 { v24.b }[2], [x27], #0x1\n"
+    "st1 { v25.b }[2], [x26], #0x1\n"
+    "st1 { v26.b }[2], [x25], #0x1\n"
+    "st1 { v27.b }[2], [x24], #0x1\n"
+    "st1 { v28.b }[2], [x23], #0x1\n"
+    "st1 { v29.b }[2], [x22], #0x1\n"
+    "st1 { v30.b }[2], [x21], #0x1\n"
+    "st1 { v31.b }[2], [x20], #0x1\n"
+    "b 23f\n"
+    "22:" // Oddments: Store: Bit 1: Unset
+    "st1 { v23.b }[0], [x28], #0x1\n"
+    "st1 { v24.b }[0], [x27], #0x1\n"
+    "st1 { v25.b }[0], [x26], #0x1\n"
+    "st1 { v26.b }[0], [x25], #0x1\n"
+    "st1 { v27.b }[0], [x24], #0x1\n"
+    "st1 { v28.b }[0], [x23], #0x1\n"
+    "st1 { v29.b }[0], [x22], #0x1\n"
+    "st1 { v30.b }[0], [x21], #0x1\n"
+    "st1 { v31.b }[0], [x20], #0x1\n"
+    "23:" // Oddments: Store: Bit 1: End
+    "24:" // End
+    : [params] "+&r" (params)
+    : [bias] "r" (qp.bias), [inptrs] "r" (inptrs), [n_channels] "r" ((uint64_t) n_channels), [n_points] "r" ((uint64_t) n_points), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [offsetof_Requantize32_per_layer_left_shift] "I" (offsetof(arm_gemm::Requantize32, per_layer_left_shift)), [offsetof_Requantize32_per_layer_mul] "I" (offsetof(arm_gemm::Requantize32, per_layer_mul)), [offsetof_Requantize32_per_layer_right_shift] "I" (offsetof(arm_gemm::Requantize32, per_layer_right_shift)), [outptrs] "r" (outptrs), [qp] "r" (&qp), [rq_left_shift_ptr] "r" (qp.per_channel_left_shifts), [rq_mul_ptr] "r" (qp.per_channel_muls), [rq_right_shift_ptr] "r" (qp.per_channel_right_shifts)
+    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+    );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp
new file mode 100644
index 0000000000..b7ba363b43
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp
@@ -0,0 +1,52 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_u8s8u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_impl(const uint8_t *const *const, uint8_t *const *const, const int8_t *, const int32_t *, const unsigned int, const unsigned int, const int32_t *, const int32_t *, const int32_t *, const arm_gemm::Requantize32&);
+
+struct a64_u8s8u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst : GenericDepthfirstMultiplierKernelStrategy<uint8_t, int8_t, uint8_t, int32_t>
+{
+ using Parent = GenericDepthfirstMultiplierKernelStrategy<uint8_t, int8_t, uint8_t, int32_t>;
+ a64_u8s8u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst(const CPUInfo *)
+ : Parent(2, 8, arm_gemm::VLType::None)
+ {
+ }
+ Parent::KernelType kernel = a64_u8s8u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_impl;
+ Parent::KernelType get_kernel(void) const override { return kernel; }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp
new file mode 100644
index 0000000000..ed99f1f642
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp
@@ -0,0 +1,1480 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+
+#include "arm_gemm.hpp"
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_u8s8u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_impl(
+ const uint8_t *const *const inptrs,
+ uint8_t *const *const outptrs,
+ const int8_t *weights,
+ const int32_t *bias,
+ const unsigned int kernel_points,
+ const unsigned int n_output_channels,
+ const int32_t *per_channel_left_shifts,
+ const int32_t *per_channel_muls,
+ const int32_t *per_channel_right_shifts,
+ const arm_gemm::Requantize32& qp
+)
+{
+ __asm__ __volatile__(
+ "lsr x10, %x[n_output_channels], #0x2\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_minval]\n"
+ "ld1r { v15.4s }, [x20]\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_maxval]\n"
+ "ld1r { v14.4s }, [x20]\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_a_offset]\n"
+ "ld1r { v13.16b }, [x20]\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_b_offset]\n"
+ "ld1r { v12.16b }, [x20]\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_c_offset]\n"
+ "ld1r { v11.4s }, [x20]\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_per_layer_left_shift]\n"
+ "ld1r { v10.4s }, [x20]\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_per_layer_mul]\n"
+ "ld1r { v9.4s }, [x20]\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_per_layer_right_shift]\n"
+ "ld1r { v8.4s }, [x20]\n"
+ "mov x9, #0x0\n"
+ "cbz x10, 9f\n"
+ "1:" // Output channel loop
+ "movi v31.4s, #0x0\n"
+ "cbz %x[bias], 2f\n"
+ "lsl x20, x9, #0x2\n"
+ "ldr q31, [%x[bias], x20]\n"
+ "2:" // Output channel loop: Load bias: Done
+ "mov v16.16b, v31.16b\n"
+ "mov v17.16b, v31.16b\n"
+ "mov v18.16b, v31.16b\n"
+ "mov v19.16b, v31.16b\n"
+ "mov v20.16b, v31.16b\n"
+ "mov v21.16b, v31.16b\n"
+ "mov v22.16b, v31.16b\n"
+ "mov v23.16b, v31.16b\n"
+ "mov v24.16b, v31.16b\n"
+ "mov v25.16b, v31.16b\n"
+ "mov v26.16b, v31.16b\n"
+ "mov v27.16b, v31.16b\n"
+ "mov v28.16b, v31.16b\n"
+ "mov v29.16b, v31.16b\n"
+ "mov v30.16b, v31.16b\n"
+ "mov v31.16b, v31.16b\n"
+ "cbz %x[rq_mul_ptr], 3f\n"
+ "lsl x20, x9, #0x2\n"
+ "ldr q9, [%x[rq_mul_ptr], x20]\n"
+ "ldr q8, [%x[rq_right_shift_ptr], x20]\n"
+ "cbz %x[rq_left_shift_ptr], 3f\n"
+ "ldr q10, [%x[rq_left_shift_ptr], x20]\n"
+ "3:" // Output channel loop: Load quantization parameters: Done
+ "ldr s5, [%x[weights]], #0x4\n"
+ "mov x22, %x[inptrs]\n"
+ "ldp x21, x20, [x22], #0x10\n"
+ "lsr x23, %x[kernel_points], #0x1\n"
+ "ldr d0, [x21, #0x0]\n"
+ "ldr d4, [x20, #0x0]\n"
+ "usubl v0.8h, v0.8b, v13.8b\n"
+ "usubl v4.8h, v4.8b, v13.8b\n"
+ "ssubl v5.8h, v5.8b, v12.8b\n"
+ "cbz x23, 7f\n"
+ "ldr s7, [%x[weights]], #0x4\n"
+ "ldp x21, x20, [x22], #0x10\n"
+ "subs x23, x23, #0x1\n"
+ "ssubl v7.8h, v7.8b, v12.8b\n"
+ "ldr d3, [x21, #0x0]\n"
+ "ldr d6, [x20, #0x0]\n"
+ "usubl v3.8h, v3.8b, v13.8b\n"
+ "usubl v6.8h, v6.8b, v13.8b\n"
+ "beq 5f\n"
+ "4:" // Output channel loop: Kernel loop
+ "ldp x21, x20, [x22], #0x10\n"
+ "smlal v16.4s, v5.4h, v0.h[0]\n"
+ "smlal v17.4s, v5.4h, v0.h[1]\n"
+ "subs x23, x23, #0x1\n"
+ "smlal v18.4s, v5.4h, v0.h[2]\n"
+ "smlal v19.4s, v5.4h, v0.h[3]\n"
+ "smlal v20.4s, v5.4h, v0.h[4]\n"
+ "smlal v21.4s, v5.4h, v0.h[5]\n"
+ "smlal v22.4s, v5.4h, v0.h[6]\n"
+ "smlal v23.4s, v5.4h, v0.h[7]\n"
+ "ldr d0, [x21, #0x0]\n"
+ "usubl v0.8h, v0.8b, v13.8b\n"
+ "smlal v24.4s, v5.4h, v4.h[0]\n"
+ "smlal v25.4s, v5.4h, v4.h[1]\n"
+ "smlal v26.4s, v5.4h, v4.h[2]\n"
+ "smlal v27.4s, v5.4h, v4.h[3]\n"
+ "smlal v28.4s, v5.4h, v4.h[4]\n"
+ "smlal v29.4s, v5.4h, v4.h[5]\n"
+ "smlal v30.4s, v5.4h, v4.h[6]\n"
+ "smlal v31.4s, v5.4h, v4.h[7]\n"
+ "ldr d4, [x20, #0x0]\n"
+ "ldr s5, [%x[weights]], #0x4\n"
+ "ldp x21, x20, [x22], #0x10\n"
+ "smlal v16.4s, v7.4h, v3.h[0]\n"
+ "smlal v17.4s, v7.4h, v3.h[1]\n"
+ "usubl v4.8h, v4.8b, v13.8b\n"
+ "smlal v18.4s, v7.4h, v3.h[2]\n"
+ "smlal v19.4s, v7.4h, v3.h[3]\n"
+ "ssubl v5.8h, v5.8b, v12.8b\n"
+ "smlal v20.4s, v7.4h, v3.h[4]\n"
+ "smlal v21.4s, v7.4h, v3.h[5]\n"
+ "smlal v22.4s, v7.4h, v3.h[6]\n"
+ "smlal v23.4s, v7.4h, v3.h[7]\n"
+ "ldr d3, [x21, #0x0]\n"
+ "usubl v3.8h, v3.8b, v13.8b\n"
+ "smlal v24.4s, v7.4h, v6.h[0]\n"
+ "smlal v25.4s, v7.4h, v6.h[1]\n"
+ "smlal v26.4s, v7.4h, v6.h[2]\n"
+ "smlal v27.4s, v7.4h, v6.h[3]\n"
+ "smlal v28.4s, v7.4h, v6.h[4]\n"
+ "smlal v29.4s, v7.4h, v6.h[5]\n"
+ "smlal v30.4s, v7.4h, v6.h[6]\n"
+ "smlal v31.4s, v7.4h, v6.h[7]\n"
+ "ldr d6, [x20, #0x0]\n"
+ "ldr s7, [%x[weights]], #0x4\n"
+ "usubl v6.8h, v6.8b, v13.8b\n"
+ "ssubl v7.8h, v7.8b, v12.8b\n"
+ "bgt 4b\n"
+ "5:" // Output channel loop: Kernel loop tail
+ "tbnz %x[kernel_points], #0, 6f\n"
+ "smlal v16.4s, v5.4h, v0.h[0]\n"
+ "smlal v17.4s, v5.4h, v0.h[1]\n"
+ "ldr x27, [%x[outptrs], #0x0]\n"
+ "ldr x26, [%x[outptrs], #0x8]\n"
+ "smlal v18.4s, v5.4h, v0.h[2]\n"
+ "smlal v19.4s, v5.4h, v0.h[3]\n"
+ "ldr x25, [%x[outptrs], #0x10]\n"
+ "ldr x24, [%x[outptrs], #0x18]\n"
+ "smlal v16.4s, v7.4h, v3.h[0]\n"
+ "smlal v17.4s, v7.4h, v3.h[1]\n"
+ "sshl v16.4s, v16.4s, v10.4s\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "smlal v18.4s, v7.4h, v3.h[2]\n"
+ "smlal v19.4s, v7.4h, v3.h[3]\n"
+ "sshl v17.4s, v17.4s, v10.4s\n"
+ "ldr x22, [%x[outptrs], #0x28]\n"
+ "smlal v20.4s, v5.4h, v0.h[4]\n"
+ "smlal v21.4s, v5.4h, v0.h[5]\n"
+ "sshl v18.4s, v18.4s, v10.4s\n"
+ "ldr x21, [%x[outptrs], #0x30]\n"
+ "smlal v22.4s, v5.4h, v0.h[6]\n"
+ "smlal v23.4s, v5.4h, v0.h[7]\n"
+ "sshl v19.4s, v19.4s, v10.4s\n"
+ "ldr x20, [%x[outptrs], #0x38]\n"
+ "smlal v24.4s, v5.4h, v4.h[0]\n"
+ "smlal v25.4s, v5.4h, v4.h[1]\n"
+ "sqrdmulh v16.4s, v16.4s, v9.4s\n"
+ "smlal v20.4s, v7.4h, v3.h[4]\n"
+ "smlal v21.4s, v7.4h, v3.h[5]\n"
+ "sqrdmulh v17.4s, v17.4s, v9.4s\n"
+ "smlal v22.4s, v7.4h, v3.h[6]\n"
+ "smlal v23.4s, v7.4h, v3.h[7]\n"
+ "sqrdmulh v18.4s, v18.4s, v9.4s\n"
+ "smlal v24.4s, v7.4h, v6.h[0]\n"
+ "smlal v25.4s, v7.4h, v6.h[1]\n"
+ "sqrdmulh v19.4s, v19.4s, v9.4s\n"
+ "smlal v26.4s, v5.4h, v4.h[2]\n"
+ "smlal v27.4s, v5.4h, v4.h[3]\n"
+ "and v3.16b, v16.16b, v8.16b\n"
+ "smlal v28.4s, v5.4h, v4.h[4]\n"
+ "smlal v29.4s, v5.4h, v4.h[5]\n"
+ "and v2.16b, v17.16b, v8.16b\n"
+ "smlal v30.4s, v5.4h, v4.h[6]\n"
+ "smlal v31.4s, v5.4h, v4.h[7]\n"
+ "and v1.16b, v18.16b, v8.16b\n"
+ "and v0.16b, v19.16b, v8.16b\n"
+ "sshl v20.4s, v20.4s, v10.4s\n"
+ "smlal v26.4s, v7.4h, v6.h[2]\n"
+ "sshl v21.4s, v21.4s, v10.4s\n"
+ "sshl v22.4s, v22.4s, v10.4s\n"
+ "smlal v27.4s, v7.4h, v6.h[3]\n"
+ "sshl v23.4s, v23.4s, v10.4s\n"
+ "sshl v24.4s, v24.4s, v10.4s\n"
+ "smlal v28.4s, v7.4h, v6.h[4]\n"
+ "sshl v25.4s, v25.4s, v10.4s\n"
+ "smlal v29.4s, v7.4h, v6.h[5]\n"
+ "smlal v30.4s, v7.4h, v6.h[6]\n"
+ "smlal v31.4s, v7.4h, v6.h[7]\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "sqrdmulh v20.4s, v20.4s, v9.4s\n"
+ "sqrdmulh v21.4s, v21.4s, v9.4s\n"
+ "sqrdmulh v22.4s, v22.4s, v9.4s\n"
+ "sqrdmulh v23.4s, v23.4s, v9.4s\n"
+ "sqrdmulh v24.4s, v24.4s, v9.4s\n"
+ "sqrdmulh v25.4s, v25.4s, v9.4s\n"
+ "sqadd v16.4s, v16.4s, v3.4s\n"
+ "sqadd v17.4s, v17.4s, v2.4s\n"
+ "sqadd v18.4s, v18.4s, v1.4s\n"
+ "sqadd v19.4s, v19.4s, v0.4s\n"
+ "and v5.16b, v20.16b, v8.16b\n"
+ "and v4.16b, v21.16b, v8.16b\n"
+ "and v3.16b, v22.16b, v8.16b\n"
+ "and v2.16b, v23.16b, v8.16b\n"
+ "and v1.16b, v24.16b, v8.16b\n"
+ "and v0.16b, v25.16b, v8.16b\n"
+ "sshl v26.4s, v26.4s, v10.4s\n"
+ "sshl v27.4s, v27.4s, v10.4s\n"
+ "sshl v28.4s, v28.4s, v10.4s\n"
+ "sshl v29.4s, v29.4s, v10.4s\n"
+ "sshl v30.4s, v30.4s, v10.4s\n"
+ "sshl v31.4s, v31.4s, v10.4s\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "sqrdmulh v26.4s, v26.4s, v9.4s\n"
+ "sqrdmulh v27.4s, v27.4s, v9.4s\n"
+ "sqrdmulh v28.4s, v28.4s, v9.4s\n"
+ "sqrdmulh v29.4s, v29.4s, v9.4s\n"
+ "sqrdmulh v30.4s, v30.4s, v9.4s\n"
+ "sqrdmulh v31.4s, v31.4s, v9.4s\n"
+ "sqadd v20.4s, v20.4s, v5.4s\n"
+ "sqadd v21.4s, v21.4s, v4.4s\n"
+ "sqadd v22.4s, v22.4s, v3.4s\n"
+ "sqadd v23.4s, v23.4s, v2.4s\n"
+ "sqadd v24.4s, v24.4s, v1.4s\n"
+ "sqadd v25.4s, v25.4s, v0.4s\n"
+ "and v5.16b, v26.16b, v8.16b\n"
+ "and v4.16b, v27.16b, v8.16b\n"
+ "and v3.16b, v28.16b, v8.16b\n"
+ "and v2.16b, v29.16b, v8.16b\n"
+ "and v1.16b, v30.16b, v8.16b\n"
+ "and v0.16b, v31.16b, v8.16b\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "srshl v16.4s, v16.4s, v8.4s\n"
+ "srshl v17.4s, v17.4s, v8.4s\n"
+ "srshl v18.4s, v18.4s, v8.4s\n"
+ "srshl v19.4s, v19.4s, v8.4s\n"
+ "srshl v20.4s, v20.4s, v8.4s\n"
+ "srshl v21.4s, v21.4s, v8.4s\n"
+ "srshl v22.4s, v22.4s, v8.4s\n"
+ "srshl v23.4s, v23.4s, v8.4s\n"
+ "sqadd v26.4s, v26.4s, v5.4s\n"
+ "sqadd v27.4s, v27.4s, v4.4s\n"
+ "sqadd v28.4s, v28.4s, v3.4s\n"
+ "sqadd v29.4s, v29.4s, v2.4s\n"
+ "sqadd v30.4s, v30.4s, v1.4s\n"
+ "sqadd v31.4s, v31.4s, v0.4s\n"
+ "add v16.4s, v16.4s, v11.4s\n"
+ "add v17.4s, v17.4s, v11.4s\n"
+ "add v18.4s, v18.4s, v11.4s\n"
+ "add v19.4s, v19.4s, v11.4s\n"
+ "add v20.4s, v20.4s, v11.4s\n"
+ "add v21.4s, v21.4s, v11.4s\n"
+ "add v22.4s, v22.4s, v11.4s\n"
+ "add v23.4s, v23.4s, v11.4s\n"
+ "srshl v24.4s, v24.4s, v8.4s\n"
+ "srshl v25.4s, v25.4s, v8.4s\n"
+ "srshl v26.4s, v26.4s, v8.4s\n"
+ "srshl v27.4s, v27.4s, v8.4s\n"
+ "srshl v28.4s, v28.4s, v8.4s\n"
+ "srshl v29.4s, v29.4s, v8.4s\n"
+ "srshl v30.4s, v30.4s, v8.4s\n"
+ "srshl v31.4s, v31.4s, v8.4s\n"
+ "smin v16.4s, v16.4s, v14.4s\n"
+ "smin v17.4s, v17.4s, v14.4s\n"
+ "smin v18.4s, v18.4s, v14.4s\n"
+ "smin v19.4s, v19.4s, v14.4s\n"
+ "smin v20.4s, v20.4s, v14.4s\n"
+ "smin v21.4s, v21.4s, v14.4s\n"
+ "smin v22.4s, v22.4s, v14.4s\n"
+ "smin v23.4s, v23.4s, v14.4s\n"
+ "add v24.4s, v24.4s, v11.4s\n"
+ "add v25.4s, v25.4s, v11.4s\n"
+ "add v26.4s, v26.4s, v11.4s\n"
+ "add v27.4s, v27.4s, v11.4s\n"
+ "add v28.4s, v28.4s, v11.4s\n"
+ "add v29.4s, v29.4s, v11.4s\n"
+ "add v30.4s, v30.4s, v11.4s\n"
+ "add v31.4s, v31.4s, v11.4s\n"
+ "smax v16.4s, v16.4s, v15.4s\n"
+ "smax v17.4s, v17.4s, v15.4s\n"
+ "smax v18.4s, v18.4s, v15.4s\n"
+ "smax v19.4s, v19.4s, v15.4s\n"
+ "smax v20.4s, v20.4s, v15.4s\n"
+ "smax v21.4s, v21.4s, v15.4s\n"
+ "smax v22.4s, v22.4s, v15.4s\n"
+ "smax v23.4s, v23.4s, v15.4s\n"
+ "smin v24.4s, v24.4s, v14.4s\n"
+ "smin v25.4s, v25.4s, v14.4s\n"
+ "smin v26.4s, v26.4s, v14.4s\n"
+ "smin v27.4s, v27.4s, v14.4s\n"
+ "smin v28.4s, v28.4s, v14.4s\n"
+ "smin v29.4s, v29.4s, v14.4s\n"
+ "smin v30.4s, v30.4s, v14.4s\n"
+ "smin v31.4s, v31.4s, v14.4s\n"
+ "uzp1 v16.16b, v16.16b, v16.16b\n"
+ "uzp1 v17.16b, v17.16b, v17.16b\n"
+ "uzp1 v18.16b, v18.16b, v18.16b\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "smax v24.4s, v24.4s, v15.4s\n"
+ "smax v25.4s, v25.4s, v15.4s\n"
+ "smax v26.4s, v26.4s, v15.4s\n"
+ "smax v27.4s, v27.4s, v15.4s\n"
+ "smax v28.4s, v28.4s, v15.4s\n"
+ "smax v29.4s, v29.4s, v15.4s\n"
+ "smax v30.4s, v30.4s, v15.4s\n"
+ "smax v31.4s, v31.4s, v15.4s\n"
+ "uzp1 v16.16b, v16.16b, v16.16b\n"
+ "str s16, [x27, x9]\n"
+ "ldr x27, [%x[outptrs], #0x40]\n"
+ "uzp1 v17.16b, v17.16b, v17.16b\n"
+ "uzp1 v18.16b, v18.16b, v18.16b\n"
+ "str s17, [x26, x9]\n"
+ "ldr x26, [%x[outptrs], #0x48]\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "str s18, [x25, x9]\n"
+ "ldr x25, [%x[outptrs], #0x50]\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "str s19, [x24, x9]\n"
+ "ldr x24, [%x[outptrs], #0x58]\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "str s20, [x23, x9]\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "str s21, [x22, x9]\n"
+ "ldr x22, [%x[outptrs], #0x68]\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "str s22, [x21, x9]\n"
+ "ldr x21, [%x[outptrs], #0x70]\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "str s23, [x20, x9]\n"
+ "ldr x20, [%x[outptrs], #0x78]\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "str s24, [x27, x9]\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "str s25, [x26, x9]\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "str s26, [x25, x9]\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "str s27, [x24, x9]\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "str s28, [x23, x9]\n"
+ "str s29, [x22, x9]\n"
+ "str s30, [x21, x9]\n"
+ "str s31, [x20, x9]\n"
+ "b 8f\n"
+ "6:" // Output channel loop: Odd tail
+ "ldp x20, x28, [x22], #0x10\n"
+ "smlal v16.4s, v5.4h, v0.h[0]\n"
+ "smlal v17.4s, v5.4h, v0.h[1]\n"
+ "ldr x27, [%x[outptrs], #0x0]\n"
+ "smlal v18.4s, v5.4h, v0.h[2]\n"
+ "smlal v19.4s, v5.4h, v0.h[3]\n"
+ "ldr x26, [%x[outptrs], #0x8]\n"
+ "ldr x25, [%x[outptrs], #0x10]\n"
+ "smlal v20.4s, v5.4h, v0.h[4]\n"
+ "smlal v21.4s, v5.4h, v0.h[5]\n"
+ "ldr x24, [%x[outptrs], #0x18]\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "smlal v22.4s, v5.4h, v0.h[6]\n"
+ "smlal v23.4s, v5.4h, v0.h[7]\n"
+ "ldr d0, [x20, #0x0]\n"
+ "usubl v0.8h, v0.8b, v13.8b\n"
+ "smlal v24.4s, v5.4h, v4.h[0]\n"
+ "smlal v25.4s, v5.4h, v4.h[1]\n"
+ "ldr x22, [%x[outptrs], #0x28]\n"
+ "ldr x21, [%x[outptrs], #0x30]\n"
+ "smlal v26.4s, v5.4h, v4.h[2]\n"
+ "smlal v27.4s, v5.4h, v4.h[3]\n"
+ "ldr x20, [%x[outptrs], #0x38]\n"
+ "smlal v28.4s, v5.4h, v4.h[4]\n"
+ "smlal v29.4s, v5.4h, v4.h[5]\n"
+ "smlal v30.4s, v5.4h, v4.h[6]\n"
+ "smlal v31.4s, v5.4h, v4.h[7]\n"
+ "ldr s5, [%x[weights]], #0x4\n"
+ "ldr d4, [x28, #0x0]\n"
+ "smlal v16.4s, v7.4h, v3.h[0]\n"
+ "smlal v17.4s, v7.4h, v3.h[1]\n"
+ "ssubl v5.8h, v5.8b, v12.8b\n"
+ "smlal v18.4s, v7.4h, v3.h[2]\n"
+ "smlal v19.4s, v7.4h, v3.h[3]\n"
+ "usubl v4.8h, v4.8b, v13.8b\n"
+ "smlal v16.4s, v5.4h, v0.h[0]\n"
+ "smlal v17.4s, v5.4h, v0.h[1]\n"
+ "sshl v16.4s, v16.4s, v10.4s\n"
+ "smlal v18.4s, v5.4h, v0.h[2]\n"
+ "smlal v19.4s, v5.4h, v0.h[3]\n"
+ "sshl v17.4s, v17.4s, v10.4s\n"
+ "smlal v20.4s, v7.4h, v3.h[4]\n"
+ "smlal v21.4s, v7.4h, v3.h[5]\n"
+ "sshl v18.4s, v18.4s, v10.4s\n"
+ "smlal v22.4s, v7.4h, v3.h[6]\n"
+ "smlal v23.4s, v7.4h, v3.h[7]\n"
+ "sshl v19.4s, v19.4s, v10.4s\n"
+ "smlal v24.4s, v7.4h, v6.h[0]\n"
+ "smlal v25.4s, v7.4h, v6.h[1]\n"
+ "sqrdmulh v16.4s, v16.4s, v9.4s\n"
+ "smlal v20.4s, v5.4h, v0.h[4]\n"
+ "smlal v21.4s, v5.4h, v0.h[5]\n"
+ "sqrdmulh v17.4s, v17.4s, v9.4s\n"
+ "smlal v22.4s, v5.4h, v0.h[6]\n"
+ "smlal v23.4s, v5.4h, v0.h[7]\n"
+ "sqrdmulh v18.4s, v18.4s, v9.4s\n"
+ "smlal v24.4s, v5.4h, v4.h[0]\n"
+ "smlal v25.4s, v5.4h, v4.h[1]\n"
+ "sqrdmulh v19.4s, v19.4s, v9.4s\n"
+ "smlal v26.4s, v7.4h, v6.h[2]\n"
+ "smlal v27.4s, v7.4h, v6.h[3]\n"
+ "and v3.16b, v16.16b, v8.16b\n"
+ "smlal v28.4s, v7.4h, v6.h[4]\n"
+ "smlal v29.4s, v7.4h, v6.h[5]\n"
+ "and v2.16b, v17.16b, v8.16b\n"
+ "smlal v30.4s, v7.4h, v6.h[6]\n"
+ "smlal v31.4s, v7.4h, v6.h[7]\n"
+ "and v1.16b, v18.16b, v8.16b\n"
+ "and v0.16b, v19.16b, v8.16b\n"
+ "sshl v20.4s, v20.4s, v10.4s\n"
+ "smlal v26.4s, v5.4h, v4.h[2]\n"
+ "sshl v21.4s, v21.4s, v10.4s\n"
+ "sshl v22.4s, v22.4s, v10.4s\n"
+ "smlal v27.4s, v5.4h, v4.h[3]\n"
+ "sshl v23.4s, v23.4s, v10.4s\n"
+ "sshl v24.4s, v24.4s, v10.4s\n"
+ "smlal v28.4s, v5.4h, v4.h[4]\n"
+ "sshl v25.4s, v25.4s, v10.4s\n"
+ "smlal v29.4s, v5.4h, v4.h[5]\n"
+ "smlal v30.4s, v5.4h, v4.h[6]\n"
+ "smlal v31.4s, v5.4h, v4.h[7]\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "sqrdmulh v20.4s, v20.4s, v9.4s\n"
+ "sqrdmulh v21.4s, v21.4s, v9.4s\n"
+ "sqrdmulh v22.4s, v22.4s, v9.4s\n"
+ "sqrdmulh v23.4s, v23.4s, v9.4s\n"
+ "sqrdmulh v24.4s, v24.4s, v9.4s\n"
+ "sqrdmulh v25.4s, v25.4s, v9.4s\n"
+ "sqadd v16.4s, v16.4s, v3.4s\n"
+ "sqadd v17.4s, v17.4s, v2.4s\n"
+ "sqadd v18.4s, v18.4s, v1.4s\n"
+ "sqadd v19.4s, v19.4s, v0.4s\n"
+ "and v5.16b, v20.16b, v8.16b\n"
+ "and v4.16b, v21.16b, v8.16b\n"
+ "and v3.16b, v22.16b, v8.16b\n"
+ "and v2.16b, v23.16b, v8.16b\n"
+ "and v1.16b, v24.16b, v8.16b\n"
+ "and v0.16b, v25.16b, v8.16b\n"
+ "sshl v26.4s, v26.4s, v10.4s\n"
+ "sshl v27.4s, v27.4s, v10.4s\n"
+ "sshl v28.4s, v28.4s, v10.4s\n"
+ "sshl v29.4s, v29.4s, v10.4s\n"
+ "sshl v30.4s, v30.4s, v10.4s\n"
+ "sshl v31.4s, v31.4s, v10.4s\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "sqrdmulh v26.4s, v26.4s, v9.4s\n"
+ "sqrdmulh v27.4s, v27.4s, v9.4s\n"
+ "sqrdmulh v28.4s, v28.4s, v9.4s\n"
+ "sqrdmulh v29.4s, v29.4s, v9.4s\n"
+ "sqrdmulh v30.4s, v30.4s, v9.4s\n"
+ "sqrdmulh v31.4s, v31.4s, v9.4s\n"
+ "sqadd v20.4s, v20.4s, v5.4s\n"
+ "sqadd v21.4s, v21.4s, v4.4s\n"
+ "sqadd v22.4s, v22.4s, v3.4s\n"
+ "sqadd v23.4s, v23.4s, v2.4s\n"
+ "sqadd v24.4s, v24.4s, v1.4s\n"
+ "sqadd v25.4s, v25.4s, v0.4s\n"
+ "and v5.16b, v26.16b, v8.16b\n"
+ "and v4.16b, v27.16b, v8.16b\n"
+ "and v3.16b, v28.16b, v8.16b\n"
+ "and v2.16b, v29.16b, v8.16b\n"
+ "and v1.16b, v30.16b, v8.16b\n"
+ "and v0.16b, v31.16b, v8.16b\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "srshl v16.4s, v16.4s, v8.4s\n"
+ "srshl v17.4s, v17.4s, v8.4s\n"
+ "srshl v18.4s, v18.4s, v8.4s\n"
+ "srshl v19.4s, v19.4s, v8.4s\n"
+ "srshl v20.4s, v20.4s, v8.4s\n"
+ "srshl v21.4s, v21.4s, v8.4s\n"
+ "srshl v22.4s, v22.4s, v8.4s\n"
+ "srshl v23.4s, v23.4s, v8.4s\n"
+ "sqadd v26.4s, v26.4s, v5.4s\n"
+ "sqadd v27.4s, v27.4s, v4.4s\n"
+ "sqadd v28.4s, v28.4s, v3.4s\n"
+ "sqadd v29.4s, v29.4s, v2.4s\n"
+ "sqadd v30.4s, v30.4s, v1.4s\n"
+ "sqadd v31.4s, v31.4s, v0.4s\n"
+ "add v16.4s, v16.4s, v11.4s\n"
+ "add v17.4s, v17.4s, v11.4s\n"
+ "add v18.4s, v18.4s, v11.4s\n"
+ "add v19.4s, v19.4s, v11.4s\n"
+ "add v20.4s, v20.4s, v11.4s\n"
+ "add v21.4s, v21.4s, v11.4s\n"
+ "add v22.4s, v22.4s, v11.4s\n"
+ "add v23.4s, v23.4s, v11.4s\n"
+ "srshl v24.4s, v24.4s, v8.4s\n"
+ "srshl v25.4s, v25.4s, v8.4s\n"
+ "srshl v26.4s, v26.4s, v8.4s\n"
+ "srshl v27.4s, v27.4s, v8.4s\n"
+ "srshl v28.4s, v28.4s, v8.4s\n"
+ "srshl v29.4s, v29.4s, v8.4s\n"
+ "srshl v30.4s, v30.4s, v8.4s\n"
+ "srshl v31.4s, v31.4s, v8.4s\n"
+ "smin v16.4s, v16.4s, v14.4s\n"
+ "smin v17.4s, v17.4s, v14.4s\n"
+ "smin v18.4s, v18.4s, v14.4s\n"
+ "smin v19.4s, v19.4s, v14.4s\n"
+ "smin v20.4s, v20.4s, v14.4s\n"
+ "smin v21.4s, v21.4s, v14.4s\n"
+ "smin v22.4s, v22.4s, v14.4s\n"
+ "smin v23.4s, v23.4s, v14.4s\n"
+ "add v24.4s, v24.4s, v11.4s\n"
+ "add v25.4s, v25.4s, v11.4s\n"
+ "add v26.4s, v26.4s, v11.4s\n"
+ "add v27.4s, v27.4s, v11.4s\n"
+ "add v28.4s, v28.4s, v11.4s\n"
+ "add v29.4s, v29.4s, v11.4s\n"
+ "add v30.4s, v30.4s, v11.4s\n"
+ "add v31.4s, v31.4s, v11.4s\n"
+ "smax v16.4s, v16.4s, v15.4s\n"
+ "smax v17.4s, v17.4s, v15.4s\n"
+ "smax v18.4s, v18.4s, v15.4s\n"
+ "smax v19.4s, v19.4s, v15.4s\n"
+ "smax v20.4s, v20.4s, v15.4s\n"
+ "smax v21.4s, v21.4s, v15.4s\n"
+ "smax v22.4s, v22.4s, v15.4s\n"
+ "smax v23.4s, v23.4s, v15.4s\n"
+ "smin v24.4s, v24.4s, v14.4s\n"
+ "smin v25.4s, v25.4s, v14.4s\n"
+ "smin v26.4s, v26.4s, v14.4s\n"
+ "smin v27.4s, v27.4s, v14.4s\n"
+ "smin v28.4s, v28.4s, v14.4s\n"
+ "smin v29.4s, v29.4s, v14.4s\n"
+ "smin v30.4s, v30.4s, v14.4s\n"
+ "smin v31.4s, v31.4s, v14.4s\n"
+ "uzp1 v16.16b, v16.16b, v16.16b\n"
+ "uzp1 v17.16b, v17.16b, v17.16b\n"
+ "uzp1 v18.16b, v18.16b, v18.16b\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "smax v24.4s, v24.4s, v15.4s\n"
+ "smax v25.4s, v25.4s, v15.4s\n"
+ "smax v26.4s, v26.4s, v15.4s\n"
+ "smax v27.4s, v27.4s, v15.4s\n"
+ "smax v28.4s, v28.4s, v15.4s\n"
+ "smax v29.4s, v29.4s, v15.4s\n"
+ "smax v30.4s, v30.4s, v15.4s\n"
+ "smax v31.4s, v31.4s, v15.4s\n"
+ "uzp1 v16.16b, v16.16b, v16.16b\n"
+ "str s16, [x27, x9]\n"
+ "ldr x27, [%x[outptrs], #0x40]\n"
+ "uzp1 v17.16b, v17.16b, v17.16b\n"
+ "uzp1 v18.16b, v18.16b, v18.16b\n"
+ "str s17, [x26, x9]\n"
+ "ldr x26, [%x[outptrs], #0x48]\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "str s18, [x25, x9]\n"
+ "ldr x25, [%x[outptrs], #0x50]\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "str s19, [x24, x9]\n"
+ "ldr x24, [%x[outptrs], #0x58]\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "str s20, [x23, x9]\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "str s21, [x22, x9]\n"
+ "ldr x22, [%x[outptrs], #0x68]\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "str s22, [x21, x9]\n"
+ "ldr x21, [%x[outptrs], #0x70]\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "str s23, [x20, x9]\n"
+ "ldr x20, [%x[outptrs], #0x78]\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "str s24, [x27, x9]\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "str s25, [x26, x9]\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "str s26, [x25, x9]\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "str s27, [x24, x9]\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "str s28, [x23, x9]\n"
+ "str s29, [x22, x9]\n"
+ "str s30, [x21, x9]\n"
+ "str s31, [x20, x9]\n"
+ "b 8f\n"
+ "7:" // Output channel loop: Single kernel point
+ "smlal v16.4s, v5.4h, v0.h[0]\n"
+ "smlal v17.4s, v5.4h, v0.h[1]\n"
+ "sshl v16.4s, v16.4s, v10.4s\n"
+ "ldr x27, [%x[outptrs], #0x0]\n"
+ "smlal v18.4s, v5.4h, v0.h[2]\n"
+ "smlal v19.4s, v5.4h, v0.h[3]\n"
+ "sshl v17.4s, v17.4s, v10.4s\n"
+ "ldr x26, [%x[outptrs], #0x8]\n"
+ "sshl v18.4s, v18.4s, v10.4s\n"
+ "sshl v19.4s, v19.4s, v10.4s\n"
+ "smlal v20.4s, v5.4h, v0.h[4]\n"
+ "ldr x25, [%x[outptrs], #0x10]\n"
+ "smlal v21.4s, v5.4h, v0.h[5]\n"
+ "smlal v22.4s, v5.4h, v0.h[6]\n"
+ "sqrdmulh v16.4s, v16.4s, v9.4s\n"
+ "ldr x24, [%x[outptrs], #0x18]\n"
+ "smlal v23.4s, v5.4h, v0.h[7]\n"
+ "smlal v24.4s, v5.4h, v4.h[0]\n"
+ "sqrdmulh v17.4s, v17.4s, v9.4s\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "smlal v25.4s, v5.4h, v4.h[1]\n"
+ "sqrdmulh v18.4s, v18.4s, v9.4s\n"
+ "smlal v26.4s, v5.4h, v4.h[2]\n"
+ "ldr x22, [%x[outptrs], #0x28]\n"
+ "sqrdmulh v19.4s, v19.4s, v9.4s\n"
+ "and v3.16b, v16.16b, v8.16b\n"
+ "smlal v27.4s, v5.4h, v4.h[3]\n"
+ "ldr x21, [%x[outptrs], #0x30]\n"
+ "and v2.16b, v17.16b, v8.16b\n"
+ "and v1.16b, v18.16b, v8.16b\n"
+ "smlal v28.4s, v5.4h, v4.h[4]\n"
+ "ldr x20, [%x[outptrs], #0x38]\n"
+ "and v0.16b, v19.16b, v8.16b\n"
+ "sshl v20.4s, v20.4s, v10.4s\n"
+ "smlal v29.4s, v5.4h, v4.h[5]\n"
+ "sshl v21.4s, v21.4s, v10.4s\n"
+ "sshl v22.4s, v22.4s, v10.4s\n"
+ "smlal v30.4s, v5.4h, v4.h[6]\n"
+ "sshl v23.4s, v23.4s, v10.4s\n"
+ "sshl v24.4s, v24.4s, v10.4s\n"
+ "smlal v31.4s, v5.4h, v4.h[7]\n"
+ "sshl v25.4s, v25.4s, v10.4s\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "sqrdmulh v20.4s, v20.4s, v9.4s\n"
+ "sqrdmulh v21.4s, v21.4s, v9.4s\n"
+ "sqrdmulh v22.4s, v22.4s, v9.4s\n"
+ "sqrdmulh v23.4s, v23.4s, v9.4s\n"
+ "sqrdmulh v24.4s, v24.4s, v9.4s\n"
+ "sqrdmulh v25.4s, v25.4s, v9.4s\n"
+ "sqadd v16.4s, v16.4s, v3.4s\n"
+ "sqadd v17.4s, v17.4s, v2.4s\n"
+ "sqadd v18.4s, v18.4s, v1.4s\n"
+ "sqadd v19.4s, v19.4s, v0.4s\n"
+ "and v5.16b, v20.16b, v8.16b\n"
+ "and v4.16b, v21.16b, v8.16b\n"
+ "and v3.16b, v22.16b, v8.16b\n"
+ "and v2.16b, v23.16b, v8.16b\n"
+ "and v1.16b, v24.16b, v8.16b\n"
+ "and v0.16b, v25.16b, v8.16b\n"
+ "sshl v26.4s, v26.4s, v10.4s\n"
+ "sshl v27.4s, v27.4s, v10.4s\n"
+ "sshl v28.4s, v28.4s, v10.4s\n"
+ "sshl v29.4s, v29.4s, v10.4s\n"
+ "sshl v30.4s, v30.4s, v10.4s\n"
+ "sshl v31.4s, v31.4s, v10.4s\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "sqrdmulh v26.4s, v26.4s, v9.4s\n"
+ "sqrdmulh v27.4s, v27.4s, v9.4s\n"
+ "sqrdmulh v28.4s, v28.4s, v9.4s\n"
+ "sqrdmulh v29.4s, v29.4s, v9.4s\n"
+ "sqrdmulh v30.4s, v30.4s, v9.4s\n"
+ "sqrdmulh v31.4s, v31.4s, v9.4s\n"
+ "sqadd v20.4s, v20.4s, v5.4s\n"
+ "sqadd v21.4s, v21.4s, v4.4s\n"
+ "sqadd v22.4s, v22.4s, v3.4s\n"
+ "sqadd v23.4s, v23.4s, v2.4s\n"
+ "sqadd v24.4s, v24.4s, v1.4s\n"
+ "sqadd v25.4s, v25.4s, v0.4s\n"
+ "and v5.16b, v26.16b, v8.16b\n"
+ "and v4.16b, v27.16b, v8.16b\n"
+ "and v3.16b, v28.16b, v8.16b\n"
+ "and v2.16b, v29.16b, v8.16b\n"
+ "and v1.16b, v30.16b, v8.16b\n"
+ "and v0.16b, v31.16b, v8.16b\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "srshl v16.4s, v16.4s, v8.4s\n"
+ "srshl v17.4s, v17.4s, v8.4s\n"
+ "srshl v18.4s, v18.4s, v8.4s\n"
+ "srshl v19.4s, v19.4s, v8.4s\n"
+ "srshl v20.4s, v20.4s, v8.4s\n"
+ "srshl v21.4s, v21.4s, v8.4s\n"
+ "srshl v22.4s, v22.4s, v8.4s\n"
+ "srshl v23.4s, v23.4s, v8.4s\n"
+ "sqadd v26.4s, v26.4s, v5.4s\n"
+ "sqadd v27.4s, v27.4s, v4.4s\n"
+ "sqadd v28.4s, v28.4s, v3.4s\n"
+ "sqadd v29.4s, v29.4s, v2.4s\n"
+ "sqadd v30.4s, v30.4s, v1.4s\n"
+ "sqadd v31.4s, v31.4s, v0.4s\n"
+ "add v16.4s, v16.4s, v11.4s\n"
+ "add v17.4s, v17.4s, v11.4s\n"
+ "add v18.4s, v18.4s, v11.4s\n"
+ "add v19.4s, v19.4s, v11.4s\n"
+ "add v20.4s, v20.4s, v11.4s\n"
+ "add v21.4s, v21.4s, v11.4s\n"
+ "add v22.4s, v22.4s, v11.4s\n"
+ "add v23.4s, v23.4s, v11.4s\n"
+ "srshl v24.4s, v24.4s, v8.4s\n"
+ "srshl v25.4s, v25.4s, v8.4s\n"
+ "srshl v26.4s, v26.4s, v8.4s\n"
+ "srshl v27.4s, v27.4s, v8.4s\n"
+ "srshl v28.4s, v28.4s, v8.4s\n"
+ "srshl v29.4s, v29.4s, v8.4s\n"
+ "srshl v30.4s, v30.4s, v8.4s\n"
+ "srshl v31.4s, v31.4s, v8.4s\n"
+ "smin v16.4s, v16.4s, v14.4s\n"
+ "smin v17.4s, v17.4s, v14.4s\n"
+ "smin v18.4s, v18.4s, v14.4s\n"
+ "smin v19.4s, v19.4s, v14.4s\n"
+ "smin v20.4s, v20.4s, v14.4s\n"
+ "smin v21.4s, v21.4s, v14.4s\n"
+ "smin v22.4s, v22.4s, v14.4s\n"
+ "smin v23.4s, v23.4s, v14.4s\n"
+ "add v24.4s, v24.4s, v11.4s\n"
+ "add v25.4s, v25.4s, v11.4s\n"
+ "add v26.4s, v26.4s, v11.4s\n"
+ "add v27.4s, v27.4s, v11.4s\n"
+ "add v28.4s, v28.4s, v11.4s\n"
+ "add v29.4s, v29.4s, v11.4s\n"
+ "add v30.4s, v30.4s, v11.4s\n"
+ "add v31.4s, v31.4s, v11.4s\n"
+ "smax v16.4s, v16.4s, v15.4s\n"
+ "smax v17.4s, v17.4s, v15.4s\n"
+ "smax v18.4s, v18.4s, v15.4s\n"
+ "smax v19.4s, v19.4s, v15.4s\n"
+ "smax v20.4s, v20.4s, v15.4s\n"
+ "smax v21.4s, v21.4s, v15.4s\n"
+ "smax v22.4s, v22.4s, v15.4s\n"
+ "smax v23.4s, v23.4s, v15.4s\n"
+ "smin v24.4s, v24.4s, v14.4s\n"
+ "smin v25.4s, v25.4s, v14.4s\n"
+ "smin v26.4s, v26.4s, v14.4s\n"
+ "smin v27.4s, v27.4s, v14.4s\n"
+ "smin v28.4s, v28.4s, v14.4s\n"
+ "smin v29.4s, v29.4s, v14.4s\n"
+ "smin v30.4s, v30.4s, v14.4s\n"
+ "smin v31.4s, v31.4s, v14.4s\n"
+ "uzp1 v16.16b, v16.16b, v16.16b\n"
+ "uzp1 v17.16b, v17.16b, v17.16b\n"
+ "uzp1 v18.16b, v18.16b, v18.16b\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "smax v24.4s, v24.4s, v15.4s\n"
+ "smax v25.4s, v25.4s, v15.4s\n"
+ "smax v26.4s, v26.4s, v15.4s\n"
+ "smax v27.4s, v27.4s, v15.4s\n"
+ "smax v28.4s, v28.4s, v15.4s\n"
+ "smax v29.4s, v29.4s, v15.4s\n"
+ "smax v30.4s, v30.4s, v15.4s\n"
+ "smax v31.4s, v31.4s, v15.4s\n"
+ "uzp1 v16.16b, v16.16b, v16.16b\n"
+ "str s16, [x27, x9]\n"
+ "ldr x27, [%x[outptrs], #0x40]\n"
+ "uzp1 v17.16b, v17.16b, v17.16b\n"
+ "uzp1 v18.16b, v18.16b, v18.16b\n"
+ "str s17, [x26, x9]\n"
+ "ldr x26, [%x[outptrs], #0x48]\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "str s18, [x25, x9]\n"
+ "ldr x25, [%x[outptrs], #0x50]\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "str s19, [x24, x9]\n"
+ "ldr x24, [%x[outptrs], #0x58]\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "str s20, [x23, x9]\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "str s21, [x22, x9]\n"
+ "ldr x22, [%x[outptrs], #0x68]\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "str s22, [x21, x9]\n"
+ "ldr x21, [%x[outptrs], #0x70]\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "str s23, [x20, x9]\n"
+ "ldr x20, [%x[outptrs], #0x78]\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "str s24, [x27, x9]\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "str s25, [x26, x9]\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "str s26, [x25, x9]\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "str s27, [x24, x9]\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "str s28, [x23, x9]\n"
+ "str s29, [x22, x9]\n"
+ "str s30, [x21, x9]\n"
+ "str s31, [x20, x9]\n"
+ "8:" // Output channel loop: Done
+ "add x9, x9, #0x4\n"
+ "cmp x9, x10, LSL #2\n"
+ "blt 1b\n"
+ "tst %x[n_output_channels], #0x3\n"
+ "beq 26f\n"
+ "9:" // Output channel oddments
+ "movi v31.4s, #0x0\n"
+ "cbz %x[bias], 12f\n"
+ "add x20, %x[bias], x9, LSL #2\n"
+ "tbz %x[n_output_channels], #1, 10f\n"
+ "ld1 { v31.d }[0], [x20], #0x8\n"
+ "tbz %x[n_output_channels], #0, 11f\n"
+ "ld1 { v31.s }[2], [x20]\n"
+ "b 11f\n"
+ "10:" // Output channel oddments: Load bias: Bit 1: Unset
+ "ld1 { v31.s }[0], [x20]\n"
+ "11:" // Output channel oddments: Load bias: Bit 1: End
+ "12:" // Output channel oddments: Load bias: Done
+ "mov v16.16b, v31.16b\n"
+ "mov v17.16b, v31.16b\n"
+ "mov v18.16b, v31.16b\n"
+ "mov v19.16b, v31.16b\n"
+ "mov v20.16b, v31.16b\n"
+ "mov v21.16b, v31.16b\n"
+ "mov v22.16b, v31.16b\n"
+ "mov v23.16b, v31.16b\n"
+ "mov v24.16b, v31.16b\n"
+ "mov v25.16b, v31.16b\n"
+ "mov v26.16b, v31.16b\n"
+ "mov v27.16b, v31.16b\n"
+ "mov v28.16b, v31.16b\n"
+ "mov v29.16b, v31.16b\n"
+ "mov v30.16b, v31.16b\n"
+ "mov v31.16b, v31.16b\n"
+ "cbz %x[rq_mul_ptr], 18f\n"
+ "add x22, %x[rq_mul_ptr], x9, LSL #2\n"
+ "add x21, %x[rq_right_shift_ptr], x9, LSL #2\n"
+ "add x20, %x[rq_left_shift_ptr], x9, LSL #2\n"
+ "cbz %x[rq_left_shift_ptr], 15f\n"
+ "tbz %x[n_output_channels], #1, 13f\n"
+ "ld1 { v9.d }[0], [x22], #0x8\n"
+ "ld1 { v8.d }[0], [x21], #0x8\n"
+ "ld1 { v10.d }[0], [x20], #0x8\n"
+ "tbz %x[n_output_channels], #0, 14f\n"
+ "ld1 { v9.s }[2], [x22], #0x4\n"
+ "ld1 { v8.s }[2], [x21], #0x4\n"
+ "ld1 { v10.s }[2], [x20], #0x4\n"
+ "b 14f\n"
+ "13:" // Output channel oddments: Load quantization parameters: With left shift: Bit 1: Unset
+ "ld1 { v9.s }[0], [x22], #0x4\n"
+ "ld1 { v8.s }[0], [x21], #0x4\n"
+ "ld1 { v10.s }[0], [x20], #0x4\n"
+ "14:" // Output channel oddments: Load quantization parameters: With left shift: Bit 1: End
+ "b 18f\n"
+ "15:" // Output channel oddments: Load quantization parameters: No left shift
+ "tbz %x[n_output_channels], #1, 16f\n"
+ "ld1 { v9.d }[0], [x22], #0x8\n"
+ "ld1 { v8.d }[0], [x21], #0x8\n"
+ "tbz %x[n_output_channels], #0, 17f\n"
+ "ld1 { v9.s }[2], [x22], #0x4\n"
+ "ld1 { v8.s }[2], [x21], #0x4\n"
+ "b 17f\n"
+ "16:" // Output channel oddments: Load quantization parameters: No left shift: Bit 1: Unset
+ "ld1 { v9.s }[0], [x22], #0x4\n"
+ "ld1 { v8.s }[0], [x21], #0x4\n"
+ "17:" // Output channel oddments: Load quantization parameters: No left shift: Bit 1: End
+ "18:" // Output channel oddments: Load quantization parameters: Done
+ "ldr s5, [%x[weights]], #0x4\n"
+ "mov x22, %x[inptrs]\n"
+ "ldp x21, x20, [x22], #0x10\n"
+ "lsr x23, %x[kernel_points], #0x1\n"
+ "ldr d0, [x21, #0x0]\n"
+ "ldr d4, [x20, #0x0]\n"
+ "usubl v0.8h, v0.8b, v13.8b\n"
+ "usubl v4.8h, v4.8b, v13.8b\n"
+ "ssubl v5.8h, v5.8b, v12.8b\n"
+ "cbz x23, 22f\n"
+ "ldr s7, [%x[weights]], #0x4\n"
+ "ldp x21, x20, [x22], #0x10\n"
+ "subs x23, x23, #0x1\n"
+ "ssubl v7.8h, v7.8b, v12.8b\n"
+ "ldr d3, [x21, #0x0]\n"
+ "ldr d6, [x20, #0x0]\n"
+ "usubl v3.8h, v3.8b, v13.8b\n"
+ "usubl v6.8h, v6.8b, v13.8b\n"
+ "beq 20f\n"
+ "19:" // Output channel oddments: Kernel loop
+ "ldp x21, x20, [x22], #0x10\n"
+ "smlal v16.4s, v5.4h, v0.h[0]\n"
+ "smlal v17.4s, v5.4h, v0.h[1]\n"
+ "subs x23, x23, #0x1\n"
+ "smlal v18.4s, v5.4h, v0.h[2]\n"
+ "smlal v19.4s, v5.4h, v0.h[3]\n"
+ "smlal v20.4s, v5.4h, v0.h[4]\n"
+ "smlal v21.4s, v5.4h, v0.h[5]\n"
+ "smlal v22.4s, v5.4h, v0.h[6]\n"
+ "smlal v23.4s, v5.4h, v0.h[7]\n"
+ "ldr d0, [x21, #0x0]\n"
+ "usubl v0.8h, v0.8b, v13.8b\n"
+ "smlal v24.4s, v5.4h, v4.h[0]\n"
+ "smlal v25.4s, v5.4h, v4.h[1]\n"
+ "smlal v26.4s, v5.4h, v4.h[2]\n"
+ "smlal v27.4s, v5.4h, v4.h[3]\n"
+ "smlal v28.4s, v5.4h, v4.h[4]\n"
+ "smlal v29.4s, v5.4h, v4.h[5]\n"
+ "smlal v30.4s, v5.4h, v4.h[6]\n"
+ "smlal v31.4s, v5.4h, v4.h[7]\n"
+ "ldr d4, [x20, #0x0]\n"
+ "ldr s5, [%x[weights]], #0x4\n"
+ "ldp x21, x20, [x22], #0x10\n"
+ "smlal v16.4s, v7.4h, v3.h[0]\n"
+ "smlal v17.4s, v7.4h, v3.h[1]\n"
+ "usubl v4.8h, v4.8b, v13.8b\n"
+ "smlal v18.4s, v7.4h, v3.h[2]\n"
+ "smlal v19.4s, v7.4h, v3.h[3]\n"
+ "ssubl v5.8h, v5.8b, v12.8b\n"
+ "smlal v20.4s, v7.4h, v3.h[4]\n"
+ "smlal v21.4s, v7.4h, v3.h[5]\n"
+ "smlal v22.4s, v7.4h, v3.h[6]\n"
+ "smlal v23.4s, v7.4h, v3.h[7]\n"
+ "ldr d3, [x21, #0x0]\n"
+ "usubl v3.8h, v3.8b, v13.8b\n"
+ "smlal v24.4s, v7.4h, v6.h[0]\n"
+ "smlal v25.4s, v7.4h, v6.h[1]\n"
+ "smlal v26.4s, v7.4h, v6.h[2]\n"
+ "smlal v27.4s, v7.4h, v6.h[3]\n"
+ "smlal v28.4s, v7.4h, v6.h[4]\n"
+ "smlal v29.4s, v7.4h, v6.h[5]\n"
+ "smlal v30.4s, v7.4h, v6.h[6]\n"
+ "smlal v31.4s, v7.4h, v6.h[7]\n"
+ "ldr d6, [x20, #0x0]\n"
+ "ldr s7, [%x[weights]], #0x4\n"
+ "usubl v6.8h, v6.8b, v13.8b\n"
+ "ssubl v7.8h, v7.8b, v12.8b\n"
+ "bgt 19b\n"
+ "20:" // Output channel oddments: Kernel loop tail
+ "tbnz %x[kernel_points], #0, 21f\n"
+ "smlal v16.4s, v5.4h, v0.h[0]\n"
+ "smlal v17.4s, v5.4h, v0.h[1]\n"
+ "smlal v18.4s, v5.4h, v0.h[2]\n"
+ "smlal v19.4s, v5.4h, v0.h[3]\n"
+ "smlal v20.4s, v5.4h, v0.h[4]\n"
+ "smlal v21.4s, v5.4h, v0.h[5]\n"
+ "smlal v22.4s, v5.4h, v0.h[6]\n"
+ "smlal v23.4s, v5.4h, v0.h[7]\n"
+ "smlal v24.4s, v5.4h, v4.h[0]\n"
+ "smlal v25.4s, v5.4h, v4.h[1]\n"
+ "smlal v26.4s, v5.4h, v4.h[2]\n"
+ "smlal v27.4s, v5.4h, v4.h[3]\n"
+ "smlal v28.4s, v5.4h, v4.h[4]\n"
+ "smlal v29.4s, v5.4h, v4.h[5]\n"
+ "smlal v30.4s, v5.4h, v4.h[6]\n"
+ "smlal v31.4s, v5.4h, v4.h[7]\n"
+ "smlal v16.4s, v7.4h, v3.h[0]\n"
+ "smlal v17.4s, v7.4h, v3.h[1]\n"
+ "smlal v18.4s, v7.4h, v3.h[2]\n"
+ "smlal v19.4s, v7.4h, v3.h[3]\n"
+ "smlal v20.4s, v7.4h, v3.h[4]\n"
+ "smlal v21.4s, v7.4h, v3.h[5]\n"
+ "smlal v22.4s, v7.4h, v3.h[6]\n"
+ "smlal v23.4s, v7.4h, v3.h[7]\n"
+ "smlal v24.4s, v7.4h, v6.h[0]\n"
+ "smlal v25.4s, v7.4h, v6.h[1]\n"
+ "smlal v26.4s, v7.4h, v6.h[2]\n"
+ "smlal v27.4s, v7.4h, v6.h[3]\n"
+ "smlal v28.4s, v7.4h, v6.h[4]\n"
+ "smlal v29.4s, v7.4h, v6.h[5]\n"
+ "smlal v30.4s, v7.4h, v6.h[6]\n"
+ "smlal v31.4s, v7.4h, v6.h[7]\n"
+ "b 23f\n"
+ "21:" // Output channel oddments: Odd tail
+ "ldp x21, x20, [x22], #0x10\n"
+ "smlal v16.4s, v5.4h, v0.h[0]\n"
+ "smlal v17.4s, v5.4h, v0.h[1]\n"
+ "smlal v18.4s, v5.4h, v0.h[2]\n"
+ "smlal v19.4s, v5.4h, v0.h[3]\n"
+ "smlal v20.4s, v5.4h, v0.h[4]\n"
+ "smlal v21.4s, v5.4h, v0.h[5]\n"
+ "smlal v22.4s, v5.4h, v0.h[6]\n"
+ "smlal v23.4s, v5.4h, v0.h[7]\n"
+ "ldr d2, [x21, #0x0]\n"
+ "usubl v2.8h, v2.8b, v13.8b\n"
+ "smlal v24.4s, v5.4h, v4.h[0]\n"
+ "smlal v25.4s, v5.4h, v4.h[1]\n"
+ "smlal v26.4s, v5.4h, v4.h[2]\n"
+ "smlal v27.4s, v5.4h, v4.h[3]\n"
+ "smlal v28.4s, v5.4h, v4.h[4]\n"
+ "smlal v29.4s, v5.4h, v4.h[5]\n"
+ "smlal v30.4s, v5.4h, v4.h[6]\n"
+ "smlal v31.4s, v5.4h, v4.h[7]\n"
+ "ldr d1, [x20, #0x0]\n"
+ "ldr s0, [%x[weights]], #0x4\n"
+ "smlal v16.4s, v7.4h, v3.h[0]\n"
+ "smlal v17.4s, v7.4h, v3.h[1]\n"
+ "usubl v1.8h, v1.8b, v13.8b\n"
+ "smlal v18.4s, v7.4h, v3.h[2]\n"
+ "smlal v19.4s, v7.4h, v3.h[3]\n"
+ "ssubl v0.8h, v0.8b, v12.8b\n"
+ "smlal v20.4s, v7.4h, v3.h[4]\n"
+ "smlal v21.4s, v7.4h, v3.h[5]\n"
+ "smlal v22.4s, v7.4h, v3.h[6]\n"
+ "smlal v23.4s, v7.4h, v3.h[7]\n"
+ "smlal v24.4s, v7.4h, v6.h[0]\n"
+ "smlal v25.4s, v7.4h, v6.h[1]\n"
+ "smlal v26.4s, v7.4h, v6.h[2]\n"
+ "smlal v27.4s, v7.4h, v6.h[3]\n"
+ "smlal v28.4s, v7.4h, v6.h[4]\n"
+ "smlal v29.4s, v7.4h, v6.h[5]\n"
+ "smlal v30.4s, v7.4h, v6.h[6]\n"
+ "smlal v31.4s, v7.4h, v6.h[7]\n"
+ "smlal v16.4s, v0.4h, v2.h[0]\n"
+ "smlal v17.4s, v0.4h, v2.h[1]\n"
+ "smlal v18.4s, v0.4h, v2.h[2]\n"
+ "smlal v19.4s, v0.4h, v2.h[3]\n"
+ "smlal v20.4s, v0.4h, v2.h[4]\n"
+ "smlal v21.4s, v0.4h, v2.h[5]\n"
+ "smlal v22.4s, v0.4h, v2.h[6]\n"
+ "smlal v23.4s, v0.4h, v2.h[7]\n"
+ "smlal v24.4s, v0.4h, v1.h[0]\n"
+ "smlal v25.4s, v0.4h, v1.h[1]\n"
+ "smlal v26.4s, v0.4h, v1.h[2]\n"
+ "smlal v27.4s, v0.4h, v1.h[3]\n"
+ "smlal v28.4s, v0.4h, v1.h[4]\n"
+ "smlal v29.4s, v0.4h, v1.h[5]\n"
+ "smlal v30.4s, v0.4h, v1.h[6]\n"
+ "smlal v31.4s, v0.4h, v1.h[7]\n"
+ "b 23f\n"
+ "22:" // Output channel oddments: Single kernel point
+ "smlal v16.4s, v5.4h, v0.h[0]\n"
+ "smlal v17.4s, v5.4h, v0.h[1]\n"
+ "smlal v18.4s, v5.4h, v0.h[2]\n"
+ "smlal v19.4s, v5.4h, v0.h[3]\n"
+ "smlal v20.4s, v5.4h, v0.h[4]\n"
+ "smlal v21.4s, v5.4h, v0.h[5]\n"
+ "smlal v22.4s, v5.4h, v0.h[6]\n"
+ "smlal v23.4s, v5.4h, v0.h[7]\n"
+ "smlal v24.4s, v5.4h, v4.h[0]\n"
+ "smlal v25.4s, v5.4h, v4.h[1]\n"
+ "smlal v26.4s, v5.4h, v4.h[2]\n"
+ "smlal v27.4s, v5.4h, v4.h[3]\n"
+ "smlal v28.4s, v5.4h, v4.h[4]\n"
+ "smlal v29.4s, v5.4h, v4.h[5]\n"
+ "smlal v30.4s, v5.4h, v4.h[6]\n"
+ "smlal v31.4s, v5.4h, v4.h[7]\n"
+ "23:" // Output channel oddments: Done
+ "sshl v16.4s, v16.4s, v10.4s\n"
+ "sshl v17.4s, v17.4s, v10.4s\n"
+ "sshl v18.4s, v18.4s, v10.4s\n"
+ "sshl v19.4s, v19.4s, v10.4s\n"
+ "sqrdmulh v16.4s, v16.4s, v9.4s\n"
+ "sqrdmulh v17.4s, v17.4s, v9.4s\n"
+ "sqrdmulh v18.4s, v18.4s, v9.4s\n"
+ "sqrdmulh v19.4s, v19.4s, v9.4s\n"
+ "and v3.16b, v16.16b, v8.16b\n"
+ "and v2.16b, v17.16b, v8.16b\n"
+ "and v1.16b, v18.16b, v8.16b\n"
+ "and v0.16b, v19.16b, v8.16b\n"
+ "sshl v20.4s, v20.4s, v10.4s\n"
+ "sshl v21.4s, v21.4s, v10.4s\n"
+ "sshl v22.4s, v22.4s, v10.4s\n"
+ "sshl v23.4s, v23.4s, v10.4s\n"
+ "sshl v24.4s, v24.4s, v10.4s\n"
+ "sshl v25.4s, v25.4s, v10.4s\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "sqrdmulh v20.4s, v20.4s, v9.4s\n"
+ "sqrdmulh v21.4s, v21.4s, v9.4s\n"
+ "sqrdmulh v22.4s, v22.4s, v9.4s\n"
+ "sqrdmulh v23.4s, v23.4s, v9.4s\n"
+ "sqrdmulh v24.4s, v24.4s, v9.4s\n"
+ "sqrdmulh v25.4s, v25.4s, v9.4s\n"
+ "sqadd v16.4s, v16.4s, v3.4s\n"
+ "sqadd v17.4s, v17.4s, v2.4s\n"
+ "sqadd v18.4s, v18.4s, v1.4s\n"
+ "sqadd v19.4s, v19.4s, v0.4s\n"
+ "and v5.16b, v20.16b, v8.16b\n"
+ "and v4.16b, v21.16b, v8.16b\n"
+ "and v3.16b, v22.16b, v8.16b\n"
+ "and v2.16b, v23.16b, v8.16b\n"
+ "and v1.16b, v24.16b, v8.16b\n"
+ "and v0.16b, v25.16b, v8.16b\n"
+ "sshl v26.4s, v26.4s, v10.4s\n"
+ "sshl v27.4s, v27.4s, v10.4s\n"
+ "sshl v28.4s, v28.4s, v10.4s\n"
+ "sshl v29.4s, v29.4s, v10.4s\n"
+ "sshl v30.4s, v30.4s, v10.4s\n"
+ "sshl v31.4s, v31.4s, v10.4s\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "sqrdmulh v26.4s, v26.4s, v9.4s\n"
+ "sqrdmulh v27.4s, v27.4s, v9.4s\n"
+ "sqrdmulh v28.4s, v28.4s, v9.4s\n"
+ "sqrdmulh v29.4s, v29.4s, v9.4s\n"
+ "sqrdmulh v30.4s, v30.4s, v9.4s\n"
+ "sqrdmulh v31.4s, v31.4s, v9.4s\n"
+ "sqadd v20.4s, v20.4s, v5.4s\n"
+ "sqadd v21.4s, v21.4s, v4.4s\n"
+ "sqadd v22.4s, v22.4s, v3.4s\n"
+ "sqadd v23.4s, v23.4s, v2.4s\n"
+ "sqadd v24.4s, v24.4s, v1.4s\n"
+ "sqadd v25.4s, v25.4s, v0.4s\n"
+ "and v5.16b, v26.16b, v8.16b\n"
+ "and v4.16b, v27.16b, v8.16b\n"
+ "and v3.16b, v28.16b, v8.16b\n"
+ "and v2.16b, v29.16b, v8.16b\n"
+ "and v1.16b, v30.16b, v8.16b\n"
+ "and v0.16b, v31.16b, v8.16b\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "sqadd v26.4s, v26.4s, v5.4s\n"
+ "sqadd v27.4s, v27.4s, v4.4s\n"
+ "sqadd v28.4s, v28.4s, v3.4s\n"
+ "sqadd v29.4s, v29.4s, v2.4s\n"
+ "sqadd v30.4s, v30.4s, v1.4s\n"
+ "sqadd v31.4s, v31.4s, v0.4s\n"
+ "srshl v16.4s, v16.4s, v8.4s\n"
+ "srshl v17.4s, v17.4s, v8.4s\n"
+ "srshl v18.4s, v18.4s, v8.4s\n"
+ "srshl v19.4s, v19.4s, v8.4s\n"
+ "srshl v20.4s, v20.4s, v8.4s\n"
+ "srshl v21.4s, v21.4s, v8.4s\n"
+ "srshl v22.4s, v22.4s, v8.4s\n"
+ "srshl v23.4s, v23.4s, v8.4s\n"
+ "srshl v24.4s, v24.4s, v8.4s\n"
+ "srshl v25.4s, v25.4s, v8.4s\n"
+ "srshl v26.4s, v26.4s, v8.4s\n"
+ "srshl v27.4s, v27.4s, v8.4s\n"
+ "srshl v28.4s, v28.4s, v8.4s\n"
+ "srshl v29.4s, v29.4s, v8.4s\n"
+ "srshl v30.4s, v30.4s, v8.4s\n"
+ "srshl v31.4s, v31.4s, v8.4s\n"
+ "add v16.4s, v16.4s, v11.4s\n"
+ "add v17.4s, v17.4s, v11.4s\n"
+ "add v18.4s, v18.4s, v11.4s\n"
+ "add v19.4s, v19.4s, v11.4s\n"
+ "add v20.4s, v20.4s, v11.4s\n"
+ "add v21.4s, v21.4s, v11.4s\n"
+ "add v22.4s, v22.4s, v11.4s\n"
+ "add v23.4s, v23.4s, v11.4s\n"
+ "add v24.4s, v24.4s, v11.4s\n"
+ "add v25.4s, v25.4s, v11.4s\n"
+ "add v26.4s, v26.4s, v11.4s\n"
+ "add v27.4s, v27.4s, v11.4s\n"
+ "add v28.4s, v28.4s, v11.4s\n"
+ "add v29.4s, v29.4s, v11.4s\n"
+ "add v30.4s, v30.4s, v11.4s\n"
+ "add v31.4s, v31.4s, v11.4s\n"
+ "smin v16.4s, v16.4s, v14.4s\n"
+ "smin v17.4s, v17.4s, v14.4s\n"
+ "smin v18.4s, v18.4s, v14.4s\n"
+ "smin v19.4s, v19.4s, v14.4s\n"
+ "smin v20.4s, v20.4s, v14.4s\n"
+ "smin v21.4s, v21.4s, v14.4s\n"
+ "smin v22.4s, v22.4s, v14.4s\n"
+ "smin v23.4s, v23.4s, v14.4s\n"
+ "smin v24.4s, v24.4s, v14.4s\n"
+ "smin v25.4s, v25.4s, v14.4s\n"
+ "smin v26.4s, v26.4s, v14.4s\n"
+ "smin v27.4s, v27.4s, v14.4s\n"
+ "smin v28.4s, v28.4s, v14.4s\n"
+ "smin v29.4s, v29.4s, v14.4s\n"
+ "smin v30.4s, v30.4s, v14.4s\n"
+ "smin v31.4s, v31.4s, v14.4s\n"
+ "smax v16.4s, v16.4s, v15.4s\n"
+ "smax v17.4s, v17.4s, v15.4s\n"
+ "smax v18.4s, v18.4s, v15.4s\n"
+ "smax v19.4s, v19.4s, v15.4s\n"
+ "smax v20.4s, v20.4s, v15.4s\n"
+ "smax v21.4s, v21.4s, v15.4s\n"
+ "smax v22.4s, v22.4s, v15.4s\n"
+ "smax v23.4s, v23.4s, v15.4s\n"
+ "smax v24.4s, v24.4s, v15.4s\n"
+ "smax v25.4s, v25.4s, v15.4s\n"
+ "smax v26.4s, v26.4s, v15.4s\n"
+ "smax v27.4s, v27.4s, v15.4s\n"
+ "smax v28.4s, v28.4s, v15.4s\n"
+ "smax v29.4s, v29.4s, v15.4s\n"
+ "smax v30.4s, v30.4s, v15.4s\n"
+ "smax v31.4s, v31.4s, v15.4s\n"
+ "uzp1 v16.16b, v16.16b, v16.16b\n"
+ "uzp1 v17.16b, v17.16b, v17.16b\n"
+ "uzp1 v18.16b, v18.16b, v18.16b\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "uzp1 v16.16b, v16.16b, v16.16b\n"
+ "uzp1 v17.16b, v17.16b, v17.16b\n"
+ "uzp1 v18.16b, v18.16b, v18.16b\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "tbz %x[n_output_channels], #1, 24f\n"
+ "ldr x27, [%x[outptrs], #0x0]\n"
+ "ldr x26, [%x[outptrs], #0x8]\n"
+ "add x27, x27, x9\n"
+ "add x26, x26, x9\n"
+ "ldr x25, [%x[outptrs], #0x10]\n"
+ "ldr x24, [%x[outptrs], #0x18]\n"
+ "add x25, x25, x9\n"
+ "add x24, x24, x9\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "ldr x22, [%x[outptrs], #0x28]\n"
+ "add x23, x23, x9\n"
+ "add x22, x22, x9\n"
+ "ldr x21, [%x[outptrs], #0x30]\n"
+ "ldr x20, [%x[outptrs], #0x38]\n"
+ "add x21, x21, x9\n"
+ "add x20, x20, x9\n"
+ "st1 { v16.h }[0], [x27]\n"
+ "ldr x27, [%x[outptrs], #0x40]\n"
+ "add x27, x27, x9\n"
+ "st1 { v17.h }[0], [x26]\n"
+ "ldr x26, [%x[outptrs], #0x48]\n"
+ "add x26, x26, x9\n"
+ "st1 { v18.h }[0], [x25]\n"
+ "ldr x25, [%x[outptrs], #0x50]\n"
+ "add x25, x25, x9\n"
+ "st1 { v19.h }[0], [x24]\n"
+ "ldr x24, [%x[outptrs], #0x58]\n"
+ "add x24, x24, x9\n"
+ "st1 { v20.h }[0], [x23]\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
+ "add x23, x23, x9\n"
+ "st1 { v21.h }[0], [x22]\n"
+ "ldr x22, [%x[outptrs], #0x68]\n"
+ "add x22, x22, x9\n"
+ "st1 { v22.h }[0], [x21]\n"
+ "ldr x21, [%x[outptrs], #0x70]\n"
+ "add x21, x21, x9\n"
+ "st1 { v23.h }[0], [x20]\n"
+ "ldr x20, [%x[outptrs], #0x78]\n"
+ "add x20, x20, x9\n"
+ "add x9, x9, #0x2\n"
+ "st1 { v24.h }[0], [x27]\n"
+ "st1 { v25.h }[0], [x26]\n"
+ "st1 { v26.h }[0], [x25]\n"
+ "st1 { v27.h }[0], [x24]\n"
+ "st1 { v28.h }[0], [x23]\n"
+ "st1 { v29.h }[0], [x22]\n"
+ "st1 { v30.h }[0], [x21]\n"
+ "st1 { v31.h }[0], [x20]\n"
+ "tbz %x[n_output_channels], #0, 25f\n"
+ "ldr x27, [%x[outptrs], #0x0]\n"
+ "ldr x26, [%x[outptrs], #0x8]\n"
+ "add x27, x27, x9\n"
+ "add x26, x26, x9\n"
+ "ldr x25, [%x[outptrs], #0x10]\n"
+ "ldr x24, [%x[outptrs], #0x18]\n"
+ "add x25, x25, x9\n"
+ "add x24, x24, x9\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "ldr x22, [%x[outptrs], #0x28]\n"
+ "add x23, x23, x9\n"
+ "add x22, x22, x9\n"
+ "ldr x21, [%x[outptrs], #0x30]\n"
+ "ldr x20, [%x[outptrs], #0x38]\n"
+ "add x21, x21, x9\n"
+ "add x20, x20, x9\n"
+ "st1 { v16.b }[2], [x27]\n"
+ "ldr x27, [%x[outptrs], #0x40]\n"
+ "add x27, x27, x9\n"
+ "st1 { v17.b }[2], [x26]\n"
+ "ldr x26, [%x[outptrs], #0x48]\n"
+ "add x26, x26, x9\n"
+ "st1 { v18.b }[2], [x25]\n"
+ "ldr x25, [%x[outptrs], #0x50]\n"
+ "add x25, x25, x9\n"
+ "st1 { v19.b }[2], [x24]\n"
+ "ldr x24, [%x[outptrs], #0x58]\n"
+ "add x24, x24, x9\n"
+ "st1 { v20.b }[2], [x23]\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
+ "add x23, x23, x9\n"
+ "st1 { v21.b }[2], [x22]\n"
+ "ldr x22, [%x[outptrs], #0x68]\n"
+ "add x22, x22, x9\n"
+ "st1 { v22.b }[2], [x21]\n"
+ "ldr x21, [%x[outptrs], #0x70]\n"
+ "add x21, x21, x9\n"
+ "st1 { v23.b }[2], [x20]\n"
+ "ldr x20, [%x[outptrs], #0x78]\n"
+ "add x20, x20, x9\n"
+ "st1 { v24.b }[2], [x27]\n"
+ "st1 { v25.b }[2], [x26]\n"
+ "st1 { v26.b }[2], [x25]\n"
+ "st1 { v27.b }[2], [x24]\n"
+ "st1 { v28.b }[2], [x23]\n"
+ "st1 { v29.b }[2], [x22]\n"
+ "st1 { v30.b }[2], [x21]\n"
+ "st1 { v31.b }[2], [x20]\n"
+ "b 25f\n"
+ "24:" // Output channel oddments: Done: Store: Bit 1: Unset
+ "ldr x27, [%x[outptrs], #0x0]\n"
+ "ldr x26, [%x[outptrs], #0x8]\n"
+ "add x27, x27, x9\n"
+ "add x26, x26, x9\n"
+ "ldr x25, [%x[outptrs], #0x10]\n"
+ "ldr x24, [%x[outptrs], #0x18]\n"
+ "add x25, x25, x9\n"
+ "add x24, x24, x9\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "ldr x22, [%x[outptrs], #0x28]\n"
+ "add x23, x23, x9\n"
+ "add x22, x22, x9\n"
+ "ldr x21, [%x[outptrs], #0x30]\n"
+ "ldr x20, [%x[outptrs], #0x38]\n"
+ "add x21, x21, x9\n"
+ "add x20, x20, x9\n"
+ "st1 { v16.b }[0], [x27]\n"
+ "ldr x27, [%x[outptrs], #0x40]\n"
+ "add x27, x27, x9\n"
+ "st1 { v17.b }[0], [x26]\n"
+ "ldr x26, [%x[outptrs], #0x48]\n"
+ "add x26, x26, x9\n"
+ "st1 { v18.b }[0], [x25]\n"
+ "ldr x25, [%x[outptrs], #0x50]\n"
+ "add x25, x25, x9\n"
+ "st1 { v19.b }[0], [x24]\n"
+ "ldr x24, [%x[outptrs], #0x58]\n"
+ "add x24, x24, x9\n"
+ "st1 { v20.b }[0], [x23]\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
+ "add x23, x23, x9\n"
+ "st1 { v21.b }[0], [x22]\n"
+ "ldr x22, [%x[outptrs], #0x68]\n"
+ "add x22, x22, x9\n"
+ "st1 { v22.b }[0], [x21]\n"
+ "ldr x21, [%x[outptrs], #0x70]\n"
+ "add x21, x21, x9\n"
+ "st1 { v23.b }[0], [x20]\n"
+ "ldr x20, [%x[outptrs], #0x78]\n"
+ "add x20, x20, x9\n"
+ "st1 { v24.b }[0], [x27]\n"
+ "st1 { v25.b }[0], [x26]\n"
+ "st1 { v26.b }[0], [x25]\n"
+ "st1 { v27.b }[0], [x24]\n"
+ "st1 { v28.b }[0], [x23]\n"
+ "st1 { v29.b }[0], [x22]\n"
+ "st1 { v30.b }[0], [x21]\n"
+ "st1 { v31.b }[0], [x20]\n"
+ "25:" // Output channel oddments: Done: Store: Bit 1: End
+ "26:" // Done
+ : [weights] "+&r" (weights)
+ : [bias] "r" (bias), [inptrs] "r" (inptrs), [kernel_points] "r" ((uint64_t) kernel_points), [n_output_channels] "r" ((uint64_t) n_output_channels), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [offsetof_Requantize32_per_layer_left_shift] "I" (offsetof(arm_gemm::Requantize32, per_layer_left_shift)), [offsetof_Requantize32_per_layer_mul] "I" (offsetof(arm_gemm::Requantize32, per_layer_mul)), [offsetof_Requantize32_per_layer_right_shift] "I" (offsetof(arm_gemm::Requantize32, per_layer_right_shift)), [outptrs] "r" (outptrs), [qp] "r" (&qp), [rq_left_shift_ptr] "r" (per_channel_left_shifts), [rq_mul_ptr] "r" (per_channel_muls), [rq_right_shift_ptr] "r" (per_channel_right_shifts)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
new file mode 100644
index 0000000000..2b6f70c089
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
@@ -0,0 +1,67 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+namespace arm_conv {
+namespace depthwise {
+
+void sme2_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_indirect_impl(const __fp16 *const *const input_ptrs, __fp16 *const *const outptrs, const void *params, unsigned int n_channels, const __fp16 activation_min, const __fp16 activation_max);
+void sme2_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl(const unsigned int n_tile_rows, const unsigned int n_tile_cols, const __fp16 *inptr, int64_t ld_input_row, int64_t ld_input_col, __fp16 *outptr, int64_t ld_output_row, int64_t ld_output_col, const void *params, unsigned int n_channels, const __fp16 activation_min, const __fp16 activation_max);
+
+class sme2_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy<__fp16, __fp16, __fp16, __fp16>  // Strategy descriptor: fp16 NHWC depthwise conv, 3x3 kernel, stride 1, 2x2 output tile, SME2 implementation.
+{
+  private:
+  using Parent = DepthwiseDepthfirstStrategy<__fp16, __fp16, __fp16, __fp16>;  // input, weight, output and accumulator types are all __fp16
+  Parent::IndirectKernelType m_indirect_kernel = sme2_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_indirect_impl;  // pointer-array (gather) variant declared above
+  Parent::DirectKernelType m_direct_kernel = sme2_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl;  // strided-tile variant declared above
+
+  public:
+  using return_type = __fp16;
+  constexpr static auto vl_type = arm_gemm::VLType::SME;  // tells the framework this kernel is SME vector-length agnostic
+
+  constexpr static unsigned int kernel_rows = 3;
+  constexpr static unsigned int kernel_cols = 3;
+
+  constexpr static unsigned int stride_rows = 1;
+  constexpr static unsigned int stride_cols = 1;
+
+  constexpr static unsigned int output_rows = 2;
+  constexpr static unsigned int output_cols = 2;
+
+  sme2_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst(const CPUInfo *)
+  : Parent(output_rows, output_cols, kernel_rows, kernel_cols, stride_rows, stride_cols) {}  // CPUInfo unused; geometry forwarded to the base strategy
+
+  arm_gemm::VLType get_vl_type(void) const override { return vl_type; }
+
+  Parent::IndirectKernelType get_indirect_kernel() const override { return m_indirect_kernel; }  // selected when input rows are supplied via pointer arrays
+  Parent::DirectKernelType get_direct_kernel() const override { return m_direct_kernel; }  // selected when input is a dense strided tensor
+};
+
+} // namespace depthwise
+} // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp
new file mode 100644
index 0000000000..2d558ade3f
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp
@@ -0,0 +1,336 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(ARM_COMPUTE_ENABLE_SME2)
+
+namespace arm_conv {
+namespace depthwise {
+
+// Direct (tiled) kernel: 3x3 stride-1 fp16 NHWC depthwise convolution
+// producing a 2x2 output tile per tile-loop iteration, implemented in
+// SME2 streaming-SVE inline assembly.  Iterates over the
+// n_tile_rows x n_tile_cols grid of output tiles; within each tile it
+// loops over channels one vector-length at a time, accumulating with
+// predicated fmla and clamping results to [activation_min, activation_max]
+// via fclamp before storing.
+void sme2_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl(
+ const unsigned int n_tile_rows,
+ const unsigned int n_tile_cols,
+ const __fp16 *inptr,
+ int64_t ld_input_row,
+ int64_t ld_input_col,
+ __fp16 *outptr,
+ int64_t ld_output_row,
+ int64_t ld_output_col,
+ const void *params,
+ unsigned int n_channels,
+ const __fp16 activation_min,
+ const __fp16 activation_max
+)
+{
+ // Argument block handed to the assembly through a single pointer; the
+ // asm reads each member via the offsetof() immediates bound to the
+ // %[offsetof_args_*] operands below, so this layout must stay in sync
+ // with that operand list.
+ struct Args
+ {
+ const uint64_t n_tile_rows, n_tile_cols;
+ const __fp16 *inptr;
+ const uint64_t ld_input_row;
+ const uint64_t ld_input_col;
+ __fp16 *outptr;
+ const uint64_t ld_output_row;
+ const uint64_t ld_output_col;
+ const void *params;
+ const __fp16 min, max;
+
+ uint64_t tile_i = 0, tile_j = 0; // current tile position; stored/reloaded by the asm
+
+ Args(
+ const unsigned int n_tile_rows,
+ const unsigned int n_tile_cols,
+ const __fp16 *inptr,
+ int64_t ld_input_row,
+ int64_t ld_input_col,
+ __fp16 *outptr,
+ int64_t ld_output_row,
+ int64_t ld_output_col,
+ const void *params,
+ // NOTE(review): taken as float but stored into __fp16 members; the
+ // sibling indirect kernel's Args ctor takes __fp16 directly —
+ // behaviourally equivalent here, but confirm the inconsistency is
+ // intended by the generator.
+ const float activation_min,
+ const float activation_max
+ ) : n_tile_rows(n_tile_rows), n_tile_cols(n_tile_cols), inptr(inptr),
+ ld_input_row(ld_input_row), ld_input_col(ld_input_col), outptr(outptr),
+ ld_output_row(ld_output_row), ld_output_col(ld_output_col),
+ params(params), min(activation_min), max(activation_max)
+ {
+ }
+ };
+
+ Args params_struct(
+ n_tile_rows, n_tile_cols,
+ inptr, ld_input_row, ld_input_col,
+ outptr, ld_output_row, ld_output_col,
+ params, activation_min, activation_max
+ );
+
+ __asm__ __volatile__(
+ // Enter streaming-SVE mode and initialise the tile indices
+ // (x4 = tile_i, x5 = tile_j) and predicates.
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "mov x4, #0x0\n"
+ "mov x5, #0x0\n"
+ "ptrue p3.b\n"
+ ".inst 0x25207810 // ptrue pn8.b\n"
+ "1:" // Tile loop
+ // Persist tile indices, then derive the four input-row base
+ // pointers (x7/x16/x14/x13) for this tile from the strides.
+ "str x4, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "mov x22, #0x2\n"
+ "str x5, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "ldr x21, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
+ "ldr x6, [%x[params_struct], %[offsetof_args_ld_input_col]]\n"
+ "ldr x7, [%x[params_struct], %[offsetof_args_inptr]]\n"
+ "mul x20, x4, x21\n" // offset = tile_i * ld_input_row
+ "ldr x8, [%x[params_struct], %[offsetof_args_params]]\n"
+ "madd x20, x5, x6, x20\n" // offset += tile_j * ld_input_col
+ "mul x20, x20, x22\n" // offset *= kernel_stride * output_size
+ "add x17, x6, x6\n"
+ "add x7, x7, x20, LSL #1\n" // inptr[0] += offset * sizeof(__fp16)
+ "add x16, x7, x21, LSL #1\n"
+ "add x15, x17, x6\n"
+ "add x14, x16, x21, LSL #1\n"
+ "add x13, x14, x21, LSL #1\n"
+ // Prefetch the input rows, but only for the first tile of each
+ // row of tiles (tile_j == 0); later tiles reuse warmed cache.
+ "cbnz x5, 2f\n"
+ "ldr x24, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+ "lsl x12, %x[n_channels], #0x1\n"
+ "mov x21, #0x4\n"
+ "mul x21, x21, x6\n"
+ "add x11, x16, x6, LSL #1\n"
+ "add x10, x7, x15, LSL #1\n"
+ "add x9, x16, x17, LSL #1\n"
+ "sub x20, x24, x5\n"
+ "add x28, x14, x6, LSL #1\n"
+ "sub x20, x20, #0x1\n"
+ "add x27, x13, x15, LSL #1\n"
+ "and x20, x20, #0x3fffff\n"
+ "add x26, x7, x6, LSL #1\n"
+ "orr x12, x12, x20, LSL #22\n"
+ "add x25, x7, x17, LSL #1\n"
+ "orr x12, x12, x21, LSL #38\n"
+ "add x24, x14, x17, LSL #1\n"
+ "add x23, x16, x15, LSL #1\n"
+ "add x22, x14, x15, LSL #1\n"
+ "add x21, x13, x6, LSL #1\n"
+ "add x20, x13, x17, LSL #1\n"
+ ".inst 0xf8ac497a // rprfm pldonce, x12, [x11]\n"
+ ".inst 0xf8ac48fa // rprfm pldonce, x12, [x7]\n"
+ ".inst 0xf8ac495a // rprfm pldonce, x12, [x10]\n"
+ ".inst 0xf8ac493a // rprfm pldonce, x12, [x9]\n"
+ ".inst 0xf8ac4b9a // rprfm pldonce, x12, [x28]\n"
+ ".inst 0xf8ac49ba // rprfm pldonce, x12, [x13]\n"
+ ".inst 0xf8ac4b7a // rprfm pldonce, x12, [x27]\n"
+ ".inst 0xf8ac4b5a // rprfm pldonce, x12, [x26]\n"
+ ".inst 0xf8ac4b3a // rprfm pldonce, x12, [x25]\n"
+ ".inst 0xf8ac4b1a // rprfm pldonce, x12, [x24]\n"
+ ".inst 0xf8ac4a1a // rprfm pldonce, x12, [x16]\n"
+ ".inst 0xf8ac4afa // rprfm pldonce, x12, [x23]\n"
+ ".inst 0xf8ac49da // rprfm pldonce, x12, [x14]\n"
+ ".inst 0xf8ac4ada // rprfm pldonce, x12, [x22]\n"
+ ".inst 0xf8ac4aba // rprfm pldonce, x12, [x21]\n"
+ ".inst 0xf8ac4a9a // rprfm pldonce, x12, [x20]\n"
+ "2:" // Tile loop: Prefetch input rows: End
+ // Load bias (z18), the nine weights (z0-z8), the clamp bounds
+ // (z17/z16), compute the two output-row pointers (x23/x22), and
+ // preload the first batch of input vectors (z9-z13).
+ "ldr x26, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
+ "mov x20, #0x2\n"
+ "ld1h { z18.h }, p3/Z, [x8]\n"
+ "addvl x8, x8, #1\n"
+ "ldr x25, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
+ "cnth x24\n"
+ ".inst 0xa040a100 // ld1h { z0.h-z3.h }, pn8.b/Z, [x8]\n"
+ "addvl x8, x8, #4\n"
+ "ldr x23, [%x[params_struct], %[offsetof_args_outptr]]\n"
+ "whilelt p2.h, XZR, %x[n_channels]\n"
+ ".inst 0xa040a104 // ld1h { z4.h-z7.h }, pn8.b/Z, [x8]\n"
+ "addvl x8, x8, #4\n"
+ "mul x22, x4, x26\n" // offset = tile_i * ld_output_row
+ "cmp x24, %x[n_channels]\n"
+ "ld1rh { z17.h }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+ "madd x22, x5, x25, x22\n" // offset += tile_j * ld_output_col
+ "ld1rh { z16.h }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+ "mov x21, #0x0\n"
+ "mul x22, x22, x20\n" // offset *= output_tile_size
+ "sub x20, XZR, x24\n"
+ "ld1h { z8.h }, p3/Z, [x8]\n"
+ "add x23, x23, x22, LSL #1\n" // outptrs[0] += offset * sizeof(__fp16)
+ "ld1h { z9.h }, p2/Z, [x16, x6, LSL #1]\n"
+ "addvl x8, x8, #1\n"
+ "add x22, x23, x26, LSL #1\n"
+ "ld1h { z10.h }, p2/Z, [x7]\n"
+ "ld1h { z11.h }, p2/Z, [x7, x15, LSL #1]\n"
+ "ld1h { z12.h }, p2/Z, [x16, x17, LSL #1]\n"
+ "ld1h { z13.h }, p2/Z, [x14, x6, LSL #1]\n"
+ "bge 4f\n"
+ "3:" // Tile loop: Channel loop
+ // Main channel loop: accumulate the four output vectors
+ // (z28-z31, one per output pixel of the 2x2 tile) while
+ // preloading inputs and weights for the next vector of channels,
+ // then clamp and store.
+ "movprfx z28, z18\n fmla z28.h, p3/M, z4.h, z9.h\n"
+ "movprfx z29, z18\n fmla z29.h, p3/M, z3.h, z9.h\n"
+ "whilelt p1.h, x24, %x[n_channels]\n"
+ "inch x21\n"
+ "movprfx z30, z18\n fmla z30.h, p3/M, z1.h, z9.h\n"
+ "movprfx z31, z18\n fmla z31.h, p3/M, z0.h, z9.h\n"
+ "ld1h { z9.h }, p2/Z, [x13]\n"
+ "inch x24\n"
+ "ld1h { z18.h }, p3/Z, [x8]\n"
+ "addvl x8, x8, #1\n"
+ "mov p0.b, p2.b\n"
+ "inch x20\n"
+ "fmla z28.h, p3/M, z0.h, z10.h\n"
+ "ld1h { z10.h }, p2/Z, [x14, x17, LSL #1]\n"
+ "fmla z29.h, p3/M, z2.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x13, x15, LSL #1]\n"
+ "fmla z30.h, p3/M, z2.h, z12.h\n"
+ "fmla z31.h, p3/M, z1.h, z12.h\n"
+ "fmla z28.h, p3/M, z5.h, z12.h\n"
+ "fmla z29.h, p3/M, z4.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x7, x6, LSL #1]\n"
+ "fmla z30.h, p3/M, z6.h, z9.h\n"
+ "ld1h { z9.h }, p2/Z, [x7, x17, LSL #1]\n"
+ "addvl x7, x7, #1\n"
+ "fmla z31.h, p3/M, z3.h, z13.h\n"
+ "fmla z28.h, p3/M, z7.h, z13.h\n"
+ "fmla z29.h, p3/M, z6.h, z13.h\n"
+ "fmla z30.h, p3/M, z4.h, z13.h\n"
+ "fmla z31.h, p3/M, z8.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x16]\n"
+ "fmla z28.h, p3/M, z1.h, z12.h\n"
+ "fmla z29.h, p3/M, z0.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x16, x15, LSL #1]\n"
+ "addvl x16, x16, #1\n"
+ "fmla z30.h, p3/M, z5.h, z10.h\n"
+ "fmla z31.h, p3/M, z4.h, z10.h\n"
+ "fmla z28.h, p3/M, z2.h, z9.h\n"
+ "fmla z29.h, p3/M, z1.h, z9.h\n"
+ "ld1h { z9.h }, p2/Z, [x14]\n"
+ "fmla z30.h, p3/M, z0.h, z11.h\n"
+ "fmla z31.h, p3/M, z2.h, z12.h\n"
+ "fmla z28.h, p3/M, z8.h, z10.h\n"
+ "fmla z29.h, p3/M, z7.h, z10.h\n"
+ "ld1h { z10.h }, p2/Z, [x14, x15, LSL #1]\n"
+ "addvl x14, x14, #1\n"
+ "ld1h { z13.h }, p1/Z, [x14, x6, LSL #1]\n"
+ "fmla z30.h, p3/M, z3.h, z9.h\n"
+ "fmla z31.h, p3/M, z5.h, z10.h\n"
+ "fmla z28.h, p3/M, z3.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x13, x6, LSL #1]\n"
+ ".inst 0xa040a100 // ld1h { z0.h-z3.h }, pn8.b/Z, [x8]\n"
+ "addvl x8, x8, #4\n"
+ "fmla z29.h, p3/M, z5.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x13, x17, LSL #1]\n"
+ "whilelt p2.h, x21, %x[n_channels]\n"
+ "cmp x24, %x[n_channels]\n"
+ "fmla z30.h, p3/M, z7.h, z11.h\n"
+ "fmla z31.h, p3/M, z6.h, z11.h\n"
+ "addvl x13, x13, #1\n"
+ "ld1h { z11.h }, p1/Z, [x7, x15, LSL #1]\n"
+ "fmla z28.h, p3/M, z6.h, z9.h\n"
+ "ld1h { z9.h }, p1/Z, [x16, x6, LSL #1]\n"
+ "fmla z29.h, p3/M, z8.h, z10.h\n"
+ "ld1h { z10.h }, p1/Z, [x7]\n"
+ "fmla z30.h, p3/M, z8.h, z12.h\n"
+ "fmla z31.h, p3/M, z7.h, z12.h\n"
+ ".inst 0xa040a104 // ld1h { z4.h-z7.h }, pn8.b/Z, [x8]\n"
+ "addvl x8, x8, #4\n"
+ "ld1h { z12.h }, p1/Z, [x16, x17, LSL #1]\n"
+ "ld1h { z8.h }, p3/Z, [x8]\n"
+ "addvl x8, x8, #1\n"
+ ".inst 0xc170ca3c // fclamp { z28.h-z31.h }, z17.h, z16.h\n"
+ "st1h { z28.h }, p0, [x23]\n"
+ "st1h { z29.h }, p0, [x23, x25, LSL #1]\n"
+ "addvl x23, x23, #1\n"
+ "st1h { z30.h }, p0, [x22]\n"
+ "st1h { z31.h }, p0, [x22, x25, LSL #1]\n"
+ "addvl x22, x22, #1\n"
+ "blt 3b\n"
+ "4:" // Tile loop: Channel tail
+ // Channel tail: same MLA schedule as the loop body, without
+ // preloading for a next iteration; also advances (tile_i, tile_j)
+ // for the next trip around the outer tile loop.
+ "movprfx z28, z18\n fmla z28.h, p3/M, z4.h, z9.h\n"
+ "movprfx z29, z18\n fmla z29.h, p3/M, z3.h, z9.h\n"
+ "ldr x5, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "mov p0.b, p2.b\n"
+ "movprfx z30, z18\n fmla z30.h, p3/M, z1.h, z9.h\n"
+ "movprfx z31, z18\n fmla z31.h, p3/M, z0.h, z9.h\n"
+ "ld1h { z9.h }, p2/Z, [x13]\n"
+ "ldr x4, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "ldr x24, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+ "ldr x21, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
+ "add x5, x5, #0x1\n"
+ "fmla z28.h, p3/M, z0.h, z10.h\n"
+ "ld1h { z10.h }, p2/Z, [x14, x17, LSL #1]\n"
+ "fmla z29.h, p3/M, z2.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x13, x15, LSL #1]\n"
+ "add x20, x4, #0x1\n"
+ "fmla z30.h, p3/M, z2.h, z12.h\n"
+ "fmla z31.h, p3/M, z1.h, z12.h\n"
+ "cmp x5, x24\n"
+ "csel x4, x4, x20, LT\n"
+ "csel x5, x5, XZR, LT\n"
+ "cmp x4, x21\n"
+ "fmla z28.h, p3/M, z5.h, z12.h\n"
+ "fmla z29.h, p3/M, z4.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x7, x6, LSL #1]\n"
+ "fmla z30.h, p3/M, z6.h, z9.h\n"
+ "ld1h { z9.h }, p2/Z, [x7, x17, LSL #1]\n"
+ "fmla z31.h, p3/M, z3.h, z13.h\n"
+ "fmla z28.h, p3/M, z7.h, z13.h\n"
+ "fmla z29.h, p3/M, z6.h, z13.h\n"
+ "fmla z30.h, p3/M, z4.h, z13.h\n"
+ "fmla z31.h, p3/M, z8.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x16]\n"
+ "fmla z28.h, p3/M, z1.h, z12.h\n"
+ "fmla z29.h, p3/M, z0.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x16, x15, LSL #1]\n"
+ "fmla z30.h, p3/M, z5.h, z10.h\n"
+ "fmla z31.h, p3/M, z4.h, z10.h\n"
+ "fmla z28.h, p3/M, z2.h, z9.h\n"
+ "fmla z29.h, p3/M, z1.h, z9.h\n"
+ "ld1h { z9.h }, p2/Z, [x14]\n"
+ "fmla z30.h, p3/M, z0.h, z11.h\n"
+ "fmla z31.h, p3/M, z2.h, z12.h\n"
+ "fmla z28.h, p3/M, z8.h, z10.h\n"
+ "fmla z29.h, p3/M, z7.h, z10.h\n"
+ "ld1h { z10.h }, p2/Z, [x14, x15, LSL #1]\n"
+ "fmla z30.h, p3/M, z3.h, z9.h\n"
+ "fmla z31.h, p3/M, z5.h, z10.h\n"
+ "fmla z28.h, p3/M, z3.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x13, x6, LSL #1]\n"
+ "fmla z29.h, p3/M, z5.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x13, x17, LSL #1]\n"
+ "fmla z30.h, p3/M, z7.h, z11.h\n"
+ "fmla z31.h, p3/M, z6.h, z11.h\n"
+ "fmla z28.h, p3/M, z6.h, z9.h\n"
+ "fmla z29.h, p3/M, z8.h, z10.h\n"
+ "fmla z30.h, p3/M, z8.h, z12.h\n"
+ "fmla z31.h, p3/M, z7.h, z12.h\n"
+ ".inst 0xc170ca3c // fclamp { z28.h-z31.h }, z17.h, z16.h\n"
+ "st1h { z28.h }, p0, [x23]\n"
+ "st1h { z29.h }, p0, [x23, x25, LSL #1]\n"
+ "st1h { z30.h }, p0, [x22]\n"
+ "st1h { z31.h }, p0, [x22, x25, LSL #1]\n"
+ "blt 1b\n"
+ // Exit streaming mode before returning to C++.
+ ".inst 0xd503467f // SMSTOP\n"
+ :
+ : [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME2)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp
new file mode 100644
index 0000000000..415e344832
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp
@@ -0,0 +1,277 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(ARM_COMPUTE_ENABLE_SME2)
+
+namespace arm_conv {
+namespace depthwise {
+
+// Indirect (pointer-array) kernel: same 3x3 stride-1 fp16 depthwise
+// computation as the direct variant, but inputs come via an array of
+// 16 per-pixel pointers (so the caller can substitute a padding buffer
+// for out-of-bounds pixels) and the four outputs of the 2x2 tile are
+// written through an array of output pointers.
+void sme2_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_indirect_impl(
+ const __fp16 *const *const input_ptrs,
+ __fp16 *const *const outptrs,
+ const void *params,
+ unsigned int n_channels,
+ const __fp16 activation_min,
+ const __fp16 activation_max
+)
+{
+ // Argument block read by the assembly via offsetof() immediates; the
+ // member layout must match the %[offsetof_*] operands below.
+ struct Args
+ {
+ __fp16 *const *outptrs;
+ const void *params;
+ const __fp16 min, max;
+ const __fp16 *inptrs[16];
+
+ Args(
+ const __fp16 *const *const input_ptrs,
+ __fp16 *const *const outptrs,
+ const void *const params,
+ const __fp16 min,
+ const __fp16 max
+ ) : outptrs(outptrs), params(params), min(min), max(max)
+ {
+ // Permute the caller's row-major 4x4 input-pixel pointers into
+ // the order the assembly consumes them in (presumably chosen by
+ // the kernel generator to match its load schedule).
+ inptrs[0] = input_ptrs[5];
+ inptrs[1] = input_ptrs[0];
+ inptrs[2] = input_ptrs[3];
+ inptrs[3] = input_ptrs[6];
+ inptrs[4] = input_ptrs[9];
+ inptrs[5] = input_ptrs[12];
+ inptrs[6] = input_ptrs[15];
+ inptrs[7] = input_ptrs[1];
+ inptrs[8] = input_ptrs[2];
+ inptrs[9] = input_ptrs[10];
+ inptrs[10] = input_ptrs[4];
+ inptrs[11] = input_ptrs[7];
+ inptrs[12] = input_ptrs[8];
+ inptrs[13] = input_ptrs[11];
+ inptrs[14] = input_ptrs[13];
+ inptrs[15] = input_ptrs[14];
+
+ }
+ };
+
+ Args params_struct(input_ptrs, outptrs, params,
+ activation_min, activation_max);
+
+ __asm__ __volatile__(
+ // Prologue: enter streaming mode, load the four output pointers
+ // (x11/x10/x28/x27), the bias (z16), weights (z0-z8), clamp
+ // bounds (z18/z17), and the first batch of input vectors.
+ "ldr x20, [%x[params_struct], %[offsetof_args_outptrs]]\n"
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "add x16, %x[params_struct], %[offsetof_Args_inptrs]\n"
+ "mov x15, #0x0\n"
+ "ldr x14, [%x[params_struct], %[offsetof_args_params]]\n"
+ "ptrue p3.b\n"
+ ".inst 0x25207810 // ptrue pn8.b\n"
+ "ldr x13, [x16, #0x20]\n"
+ "cnth x12\n"
+ "whilelt p2.h, XZR, %x[n_channels]\n"
+ "ld1rh { z18.h }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+ "ldp x11, x10, [x20, #0x0]\n"
+ "cmp x12, %x[n_channels]\n"
+ "ld1rh { z17.h }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+ "sub x9, XZR, x12\n"
+ "ldp x28, x27, [x20, #0x10]\n"
+ "ld1h { z16.h }, p3/Z, [x14]\n"
+ "addvl x14, x14, #1\n"
+ "ldp x26, x25, [x16, #0x0]\n"
+ ".inst 0xa040a1c0 // ld1h { z0.h-z3.h }, pn8.b/Z, [x14]\n"
+ "addvl x14, x14, #4\n"
+ "ldp x24, x23, [x16, #0x10]\n"
+ ".inst 0xa040a1c4 // ld1h { z4.h-z7.h }, pn8.b/Z, [x14]\n"
+ "addvl x14, x14, #4\n"
+ "ld1h { z8.h }, p3/Z, [x14]\n"
+ "addvl x14, x14, #1\n"
+ "ld1h { z9.h }, p2/Z, [x26, x15, LSL #1]\n"
+ "ld1h { z10.h }, p2/Z, [x25, x15, LSL #1]\n"
+ "ld1h { z11.h }, p2/Z, [x24, x15, LSL #1]\n"
+ "ld1h { z12.h }, p2/Z, [x23, x15, LSL #1]\n"
+ "ld1h { z13.h }, p2/Z, [x13, x15, LSL #1]\n"
+ "bge 2f\n"
+ "1:" // Channel loop
+ // Main channel loop: accumulate the four output vectors
+ // (z28-z31) via the indirect input pointers while preloading
+ // the next vector of channels, then clamp and store.
+ "movprfx z28, z16\n fmla z28.h, p3/M, z4.h, z9.h\n"
+ "movprfx z29, z16\n fmla z29.h, p3/M, z3.h, z9.h\n"
+ "ldr x22, [x16, #0x28]\n"
+ "whilelt p1.h, x12, %x[n_channels]\n"
+ "movprfx z30, z16\n fmla z30.h, p3/M, z1.h, z9.h\n"
+ "movprfx z31, z16\n fmla z31.h, p3/M, z0.h, z9.h\n"
+ "ldr x21, [x16, #0x30]\n"
+ "ld1h { z16.h }, p3/Z, [x14]\n"
+ "ldr x20, [x16, #0x38]\n"
+ "addvl x14, x14, #1\n"
+ "inch x9\n"
+ "ld1h { z9.h }, p2/Z, [x22, x15, LSL #1]\n"
+ "ldr x25, [x16, #0x48]\n"
+ "mov p0.b, p2.b\n"
+ "fmla z28.h, p3/M, z0.h, z10.h\n"
+ "fmla z29.h, p3/M, z2.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x21, x15, LSL #1]\n"
+ "ldr x26, [x16, #0x40]\n"
+ "fmla z30.h, p3/M, z2.h, z12.h\n"
+ "fmla z31.h, p3/M, z1.h, z12.h\n"
+ "ldr x24, [x16, #0x50]\n"
+ "ld1h { z10.h }, p2/Z, [x25, x15, LSL #1]\n"
+ "ldr x23, [x16, #0x58]\n"
+ "fmla z28.h, p3/M, z5.h, z12.h\n"
+ "fmla z29.h, p3/M, z4.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x20, x15, LSL #1]\n"
+ "ldr x13, [x16, #0x60]\n"
+ "fmla z30.h, p3/M, z6.h, z9.h\n"
+ "ld1h { z9.h }, p2/Z, [x26, x15, LSL #1]\n"
+ "fmla z31.h, p3/M, z3.h, z13.h\n"
+ "ldr x22, [x16, #0x68]\n"
+ "ldr x21, [x16, #0x70]\n"
+ "fmla z28.h, p3/M, z7.h, z13.h\n"
+ "fmla z29.h, p3/M, z6.h, z13.h\n"
+ "ldr x20, [x16, #0x78]\n"
+ "fmla z30.h, p3/M, z4.h, z13.h\n"
+ "fmla z31.h, p3/M, z8.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x24, x15, LSL #1]\n"
+ "ldp x26, x25, [x16, #0x0]\n"
+ "fmla z28.h, p3/M, z1.h, z12.h\n"
+ "fmla z29.h, p3/M, z0.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x23, x15, LSL #1]\n"
+ "ldp x24, x23, [x16, #0x10]\n"
+ "fmla z30.h, p3/M, z5.h, z10.h\n"
+ "fmla z31.h, p3/M, z4.h, z10.h\n"
+ "fmla z28.h, p3/M, z2.h, z9.h\n"
+ "fmla z29.h, p3/M, z1.h, z9.h\n"
+ "ld1h { z9.h }, p2/Z, [x13, x15, LSL #1]\n"
+ "ldr x13, [x16, #0x20]\n"
+ "fmla z30.h, p3/M, z0.h, z11.h\n"
+ "fmla z31.h, p3/M, z2.h, z12.h\n"
+ "ld1h { z13.h }, p1/Z, [x13, x12, LSL #1]\n"
+ "fmla z28.h, p3/M, z8.h, z10.h\n"
+ "fmla z29.h, p3/M, z7.h, z10.h\n"
+ "ld1h { z10.h }, p2/Z, [x22, x15, LSL #1]\n"
+ "fmla z30.h, p3/M, z3.h, z9.h\n"
+ "fmla z31.h, p3/M, z5.h, z10.h\n"
+ "fmla z28.h, p3/M, z3.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x21, x15, LSL #1]\n"
+ ".inst 0xa040a1c0 // ld1h { z0.h-z3.h }, pn8.b/Z, [x14]\n"
+ "addvl x14, x14, #4\n"
+ "fmla z29.h, p3/M, z5.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x20, x15, LSL #1]\n"
+ "inch x15\n"
+ "fmla z30.h, p3/M, z7.h, z11.h\n"
+ "fmla z31.h, p3/M, z6.h, z11.h\n"
+ "ld1h { z11.h }, p1/Z, [x24, x12, LSL #1]\n"
+ "whilelt p2.h, x15, %x[n_channels]\n"
+ "fmla z28.h, p3/M, z6.h, z9.h\n"
+ "ld1h { z9.h }, p1/Z, [x26, x12, LSL #1]\n"
+ "fmla z29.h, p3/M, z8.h, z10.h\n"
+ "ld1h { z10.h }, p1/Z, [x25, x12, LSL #1]\n"
+ "fmla z30.h, p3/M, z8.h, z12.h\n"
+ "fmla z31.h, p3/M, z7.h, z12.h\n"
+ "ld1h { z12.h }, p1/Z, [x23, x12, LSL #1]\n"
+ "inch x12\n"
+ ".inst 0xa040a1c4 // ld1h { z4.h-z7.h }, pn8.b/Z, [x14]\n"
+ "addvl x14, x14, #4\n"
+ "cmp x12, %x[n_channels]\n"
+ "ld1h { z8.h }, p3/Z, [x14]\n"
+ "addvl x14, x14, #1\n"
+ ".inst 0xc171ca5c // fclamp { z28.h-z31.h }, z18.h, z17.h\n"
+ "st1h { z28.h }, p0, [x11, x9, LSL #1]\n"
+ "st1h { z29.h }, p0, [x10, x9, LSL #1]\n"
+ "st1h { z30.h }, p0, [x28, x9, LSL #1]\n"
+ "st1h { z31.h }, p0, [x27, x9, LSL #1]\n"
+ "blt 1b\n"
+ "2:" // Channel tail
+ // Channel tail: same MLA schedule for the final (possibly
+ // partial) vector of channels, with no next-iteration preloads.
+ "movprfx z28, z16\n fmla z28.h, p3/M, z4.h, z9.h\n"
+ "movprfx z29, z16\n fmla z29.h, p3/M, z3.h, z9.h\n"
+ "ldr x22, [x16, #0x28]\n"
+ "inch x9\n"
+ "movprfx z30, z16\n fmla z30.h, p3/M, z1.h, z9.h\n"
+ "movprfx z31, z16\n fmla z31.h, p3/M, z0.h, z9.h\n"
+ "ldr x21, [x16, #0x30]\n"
+ "mov p0.b, p2.b\n"
+ "ldr x20, [x16, #0x38]\n"
+ "ld1h { z9.h }, p2/Z, [x22, x15, LSL #1]\n"
+ "ldr x25, [x16, #0x48]\n"
+ "fmla z28.h, p3/M, z0.h, z10.h\n"
+ "fmla z29.h, p3/M, z2.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x21, x15, LSL #1]\n"
+ "ldr x26, [x16, #0x40]\n"
+ "fmla z30.h, p3/M, z2.h, z12.h\n"
+ "fmla z31.h, p3/M, z1.h, z12.h\n"
+ "ldr x24, [x16, #0x50]\n"
+ "ld1h { z10.h }, p2/Z, [x25, x15, LSL #1]\n"
+ "ldr x23, [x16, #0x58]\n"
+ "fmla z28.h, p3/M, z5.h, z12.h\n"
+ "fmla z29.h, p3/M, z4.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x20, x15, LSL #1]\n"
+ "ldr x13, [x16, #0x60]\n"
+ "fmla z30.h, p3/M, z6.h, z9.h\n"
+ "ld1h { z9.h }, p2/Z, [x26, x15, LSL #1]\n"
+ "fmla z31.h, p3/M, z3.h, z13.h\n"
+ "ldr x22, [x16, #0x68]\n"
+ "ldr x21, [x16, #0x70]\n"
+ "fmla z28.h, p3/M, z7.h, z13.h\n"
+ "fmla z29.h, p3/M, z6.h, z13.h\n"
+ "ldr x20, [x16, #0x78]\n"
+ "fmla z30.h, p3/M, z4.h, z13.h\n"
+ "fmla z31.h, p3/M, z8.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x24, x15, LSL #1]\n"
+ "fmla z28.h, p3/M, z1.h, z12.h\n"
+ "fmla z29.h, p3/M, z0.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x23, x15, LSL #1]\n"
+ "fmla z30.h, p3/M, z5.h, z10.h\n"
+ "fmla z31.h, p3/M, z4.h, z10.h\n"
+ "fmla z28.h, p3/M, z2.h, z9.h\n"
+ "fmla z29.h, p3/M, z1.h, z9.h\n"
+ "ld1h { z9.h }, p2/Z, [x13, x15, LSL #1]\n"
+ "fmla z30.h, p3/M, z0.h, z11.h\n"
+ "fmla z31.h, p3/M, z2.h, z12.h\n"
+ "fmla z28.h, p3/M, z8.h, z10.h\n"
+ "fmla z29.h, p3/M, z7.h, z10.h\n"
+ "ld1h { z10.h }, p2/Z, [x22, x15, LSL #1]\n"
+ "fmla z30.h, p3/M, z3.h, z9.h\n"
+ "fmla z31.h, p3/M, z5.h, z10.h\n"
+ "fmla z28.h, p3/M, z3.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x21, x15, LSL #1]\n"
+ "fmla z29.h, p3/M, z5.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x20, x15, LSL #1]\n"
+ "fmla z30.h, p3/M, z7.h, z11.h\n"
+ "fmla z31.h, p3/M, z6.h, z11.h\n"
+ "fmla z28.h, p3/M, z6.h, z9.h\n"
+ "fmla z29.h, p3/M, z8.h, z10.h\n"
+ "fmla z30.h, p3/M, z8.h, z12.h\n"
+ "fmla z31.h, p3/M, z7.h, z12.h\n"
+ ".inst 0xc171ca5c // fclamp { z28.h-z31.h }, z18.h, z17.h\n"
+ "st1h { z28.h }, p0, [x11, x9, LSL #1]\n"
+ "st1h { z29.h }, p0, [x10, x9, LSL #1]\n"
+ "st1h { z30.h }, p0, [x28, x9, LSL #1]\n"
+ "st1h { z31.h }, p0, [x27, x9, LSL #1]\n"
+ // Leave streaming mode before returning to C++.
+ ".inst 0xd503467f // SMSTOP\n"
+ :
+ : [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME2)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp
new file mode 100644
index 0000000000..f90fbc3906
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp
@@ -0,0 +1,67 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+namespace arm_conv {
+namespace depthwise {
+
+void sme2_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(const __fp16 *const *const input_ptrs, __fp16 *const *const outptrs, const void *params, unsigned int n_channels, const __fp16 activation_min, const __fp16 activation_max);
+void sme2_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl(const unsigned int n_tile_rows, const unsigned int n_tile_cols, const __fp16 *inptr, int64_t ld_input_row, int64_t ld_input_col, __fp16 *outptr, int64_t ld_output_row, int64_t ld_output_col, const void *params, unsigned int n_channels, const __fp16 activation_min, const __fp16 activation_max);
+
+// Strategy descriptor for the SME2 fp16 NHWC 3x3/stride-1 depthwise
+// kernels with a 3x3 output tile; mirrors the 2x2-tile strategy above
+// but with larger tile geometry.
+class sme2_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst : public DepthwiseDepthfirstStrategy<__fp16, __fp16, __fp16, __fp16>
+{
+ private:
+ using Parent = DepthwiseDepthfirstStrategy<__fp16, __fp16, __fp16, __fp16>;
+ // Function pointers to the assembly implementations exposed via the
+ // get_*_kernel() overrides.
+ Parent::IndirectKernelType m_indirect_kernel = sme2_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl;
+ Parent::DirectKernelType m_direct_kernel = sme2_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl;
+
+ public:
+ using return_type = __fp16;
+ constexpr static auto vl_type = arm_gemm::VLType::SME;
+
+ // Static kernel geometry: 3x3 window, stride 1, 3x3 outputs per tile.
+ constexpr static unsigned int kernel_rows = 3;
+ constexpr static unsigned int kernel_cols = 3;
+
+ constexpr static unsigned int stride_rows = 1;
+ constexpr static unsigned int stride_cols = 1;
+
+ constexpr static unsigned int output_rows = 3;
+ constexpr static unsigned int output_cols = 3;
+
+ // CPUInfo is unused; geometry is fixed at compile time.
+ sme2_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst(const CPUInfo *)
+ : Parent(output_rows, output_cols, kernel_rows, kernel_cols, stride_rows, stride_cols) {}
+
+ arm_gemm::VLType get_vl_type(void) const override { return vl_type; }
+
+ Parent::IndirectKernelType get_indirect_kernel() const override { return m_indirect_kernel; }
+ Parent::DirectKernelType get_direct_kernel() const override { return m_direct_kernel; }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp
new file mode 100644
index 0000000000..3a7d1cb0b4
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp
@@ -0,0 +1,483 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(ARM_COMPUTE_ENABLE_SME2)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sme2_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl(
+ const unsigned int n_tile_rows,
+ const unsigned int n_tile_cols,
+ const __fp16 *inptr,
+ int64_t ld_input_row,
+ int64_t ld_input_col,
+ __fp16 *outptr,
+ int64_t ld_output_row,
+ int64_t ld_output_col,
+ const void *params,
+ unsigned int n_channels,
+ const __fp16 activation_min,
+ const __fp16 activation_max
+)
+{
+ struct Args
+ {
+ const uint64_t n_tile_rows, n_tile_cols;
+ const __fp16 *inptr;
+ const uint64_t ld_input_row;
+ const uint64_t ld_input_col;
+ __fp16 *outptr;
+ const uint64_t ld_output_row;
+ const uint64_t ld_output_col;
+ const void *params;
+ const __fp16 min, max;
+
+ uint64_t tile_i = 0, tile_j = 0;
+
+ Args(
+ const unsigned int n_tile_rows,
+ const unsigned int n_tile_cols,
+ const __fp16 *inptr,
+ int64_t ld_input_row,
+ int64_t ld_input_col,
+ __fp16 *outptr,
+ int64_t ld_output_row,
+ int64_t ld_output_col,
+ const void *params,
+ const float activation_min,
+ const float activation_max
+ ) : n_tile_rows(n_tile_rows), n_tile_cols(n_tile_cols), inptr(inptr),
+ ld_input_row(ld_input_row), ld_input_col(ld_input_col), outptr(outptr),
+ ld_output_row(ld_output_row), ld_output_col(ld_output_col),
+ params(params), min(activation_min), max(activation_max)
+ {
+ }
+ };
+
+ Args params_struct(
+ n_tile_rows, n_tile_cols,
+ inptr, ld_input_row, ld_input_col,
+ outptr, ld_output_row, ld_output_col,
+ params, activation_min, activation_max
+ );
+
+ __asm__ __volatile__(
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "mov x2, #0x0\n"
+ "mov x3, #0x0\n"
+ "ptrue p3.b\n"
+ ".inst 0x25207810 // ptrue pn8.b\n"
+ "1:" // Tile loop
+ "str x2, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "mov x22, #0x3\n"
+ "str x3, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "ldr x21, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
+ "ldr x4, [%x[params_struct], %[offsetof_args_ld_input_col]]\n"
+ "ldr x5, [%x[params_struct], %[offsetof_args_inptr]]\n"
+ "mul x20, x2, x21\n" // offset = tile_i * ld_input_row
+ "ldr x6, [%x[params_struct], %[offsetof_args_params]]\n"
+ "madd x20, x3, x4, x20\n" // offset += tile_j * ld_input_col
+ "mul x20, x20, x22\n" // offset *= kernel_stride * output_size
+ "add x7, x4, x4\n"
+ "add x5, x5, x20, LSL #1\n" // inptr[0] += offset * sizeof(__fp16)
+ "add x8, x5, x21, LSL #1\n"
+ "add x17, x7, x4\n"
+ "add x16, x8, x21, LSL #1\n"
+ "add x15, x17, x4\n"
+ "add x14, x16, x21, LSL #1\n"
+ "add x13, x14, x21, LSL #1\n"
+ "cbnz x3, 2f\n"
+ "ldr x9, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+ "lsl x12, %x[n_channels], #0x1\n"
+ "mov x28, #0x6\n"
+ "mul x28, x28, x4\n"
+ "add x27, x16, x7, LSL #1\n"
+ "add x26, x5, x15, LSL #1\n"
+ "add x25, x8, x7, LSL #1\n"
+ "sub x20, x9, x3\n"
+ "add x24, x13, x15, LSL #1\n"
+ "sub x20, x20, #0x1\n"
+ "add x23, x16, x4, LSL #1\n"
+ "and x20, x20, #0x3fffff\n"
+ "add x22, x5, x4, LSL #1\n"
+ "orr x12, x12, x20, LSL #22\n"
+ "add x21, x5, x17, LSL #1\n"
+ "orr x12, x12, x28, LSL #38\n"
+ "add x20, x16, x17, LSL #1\n"
+ "add x11, x8, x15, LSL #1\n"
+ "add x10, x14, x7, LSL #1\n"
+ "add x9, x14, x15, LSL #1\n"
+ "add x28, x13, x4, LSL #1\n"
+ ".inst 0xf8ac4b7a // rprfm pldonce, x12, [x27]\n"
+ "add x27, x8, x4, LSL #1\n"
+ ".inst 0xf8ac48ba // rprfm pldonce, x12, [x5]\n"
+ ".inst 0xf8ac4b5a // rprfm pldonce, x12, [x26]\n"
+ "add x26, x8, x17, LSL #1\n"
+ ".inst 0xf8ac49ba // rprfm pldonce, x12, [x13]\n"
+ ".inst 0xf8ac4b3a // rprfm pldonce, x12, [x25]\n"
+ "add x25, x13, x17, LSL #1\n"
+ ".inst 0xf8ac4b1a // rprfm pldonce, x12, [x24]\n"
+ "add x24, x14, x4, LSL #1\n"
+ ".inst 0xf8ac4afa // rprfm pldonce, x12, [x23]\n"
+ "add x23, x5, x7, LSL #1\n"
+ ".inst 0xf8ac4ada // rprfm pldonce, x12, [x22]\n"
+ "add x22, x14, x17, LSL #1\n"
+ ".inst 0xf8ac4aba // rprfm pldonce, x12, [x21]\n"
+ "add x21, x16, x15, LSL #1\n"
+ ".inst 0xf8ac4a9a // rprfm pldonce, x12, [x20]\n"
+ "add x20, x13, x7, LSL #1\n"
+ ".inst 0xf8ac491a // rprfm pldonce, x12, [x8]\n"
+ ".inst 0xf8ac497a // rprfm pldonce, x12, [x11]\n"
+ ".inst 0xf8ac49da // rprfm pldonce, x12, [x14]\n"
+ ".inst 0xf8ac495a // rprfm pldonce, x12, [x10]\n"
+ ".inst 0xf8ac493a // rprfm pldonce, x12, [x9]\n"
+ ".inst 0xf8ac4b9a // rprfm pldonce, x12, [x28]\n"
+ ".inst 0xf8ac4b7a // rprfm pldonce, x12, [x27]\n"
+ ".inst 0xf8ac4b5a // rprfm pldonce, x12, [x26]\n"
+ ".inst 0xf8ac4b3a // rprfm pldonce, x12, [x25]\n"
+ ".inst 0xf8ac4b1a // rprfm pldonce, x12, [x24]\n"
+ ".inst 0xf8ac4afa // rprfm pldonce, x12, [x23]\n"
+ ".inst 0xf8ac4ada // rprfm pldonce, x12, [x22]\n"
+ ".inst 0xf8ac4a1a // rprfm pldonce, x12, [x16]\n"
+ ".inst 0xf8ac4aba // rprfm pldonce, x12, [x21]\n"
+ ".inst 0xf8ac4a9a // rprfm pldonce, x12, [x20]\n"
+ "2:" // Tile loop: Prefetch input rows: End
+ "ldr x22, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
+ "mov x21, #0x3\n"
+ "ld1h { z18.h }, p3/Z, [x6]\n"
+ "addvl x6, x6, #1\n"
+ "ldr x27, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
+ "cnth x26\n"
+ ".inst 0xa040a0c0 // ld1h { z0.h-z3.h }, pn8.b/Z, [x6]\n"
+ "addvl x6, x6, #4\n"
+ "ldr x25, [%x[params_struct], %[offsetof_args_outptr]]\n"
+ "whilelt p2.h, XZR, %x[n_channels]\n"
+ ".inst 0xa040a0c4 // ld1h { z4.h-z7.h }, pn8.b/Z, [x6]\n"
+ "addvl x6, x6, #4\n"
+ "mul x20, x2, x22\n" // offset = tile_i * ld_output_row
+ "cmp x26, %x[n_channels]\n"
+ "ld1rh { z17.h }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+ "madd x20, x3, x27, x20\n" // offset += tile_j * ld_output_col
+ "add x24, x27, x27\n"
+ "ld1rh { z16.h }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+ "mul x20, x20, x21\n" // offset *= output_tile_size
+ "mov x21, #0x0\n"
+ "ld1h { z8.h }, p3/Z, [x6]\n"
+ "add x25, x25, x20, LSL #1\n" // outptrs[0] += offset * sizeof(__fp16)
+ "sub x20, XZR, x26\n"
+ "ld1h { z9.h }, p2/Z, [x16, x7, LSL #1]\n"
+ "add x23, x25, x22, LSL #1\n"
+ "ld1h { z10.h }, p2/Z, [x5]\n"
+ "addvl x6, x6, #1\n"
+ "add x22, x23, x22, LSL #1\n"
+ "ld1h { z11.h }, p2/Z, [x5, x15, LSL #1]\n"
+ "ld1h { z12.h }, p2/Z, [x13]\n"
+ "ld1h { z13.h }, p2/Z, [x8, x7, LSL #1]\n"
+ "bge 4f\n"
+ "3:" // Tile loop: Channel loop
+ "movprfx z24, z18\n fmla z24.h, p3/M, z7.h, z9.h\n"
+ "movprfx z23, z18\n fmla z23.h, p3/M, z8.h, z9.h\n"
+ "whilelt p1.h, x26, %x[n_channels]\n"
+ "inch x21\n"
+ "movprfx z25, z18\n fmla z25.h, p3/M, z6.h, z9.h\n"
+ "movprfx z26, z18\n fmla z26.h, p3/M, z5.h, z9.h\n"
+ "inch x26\n"
+ "mov p0.b, p2.b\n"
+ "movprfx z27, z18\n fmla z27.h, p3/M, z4.h, z9.h\n"
+ "movprfx z28, z18\n fmla z28.h, p3/M, z3.h, z9.h\n"
+ "inch x20\n"
+ "movprfx z29, z18\n fmla z29.h, p3/M, z2.h, z9.h\n"
+ "movprfx z31, z18\n fmla z31.h, p3/M, z0.h, z9.h\n"
+ "fmla z24.h, p3/M, z4.h, z13.h\n"
+ "fmla z23.h, p3/M, z0.h, z10.h\n"
+ "ld1h { z10.h }, p2/Z, [x16, x17, LSL #1]\n"
+ "fmla z25.h, p3/M, z2.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x16, x4, LSL #1]\n"
+ "fmla z26.h, p3/M, z2.h, z13.h\n"
+ "fmla z27.h, p3/M, z1.h, z13.h\n"
+ "fmla z28.h, p3/M, z0.h, z13.h\n"
+ "fmla z29.h, p3/M, z6.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x13, x15, LSL #1]\n"
+ "movprfx z30, z18\n fmla z30.h, p3/M, z1.h, z9.h\n"
+ "fmla z24.h, p3/M, z6.h, z11.h\n"
+ "fmla z23.h, p3/M, z5.h, z13.h\n"
+ "ld1h { z18.h }, p3/Z, [x6]\n"
+ "addvl x6, x6, #1\n"
+ "fmla z25.h, p3/M, z3.h, z13.h\n"
+ "ld1h { z13.h }, p2/Z, [x5, x4, LSL #1]\n"
+ "fmla z26.h, p3/M, z4.h, z11.h\n"
+ "fmla z31.h, p3/M, z8.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x5, x17, LSL #1]\n"
+ "fmla z27.h, p3/M, z3.h, z11.h\n"
+ "fmla z30.h, p3/M, z0.h, z11.h\n"
+ "fmla z29.h, p3/M, z1.h, z11.h\n"
+ "fmla z24.h, p3/M, z0.h, z13.h\n"
+ "fmla z23.h, p3/M, z7.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x8]\n"
+ "fmla z25.h, p3/M, z1.h, z12.h\n"
+ "fmla z28.h, p3/M, z4.h, z10.h\n"
+ "fmla z31.h, p3/M, z1.h, z10.h\n"
+ "fmla z27.h, p3/M, z5.h, z10.h\n"
+ "fmla z30.h, p3/M, z2.h, z10.h\n"
+ "fmla z26.h, p3/M, z0.h, z11.h\n"
+ "fmla z24.h, p3/M, z2.h, z12.h\n"
+ "fmla z23.h, p3/M, z1.h, z13.h\n"
+ "ld1h { z13.h }, p2/Z, [x8, x15, LSL #1]\n"
+ "ld1h { z12.h }, p2/Z, [x14]\n"
+ "fmla z25.h, p3/M, z7.h, z10.h\n"
+ "fmla z28.h, p3/M, z2.h, z13.h\n"
+ "fmla z24.h, p3/M, z8.h, z10.h\n"
+ "ld1h { z10.h }, p2/Z, [x14, x7, LSL #1]\n"
+ "fmla z29.h, p3/M, z3.h, z12.h\n"
+ "fmla z23.h, p3/M, z3.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x14, x15, LSL #1]\n"
+ "fmla z26.h, p3/M, z6.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x8, x4, LSL #1]\n"
+ "fmla z30.h, p3/M, z4.h, z10.h\n"
+ "fmla z31.h, p3/M, z3.h, z10.h\n"
+ "fmla z25.h, p3/M, z5.h, z13.h\n"
+ "fmla z27.h, p3/M, z7.h, z10.h\n"
+ "ld1h { z13.h }, p2/Z, [x13, x4, LSL #1]\n"
+ "fmla z28.h, p3/M, z6.h, z10.h\n"
+ "fmla z29.h, p3/M, z5.h, z10.h\n"
+ "fmla z26.h, p3/M, z8.h, z10.h\n"
+ "fmla z24.h, p3/M, z3.h, z12.h\n"
+ "fmla z31.h, p3/M, z5.h, z11.h\n"
+ "fmla z30.h, p3/M, z6.h, z13.h\n"
+ "fmla z27.h, p3/M, z0.h, z12.h\n"
+ "fmla z23.h, p3/M, z4.h, z12.h\n"
+ "fmla z28.h, p3/M, z8.h, z11.h\n"
+ "fmla z29.h, p3/M, z7.h, z13.h\n"
+ "ld1h { z13.h }, p2/Z, [x13, x17, LSL #1]\n"
+ "ld1h { z11.h }, p2/Z, [x8, x17, LSL #1]\n"
+ "fmla z26.h, p3/M, z1.h, z12.h\n"
+ "addvl x8, x8, #1\n"
+ "ld1h { z12.h }, p2/Z, [x14, x4, LSL #1]\n"
+ "fmla z30.h, p3/M, z8.h, z13.h\n"
+ "fmla z31.h, p3/M, z7.h, z13.h\n"
+ "ld1h { z13.h }, p2/Z, [x14, x17, LSL #1]\n"
+ "addvl x14, x14, #1\n"
+ "fmla z24.h, p3/M, z5.h, z11.h\n"
+ "fmla z25.h, p3/M, z4.h, z11.h\n"
+ "fmla z27.h, p3/M, z2.h, z11.h\n"
+ "fmla z28.h, p3/M, z1.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x5, x7, LSL #1]\n"
+ "addvl x5, x5, #1\n"
+ "fmla z29.h, p3/M, z4.h, z12.h\n"
+ "fmla z26.h, p3/M, z7.h, z12.h\n"
+ "ld1h { z10.h }, p1/Z, [x5]\n"
+ "fmla z30.h, p3/M, z3.h, z12.h\n"
+ "fmla z31.h, p3/M, z4.h, z13.h\n"
+ "fmla z23.h, p3/M, z2.h, z11.h\n"
+ "fmla z24.h, p3/M, z1.h, z11.h\n"
+ "fmla z27.h, p3/M, z6.h, z12.h\n"
+ "fmla z25.h, p3/M, z0.h, z11.h\n"
+ "ld1h { z12.h }, p2/Z, [x16]\n"
+ "ld1h { z11.h }, p2/Z, [x16, x15, LSL #1]\n"
+ "fmla z28.h, p3/M, z7.h, z13.h\n"
+ "addvl x16, x16, #1\n"
+ "fmla z30.h, p3/M, z5.h, z13.h\n"
+ "ld1h { z9.h }, p1/Z, [x16, x7, LSL #1]\n"
+ "fmla z23.h, p3/M, z6.h, z12.h\n"
+ "fmla z29.h, p3/M, z0.h, z12.h\n"
+ "fmla z26.h, p3/M, z3.h, z12.h\n"
+ "fmla z31.h, p3/M, z2.h, z11.h\n"
+ ".inst 0xa040a0c0 // ld1h { z0.h-z3.h }, pn8.b/Z, [x6]\n"
+ "addvl x6, x6, #4\n"
+ "fmla z27.h, p3/M, z8.h, z13.h\n"
+ "ld1h { z13.h }, p2/Z, [x13, x7, LSL #1]\n"
+ "fmla z25.h, p3/M, z8.h, z11.h\n"
+ "whilelt p2.h, x21, %x[n_channels]\n"
+ "fmla z28.h, p3/M, z5.h, z11.h\n"
+ "addvl x13, x13, #1\n"
+ "cmp x26, %x[n_channels]\n"
+ "ld1h { z11.h }, p1/Z, [x5, x15, LSL #1]\n"
+ "fmax z23.h, p3/M, z23.h, z17.h\n"
+ "ld1h { z12.h }, p1/Z, [x13]\n"
+ "fmla z29.h, p3/M, z8.h, z13.h\n"
+ "fmla z30.h, p3/M, z7.h, z13.h\n"
+ "fmla z31.h, p3/M, z6.h, z13.h\n"
+ ".inst 0xa040a0c4 // ld1h { z4.h-z7.h }, pn8.b/Z, [x6]\n"
+ "addvl x6, x6, #4\n"
+ ".inst 0xc170ca38 // fclamp { z24.h-z27.h }, z17.h, z16.h\n"
+ "ld1h { z13.h }, p1/Z, [x8, x7, LSL #1]\n"
+ "ld1h { z8.h }, p3/Z, [x6]\n"
+ "addvl x6, x6, #1\n"
+ "fmin z23.h, p3/M, z23.h, z16.h\n"
+ ".inst 0xc170ca3c // fclamp { z28.h-z31.h }, z17.h, z16.h\n"
+ "st1h { z26.h }, p0, [x23]\n"
+ "st1h { z27.h }, p0, [x23, x27, LSL #1]\n"
+ "st1h { z23.h }, p0, [x25]\n"
+ "st1h { z24.h }, p0, [x25, x27, LSL #1]\n"
+ "st1h { z25.h }, p0, [x25, x24, LSL #1]\n"
+ "addvl x25, x25, #1\n"
+ "st1h { z28.h }, p0, [x23, x24, LSL #1]\n"
+ "addvl x23, x23, #1\n"
+ "st1h { z29.h }, p0, [x22]\n"
+ "st1h { z30.h }, p0, [x22, x27, LSL #1]\n"
+ "st1h { z31.h }, p0, [x22, x24, LSL #1]\n"
+ "addvl x22, x22, #1\n"
+ "blt 3b\n"
+ "4:" // Tile loop: Channel tail
+ "movprfx z24, z18\n fmla z24.h, p3/M, z7.h, z9.h\n"
+ "movprfx z23, z18\n fmla z23.h, p3/M, z8.h, z9.h\n"
+ "ldr x3, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "mov p0.b, p2.b\n"
+ "movprfx z25, z18\n fmla z25.h, p3/M, z6.h, z9.h\n"
+ "movprfx z26, z18\n fmla z26.h, p3/M, z5.h, z9.h\n"
+ "ldr x2, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "movprfx z27, z18\n fmla z27.h, p3/M, z4.h, z9.h\n"
+ "movprfx z28, z18\n fmla z28.h, p3/M, z3.h, z9.h\n"
+ "ldr x9, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+ "movprfx z29, z18\n fmla z29.h, p3/M, z2.h, z9.h\n"
+ "movprfx z31, z18\n fmla z31.h, p3/M, z0.h, z9.h\n"
+ "ldr x21, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
+ "add x3, x3, #0x1\n"
+ "fmla z24.h, p3/M, z4.h, z13.h\n"
+ "fmla z23.h, p3/M, z0.h, z10.h\n"
+ "ld1h { z10.h }, p2/Z, [x16, x17, LSL #1]\n"
+ "add x20, x2, #0x1\n"
+ "fmla z25.h, p3/M, z2.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x16, x4, LSL #1]\n"
+ "fmla z26.h, p3/M, z2.h, z13.h\n"
+ "cmp x3, x9\n"
+ "fmla z27.h, p3/M, z1.h, z13.h\n"
+ "fmla z28.h, p3/M, z0.h, z13.h\n"
+ "csel x2, x2, x20, LT\n"
+ "csel x3, x3, XZR, LT\n"
+ "fmla z29.h, p3/M, z6.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x13, x15, LSL #1]\n"
+ "movprfx z30, z18\n fmla z30.h, p3/M, z1.h, z9.h\n"
+ "cmp x2, x21\n"
+ "fmla z24.h, p3/M, z6.h, z11.h\n"
+ "fmla z23.h, p3/M, z5.h, z13.h\n"
+ "fmla z25.h, p3/M, z3.h, z13.h\n"
+ "ld1h { z13.h }, p2/Z, [x5, x4, LSL #1]\n"
+ "fmla z26.h, p3/M, z4.h, z11.h\n"
+ "fmla z31.h, p3/M, z8.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x5, x17, LSL #1]\n"
+ "fmla z27.h, p3/M, z3.h, z11.h\n"
+ "fmla z30.h, p3/M, z0.h, z11.h\n"
+ "fmla z29.h, p3/M, z1.h, z11.h\n"
+ "fmla z24.h, p3/M, z0.h, z13.h\n"
+ "fmla z23.h, p3/M, z7.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x8]\n"
+ "fmla z25.h, p3/M, z1.h, z12.h\n"
+ "fmla z28.h, p3/M, z4.h, z10.h\n"
+ "fmla z31.h, p3/M, z1.h, z10.h\n"
+ "fmla z27.h, p3/M, z5.h, z10.h\n"
+ "fmla z30.h, p3/M, z2.h, z10.h\n"
+ "fmla z26.h, p3/M, z0.h, z11.h\n"
+ "fmla z24.h, p3/M, z2.h, z12.h\n"
+ "fmla z23.h, p3/M, z1.h, z13.h\n"
+ "ld1h { z13.h }, p2/Z, [x8, x15, LSL #1]\n"
+ "ld1h { z12.h }, p2/Z, [x14]\n"
+ "fmla z25.h, p3/M, z7.h, z10.h\n"
+ "fmla z28.h, p3/M, z2.h, z13.h\n"
+ "fmla z24.h, p3/M, z8.h, z10.h\n"
+ "ld1h { z10.h }, p2/Z, [x14, x7, LSL #1]\n"
+ "fmla z29.h, p3/M, z3.h, z12.h\n"
+ "fmla z23.h, p3/M, z3.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x14, x15, LSL #1]\n"
+ "fmla z26.h, p3/M, z6.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x8, x4, LSL #1]\n"
+ "fmla z30.h, p3/M, z4.h, z10.h\n"
+ "fmla z31.h, p3/M, z3.h, z10.h\n"
+ "fmla z25.h, p3/M, z5.h, z13.h\n"
+ "fmla z27.h, p3/M, z7.h, z10.h\n"
+ "ld1h { z13.h }, p2/Z, [x13, x4, LSL #1]\n"
+ "fmla z28.h, p3/M, z6.h, z10.h\n"
+ "fmla z29.h, p3/M, z5.h, z10.h\n"
+ "fmla z26.h, p3/M, z8.h, z10.h\n"
+ "fmla z24.h, p3/M, z3.h, z12.h\n"
+ "fmla z31.h, p3/M, z5.h, z11.h\n"
+ "fmla z30.h, p3/M, z6.h, z13.h\n"
+ "fmla z27.h, p3/M, z0.h, z12.h\n"
+ "fmla z23.h, p3/M, z4.h, z12.h\n"
+ "fmla z28.h, p3/M, z8.h, z11.h\n"
+ "fmla z29.h, p3/M, z7.h, z13.h\n"
+ "ld1h { z13.h }, p2/Z, [x13, x17, LSL #1]\n"
+ "ld1h { z11.h }, p2/Z, [x8, x17, LSL #1]\n"
+ "fmla z26.h, p3/M, z1.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x14, x4, LSL #1]\n"
+ "fmla z30.h, p3/M, z8.h, z13.h\n"
+ "fmla z31.h, p3/M, z7.h, z13.h\n"
+ "ld1h { z13.h }, p2/Z, [x14, x17, LSL #1]\n"
+ "fmla z24.h, p3/M, z5.h, z11.h\n"
+ "fmla z25.h, p3/M, z4.h, z11.h\n"
+ "fmla z27.h, p3/M, z2.h, z11.h\n"
+ "fmla z28.h, p3/M, z1.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x5, x7, LSL #1]\n"
+ "fmla z29.h, p3/M, z4.h, z12.h\n"
+ "fmla z26.h, p3/M, z7.h, z12.h\n"
+ "fmla z30.h, p3/M, z3.h, z12.h\n"
+ "fmla z31.h, p3/M, z4.h, z13.h\n"
+ "fmla z23.h, p3/M, z2.h, z11.h\n"
+ "fmla z24.h, p3/M, z1.h, z11.h\n"
+ "fmla z27.h, p3/M, z6.h, z12.h\n"
+ "fmla z25.h, p3/M, z0.h, z11.h\n"
+ "ld1h { z12.h }, p2/Z, [x16]\n"
+ "ld1h { z11.h }, p2/Z, [x16, x15, LSL #1]\n"
+ "fmla z28.h, p3/M, z7.h, z13.h\n"
+ "fmla z30.h, p3/M, z5.h, z13.h\n"
+ "fmla z23.h, p3/M, z6.h, z12.h\n"
+ "fmla z29.h, p3/M, z0.h, z12.h\n"
+ "fmla z26.h, p3/M, z3.h, z12.h\n"
+ "fmla z31.h, p3/M, z2.h, z11.h\n"
+ "fmla z27.h, p3/M, z8.h, z13.h\n"
+ "ld1h { z13.h }, p2/Z, [x13, x7, LSL #1]\n"
+ "fmla z25.h, p3/M, z8.h, z11.h\n"
+ "fmla z28.h, p3/M, z5.h, z11.h\n"
+ "fmax z23.h, p3/M, z23.h, z17.h\n"
+ "fmla z29.h, p3/M, z8.h, z13.h\n"
+ "fmla z30.h, p3/M, z7.h, z13.h\n"
+ "fmla z31.h, p3/M, z6.h, z13.h\n"
+ ".inst 0xc170ca38 // fclamp { z24.h-z27.h }, z17.h, z16.h\n"
+ "fmin z23.h, p3/M, z23.h, z16.h\n"
+ ".inst 0xc170ca3c // fclamp { z28.h-z31.h }, z17.h, z16.h\n"
+ "st1h { z26.h }, p0, [x23]\n"
+ "st1h { z27.h }, p0, [x23, x27, LSL #1]\n"
+ "st1h { z23.h }, p0, [x25]\n"
+ "st1h { z24.h }, p0, [x25, x27, LSL #1]\n"
+ "st1h { z25.h }, p0, [x25, x24, LSL #1]\n"
+ "st1h { z28.h }, p0, [x23, x24, LSL #1]\n"
+ "st1h { z29.h }, p0, [x22]\n"
+ "st1h { z30.h }, p0, [x22, x27, LSL #1]\n"
+ "st1h { z31.h }, p0, [x22, x24, LSL #1]\n"
+ "blt 1b\n"
+ ".inst 0xd503467f // SMSTOP\n"
+ :
+ : [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME2)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp
new file mode 100644
index 0000000000..e85cb9e017
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp
@@ -0,0 +1,444 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(ARM_COMPUTE_ENABLE_SME2)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sme2_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(
+ const __fp16 *const *const input_ptrs,
+ __fp16 *const *const outptrs,
+ const void *params,
+ unsigned int n_channels,
+ const __fp16 activation_min,
+ const __fp16 activation_max
+)
+{
+ struct Args
+ {
+ __fp16 *const *outptrs;
+ const void *params;
+ const __fp16 min, max;
+ const __fp16 *inptrs[25];
+
+ Args(
+ const __fp16 *const *const input_ptrs,
+ __fp16 *const *const outptrs,
+ const void *const params,
+ const __fp16 min,
+ const __fp16 max
+ ) : outptrs(outptrs), params(params), min(min), max(max)
+ {
+ inptrs[0] = input_ptrs[12];
+ inptrs[1] = input_ptrs[0];
+ inptrs[2] = input_ptrs[4];
+ inptrs[3] = input_ptrs[20];
+ inptrs[4] = input_ptrs[7];
+ inptrs[5] = input_ptrs[24];
+ inptrs[6] = input_ptrs[11];
+ inptrs[7] = input_ptrs[1];
+ inptrs[8] = input_ptrs[3];
+ inptrs[9] = input_ptrs[13];
+ inptrs[10] = input_ptrs[5];
+ inptrs[11] = input_ptrs[9];
+ inptrs[12] = input_ptrs[15];
+ inptrs[13] = input_ptrs[17];
+ inptrs[14] = input_ptrs[19];
+ inptrs[15] = input_ptrs[21];
+ inptrs[16] = input_ptrs[6];
+ inptrs[17] = input_ptrs[8];
+ inptrs[18] = input_ptrs[23];
+ inptrs[19] = input_ptrs[16];
+ inptrs[20] = input_ptrs[2];
+ inptrs[21] = input_ptrs[18];
+ inptrs[22] = input_ptrs[10];
+ inptrs[23] = input_ptrs[14];
+ inptrs[24] = input_ptrs[22];
+
+ }
+ };
+
+ Args params_struct(input_ptrs, outptrs, params,
+ activation_min, activation_max);
+
+ __asm__ __volatile__(
+ "ldr x17, [%x[params_struct], %[offsetof_args_params]]\n"
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "add x16, %x[params_struct], %[offsetof_Args_inptrs]\n"
+ "mov x15, #0x0\n"
+ "ptrue p3.b\n"
+ ".inst 0x25207810 // ptrue pn8.b\n"
+ "ldp x14, x13, [x16, #0x0]\n"
+ "ldp x12, x11, [x16, #0x10]\n"
+ "cnth x10\n"
+ "whilelt p2.h, XZR, %x[n_channels]\n"
+ "ld1rh { z18.h }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+ "ld1h { z17.h }, p3/Z, [x17]\n"
+ "addvl x17, x17, #1\n"
+ "ldr x9, [x16, #0x20]\n"
+ "cmp x10, %x[n_channels]\n"
+ ".inst 0xa040a220 // ld1h { z0.h-z3.h }, pn8.b/Z, [x17]\n"
+ "addvl x17, x17, #4\n"
+ "ldr x28, [%x[params_struct], %[offsetof_args_outptrs]]\n"
+ "sub x27, XZR, x10\n"
+ ".inst 0xa040a224 // ld1h { z4.h-z7.h }, pn8.b/Z, [x17]\n"
+ "addvl x17, x17, #4\n"
+ "ld1rh { z16.h }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+ "ld1h { z8.h }, p3/Z, [x17]\n"
+ "addvl x17, x17, #1\n"
+ "ld1h { z9.h }, p2/Z, [x14, x15, LSL #1]\n"
+ "ld1h { z10.h }, p2/Z, [x13, x15, LSL #1]\n"
+ "ld1h { z11.h }, p2/Z, [x12, x15, LSL #1]\n"
+ "ld1h { z12.h }, p2/Z, [x11, x15, LSL #1]\n"
+ "ld1h { z13.h }, p2/Z, [x9, x15, LSL #1]\n"
+ "bge 2f\n"
+ "1:" // Channel loop
+ "movprfx z23, z17\n fmla z23.h, p3/M, z8.h, z9.h\n"
+ "movprfx z24, z17\n fmla z24.h, p3/M, z7.h, z9.h\n"
+ "ldr x26, [x16, #0x30]\n"
+ "inch x27\n"
+ "movprfx z25, z17\n fmla z25.h, p3/M, z6.h, z9.h\n"
+ "movprfx z26, z17\n fmla z26.h, p3/M, z5.h, z9.h\n"
+ "ldr x25, [x16, #0x38]\n"
+ "mov p1.b, p2.b\n"
+ "movprfx z27, z17\n fmla z27.h, p3/M, z4.h, z9.h\n"
+ "movprfx z28, z17\n fmla z28.h, p3/M, z3.h, z9.h\n"
+ "ldr x24, [x16, #0x28]\n"
+ "whilelt p0.h, x10, %x[n_channels]\n"
+ "movprfx z29, z17\n fmla z29.h, p3/M, z2.h, z9.h\n"
+ "movprfx z31, z17\n fmla z31.h, p3/M, z0.h, z9.h\n"
+ "ldr x13, [x16, #0x48]\n"
+ "fmla z23.h, p3/M, z0.h, z10.h\n"
+ "fmla z24.h, p3/M, z4.h, z13.h\n"
+ "ldr x14, [x16, #0x40]\n"
+ "fmla z25.h, p3/M, z2.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x26, x15, LSL #1]\n"
+ "fmla z26.h, p3/M, z2.h, z13.h\n"
+ "ldr x12, [x16, #0x50]\n"
+ "fmla z27.h, p3/M, z1.h, z13.h\n"
+ "fmla z28.h, p3/M, z0.h, z13.h\n"
+ "ld1h { z10.h }, p2/Z, [x13, x15, LSL #1]\n"
+ "ldr x11, [x16, #0x58]\n"
+ "fmla z29.h, p3/M, z6.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x24, x15, LSL #1]\n"
+ "movprfx z30, z17\n fmla z30.h, p3/M, z1.h, z9.h\n"
+ "ldr x9, [x16, #0x60]\n"
+ "fmla z23.h, p3/M, z5.h, z13.h\n"
+ "fmla z24.h, p3/M, z6.h, z11.h\n"
+ "ldr x24, [x16, #0x68]\n"
+ "ld1h { z17.h }, p3/Z, [x17]\n"
+ "fmla z25.h, p3/M, z3.h, z13.h\n"
+ "ld1h { z13.h }, p2/Z, [x25, x15, LSL #1]\n"
+ "fmla z26.h, p3/M, z4.h, z11.h\n"
+ "ldr x26, [x16, #0x70]\n"
+ "fmla z31.h, p3/M, z8.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x14, x15, LSL #1]\n"
+ "fmla z27.h, p3/M, z3.h, z11.h\n"
+ "ldr x25, [x16, #0x78]\n"
+ "fmla z30.h, p3/M, z0.h, z11.h\n"
+ "fmla z28.h, p3/M, z4.h, z10.h\n"
+ "ldr x14, [x16, #0x80]\n"
+ "addvl x17, x17, #1\n"
+ "fmla z23.h, p3/M, z7.h, z11.h\n"
+ "fmla z24.h, p3/M, z0.h, z13.h\n"
+ "ldr x13, [x16, #0x88]\n"
+ "fmla z29.h, p3/M, z1.h, z11.h\n"
+ "fmla z25.h, p3/M, z1.h, z12.h\n"
+ "ld1h { z11.h }, p2/Z, [x12, x15, LSL #1]\n"
+ "ldr x12, [x16, #0x90]\n"
+ "fmla z27.h, p3/M, z5.h, z10.h\n"
+ "fmla z31.h, p3/M, z1.h, z10.h\n"
+ "ldr x23, [x28, #0x0]\n"
+ "fmla z30.h, p3/M, z2.h, z10.h\n"
+ "ldr x22, [x28, #0x8]\n"
+ "fmla z23.h, p3/M, z1.h, z13.h\n"
+ "ld1h { z13.h }, p2/Z, [x11, x15, LSL #1]\n"
+ "fmla z24.h, p3/M, z2.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x9, x15, LSL #1]\n"
+ "fmla z26.h, p3/M, z0.h, z11.h\n"
+ "ldr x9, [x16, #0xa0]\n"
+ "fmla z25.h, p3/M, z7.h, z10.h\n"
+ "ldr x11, [x16, #0x98]\n"
+ "fmla z28.h, p3/M, z2.h, z13.h\n"
+ "ldr x21, [x28, #0x10]\n"
+ "fmla z29.h, p3/M, z3.h, z12.h\n"
+ "fmla z23.h, p3/M, z3.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x26, x15, LSL #1]\n"
+ "ldr x26, [x16, #0xb0]\n"
+ "fmla z24.h, p3/M, z8.h, z10.h\n"
+ "ld1h { z10.h }, p2/Z, [x24, x15, LSL #1]\n"
+ "fmla z26.h, p3/M, z6.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x14, x15, LSL #1]\n"
+ "ldr x24, [x16, #0xa8]\n"
+ "fmla z25.h, p3/M, z5.h, z13.h\n"
+ "ld1h { z13.h }, p2/Z, [x25, x15, LSL #1]\n"
+ "ldr x25, [x16, #0xb8]\n"
+ "fmla z27.h, p3/M, z7.h, z10.h\n"
+ "fmla z28.h, p3/M, z6.h, z10.h\n"
+ "fmla z30.h, p3/M, z4.h, z10.h\n"
+ "fmla z29.h, p3/M, z5.h, z10.h\n"
+ "ldr x14, [x16, #0xc0]\n"
+ "fmla z31.h, p3/M, z3.h, z10.h\n"
+ "fmla z26.h, p3/M, z8.h, z10.h\n"
+ "ldr x20, [x28, #0x18]\n"
+ "fmla z24.h, p3/M, z3.h, z12.h\n"
+ "fmla z23.h, p3/M, z4.h, z12.h\n"
+ "fmla z28.h, p3/M, z8.h, z11.h\n"
+ "fmla z27.h, p3/M, z0.h, z12.h\n"
+ "fmla z30.h, p3/M, z6.h, z13.h\n"
+ "fmla z29.h, p3/M, z7.h, z13.h\n"
+ "ld1h { z13.h }, p2/Z, [x12, x15, LSL #1]\n"
+ "fmla z31.h, p3/M, z5.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x13, x15, LSL #1]\n"
+ "fmla z26.h, p3/M, z1.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x11, x15, LSL #1]\n"
+ "fmla z24.h, p3/M, z5.h, z11.h\n"
+ "fmla z25.h, p3/M, z4.h, z11.h\n"
+ "fmla z27.h, p3/M, z2.h, z11.h\n"
+ "fmla z28.h, p3/M, z1.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x9, x15, LSL #1]\n"
+ "ldr x9, [x16, #0x20]\n"
+ "fmla z30.h, p3/M, z8.h, z13.h\n"
+ "fmla z26.h, p3/M, z7.h, z12.h\n"
+ "fmla z29.h, p3/M, z4.h, z12.h\n"
+ "fmla z31.h, p3/M, z7.h, z13.h\n"
+ "ld1h { z13.h }, p2/Z, [x24, x15, LSL #1]\n"
+ "fmla z23.h, p3/M, z2.h, z11.h\n"
+ "fmla z24.h, p3/M, z1.h, z11.h\n"
+ "fmla z27.h, p3/M, z6.h, z12.h\n"
+ "fmla z25.h, p3/M, z0.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x25, x15, LSL #1]\n"
+ "fmla z30.h, p3/M, z3.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x26, x15, LSL #1]\n"
+ "fmla z28.h, p3/M, z7.h, z13.h\n"
+ "fmla z31.h, p3/M, z4.h, z13.h\n"
+ "fmla z23.h, p3/M, z6.h, z12.h\n"
+ "fmla z29.h, p3/M, z0.h, z12.h\n"
+ "fmla z27.h, p3/M, z8.h, z13.h\n"
+ "fmla z26.h, p3/M, z3.h, z12.h\n"
+ "fmla z25.h, p3/M, z8.h, z11.h\n"
+ "fmla z30.h, p3/M, z5.h, z13.h\n"
+ "ld1h { z13.h }, p2/Z, [x14, x15, LSL #1]\n"
+ "ldp x14, x13, [x16, #0x0]\n"
+ "fmla z28.h, p3/M, z5.h, z11.h\n"
+ "fmax z23.h, p3/M, z23.h, z18.h\n"
+ "fmla z31.h, p3/M, z2.h, z11.h\n"
+ "ldp x12, x11, [x16, #0x10]\n"
+ "inch x15\n"
+ ".inst 0xa040a220 // ld1h { z0.h-z3.h }, pn8.b/Z, [x17]\n"
+ "addvl x17, x17, #4\n"
+ "whilelt p2.h, x15, %x[n_channels]\n"
+ "fmla z29.h, p3/M, z8.h, z13.h\n"
+ ".inst 0xc170ca58 // fclamp { z24.h-z27.h }, z18.h, z16.h\n"
+ "ld1h { z9.h }, p0/Z, [x14, x10, LSL #1]\n"
+ "fmla z30.h, p3/M, z7.h, z13.h\n"
+ "ld1h { z10.h }, p0/Z, [x13, x10, LSL #1]\n"
+ "fmin z23.h, p3/M, z23.h, z16.h\n"
+ "fmla z31.h, p3/M, z6.h, z13.h\n"
+ "ld1h { z11.h }, p0/Z, [x12, x10, LSL #1]\n"
+ "ld1h { z12.h }, p0/Z, [x11, x10, LSL #1]\n"
+ "st1h { z24.h }, p1, [x22, x27, LSL #1]\n"
+ "ldr x22, [x28, #0x28]\n"
+ "st1h { z25.h }, p1, [x21, x27, LSL #1]\n"
+ "ldr x21, [x28, #0x30]\n"
+ "ld1h { z13.h }, p0/Z, [x9, x10, LSL #1]\n"
+ "inch x10\n"
+ "st1h { z23.h }, p1, [x23, x27, LSL #1]\n"
+ "ldr x23, [x28, #0x20]\n"
+ ".inst 0xa040a224 // ld1h { z4.h-z7.h }, pn8.b/Z, [x17]\n"
+ "addvl x17, x17, #4\n"
+ "st1h { z26.h }, p1, [x20, x27, LSL #1]\n"
+ "ldr x20, [x28, #0x38]\n"
+ "cmp x10, %x[n_channels]\n"
+ ".inst 0xc170ca5c // fclamp { z28.h-z31.h }, z18.h, z16.h\n"
+ "ld1h { z8.h }, p3/Z, [x17]\n"
+ "addvl x17, x17, #1\n"
+ "st1h { z27.h }, p1, [x23, x27, LSL #1]\n"
+ "ldr x23, [x28, #0x40]\n"
+ "st1h { z28.h }, p1, [x22, x27, LSL #1]\n"
+ "st1h { z29.h }, p1, [x21, x27, LSL #1]\n"
+ "st1h { z30.h }, p1, [x20, x27, LSL #1]\n"
+ "st1h { z31.h }, p1, [x23, x27, LSL #1]\n"
+ "blt 1b\n"
+ "2:" // Channel tail
+ "movprfx z23, z17\n fmla z23.h, p3/M, z8.h, z9.h\n"
+ "movprfx z24, z17\n fmla z24.h, p3/M, z7.h, z9.h\n"
+ "ldr x26, [x16, #0x30]\n"
+ "inch x27\n"
+ "movprfx z25, z17\n fmla z25.h, p3/M, z6.h, z9.h\n"
+ "movprfx z26, z17\n fmla z26.h, p3/M, z5.h, z9.h\n"
+ "ldr x25, [x16, #0x38]\n"
+ "mov p1.b, p2.b\n"
+ "movprfx z27, z17\n fmla z27.h, p3/M, z4.h, z9.h\n"
+ "movprfx z28, z17\n fmla z28.h, p3/M, z3.h, z9.h\n"
+ "ldr x24, [x16, #0x28]\n"
+ "movprfx z29, z17\n fmla z29.h, p3/M, z2.h, z9.h\n"
+ "movprfx z31, z17\n fmla z31.h, p3/M, z0.h, z9.h\n"
+ "ldr x13, [x16, #0x48]\n"
+ "fmla z23.h, p3/M, z0.h, z10.h\n"
+ "fmla z24.h, p3/M, z4.h, z13.h\n"
+ "ldr x14, [x16, #0x40]\n"
+ "fmla z25.h, p3/M, z2.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x26, x15, LSL #1]\n"
+ "fmla z26.h, p3/M, z2.h, z13.h\n"
+ "ldr x12, [x16, #0x50]\n"
+ "fmla z27.h, p3/M, z1.h, z13.h\n"
+ "fmla z28.h, p3/M, z0.h, z13.h\n"
+ "ld1h { z10.h }, p2/Z, [x13, x15, LSL #1]\n"
+ "ldr x11, [x16, #0x58]\n"
+ "fmla z29.h, p3/M, z6.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x24, x15, LSL #1]\n"
+ "movprfx z30, z17\n fmla z30.h, p3/M, z1.h, z9.h\n"
+ "ldr x9, [x16, #0x60]\n"
+ "fmla z23.h, p3/M, z5.h, z13.h\n"
+ "fmla z24.h, p3/M, z6.h, z11.h\n"
+ "ldr x24, [x16, #0x68]\n"
+ "fmla z25.h, p3/M, z3.h, z13.h\n"
+ "ld1h { z13.h }, p2/Z, [x25, x15, LSL #1]\n"
+ "fmla z26.h, p3/M, z4.h, z11.h\n"
+ "ldr x26, [x16, #0x70]\n"
+ "fmla z31.h, p3/M, z8.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x14, x15, LSL #1]\n"
+ "fmla z27.h, p3/M, z3.h, z11.h\n"
+ "ldr x25, [x16, #0x78]\n"
+ "fmla z30.h, p3/M, z0.h, z11.h\n"
+ "fmla z28.h, p3/M, z4.h, z10.h\n"
+ "ldr x14, [x16, #0x80]\n"
+ "fmla z23.h, p3/M, z7.h, z11.h\n"
+ "fmla z24.h, p3/M, z0.h, z13.h\n"
+ "ldr x13, [x16, #0x88]\n"
+ "fmla z29.h, p3/M, z1.h, z11.h\n"
+ "fmla z25.h, p3/M, z1.h, z12.h\n"
+ "ld1h { z11.h }, p2/Z, [x12, x15, LSL #1]\n"
+ "ldr x12, [x16, #0x90]\n"
+ "fmla z27.h, p3/M, z5.h, z10.h\n"
+ "fmla z31.h, p3/M, z1.h, z10.h\n"
+ "ldr x23, [x28, #0x0]\n"
+ "fmla z30.h, p3/M, z2.h, z10.h\n"
+ "ldr x22, [x28, #0x8]\n"
+ "fmla z23.h, p3/M, z1.h, z13.h\n"
+ "ld1h { z13.h }, p2/Z, [x11, x15, LSL #1]\n"
+ "fmla z24.h, p3/M, z2.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x9, x15, LSL #1]\n"
+ "fmla z26.h, p3/M, z0.h, z11.h\n"
+ "ldr x9, [x16, #0xa0]\n"
+ "fmla z25.h, p3/M, z7.h, z10.h\n"
+ "ldr x11, [x16, #0x98]\n"
+ "fmla z28.h, p3/M, z2.h, z13.h\n"
+ "ldr x21, [x28, #0x10]\n"
+ "fmla z29.h, p3/M, z3.h, z12.h\n"
+ "fmla z23.h, p3/M, z3.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x26, x15, LSL #1]\n"
+ "ldr x26, [x16, #0xb0]\n"
+ "fmla z24.h, p3/M, z8.h, z10.h\n"
+ "ld1h { z10.h }, p2/Z, [x24, x15, LSL #1]\n"
+ "fmla z26.h, p3/M, z6.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x14, x15, LSL #1]\n"
+ "ldr x24, [x16, #0xa8]\n"
+ "fmla z25.h, p3/M, z5.h, z13.h\n"
+ "ld1h { z13.h }, p2/Z, [x25, x15, LSL #1]\n"
+ "ldr x25, [x16, #0xb8]\n"
+ "fmla z27.h, p3/M, z7.h, z10.h\n"
+ "fmla z28.h, p3/M, z6.h, z10.h\n"
+ "fmla z30.h, p3/M, z4.h, z10.h\n"
+ "fmla z29.h, p3/M, z5.h, z10.h\n"
+ "ldr x14, [x16, #0xc0]\n"
+ "fmla z31.h, p3/M, z3.h, z10.h\n"
+ "fmla z26.h, p3/M, z8.h, z10.h\n"
+ "ldr x20, [x28, #0x18]\n"
+ "fmla z24.h, p3/M, z3.h, z12.h\n"
+ "fmla z23.h, p3/M, z4.h, z12.h\n"
+ "fmla z28.h, p3/M, z8.h, z11.h\n"
+ "fmla z27.h, p3/M, z0.h, z12.h\n"
+ "fmla z30.h, p3/M, z6.h, z13.h\n"
+ "fmla z29.h, p3/M, z7.h, z13.h\n"
+ "ld1h { z13.h }, p2/Z, [x12, x15, LSL #1]\n"
+ "fmla z31.h, p3/M, z5.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x13, x15, LSL #1]\n"
+ "fmla z26.h, p3/M, z1.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x11, x15, LSL #1]\n"
+ "fmla z24.h, p3/M, z5.h, z11.h\n"
+ "fmla z25.h, p3/M, z4.h, z11.h\n"
+ "fmla z27.h, p3/M, z2.h, z11.h\n"
+ "fmla z28.h, p3/M, z1.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x9, x15, LSL #1]\n"
+ "fmla z30.h, p3/M, z8.h, z13.h\n"
+ "fmla z26.h, p3/M, z7.h, z12.h\n"
+ "fmla z29.h, p3/M, z4.h, z12.h\n"
+ "fmla z31.h, p3/M, z7.h, z13.h\n"
+ "ld1h { z13.h }, p2/Z, [x24, x15, LSL #1]\n"
+ "fmla z23.h, p3/M, z2.h, z11.h\n"
+ "fmla z24.h, p3/M, z1.h, z11.h\n"
+ "fmla z27.h, p3/M, z6.h, z12.h\n"
+ "fmla z25.h, p3/M, z0.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x25, x15, LSL #1]\n"
+ "fmla z30.h, p3/M, z3.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x26, x15, LSL #1]\n"
+ "fmla z28.h, p3/M, z7.h, z13.h\n"
+ "fmla z31.h, p3/M, z4.h, z13.h\n"
+ "fmla z23.h, p3/M, z6.h, z12.h\n"
+ "fmla z29.h, p3/M, z0.h, z12.h\n"
+ "fmla z27.h, p3/M, z8.h, z13.h\n"
+ "fmla z26.h, p3/M, z3.h, z12.h\n"
+ "fmla z25.h, p3/M, z8.h, z11.h\n"
+ "fmla z30.h, p3/M, z5.h, z13.h\n"
+ "ld1h { z13.h }, p2/Z, [x14, x15, LSL #1]\n"
+ "fmla z28.h, p3/M, z5.h, z11.h\n"
+ "fmax z23.h, p3/M, z23.h, z18.h\n"
+ "fmla z31.h, p3/M, z2.h, z11.h\n"
+ "fmla z29.h, p3/M, z8.h, z13.h\n"
+ ".inst 0xc170ca58 // fclamp { z24.h-z27.h }, z18.h, z16.h\n"
+ "fmla z30.h, p3/M, z7.h, z13.h\n"
+ "fmin z23.h, p3/M, z23.h, z16.h\n"
+ "fmla z31.h, p3/M, z6.h, z13.h\n"
+ "st1h { z24.h }, p1, [x22, x27, LSL #1]\n"
+ "ldr x22, [x28, #0x28]\n"
+ "st1h { z25.h }, p1, [x21, x27, LSL #1]\n"
+ "ldr x21, [x28, #0x30]\n"
+ "st1h { z26.h }, p1, [x20, x27, LSL #1]\n"
+ "ldr x20, [x28, #0x38]\n"
+ "st1h { z23.h }, p1, [x23, x27, LSL #1]\n"
+ "ldr x23, [x28, #0x20]\n"
+ ".inst 0xc170ca5c // fclamp { z28.h-z31.h }, z18.h, z16.h\n"
+ "st1h { z27.h }, p1, [x23, x27, LSL #1]\n"
+ "ldr x23, [x28, #0x40]\n"
+ "st1h { z28.h }, p1, [x22, x27, LSL #1]\n"
+ "st1h { z29.h }, p1, [x21, x27, LSL #1]\n"
+ "st1h { z30.h }, p1, [x20, x27, LSL #1]\n"
+ "st1h { z31.h }, p1, [x23, x27, LSL #1]\n"
+ ".inst 0xd503467f // SMSTOP\n"
+ :
+ : [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME2)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp
new file mode 100644
index 0000000000..6b75d12295
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp
@@ -0,0 +1,67 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+namespace arm_conv {
+namespace depthwise {
+
+// Entry points of the generated SME2 kernels (defined in the accompanying
+// generic_indirect.cpp / generic_direct.cpp files): the "indirect" form reads
+// input pixels through an array of per-tap pointers, the "direct" form walks
+// a tile grid over a contiguous tensor using row/column strides.
+void sme2_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(const __fp16 *const *const input_ptrs, __fp16 *const *const outptrs, const void *params, unsigned int n_channels, const __fp16 activation_min, const __fp16 activation_max);
+void sme2_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(const unsigned int n_tile_rows, const unsigned int n_tile_cols, const __fp16 *inptr, int64_t ld_input_row, int64_t ld_input_col, __fp16 *outptr, int64_t ld_output_row, int64_t ld_output_col, const void *params, unsigned int n_channels, const __fp16 activation_min, const __fp16 activation_max);
+
+// Depthwise depth-first strategy descriptor for an SME2 FP16 NHWC kernel with
+// a 3x3 filter, stride 1, producing a 4x4 spatial output tile per invocation.
+// It only wires the kernel geometry and the two generated entry points into
+// the generic DepthwiseDepthfirstStrategy machinery.
+class sme2_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst : public DepthwiseDepthfirstStrategy<__fp16, __fp16, __fp16, __fp16>
+{
+  private:
+  using Parent = DepthwiseDepthfirstStrategy<__fp16, __fp16, __fp16, __fp16>;
+  // Pointers to the generated implementations returned by the accessors below.
+  Parent::IndirectKernelType m_indirect_kernel = sme2_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl;
+  Parent::DirectKernelType m_direct_kernel = sme2_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl;
+
+  public:
+  using return_type = __fp16;
+  // Kernel uses the Scalable Matrix Extension vector length.
+  constexpr static auto vl_type = arm_gemm::VLType::SME;
+
+  constexpr static unsigned int kernel_rows = 3;
+  constexpr static unsigned int kernel_cols = 3;
+
+  constexpr static unsigned int stride_rows = 1;
+  constexpr static unsigned int stride_cols = 1;
+
+  constexpr static unsigned int output_rows = 4;
+  constexpr static unsigned int output_cols = 4;
+
+  sme2_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst(const CPUInfo *)
+  : Parent(output_rows, output_cols, kernel_rows, kernel_cols, stride_rows, stride_cols) {}
+
+  arm_gemm::VLType get_vl_type(void) const override { return vl_type; }
+
+  Parent::IndirectKernelType get_indirect_kernel() const override { return m_indirect_kernel; }
+  Parent::DirectKernelType get_direct_kernel() const override { return m_direct_kernel; }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp
new file mode 100644
index 0000000000..37a9febf47
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp
@@ -0,0 +1,672 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(ARM_COMPUTE_ENABLE_SME2)
+
+namespace arm_conv {
+namespace depthwise {
+
+// Direct (tile-walking) variant of the SME2 FP16 NHWC 3x3 stride-1 depthwise
+// kernel.  The input is processed as an n_tile_rows x n_tile_cols grid; each
+// tile produces a 4x4 block of output pixels for all channels, with results
+// clamped to [activation_min, activation_max].  The body is generated
+// streaming-SVE assembly; only the Args marshalling struct is hand-readable.
+void sme2_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(
+  const unsigned int n_tile_rows,
+  const unsigned int n_tile_cols,
+  const __fp16 *inptr,
+  int64_t ld_input_row,
+  int64_t ld_input_col,
+  __fp16 *outptr,
+  int64_t ld_output_row,
+  int64_t ld_output_col,
+  const void *params,
+  unsigned int n_channels,
+  const __fp16 activation_min,
+  const __fp16 activation_max
+)
+{
+  // Argument block read by the inline assembly via offsetof(); any change to
+  // this struct's layout must be mirrored in the asm operand list below.
+  // tile_i/tile_j are live loop state: the asm stores and reloads them here.
+  struct Args
+  {
+    const uint64_t n_tile_rows, n_tile_cols;
+    const __fp16 *inptr;
+    const uint64_t ld_input_row;
+    const uint64_t ld_input_col;
+    __fp16 *outptr;
+    const uint64_t ld_output_row;
+    const uint64_t ld_output_col;
+    const void *params;
+    const __fp16 min, max;
+
+    uint64_t tile_i = 0, tile_j = 0;
+
+    Args(
+      const unsigned int n_tile_rows,
+      const unsigned int n_tile_cols,
+      const __fp16 *inptr,
+      int64_t ld_input_row,
+      int64_t ld_input_col,
+      __fp16 *outptr,
+      int64_t ld_output_row,
+      int64_t ld_output_col,
+      const void *params,
+      // NOTE(review): these are taken as float and narrowed into the __fp16
+      // members, although the enclosing function receives __fp16 and the
+      // indirect variant's Args takes __fp16 directly.  The fp16->float->fp16
+      // round trip is value-preserving, but the inconsistency looks like
+      // generator noise — confirm against the kernel generator.
+      const float activation_min,
+      const float activation_max
+    ) : n_tile_rows(n_tile_rows), n_tile_cols(n_tile_cols), inptr(inptr),
+        ld_input_row(ld_input_row), ld_input_col(ld_input_col), outptr(outptr),
+        ld_output_row(ld_output_row), ld_output_col(ld_output_col),
+        params(params), min(activation_min), max(activation_max)
+    {
+    }
+  };
+
+  Args params_struct(
+    n_tile_rows, n_tile_cols,
+    inptr, ld_input_row, ld_input_col,
+    outptr, ld_output_row, ld_output_col,
+    params, activation_min, activation_max
+  );
+
+  // Generated SME2 assembly.  Structure: SMSTART enters streaming mode;
+  // label 1 is the per-tile loop (tile_i/tile_j kept in the Args struct);
+  // the block ending at label 2 issues one-shot rprfm prefetches for the
+  // tile's input rows (first tile column only); label 3 is the main
+  // vector-length channel loop and label 4 the channel tail, which also
+  // advances the tile indices and branches back to label 1.  SMSTOP exits
+  // streaming mode.  Do not edit by hand.
+  __asm__ __volatile__(
+    ".inst 0xd503477f  // SMSTART ZA\n"
+    "mov x1, #0x0\n"
+    "mov x2, #0x0\n"
+    "ptrue p3.b\n"
+    ".inst 0x25207810  // ptrue pn8.b\n"
+    "1:"  // Tile loop
+    "str x1, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+    "mov x22, #0x4\n"
+    "str x2, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+    "ldr x21, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
+    "ldr x3, [%x[params_struct], %[offsetof_args_ld_input_col]]\n"
+    "ldr x4, [%x[params_struct], %[offsetof_args_inptr]]\n"
+    "mul x20, x1, x21\n"  // offset = tile_i * ld_input_row
+    "ldr x5, [%x[params_struct], %[offsetof_args_params]]\n"
+    "madd x20, x2, x3, x20\n"  // offset += tile_j * ld_input_col
+    "mul x20, x20, x22\n"  // offset *= kernel_stride * output_size
+    "add x6, x3, x3\n"
+    "add x4, x4, x20, LSL #1\n"  // inptr[0] += offset * sizeof(__fp16)
+    "add x7, x4, x21, LSL #1\n"
+    "add x8, x6, x3\n"
+    "add x17, x7, x21, LSL #1\n"
+    "add x16, x8, x3\n"
+    "add x15, x17, x21, LSL #1\n"
+    "add x14, x16, x3\n"
+    "add x13, x15, x21, LSL #1\n"
+    "add x12, x13, x21, LSL #1\n"
+    "cbnz x2, 2f\n"
+    "ldr x11, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+    "lsl x10, %x[n_channels], #0x1\n"
+    "mov x21, #0x8\n"
+    "mul x21, x21, x3\n"
+    "add x9, x17, x6, LSL #1\n"
+    "add x28, x4, x14, LSL #1\n"
+    "add x27, x17, x8, LSL #1\n"
+    "sub x20, x11, x2\n"
+    "add x26, x12, x14, LSL #1\n"
+    "sub x20, x20, #0x1\n"
+    "add x25, x15, x6, LSL #1\n"
+    "and x20, x20, #0x3fffff\n"
+    "add x24, x4, x3, LSL #1\n"
+    "orr x10, x10, x20, LSL #22\n"
+    "add x23, x4, x16, LSL #1\n"
+    "orr x10, x10, x21, LSL #38\n"
+    "add x22, x15, x8, LSL #1\n"
+    "add x21, x7, x14, LSL #1\n"
+    "add x20, x7, x6, LSL #1\n"
+    ".inst 0xf8aa493a  // rprfm pldonce, x10, [x9]\n"
+    "add x9, x13, x14, LSL #1\n"
+    ".inst 0xf8aa489a  // rprfm pldonce, x10, [x4]\n"
+    ".inst 0xf8aa4b9a  // rprfm pldonce, x10, [x28]\n"
+    "add x28, x7, x8, LSL #1\n"
+    ".inst 0xf8aa4b7a  // rprfm pldonce, x10, [x27]\n"
+    "add x27, x12, x3, LSL #1\n"
+    ".inst 0xf8aa499a  // rprfm pldonce, x10, [x12]\n"
+    ".inst 0xf8aa4b5a  // rprfm pldonce, x10, [x26]\n"
+    "add x26, x17, x3, LSL #1\n"
+    ".inst 0xf8aa4b3a  // rprfm pldonce, x10, [x25]\n"
+    "add x25, x12, x16, LSL #1\n"
+    ".inst 0xf8aa4b1a  // rprfm pldonce, x10, [x24]\n"
+    "add x24, x17, x16, LSL #1\n"
+    ".inst 0xf8aa4afa  // rprfm pldonce, x10, [x23]\n"
+    "add x23, x4, x6, LSL #1\n"
+    ".inst 0xf8aa4ada  // rprfm pldonce, x10, [x22]\n"
+    "add x22, x15, x3, LSL #1\n"
+    ".inst 0xf8aa48fa  // rprfm pldonce, x10, [x7]\n"
+    ".inst 0xf8aa4aba  // rprfm pldonce, x10, [x21]\n"
+    "add x21, x4, x8, LSL #1\n"
+    ".inst 0xf8aa49ba  // rprfm pldonce, x10, [x13]\n"
+    ".inst 0xf8aa4a9a  // rprfm pldonce, x10, [x20]\n"
+    "add x20, x15, x16, LSL #1\n"
+    ".inst 0xf8aa493a  // rprfm pldonce, x10, [x9]\n"
+    "add x9, x17, x14, LSL #1\n"
+    ".inst 0xf8aa4b9a  // rprfm pldonce, x10, [x28]\n"
+    "add x28, x13, x6, LSL #1\n"
+    ".inst 0xf8aa4b7a  // rprfm pldonce, x10, [x27]\n"
+    "add x27, x15, x14, LSL #1\n"
+    ".inst 0xf8aa4b5a  // rprfm pldonce, x10, [x26]\n"
+    "add x26, x12, x6, LSL #1\n"
+    ".inst 0xf8aa4b3a  // rprfm pldonce, x10, [x25]\n"
+    "add x25, x13, x8, LSL #1\n"
+    ".inst 0xf8aa4b1a  // rprfm pldonce, x10, [x24]\n"
+    "add x24, x12, x8, LSL #1\n"
+    ".inst 0xf8aa4afa  // rprfm pldonce, x10, [x23]\n"
+    "add x23, x7, x3, LSL #1\n"
+    ".inst 0xf8aa4ada  // rprfm pldonce, x10, [x22]\n"
+    "add x22, x7, x16, LSL #1\n"
+    ".inst 0xf8aa4aba  // rprfm pldonce, x10, [x21]\n"
+    "add x21, x13, x3, LSL #1\n"
+    ".inst 0xf8aa4a3a  // rprfm pldonce, x10, [x17]\n"
+    ".inst 0xf8aa4a9a  // rprfm pldonce, x10, [x20]\n"
+    "add x20, x13, x16, LSL #1\n"
+    ".inst 0xf8aa493a  // rprfm pldonce, x10, [x9]\n"
+    ".inst 0xf8aa49fa  // rprfm pldonce, x10, [x15]\n"
+    ".inst 0xf8aa4b9a  // rprfm pldonce, x10, [x28]\n"
+    ".inst 0xf8aa4b7a  // rprfm pldonce, x10, [x27]\n"
+    ".inst 0xf8aa4b5a  // rprfm pldonce, x10, [x26]\n"
+    ".inst 0xf8aa4b3a  // rprfm pldonce, x10, [x25]\n"
+    ".inst 0xf8aa4b1a  // rprfm pldonce, x10, [x24]\n"
+    ".inst 0xf8aa4afa  // rprfm pldonce, x10, [x23]\n"
+    ".inst 0xf8aa4ada  // rprfm pldonce, x10, [x22]\n"
+    ".inst 0xf8aa4aba  // rprfm pldonce, x10, [x21]\n"
+    ".inst 0xf8aa4a9a  // rprfm pldonce, x10, [x20]\n"
+    "2:"  // Tile loop: Prefetch input rows: End
+    "ldr x22, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
+    "mov x21, #0x4\n"
+    "ld1h { z15.h }, p3/Z, [x5]\n"
+    "addvl x5, x5, #1\n"
+    "ldr x9, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
+    "cnth x28\n"
+    ".inst 0xa040a0a0  // ld1h { z0.h-z3.h }, pn8.b/Z, [x5]\n"
+    "addvl x5, x5, #4\n"
+    "ldr x27, [%x[params_struct], %[offsetof_args_outptr]]\n"
+    "whilelt p2.h, XZR, %x[n_channels]\n"
+    ".inst 0xa040a0a4  // ld1h { z4.h-z7.h }, pn8.b/Z, [x5]\n"
+    "addvl x5, x5, #4\n"
+    "mul x20, x1, x22\n"  // offset = tile_i * ld_output_row
+    "cmp x28, %x[n_channels]\n"
+    "ld1rh { z14.h }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+    "madd x20, x2, x9, x20\n"  // offset += tile_j * ld_output_col
+    "add x26, x9, x9\n"
+    "ld1rh { z13.h }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+    "mul x20, x20, x21\n"  // offset *= output_tile_size
+    "add x25, x26, x9\n"
+    "ld1h { z8.h }, p3/Z, [x5]\n"
+    "add x27, x27, x20, LSL #1\n"  // outptrs[0] += offset * sizeof(__fp16)
+    "mov x21, #0x0\n"
+    "ld1h { z9.h }, p2/Z, [x17, x6, LSL #1]\n"
+    "add x24, x27, x22, LSL #1\n"
+    "sub x20, XZR, x28\n"
+    "ld1h { z10.h }, p2/Z, [x4]\n"
+    "add x23, x24, x22, LSL #1\n"
+    "ld1h { z11.h }, p2/Z, [x4, x14, LSL #1]\n"
+    "addvl x5, x5, #1\n"
+    "add x22, x23, x22, LSL #1\n"
+    "ld1h { z12.h }, p2/Z, [x17, x8, LSL #1]\n"
+    "bge 4f\n"
+    "3:"  // Tile loop: Channel loop
+    "movprfx z21, z15\n fmla z21.h, p3/M, z4.h, z9.h\n"
+    "movprfx z16, z15\n fmla z16.h, p3/M, z8.h, z9.h\n"
+    "whilelt p1.h, x28, %x[n_channels]\n"
+    "inch x21\n"
+    "movprfx z22, z15\n fmla z22.h, p3/M, z3.h, z9.h\n"
+    "movprfx z25, z15\n fmla z25.h, p3/M, z1.h, z9.h\n"
+    "inch x28\n"
+    "mov p0.b, p2.b\n"
+    "movprfx z26, z15\n fmla z26.h, p3/M, z0.h, z9.h\n"
+    "movprfx z17, z15\n fmla z17.h, p3/M, z7.h, z9.h\n"
+    "inch x20\n"
+    "movprfx z18, z15\n fmla z18.h, p3/M, z6.h, z9.h\n"
+    "movprfx z20, z15\n fmla z20.h, p3/M, z5.h, z9.h\n"
+    "fmla z21.h, p3/M, z5.h, z12.h\n"
+    "movprfx z24, z15\n fmla z24.h, p3/M, z2.h, z9.h\n"
+    "ld1h { z9.h }, p2/Z, [x15, x6, LSL #1]\n"
+    "fmla z16.h, p3/M, z0.h, z10.h\n"
+    "movprfx z19, z15\n fmla z19.h, p3/M, z2.h, z11.h\n"
+    "ld1h { z10.h }, p2/Z, [x12]\n"
+    "fmla z22.h, p3/M, z4.h, z12.h\n"
+    "fmla z25.h, p3/M, z2.h, z12.h\n"
+    "ld1h { z11.h }, p2/Z, [x12, x14, LSL #1]\n"
+    "fmla z26.h, p3/M, z1.h, z12.h\n"
+    "fmla z17.h, p3/M, z8.h, z12.h\n"
+    "movprfx z28, z15\n fmla z28.h, p3/M, z6.h, z10.h\n"
+    "fmla z21.h, p3/M, z7.h, z9.h\n"
+    "ld1h { z10.h }, p2/Z, [x15, x8, LSL #1]\n"
+    "fmla z18.h, p3/M, z7.h, z12.h\n"
+    "fmla z19.h, p3/M, z6.h, z12.h\n"
+    "movprfx z23, z15\n fmla z23.h, p3/M, z3.h, z12.h\n"
+    "movprfx z27, z15\n fmla z27.h, p3/M, z0.h, z12.h\n"
+    "ld1h { z12.h }, p2/Z, [x4, x3, LSL #1]\n"
+    "movprfx z31, z15\n fmla z31.h, p3/M, z8.h, z11.h\n"
+    "fmla z22.h, p3/M, z6.h, z9.h\n"
+    "ld1h { z11.h }, p2/Z, [x4, x16, LSL #1]\n"
+    "fmla z25.h, p3/M, z4.h, z9.h\n"
+    "fmla z26.h, p3/M, z3.h, z9.h\n"
+    "movprfx z29, z15\n fmla z29.h, p3/M, z1.h, z9.h\n"
+    "movprfx z30, z15\n fmla z30.h, p3/M, z0.h, z9.h\n"
+    "ld1h { z15.h }, p3/Z, [x5]\n"
+    "addvl x5, x5, #1\n"
+    "fmla z20.h, p3/M, z8.h, z9.h\n"
+    "fmla z24.h, p3/M, z5.h, z9.h\n"
+    "fmla z28.h, p3/M, z2.h, z9.h\n"
+    "fmla z21.h, p3/M, z8.h, z10.h\n"
+    "ld1h { z9.h }, p2/Z, [x7]\n"
+    "fmla z16.h, p3/M, z1.h, z12.h\n"
+    "fmla z17.h, p3/M, z0.h, z12.h\n"
+    "ld1h { z12.h }, p2/Z, [x7, x14, LSL #1]\n"
+    "fmla z18.h, p3/M, z2.h, z11.h\n"
+    "fmla z19.h, p3/M, z1.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x13]\n"
+    "fmla z22.h, p3/M, z7.h, z10.h\n"
+    "fmla z23.h, p3/M, z6.h, z10.h\n"
+    "fmla z25.h, p3/M, z5.h, z10.h\n"
+    "fmla z26.h, p3/M, z4.h, z10.h\n"
+    "fmla z27.h, p3/M, z3.h, z10.h\n"
+    "fmla z29.h, p3/M, z2.h, z10.h\n"
+    "fmla z30.h, p3/M, z1.h, z10.h\n"
+    "fmla z31.h, p3/M, z0.h, z10.h\n"
+    "ld1h { z10.h }, p2/Z, [x7, x6, LSL #1]\n"
+    "fmla z20.h, p3/M, z0.h, z9.h\n"
+    "fmla z24.h, p3/M, z6.h, z11.h\n"
+    "fmla z28.h, p3/M, z3.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x13, x14, LSL #1]\n"
+    "fmla z16.h, p3/M, z3.h, z9.h\n"
+    "fmla z21.h, p3/M, z1.h, z10.h\n"
+    "fmla z19.h, p3/M, z5.h, z12.h\n"
+    "fmla z23.h, p3/M, z2.h, z12.h\n"
+    "fmla z17.h, p3/M, z4.h, z10.h\n"
+    "ld1h { z12.h }, p2/Z, [x7, x8, LSL #1]\n"
+    "fmla z18.h, p3/M, z3.h, z10.h\n"
+    "fmla z22.h, p3/M, z0.h, z10.h\n"
+    "fmla z27.h, p3/M, z8.h, z11.h\n"
+    "fmla z31.h, p3/M, z5.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x12, x3, LSL #1]\n"
+    "fmla z20.h, p3/M, z2.h, z10.h\n"
+    "fmla z21.h, p3/M, z2.h, z12.h\n"
+    "fmla z16.h, p3/M, z5.h, z10.h\n"
+    "fmla z17.h, p3/M, z5.h, z12.h\n"
+    "ld1h { z10.h }, p2/Z, [x17, x3, LSL #1]\n"
+    "fmla z18.h, p3/M, z4.h, z12.h\n"
+    "fmla z19.h, p3/M, z3.h, z12.h\n"
+    "fmla z22.h, p3/M, z1.h, z12.h\n"
+    "fmla z23.h, p3/M, z0.h, z12.h\n"
+    "ld1h { z12.h }, p2/Z, [x17, x16, LSL #1]\n"
+    "fmla z28.h, p3/M, z7.h, z11.h\n"
+    "fmla z29.h, p3/M, z6.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x12, x16, LSL #1]\n"
+    "fmla z20.h, p3/M, z4.h, z10.h\n"
+    "fmla z21.h, p3/M, z3.h, z10.h\n"
+    "fmla z24.h, p3/M, z1.h, z10.h\n"
+    "fmla z25.h, p3/M, z0.h, z10.h\n"
+    "fmla z16.h, p3/M, z7.h, z10.h\n"
+    "fmla z17.h, p3/M, z6.h, z10.h\n"
+    "ld1h { z10.h }, p2/Z, [x4, x6, LSL #1]\n"
+    "fmla z30.h, p3/M, z8.h, z11.h\n"
+    "fmla z31.h, p3/M, z7.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x15, x3, LSL #1]\n"
+    "fmla z18.h, p3/M, z8.h, z12.h\n"
+    "fmla z19.h, p3/M, z7.h, z12.h\n"
+    "fmla z22.h, p3/M, z5.h, z12.h\n"
+    "fmla z23.h, p3/M, z4.h, z12.h\n"
+    "fmla z26.h, p3/M, z2.h, z12.h\n"
+    "fmla z27.h, p3/M, z1.h, z12.h\n"
+    "ld1h { z12.h }, p2/Z, [x4, x8, LSL #1]\n"
+    "addvl x4, x4, #1\n"
+    "fmla z20.h, p3/M, z7.h, z11.h\n"
+    "fmla z21.h, p3/M, z6.h, z11.h\n"
+    "fmla z24.h, p3/M, z4.h, z11.h\n"
+    "fmla z25.h, p3/M, z3.h, z11.h\n"
+    "fmla z28.h, p3/M, z1.h, z11.h\n"
+    "fmla z29.h, p3/M, z0.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x15, x16, LSL #1]\n"
+    "fmla z16.h, p3/M, z2.h, z10.h\n"
+    "fmla z17.h, p3/M, z1.h, z10.h\n"
+    "fmla z18.h, p3/M, z0.h, z10.h\n"
+    "ld1h { z10.h }, p2/Z, [x17]\n"
+    "fmla z19.h, p3/M, z0.h, z12.h\n"
+    "fmla z30.h, p3/M, z2.h, z11.h\n"
+    "fmla z22.h, p3/M, z8.h, z11.h\n"
+    "fmla z23.h, p3/M, z7.h, z11.h\n"
+    "fmla z26.h, p3/M, z5.h, z11.h\n"
+    "fmla z20.h, p3/M, z3.h, z10.h\n"
+    "fmla z24.h, p3/M, z0.h, z10.h\n"
+    "fmla z27.h, p3/M, z4.h, z11.h\n"
+    "fmla z31.h, p3/M, z1.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x13, x6, LSL #1]\n"
+    "fmla z17.h, p3/M, z2.h, z12.h\n"
+    "fmla z18.h, p3/M, z1.h, z12.h\n"
+    "ld1h { z12.h }, p2/Z, [x17, x14, LSL #1]\n"
+    "addvl x17, x17, #1\n"
+    "fmla z16.h, p3/M, z6.h, z10.h\n"
+    "ld1h { z10.h }, p2/Z, [x15]\n"
+    "fmla z29.h, p3/M, z4.h, z11.h\n"
+    "fmla z30.h, p3/M, z3.h, z11.h\n"
+    "fmla z25.h, p3/M, z7.h, z11.h\n"
+    "ld1h { z9.h }, p1/Z, [x17, x6, LSL #1]\n"
+    "fmla z19.h, p3/M, z8.h, z12.h\n"
+    "fmla z23.h, p3/M, z5.h, z12.h\n"
+    "fmla z27.h, p3/M, z2.h, z12.h\n"
+    "fmla z20.h, p3/M, z6.h, z10.h\n"
+    "ld1h { z12.h }, p2/Z, [x15, x14, LSL #1]\n"
+    "addvl x15, x15, #1\n"
+    "fmla z24.h, p3/M, z3.h, z10.h\n"
+    "fmla z28.h, p3/M, z0.h, z10.h\n"
+    "ld1h { z10.h }, p2/Z, [x12, x6, LSL #1]\n"
+    "fmla z26.h, p3/M, z6.h, z11.h\n"
+    "fmla z31.h, p3/M, z2.h, z12.h\n"
+    "fmla z23.h, p3/M, z8.h, z12.h\n"
+    "fmla z29.h, p3/M, z7.h, z10.h\n"
+    "fmla z30.h, p3/M, z6.h, z10.h\n"
+    "fmla z27.h, p3/M, z5.h, z12.h\n"
+    "ld1h { z12.h }, p2/Z, [x12, x8, LSL #1]\n"
+    "addvl x12, x12, #1\n"
+    "fmla z24.h, p3/M, z8.h, z11.h\n"
+    "fmla z28.h, p3/M, z5.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x13, x8, LSL #1]\n"
+    "fmla z29.h, p3/M, z5.h, z11.h\n"
+    "fmla z30.h, p3/M, z4.h, z11.h\n"
+    "fmla z31.h, p3/M, z3.h, z11.h\n"
+    "fmla z25.h, p3/M, z8.h, z11.h\n"
+    "fmla z26.h, p3/M, z7.h, z11.h\n"
+    "fmla z27.h, p3/M, z6.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x7, x16, LSL #1]\n"
+    "fmla z28.h, p3/M, z8.h, z10.h\n"
+    "ld1h { z10.h }, p2/Z, [x7, x3, LSL #1]\n"
+    "addvl x7, x7, #1\n"
+    "fmla z29.h, p3/M, z8.h, z12.h\n"
+    "fmla z30.h, p3/M, z7.h, z12.h\n"
+    "fmla z31.h, p3/M, z6.h, z12.h\n"
+    "ld1h { z12.h }, p2/Z, [x13, x3, LSL #1]\n"
+    "fmla z18.h, p3/M, z5.h, z11.h\n"
+    "fmla z19.h, p3/M, z4.h, z11.h\n"
+    "fmla z16.h, p3/M, z4.h, z10.h\n"
+    "fmla z17.h, p3/M, z3.h, z10.h\n"
+    "fmla z20.h, p3/M, z1.h, z10.h\n"
+    "fmla z21.h, p3/M, z0.h, z10.h\n"
+    "ld1h { z10.h }, p2/Z, [x13, x16, LSL #1]\n"
+    "whilelt p2.h, x21, %x[n_channels]\n"
+    "fmla z22.h, p3/M, z2.h, z11.h\n"
+    "fmla z23.h, p3/M, z1.h, z11.h\n"
+    "cmp x28, %x[n_channels]\n"
+    "addvl x13, x13, #1\n"
+    "fmla z24.h, p3/M, z7.h, z12.h\n"
+    "fmla z25.h, p3/M, z6.h, z12.h\n"
+    "ld1h { z11.h }, p1/Z, [x4, x14, LSL #1]\n"
+    "fmla z28.h, p3/M, z4.h, z12.h\n"
+    "fmla z29.h, p3/M, z3.h, z12.h\n"
+    ".inst 0xa040a0a0  // ld1h { z0.h-z3.h }, pn8.b/Z, [x5]\n"
+    "addvl x5, x5, #4\n"
+    "fmla z26.h, p3/M, z8.h, z10.h\n"
+    "fmla z27.h, p3/M, z7.h, z10.h\n"
+    "ld1h { z12.h }, p1/Z, [x17, x8, LSL #1]\n"
+    "fmla z30.h, p3/M, z5.h, z10.h\n"
+    "fmla z31.h, p3/M, z4.h, z10.h\n"
+    ".inst 0xa040a0a4  // ld1h { z4.h-z7.h }, pn8.b/Z, [x5]\n"
+    "addvl x5, x5, #4\n"
+    ".inst 0xc16dc9d0  // fclamp { z16.h-z19.h }, z14.h, z13.h\n"
+    ".inst 0xc16dc9d4  // fclamp { z20.h-z23.h }, z14.h, z13.h\n"
+    "ld1h { z10.h }, p1/Z, [x4]\n"
+    "ld1h { z8.h }, p3/Z, [x5]\n"
+    "addvl x5, x5, #1\n"
+    ".inst 0xc16dc9d8  // fclamp { z24.h-z27.h }, z14.h, z13.h\n"
+    ".inst 0xc16dc9dc  // fclamp { z28.h-z31.h }, z14.h, z13.h\n"
+    "st1h { z16.h }, p0, [x27]\n"
+    "st1h { z17.h }, p0, [x27, x9, LSL #1]\n"
+    "st1h { z18.h }, p0, [x27, x26, LSL #1]\n"
+    "st1h { z19.h }, p0, [x27, x25, LSL #1]\n"
+    "addvl x27, x27, #1\n"
+    "st1h { z20.h }, p0, [x24]\n"
+    "st1h { z21.h }, p0, [x24, x9, LSL #1]\n"
+    "st1h { z22.h }, p0, [x24, x26, LSL #1]\n"
+    "st1h { z23.h }, p0, [x24, x25, LSL #1]\n"
+    "addvl x24, x24, #1\n"
+    "st1h { z24.h }, p0, [x23]\n"
+    "st1h { z25.h }, p0, [x23, x9, LSL #1]\n"
+    "st1h { z26.h }, p0, [x23, x26, LSL #1]\n"
+    "st1h { z27.h }, p0, [x23, x25, LSL #1]\n"
+    "addvl x23, x23, #1\n"
+    "st1h { z28.h }, p0, [x22]\n"
+    "st1h { z29.h }, p0, [x22, x9, LSL #1]\n"
+    "st1h { z30.h }, p0, [x22, x26, LSL #1]\n"
+    "st1h { z31.h }, p0, [x22, x25, LSL #1]\n"
+    "addvl x22, x22, #1\n"
+    "blt 3b\n"
+    "4:"  // Tile loop: Channel tail
+    "movprfx z21, z15\n fmla z21.h, p3/M, z4.h, z9.h\n"
+    "movprfx z16, z15\n fmla z16.h, p3/M, z8.h, z9.h\n"
+    "ldr x2, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+    "mov p0.b, p2.b\n"
+    "movprfx z22, z15\n fmla z22.h, p3/M, z3.h, z9.h\n"
+    "movprfx z25, z15\n fmla z25.h, p3/M, z1.h, z9.h\n"
+    "ldr x1, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+    "movprfx z26, z15\n fmla z26.h, p3/M, z0.h, z9.h\n"
+    "movprfx z17, z15\n fmla z17.h, p3/M, z7.h, z9.h\n"
+    "ldr x11, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+    "movprfx z18, z15\n fmla z18.h, p3/M, z6.h, z9.h\n"
+    "movprfx z20, z15\n fmla z20.h, p3/M, z5.h, z9.h\n"
+    "ldr x21, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
+    "add x2, x2, #0x1\n"
+    "fmla z21.h, p3/M, z5.h, z12.h\n"
+    "movprfx z24, z15\n fmla z24.h, p3/M, z2.h, z9.h\n"
+    "ld1h { z9.h }, p2/Z, [x15, x6, LSL #1]\n"
+    "add x20, x1, #0x1\n"
+    "fmla z16.h, p3/M, z0.h, z10.h\n"
+    "movprfx z19, z15\n fmla z19.h, p3/M, z2.h, z11.h\n"
+    "ld1h { z10.h }, p2/Z, [x12]\n"
+    "cmp x2, x11\n"
+    "fmla z22.h, p3/M, z4.h, z12.h\n"
+    "fmla z25.h, p3/M, z2.h, z12.h\n"
+    "ld1h { z11.h }, p2/Z, [x12, x14, LSL #1]\n"
+    "csel x1, x1, x20, LT\n"
+    "fmla z26.h, p3/M, z1.h, z12.h\n"
+    "fmla z17.h, p3/M, z8.h, z12.h\n"
+    "csel x2, x2, XZR, LT\n"
+    "cmp x1, x21\n"
+    "movprfx z28, z15\n fmla z28.h, p3/M, z6.h, z10.h\n"
+    "fmla z21.h, p3/M, z7.h, z9.h\n"
+    "ld1h { z10.h }, p2/Z, [x15, x8, LSL #1]\n"
+    "fmla z18.h, p3/M, z7.h, z12.h\n"
+    "fmla z19.h, p3/M, z6.h, z12.h\n"
+    "movprfx z23, z15\n fmla z23.h, p3/M, z3.h, z12.h\n"
+    "movprfx z27, z15\n fmla z27.h, p3/M, z0.h, z12.h\n"
+    "ld1h { z12.h }, p2/Z, [x4, x3, LSL #1]\n"
+    "movprfx z31, z15\n fmla z31.h, p3/M, z8.h, z11.h\n"
+    "fmla z22.h, p3/M, z6.h, z9.h\n"
+    "ld1h { z11.h }, p2/Z, [x4, x16, LSL #1]\n"
+    "fmla z25.h, p3/M, z4.h, z9.h\n"
+    "fmla z26.h, p3/M, z3.h, z9.h\n"
+    "movprfx z29, z15\n fmla z29.h, p3/M, z1.h, z9.h\n"
+    "movprfx z30, z15\n fmla z30.h, p3/M, z0.h, z9.h\n"
+    "fmla z20.h, p3/M, z8.h, z9.h\n"
+    "fmla z24.h, p3/M, z5.h, z9.h\n"
+    "fmla z28.h, p3/M, z2.h, z9.h\n"
+    "fmla z21.h, p3/M, z8.h, z10.h\n"
+    "ld1h { z9.h }, p2/Z, [x7]\n"
+    "fmla z16.h, p3/M, z1.h, z12.h\n"
+    "fmla z17.h, p3/M, z0.h, z12.h\n"
+    "ld1h { z12.h }, p2/Z, [x7, x14, LSL #1]\n"
+    "fmla z18.h, p3/M, z2.h, z11.h\n"
+    "fmla z19.h, p3/M, z1.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x13]\n"
+    "fmla z22.h, p3/M, z7.h, z10.h\n"
+    "fmla z23.h, p3/M, z6.h, z10.h\n"
+    "fmla z25.h, p3/M, z5.h, z10.h\n"
+    "fmla z26.h, p3/M, z4.h, z10.h\n"
+    "fmla z27.h, p3/M, z3.h, z10.h\n"
+    "fmla z29.h, p3/M, z2.h, z10.h\n"
+    "fmla z30.h, p3/M, z1.h, z10.h\n"
+    "fmla z31.h, p3/M, z0.h, z10.h\n"
+    "ld1h { z10.h }, p2/Z, [x7, x6, LSL #1]\n"
+    "fmla z20.h, p3/M, z0.h, z9.h\n"
+    "fmla z24.h, p3/M, z6.h, z11.h\n"
+    "fmla z28.h, p3/M, z3.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x13, x14, LSL #1]\n"
+    "fmla z16.h, p3/M, z3.h, z9.h\n"
+    "fmla z21.h, p3/M, z1.h, z10.h\n"
+    "fmla z19.h, p3/M, z5.h, z12.h\n"
+    "fmla z23.h, p3/M, z2.h, z12.h\n"
+    "fmla z17.h, p3/M, z4.h, z10.h\n"
+    "ld1h { z12.h }, p2/Z, [x7, x8, LSL #1]\n"
+    "fmla z18.h, p3/M, z3.h, z10.h\n"
+    "fmla z22.h, p3/M, z0.h, z10.h\n"
+    "fmla z27.h, p3/M, z8.h, z11.h\n"
+    "fmla z31.h, p3/M, z5.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x12, x3, LSL #1]\n"
+    "fmla z20.h, p3/M, z2.h, z10.h\n"
+    "fmla z21.h, p3/M, z2.h, z12.h\n"
+    "fmla z16.h, p3/M, z5.h, z10.h\n"
+    "fmla z17.h, p3/M, z5.h, z12.h\n"
+    "ld1h { z10.h }, p2/Z, [x17, x3, LSL #1]\n"
+    "fmla z18.h, p3/M, z4.h, z12.h\n"
+    "fmla z19.h, p3/M, z3.h, z12.h\n"
+    "fmla z22.h, p3/M, z1.h, z12.h\n"
+    "fmla z23.h, p3/M, z0.h, z12.h\n"
+    "ld1h { z12.h }, p2/Z, [x17, x16, LSL #1]\n"
+    "fmla z28.h, p3/M, z7.h, z11.h\n"
+    "fmla z29.h, p3/M, z6.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x12, x16, LSL #1]\n"
+    "fmla z20.h, p3/M, z4.h, z10.h\n"
+    "fmla z21.h, p3/M, z3.h, z10.h\n"
+    "fmla z24.h, p3/M, z1.h, z10.h\n"
+    "fmla z25.h, p3/M, z0.h, z10.h\n"
+    "fmla z16.h, p3/M, z7.h, z10.h\n"
+    "fmla z17.h, p3/M, z6.h, z10.h\n"
+    "ld1h { z10.h }, p2/Z, [x4, x6, LSL #1]\n"
+    "fmla z30.h, p3/M, z8.h, z11.h\n"
+    "fmla z31.h, p3/M, z7.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x15, x3, LSL #1]\n"
+    "fmla z18.h, p3/M, z8.h, z12.h\n"
+    "fmla z19.h, p3/M, z7.h, z12.h\n"
+    "fmla z22.h, p3/M, z5.h, z12.h\n"
+    "fmla z23.h, p3/M, z4.h, z12.h\n"
+    "fmla z26.h, p3/M, z2.h, z12.h\n"
+    "fmla z27.h, p3/M, z1.h, z12.h\n"
+    "ld1h { z12.h }, p2/Z, [x4, x8, LSL #1]\n"
+    "fmla z20.h, p3/M, z7.h, z11.h\n"
+    "fmla z21.h, p3/M, z6.h, z11.h\n"
+    "fmla z24.h, p3/M, z4.h, z11.h\n"
+    "fmla z25.h, p3/M, z3.h, z11.h\n"
+    "fmla z28.h, p3/M, z1.h, z11.h\n"
+    "fmla z29.h, p3/M, z0.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x15, x16, LSL #1]\n"
+    "fmla z16.h, p3/M, z2.h, z10.h\n"
+    "fmla z17.h, p3/M, z1.h, z10.h\n"
+    "fmla z18.h, p3/M, z0.h, z10.h\n"
+    "ld1h { z10.h }, p2/Z, [x17]\n"
+    "fmla z19.h, p3/M, z0.h, z12.h\n"
+    "fmla z30.h, p3/M, z2.h, z11.h\n"
+    "fmla z22.h, p3/M, z8.h, z11.h\n"
+    "fmla z23.h, p3/M, z7.h, z11.h\n"
+    "fmla z26.h, p3/M, z5.h, z11.h\n"
+    "fmla z20.h, p3/M, z3.h, z10.h\n"
+    "fmla z24.h, p3/M, z0.h, z10.h\n"
+    "fmla z27.h, p3/M, z4.h, z11.h\n"
+    "fmla z31.h, p3/M, z1.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x13, x6, LSL #1]\n"
+    "fmla z17.h, p3/M, z2.h, z12.h\n"
+    "fmla z18.h, p3/M, z1.h, z12.h\n"
+    "ld1h { z12.h }, p2/Z, [x17, x14, LSL #1]\n"
+    "fmla z16.h, p3/M, z6.h, z10.h\n"
+    "ld1h { z10.h }, p2/Z, [x15]\n"
+    "fmla z29.h, p3/M, z4.h, z11.h\n"
+    "fmla z30.h, p3/M, z3.h, z11.h\n"
+    "fmla z25.h, p3/M, z7.h, z11.h\n"
+    "fmla z19.h, p3/M, z8.h, z12.h\n"
+    "fmla z23.h, p3/M, z5.h, z12.h\n"
+    "fmla z27.h, p3/M, z2.h, z12.h\n"
+    "fmla z20.h, p3/M, z6.h, z10.h\n"
+    "ld1h { z12.h }, p2/Z, [x15, x14, LSL #1]\n"
+    "fmla z24.h, p3/M, z3.h, z10.h\n"
+    "fmla z28.h, p3/M, z0.h, z10.h\n"
+    "ld1h { z10.h }, p2/Z, [x12, x6, LSL #1]\n"
+    "fmla z26.h, p3/M, z6.h, z11.h\n"
+    "fmla z31.h, p3/M, z2.h, z12.h\n"
+    "fmla z23.h, p3/M, z8.h, z12.h\n"
+    "fmla z29.h, p3/M, z7.h, z10.h\n"
+    "fmla z30.h, p3/M, z6.h, z10.h\n"
+    "fmla z27.h, p3/M, z5.h, z12.h\n"
+    "ld1h { z12.h }, p2/Z, [x12, x8, LSL #1]\n"
+    "fmla z24.h, p3/M, z8.h, z11.h\n"
+    "fmla z28.h, p3/M, z5.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x13, x8, LSL #1]\n"
+    "fmla z29.h, p3/M, z5.h, z11.h\n"
+    "fmla z30.h, p3/M, z4.h, z11.h\n"
+    "fmla z31.h, p3/M, z3.h, z11.h\n"
+    "fmla z25.h, p3/M, z8.h, z11.h\n"
+    "fmla z26.h, p3/M, z7.h, z11.h\n"
+    "fmla z27.h, p3/M, z6.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x7, x16, LSL #1]\n"
+    "fmla z28.h, p3/M, z8.h, z10.h\n"
+    "ld1h { z10.h }, p2/Z, [x7, x3, LSL #1]\n"
+    "fmla z29.h, p3/M, z8.h, z12.h\n"
+    "fmla z30.h, p3/M, z7.h, z12.h\n"
+    "fmla z31.h, p3/M, z6.h, z12.h\n"
+    "ld1h { z12.h }, p2/Z, [x13, x3, LSL #1]\n"
+    "fmla z18.h, p3/M, z5.h, z11.h\n"
+    "fmla z19.h, p3/M, z4.h, z11.h\n"
+    "fmla z16.h, p3/M, z4.h, z10.h\n"
+    "fmla z17.h, p3/M, z3.h, z10.h\n"
+    "fmla z20.h, p3/M, z1.h, z10.h\n"
+    "fmla z21.h, p3/M, z0.h, z10.h\n"
+    "ld1h { z10.h }, p2/Z, [x13, x16, LSL #1]\n"
+    "fmla z22.h, p3/M, z2.h, z11.h\n"
+    "fmla z23.h, p3/M, z1.h, z11.h\n"
+    "fmla z24.h, p3/M, z7.h, z12.h\n"
+    "fmla z25.h, p3/M, z6.h, z12.h\n"
+    "fmla z28.h, p3/M, z4.h, z12.h\n"
+    "fmla z29.h, p3/M, z3.h, z12.h\n"
+    "fmla z26.h, p3/M, z8.h, z10.h\n"
+    "fmla z27.h, p3/M, z7.h, z10.h\n"
+    "fmla z30.h, p3/M, z5.h, z10.h\n"
+    "fmla z31.h, p3/M, z4.h, z10.h\n"
+    ".inst 0xc16dc9d0  // fclamp { z16.h-z19.h }, z14.h, z13.h\n"
+    ".inst 0xc16dc9d4  // fclamp { z20.h-z23.h }, z14.h, z13.h\n"
+    ".inst 0xc16dc9d8  // fclamp { z24.h-z27.h }, z14.h, z13.h\n"
+    ".inst 0xc16dc9dc  // fclamp { z28.h-z31.h }, z14.h, z13.h\n"
+    "st1h { z16.h }, p0, [x27]\n"
+    "st1h { z17.h }, p0, [x27, x9, LSL #1]\n"
+    "st1h { z18.h }, p0, [x27, x26, LSL #1]\n"
+    "st1h { z19.h }, p0, [x27, x25, LSL #1]\n"
+    "st1h { z20.h }, p0, [x24]\n"
+    "st1h { z21.h }, p0, [x24, x9, LSL #1]\n"
+    "st1h { z22.h }, p0, [x24, x26, LSL #1]\n"
+    "st1h { z23.h }, p0, [x24, x25, LSL #1]\n"
+    "st1h { z24.h }, p0, [x23]\n"
+    "st1h { z25.h }, p0, [x23, x9, LSL #1]\n"
+    "st1h { z26.h }, p0, [x23, x26, LSL #1]\n"
+    "st1h { z27.h }, p0, [x23, x25, LSL #1]\n"
+    "st1h { z28.h }, p0, [x22]\n"
+    "st1h { z29.h }, p0, [x22, x9, LSL #1]\n"
+    "st1h { z30.h }, p0, [x22, x26, LSL #1]\n"
+    "st1h { z31.h }, p0, [x22, x25, LSL #1]\n"
+    "blt 1b\n"
+    ".inst 0xd503467f  // SMSTOP\n"
+    // No outputs: results are written to memory through the pointers in Args.
+    :
+    : [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct)
+    : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+  );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME2)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp
new file mode 100644
index 0000000000..2e6f1123a4
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp
@@ -0,0 +1,653 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(ARM_COMPUTE_ENABLE_SME2)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sme2_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(  // SME2 FP16 NHWC depthwise 3x3, stride 1, 4x4 output tile; "indirect" variant: inputs arrive as an array of per-point pointers
+  const __fp16 *const *const input_ptrs,
+  __fp16 *const *const outptrs,
+  const void *params,
+  unsigned int n_channels,
+  const __fp16 activation_min,
+  const __fp16 activation_max
+)
+{
+  struct Args  // Argument block read by the asm via offsetof(); field layout must stay in sync with the %[offsetof_*] operands below
+  {
+    __fp16 *const *outptrs;
+    const void *params;
+    const __fp16 min, max;  // activation clamp bounds, broadcast into z15/z13 below
+    const __fp16 *inptrs[36];  // 36 input points feed the 4x4 output tile of a 3x3 stride-1 kernel
+
+    Args(
+      const __fp16 *const *const input_ptrs,
+      __fp16 *const *const outptrs,
+      const void *const params,
+      const __fp16 min,
+      const __fp16 max
+    ) : outptrs(outptrs), params(params), min(min), max(max)
+    {
+      inptrs[0] = input_ptrs[14];  // permutation is the generator-chosen load order, not row-major
+      inptrs[1] = input_ptrs[0];
+      inptrs[2] = input_ptrs[5];
+      inptrs[3] = input_ptrs[15];
+      inptrs[4] = input_ptrs[30];
+      inptrs[5] = input_ptrs[35];
+      inptrs[6] = input_ptrs[20];
+      inptrs[7] = input_ptrs[1];
+      inptrs[8] = input_ptrs[4];
+      inptrs[9] = input_ptrs[21];
+      inptrs[10] = input_ptrs[6];
+      inptrs[11] = input_ptrs[11];
+      inptrs[12] = input_ptrs[24];
+      inptrs[13] = input_ptrs[8];
+      inptrs[14] = input_ptrs[29];
+      inptrs[15] = input_ptrs[9];
+      inptrs[16] = input_ptrs[31];
+      inptrs[17] = input_ptrs[13];
+      inptrs[18] = input_ptrs[34];
+      inptrs[19] = input_ptrs[16];
+      inptrs[20] = input_ptrs[2];
+      inptrs[21] = input_ptrs[19];
+      inptrs[22] = input_ptrs[3];
+      inptrs[23] = input_ptrs[12];
+      inptrs[24] = input_ptrs[22];
+      inptrs[25] = input_ptrs[17];
+      inptrs[26] = input_ptrs[18];
+      inptrs[27] = input_ptrs[26];
+      inptrs[28] = input_ptrs[23];
+      inptrs[29] = input_ptrs[32];
+      inptrs[30] = input_ptrs[27];
+      inptrs[31] = input_ptrs[33];
+      inptrs[32] = input_ptrs[7];
+      inptrs[33] = input_ptrs[10];
+      inptrs[34] = input_ptrs[25];
+      inptrs[35] = input_ptrs[28];
+
+    }
+  };
+
+  Args params_struct(input_ptrs, outptrs, params,
+                     activation_min, activation_max);
+
+  __asm__ __volatile__(
+    "ldr x17, [%x[params_struct], %[offsetof_args_params]]\n"  // x17 -> packed parameters (bias z14 + nine weights z0-z8 per channel block)
+    ".inst 0xd503477f  // SMSTART ZA\n"  // enter streaming SVE mode
+    "add x16, %x[params_struct], %[offsetof_Args_inptrs]\n"  // x16 -> the 36-entry input-pointer table
+    "mov x15, #0x0\n"  // x15 = current channel offset (elements)
+    "ptrue p3.b\n"
+    ".inst 0x25207810  // ptrue pn8.b\n"
+    "ldp x14, x13, [x16, #0x0]\n"
+    "ldp x12, x11, [x16, #0x10]\n"
+    "cnth x10\n"  // x10 = vector length in halfwords (one channel block)
+    "whilelt p2.h, XZR, %x[n_channels]\n"  // p2 guards the (possibly partial) current channel block
+    "ld1rh { z15.h }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"  // z15 = activation_min, broadcast
+    "ld1h { z14.h }, p3/Z, [x17]\n"  // z14 = bias; seeds every accumulator via movprfx below
+    "addvl x17, x17, #1\n"
+    "cmp x10, %x[n_channels]\n"
+    "ldr x9, [%x[params_struct], %[offsetof_args_outptrs]]\n"  // x9 -> the 16 output pointers (4x4 tile)
+    ".inst 0xa040a220  // ld1h { z0.h-z3.h }, pn8.b/Z, [x17]\n"
+    "addvl x17, x17, #4\n"
+    "sub x28, XZR, x10\n"
+    ".inst 0xa040a224  // ld1h { z4.h-z7.h }, pn8.b/Z, [x17]\n"
+    "addvl x17, x17, #4\n"
+    "ld1rh { z13.h }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"  // z13 = activation_max, broadcast
+    "ld1h { z8.h }, p3/Z, [x17]\n"
+    "addvl x17, x17, #1\n"
+    "ld1h { z9.h }, p2/Z, [x14, x15, LSL #1]\n"
+    "ld1h { z10.h }, p2/Z, [x13, x15, LSL #1]\n"
+    "ld1h { z11.h }, p2/Z, [x12, x15, LSL #1]\n"
+    "ld1h { z12.h }, p2/Z, [x11, x15, LSL #1]\n"
+    "bge 2f\n"  // only one (possibly partial) block: skip straight to the tail
+    "1:"  // Channel loop
+    "movprfx z21, z14\n fmla z21.h, p3/M, z4.h, z9.h\n"  // z16-z31 accumulate the 16 outputs, each seeded with the bias in z14
+    "movprfx z16, z14\n fmla z16.h, p3/M, z8.h, z9.h\n"
+    "ldr x27, [x16, #0x20]\n"
+    "inch x28\n"
+    "movprfx z22, z14\n fmla z22.h, p3/M, z3.h, z9.h\n"
+    "movprfx z25, z14\n fmla z25.h, p3/M, z1.h, z9.h\n"
+    "ldr x26, [x16, #0x30]\n"
+    "mov p1.b, p2.b\n"
+    "movprfx z26, z14\n fmla z26.h, p3/M, z0.h, z9.h\n"
+    "ldr x25, [x16, #0x28]\n"
+    "movprfx z17, z14\n fmla z17.h, p3/M, z7.h, z9.h\n"
+    "whilelt p0.h, x10, %x[n_channels]\n"
+    "movprfx z18, z14\n fmla z18.h, p3/M, z6.h, z9.h\n"
+    "movprfx z20, z14\n fmla z20.h, p3/M, z5.h, z9.h\n"
+    "ldr x24, [x16, #0x38]\n"
+    "fmla z21.h, p3/M, z5.h, z12.h\n"
+    "movprfx z24, z14\n fmla z24.h, p3/M, z2.h, z9.h\n"
+    "ld1h { z9.h }, p2/Z, [x26, x15, LSL #1]\n"
+    "ldr x14, [x16, #0x40]\n"
+    "fmla z16.h, p3/M, z0.h, z10.h\n"
+    "movprfx z19, z14\n fmla z19.h, p3/M, z2.h, z11.h\n"
+    "ld1h { z10.h }, p2/Z, [x27, x15, LSL #1]\n"
+    "ldr x13, [x16, #0x48]\n"
+    "fmla z22.h, p3/M, z4.h, z12.h\n"
+    "fmla z25.h, p3/M, z2.h, z12.h\n"
+    "ld1h { z11.h }, p2/Z, [x25, x15, LSL #1]\n"
+    "ldr x12, [x16, #0x50]\n"
+    "fmla z26.h, p3/M, z1.h, z12.h\n"
+    "fmla z17.h, p3/M, z8.h, z12.h\n"
+    "ldr x27, [x16, #0x60]\n"
+    "fmla z18.h, p3/M, z7.h, z12.h\n"
+    "movprfx z28, z14\n fmla z28.h, p3/M, z6.h, z10.h\n"
+    "ld1h { z10.h }, p2/Z, [x13, x15, LSL #1]\n"
+    "ldr x25, [x16, #0x68]\n"
+    "fmla z21.h, p3/M, z7.h, z9.h\n"
+    "fmla z19.h, p3/M, z6.h, z12.h\n"
+    "ldr x11, [x16, #0x58]\n"
+    "movprfx z23, z14\n fmla z23.h, p3/M, z3.h, z12.h\n"
+    "movprfx z27, z14\n fmla z27.h, p3/M, z0.h, z12.h\n"
+    "ld1h { z12.h }, p2/Z, [x24, x15, LSL #1]\n"
+    "ldr x26, [x16, #0x70]\n"
+    "movprfx z31, z14\n fmla z31.h, p3/M, z8.h, z11.h\n"
+    "fmla z22.h, p3/M, z6.h, z9.h\n"
+    "ld1h { z11.h }, p2/Z, [x14, x15, LSL #1]\n"
+    "ldr x24, [x16, #0x78]\n"
+    "fmla z25.h, p3/M, z4.h, z9.h\n"
+    "fmla z26.h, p3/M, z3.h, z9.h\n"
+    "ldr x14, [x16, #0x80]\n"
+    "movprfx z29, z14\n fmla z29.h, p3/M, z1.h, z9.h\n"
+    "movprfx z30, z14\n fmla z30.h, p3/M, z0.h, z9.h\n"
+    "ldr x13, [x16, #0x88]\n"
+    "ld1h { z14.h }, p3/Z, [x17]\n"  // reload bias for the next channel block
+    "fmla z20.h, p3/M, z8.h, z9.h\n"
+    "fmla z24.h, p3/M, z5.h, z9.h\n"
+    "ldr x23, [x9, #0x0]\n"
+    "addvl x17, x17, #1\n"
+    "fmla z28.h, p3/M, z2.h, z9.h\n"
+    "fmla z16.h, p3/M, z1.h, z12.h\n"
+    "ld1h { z9.h }, p2/Z, [x12, x15, LSL #1]\n"
+    "ldr x12, [x16, #0x90]\n"
+    "fmla z17.h, p3/M, z0.h, z12.h\n"
+    "fmla z18.h, p3/M, z2.h, z11.h\n"
+    "ld1h { z12.h }, p2/Z, [x11, x15, LSL #1]\n"
+    "ldr x11, [x16, #0x98]\n"
+    "fmla z21.h, p3/M, z8.h, z10.h\n"
+    "fmla z19.h, p3/M, z1.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x27, x15, LSL #1]\n"
+    "ldr x27, [x16, #0xa0]\n"
+    "fmla z22.h, p3/M, z7.h, z10.h\n"
+    "fmla z23.h, p3/M, z6.h, z10.h\n"
+    "ldr x22, [x9, #0x8]\n"
+    "fmla z25.h, p3/M, z5.h, z10.h\n"
+    "fmla z26.h, p3/M, z4.h, z10.h\n"
+    "ldr x21, [x9, #0x10]\n"
+    "fmla z27.h, p3/M, z3.h, z10.h\n"
+    "fmla z29.h, p3/M, z2.h, z10.h\n"
+    "ldr x20, [x9, #0x18]\n"
+    "fmla z30.h, p3/M, z1.h, z10.h\n"
+    "fmla z31.h, p3/M, z0.h, z10.h\n"
+    "ld1h { z10.h }, p2/Z, [x25, x15, LSL #1]\n"
+    "ldr x25, [x16, #0xa8]\n"
+    "fmla z16.h, p3/M, z3.h, z9.h\n"
+    "fmla z20.h, p3/M, z0.h, z9.h\n"
+    "fmla z24.h, p3/M, z6.h, z11.h\n"
+    "fmla z28.h, p3/M, z3.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x26, x15, LSL #1]\n"
+    "ldr x26, [x16, #0xb0]\n"
+    "fmla z17.h, p3/M, z4.h, z10.h\n"
+    "fmla z18.h, p3/M, z3.h, z10.h\n"
+    "fmla z21.h, p3/M, z1.h, z10.h\n"
+    "fmla z19.h, p3/M, z5.h, z12.h\n"
+    "fmla z23.h, p3/M, z2.h, z12.h\n"
+    "fmla z22.h, p3/M, z0.h, z10.h\n"
+    "ld1h { z12.h }, p2/Z, [x24, x15, LSL #1]\n"
+    "ldr x24, [x16, #0xb8]\n"
+    "fmla z27.h, p3/M, z8.h, z11.h\n"
+    "fmla z31.h, p3/M, z5.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x14, x15, LSL #1]\n"
+    "ldr x14, [x16, #0xc0]\n"
+    "fmla z16.h, p3/M, z5.h, z10.h\n"
+    "fmla z20.h, p3/M, z2.h, z10.h\n"
+    "ld1h { z10.h }, p2/Z, [x13, x15, LSL #1]\n"
+    "ldr x13, [x16, #0xc8]\n"
+    "fmla z17.h, p3/M, z5.h, z12.h\n"
+    "fmla z18.h, p3/M, z4.h, z12.h\n"
+    "fmla z21.h, p3/M, z2.h, z12.h\n"
+    "fmla z19.h, p3/M, z3.h, z12.h\n"
+    "fmla z22.h, p3/M, z1.h, z12.h\n"
+    "fmla z23.h, p3/M, z0.h, z12.h\n"
+    "ld1h { z12.h }, p2/Z, [x11, x15, LSL #1]\n"
+    "ldr x11, [x16, #0xd8]\n"
+    "fmla z28.h, p3/M, z7.h, z11.h\n"
+    "fmla z29.h, p3/M, z6.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x12, x15, LSL #1]\n"
+    "ldr x12, [x16, #0xd0]\n"
+    "fmla z16.h, p3/M, z7.h, z10.h\n"
+    "fmla z17.h, p3/M, z6.h, z10.h\n"
+    "fmla z20.h, p3/M, z4.h, z10.h\n"
+    "fmla z21.h, p3/M, z3.h, z10.h\n"
+    "fmla z24.h, p3/M, z1.h, z10.h\n"
+    "fmla z25.h, p3/M, z0.h, z10.h\n"
+    "ld1h { z10.h }, p2/Z, [x27, x15, LSL #1]\n"
+    "ldr x27, [x16, #0xe0]\n"
+    "fmla z18.h, p3/M, z8.h, z12.h\n"
+    "fmla z30.h, p3/M, z8.h, z11.h\n"
+    "fmla z31.h, p3/M, z7.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x25, x15, LSL #1]\n"
+    "fmla z27.h, p3/M, z1.h, z12.h\n"
+    "ldr x25, [x16, #0xe8]\n"
+    "fmla z19.h, p3/M, z7.h, z12.h\n"
+    "fmla z22.h, p3/M, z5.h, z12.h\n"
+    "fmla z23.h, p3/M, z4.h, z12.h\n"
+    "fmla z26.h, p3/M, z2.h, z12.h\n"
+    "ld1h { z12.h }, p2/Z, [x26, x15, LSL #1]\n"
+    "ldr x26, [x16, #0xf0]\n"
+    "fmla z16.h, p3/M, z2.h, z10.h\n"
+    "fmla z17.h, p3/M, z1.h, z10.h\n"
+    "fmla z18.h, p3/M, z0.h, z10.h\n"
+    "fmla z20.h, p3/M, z7.h, z11.h\n"
+    "ld1h { z10.h }, p2/Z, [x24, x15, LSL #1]\n"
+    "ldr x24, [x16, #0xf8]\n"
+    "fmla z21.h, p3/M, z6.h, z11.h\n"
+    "fmla z24.h, p3/M, z4.h, z11.h\n"
+    "fmla z25.h, p3/M, z3.h, z11.h\n"
+    "fmla z28.h, p3/M, z1.h, z11.h\n"
+    "fmla z29.h, p3/M, z0.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x14, x15, LSL #1]\n"
+    "fmla z17.h, p3/M, z2.h, z12.h\n"
+    "ldr x14, [x16, #0x100]\n"
+    "fmla z18.h, p3/M, z1.h, z12.h\n"
+    "fmla z19.h, p3/M, z0.h, z12.h\n"
+    "ld1h { z12.h }, p2/Z, [x13, x15, LSL #1]\n"
+    "ldr x13, [x16, #0x108]\n"
+    "fmla z16.h, p3/M, z6.h, z10.h\n"
+    "fmla z20.h, p3/M, z3.h, z10.h\n"
+    "fmla z27.h, p3/M, z4.h, z11.h\n"
+    "fmla z30.h, p3/M, z2.h, z11.h\n"
+    "fmla z24.h, p3/M, z0.h, z10.h\n"
+    "fmla z22.h, p3/M, z8.h, z11.h\n"
+    "ld1h { z10.h }, p2/Z, [x12, x15, LSL #1]\n"
+    "ldr x12, [x16, #0x110]\n"
+    "fmla z23.h, p3/M, z7.h, z11.h\n"
+    "fmla z26.h, p3/M, z5.h, z11.h\n"
+    "fmla z31.h, p3/M, z1.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x11, x15, LSL #1]\n"
+    "fmla z19.h, p3/M, z8.h, z12.h\n"
+    "ldr x11, [x16, #0x118]\n"
+    "fmla z27.h, p3/M, z2.h, z12.h\n"
+    "fmla z28.h, p3/M, z0.h, z10.h\n"
+    "fmla z20.h, p3/M, z6.h, z10.h\n"
+    "fmla z24.h, p3/M, z3.h, z10.h\n"
+    "ld1h { z10.h }, p2/Z, [x25, x15, LSL #1]\n"
+    "fmla z29.h, p3/M, z4.h, z11.h\n"
+    "fmla z30.h, p3/M, z3.h, z11.h\n"
+    "fmla z23.h, p3/M, z5.h, z12.h\n"
+    "ld1h { z12.h }, p2/Z, [x27, x15, LSL #1]\n"
+    "fmla z25.h, p3/M, z7.h, z11.h\n"
+    "fmla z26.h, p3/M, z6.h, z11.h\n"
+    "fmla z28.h, p3/M, z5.h, z11.h\n"
+    "fmla z24.h, p3/M, z8.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x26, x15, LSL #1]\n"
+    "fmla z27.h, p3/M, z5.h, z12.h\n"
+    "fmla z31.h, p3/M, z2.h, z12.h\n"
+    "fmla z29.h, p3/M, z7.h, z10.h\n"
+    "fmla z30.h, p3/M, z6.h, z10.h\n"
+    "fmla z23.h, p3/M, z8.h, z12.h\n"
+    "ld1h { z12.h }, p2/Z, [x24, x15, LSL #1]\n"
+    "fmla z28.h, p3/M, z8.h, z10.h\n"
+    "fmla z25.h, p3/M, z8.h, z11.h\n"
+    "ld1h { z10.h }, p2/Z, [x14, x15, LSL #1]\n"
+    "fmla z26.h, p3/M, z7.h, z11.h\n"
+    "fmla z27.h, p3/M, z6.h, z11.h\n"
+    "fmla z29.h, p3/M, z5.h, z11.h\n"
+    "fmla z31.h, p3/M, z3.h, z11.h\n"
+    "fmla z30.h, p3/M, z4.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x13, x15, LSL #1]\n"
+    "fmla z16.h, p3/M, z4.h, z10.h\n"
+    "ldp x14, x13, [x16, #0x0]\n"  // re-seed input pointers and preload the first inputs of the next block
+    "fmla z17.h, p3/M, z3.h, z10.h\n"
+    "fmla z20.h, p3/M, z1.h, z10.h\n"
+    "fmla z21.h, p3/M, z0.h, z10.h\n"
+    "ld1h { z10.h }, p2/Z, [x11, x15, LSL #1]\n"
+    "fmla z18.h, p3/M, z5.h, z11.h\n"
+    "fmla z19.h, p3/M, z4.h, z11.h\n"
+    "fmla z29.h, p3/M, z8.h, z12.h\n"
+    "ld1h { z9.h }, p0/Z, [x14, x10, LSL #1]\n"
+    "fmla z30.h, p3/M, z7.h, z12.h\n"
+    "fmla z31.h, p3/M, z6.h, z12.h\n"
+    "ld1h { z12.h }, p2/Z, [x12, x15, LSL #1]\n"
+    "ldp x12, x11, [x16, #0x10]\n"
+    "fmla z22.h, p3/M, z2.h, z11.h\n"
+    "fmla z23.h, p3/M, z1.h, z11.h\n"
+    "inch x15\n"
+    "fmla z26.h, p3/M, z8.h, z10.h\n"
+    "fmla z27.h, p3/M, z7.h, z10.h\n"
+    "whilelt p2.h, x15, %x[n_channels]\n"
+    ".inst 0xc16dc9f0  // fclamp { z16.h-z19.h }, z15.h, z13.h\n"  // apply activation clamp to [min, max]
+    "fmla z24.h, p3/M, z7.h, z12.h\n"
+    "ld1h { z11.h }, p0/Z, [x12, x10, LSL #1]\n"
+    "fmla z25.h, p3/M, z6.h, z12.h\n"
+    "fmla z28.h, p3/M, z4.h, z12.h\n"
+    ".inst 0xc16dc9f4  // fclamp { z20.h-z23.h }, z15.h, z13.h\n"
+    "fmla z29.h, p3/M, z3.h, z12.h\n"
+    "ld1h { z12.h }, p0/Z, [x11, x10, LSL #1]\n"
+    "fmla z30.h, p3/M, z5.h, z10.h\n"
+    "fmla z31.h, p3/M, z4.h, z10.h\n"
+    "ld1h { z10.h }, p0/Z, [x13, x10, LSL #1]\n"
+    "inch x10\n"
+    "st1h { z16.h }, p1, [x23, x28, LSL #1]\n"  // store the 16 outputs through the pointers held at [x9]
+    "ldr x23, [x9, #0x20]\n"
+    ".inst 0xa040a220  // ld1h { z0.h-z3.h }, pn8.b/Z, [x17]\n"  // reload weights for the next channel block
+    "addvl x17, x17, #4\n"
+    "st1h { z17.h }, p1, [x22, x28, LSL #1]\n"
+    "ldr x22, [x9, #0x28]\n"
+    ".inst 0xc16dc9f8  // fclamp { z24.h-z27.h }, z15.h, z13.h\n"
+    ".inst 0xa040a224  // ld1h { z4.h-z7.h }, pn8.b/Z, [x17]\n"
+    "st1h { z18.h }, p1, [x21, x28, LSL #1]\n"
+    "ldr x21, [x9, #0x30]\n"
+    "addvl x17, x17, #4\n"
+    "cmp x10, %x[n_channels]\n"
+    "st1h { z19.h }, p1, [x20, x28, LSL #1]\n"
+    "ldr x20, [x9, #0x38]\n"
+    ".inst 0xc16dc9fc  // fclamp { z28.h-z31.h }, z15.h, z13.h\n"
+    "ld1h { z8.h }, p3/Z, [x17]\n"
+    "st1h { z20.h }, p1, [x23, x28, LSL #1]\n"
+    "ldr x23, [x9, #0x40]\n"
+    "addvl x17, x17, #1\n"
+    "st1h { z21.h }, p1, [x22, x28, LSL #1]\n"
+    "ldr x22, [x9, #0x48]\n"
+    "st1h { z22.h }, p1, [x21, x28, LSL #1]\n"
+    "ldr x21, [x9, #0x50]\n"
+    "st1h { z23.h }, p1, [x20, x28, LSL #1]\n"
+    "ldr x20, [x9, #0x58]\n"
+    "st1h { z24.h }, p1, [x23, x28, LSL #1]\n"
+    "ldr x23, [x9, #0x60]\n"
+    "st1h { z25.h }, p1, [x22, x28, LSL #1]\n"
+    "ldr x22, [x9, #0x68]\n"
+    "st1h { z26.h }, p1, [x21, x28, LSL #1]\n"
+    "ldr x21, [x9, #0x70]\n"
+    "st1h { z27.h }, p1, [x20, x28, LSL #1]\n"
+    "ldr x20, [x9, #0x78]\n"
+    "st1h { z28.h }, p1, [x23, x28, LSL #1]\n"
+    "st1h { z29.h }, p1, [x22, x28, LSL #1]\n"
+    "st1h { z30.h }, p1, [x21, x28, LSL #1]\n"
+    "st1h { z31.h }, p1, [x20, x28, LSL #1]\n"
+    "blt 1b\n"  // loop while at least one more full channel block remains
+    "2:"  // Channel tail: same computation as the loop body, without the next-block preloads
+    "movprfx z21, z14\n fmla z21.h, p3/M, z4.h, z9.h\n"
+    "movprfx z16, z14\n fmla z16.h, p3/M, z8.h, z9.h\n"
+    "ldr x27, [x16, #0x20]\n"
+    "inch x28\n"
+    "movprfx z22, z14\n fmla z22.h, p3/M, z3.h, z9.h\n"
+    "movprfx z25, z14\n fmla z25.h, p3/M, z1.h, z9.h\n"
+    "ldr x26, [x16, #0x30]\n"
+    "mov p1.b, p2.b\n"
+    "movprfx z26, z14\n fmla z26.h, p3/M, z0.h, z9.h\n"
+    "ldr x25, [x16, #0x28]\n"
+    "movprfx z17, z14\n fmla z17.h, p3/M, z7.h, z9.h\n"
+    "movprfx z18, z14\n fmla z18.h, p3/M, z6.h, z9.h\n"
+    "movprfx z20, z14\n fmla z20.h, p3/M, z5.h, z9.h\n"
+    "ldr x24, [x16, #0x38]\n"
+    "fmla z21.h, p3/M, z5.h, z12.h\n"
+    "movprfx z24, z14\n fmla z24.h, p3/M, z2.h, z9.h\n"
+    "ld1h { z9.h }, p2/Z, [x26, x15, LSL #1]\n"
+    "ldr x14, [x16, #0x40]\n"
+    "fmla z16.h, p3/M, z0.h, z10.h\n"
+    "movprfx z19, z14\n fmla z19.h, p3/M, z2.h, z11.h\n"
+    "ld1h { z10.h }, p2/Z, [x27, x15, LSL #1]\n"
+    "ldr x13, [x16, #0x48]\n"
+    "fmla z22.h, p3/M, z4.h, z12.h\n"
+    "fmla z25.h, p3/M, z2.h, z12.h\n"
+    "ld1h { z11.h }, p2/Z, [x25, x15, LSL #1]\n"
+    "ldr x12, [x16, #0x50]\n"
+    "fmla z26.h, p3/M, z1.h, z12.h\n"
+    "fmla z17.h, p3/M, z8.h, z12.h\n"
+    "ldr x27, [x16, #0x60]\n"
+    "fmla z18.h, p3/M, z7.h, z12.h\n"
+    "movprfx z28, z14\n fmla z28.h, p3/M, z6.h, z10.h\n"
+    "ld1h { z10.h }, p2/Z, [x13, x15, LSL #1]\n"
+    "ldr x25, [x16, #0x68]\n"
+    "fmla z21.h, p3/M, z7.h, z9.h\n"
+    "fmla z19.h, p3/M, z6.h, z12.h\n"
+    "ldr x11, [x16, #0x58]\n"
+    "movprfx z23, z14\n fmla z23.h, p3/M, z3.h, z12.h\n"
+    "movprfx z27, z14\n fmla z27.h, p3/M, z0.h, z12.h\n"
+    "ld1h { z12.h }, p2/Z, [x24, x15, LSL #1]\n"
+    "ldr x26, [x16, #0x70]\n"
+    "movprfx z31, z14\n fmla z31.h, p3/M, z8.h, z11.h\n"
+    "fmla z22.h, p3/M, z6.h, z9.h\n"
+    "ld1h { z11.h }, p2/Z, [x14, x15, LSL #1]\n"
+    "ldr x24, [x16, #0x78]\n"
+    "fmla z25.h, p3/M, z4.h, z9.h\n"
+    "fmla z26.h, p3/M, z3.h, z9.h\n"
+    "ldr x14, [x16, #0x80]\n"
+    "movprfx z29, z14\n fmla z29.h, p3/M, z1.h, z9.h\n"
+    "movprfx z30, z14\n fmla z30.h, p3/M, z0.h, z9.h\n"
+    "ldr x13, [x16, #0x88]\n"
+    "fmla z20.h, p3/M, z8.h, z9.h\n"
+    "fmla z24.h, p3/M, z5.h, z9.h\n"
+    "ldr x23, [x9, #0x0]\n"
+    "fmla z28.h, p3/M, z2.h, z9.h\n"
+    "fmla z16.h, p3/M, z1.h, z12.h\n"
+    "ld1h { z9.h }, p2/Z, [x12, x15, LSL #1]\n"
+    "ldr x12, [x16, #0x90]\n"
+    "fmla z17.h, p3/M, z0.h, z12.h\n"
+    "fmla z18.h, p3/M, z2.h, z11.h\n"
+    "ld1h { z12.h }, p2/Z, [x11, x15, LSL #1]\n"
+    "ldr x11, [x16, #0x98]\n"
+    "fmla z21.h, p3/M, z8.h, z10.h\n"
+    "fmla z19.h, p3/M, z1.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x27, x15, LSL #1]\n"
+    "ldr x27, [x16, #0xa0]\n"
+    "fmla z22.h, p3/M, z7.h, z10.h\n"
+    "fmla z23.h, p3/M, z6.h, z10.h\n"
+    "ldr x22, [x9, #0x8]\n"
+    "fmla z25.h, p3/M, z5.h, z10.h\n"
+    "fmla z26.h, p3/M, z4.h, z10.h\n"
+    "ldr x21, [x9, #0x10]\n"
+    "fmla z27.h, p3/M, z3.h, z10.h\n"
+    "fmla z29.h, p3/M, z2.h, z10.h\n"
+    "ldr x20, [x9, #0x18]\n"
+    "fmla z30.h, p3/M, z1.h, z10.h\n"
+    "fmla z31.h, p3/M, z0.h, z10.h\n"
+    "ld1h { z10.h }, p2/Z, [x25, x15, LSL #1]\n"
+    "ldr x25, [x16, #0xa8]\n"
+    "fmla z16.h, p3/M, z3.h, z9.h\n"
+    "fmla z20.h, p3/M, z0.h, z9.h\n"
+    "fmla z24.h, p3/M, z6.h, z11.h\n"
+    "fmla z28.h, p3/M, z3.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x26, x15, LSL #1]\n"
+    "ldr x26, [x16, #0xb0]\n"
+    "fmla z17.h, p3/M, z4.h, z10.h\n"
+    "fmla z18.h, p3/M, z3.h, z10.h\n"
+    "fmla z21.h, p3/M, z1.h, z10.h\n"
+    "fmla z19.h, p3/M, z5.h, z12.h\n"
+    "fmla z23.h, p3/M, z2.h, z12.h\n"
+    "fmla z22.h, p3/M, z0.h, z10.h\n"
+    "ld1h { z12.h }, p2/Z, [x24, x15, LSL #1]\n"
+    "ldr x24, [x16, #0xb8]\n"
+    "fmla z27.h, p3/M, z8.h, z11.h\n"
+    "fmla z31.h, p3/M, z5.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x14, x15, LSL #1]\n"
+    "ldr x14, [x16, #0xc0]\n"
+    "fmla z16.h, p3/M, z5.h, z10.h\n"
+    "fmla z20.h, p3/M, z2.h, z10.h\n"
+    "ld1h { z10.h }, p2/Z, [x13, x15, LSL #1]\n"
+    "ldr x13, [x16, #0xc8]\n"
+    "fmla z17.h, p3/M, z5.h, z12.h\n"
+    "fmla z18.h, p3/M, z4.h, z12.h\n"
+    "fmla z21.h, p3/M, z2.h, z12.h\n"
+    "fmla z19.h, p3/M, z3.h, z12.h\n"
+    "fmla z22.h, p3/M, z1.h, z12.h\n"
+    "fmla z23.h, p3/M, z0.h, z12.h\n"
+    "ld1h { z12.h }, p2/Z, [x11, x15, LSL #1]\n"
+    "ldr x11, [x16, #0xd8]\n"
+    "fmla z28.h, p3/M, z7.h, z11.h\n"
+    "fmla z29.h, p3/M, z6.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x12, x15, LSL #1]\n"
+    "ldr x12, [x16, #0xd0]\n"
+    "fmla z16.h, p3/M, z7.h, z10.h\n"
+    "fmla z17.h, p3/M, z6.h, z10.h\n"
+    "fmla z20.h, p3/M, z4.h, z10.h\n"
+    "fmla z21.h, p3/M, z3.h, z10.h\n"
+    "fmla z24.h, p3/M, z1.h, z10.h\n"
+    "fmla z25.h, p3/M, z0.h, z10.h\n"
+    "ld1h { z10.h }, p2/Z, [x27, x15, LSL #1]\n"
+    "ldr x27, [x16, #0xe0]\n"
+    "fmla z18.h, p3/M, z8.h, z12.h\n"
+    "fmla z30.h, p3/M, z8.h, z11.h\n"
+    "fmla z31.h, p3/M, z7.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x25, x15, LSL #1]\n"
+    "fmla z27.h, p3/M, z1.h, z12.h\n"
+    "ldr x25, [x16, #0xe8]\n"
+    "fmla z19.h, p3/M, z7.h, z12.h\n"
+    "fmla z22.h, p3/M, z5.h, z12.h\n"
+    "fmla z23.h, p3/M, z4.h, z12.h\n"
+    "fmla z26.h, p3/M, z2.h, z12.h\n"
+    "ld1h { z12.h }, p2/Z, [x26, x15, LSL #1]\n"
+    "ldr x26, [x16, #0xf0]\n"
+    "fmla z16.h, p3/M, z2.h, z10.h\n"
+    "fmla z17.h, p3/M, z1.h, z10.h\n"
+    "fmla z18.h, p3/M, z0.h, z10.h\n"
+    "fmla z20.h, p3/M, z7.h, z11.h\n"
+    "ld1h { z10.h }, p2/Z, [x24, x15, LSL #1]\n"
+    "ldr x24, [x16, #0xf8]\n"
+    "fmla z21.h, p3/M, z6.h, z11.h\n"
+    "fmla z24.h, p3/M, z4.h, z11.h\n"
+    "fmla z25.h, p3/M, z3.h, z11.h\n"
+    "fmla z28.h, p3/M, z1.h, z11.h\n"
+    "fmla z29.h, p3/M, z0.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x14, x15, LSL #1]\n"
+    "fmla z17.h, p3/M, z2.h, z12.h\n"
+    "ldr x14, [x16, #0x100]\n"
+    "fmla z18.h, p3/M, z1.h, z12.h\n"
+    "fmla z19.h, p3/M, z0.h, z12.h\n"
+    "ld1h { z12.h }, p2/Z, [x13, x15, LSL #1]\n"
+    "ldr x13, [x16, #0x108]\n"
+    "fmla z16.h, p3/M, z6.h, z10.h\n"
+    "fmla z20.h, p3/M, z3.h, z10.h\n"
+    "fmla z27.h, p3/M, z4.h, z11.h\n"
+    "fmla z30.h, p3/M, z2.h, z11.h\n"
+    "fmla z24.h, p3/M, z0.h, z10.h\n"
+    "fmla z22.h, p3/M, z8.h, z11.h\n"
+    "ld1h { z10.h }, p2/Z, [x12, x15, LSL #1]\n"
+    "ldr x12, [x16, #0x110]\n"
+    "fmla z23.h, p3/M, z7.h, z11.h\n"
+    "fmla z26.h, p3/M, z5.h, z11.h\n"
+    "fmla z31.h, p3/M, z1.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x11, x15, LSL #1]\n"
+    "fmla z19.h, p3/M, z8.h, z12.h\n"
+    "ldr x11, [x16, #0x118]\n"
+    "fmla z27.h, p3/M, z2.h, z12.h\n"
+    "fmla z28.h, p3/M, z0.h, z10.h\n"
+    "fmla z20.h, p3/M, z6.h, z10.h\n"
+    "fmla z24.h, p3/M, z3.h, z10.h\n"
+    "ld1h { z10.h }, p2/Z, [x25, x15, LSL #1]\n"
+    "fmla z29.h, p3/M, z4.h, z11.h\n"
+    "fmla z30.h, p3/M, z3.h, z11.h\n"
+    "fmla z23.h, p3/M, z5.h, z12.h\n"
+    "ld1h { z12.h }, p2/Z, [x27, x15, LSL #1]\n"
+    "fmla z25.h, p3/M, z7.h, z11.h\n"
+    "fmla z26.h, p3/M, z6.h, z11.h\n"
+    "fmla z28.h, p3/M, z5.h, z11.h\n"
+    "fmla z24.h, p3/M, z8.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x26, x15, LSL #1]\n"
+    "fmla z27.h, p3/M, z5.h, z12.h\n"
+    "fmla z31.h, p3/M, z2.h, z12.h\n"
+    "fmla z29.h, p3/M, z7.h, z10.h\n"
+    "fmla z30.h, p3/M, z6.h, z10.h\n"
+    "fmla z23.h, p3/M, z8.h, z12.h\n"
+    "ld1h { z12.h }, p2/Z, [x24, x15, LSL #1]\n"
+    "fmla z28.h, p3/M, z8.h, z10.h\n"
+    "fmla z25.h, p3/M, z8.h, z11.h\n"
+    "ld1h { z10.h }, p2/Z, [x14, x15, LSL #1]\n"
+    "fmla z26.h, p3/M, z7.h, z11.h\n"
+    "fmla z27.h, p3/M, z6.h, z11.h\n"
+    "fmla z29.h, p3/M, z5.h, z11.h\n"
+    "fmla z31.h, p3/M, z3.h, z11.h\n"
+    "fmla z30.h, p3/M, z4.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x13, x15, LSL #1]\n"
+    "fmla z16.h, p3/M, z4.h, z10.h\n"
+    "fmla z17.h, p3/M, z3.h, z10.h\n"
+    "fmla z20.h, p3/M, z1.h, z10.h\n"
+    "fmla z21.h, p3/M, z0.h, z10.h\n"
+    "ld1h { z10.h }, p2/Z, [x11, x15, LSL #1]\n"
+    "fmla z18.h, p3/M, z5.h, z11.h\n"
+    "fmla z19.h, p3/M, z4.h, z11.h\n"
+    "fmla z29.h, p3/M, z8.h, z12.h\n"
+    "fmla z30.h, p3/M, z7.h, z12.h\n"
+    "fmla z31.h, p3/M, z6.h, z12.h\n"
+    "ld1h { z12.h }, p2/Z, [x12, x15, LSL #1]\n"
+    "fmla z22.h, p3/M, z2.h, z11.h\n"
+    "fmla z23.h, p3/M, z1.h, z11.h\n"
+    "fmla z26.h, p3/M, z8.h, z10.h\n"
+    "fmla z27.h, p3/M, z7.h, z10.h\n"
+    ".inst 0xc16dc9f0  // fclamp { z16.h-z19.h }, z15.h, z13.h\n"  // activation clamp, as in the main loop
+    "fmla z24.h, p3/M, z7.h, z12.h\n"
+    "fmla z25.h, p3/M, z6.h, z12.h\n"
+    "fmla z28.h, p3/M, z4.h, z12.h\n"
+    ".inst 0xc16dc9f4  // fclamp { z20.h-z23.h }, z15.h, z13.h\n"
+    "fmla z29.h, p3/M, z3.h, z12.h\n"
+    "fmla z30.h, p3/M, z5.h, z10.h\n"
+    "fmla z31.h, p3/M, z4.h, z10.h\n"
+    "st1h { z16.h }, p1, [x23, x28, LSL #1]\n"
+    "ldr x23, [x9, #0x20]\n"
+    "st1h { z17.h }, p1, [x22, x28, LSL #1]\n"
+    "ldr x22, [x9, #0x28]\n"
+    "st1h { z18.h }, p1, [x21, x28, LSL #1]\n"
+    "ldr x21, [x9, #0x30]\n"
+    ".inst 0xc16dc9f8  // fclamp { z24.h-z27.h }, z15.h, z13.h\n"
+    "st1h { z19.h }, p1, [x20, x28, LSL #1]\n"
+    "ldr x20, [x9, #0x38]\n"
+    "st1h { z20.h }, p1, [x23, x28, LSL #1]\n"
+    "ldr x23, [x9, #0x40]\n"
+    ".inst 0xc16dc9fc  // fclamp { z28.h-z31.h }, z15.h, z13.h\n"
+    "st1h { z21.h }, p1, [x22, x28, LSL #1]\n"
+    "ldr x22, [x9, #0x48]\n"
+    "st1h { z22.h }, p1, [x21, x28, LSL #1]\n"
+    "ldr x21, [x9, #0x50]\n"
+    "st1h { z23.h }, p1, [x20, x28, LSL #1]\n"
+    "ldr x20, [x9, #0x58]\n"
+    "st1h { z24.h }, p1, [x23, x28, LSL #1]\n"
+    "ldr x23, [x9, #0x60]\n"
+    "st1h { z25.h }, p1, [x22, x28, LSL #1]\n"
+    "ldr x22, [x9, #0x68]\n"
+    "st1h { z26.h }, p1, [x21, x28, LSL #1]\n"
+    "ldr x21, [x9, #0x70]\n"
+    "st1h { z27.h }, p1, [x20, x28, LSL #1]\n"
+    "ldr x20, [x9, #0x78]\n"
+    "st1h { z28.h }, p1, [x23, x28, LSL #1]\n"
+    "st1h { z29.h }, p1, [x22, x28, LSL #1]\n"
+    "st1h { z30.h }, p1, [x21, x28, LSL #1]\n"
+    "st1h { z31.h }, p1, [x20, x28, LSL #1]\n"
+    ".inst 0xd503467f  // SMSTOP\n"  // leave streaming mode before returning to C++
+    :
+    : [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
+    : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+  );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME2)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
new file mode 100644
index 0000000000..27fcb2e6d2
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
@@ -0,0 +1,67 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+namespace arm_conv {
+namespace depthwise {
+
+void sme2_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl(const __fp16 *const *const input_ptrs, __fp16 *const *const outptrs, const void *params, unsigned int n_channels, const __fp16 activation_min, const __fp16 activation_max);
+void sme2_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl(const unsigned int n_tile_rows, const unsigned int n_tile_cols, const __fp16 *inptr, int64_t ld_input_row, int64_t ld_input_col, __fp16 *outptr, int64_t ld_output_row, int64_t ld_output_col, const void *params, unsigned int n_channels, const __fp16 activation_min, const __fp16 activation_max);
+
+class sme2_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy<__fp16, __fp16, __fp16, __fp16>  // Strategy descriptor: FP16 NHWC 3x3 stride-2 depthwise producing a 2x2 output tile per kernel call (SME2)
+{
+  private:
+  using Parent = DepthwiseDepthfirstStrategy<__fp16, __fp16, __fp16, __fp16>;
+  Parent::IndirectKernelType m_indirect_kernel = sme2_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl;  // variant fed by an array of per-point input pointers
+  Parent::DirectKernelType m_direct_kernel = sme2_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl;  // variant walking a dense input via row/column strides
+
+  public:
+  using return_type = __fp16;
+  constexpr static auto vl_type = arm_gemm::VLType::SME;  // vector-length class used by the implementation-selection machinery
+
+  constexpr static unsigned int kernel_rows = 3;  // fixed 3x3 filter geometry for this specialisation
+  constexpr static unsigned int kernel_cols = 3;
+
+  constexpr static unsigned int stride_rows = 2;
+  constexpr static unsigned int stride_cols = 2;
+
+  constexpr static unsigned int output_rows = 2;  // each kernel invocation computes a 2x2 spatial output tile
+  constexpr static unsigned int output_cols = 2;
+
+  sme2_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst(const CPUInfo *)  // CPUInfo is accepted for interface uniformity but unused: geometry is fixed
+    : Parent(output_rows, output_cols, kernel_rows, kernel_cols, stride_rows, stride_cols) {}
+
+  arm_gemm::VLType get_vl_type(void) const override { return vl_type; }
+
+  Parent::IndirectKernelType get_indirect_kernel() const override { return m_indirect_kernel; }  // accessors let the generic driver dispatch without knowing the concrete type
+  Parent::DirectKernelType get_direct_kernel() const override { return m_direct_kernel; }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp
new file mode 100644
index 0000000000..066ce06aa6
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp
@@ -0,0 +1,374 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(ARM_COMPUTE_ENABLE_SME2)
+
+namespace arm_conv {
+namespace depthwise {
+
+// Direct (tile-loop) variant of the SME2 fp16 3x3 stride-2 depthwise kernel.
+// Walks an n_tile_rows x n_tile_cols grid of 2x2 output tiles; all input and
+// output addressing is derived inside the assembly from the strides captured
+// in the Args block below.  min/max are the activation clamp bounds applied
+// by the fclamp before each store.
+void sme2_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl(
+  const unsigned int n_tile_rows,
+  const unsigned int n_tile_cols,
+  const __fp16 *inptr,
+  int64_t ld_input_row,
+  int64_t ld_input_col,
+  __fp16 *outptr,
+  int64_t ld_output_row,
+  int64_t ld_output_col,
+  const void *params,
+  unsigned int n_channels,
+  const __fp16 activation_min,
+  const __fp16 activation_max
+)
+{
+  // Argument block read by the assembly via offsetof(); the asm also writes
+  // tile_i/tile_j back into it as it advances across the tile grid.
+  struct Args
+  {
+    const uint64_t n_tile_rows, n_tile_cols;
+    const __fp16 *inptr;
+    const uint64_t ld_input_row;
+    const uint64_t ld_input_col;
+    __fp16 *outptr;
+    const uint64_t ld_output_row;
+    const uint64_t ld_output_col;
+    const void *params;
+    const __fp16 min, max;  // activation clamp bounds
+
+    uint64_t tile_i = 0, tile_j = 0;  // current tile coordinates, maintained by the assembly
+
+    Args(
+      const unsigned int n_tile_rows,
+      const unsigned int n_tile_cols,
+      const __fp16 *inptr,
+      int64_t ld_input_row,
+      int64_t ld_input_col,
+      __fp16 *outptr,
+      int64_t ld_output_row,
+      int64_t ld_output_col,
+      const void *params,
+      const __fp16 activation_min,  // was float: take __fp16 directly, matching the members, the caller and the indirect kernel
+      const __fp16 activation_max
+    ) : n_tile_rows(n_tile_rows), n_tile_cols(n_tile_cols), inptr(inptr),
+        ld_input_row(ld_input_row), ld_input_col(ld_input_col), outptr(outptr),
+        ld_output_row(ld_output_row), ld_output_col(ld_output_col),
+        params(params), min(activation_min), max(activation_max)
+    {
+    }
+  };
+
+  Args params_struct(
+    n_tile_rows, n_tile_cols,
+    inptr, ld_input_row, ld_input_col,
+    outptr, ld_output_row, ld_output_col,
+    params, activation_min, activation_max
+  );
+
+  // Generated assembly: enters SME streaming mode (SMSTART), loops over tiles,
+  // and per tile runs a vector-length-strided channel loop plus a predicated
+  // channel tail before exiting streaming mode (SMSTOP).
+  __asm__ __volatile__(
+    ".inst 0xd503477f // SMSTART ZA\n"
+    "mov x2, #0x0\n"
+    "mov x3, #0x0\n"
+    "ptrue p3.b\n"
+    ".inst 0x25207810 // ptrue pn8.b\n"
+    "1:" // Tile loop
+    "str x2, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+    "mov x22, #0x4\n"
+    "str x3, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+    "ldr x21, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
+    "ldr x4, [%x[params_struct], %[offsetof_args_ld_input_col]]\n"
+    "ldr x5, [%x[params_struct], %[offsetof_args_inptr]]\n"
+    "mul x20, x2, x21\n" // offset = tile_i * ld_input_row
+    "ldr x6, [%x[params_struct], %[offsetof_args_params]]\n"
+    "madd x20, x3, x4, x20\n" // offset += tile_j * ld_input_col
+    "mul x20, x20, x22\n" // offset *= kernel_stride * output_size
+    "add x7, x4, x4\n"
+    "add x5, x5, x20, LSL #1\n" // inptr[0] += offset * sizeof(__fp16)
+    "add x8, x5, x21, LSL #1\n"
+    "add x17, x7, x4\n"
+    "add x16, x8, x21, LSL #1\n"
+    "add x15, x17, x4\n"
+    "add x14, x16, x21, LSL #1\n"
+    "add x13, x14, x21, LSL #1\n"
+    "cbnz x3, 2f\n"
+    "ldr x24, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+    "lsl x12, %x[n_channels], #0x1\n"
+    "mov x28, #0x8\n"
+    "mul x28, x28, x4\n"
+    "add x27, x16, x7, LSL #1\n"
+    "add x26, x5, x4, LSL #1\n"
+    "add x25, x5, x17, LSL #1\n"
+    "sub x20, x24, x3\n"
+    "add x24, x5, x15, LSL #1\n"
+    "sub x20, x20, #0x1\n"
+    "add x23, x8, x4, LSL #1\n"
+    "and x20, x20, #0x3fffff\n"
+    "add x22, x5, x7, LSL #1\n"
+    "orr x12, x12, x20, LSL #22\n"
+    "add x21, x8, x17, LSL #1\n"
+    "orr x12, x12, x28, LSL #38\n"
+    "add x20, x8, x15, LSL #1\n"
+    "add x11, x8, x7, LSL #1\n"
+    "add x10, x14, x4, LSL #1\n"
+    "add x9, x16, x4, LSL #1\n"
+    "add x28, x14, x17, LSL #1\n"
+    ".inst 0xf8ac4b7a // rprfm pldonce, x12, [x27]\n"
+    "add x27, x16, x17, LSL #1\n"
+    ".inst 0xf8ac48ba // rprfm pldonce, x12, [x5]\n"
+    ".inst 0xf8ac4b5a // rprfm pldonce, x12, [x26]\n"
+    "add x26, x14, x15, LSL #1\n"
+    ".inst 0xf8ac4b3a // rprfm pldonce, x12, [x25]\n"
+    "add x25, x16, x15, LSL #1\n"
+    ".inst 0xf8ac4b1a // rprfm pldonce, x12, [x24]\n"
+    "add x24, x13, x4, LSL #1\n"
+    ".inst 0xf8ac491a // rprfm pldonce, x12, [x8]\n"
+    ".inst 0xf8ac4afa // rprfm pldonce, x12, [x23]\n"
+    "add x23, x14, x7, LSL #1\n"
+    ".inst 0xf8ac4ada // rprfm pldonce, x12, [x22]\n"
+    "add x22, x13, x17, LSL #1\n"
+    ".inst 0xf8ac4aba // rprfm pldonce, x12, [x21]\n"
+    "add x21, x13, x7, LSL #1\n"
+    ".inst 0xf8ac4a9a // rprfm pldonce, x12, [x20]\n"
+    "add x20, x13, x15, LSL #1\n"
+    ".inst 0xf8ac497a // rprfm pldonce, x12, [x11]\n"
+    ".inst 0xf8ac49da // rprfm pldonce, x12, [x14]\n"
+    ".inst 0xf8ac4a1a // rprfm pldonce, x12, [x16]\n"
+    ".inst 0xf8ac495a // rprfm pldonce, x12, [x10]\n"
+    ".inst 0xf8ac493a // rprfm pldonce, x12, [x9]\n"
+    ".inst 0xf8ac4b9a // rprfm pldonce, x12, [x28]\n"
+    ".inst 0xf8ac4b7a // rprfm pldonce, x12, [x27]\n"
+    ".inst 0xf8ac4b5a // rprfm pldonce, x12, [x26]\n"
+    ".inst 0xf8ac49ba // rprfm pldonce, x12, [x13]\n"
+    ".inst 0xf8ac4b3a // rprfm pldonce, x12, [x25]\n"
+    ".inst 0xf8ac4b1a // rprfm pldonce, x12, [x24]\n"
+    ".inst 0xf8ac4afa // rprfm pldonce, x12, [x23]\n"
+    ".inst 0xf8ac4ada // rprfm pldonce, x12, [x22]\n"
+    ".inst 0xf8ac4aba // rprfm pldonce, x12, [x21]\n"
+    ".inst 0xf8ac4a9a // rprfm pldonce, x12, [x20]\n"
+    "2:" // Tile loop: Prefetch input rows: End
+    "ldr x26, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
+    "mov x20, #0x2\n"
+    "ld1h { z19.h }, p3/Z, [x6]\n"
+    "addvl x6, x6, #1\n"
+    "ldr x25, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
+    "cnth x24\n"
+    ".inst 0xa040a0c0 // ld1h { z0.h-z3.h }, pn8.b/Z, [x6]\n"
+    "addvl x6, x6, #4\n"
+    "ldr x23, [%x[params_struct], %[offsetof_args_outptr]]\n"
+    "whilelt p2.h, XZR, %x[n_channels]\n"
+    ".inst 0xa040a0c4 // ld1h { z4.h-z7.h }, pn8.b/Z, [x6]\n"
+    "addvl x6, x6, #4\n"
+    "mul x22, x2, x26\n" // offset = tile_i * ld_output_row
+    "cmp x24, %x[n_channels]\n"
+    "ld1rh { z18.h }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+    "madd x22, x3, x25, x22\n" // offset += tile_j * ld_output_col
+    "ld1rh { z17.h }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+    "mov x21, #0x0\n"
+    "mul x22, x22, x20\n" // offset *= output_tile_size
+    "sub x20, XZR, x24\n"
+    "ld1h { z8.h }, p3/Z, [x6]\n"
+    "add x23, x23, x22, LSL #1\n" // outptrs[0] += offset * sizeof(__fp16)
+    "ld1h { z9.h }, p2/Z, [x16, x7, LSL #1]\n"
+    "addvl x6, x6, #1\n"
+    "add x22, x23, x26, LSL #1\n"
+    "ld1h { z10.h }, p2/Z, [x5]\n"
+    "ld1h { z11.h }, p2/Z, [x5, x4, LSL #1]\n"
+    "ld1h { z12.h }, p2/Z, [x5, x17, LSL #1]\n"
+    "ld1h { z13.h }, p2/Z, [x5, x15, LSL #1]\n"
+    "ld1h { z14.h }, p2/Z, [x8]\n"
+    "ld1h { z15.h }, p2/Z, [x8, x4, LSL #1]\n"
+    "ld1h { z16.h }, p2/Z, [x5, x7, LSL #1]\n"
+    "bge 4f\n"
+    "3:" // Tile loop: Channel loop
+    "movprfx z28, z19\n fmla z28.h, p3/M, z8.h, z9.h\n"
+    "movprfx z29, z19\n fmla z29.h, p3/M, z6.h, z9.h\n"
+    "whilelt p1.h, x24, %x[n_channels]\n"
+    "inch x21\n"
+    "movprfx z30, z19\n fmla z30.h, p3/M, z2.h, z9.h\n"
+    "movprfx z31, z19\n fmla z31.h, p3/M, z0.h, z9.h\n"
+    "ld1h { z19.h }, p3/Z, [x6]\n"
+    "addvl x6, x6, #1\n"
+    "inch x24\n"
+    "mov p0.b, p2.b\n"
+    "addvl x5, x5, #1\n"
+    "inch x20\n"
+    "fmla z28.h, p3/M, z0.h, z10.h\n"
+    "fmla z29.h, p3/M, z1.h, z12.h\n"
+    "ld1h { z12.h }, p2/Z, [x8, x15, LSL #1]\n"
+    "ld1h { z10.h }, p1/Z, [x5]\n"
+    "fmla z28.h, p3/M, z1.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x8, x17, LSL #1]\n"
+    "fmla z29.h, p3/M, z2.h, z13.h\n"
+    "ld1h { z13.h }, p2/Z, [x8, x7, LSL #1]\n"
+    "addvl x8, x8, #1\n"
+    "fmla z28.h, p3/M, z3.h, z14.h\n"
+    "ld1h { z14.h }, p2/Z, [x14]\n"
+    "fmla z29.h, p3/M, z0.h, z16.h\n"
+    "fmla z28.h, p3/M, z4.h, z15.h\n"
+    "ld1h { z15.h }, p2/Z, [x16]\n"
+    "fmla z30.h, p3/M, z3.h, z14.h\n"
+    "ld1h { z14.h }, p2/Z, [x14, x15, LSL #1]\n"
+    "fmla z29.h, p3/M, z4.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x14, x4, LSL #1]\n"
+    "fmla z28.h, p3/M, z2.h, z16.h\n"
+    "ld1h { z16.h }, p2/Z, [x16, x4, LSL #1]\n"
+    "fmla z30.h, p3/M, z0.h, z15.h\n"
+    "fmla z29.h, p3/M, z5.h, z12.h\n"
+    "ld1h { z12.h }, p2/Z, [x16, x17, LSL #1]\n"
+    "fmla z28.h, p3/M, z5.h, z13.h\n"
+    "fmla z29.h, p3/M, z3.h, z13.h\n"
+    "ld1h { z13.h }, p2/Z, [x14, x17, LSL #1]\n"
+    "fmla z30.h, p3/M, z4.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x16, x15, LSL #1]\n"
+    "addvl x16, x16, #1\n"
+    "ld1h { z9.h }, p1/Z, [x16, x7, LSL #1]\n"
+    "fmla z31.h, p3/M, z4.h, z13.h\n"
+    "fmla z28.h, p3/M, z6.h, z15.h\n"
+    "ld1h { z15.h }, p2/Z, [x13]\n"
+    "fmla z29.h, p3/M, z7.h, z12.h\n"
+    "fmla z30.h, p3/M, z1.h, z16.h\n"
+    "ld1h { z13.h }, p2/Z, [x13, x4, LSL #1]\n"
+    "fmla z31.h, p3/M, z1.h, z12.h\n"
+    "ld1h { z12.h }, p1/Z, [x5, x17, LSL #1]\n"
+    "fmla z28.h, p3/M, z7.h, z16.h\n"
+    "ld1h { z16.h }, p2/Z, [x14, x7, LSL #1]\n"
+    "addvl x14, x14, #1\n"
+    "fmla z30.h, p3/M, z6.h, z15.h\n"
+    "ld1h { z15.h }, p2/Z, [x13, x7, LSL #1]\n"
+    "fmla z29.h, p3/M, z8.h, z11.h\n"
+    "fmla z31.h, p3/M, z5.h, z14.h\n"
+    "ld1h { z14.h }, p2/Z, [x13, x17, LSL #1]\n"
+    "fmla z30.h, p3/M, z7.h, z13.h\n"
+    "ld1h { z13.h }, p1/Z, [x5, x15, LSL #1]\n"
+    "fmla z31.h, p3/M, z2.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x13, x15, LSL #1]\n"
+    "whilelt p2.h, x21, %x[n_channels]\n"
+    "cmp x24, %x[n_channels]\n"
+    "addvl x13, x13, #1\n"
+    "fmla z30.h, p3/M, z5.h, z16.h\n"
+    "fmla z31.h, p3/M, z3.h, z16.h\n"
+    ".inst 0xa040a0c0 // ld1h { z0.h-z3.h }, pn8.b/Z, [x6]\n"
+    "addvl x6, x6, #4\n"
+    "ld1h { z16.h }, p1/Z, [x5, x7, LSL #1]\n"
+    "fmla z31.h, p3/M, z7.h, z14.h\n"
+    "ld1h { z14.h }, p1/Z, [x8]\n"
+    "fmla z30.h, p3/M, z8.h, z15.h\n"
+    "fmla z31.h, p3/M, z6.h, z15.h\n"
+    ".inst 0xa040a0c4 // ld1h { z4.h-z7.h }, pn8.b/Z, [x6]\n"
+    "addvl x6, x6, #4\n"
+    "ld1h { z15.h }, p1/Z, [x8, x4, LSL #1]\n"
+    "fmla z31.h, p3/M, z8.h, z11.h\n"
+    "ld1h { z11.h }, p1/Z, [x5, x4, LSL #1]\n"
+    "ld1h { z8.h }, p3/Z, [x6]\n"
+    "addvl x6, x6, #1\n"
+    ".inst 0xc171ca5c // fclamp { z28.h-z31.h }, z18.h, z17.h\n"
+    "st1h { z28.h }, p0, [x23]\n"
+    "st1h { z29.h }, p0, [x23, x25, LSL #1]\n"
+    "addvl x23, x23, #1\n"
+    "st1h { z30.h }, p0, [x22]\n"
+    "st1h { z31.h }, p0, [x22, x25, LSL #1]\n"
+    "addvl x22, x22, #1\n"
+    "blt 3b\n"
+    "4:" // Tile loop: Channel tail
+    "movprfx z28, z19\n fmla z28.h, p3/M, z8.h, z9.h\n"
+    "movprfx z29, z19\n fmla z29.h, p3/M, z6.h, z9.h\n"
+    "ldr x3, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+    "mov p0.b, p2.b\n"
+    "movprfx z30, z19\n fmla z30.h, p3/M, z2.h, z9.h\n"
+    "movprfx z31, z19\n fmla z31.h, p3/M, z0.h, z9.h\n"
+    "ldr x2, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+    "ldr x24, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+    "ldr x21, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
+    "add x3, x3, #0x1\n"
+    "fmla z28.h, p3/M, z0.h, z10.h\n"
+    "fmla z29.h, p3/M, z1.h, z12.h\n"
+    "ld1h { z12.h }, p2/Z, [x8, x15, LSL #1]\n"
+    "add x20, x2, #0x1\n"
+    "cmp x3, x24\n"
+    "csel x2, x2, x20, LT\n"
+    "csel x3, x3, XZR, LT\n"
+    "cmp x2, x21\n"
+    "fmla z28.h, p3/M, z1.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x8, x17, LSL #1]\n"
+    "fmla z29.h, p3/M, z2.h, z13.h\n"
+    "ld1h { z13.h }, p2/Z, [x8, x7, LSL #1]\n"
+    "fmla z28.h, p3/M, z3.h, z14.h\n"
+    "ld1h { z14.h }, p2/Z, [x14]\n"
+    "fmla z29.h, p3/M, z0.h, z16.h\n"
+    "fmla z28.h, p3/M, z4.h, z15.h\n"
+    "ld1h { z15.h }, p2/Z, [x16]\n"
+    "fmla z30.h, p3/M, z3.h, z14.h\n"
+    "ld1h { z14.h }, p2/Z, [x14, x15, LSL #1]\n"
+    "fmla z29.h, p3/M, z4.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x14, x4, LSL #1]\n"
+    "fmla z28.h, p3/M, z2.h, z16.h\n"
+    "ld1h { z16.h }, p2/Z, [x16, x4, LSL #1]\n"
+    "fmla z30.h, p3/M, z0.h, z15.h\n"
+    "fmla z29.h, p3/M, z5.h, z12.h\n"
+    "ld1h { z12.h }, p2/Z, [x16, x17, LSL #1]\n"
+    "fmla z28.h, p3/M, z5.h, z13.h\n"
+    "fmla z29.h, p3/M, z3.h, z13.h\n"
+    "ld1h { z13.h }, p2/Z, [x14, x17, LSL #1]\n"
+    "fmla z30.h, p3/M, z4.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x16, x15, LSL #1]\n"
+    "fmla z31.h, p3/M, z4.h, z13.h\n"
+    "ld1h { z13.h }, p2/Z, [x13, x4, LSL #1]\n"
+    "fmla z28.h, p3/M, z6.h, z15.h\n"
+    "ld1h { z15.h }, p2/Z, [x13]\n"
+    "fmla z29.h, p3/M, z7.h, z12.h\n"
+    "fmla z30.h, p3/M, z1.h, z16.h\n"
+    "fmla z31.h, p3/M, z1.h, z12.h\n"
+    "fmla z28.h, p3/M, z7.h, z16.h\n"
+    "ld1h { z16.h }, p2/Z, [x14, x7, LSL #1]\n"
+    "fmla z30.h, p3/M, z6.h, z15.h\n"
+    "ld1h { z15.h }, p2/Z, [x13, x7, LSL #1]\n"
+    "fmla z29.h, p3/M, z8.h, z11.h\n"
+    "fmla z31.h, p3/M, z5.h, z14.h\n"
+    "ld1h { z14.h }, p2/Z, [x13, x17, LSL #1]\n"
+    "fmla z30.h, p3/M, z7.h, z13.h\n"
+    "fmla z31.h, p3/M, z2.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x13, x15, LSL #1]\n"
+    "fmla z30.h, p3/M, z5.h, z16.h\n"
+    "fmla z31.h, p3/M, z3.h, z16.h\n"
+    "fmla z30.h, p3/M, z8.h, z15.h\n"
+    "fmla z31.h, p3/M, z7.h, z14.h\n"
+    "fmla z31.h, p3/M, z6.h, z15.h\n"
+    "fmla z31.h, p3/M, z8.h, z11.h\n"
+    ".inst 0xc171ca5c // fclamp { z28.h-z31.h }, z18.h, z17.h\n"
+    "st1h { z28.h }, p0, [x23]\n"
+    "st1h { z29.h }, p0, [x23, x25, LSL #1]\n"
+    "st1h { z30.h }, p0, [x22]\n"
+    "st1h { z31.h }, p0, [x22, x25, LSL #1]\n"
+    "blt 1b\n"
+    ".inst 0xd503467f // SMSTOP\n"
+    :
+    : [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct)
+    : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+  );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME2)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp
new file mode 100644
index 0000000000..1bf3a84959
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp
@@ -0,0 +1,318 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(ARM_COMPUTE_ENABLE_SME2)
+
+namespace arm_conv {
+namespace depthwise {
+
+// Indirect (pointer-array) variant of the SME2 fp16 3x3 stride-2 depthwise
+// kernel: each of the 25 input pointers addresses one element of the padded
+// input patch feeding a 2x2 output tile; min/max are the activation clamp
+// bounds applied by the fclamp before each store.
+void sme2_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl(
+  const __fp16 *const *const input_ptrs,
+  __fp16 *const *const outptrs,
+  const void *params,
+  unsigned int n_channels,
+  const __fp16 activation_min,
+  const __fp16 activation_max
+)
+{
+  // Argument block read by the assembly via offsetof().
+  struct Args
+  {
+    __fp16 *const *outptrs;       // four output pointers (2x2 tile)
+    const void *params;           // packed bias + weights, consumed sequentially by the asm
+    const __fp16 min, max;        // activation clamp bounds
+    const __fp16 *inptrs[25];     // input pointers, re-ordered for the asm's consumption order
+
+    Args(
+      const __fp16 *const *const input_ptrs,
+      __fp16 *const *const outptrs,
+      const void *const params,
+      const __fp16 min,
+      const __fp16 max
+    ) : outptrs(outptrs), params(params), min(min), max(max)
+    {
+      // Permute the caller's row-major 5x5 patch pointers into the order in
+      // which the assembly loads them (most-reused element first).
+      inptrs[0] = input_ptrs[12];
+      inptrs[1] = input_ptrs[0];
+      inptrs[2] = input_ptrs[1];
+      inptrs[3] = input_ptrs[3];
+      inptrs[4] = input_ptrs[4];
+      inptrs[5] = input_ptrs[5];
+      inptrs[6] = input_ptrs[6];
+      inptrs[7] = input_ptrs[2];
+      inptrs[8] = input_ptrs[8];
+      inptrs[9] = input_ptrs[9];
+      inptrs[10] = input_ptrs[7];
+      inptrs[11] = input_ptrs[15];
+      inptrs[12] = input_ptrs[10];
+      inptrs[13] = input_ptrs[16];
+      inptrs[14] = input_ptrs[11];
+      inptrs[15] = input_ptrs[18];
+      inptrs[16] = input_ptrs[13];
+      inptrs[17] = input_ptrs[19];
+      inptrs[18] = input_ptrs[20];
+      inptrs[19] = input_ptrs[14];
+      inptrs[20] = input_ptrs[21];
+      inptrs[21] = input_ptrs[17];
+      inptrs[22] = input_ptrs[23];
+      inptrs[23] = input_ptrs[22];
+      inptrs[24] = input_ptrs[24];
+
+    }
+  };
+
+  Args params_struct(input_ptrs, outptrs, params,
+                     activation_min, activation_max);
+
+  // Generated assembly: SME streaming-mode channel loop (vector-length strided)
+  // followed by a predicated channel tail; do not reorder by hand.
+  __asm__ __volatile__(
+    "ldr x20, [%x[params_struct], %[offsetof_args_outptrs]]\n"
+    ".inst 0xd503477f // SMSTART ZA\n"
+    "add x16, %x[params_struct], %[offsetof_Args_inptrs]\n"
+    "mov x15, #0x0\n"
+    "ldr x14, [%x[params_struct], %[offsetof_args_params]]\n"
+    "ptrue p3.b\n"
+    ".inst 0x25207810 // ptrue pn8.b\n"
+    "cnth x13\n"
+    "whilelt p2.h, XZR, %x[n_channels]\n"
+    "ld1rh { z19.h }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+    "ldp x12, x11, [x20, #0x0]\n"
+    "ldp x10, x9, [x20, #0x10]\n"
+    "cmp x13, %x[n_channels]\n"
+    "ld1rh { z18.h }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+    "sub x28, XZR, x13\n"
+    "ld1h { z17.h }, p3/Z, [x14]\n"
+    "addvl x14, x14, #1\n"
+    "ldp x27, x26, [x16, #0x0]\n"
+    "ldp x25, x24, [x16, #0x10]\n"
+    ".inst 0xa040a1c0 // ld1h { z0.h-z3.h }, pn8.b/Z, [x14]\n"
+    "addvl x14, x14, #4\n"
+    "ldp x23, x22, [x16, #0x20]\n"
+    ".inst 0xa040a1c4 // ld1h { z4.h-z7.h }, pn8.b/Z, [x14]\n"
+    "addvl x14, x14, #4\n"
+    "ldp x21, x20, [x16, #0x30]\n"
+    "ld1h { z8.h }, p3/Z, [x14]\n"
+    "addvl x14, x14, #1\n"
+    "ld1h { z9.h }, p2/Z, [x27, x15, LSL #1]\n"
+    "ld1h { z10.h }, p2/Z, [x26, x15, LSL #1]\n"
+    "ld1h { z11.h }, p2/Z, [x25, x15, LSL #1]\n"
+    "ld1h { z12.h }, p2/Z, [x24, x15, LSL #1]\n"
+    "ld1h { z13.h }, p2/Z, [x23, x15, LSL #1]\n"
+    "ld1h { z14.h }, p2/Z, [x22, x15, LSL #1]\n"
+    "ld1h { z15.h }, p2/Z, [x21, x15, LSL #1]\n"
+    "ld1h { z16.h }, p2/Z, [x20, x15, LSL #1]\n"
+    "bge 2f\n"
+    "1:" // Channel loop
+    "movprfx z28, z17\n fmla z28.h, p3/M, z8.h, z9.h\n"
+    "movprfx z29, z17\n fmla z29.h, p3/M, z6.h, z9.h\n"
+    "ldr x27, [x16, #0x40]\n"
+    "whilelt p1.h, x13, %x[n_channels]\n"
+    "ldr x26, [x16, #0x48]\n"
+    "movprfx z30, z17\n fmla z30.h, p3/M, z2.h, z9.h\n"
+    "movprfx z31, z17\n fmla z31.h, p3/M, z0.h, z9.h\n"
+    "ld1h { z17.h }, p3/Z, [x14]\n"
+    "ldr x25, [x16, #0x50]\n"
+    "addvl x14, x14, #1\n"
+    "inch x28\n"
+    "ldr x24, [x16, #0x58]\n"
+    "mov p0.b, p2.b\n"
+    "fmla z28.h, p3/M, z0.h, z10.h\n"
+    "fmla z29.h, p3/M, z1.h, z12.h\n"
+    "ld1h { z12.h }, p2/Z, [x26, x15, LSL #1]\n"
+    "ldr x20, [x16, #0x78]\n"
+    "ldr x23, [x16, #0x60]\n"
+    "ldr x22, [x16, #0x68]\n"
+    "fmla z28.h, p3/M, z1.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x27, x15, LSL #1]\n"
+    "fmla z29.h, p3/M, z2.h, z13.h\n"
+    "ld1h { z13.h }, p2/Z, [x25, x15, LSL #1]\n"
+    "ldr x27, [x16, #0x80]\n"
+    "ldr x26, [x16, #0x88]\n"
+    "ldr x21, [x16, #0x70]\n"
+    "fmla z28.h, p3/M, z3.h, z14.h\n"
+    "ld1h { z14.h }, p2/Z, [x24, x15, LSL #1]\n"
+    "fmla z29.h, p3/M, z0.h, z16.h\n"
+    "ldr x24, [x16, #0x98]\n"
+    "ldr x25, [x16, #0x90]\n"
+    "fmla z30.h, p3/M, z3.h, z14.h\n"
+    "ld1h { z14.h }, p2/Z, [x26, x15, LSL #1]\n"
+    "fmla z28.h, p3/M, z4.h, z15.h\n"
+    "ld1h { z15.h }, p2/Z, [x23, x15, LSL #1]\n"
+    "ldr x23, [x16, #0xa0]\n"
+    "fmla z29.h, p3/M, z4.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x22, x15, LSL #1]\n"
+    "ldr x22, [x16, #0xa8]\n"
+    "fmla z28.h, p3/M, z2.h, z16.h\n"
+    "ld1h { z16.h }, p2/Z, [x21, x15, LSL #1]\n"
+    "ldr x21, [x16, #0xb0]\n"
+    "fmla z30.h, p3/M, z0.h, z15.h\n"
+    "fmla z29.h, p3/M, z5.h, z12.h\n"
+    "ld1h { z12.h }, p2/Z, [x27, x15, LSL #1]\n"
+    "ldr x27, [x16, #0xc0]\n"
+    "fmla z28.h, p3/M, z5.h, z13.h\n"
+    "fmla z29.h, p3/M, z3.h, z13.h\n"
+    "ld1h { z13.h }, p2/Z, [x20, x15, LSL #1]\n"
+    "ldr x20, [x16, #0xb8]\n"
+    "fmla z30.h, p3/M, z4.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x24, x15, LSL #1]\n"
+    "fmla z31.h, p3/M, z4.h, z13.h\n"
+    "ld1h { z13.h }, p2/Z, [x23, x15, LSL #1]\n"
+    "fmla z28.h, p3/M, z6.h, z15.h\n"
+    "ld1h { z15.h }, p2/Z, [x25, x15, LSL #1]\n"
+    "fmla z29.h, p3/M, z7.h, z12.h\n"
+    "fmla z30.h, p3/M, z1.h, z16.h\n"
+    "fmla z31.h, p3/M, z1.h, z12.h\n"
+    "fmla z28.h, p3/M, z7.h, z16.h\n"
+    "ld1h { z16.h }, p2/Z, [x22, x15, LSL #1]\n"
+    "fmla z30.h, p3/M, z6.h, z15.h\n"
+    "ld1h { z15.h }, p2/Z, [x20, x15, LSL #1]\n"
+    "fmla z29.h, p3/M, z8.h, z11.h\n"
+    "fmla z31.h, p3/M, z5.h, z14.h\n"
+    "ld1h { z14.h }, p2/Z, [x21, x15, LSL #1]\n"
+    "fmla z30.h, p3/M, z7.h, z13.h\n"
+    "fmla z31.h, p3/M, z2.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x27, x15, LSL #1]\n"
+    "ldp x27, x26, [x16, #0x0]\n"
+    "inch x15\n"
+    "ldp x25, x24, [x16, #0x10]\n"
+    "whilelt p2.h, x15, %x[n_channels]\n"
+    "ldp x23, x22, [x16, #0x20]\n"
+    "fmla z30.h, p3/M, z5.h, z16.h\n"
+    "ldp x21, x20, [x16, #0x30]\n"
+    "ld1h { z9.h }, p1/Z, [x27, x13, LSL #1]\n"
+    "fmla z31.h, p3/M, z3.h, z16.h\n"
+    "ld1h { z10.h }, p1/Z, [x26, x13, LSL #1]\n"
+    "ld1h { z12.h }, p1/Z, [x24, x13, LSL #1]\n"
+    "fmla z30.h, p3/M, z8.h, z15.h\n"
+    "ld1h { z13.h }, p1/Z, [x23, x13, LSL #1]\n"
+    "fmla z31.h, p3/M, z7.h, z14.h\n"
+    "ld1h { z14.h }, p1/Z, [x22, x13, LSL #1]\n"
+    "ld1h { z16.h }, p1/Z, [x20, x13, LSL #1]\n"
+    ".inst 0xa040a1c0 // ld1h { z0.h-z3.h }, pn8.b/Z, [x14]\n"
+    "addvl x14, x14, #4\n"
+    "fmla z31.h, p3/M, z6.h, z15.h\n"
+    "ld1h { z15.h }, p1/Z, [x21, x13, LSL #1]\n"
+    ".inst 0xa040a1c4 // ld1h { z4.h-z7.h }, pn8.b/Z, [x14]\n"
+    "addvl x14, x14, #4\n"
+    "fmla z31.h, p3/M, z8.h, z11.h\n"
+    "ld1h { z11.h }, p1/Z, [x25, x13, LSL #1]\n"
+    "inch x13\n"
+    "cmp x13, %x[n_channels]\n"
+    "ld1h { z8.h }, p3/Z, [x14]\n"
+    "addvl x14, x14, #1\n"
+    ".inst 0xc172ca7c // fclamp { z28.h-z31.h }, z19.h, z18.h\n"
+    "st1h { z28.h }, p0, [x12, x28, LSL #1]\n"
+    "st1h { z29.h }, p0, [x11, x28, LSL #1]\n"
+    "st1h { z30.h }, p0, [x10, x28, LSL #1]\n"
+    "st1h { z31.h }, p0, [x9, x28, LSL #1]\n"
+    "blt 1b\n"
+    "2:" // Channel tail
+    "movprfx z28, z17\n fmla z28.h, p3/M, z8.h, z9.h\n"
+    "movprfx z29, z17\n fmla z29.h, p3/M, z6.h, z9.h\n"
+    "ldr x27, [x16, #0x40]\n"
+    "inch x28\n"
+    "ldr x26, [x16, #0x48]\n"
+    "movprfx z30, z17\n fmla z30.h, p3/M, z2.h, z9.h\n"
+    "movprfx z31, z17\n fmla z31.h, p3/M, z0.h, z9.h\n"
+    "mov p0.b, p2.b\n"
+    "ldr x25, [x16, #0x50]\n"
+    "ldr x24, [x16, #0x58]\n"
+    "fmla z28.h, p3/M, z0.h, z10.h\n"
+    "fmla z29.h, p3/M, z1.h, z12.h\n"
+    "ld1h { z12.h }, p2/Z, [x26, x15, LSL #1]\n"
+    "ldr x20, [x16, #0x78]\n"
+    "ldr x23, [x16, #0x60]\n"
+    "ldr x22, [x16, #0x68]\n"
+    "fmla z28.h, p3/M, z1.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x27, x15, LSL #1]\n"
+    "fmla z29.h, p3/M, z2.h, z13.h\n"
+    "ld1h { z13.h }, p2/Z, [x25, x15, LSL #1]\n"
+    "ldr x27, [x16, #0x80]\n"
+    "ldr x26, [x16, #0x88]\n"
+    "ldr x21, [x16, #0x70]\n"
+    "fmla z28.h, p3/M, z3.h, z14.h\n"
+    "ld1h { z14.h }, p2/Z, [x24, x15, LSL #1]\n"
+    "fmla z29.h, p3/M, z0.h, z16.h\n"
+    "ldr x24, [x16, #0x98]\n"
+    "ldr x25, [x16, #0x90]\n"
+    "fmla z30.h, p3/M, z3.h, z14.h\n"
+    "ld1h { z14.h }, p2/Z, [x26, x15, LSL #1]\n"
+    "fmla z28.h, p3/M, z4.h, z15.h\n"
+    "ld1h { z15.h }, p2/Z, [x23, x15, LSL #1]\n"
+    "ldr x23, [x16, #0xa0]\n"
+    "fmla z29.h, p3/M, z4.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x22, x15, LSL #1]\n"
+    "ldr x22, [x16, #0xa8]\n"
+    "fmla z28.h, p3/M, z2.h, z16.h\n"
+    "ld1h { z16.h }, p2/Z, [x21, x15, LSL #1]\n"
+    "ldr x21, [x16, #0xb0]\n"
+    "fmla z30.h, p3/M, z0.h, z15.h\n"
+    "fmla z29.h, p3/M, z5.h, z12.h\n"
+    "ld1h { z12.h }, p2/Z, [x27, x15, LSL #1]\n"
+    "ldr x27, [x16, #0xc0]\n"
+    "fmla z28.h, p3/M, z5.h, z13.h\n"
+    "fmla z29.h, p3/M, z3.h, z13.h\n"
+    "ld1h { z13.h }, p2/Z, [x20, x15, LSL #1]\n"
+    "ldr x20, [x16, #0xb8]\n"
+    "fmla z30.h, p3/M, z4.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x24, x15, LSL #1]\n"
+    "fmla z31.h, p3/M, z4.h, z13.h\n"
+    "ld1h { z13.h }, p2/Z, [x23, x15, LSL #1]\n"
+    "fmla z28.h, p3/M, z6.h, z15.h\n"
+    "ld1h { z15.h }, p2/Z, [x25, x15, LSL #1]\n"
+    "fmla z29.h, p3/M, z7.h, z12.h\n"
+    "fmla z30.h, p3/M, z1.h, z16.h\n"
+    "fmla z31.h, p3/M, z1.h, z12.h\n"
+    "fmla z28.h, p3/M, z7.h, z16.h\n"
+    "ld1h { z16.h }, p2/Z, [x22, x15, LSL #1]\n"
+    "fmla z30.h, p3/M, z6.h, z15.h\n"
+    "ld1h { z15.h }, p2/Z, [x20, x15, LSL #1]\n"
+    "fmla z29.h, p3/M, z8.h, z11.h\n"
+    "fmla z31.h, p3/M, z5.h, z14.h\n"
+    "ld1h { z14.h }, p2/Z, [x21, x15, LSL #1]\n"
+    "fmla z30.h, p3/M, z7.h, z13.h\n"
+    "fmla z31.h, p3/M, z2.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x27, x15, LSL #1]\n"
+    "fmla z30.h, p3/M, z5.h, z16.h\n"
+    "fmla z31.h, p3/M, z3.h, z16.h\n"
+    "fmla z30.h, p3/M, z8.h, z15.h\n"
+    "fmla z31.h, p3/M, z7.h, z14.h\n"
+    "fmla z31.h, p3/M, z6.h, z15.h\n"
+    "fmla z31.h, p3/M, z8.h, z11.h\n"
+    ".inst 0xc172ca7c // fclamp { z28.h-z31.h }, z19.h, z18.h\n"
+    "st1h { z28.h }, p0, [x12, x28, LSL #1]\n"
+    "st1h { z29.h }, p0, [x11, x28, LSL #1]\n"
+    "st1h { z30.h }, p0, [x10, x28, LSL #1]\n"
+    "st1h { z31.h }, p0, [x9, x28, LSL #1]\n"
+    ".inst 0xd503467f // SMSTOP\n"
+    :
+    : [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
+    : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+  );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME2)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
new file mode 100644
index 0000000000..84263cb564
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
@@ -0,0 +1,67 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+namespace arm_conv {
+namespace depthwise {
+
+void sme2_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_indirect_impl(const __fp16 *const *const input_ptrs, __fp16 *const *const outptrs, const void *params, unsigned int n_channels, const __fp16 activation_min, const __fp16 activation_max);
+void sme2_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl(const unsigned int n_tile_rows, const unsigned int n_tile_cols, const __fp16 *inptr, int64_t ld_input_row, int64_t ld_input_col, __fp16 *outptr, int64_t ld_output_row, int64_t ld_output_col, const void *params, unsigned int n_channels, const __fp16 activation_min, const __fp16 activation_max);
+
+// Strategy descriptor that plugs the SME2 fp16 NHWC 5x5 stride-1 depthwise
+// kernels (2x2 output tile) into the depthfirst driver framework.  It carries
+// the kernel's static geometry and hands out the two assembly entry points.
+class sme2_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy<__fp16, __fp16, __fp16, __fp16>
+{
+  private:
+  using Parent = DepthwiseDepthfirstStrategy<__fp16, __fp16, __fp16, __fp16>;
+  // Entry points implemented in this kernel's generic_indirect.cpp / generic_direct.cpp.
+  Parent::IndirectKernelType m_indirect_kernel = sme2_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_indirect_impl;
+  Parent::DirectKernelType m_direct_kernel = sme2_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl;
+
+  public:
+  using return_type = __fp16;
+  constexpr static auto vl_type = arm_gemm::VLType::SME;  // kernel runs under the SME (streaming SVE) vector length
+
+  // Static geometry: 5x5 window, stride 1, producing a 2x2 output tile.
+  constexpr static unsigned int kernel_rows = 5;
+  constexpr static unsigned int kernel_cols = 5;
+
+  constexpr static unsigned int stride_rows = 1;
+  constexpr static unsigned int stride_cols = 1;
+
+  constexpr static unsigned int output_rows = 2;
+  constexpr static unsigned int output_cols = 2;
+
+  sme2_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst(const CPUInfo *)
+  : Parent(output_rows, output_cols, kernel_rows, kernel_cols, stride_rows, stride_cols) {}
+
+  arm_gemm::VLType get_vl_type(void) const override { return vl_type; }
+
+  Parent::IndirectKernelType get_indirect_kernel() const override { return m_indirect_kernel; }
+  Parent::DirectKernelType get_direct_kernel() const override { return m_direct_kernel; }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_direct.cpp
new file mode 100644
index 0000000000..58b7824b98
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_direct.cpp
@@ -0,0 +1,586 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(ARM_COMPUTE_ENABLE_SME2)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sme2_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl(
+  const unsigned int n_tile_rows,
+  const unsigned int n_tile_cols,
+  const __fp16 *inptr,
+  int64_t ld_input_row,
+  int64_t ld_input_col,
+  __fp16 *outptr,
+  int64_t ld_output_row,
+  int64_t ld_output_col,
+  const void *params,
+  unsigned int n_channels,
+  const __fp16 activation_min,
+  const __fp16 activation_max
+)
+{ // Direct-addressing variant: iterates the tile grid itself (tile_i/tile_j), computing input/output offsets from the row/column strides. Generated SME2 assembly — do not hand-edit.
+  struct Args  // Argument block handed to the assembly through a single base pointer; field offsets are taken with offsetof below.
+  {
+    const uint64_t n_tile_rows, n_tile_cols;  // Output tile grid dimensions.
+    const __fp16 *inptr;
+    const uint64_t ld_input_row;
+    const uint64_t ld_input_col;
+    __fp16 *outptr;
+    const uint64_t ld_output_row;
+    const uint64_t ld_output_col;
+    const void *params;  // Packed bias + weights blob, consumed sequentially by the asm.
+    const __fp16 min, max;  // Activation clamp bounds applied via fclamp before the stores.
+
+    uint64_t tile_i = 0, tile_j = 0;  // Loop state: spilled/reloaded by the asm across tile iterations.
+
+    Args(
+      const unsigned int n_tile_rows,
+      const unsigned int n_tile_cols,
+      const __fp16 *inptr,
+      int64_t ld_input_row,
+      int64_t ld_input_col,
+      __fp16 *outptr,
+      int64_t ld_output_row,
+      int64_t ld_output_col,
+      const void *params,
+      const float activation_min,  // NOTE(review): declared float but initialises __fp16 members; fp16 -> float -> fp16 round-trips exactly, presumably a generator artifact — confirm against other fp16 kernels.
+      const float activation_max
+    ) : n_tile_rows(n_tile_rows), n_tile_cols(n_tile_cols), inptr(inptr),
+        ld_input_row(ld_input_row), ld_input_col(ld_input_col), outptr(outptr),
+        ld_output_row(ld_output_row), ld_output_col(ld_output_col),
+        params(params), min(activation_min), max(activation_max)
+    {
+    }
+  };
+
+  Args params_struct(
+    n_tile_rows, n_tile_cols,
+    inptr, ld_input_row, ld_input_col,
+    outptr, ld_output_row, ld_output_col,
+    params, activation_min, activation_max
+  );
+
+  __asm__ __volatile__(  // Enters SME streaming mode (SMSTART) for the whole kernel; .inst encodings cover SME2 ops the assembler may not accept mnemonically.
+    ".inst 0xd503477f  // SMSTART ZA\n"
+    "mov x2, #0x0\n"
+    "mov x3, #0x0\n"
+    "ptrue p3.b\n"
+    ".inst 0x25207810  // ptrue pn8.b\n"
+    "1:"  // Tile loop
+    "str x2, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+    "mov x22, #0x2\n"
+    "str x3, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+    "ldr x21, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
+    "ldr x4, [%x[params_struct], %[offsetof_args_ld_input_col]]\n"
+    "ldr x5, [%x[params_struct], %[offsetof_args_inptr]]\n"
+    "mul x20, x2, x21\n"  // offset = tile_i * ld_input_row
+    "ldr x6, [%x[params_struct], %[offsetof_args_params]]\n"
+    "madd x20, x3, x4, x20\n"  // offset += tile_j * ld_input_col
+    "mul x20, x20, x22\n"  // offset *= kernel_stride * output_size
+    "add x7, x4, x4\n"
+    "add x5, x5, x20, LSL #1\n"  // inptr[0] += offset * sizeof(__fp16)
+    "add x8, x5, x21, LSL #1\n"
+    "add x17, x7, x4\n"
+    "add x16, x8, x21, LSL #1\n"
+    "add x15, x17, x4\n"
+    "add x14, x16, x21, LSL #1\n"
+    "add x13, x15, x4\n"
+    "add x12, x14, x21, LSL #1\n"
+    "add x11, x12, x21, LSL #1\n"
+    "cbnz x3, 2f\n"
+    "ldr x25, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+    "lsl x10, %x[n_channels], #0x1\n"
+    "mov x21, #0x4\n"
+    "mul x21, x21, x4\n"
+    "add x9, x5, x4, LSL #1\n"
+    "add x28, x8, x4, LSL #1\n"
+    "add x27, x5, x7, LSL #1\n"
+    "sub x20, x25, x3\n"
+    "add x26, x8, x7, LSL #1\n"
+    "sub x20, x20, #0x1\n"
+    "add x25, x5, x17, LSL #1\n"
+    "and x20, x20, #0x3fffff\n"
+    "add x24, x5, x15, LSL #1\n"
+    "orr x10, x10, x20, LSL #22\n"
+    "add x23, x8, x13, LSL #1\n"
+    "orr x10, x10, x21, LSL #38\n"
+    "add x22, x8, x17, LSL #1\n"
+    "add x21, x8, x15, LSL #1\n"
+    "add x20, x5, x13, LSL #1\n"
+    ".inst 0xf8aa48ba  // rprfm pldonce, x10, [x5]\n"
+    ".inst 0xf8aa493a  // rprfm pldonce, x10, [x9]\n"
+    "add x9, x16, x4, LSL #1\n"
+    ".inst 0xf8aa491a  // rprfm pldonce, x10, [x8]\n"
+    ".inst 0xf8aa4b9a  // rprfm pldonce, x10, [x28]\n"
+    "add x28, x16, x7, LSL #1\n"
+    ".inst 0xf8aa4b7a  // rprfm pldonce, x10, [x27]\n"
+    "add x27, x16, x17, LSL #1\n"
+    ".inst 0xf8aa4b5a  // rprfm pldonce, x10, [x26]\n"
+    "add x26, x16, x15, LSL #1\n"
+    ".inst 0xf8aa4b3a  // rprfm pldonce, x10, [x25]\n"
+    "add x25, x16, x13, LSL #1\n"
+    ".inst 0xf8aa4b1a  // rprfm pldonce, x10, [x24]\n"
+    "add x24, x14, x4, LSL #1\n"
+    ".inst 0xf8aa4afa  // rprfm pldonce, x10, [x23]\n"
+    "add x23, x14, x7, LSL #1\n"
+    ".inst 0xf8aa4a1a  // rprfm pldonce, x10, [x16]\n"
+    ".inst 0xf8aa4ada  // rprfm pldonce, x10, [x22]\n"
+    "add x22, x14, x17, LSL #1\n"
+    ".inst 0xf8aa4aba  // rprfm pldonce, x10, [x21]\n"
+    "add x21, x14, x15, LSL #1\n"
+    ".inst 0xf8aa4a9a  // rprfm pldonce, x10, [x20]\n"
+    "add x20, x14, x13, LSL #1\n"
+    ".inst 0xf8aa493a  // rprfm pldonce, x10, [x9]\n"
+    "add x9, x12, x4, LSL #1\n"
+    ".inst 0xf8aa4b9a  // rprfm pldonce, x10, [x28]\n"
+    "add x28, x12, x7, LSL #1\n"
+    ".inst 0xf8aa4b7a  // rprfm pldonce, x10, [x27]\n"
+    "add x27, x12, x17, LSL #1\n"
+    ".inst 0xf8aa4b5a  // rprfm pldonce, x10, [x26]\n"
+    "add x26, x12, x15, LSL #1\n"
+    ".inst 0xf8aa4b3a  // rprfm pldonce, x10, [x25]\n"
+    "add x25, x12, x13, LSL #1\n"
+    ".inst 0xf8aa49da  // rprfm pldonce, x10, [x14]\n"
+    ".inst 0xf8aa4b1a  // rprfm pldonce, x10, [x24]\n"
+    "add x24, x11, x4, LSL #1\n"
+    ".inst 0xf8aa4afa  // rprfm pldonce, x10, [x23]\n"
+    "add x23, x11, x7, LSL #1\n"
+    ".inst 0xf8aa4ada  // rprfm pldonce, x10, [x22]\n"
+    "add x22, x11, x17, LSL #1\n"
+    ".inst 0xf8aa4aba  // rprfm pldonce, x10, [x21]\n"
+    "add x21, x11, x15, LSL #1\n"
+    ".inst 0xf8aa4a9a  // rprfm pldonce, x10, [x20]\n"
+    "add x20, x11, x13, LSL #1\n"
+    ".inst 0xf8aa499a  // rprfm pldonce, x10, [x12]\n"
+    ".inst 0xf8aa493a  // rprfm pldonce, x10, [x9]\n"
+    ".inst 0xf8aa4b9a  // rprfm pldonce, x10, [x28]\n"
+    ".inst 0xf8aa4b7a  // rprfm pldonce, x10, [x27]\n"
+    ".inst 0xf8aa4b5a  // rprfm pldonce, x10, [x26]\n"
+    ".inst 0xf8aa4b3a  // rprfm pldonce, x10, [x25]\n"
+    ".inst 0xf8aa497a  // rprfm pldonce, x10, [x11]\n"
+    ".inst 0xf8aa4b1a  // rprfm pldonce, x10, [x24]\n"
+    ".inst 0xf8aa4afa  // rprfm pldonce, x10, [x23]\n"
+    ".inst 0xf8aa4ada  // rprfm pldonce, x10, [x22]\n"
+    ".inst 0xf8aa4aba  // rprfm pldonce, x10, [x21]\n"
+    ".inst 0xf8aa4a9a  // rprfm pldonce, x10, [x20]\n"
+    "2:"  // Tile loop: Prefetch input rows: End
+    "ldr x27, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
+    "mov x26, #0x2\n"
+    "cnth x25\n"
+    "ld1h { z18.h }, p3/Z, [x6]\n"
+    "ldr x24, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
+    "addvl x6, x6, #1\n"
+    "whilelt p2.h, XZR, %x[n_channels]\n"
+    "ld1rh { z17.h }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+    "ldr x23, [%x[params_struct], %[offsetof_args_outptr]]\n"
+    ".inst 0xa040a0c0  // ld1h { z0.h-z3.h }, pn8.b/Z, [x6]\n"
+    "addvl x6, x6, #4\n"
+    "cmp x25, %x[n_channels]\n"
+    "mul x22, x2, x27\n"  // offset = tile_i * ld_output_row
+    "ld1rh { z16.h }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+    "mov x21, #0x0\n"
+    "madd x22, x3, x24, x22\n"  // offset += tile_j * ld_output_col
+    "sub x20, XZR, x25\n"
+    "ld1h { z4.h }, p3/Z, [x6]\n"
+    "mul x22, x22, x26\n"  // offset *= output_tile_size
+    "ld1h { z5.h }, p2/Z, [x5]\n"
+    "addvl x6, x6, #1\n"
+    "add x23, x23, x22, LSL #1\n"  // outptrs[0] += offset * sizeof(__fp16)
+    "ld1h { z6.h }, p2/Z, [x5, x4, LSL #1]\n"
+    "add x22, x23, x27, LSL #1\n"
+    "ld1h { z7.h }, p2/Z, [x8]\n"
+    "ld1h { z8.h }, p2/Z, [x8, x4, LSL #1]\n"
+    "ld1h { z9.h }, p2/Z, [x5, x7, LSL #1]\n"
+    "ld1h { z13.h }, p2/Z, [x8, x7, LSL #1]\n"
+    "ld1h { z11.h }, p2/Z, [x5, x17, LSL #1]\n"
+    "ld1h { z12.h }, p2/Z, [x5, x15, LSL #1]\n"
+    "ld1h { z10.h }, p2/Z, [x8, x13, LSL #1]\n"
+    "ld1h { z14.h }, p2/Z, [x16]\n"
+    "bge 4f\n"
+    "3:"  // Tile loop: Channel loop
+    "movprfx z28, z18\n fmla z28.h, p3/M, z0.h, z5.h\n"
+    "movprfx z29, z18\n fmla z29.h, p3/M, z0.h, z6.h\n"
+    "ld1h { z5.h }, p2/Z, [x8, x17, LSL #1]\n"
+    "whilelt p1.h, x25, %x[n_channels]\n"
+    "movprfx z30, z18\n fmla z30.h, p3/M, z0.h, z7.h\n"
+    "movprfx z31, z18\n fmla z31.h, p3/M, z0.h, z8.h\n"
+    "ld1h { z0.h }, p3/Z, [x6]\n"
+    "inch x21\n"
+    "inch x25\n"
+    "mov p0.b, p2.b\n"
+    "inch x20\n"
+    "fmla z28.h, p3/M, z1.h, z6.h\n"
+    "ld1h { z6.h }, p2/Z, [x8, x15, LSL #1]\n"
+    "addvl x8, x8, #1\n"
+    "fmla z29.h, p3/M, z1.h, z9.h\n"
+    "fmla z30.h, p3/M, z1.h, z8.h\n"
+    "fmla z31.h, p3/M, z1.h, z13.h\n"
+    "ld1h { z1.h }, p3/Z, [x6, #1, MUL VL]\n"
+    "fmla z28.h, p3/M, z2.h, z9.h\n"
+    "ld1h { z9.h }, p2/Z, [x5, x13, LSL #1]\n"
+    "addvl x5, x5, #1\n"
+    "fmla z29.h, p3/M, z2.h, z11.h\n"
+    "fmla z30.h, p3/M, z2.h, z13.h\n"
+    "fmla z31.h, p3/M, z2.h, z5.h\n"
+    "ld1h { z2.h }, p3/Z, [x6, #2, MUL VL]\n"
+    "fmla z28.h, p3/M, z3.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x16, x4, LSL #1]\n"
+    "fmla z29.h, p3/M, z3.h, z12.h\n"
+    "fmla z30.h, p3/M, z3.h, z5.h\n"
+    "fmla z31.h, p3/M, z3.h, z6.h\n"
+    "ld1h { z3.h }, p3/Z, [x6, #3, MUL VL]\n"
+    "fmla z28.h, p3/M, z4.h, z12.h\n"
+    "ld1h { z12.h }, p2/Z, [x16, x7, LSL #1]\n"
+    "fmla z29.h, p3/M, z4.h, z9.h\n"
+    "ld1h { z9.h }, p2/Z, [x16, x17, LSL #1]\n"
+    "fmla z30.h, p3/M, z4.h, z6.h\n"
+    "fmla z31.h, p3/M, z4.h, z10.h\n"
+    "ld1h { z4.h }, p3/Z, [x6, #4, MUL VL]\n"
+    "fmla z28.h, p3/M, z0.h, z7.h\n"
+    "ld1h { z7.h }, p1/Z, [x8]\n"
+    "fmla z29.h, p3/M, z0.h, z8.h\n"
+    "fmla z30.h, p3/M, z0.h, z14.h\n"
+    "fmla z31.h, p3/M, z0.h, z11.h\n"
+    "ld1h { z0.h }, p3/Z, [x6, #5, MUL VL]\n"
+    "fmla z28.h, p3/M, z1.h, z8.h\n"
+    "ld1h { z8.h }, p2/Z, [x16, x13, LSL #1]\n"
+    "fmla z29.h, p3/M, z1.h, z13.h\n"
+    "fmla z30.h, p3/M, z1.h, z11.h\n"
+    "fmla z31.h, p3/M, z1.h, z12.h\n"
+    "ld1h { z1.h }, p3/Z, [x6, #6, MUL VL]\n"
+    "fmla z28.h, p3/M, z2.h, z13.h\n"
+    "ld1h { z13.h }, p2/Z, [x16, x15, LSL #1]\n"
+    "addvl x16, x16, #1\n"
+    "fmla z29.h, p3/M, z2.h, z5.h\n"
+    "fmla z30.h, p3/M, z2.h, z12.h\n"
+    "fmla z31.h, p3/M, z2.h, z9.h\n"
+    "ld1h { z2.h }, p3/Z, [x6, #7, MUL VL]\n"
+    "addvl x6, x6, #16\n"
+    "ld1h { z18.h }, p3/Z, [x6, #4, MUL VL]\n"
+    "fmla z28.h, p3/M, z3.h, z5.h\n"
+    "ld1h { z5.h }, p2/Z, [x14]\n"
+    "fmla z29.h, p3/M, z3.h, z6.h\n"
+    "fmla z30.h, p3/M, z3.h, z9.h\n"
+    "fmla z31.h, p3/M, z3.h, z13.h\n"
+    "ld1h { z3.h }, p3/Z, [x6, #-8, MUL VL]\n"
+    "fmla z28.h, p3/M, z4.h, z6.h\n"
+    "ld1h { z6.h }, p2/Z, [x14, x4, LSL #1]\n"
+    "fmla z29.h, p3/M, z4.h, z10.h\n"
+    "ld1h { z10.h }, p2/Z, [x14, x7, LSL #1]\n"
+    "fmla z30.h, p3/M, z4.h, z13.h\n"
+    "fmla z31.h, p3/M, z4.h, z8.h\n"
+    "ld1h { z4.h }, p3/Z, [x6, #-7, MUL VL]\n"
+    "fmla z28.h, p3/M, z0.h, z14.h\n"
+    "ld1h { z14.h }, p2/Z, [x14, x13, LSL #1]\n"
+    "fmla z29.h, p3/M, z0.h, z11.h\n"
+    "fmla z30.h, p3/M, z0.h, z5.h\n"
+    "fmla z31.h, p3/M, z0.h, z6.h\n"
+    "ld1h { z0.h }, p3/Z, [x6, #-6, MUL VL]\n"
+    "fmla z28.h, p3/M, z1.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x14, x17, LSL #1]\n"
+    "fmla z29.h, p3/M, z1.h, z12.h\n"
+    "fmla z30.h, p3/M, z1.h, z6.h\n"
+    "fmla z31.h, p3/M, z1.h, z10.h\n"
+    "ld1h { z1.h }, p3/Z, [x6, #-5, MUL VL]\n"
+    "fmla z28.h, p3/M, z2.h, z12.h\n"
+    "ld1h { z12.h }, p2/Z, [x14, x15, LSL #1]\n"
+    "addvl x14, x14, #1\n"
+    "fmla z29.h, p3/M, z2.h, z9.h\n"
+    "fmla z30.h, p3/M, z2.h, z10.h\n"
+    "fmla z31.h, p3/M, z2.h, z11.h\n"
+    "ld1h { z2.h }, p3/Z, [x6, #-4, MUL VL]\n"
+    "fmla z28.h, p3/M, z3.h, z9.h\n"
+    "ld1h { z9.h }, p2/Z, [x12]\n"
+    "fmla z29.h, p3/M, z3.h, z13.h\n"
+    "fmla z30.h, p3/M, z3.h, z11.h\n"
+    "fmla z31.h, p3/M, z3.h, z12.h\n"
+    "ld1h { z3.h }, p3/Z, [x6, #-3, MUL VL]\n"
+    "fmla z28.h, p3/M, z4.h, z13.h\n"
+    "ld1h { z13.h }, p2/Z, [x12, x4, LSL #1]\n"
+    "fmla z29.h, p3/M, z4.h, z8.h\n"
+    "ld1h { z8.h }, p2/Z, [x12, x15, LSL #1]\n"
+    "fmla z30.h, p3/M, z4.h, z12.h\n"
+    "fmla z31.h, p3/M, z4.h, z14.h\n"
+    "ld1h { z4.h }, p3/Z, [x6, #-2, MUL VL]\n"
+    "fmla z28.h, p3/M, z0.h, z5.h\n"
+    "ld1h { z5.h }, p2/Z, [x12, x7, LSL #1]\n"
+    "fmla z29.h, p3/M, z0.h, z6.h\n"
+    "fmla z30.h, p3/M, z0.h, z9.h\n"
+    "fmla z31.h, p3/M, z0.h, z13.h\n"
+    "ld1h { z0.h }, p3/Z, [x6, #-1, MUL VL]\n"
+    "fmla z28.h, p3/M, z1.h, z6.h\n"
+    "ld1h { z6.h }, p2/Z, [x12, x17, LSL #1]\n"
+    "fmla z29.h, p3/M, z1.h, z10.h\n"
+    "fmla z30.h, p3/M, z1.h, z13.h\n"
+    "fmla z31.h, p3/M, z1.h, z5.h\n"
+    "ld1h { z1.h }, p3/Z, [x6]\n"
+    "fmla z28.h, p3/M, z2.h, z10.h\n"
+    "ld1h { z10.h }, p2/Z, [x12, x13, LSL #1]\n"
+    "addvl x12, x12, #1\n"
+    "fmla z29.h, p3/M, z2.h, z11.h\n"
+    "fmla z30.h, p3/M, z2.h, z5.h\n"
+    "fmla z31.h, p3/M, z2.h, z6.h\n"
+    "ld1h { z2.h }, p3/Z, [x6, #1, MUL VL]\n"
+    "fmla z28.h, p3/M, z3.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x11]\n"
+    "fmla z29.h, p3/M, z3.h, z12.h\n"
+    "fmla z30.h, p3/M, z3.h, z6.h\n"
+    "fmla z31.h, p3/M, z3.h, z8.h\n"
+    "ld1h { z3.h }, p3/Z, [x6, #2, MUL VL]\n"
+    "fmla z28.h, p3/M, z4.h, z12.h\n"
+    "ld1h { z12.h }, p2/Z, [x11, x4, LSL #1]\n"
+    "fmla z29.h, p3/M, z4.h, z14.h\n"
+    "ld1h { z14.h }, p1/Z, [x16]\n"
+    "fmla z30.h, p3/M, z4.h, z8.h\n"
+    "fmla z31.h, p3/M, z4.h, z10.h\n"
+    "ld1h { z4.h }, p3/Z, [x6, #3, MUL VL]\n"
+    "addvl x6, x6, #5\n"
+    "fmla z28.h, p3/M, z0.h, z9.h\n"
+    "ld1h { z9.h }, p2/Z, [x11, x7, LSL #1]\n"
+    "fmla z29.h, p3/M, z0.h, z13.h\n"
+    "fmla z30.h, p3/M, z0.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x11, x17, LSL #1]\n"
+    "fmla z31.h, p3/M, z0.h, z12.h\n"
+    "fmla z28.h, p3/M, z1.h, z13.h\n"
+    "ld1h { z13.h }, p1/Z, [x8, x7, LSL #1]\n"
+    "fmla z29.h, p3/M, z1.h, z5.h\n"
+    "fmla z30.h, p3/M, z1.h, z12.h\n"
+    "ld1h { z12.h }, p2/Z, [x11, x15, LSL #1]\n"
+    "fmla z31.h, p3/M, z1.h, z9.h\n"
+    "fmla z28.h, p3/M, z2.h, z5.h\n"
+    "ld1h { z5.h }, p1/Z, [x5]\n"
+    "fmla z29.h, p3/M, z2.h, z6.h\n"
+    "fmla z30.h, p3/M, z2.h, z9.h\n"
+    "ld1h { z9.h }, p2/Z, [x11, x13, LSL #1]\n"
+    "whilelt p2.h, x21, %x[n_channels]\n"
+    "cmp x25, %x[n_channels]\n"
+    "addvl x11, x11, #1\n"
+    "fmla z31.h, p3/M, z2.h, z11.h\n"
+    "fmla z28.h, p3/M, z3.h, z6.h\n"
+    "ld1h { z6.h }, p1/Z, [x5, x4, LSL #1]\n"
+    "fmla z29.h, p3/M, z3.h, z8.h\n"
+    "fmla z30.h, p3/M, z3.h, z11.h\n"
+    "ld1h { z11.h }, p1/Z, [x5, x17, LSL #1]\n"
+    "fmla z31.h, p3/M, z3.h, z12.h\n"
+    ".inst 0xa040a0c0  // ld1h { z0.h-z3.h }, pn8.b/Z, [x6]\n"
+    "addvl x6, x6, #4\n"
+    "fmla z28.h, p3/M, z4.h, z8.h\n"
+    "ld1h { z8.h }, p1/Z, [x8, x4, LSL #1]\n"
+    "fmla z29.h, p3/M, z4.h, z10.h\n"
+    "ld1h { z10.h }, p1/Z, [x8, x13, LSL #1]\n"
+    "fmla z30.h, p3/M, z4.h, z12.h\n"
+    "ld1h { z12.h }, p1/Z, [x5, x15, LSL #1]\n"
+    "fmla z31.h, p3/M, z4.h, z9.h\n"
+    "ld1h { z9.h }, p1/Z, [x5, x7, LSL #1]\n"
+    "ld1h { z4.h }, p3/Z, [x6]\n"
+    "addvl x6, x6, #1\n"
+    ".inst 0xc170ca3c  // fclamp { z28.h-z31.h }, z17.h, z16.h\n"
+    "st1h { z28.h }, p0, [x23]\n"
+    "st1h { z29.h }, p0, [x23, x24, LSL #1]\n"
+    "addvl x23, x23, #1\n"
+    "st1h { z30.h }, p0, [x22]\n"
+    "st1h { z31.h }, p0, [x22, x24, LSL #1]\n"
+    "addvl x22, x22, #1\n"
+    "blt 3b\n"
+    "4:"  // Tile loop: Channel tail
+    "movprfx z28, z18\n fmla z28.h, p3/M, z0.h, z5.h\n"
+    "movprfx z29, z18\n fmla z29.h, p3/M, z0.h, z6.h\n"
+    "ld1h { z5.h }, p2/Z, [x8, x17, LSL #1]\n"
+    "ldr x3, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+    "movprfx z30, z18\n fmla z30.h, p3/M, z0.h, z7.h\n"
+    "movprfx z31, z18\n fmla z31.h, p3/M, z0.h, z8.h\n"
+    "ld1h { z0.h }, p3/Z, [x6]\n"
+    "ldr x2, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+    "ldr x25, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+    "mov p0.b, p2.b\n"
+    "ldr x21, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
+    "add x3, x3, #0x1\n"
+    "fmla z28.h, p3/M, z1.h, z6.h\n"
+    "ld1h { z6.h }, p2/Z, [x8, x15, LSL #1]\n"
+    "fmla z29.h, p3/M, z1.h, z9.h\n"
+    "add x20, x2, #0x1\n"
+    "fmla z30.h, p3/M, z1.h, z8.h\n"
+    "fmla z31.h, p3/M, z1.h, z13.h\n"
+    "ld1h { z1.h }, p3/Z, [x6, #1, MUL VL]\n"
+    "cmp x3, x25\n"
+    "csel x2, x2, x20, LT\n"
+    "csel x3, x3, XZR, LT\n"
+    "cmp x2, x21\n"
+    "fmla z28.h, p3/M, z2.h, z9.h\n"
+    "ld1h { z9.h }, p2/Z, [x5, x13, LSL #1]\n"
+    "fmla z29.h, p3/M, z2.h, z11.h\n"
+    "fmla z30.h, p3/M, z2.h, z13.h\n"
+    "fmla z31.h, p3/M, z2.h, z5.h\n"
+    "ld1h { z2.h }, p3/Z, [x6, #2, MUL VL]\n"
+    "fmla z28.h, p3/M, z3.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x16, x4, LSL #1]\n"
+    "fmla z29.h, p3/M, z3.h, z12.h\n"
+    "fmla z30.h, p3/M, z3.h, z5.h\n"
+    "fmla z31.h, p3/M, z3.h, z6.h\n"
+    "ld1h { z3.h }, p3/Z, [x6, #3, MUL VL]\n"
+    "fmla z28.h, p3/M, z4.h, z12.h\n"
+    "ld1h { z12.h }, p2/Z, [x16, x7, LSL #1]\n"
+    "fmla z29.h, p3/M, z4.h, z9.h\n"
+    "ld1h { z9.h }, p2/Z, [x16, x17, LSL #1]\n"
+    "fmla z30.h, p3/M, z4.h, z6.h\n"
+    "fmla z31.h, p3/M, z4.h, z10.h\n"
+    "ld1h { z4.h }, p3/Z, [x6, #4, MUL VL]\n"
+    "fmla z28.h, p3/M, z0.h, z7.h\n"
+    "fmla z29.h, p3/M, z0.h, z8.h\n"
+    "fmla z30.h, p3/M, z0.h, z14.h\n"
+    "fmla z31.h, p3/M, z0.h, z11.h\n"
+    "ld1h { z0.h }, p3/Z, [x6, #5, MUL VL]\n"
+    "fmla z28.h, p3/M, z1.h, z8.h\n"
+    "ld1h { z8.h }, p2/Z, [x16, x13, LSL #1]\n"
+    "fmla z29.h, p3/M, z1.h, z13.h\n"
+    "fmla z30.h, p3/M, z1.h, z11.h\n"
+    "fmla z31.h, p3/M, z1.h, z12.h\n"
+    "ld1h { z1.h }, p3/Z, [x6, #6, MUL VL]\n"
+    "fmla z28.h, p3/M, z2.h, z13.h\n"
+    "ld1h { z13.h }, p2/Z, [x16, x15, LSL #1]\n"
+    "fmla z29.h, p3/M, z2.h, z5.h\n"
+    "fmla z30.h, p3/M, z2.h, z12.h\n"
+    "fmla z31.h, p3/M, z2.h, z9.h\n"
+    "ld1h { z2.h }, p3/Z, [x6, #7, MUL VL]\n"
+    "addvl x6, x6, #16\n"
+    "fmla z28.h, p3/M, z3.h, z5.h\n"
+    "ld1h { z5.h }, p2/Z, [x14]\n"
+    "fmla z29.h, p3/M, z3.h, z6.h\n"
+    "fmla z30.h, p3/M, z3.h, z9.h\n"
+    "fmla z31.h, p3/M, z3.h, z13.h\n"
+    "ld1h { z3.h }, p3/Z, [x6, #-8, MUL VL]\n"
+    "fmla z28.h, p3/M, z4.h, z6.h\n"
+    "ld1h { z6.h }, p2/Z, [x14, x4, LSL #1]\n"
+    "fmla z29.h, p3/M, z4.h, z10.h\n"
+    "ld1h { z10.h }, p2/Z, [x14, x7, LSL #1]\n"
+    "fmla z30.h, p3/M, z4.h, z13.h\n"
+    "fmla z31.h, p3/M, z4.h, z8.h\n"
+    "ld1h { z4.h }, p3/Z, [x6, #-7, MUL VL]\n"
+    "fmla z28.h, p3/M, z0.h, z14.h\n"
+    "ld1h { z14.h }, p2/Z, [x14, x13, LSL #1]\n"
+    "fmla z29.h, p3/M, z0.h, z11.h\n"
+    "fmla z30.h, p3/M, z0.h, z5.h\n"
+    "fmla z31.h, p3/M, z0.h, z6.h\n"
+    "ld1h { z0.h }, p3/Z, [x6, #-6, MUL VL]\n"
+    "fmla z28.h, p3/M, z1.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x14, x17, LSL #1]\n"
+    "fmla z29.h, p3/M, z1.h, z12.h\n"
+    "fmla z30.h, p3/M, z1.h, z6.h\n"
+    "fmla z31.h, p3/M, z1.h, z10.h\n"
+    "ld1h { z1.h }, p3/Z, [x6, #-5, MUL VL]\n"
+    "fmla z28.h, p3/M, z2.h, z12.h\n"
+    "ld1h { z12.h }, p2/Z, [x14, x15, LSL #1]\n"
+    "fmla z29.h, p3/M, z2.h, z9.h\n"
+    "fmla z30.h, p3/M, z2.h, z10.h\n"
+    "fmla z31.h, p3/M, z2.h, z11.h\n"
+    "ld1h { z2.h }, p3/Z, [x6, #-4, MUL VL]\n"
+    "fmla z28.h, p3/M, z3.h, z9.h\n"
+    "ld1h { z9.h }, p2/Z, [x12]\n"
+    "fmla z29.h, p3/M, z3.h, z13.h\n"
+    "fmla z30.h, p3/M, z3.h, z11.h\n"
+    "fmla z31.h, p3/M, z3.h, z12.h\n"
+    "ld1h { z3.h }, p3/Z, [x6, #-3, MUL VL]\n"
+    "fmla z28.h, p3/M, z4.h, z13.h\n"
+    "ld1h { z13.h }, p2/Z, [x12, x4, LSL #1]\n"
+    "fmla z29.h, p3/M, z4.h, z8.h\n"
+    "ld1h { z8.h }, p2/Z, [x12, x15, LSL #1]\n"
+    "fmla z30.h, p3/M, z4.h, z12.h\n"
+    "fmla z31.h, p3/M, z4.h, z14.h\n"
+    "ld1h { z4.h }, p3/Z, [x6, #-2, MUL VL]\n"
+    "fmla z28.h, p3/M, z0.h, z5.h\n"
+    "ld1h { z5.h }, p2/Z, [x12, x7, LSL #1]\n"
+    "fmla z29.h, p3/M, z0.h, z6.h\n"
+    "fmla z30.h, p3/M, z0.h, z9.h\n"
+    "fmla z31.h, p3/M, z0.h, z13.h\n"
+    "ld1h { z0.h }, p3/Z, [x6, #-1, MUL VL]\n"
+    "fmla z28.h, p3/M, z1.h, z6.h\n"
+    "ld1h { z6.h }, p2/Z, [x12, x17, LSL #1]\n"
+    "fmla z29.h, p3/M, z1.h, z10.h\n"
+    "fmla z30.h, p3/M, z1.h, z13.h\n"
+    "fmla z31.h, p3/M, z1.h, z5.h\n"
+    "ld1h { z1.h }, p3/Z, [x6]\n"
+    "fmla z28.h, p3/M, z2.h, z10.h\n"
+    "ld1h { z10.h }, p2/Z, [x12, x13, LSL #1]\n"
+    "addvl x12, x12, #1\n"
+    "fmla z29.h, p3/M, z2.h, z11.h\n"
+    "fmla z30.h, p3/M, z2.h, z5.h\n"
+    "fmla z31.h, p3/M, z2.h, z6.h\n"
+    "ld1h { z2.h }, p3/Z, [x6, #1, MUL VL]\n"
+    "fmla z28.h, p3/M, z3.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x11]\n"
+    "fmla z29.h, p3/M, z3.h, z12.h\n"
+    "fmla z30.h, p3/M, z3.h, z6.h\n"
+    "fmla z31.h, p3/M, z3.h, z8.h\n"
+    "ld1h { z3.h }, p3/Z, [x6, #2, MUL VL]\n"
+    "fmla z28.h, p3/M, z4.h, z12.h\n"
+    "ld1h { z12.h }, p2/Z, [x11, x4, LSL #1]\n"
+    "fmla z29.h, p3/M, z4.h, z14.h\n"
+    "fmla z30.h, p3/M, z4.h, z8.h\n"
+    "fmla z31.h, p3/M, z4.h, z10.h\n"
+    "ld1h { z4.h }, p3/Z, [x6, #3, MUL VL]\n"
+    "fmla z28.h, p3/M, z0.h, z9.h\n"
+    "ld1h { z9.h }, p2/Z, [x11, x7, LSL #1]\n"
+    "fmla z29.h, p3/M, z0.h, z13.h\n"
+    "fmla z30.h, p3/M, z0.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x11, x17, LSL #1]\n"
+    "fmla z31.h, p3/M, z0.h, z12.h\n"
+    "fmla z28.h, p3/M, z1.h, z13.h\n"
+    "fmla z29.h, p3/M, z1.h, z5.h\n"
+    "fmla z30.h, p3/M, z1.h, z12.h\n"
+    "ld1h { z12.h }, p2/Z, [x11, x15, LSL #1]\n"
+    "fmla z31.h, p3/M, z1.h, z9.h\n"
+    "fmla z28.h, p3/M, z2.h, z5.h\n"
+    "fmla z29.h, p3/M, z2.h, z6.h\n"
+    "fmla z30.h, p3/M, z2.h, z9.h\n"
+    "ld1h { z9.h }, p2/Z, [x11, x13, LSL #1]\n"
+    "fmla z31.h, p3/M, z2.h, z11.h\n"
+    "fmla z28.h, p3/M, z3.h, z6.h\n"
+    "fmla z29.h, p3/M, z3.h, z8.h\n"
+    "fmla z30.h, p3/M, z3.h, z11.h\n"
+    "fmla z31.h, p3/M, z3.h, z12.h\n"
+    "fmla z28.h, p3/M, z4.h, z8.h\n"
+    "fmla z29.h, p3/M, z4.h, z10.h\n"
+    "fmla z30.h, p3/M, z4.h, z12.h\n"
+    "fmla z31.h, p3/M, z4.h, z9.h\n"
+    ".inst 0xc170ca3c  // fclamp { z28.h-z31.h }, z17.h, z16.h\n"
+    "st1h { z28.h }, p0, [x23]\n"
+    "st1h { z29.h }, p0, [x23, x24, LSL #1]\n"
+    "st1h { z30.h }, p0, [x22]\n"
+    "st1h { z31.h }, p0, [x22, x24, LSL #1]\n"
+    "blt 1b\n"
+    ".inst 0xd503467f  // SMSTOP\n"
+    :  // no output operands; all communication is via the Args block in memory
+    : [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct)  // "I"-constrained offsetofs let the asm address Args fields directly
+    : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"  // full predicate/Z-register clobber set: streaming mode invalidates all SVE state
+  );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME2)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_indirect.cpp
new file mode 100644
index 0000000000..313036876e
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_indirect.cpp
@@ -0,0 +1,537 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(ARM_COMPUTE_ENABLE_SME2)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sme2_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_indirect_impl(
+ const __fp16 *const *const input_ptrs,
+ __fp16 *const *const outptrs,
+ const void *params,
+ unsigned int n_channels,
+ const __fp16 activation_min,
+ const __fp16 activation_max
+)
+{
+ struct Args
+ {
+ __fp16 *const *outptrs;
+ const void *params;
+ const __fp16 min, max;
+ const __fp16 *inptrs[36];
+
+ Args(
+ const __fp16 *const *const input_ptrs,
+ __fp16 *const *const outptrs,
+ const void *const params,
+ const __fp16 min,
+ const __fp16 max
+ ) : outptrs(outptrs), params(params), min(min), max(max)
+ {
+ inptrs[0] = input_ptrs[0];
+ inptrs[1] = input_ptrs[1];
+ inptrs[2] = input_ptrs[6];
+ inptrs[3] = input_ptrs[7];
+ inptrs[4] = input_ptrs[2];
+ inptrs[5] = input_ptrs[8];
+ inptrs[6] = input_ptrs[3];
+ inptrs[7] = input_ptrs[4];
+ inptrs[8] = input_ptrs[11];
+ inptrs[9] = input_ptrs[12];
+ inptrs[10] = input_ptrs[9];
+ inptrs[11] = input_ptrs[10];
+ inptrs[12] = input_ptrs[5];
+ inptrs[13] = input_ptrs[13];
+ inptrs[14] = input_ptrs[14];
+ inptrs[15] = input_ptrs[15];
+ inptrs[16] = input_ptrs[16];
+ inptrs[17] = input_ptrs[17];
+ inptrs[18] = input_ptrs[18];
+ inptrs[19] = input_ptrs[19];
+ inptrs[20] = input_ptrs[20];
+ inptrs[21] = input_ptrs[21];
+ inptrs[22] = input_ptrs[22];
+ inptrs[23] = input_ptrs[23];
+ inptrs[24] = input_ptrs[24];
+ inptrs[25] = input_ptrs[25];
+ inptrs[26] = input_ptrs[26];
+ inptrs[27] = input_ptrs[27];
+ inptrs[28] = input_ptrs[28];
+ inptrs[29] = input_ptrs[29];
+ inptrs[30] = input_ptrs[30];
+ inptrs[31] = input_ptrs[31];
+ inptrs[32] = input_ptrs[32];
+ inptrs[33] = input_ptrs[33];
+ inptrs[34] = input_ptrs[34];
+ inptrs[35] = input_ptrs[35];
+
+ }
+ };
+
+ Args params_struct(input_ptrs, outptrs, params,
+ activation_min, activation_max);
+
+ __asm__ __volatile__(
+ "ldr x20, [%x[params_struct], %[offsetof_args_outptrs]]\n"
+ "add x16, %x[params_struct], %[offsetof_Args_inptrs]\n"
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "mov x15, #0x0\n"
+ "ldr x14, [%x[params_struct], %[offsetof_args_params]]\n"
+ ".inst 0x25207810 // ptrue pn8.b\n"
+ "whilelt p3.h, XZR, %x[n_channels]\n"
+ "ptrue p2.b\n"
+ "cnth x13\n"
+ "ldp x12, x11, [x20, #0x0]\n"
+ "ldp x10, x9, [x20, #0x10]\n"
+ "cmp x13, %x[n_channels]\n"
+ "ld1rh { z18.h }, p2/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+ "sub x28, XZR, x13\n"
+ "ldp x27, x26, [x16, #0x0]\n"
+ "ld1h { z17.h }, p2/Z, [x14]\n"
+ "addvl x14, x14, #1\n"
+ "ldp x25, x24, [x16, #0x10]\n"
+ ".inst 0xa040a1c0 // ld1h { z0.h-z3.h }, pn8.b/Z, [x14]\n"
+ "addvl x14, x14, #4\n"
+ "ldp x23, x22, [x16, #0x20]\n"
+ "ld1rh { z16.h }, p2/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+ "ld1h { z5.h }, p3/Z, [x27, x15, LSL #1]\n"
+ "ldp x21, x20, [x16, #0x30]\n"
+ "ld1h { z6.h }, p3/Z, [x26, x15, LSL #1]\n"
+ "ldp x27, x26, [x16, #0x40]\n"
+ "ld1h { z4.h }, p2/Z, [x14]\n"
+ "addvl x14, x14, #1\n"
+ "ld1h { z7.h }, p3/Z, [x25, x15, LSL #1]\n"
+ "ld1h { z8.h }, p3/Z, [x24, x15, LSL #1]\n"
+ "ld1h { z9.h }, p3/Z, [x23, x15, LSL #1]\n"
+ "ld1h { z13.h }, p3/Z, [x22, x15, LSL #1]\n"
+ "ld1h { z11.h }, p3/Z, [x21, x15, LSL #1]\n"
+ "ld1h { z12.h }, p3/Z, [x20, x15, LSL #1]\n"
+ "ld1h { z10.h }, p3/Z, [x27, x15, LSL #1]\n"
+ "ld1h { z14.h }, p3/Z, [x26, x15, LSL #1]\n"
+ "bge 2f\n"
+ "1:" // Channel loop
+ "movprfx z28, z17\n fmla z28.h, p2/M, z0.h, z5.h\n"
+ "movprfx z29, z17\n fmla z29.h, p2/M, z0.h, z6.h\n"
+ "ldr x25, [x16, #0x50]\n"
+ "whilelt p1.h, x13, %x[n_channels]\n"
+ "movprfx z30, z17\n fmla z30.h, p2/M, z0.h, z7.h\n"
+ "movprfx z31, z17\n fmla z31.h, p2/M, z0.h, z8.h\n"
+ "ldr x24, [x16, #0x58]\n"
+ "ld1h { z0.h }, p2/Z, [x14]\n"
+ "ldr x23, [x16, #0x60]\n"
+ "inch x28\n"
+ "mov p0.b, p3.b\n"
+ "ld1h { z5.h }, p3/Z, [x25, x15, LSL #1]\n"
+ "ldr x22, [x16, #0x68]\n"
+ "fmla z28.h, p2/M, z1.h, z6.h\n"
+ "fmla z29.h, p2/M, z1.h, z9.h\n"
+ "ld1h { z6.h }, p3/Z, [x24, x15, LSL #1]\n"
+ "ldr x21, [x16, #0x70]\n"
+ "fmla z30.h, p2/M, z1.h, z8.h\n"
+ "fmla z31.h, p2/M, z1.h, z13.h\n"
+ "ld1h { z1.h }, p2/Z, [x14, #1, MUL VL]\n"
+ "ldr x20, [x16, #0x78]\n"
+ "ldr x27, [x16, #0x80]\n"
+ "fmla z28.h, p2/M, z2.h, z9.h\n"
+ "ld1h { z9.h }, p3/Z, [x23, x15, LSL #1]\n"
+ "fmla z29.h, p2/M, z2.h, z11.h\n"
+ "ldr x26, [x16, #0x88]\n"
+ "fmla z30.h, p2/M, z2.h, z13.h\n"
+ "fmla z31.h, p2/M, z2.h, z5.h\n"
+ "ld1h { z2.h }, p2/Z, [x14, #2, MUL VL]\n"
+ "ldr x25, [x16, #0x90]\n"
+ "ldr x24, [x16, #0x98]\n"
+ "fmla z28.h, p2/M, z3.h, z11.h\n"
+ "ld1h { z11.h }, p3/Z, [x22, x15, LSL #1]\n"
+ "fmla z29.h, p2/M, z3.h, z12.h\n"
+ "ldr x23, [x16, #0xa0]\n"
+ "fmla z30.h, p2/M, z3.h, z5.h\n"
+ "fmla z31.h, p2/M, z3.h, z6.h\n"
+ "ld1h { z3.h }, p2/Z, [x14, #3, MUL VL]\n"
+ "ldr x22, [x16, #0xa8]\n"
+ "fmla z28.h, p2/M, z4.h, z12.h\n"
+ "ld1h { z12.h }, p3/Z, [x21, x15, LSL #1]\n"
+ "ldr x21, [x16, #0xb0]\n"
+ "fmla z29.h, p2/M, z4.h, z9.h\n"
+ "ld1h { z9.h }, p3/Z, [x20, x15, LSL #1]\n"
+ "ldr x20, [x16, #0xb8]\n"
+ "fmla z30.h, p2/M, z4.h, z6.h\n"
+ "fmla z31.h, p2/M, z4.h, z10.h\n"
+ "ld1h { z4.h }, p2/Z, [x14, #4, MUL VL]\n"
+ "fmla z28.h, p2/M, z0.h, z7.h\n"
+ "fmla z29.h, p2/M, z0.h, z8.h\n"
+ "fmla z30.h, p2/M, z0.h, z14.h\n"
+ "fmla z31.h, p2/M, z0.h, z11.h\n"
+ "ld1h { z0.h }, p2/Z, [x14, #5, MUL VL]\n"
+ "fmla z28.h, p2/M, z1.h, z8.h\n"
+ "ld1h { z8.h }, p3/Z, [x26, x15, LSL #1]\n"
+ "ldr x26, [x16, #0xc8]\n"
+ "fmla z29.h, p2/M, z1.h, z13.h\n"
+ "fmla z30.h, p2/M, z1.h, z11.h\n"
+ "fmla z31.h, p2/M, z1.h, z12.h\n"
+ "ld1h { z1.h }, p2/Z, [x14, #6, MUL VL]\n"
+ "fmla z28.h, p2/M, z2.h, z13.h\n"
+ "ld1h { z13.h }, p3/Z, [x27, x15, LSL #1]\n"
+ "ldr x27, [x16, #0xc0]\n"
+ "fmla z29.h, p2/M, z2.h, z5.h\n"
+ "fmla z30.h, p2/M, z2.h, z12.h\n"
+ "fmla z31.h, p2/M, z2.h, z9.h\n"
+ "ld1h { z2.h }, p2/Z, [x14, #7, MUL VL]\n"
+ "addvl x14, x14, #16\n"
+ "ld1h { z17.h }, p2/Z, [x14, #4, MUL VL]\n"
+ "fmla z28.h, p2/M, z3.h, z5.h\n"
+ "ld1h { z5.h }, p3/Z, [x25, x15, LSL #1]\n"
+ "ldr x25, [x16, #0xd0]\n"
+ "fmla z29.h, p2/M, z3.h, z6.h\n"
+ "fmla z30.h, p2/M, z3.h, z9.h\n"
+ "fmla z31.h, p2/M, z3.h, z13.h\n"
+ "ld1h { z3.h }, p2/Z, [x14, #-8, MUL VL]\n"
+ "fmla z28.h, p2/M, z4.h, z6.h\n"
+ "ld1h { z6.h }, p3/Z, [x24, x15, LSL #1]\n"
+ "ldr x24, [x16, #0xd8]\n"
+ "fmla z29.h, p2/M, z4.h, z10.h\n"
+ "ld1h { z10.h }, p3/Z, [x23, x15, LSL #1]\n"
+ "ldr x23, [x16, #0xe0]\n"
+ "fmla z30.h, p2/M, z4.h, z13.h\n"
+ "fmla z31.h, p2/M, z4.h, z8.h\n"
+ "ld1h { z4.h }, p2/Z, [x14, #-7, MUL VL]\n"
+ "fmla z28.h, p2/M, z0.h, z14.h\n"
+ "ld1h { z14.h }, p3/Z, [x20, x15, LSL #1]\n"
+ "ldr x20, [x16, #0xf8]\n"
+ "fmla z29.h, p2/M, z0.h, z11.h\n"
+ "fmla z30.h, p2/M, z0.h, z5.h\n"
+ "fmla z31.h, p2/M, z0.h, z6.h\n"
+ "ld1h { z0.h }, p2/Z, [x14, #-6, MUL VL]\n"
+ "fmla z28.h, p2/M, z1.h, z11.h\n"
+ "ld1h { z11.h }, p3/Z, [x22, x15, LSL #1]\n"
+ "ldr x22, [x16, #0xe8]\n"
+ "fmla z29.h, p2/M, z1.h, z12.h\n"
+ "fmla z30.h, p2/M, z1.h, z6.h\n"
+ "fmla z31.h, p2/M, z1.h, z10.h\n"
+ "ld1h { z1.h }, p2/Z, [x14, #-5, MUL VL]\n"
+ "fmla z28.h, p2/M, z2.h, z12.h\n"
+ "ld1h { z12.h }, p3/Z, [x21, x15, LSL #1]\n"
+ "ldr x21, [x16, #0xf0]\n"
+ "fmla z29.h, p2/M, z2.h, z9.h\n"
+ "fmla z30.h, p2/M, z2.h, z10.h\n"
+ "fmla z31.h, p2/M, z2.h, z11.h\n"
+ "ld1h { z2.h }, p2/Z, [x14, #-4, MUL VL]\n"
+ "fmla z28.h, p2/M, z3.h, z9.h\n"
+ "ld1h { z9.h }, p3/Z, [x27, x15, LSL #1]\n"
+ "ldr x27, [x16, #0x100]\n"
+ "fmla z29.h, p2/M, z3.h, z13.h\n"
+ "fmla z30.h, p2/M, z3.h, z11.h\n"
+ "fmla z31.h, p2/M, z3.h, z12.h\n"
+ "ld1h { z3.h }, p2/Z, [x14, #-3, MUL VL]\n"
+ "fmla z28.h, p2/M, z4.h, z13.h\n"
+ "ld1h { z13.h }, p3/Z, [x26, x15, LSL #1]\n"
+ "ldr x26, [x16, #0x108]\n"
+ "fmla z29.h, p2/M, z4.h, z8.h\n"
+ "ld1h { z8.h }, p3/Z, [x23, x15, LSL #1]\n"
+ "fmla z30.h, p2/M, z4.h, z12.h\n"
+ "fmla z31.h, p2/M, z4.h, z14.h\n"
+ "ld1h { z4.h }, p2/Z, [x14, #-2, MUL VL]\n"
+ "fmla z28.h, p2/M, z0.h, z5.h\n"
+ "ld1h { z5.h }, p3/Z, [x25, x15, LSL #1]\n"
+ "ldr x25, [x16, #0x110]\n"
+ "fmla z29.h, p2/M, z0.h, z6.h\n"
+ "fmla z30.h, p2/M, z0.h, z9.h\n"
+ "fmla z31.h, p2/M, z0.h, z13.h\n"
+ "ld1h { z0.h }, p2/Z, [x14, #-1, MUL VL]\n"
+ "fmla z28.h, p2/M, z1.h, z6.h\n"
+ "ld1h { z6.h }, p3/Z, [x24, x15, LSL #1]\n"
+ "ldr x24, [x16, #0x118]\n"
+ "fmla z29.h, p2/M, z1.h, z10.h\n"
+ "fmla z30.h, p2/M, z1.h, z13.h\n"
+ "fmla z31.h, p2/M, z1.h, z5.h\n"
+ "ld1h { z1.h }, p2/Z, [x14]\n"
+ "fmla z28.h, p2/M, z2.h, z10.h\n"
+ "ld1h { z10.h }, p3/Z, [x22, x15, LSL #1]\n"
+ "fmla z29.h, p2/M, z2.h, z11.h\n"
+ "fmla z30.h, p2/M, z2.h, z5.h\n"
+ "fmla z31.h, p2/M, z2.h, z6.h\n"
+ "ld1h { z2.h }, p2/Z, [x14, #1, MUL VL]\n"
+ "fmla z28.h, p2/M, z3.h, z11.h\n"
+ "ld1h { z11.h }, p3/Z, [x21, x15, LSL #1]\n"
+ "fmla z29.h, p2/M, z3.h, z12.h\n"
+ "fmla z30.h, p2/M, z3.h, z6.h\n"
+ "fmla z31.h, p2/M, z3.h, z8.h\n"
+ "ld1h { z3.h }, p2/Z, [x14, #2, MUL VL]\n"
+ "fmla z28.h, p2/M, z4.h, z12.h\n"
+ "ld1h { z12.h }, p3/Z, [x20, x15, LSL #1]\n"
+ "fmla z29.h, p2/M, z4.h, z14.h\n"
+ "fmla z30.h, p2/M, z4.h, z8.h\n"
+ "fmla z31.h, p2/M, z4.h, z10.h\n"
+ "ld1h { z4.h }, p2/Z, [x14, #3, MUL VL]\n"
+ "addvl x14, x14, #5\n"
+ "fmla z28.h, p2/M, z0.h, z9.h\n"
+ "ld1h { z9.h }, p3/Z, [x27, x15, LSL #1]\n"
+ "fmla z29.h, p2/M, z0.h, z13.h\n"
+ "fmla z30.h, p2/M, z0.h, z11.h\n"
+ "ld1h { z11.h }, p3/Z, [x26, x15, LSL #1]\n"
+ "ldp x27, x26, [x16, #0x0]\n"
+ "fmla z31.h, p2/M, z0.h, z12.h\n"
+ "fmla z28.h, p2/M, z1.h, z13.h\n"
+ "fmla z29.h, p2/M, z1.h, z5.h\n"
+ "fmla z30.h, p2/M, z1.h, z12.h\n"
+ "ld1h { z12.h }, p3/Z, [x25, x15, LSL #1]\n"
+ "fmla z31.h, p2/M, z1.h, z9.h\n"
+ "fmla z28.h, p2/M, z2.h, z5.h\n"
+ "ld1h { z5.h }, p1/Z, [x27, x13, LSL #1]\n"
+ "fmla z29.h, p2/M, z2.h, z6.h\n"
+ "fmla z30.h, p2/M, z2.h, z9.h\n"
+ "ld1h { z9.h }, p3/Z, [x24, x15, LSL #1]\n"
+ "ldp x25, x24, [x16, #0x10]\n"
+ "ldp x23, x22, [x16, #0x20]\n"
+ "inch x15\n"
+ "ldp x21, x20, [x16, #0x30]\n"
+ "whilelt p3.h, x15, %x[n_channels]\n"
+ "fmla z31.h, p2/M, z2.h, z11.h\n"
+ "fmla z28.h, p2/M, z3.h, z6.h\n"
+ "ld1h { z6.h }, p1/Z, [x26, x13, LSL #1]\n"
+ "ldp x27, x26, [x16, #0x40]\n"
+ "fmla z29.h, p2/M, z3.h, z8.h\n"
+ "fmla z30.h, p2/M, z3.h, z11.h\n"
+ "ld1h { z7.h }, p1/Z, [x25, x13, LSL #1]\n"
+ "ld1h { z13.h }, p1/Z, [x22, x13, LSL #1]\n"
+ "fmla z31.h, p2/M, z3.h, z12.h\n"
+ "fmla z28.h, p2/M, z4.h, z8.h\n"
+ "ld1h { z8.h }, p1/Z, [x24, x13, LSL #1]\n"
+ "fmla z29.h, p2/M, z4.h, z10.h\n"
+ "fmla z30.h, p2/M, z4.h, z12.h\n"
+ "ld1h { z11.h }, p1/Z, [x21, x13, LSL #1]\n"
+ "ld1h { z12.h }, p1/Z, [x20, x13, LSL #1]\n"
+ "fmla z31.h, p2/M, z4.h, z9.h\n"
+ "ld1h { z9.h }, p1/Z, [x23, x13, LSL #1]\n"
+ "ld1h { z10.h }, p1/Z, [x27, x13, LSL #1]\n"
+ "ld1h { z14.h }, p1/Z, [x26, x13, LSL #1]\n"
+ "inch x13\n"
+ ".inst 0xa040a1c0 // ld1h { z0.h-z3.h }, pn8.b/Z, [x14]\n"
+ "addvl x14, x14, #4\n"
+ "cmp x13, %x[n_channels]\n"
+ ".inst 0xc170ca5c // fclamp { z28.h-z31.h }, z18.h, z16.h\n"
+ "ld1h { z4.h }, p2/Z, [x14]\n"
+ "addvl x14, x14, #1\n"
+ "st1h { z28.h }, p0, [x12, x28, LSL #1]\n"
+ "st1h { z29.h }, p0, [x11, x28, LSL #1]\n"
+ "st1h { z30.h }, p0, [x10, x28, LSL #1]\n"
+ "st1h { z31.h }, p0, [x9, x28, LSL #1]\n"
+ "blt 1b\n"
+ "2:" // Channel tail
+ "movprfx z28, z17\n fmla z28.h, p2/M, z0.h, z5.h\n"
+ "movprfx z29, z17\n fmla z29.h, p2/M, z0.h, z6.h\n"
+ "ldr x25, [x16, #0x50]\n"
+ "inch x28\n"
+ "movprfx z30, z17\n fmla z30.h, p2/M, z0.h, z7.h\n"
+ "movprfx z31, z17\n fmla z31.h, p2/M, z0.h, z8.h\n"
+ "ldr x24, [x16, #0x58]\n"
+ "ld1h { z0.h }, p2/Z, [x14]\n"
+ "ldr x23, [x16, #0x60]\n"
+ "mov p0.b, p3.b\n"
+ "ld1h { z5.h }, p3/Z, [x25, x15, LSL #1]\n"
+ "ldr x22, [x16, #0x68]\n"
+ "fmla z28.h, p2/M, z1.h, z6.h\n"
+ "fmla z29.h, p2/M, z1.h, z9.h\n"
+ "ld1h { z6.h }, p3/Z, [x24, x15, LSL #1]\n"
+ "ldr x21, [x16, #0x70]\n"
+ "fmla z30.h, p2/M, z1.h, z8.h\n"
+ "fmla z31.h, p2/M, z1.h, z13.h\n"
+ "ld1h { z1.h }, p2/Z, [x14, #1, MUL VL]\n"
+ "ldr x20, [x16, #0x78]\n"
+ "ldr x27, [x16, #0x80]\n"
+ "fmla z28.h, p2/M, z2.h, z9.h\n"
+ "ld1h { z9.h }, p3/Z, [x23, x15, LSL #1]\n"
+ "fmla z29.h, p2/M, z2.h, z11.h\n"
+ "ldr x26, [x16, #0x88]\n"
+ "fmla z30.h, p2/M, z2.h, z13.h\n"
+ "fmla z31.h, p2/M, z2.h, z5.h\n"
+ "ld1h { z2.h }, p2/Z, [x14, #2, MUL VL]\n"
+ "ldr x25, [x16, #0x90]\n"
+ "ldr x24, [x16, #0x98]\n"
+ "fmla z28.h, p2/M, z3.h, z11.h\n"
+ "ld1h { z11.h }, p3/Z, [x22, x15, LSL #1]\n"
+ "fmla z29.h, p2/M, z3.h, z12.h\n"
+ "ldr x23, [x16, #0xa0]\n"
+ "fmla z30.h, p2/M, z3.h, z5.h\n"
+ "fmla z31.h, p2/M, z3.h, z6.h\n"
+ "ld1h { z3.h }, p2/Z, [x14, #3, MUL VL]\n"
+ "ldr x22, [x16, #0xa8]\n"
+ "fmla z28.h, p2/M, z4.h, z12.h\n"
+ "ld1h { z12.h }, p3/Z, [x21, x15, LSL #1]\n"
+ "ldr x21, [x16, #0xb0]\n"
+ "fmla z29.h, p2/M, z4.h, z9.h\n"
+ "ld1h { z9.h }, p3/Z, [x20, x15, LSL #1]\n"
+ "ldr x20, [x16, #0xb8]\n"
+ "fmla z30.h, p2/M, z4.h, z6.h\n"
+ "fmla z31.h, p2/M, z4.h, z10.h\n"
+ "ld1h { z4.h }, p2/Z, [x14, #4, MUL VL]\n"
+ "fmla z28.h, p2/M, z0.h, z7.h\n"
+ "fmla z29.h, p2/M, z0.h, z8.h\n"
+ "fmla z30.h, p2/M, z0.h, z14.h\n"
+ "fmla z31.h, p2/M, z0.h, z11.h\n"
+ "ld1h { z0.h }, p2/Z, [x14, #5, MUL VL]\n"
+ "fmla z28.h, p2/M, z1.h, z8.h\n"
+ "ld1h { z8.h }, p3/Z, [x26, x15, LSL #1]\n"
+ "ldr x26, [x16, #0xc8]\n"
+ "fmla z29.h, p2/M, z1.h, z13.h\n"
+ "fmla z30.h, p2/M, z1.h, z11.h\n"
+ "fmla z31.h, p2/M, z1.h, z12.h\n"
+ "ld1h { z1.h }, p2/Z, [x14, #6, MUL VL]\n"
+ "fmla z28.h, p2/M, z2.h, z13.h\n"
+ "ld1h { z13.h }, p3/Z, [x27, x15, LSL #1]\n"
+ "ldr x27, [x16, #0xc0]\n"
+ "fmla z29.h, p2/M, z2.h, z5.h\n"
+ "fmla z30.h, p2/M, z2.h, z12.h\n"
+ "fmla z31.h, p2/M, z2.h, z9.h\n"
+ "ld1h { z2.h }, p2/Z, [x14, #7, MUL VL]\n"
+ "addvl x14, x14, #16\n"
+ "fmla z28.h, p2/M, z3.h, z5.h\n"
+ "ld1h { z5.h }, p3/Z, [x25, x15, LSL #1]\n"
+ "ldr x25, [x16, #0xd0]\n"
+ "fmla z29.h, p2/M, z3.h, z6.h\n"
+ "fmla z30.h, p2/M, z3.h, z9.h\n"
+ "fmla z31.h, p2/M, z3.h, z13.h\n"
+ "ld1h { z3.h }, p2/Z, [x14, #-8, MUL VL]\n"
+ "fmla z28.h, p2/M, z4.h, z6.h\n"
+ "ld1h { z6.h }, p3/Z, [x24, x15, LSL #1]\n"
+ "ldr x24, [x16, #0xd8]\n"
+ "fmla z29.h, p2/M, z4.h, z10.h\n"
+ "ld1h { z10.h }, p3/Z, [x23, x15, LSL #1]\n"
+ "ldr x23, [x16, #0xe0]\n"
+ "fmla z30.h, p2/M, z4.h, z13.h\n"
+ "fmla z31.h, p2/M, z4.h, z8.h\n"
+ "ld1h { z4.h }, p2/Z, [x14, #-7, MUL VL]\n"
+ "fmla z28.h, p2/M, z0.h, z14.h\n"
+ "ld1h { z14.h }, p3/Z, [x20, x15, LSL #1]\n"
+ "ldr x20, [x16, #0xf8]\n"
+ "fmla z29.h, p2/M, z0.h, z11.h\n"
+ "fmla z30.h, p2/M, z0.h, z5.h\n"
+ "fmla z31.h, p2/M, z0.h, z6.h\n"
+ "ld1h { z0.h }, p2/Z, [x14, #-6, MUL VL]\n"
+ "fmla z28.h, p2/M, z1.h, z11.h\n"
+ "ld1h { z11.h }, p3/Z, [x22, x15, LSL #1]\n"
+ "ldr x22, [x16, #0xe8]\n"
+ "fmla z29.h, p2/M, z1.h, z12.h\n"
+ "fmla z30.h, p2/M, z1.h, z6.h\n"
+ "fmla z31.h, p2/M, z1.h, z10.h\n"
+ "ld1h { z1.h }, p2/Z, [x14, #-5, MUL VL]\n"
+ "fmla z28.h, p2/M, z2.h, z12.h\n"
+ "ld1h { z12.h }, p3/Z, [x21, x15, LSL #1]\n"
+ "ldr x21, [x16, #0xf0]\n"
+ "fmla z29.h, p2/M, z2.h, z9.h\n"
+ "fmla z30.h, p2/M, z2.h, z10.h\n"
+ "fmla z31.h, p2/M, z2.h, z11.h\n"
+ "ld1h { z2.h }, p2/Z, [x14, #-4, MUL VL]\n"
+ "fmla z28.h, p2/M, z3.h, z9.h\n"
+ "ld1h { z9.h }, p3/Z, [x27, x15, LSL #1]\n"
+ "ldr x27, [x16, #0x100]\n"
+ "fmla z29.h, p2/M, z3.h, z13.h\n"
+ "fmla z30.h, p2/M, z3.h, z11.h\n"
+ "fmla z31.h, p2/M, z3.h, z12.h\n"
+ "ld1h { z3.h }, p2/Z, [x14, #-3, MUL VL]\n"
+ "fmla z28.h, p2/M, z4.h, z13.h\n"
+ "ld1h { z13.h }, p3/Z, [x26, x15, LSL #1]\n"
+ "ldr x26, [x16, #0x108]\n"
+ "fmla z29.h, p2/M, z4.h, z8.h\n"
+ "ld1h { z8.h }, p3/Z, [x23, x15, LSL #1]\n"
+ "fmla z30.h, p2/M, z4.h, z12.h\n"
+ "fmla z31.h, p2/M, z4.h, z14.h\n"
+ "ld1h { z4.h }, p2/Z, [x14, #-2, MUL VL]\n"
+ "fmla z28.h, p2/M, z0.h, z5.h\n"
+ "ld1h { z5.h }, p3/Z, [x25, x15, LSL #1]\n"
+ "ldr x25, [x16, #0x110]\n"
+ "fmla z29.h, p2/M, z0.h, z6.h\n"
+ "fmla z30.h, p2/M, z0.h, z9.h\n"
+ "fmla z31.h, p2/M, z0.h, z13.h\n"
+ "ld1h { z0.h }, p2/Z, [x14, #-1, MUL VL]\n"
+ "fmla z28.h, p2/M, z1.h, z6.h\n"
+ "ld1h { z6.h }, p3/Z, [x24, x15, LSL #1]\n"
+ "ldr x24, [x16, #0x118]\n"
+ "fmla z29.h, p2/M, z1.h, z10.h\n"
+ "fmla z30.h, p2/M, z1.h, z13.h\n"
+ "fmla z31.h, p2/M, z1.h, z5.h\n"
+ "ld1h { z1.h }, p2/Z, [x14]\n"
+ "fmla z28.h, p2/M, z2.h, z10.h\n"
+ "ld1h { z10.h }, p3/Z, [x22, x15, LSL #1]\n"
+ "fmla z29.h, p2/M, z2.h, z11.h\n"
+ "fmla z30.h, p2/M, z2.h, z5.h\n"
+ "fmla z31.h, p2/M, z2.h, z6.h\n"
+ "ld1h { z2.h }, p2/Z, [x14, #1, MUL VL]\n"
+ "fmla z28.h, p2/M, z3.h, z11.h\n"
+ "ld1h { z11.h }, p3/Z, [x21, x15, LSL #1]\n"
+ "fmla z29.h, p2/M, z3.h, z12.h\n"
+ "fmla z30.h, p2/M, z3.h, z6.h\n"
+ "fmla z31.h, p2/M, z3.h, z8.h\n"
+ "ld1h { z3.h }, p2/Z, [x14, #2, MUL VL]\n"
+ "fmla z28.h, p2/M, z4.h, z12.h\n"
+ "ld1h { z12.h }, p3/Z, [x20, x15, LSL #1]\n"
+ "fmla z29.h, p2/M, z4.h, z14.h\n"
+ "fmla z30.h, p2/M, z4.h, z8.h\n"
+ "fmla z31.h, p2/M, z4.h, z10.h\n"
+ "ld1h { z4.h }, p2/Z, [x14, #3, MUL VL]\n"
+ "fmla z28.h, p2/M, z0.h, z9.h\n"
+ "ld1h { z9.h }, p3/Z, [x27, x15, LSL #1]\n"
+ "fmla z29.h, p2/M, z0.h, z13.h\n"
+ "fmla z30.h, p2/M, z0.h, z11.h\n"
+ "ld1h { z11.h }, p3/Z, [x26, x15, LSL #1]\n"
+ "fmla z31.h, p2/M, z0.h, z12.h\n"
+ "fmla z28.h, p2/M, z1.h, z13.h\n"
+ "fmla z29.h, p2/M, z1.h, z5.h\n"
+ "fmla z30.h, p2/M, z1.h, z12.h\n"
+ "ld1h { z12.h }, p3/Z, [x25, x15, LSL #1]\n"
+ "fmla z31.h, p2/M, z1.h, z9.h\n"
+ "fmla z28.h, p2/M, z2.h, z5.h\n"
+ "fmla z29.h, p2/M, z2.h, z6.h\n"
+ "fmla z30.h, p2/M, z2.h, z9.h\n"
+ "ld1h { z9.h }, p3/Z, [x24, x15, LSL #1]\n"
+ "fmla z31.h, p2/M, z2.h, z11.h\n"
+ "fmla z28.h, p2/M, z3.h, z6.h\n"
+ "fmla z29.h, p2/M, z3.h, z8.h\n"
+ "fmla z30.h, p2/M, z3.h, z11.h\n"
+ "fmla z31.h, p2/M, z3.h, z12.h\n"
+ "fmla z28.h, p2/M, z4.h, z8.h\n"
+ "fmla z29.h, p2/M, z4.h, z10.h\n"
+ "fmla z30.h, p2/M, z4.h, z12.h\n"
+ "fmla z31.h, p2/M, z4.h, z9.h\n"
+ ".inst 0xc170ca5c // fclamp { z28.h-z31.h }, z18.h, z16.h\n"
+ "st1h { z28.h }, p0, [x12, x28, LSL #1]\n"
+ "st1h { z29.h }, p0, [x11, x28, LSL #1]\n"
+ "st1h { z30.h }, p0, [x10, x28, LSL #1]\n"
+ "st1h { z31.h }, p0, [x9, x28, LSL #1]\n"
+ ".inst 0xd503467f // SMSTOP\n"
+ :
+ : [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME2)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
new file mode 100644
index 0000000000..25d83f15c3
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(ARM_COMPUTE_ENABLE_SME2)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sme2_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_indirect_impl(const float *const *const input_ptrs, float *const *const outptrs, const void *params, unsigned int n_channels, const float activation_min, const float activation_max);
+void sme2_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl(const unsigned int n_tile_rows, const unsigned int n_tile_cols, const float *inptr, int64_t ld_input_row, int64_t ld_input_col, float *outptr, int64_t ld_output_row, int64_t ld_output_col, const void *params, unsigned int n_channels, const float activation_min, const float activation_max);
+
+// Strategy descriptor that plugs the SME2 fp32 NHWC 3x3, stride-1,
+// 2x2-output-tile MLA kernels (declared above) into the generic
+// depthwise-depthfirst driver.  It only carries the kernel geometry
+// constants and hands back the two assembly entry points; all actual
+// compute lives in the *_impl functions.
+class sme2_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy<float, float, float, float>
+{
+  private:
+  using Parent = DepthwiseDepthfirstStrategy<float, float, float, float>;
+  // Function pointers to the generated assembly kernels: the "indirect"
+  // variant consumes an array of input-row pointers, the "direct" variant
+  // walks a tiled region from strides (see the two declarations above).
+  Parent::IndirectKernelType m_indirect_kernel = sme2_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_indirect_impl;
+  Parent::DirectKernelType m_direct_kernel = sme2_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl;
+
+  public:
+  using return_type = float;
+  // Advertises that this kernel requires the SME vector-length model.
+  constexpr static auto vl_type = arm_gemm::VLType::SME;
+
+  // 3x3 convolution window.
+  constexpr static unsigned int kernel_rows = 3;
+  constexpr static unsigned int kernel_cols = 3;
+
+  // Unit stride in both spatial dimensions.
+  constexpr static unsigned int stride_rows = 1;
+  constexpr static unsigned int stride_cols = 1;
+
+  // Each kernel invocation produces a 2x2 block of output points.
+  constexpr static unsigned int output_rows = 2;
+  constexpr static unsigned int output_cols = 2;
+
+  // CPUInfo is unused here; the base class only needs the geometry.
+  sme2_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst(const CPUInfo *)
+  : Parent(output_rows, output_cols, kernel_rows, kernel_cols, stride_rows, stride_cols) {}
+
+  arm_gemm::VLType get_vl_type(void) const override { return vl_type; }
+
+  Parent::IndirectKernelType get_indirect_kernel() const override { return m_indirect_kernel; }
+  Parent::DirectKernelType get_direct_kernel() const override { return m_direct_kernel; }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME2)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp
new file mode 100644
index 0000000000..96cfd5e497
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp
@@ -0,0 +1,336 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(ARM_COMPUTE_ENABLE_SME2)
+
+namespace arm_conv {
+namespace depthwise {
+
+// Direct (strided, tile-walking) SME2 kernel: fp32 NHWC depthwise 3x3
+// convolution, stride 1, producing a 2x2 spatial output tile per tile-loop
+// iteration and vector-width channels per channel-loop iteration.
+//
+// The tile loop (label 1) iterates over all (tile_i, tile_j) in
+// n_tile_rows x n_tile_cols; the loop counters are kept in x4/x5 and
+// round-tripped through params_struct.tile_i/tile_j across iterations.
+// `params` points at the packed per-channel parameter buffer loaded via
+// x15; the load order (one vector, then nine) suggests one bias vector
+// followed by the nine 3x3 weight vectors per channel block — confirm
+// against the corresponding packing routine.  Outputs are clamped to
+// [activation_min, activation_max] (fclamp with z21/z14) before storing.
+void sme2_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl(
+  const unsigned int n_tile_rows,
+  const unsigned int n_tile_cols,
+  const float *inptr,
+  int64_t ld_input_row,
+  int64_t ld_input_col,
+  float *outptr,
+  int64_t ld_output_row,
+  int64_t ld_output_col,
+  const void *params,
+  unsigned int n_channels,
+  const float activation_min,
+  const float activation_max
+)
+{
+  // Argument block read by the asm via offsetof(...) "I" operands; its
+  // layout must stay in sync with the %[offsetof_args_*] references below.
+  // Note the int64_t stride arguments are stored as uint64_t members: the
+  // asm only ever loads them as raw 64-bit words for address arithmetic.
+  struct Args
+  {
+  const uint64_t n_tile_rows, n_tile_cols;
+  const float *inptr;
+  const uint64_t ld_input_row;
+  const uint64_t ld_input_col;
+  float *outptr;
+  const uint64_t ld_output_row;
+  const uint64_t ld_output_col;
+  const void *params;
+  const float min, max;
+
+  // Current tile coordinates; written back by the asm each tile iteration.
+  uint64_t tile_i = 0, tile_j = 0;
+
+  Args(
+  const unsigned int n_tile_rows,
+  const unsigned int n_tile_cols,
+  const float *inptr,
+  int64_t ld_input_row,
+  int64_t ld_input_col,
+  float *outptr,
+  int64_t ld_output_row,
+  int64_t ld_output_col,
+  const void *params,
+  const float activation_min,
+  const float activation_max
+  ) : n_tile_rows(n_tile_rows), n_tile_cols(n_tile_cols), inptr(inptr),
+  ld_input_row(ld_input_row), ld_input_col(ld_input_col), outptr(outptr),
+  ld_output_row(ld_output_row), ld_output_col(ld_output_col),
+  params(params), min(activation_min), max(activation_max)
+  {
+  }
+  };
+
+  Args params_struct(
+  n_tile_rows, n_tile_cols,
+  inptr, ld_input_row, ld_input_col,
+  outptr, ld_output_row, ld_output_col,
+  params, activation_min, activation_max
+  );
+
+  __asm__ __volatile__(
+  // Enter streaming SVE / ZA mode and establish the predicates; the .inst
+  // encodings are SME/SME2 instructions not accepted by older assemblers.
+  ".inst 0xd503477f // SMSTART ZA\n"
+  "ptrue p3.b\n"
+  ".inst 0x25207810 // ptrue pn8.b\n"
+  "mov x4, #0x0\n"
+  "mov x5, #0x0\n"
+  // Label 1: outer tile loop.  Computes the input base address for tile
+  // (x4, x5) = (tile_i, tile_j) and the four row pointers x7/x8/x17/x14.
+  "1:" // Tile loop
+  "str x4, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+  "mov x22, #0x2\n"
+  "str x5, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+  "ldr x21, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
+  "mul x20, x4, x21\n" // offset = tile_i * ld_input_row
+  "ldr x6, [%x[params_struct], %[offsetof_args_ld_input_col]]\n"
+  "madd x20, x5, x6, x20\n" // offset += tile_j * ld_input_col
+  "mul x20, x20, x22\n" // offset *= kernel_stride * output_size
+  "ldr x7, [%x[params_struct], %[offsetof_args_inptr]]\n"
+  "add x7, x7, x20, LSL #2\n" // inptr[0] += offset * sizeof(float)
+  "add x8, x7, x21, LSL #2\n"
+  "add x17, x8, x21, LSL #2\n"
+  "add x16, x6, x6\n"
+  "ldr x15, [%x[params_struct], %[offsetof_args_params]]\n"
+  "add x14, x17, x21, LSL #2\n"
+  "add x13, x16, x6\n"
+  // Only the first tile of each row (tile_j == 0) issues the prefetch
+  // block below; subsequent tiles skip straight to label 2.
+  "cbnz x5, 2f\n"
+  "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+  "sub x21, x20, x5\n"
+  "sub x21, x21, #0x1\n"
+  "lsl x12, %x[n_channels], #0x2\n"
+  "mov x20, #0x8\n"
+  "and x21, x21, #0x3fffff\n"
+  "mul x20, x20, x6\n"
+  // x12 packs length/count/stride fields for the rprfm range prefetches.
+  "orr x12, x12, x21, LSL #22\n"
+  "orr x12, x12, x20, LSL #38\n"
+  "add x11, x8, x6, LSL #2\n"
+  "add x10, x7, x13, LSL #2\n"
+  "add x9, x8, x16, LSL #2\n"
+  "add x28, x17, x6, LSL #2\n"
+  "add x27, x14, x13, LSL #2\n"
+  "add x26, x7, x6, LSL #2\n"
+  "add x25, x7, x16, LSL #2\n"
+  "add x24, x17, x16, LSL #2\n"
+  "add x23, x8, x13, LSL #2\n"
+  "add x22, x17, x13, LSL #2\n"
+  "add x21, x14, x6, LSL #2\n"
+  "add x20, x14, x16, LSL #2\n"
+  // Range prefetch (rprfm pldonce) of all 16 input positions of the
+  // 4x4 input patch covered by this row of tiles.
+  ".inst 0xf8ac497a // rprfm pldonce, x12, [x11]\n"
+  ".inst 0xf8ac48fa // rprfm pldonce, x12, [x7]\n"
+  ".inst 0xf8ac495a // rprfm pldonce, x12, [x10]\n"
+  ".inst 0xf8ac493a // rprfm pldonce, x12, [x9]\n"
+  ".inst 0xf8ac4b9a // rprfm pldonce, x12, [x28]\n"
+  ".inst 0xf8ac49da // rprfm pldonce, x12, [x14]\n"
+  ".inst 0xf8ac4b7a // rprfm pldonce, x12, [x27]\n"
+  ".inst 0xf8ac4b5a // rprfm pldonce, x12, [x26]\n"
+  ".inst 0xf8ac4b3a // rprfm pldonce, x12, [x25]\n"
+  ".inst 0xf8ac4b1a // rprfm pldonce, x12, [x24]\n"
+  ".inst 0xf8ac491a // rprfm pldonce, x12, [x8]\n"
+  ".inst 0xf8ac4afa // rprfm pldonce, x12, [x23]\n"
+  ".inst 0xf8ac4a3a // rprfm pldonce, x12, [x17]\n"
+  ".inst 0xf8ac4ada // rprfm pldonce, x12, [x22]\n"
+  ".inst 0xf8ac4aba // rprfm pldonce, x12, [x21]\n"
+  ".inst 0xf8ac4a9a // rprfm pldonce, x12, [x20]\n"
+  // Label 2: per-tile setup.  Computes the two output row pointers
+  // (x24, x22), loads min/max (z21/z14), the leading params vector (z22,
+  // presumably the bias) and the nine weight vectors z0-z8, and preloads
+  // the first five input vectors z9-z13 for the channel loop.
+  "2:" // Tile loop: Prefetch input rows: End
+  "ldr x22, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
+  "mul x21, x4, x22\n" // offset = tile_i * ld_output_row
+  "mov x20, #0x2\n"
+  "ld1w { z22.s }, p3/Z, [x15]\n"
+  "ldr x25, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
+  "madd x21, x5, x25, x21\n" // offset += tile_j * ld_output_col
+  "addvl x15, x15, #1\n"
+  ".inst 0xa040c1e0 // ld1w { z0.s-z3.s }, pn8.b/Z, [x15]\n"
+  "ldr x24, [%x[params_struct], %[offsetof_args_outptr]]\n"
+  "mul x21, x21, x20\n" // offset *= output_tile_size
+  "cntw x23\n"
+  "ld1rw { z21.s }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+  "addvl x15, x15, #4\n"
+  "add x24, x24, x21, LSL #2\n" // outptrs[0] += offset * sizeof(float)
+  ".inst 0xa040c1e4 // ld1w { z4.s-z7.s }, pn8.b/Z, [x15]\n"
+  "whilelt p2.s, XZR, %x[n_channels]\n"
+  "addvl x15, x15, #4\n"
+  "ld1rw { z14.s }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+  "cmp x23, %x[n_channels]\n"
+  "add x22, x24, x22, LSL #2\n"
+  "ld1w { z8.s }, p3/Z, [x15]\n"
+  "mov x21, #0x0\n"
+  "sub x20, XZR, x23\n"
+  "ld1w { z9.s }, p2/Z, [x8, x6, LSL #2]\n"
+  "ld1w { z10.s }, p2/Z, [x7]\n"
+  "addvl x15, x15, #1\n"
+  "ld1w { z11.s }, p2/Z, [x7, x13, LSL #2]\n"
+  "ld1w { z12.s }, p2/Z, [x8, x16, LSL #2]\n"
+  "ld1w { z13.s }, p2/Z, [x17, x6, LSL #2]\n"
+  // If a single vector covers all channels, go straight to the tail.
+  "bge 4f\n"
+  // Label 3: main channel loop.  z28-z31 accumulate the four output
+  // points of the 2x2 tile; input loads for the next iteration are
+  // interleaved with the FMLAs, then the results are clamped and stored.
+  "3:" // Tile loop: Channel loop
+  "movprfx z28, z22\n fmla z28.s, p3/M, z4.s, z9.s\n"
+  "movprfx z29, z22\n fmla z29.s, p3/M, z3.s, z9.s\n"
+  "whilelt p1.s, x23, %x[n_channels]\n"
+  "incw x21\n"
+  "movprfx z30, z22\n fmla z30.s, p3/M, z1.s, z9.s\n"
+  "movprfx z31, z22\n fmla z31.s, p3/M, z0.s, z9.s\n"
+  "ld1w { z18.s }, p2/Z, [x14]\n"
+  "incw x23\n"
+  "fmla z28.s, p3/M, z0.s, z10.s\n"
+  "fmla z29.s, p3/M, z2.s, z11.s\n"
+  "ld1w { z17.s }, p2/Z, [x14, x13, LSL #2]\n"
+  "mov p0.b, p2.b\n"
+  "fmla z30.s, p3/M, z2.s, z12.s\n"
+  "fmla z31.s, p3/M, z1.s, z12.s\n"
+  "ld1w { z16.s }, p2/Z, [x17, x16, LSL #2]\n"
+  "incw x20\n"
+  "fmla z28.s, p3/M, z5.s, z12.s\n"
+  "fmla z29.s, p3/M, z4.s, z12.s\n"
+  "ld1w { z11.s }, p2/Z, [x7, x6, LSL #2]\n"
+  "fmla z30.s, p3/M, z6.s, z18.s\n"
+  "fmla z31.s, p3/M, z3.s, z13.s\n"
+  "ld1w { z10.s }, p2/Z, [x7, x16, LSL #2]\n"
+  "addvl x7, x7, #1\n"
+  "fmla z28.s, p3/M, z7.s, z13.s\n"
+  "fmla z29.s, p3/M, z6.s, z13.s\n"
+  "ld1w { z22.s }, p3/Z, [x15]\n"
+  "addvl x15, x15, #1\n"
+  "fmla z30.s, p3/M, z4.s, z13.s\n"
+  "fmla z31.s, p3/M, z8.s, z17.s\n"
+  "ld1w { z9.s }, p2/Z, [x8]\n"
+  "fmla z28.s, p3/M, z1.s, z11.s\n"
+  "fmla z29.s, p3/M, z0.s, z11.s\n"
+  "ld1w { z19.s }, p2/Z, [x8, x13, LSL #2]\n"
+  "addvl x8, x8, #1\n"
+  "fmla z30.s, p3/M, z5.s, z16.s\n"
+  "fmla z31.s, p3/M, z4.s, z16.s\n"
+  "fmla z28.s, p3/M, z2.s, z10.s\n"
+  "fmla z29.s, p3/M, z1.s, z10.s\n"
+  "ld1w { z18.s }, p2/Z, [x17]\n"
+  "fmla z30.s, p3/M, z0.s, z9.s\n"
+  "fmla z31.s, p3/M, z2.s, z19.s\n"
+  "fmla z28.s, p3/M, z8.s, z16.s\n"
+  "fmla z29.s, p3/M, z7.s, z16.s\n"
+  "ld1w { z17.s }, p2/Z, [x17, x13, LSL #2]\n"
+  "addvl x17, x17, #1\n"
+  "fmla z30.s, p3/M, z3.s, z18.s\n"
+  "fmla z31.s, p3/M, z5.s, z17.s\n"
+  "ld1w { z13.s }, p1/Z, [x17, x6, LSL #2]\n"
+  "fmla z28.s, p3/M, z3.s, z9.s\n"
+  "ld1w { z16.s }, p2/Z, [x14, x6, LSL #2]\n"
+  "fmla z29.s, p3/M, z5.s, z19.s\n"
+  "fmla z30.s, p3/M, z7.s, z16.s\n"
+  "fmla z31.s, p3/M, z6.s, z16.s\n"
+  "ld1w { z16.s }, p2/Z, [x14, x16, LSL #2]\n"
+  "whilelt p2.s, x21, %x[n_channels]\n"
+  "fmla z28.s, p3/M, z6.s, z18.s\n"
+  "fmla z29.s, p3/M, z8.s, z17.s\n"
+  // Reload bias/weights for the next channel block (params are packed
+  // per channel block, so z22/z0-z8 advance with x15 each iteration).
+  ".inst 0xa040c1e0 // ld1w { z0.s-z3.s }, pn8.b/Z, [x15]\n"
+  "addvl x15, x15, #4\n"
+  "fmla z30.s, p3/M, z8.s, z16.s\n"
+  "fmla z31.s, p3/M, z7.s, z16.s\n"
+  ".inst 0xa040c1e4 // ld1w { z4.s-z7.s }, pn8.b/Z, [x15]\n"
+  "addvl x15, x15, #4\n"
+  "cmp x23, %x[n_channels]\n"
+  // Apply the activation clamp to all four accumulators at once.
+  ".inst 0xc1aecabc // fclamp { z28.s-z31.s }, z21.s, z14.s\n"
+  "addvl x14, x14, #1\n"
+  "ld1w { z9.s }, p1/Z, [x8, x6, LSL #2]\n"
+  "ld1w { z10.s }, p1/Z, [x7]\n"
+  "st1w { z28.s }, p0, [x24]\n"
+  "ld1w { z11.s }, p1/Z, [x7, x13, LSL #2]\n"
+  "st1w { z29.s }, p0, [x24, x25, LSL #2]\n"
+  "addvl x24, x24, #1\n"
+  "ld1w { z12.s }, p1/Z, [x8, x16, LSL #2]\n"
+  "st1w { z30.s }, p0, [x22]\n"
+  "st1w { z31.s }, p0, [x22, x25, LSL #2]\n"
+  "addvl x22, x22, #1\n"
+  "ld1w { z8.s }, p3/Z, [x15]\n"
+  "addvl x15, x15, #1\n"
+  "blt 3b\n"
+  // Label 4: channel tail (last, possibly partial, vector of channels).
+  // Also advances the tile coordinates: tile_j wraps to 0 at n_tile_cols
+  // and tile_i increments (csel pairs), then loops to label 1 while
+  // tile_i < n_tile_rows.
+  "4:" // Tile loop: Channel tail
+  "movprfx z24, z22\n fmla z24.s, p3/M, z4.s, z9.s\n"
+  "movprfx z25, z22\n fmla z25.s, p3/M, z3.s, z9.s\n"
+  "ldr x5, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+  "add x5, x5, #0x1\n"
+  "movprfx z26, z22\n fmla z26.s, p3/M, z1.s, z9.s\n"
+  "movprfx z27, z22\n fmla z27.s, p3/M, z0.s, z9.s\n"
+  "ld1w { z17.s }, p2/Z, [x14]\n"
+  "ldr x4, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+  "fmla z24.s, p3/M, z0.s, z10.s\n"
+  "fmla z25.s, p3/M, z2.s, z11.s\n"
+  "ld1w { z16.s }, p2/Z, [x14, x13, LSL #2]\n"
+  "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+  "fmla z26.s, p3/M, z2.s, z12.s\n"
+  "fmla z27.s, p3/M, z1.s, z12.s\n"
+  "ld1w { z20.s }, p2/Z, [x17, x16, LSL #2]\n"
+  "ldr x21, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
+  "fmla z24.s, p3/M, z5.s, z12.s\n"
+  "fmla z25.s, p3/M, z4.s, z12.s\n"
+  "ld1w { z18.s }, p2/Z, [x7, x6, LSL #2]\n"
+  "cmp x5, x20\n"
+  "fmla z26.s, p3/M, z6.s, z17.s\n"
+  "fmla z27.s, p3/M, z3.s, z13.s\n"
+  "ld1w { z17.s }, p2/Z, [x7, x16, LSL #2]\n"
+  "add x20, x4, #0x1\n"
+  "fmla z24.s, p3/M, z7.s, z13.s\n"
+  "fmla z25.s, p3/M, z6.s, z13.s\n"
+  "csel x4, x4, x20, LT\n"
+  "mov p0.b, p2.b\n"
+  "fmla z26.s, p3/M, z4.s, z13.s\n"
+  "fmla z27.s, p3/M, z8.s, z16.s\n"
+  "ld1w { z16.s }, p2/Z, [x8]\n"
+  "csel x5, x5, XZR, LT\n"
+  "fmla z24.s, p3/M, z1.s, z18.s\n"
+  "fmla z25.s, p3/M, z0.s, z18.s\n"
+  "ld1w { z19.s }, p2/Z, [x8, x13, LSL #2]\n"
+  "cmp x4, x21\n"
+  "fmla z26.s, p3/M, z5.s, z20.s\n"
+  "fmla z27.s, p3/M, z4.s, z20.s\n"
+  "fmla z24.s, p3/M, z2.s, z17.s\n"
+  "fmla z25.s, p3/M, z1.s, z17.s\n"
+  "ld1w { z18.s }, p2/Z, [x17]\n"
+  "fmla z26.s, p3/M, z0.s, z16.s\n"
+  "fmla z27.s, p3/M, z2.s, z19.s\n"
+  "fmla z24.s, p3/M, z8.s, z20.s\n"
+  "fmla z25.s, p3/M, z7.s, z20.s\n"
+  "ld1w { z17.s }, p2/Z, [x17, x13, LSL #2]\n"
+  "fmla z26.s, p3/M, z3.s, z18.s\n"
+  "fmla z27.s, p3/M, z5.s, z17.s\n"
+  "fmla z24.s, p3/M, z3.s, z16.s\n"
+  "ld1w { z16.s }, p2/Z, [x14, x6, LSL #2]\n"
+  "fmla z25.s, p3/M, z5.s, z19.s\n"
+  "fmla z26.s, p3/M, z7.s, z16.s\n"
+  "fmla z27.s, p3/M, z6.s, z16.s\n"
+  "ld1w { z16.s }, p2/Z, [x14, x16, LSL #2]\n"
+  "fmla z24.s, p3/M, z6.s, z18.s\n"
+  "fmla z25.s, p3/M, z8.s, z17.s\n"
+  "fmla z26.s, p3/M, z8.s, z16.s\n"
+  "fmla z27.s, p3/M, z7.s, z16.s\n"
+  ".inst 0xc1aecab8 // fclamp { z24.s-z27.s }, z21.s, z14.s\n"
+  "st1w { z24.s }, p0, [x24]\n"
+  "st1w { z25.s }, p0, [x24, x25, LSL #2]\n"
+  "st1w { z26.s }, p0, [x22]\n"
+  "st1w { z27.s }, p0, [x22, x25, LSL #2]\n"
+  "blt 1b\n"
+  // Leave streaming/ZA mode before returning to ordinary code.
+  ".inst 0xd503467f // SMSTOP\n"
+  :
+  : [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct)
+  : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+  );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME2)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp
new file mode 100644
index 0000000000..39f1b3635f
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp
@@ -0,0 +1,277 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(ARM_COMPUTE_ENABLE_SME2)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sme2_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_indirect_impl(
+ const float *const *const input_ptrs,
+ float *const *const outptrs,
+ const void *params,
+ unsigned int n_channels,
+ const float activation_min,
+ const float activation_max
+)
+{
+ struct Args
+ {
+ float *const *outptrs;
+ const void *params;
+ const float min, max;
+ const float *inptrs[16];
+
+ Args(
+ const float *const *const input_ptrs,
+ float *const *const outptrs,
+ const void *const params,
+ const float min,
+ const float max
+ ) : outptrs(outptrs), params(params), min(min), max(max)
+ {
+ inptrs[0] = input_ptrs[5];
+ inptrs[1] = input_ptrs[0];
+ inptrs[2] = input_ptrs[3];
+ inptrs[3] = input_ptrs[6];
+ inptrs[4] = input_ptrs[9];
+ inptrs[5] = input_ptrs[12];
+ inptrs[6] = input_ptrs[15];
+ inptrs[7] = input_ptrs[1];
+ inptrs[8] = input_ptrs[2];
+ inptrs[9] = input_ptrs[10];
+ inptrs[10] = input_ptrs[4];
+ inptrs[11] = input_ptrs[7];
+ inptrs[12] = input_ptrs[8];
+ inptrs[13] = input_ptrs[11];
+ inptrs[14] = input_ptrs[13];
+ inptrs[15] = input_ptrs[14];
+
+ }
+ };
+
+ Args params_struct(input_ptrs, outptrs, params,
+ activation_min, activation_max);
+
+ __asm__ __volatile__(
+ "ldr x20, [%x[params_struct], %[offsetof_args_outptrs]]\n"
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "add x15, %x[params_struct], %[offsetof_Args_inptrs]\n"
+ "ptrue p3.b\n"
+ "ldr x14, [%x[params_struct], %[offsetof_args_params]]\n"
+ ".inst 0x25207810 // ptrue pn8.b\n"
+ "ld1w { z23.s }, p3/Z, [x14]\n"
+ "addvl x14, x14, #1\n"
+ "ldp x13, x12, [x20, #0x0]\n"
+ "cntw x11\n"
+ ".inst 0xa040c1c0 // ld1w { z0.s-z3.s }, pn8.b/Z, [x14]\n"
+ "addvl x14, x14, #4\n"
+ "ldp x10, x9, [x20, #0x10]\n"
+ "mov x28, #0x0\n"
+ "whilelt p2.s, XZR, %x[n_channels]\n"
+ ".inst 0xa040c1c4 // ld1w { z4.s-z7.s }, pn8.b/Z, [x14]\n"
+ "ldp x24, x23, [x15, #0x0]\n"
+ "addvl x14, x14, #4\n"
+ "cmp x11, %x[n_channels]\n"
+ "ld1rw { z22.s }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+ "ldp x22, x21, [x15, #0x10]\n"
+ "ld1rw { z15.s }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+ "sub x27, XZR, x11\n"
+ "ldr x20, [x15, #0x20]\n"
+ "ld1w { z8.s }, p3/Z, [x14]\n"
+ "addvl x14, x14, #1\n"
+ "ld1w { z9.s }, p2/Z, [x24, x28, LSL #2]\n"
+ "ld1w { z10.s }, p2/Z, [x23, x28, LSL #2]\n"
+ "ld1w { z11.s }, p2/Z, [x22, x28, LSL #2]\n"
+ "ld1w { z12.s }, p2/Z, [x21, x28, LSL #2]\n"
+ "ld1w { z13.s }, p2/Z, [x20, x28, LSL #2]\n"
+ "bge 2f\n"
+ "1:" // Channel loop
+ "movprfx z28, z23\n fmla z28.s, p3/M, z4.s, z9.s\n"
+ "movprfx z29, z23\n fmla z29.s, p3/M, z3.s, z9.s\n"
+ "ldr x20, [x15, #0x28]\n"
+ "whilelt p1.s, x11, %x[n_channels]\n"
+ "movprfx z30, z23\n fmla z30.s, p3/M, z1.s, z9.s\n"
+ "movprfx z31, z23\n fmla z31.s, p3/M, z0.s, z9.s\n"
+ "ld1w { z19.s }, p2/Z, [x20, x28, LSL #2]\n"
+ "ldr x20, [x15, #0x30]\n"
+ "fmla z28.s, p3/M, z0.s, z10.s\n"
+ "fmla z29.s, p3/M, z2.s, z11.s\n"
+ "ldr x21, [x15, #0x38]\n"
+ "ld1w { z18.s }, p2/Z, [x20, x28, LSL #2]\n"
+ "fmla z30.s, p3/M, z2.s, z12.s\n"
+ "fmla z31.s, p3/M, z1.s, z12.s\n"
+ "ldr x20, [x15, #0x48]\n"
+ "ld1w { z17.s }, p2/Z, [x20, x28, LSL #2]\n"
+ "fmla z28.s, p3/M, z5.s, z12.s\n"
+ "fmla z29.s, p3/M, z4.s, z12.s\n"
+ "ld1w { z16.s }, p2/Z, [x21, x28, LSL #2]\n"
+ "ldr x20, [x15, #0x40]\n"
+ "fmla z30.s, p3/M, z6.s, z19.s\n"
+ "fmla z31.s, p3/M, z3.s, z13.s\n"
+ "ld1w { z25.s }, p2/Z, [x20, x28, LSL #2]\n"
+ "ldr x21, [x15, #0x50]\n"
+ "fmla z28.s, p3/M, z7.s, z13.s\n"
+ "fmla z29.s, p3/M, z6.s, z13.s\n"
+ "ldr x20, [x15, #0x58]\n"
+ "ld1w { z23.s }, p3/Z, [x14]\n"
+ "fmla z30.s, p3/M, z4.s, z13.s\n"
+ "fmla z31.s, p3/M, z8.s, z18.s\n"
+ "ld1w { z11.s }, p2/Z, [x21, x28, LSL #2]\n"
+ "ldr x21, [x15, #0x60]\n"
+ "fmla z28.s, p3/M, z1.s, z16.s\n"
+ "fmla z29.s, p3/M, z0.s, z16.s\n"
+ "ld1w { z19.s }, p2/Z, [x20, x28, LSL #2]\n"
+ "ldr x20, [x15, #0x68]\n"
+ "fmla z30.s, p3/M, z5.s, z17.s\n"
+ "fmla z31.s, p3/M, z4.s, z17.s\n"
+ "ldr x26, [x15, #0x70]\n"
+ "addvl x14, x14, #1\n"
+ "fmla z28.s, p3/M, z2.s, z25.s\n"
+ "fmla z29.s, p3/M, z1.s, z25.s\n"
+ "ld1w { z18.s }, p2/Z, [x21, x28, LSL #2]\n"
+ "ldr x25, [x15, #0x78]\n"
+ "fmla z30.s, p3/M, z0.s, z11.s\n"
+ "fmla z31.s, p3/M, z2.s, z19.s\n"
+ "ldp x24, x23, [x15, #0x0]\n"
+ "incw x27\n"
+ "fmla z28.s, p3/M, z8.s, z17.s\n"
+ "fmla z29.s, p3/M, z7.s, z17.s\n"
+ "ld1w { z17.s }, p2/Z, [x20, x28, LSL #2]\n"
+ "ldp x22, x21, [x15, #0x10]\n"
+ "fmla z30.s, p3/M, z3.s, z18.s\n"
+ "fmla z31.s, p3/M, z5.s, z17.s\n"
+ "ldr x20, [x15, #0x20]\n"
+ "ld1w { z13.s }, p1/Z, [x20, x11, LSL #2]\n"
+ "fmla z28.s, p3/M, z3.s, z11.s\n"
+ "ld1w { z16.s }, p2/Z, [x26, x28, LSL #2]\n"
+ "fmla z29.s, p3/M, z5.s, z19.s\n"
+ "mov p0.b, p2.b\n"
+ "fmla z30.s, p3/M, z7.s, z16.s\n"
+ "fmla z31.s, p3/M, z6.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x25, x28, LSL #2]\n"
+ "incw x28\n"
+ "fmla z28.s, p3/M, z6.s, z18.s\n"
+ "fmla z29.s, p3/M, z8.s, z17.s\n"
+ "ld1w { z9.s }, p1/Z, [x24, x11, LSL #2]\n"
+ "whilelt p2.s, x28, %x[n_channels]\n"
+ "fmla z30.s, p3/M, z8.s, z16.s\n"
+ "fmla z31.s, p3/M, z7.s, z16.s\n"
+ "ld1w { z10.s }, p1/Z, [x23, x11, LSL #2]\n"
+ "ld1w { z11.s }, p1/Z, [x22, x11, LSL #2]\n"
+ ".inst 0xc1afcadc // fclamp { z28.s-z31.s }, z22.s, z15.s\n"
+ "st1w { z28.s }, p0, [x13, x27, LSL #2]\n"
+ "ld1w { z12.s }, p1/Z, [x21, x11, LSL #2]\n"
+ "incw x11\n"
+ "cmp x11, %x[n_channels]\n"
+ "st1w { z29.s }, p0, [x12, x27, LSL #2]\n"
+ ".inst 0xa040c1c0 // ld1w { z0.s-z3.s }, pn8.b/Z, [x14]\n"
+ "addvl x14, x14, #4\n"
+ "st1w { z30.s }, p0, [x10, x27, LSL #2]\n"
+ ".inst 0xa040c1c4 // ld1w { z4.s-z7.s }, pn8.b/Z, [x14]\n"
+ "addvl x14, x14, #4\n"
+ "st1w { z31.s }, p0, [x9, x27, LSL #2]\n"
+ "ld1w { z8.s }, p3/Z, [x14]\n"
+ "addvl x14, x14, #1\n"
+ "blt 1b\n"
+ "2:" // Channel tail
+ "movprfx z28, z23\n fmla z28.s, p3/M, z4.s, z9.s\n"
+ "movprfx z29, z23\n fmla z29.s, p3/M, z3.s, z9.s\n"
+ "ldr x20, [x15, #0x28]\n"
+ "incw x27\n"
+ "movprfx z30, z23\n fmla z30.s, p3/M, z1.s, z9.s\n"
+ "movprfx z31, z23\n fmla z31.s, p3/M, z0.s, z9.s\n"
+ "ld1w { z17.s }, p2/Z, [x20, x28, LSL #2]\n"
+ "ldr x20, [x15, #0x30]\n"
+ "fmla z28.s, p3/M, z0.s, z10.s\n"
+ "fmla z29.s, p3/M, z2.s, z11.s\n"
+ "ldr x21, [x15, #0x38]\n"
+ "ld1w { z16.s }, p2/Z, [x20, x28, LSL #2]\n"
+ "fmla z30.s, p3/M, z2.s, z12.s\n"
+ "fmla z31.s, p3/M, z1.s, z12.s\n"
+ "ldr x20, [x15, #0x48]\n"
+ "ld1w { z20.s }, p2/Z, [x20, x28, LSL #2]\n"
+ "fmla z28.s, p3/M, z5.s, z12.s\n"
+ "fmla z29.s, p3/M, z4.s, z12.s\n"
+ "ld1w { z18.s }, p2/Z, [x21, x28, LSL #2]\n"
+ "ldr x20, [x15, #0x40]\n"
+ "fmla z30.s, p3/M, z6.s, z17.s\n"
+ "fmla z31.s, p3/M, z3.s, z13.s\n"
+ "ld1w { z17.s }, p2/Z, [x20, x28, LSL #2]\n"
+ "ldr x20, [x15, #0x50]\n"
+ "fmla z28.s, p3/M, z7.s, z13.s\n"
+ "fmla z29.s, p3/M, z6.s, z13.s\n"
+ "ldr x21, [x15, #0x58]\n"
+ "mov p0.b, p2.b\n"
+ "fmla z30.s, p3/M, z4.s, z13.s\n"
+ "fmla z31.s, p3/M, z8.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x20, x28, LSL #2]\n"
+ "ldr x20, [x15, #0x60]\n"
+ "fmla z28.s, p3/M, z1.s, z18.s\n"
+ "fmla z29.s, p3/M, z0.s, z18.s\n"
+ "ld1w { z19.s }, p2/Z, [x21, x28, LSL #2]\n"
+ "ldr x22, [x15, #0x68]\n"
+ "fmla z30.s, p3/M, z5.s, z20.s\n"
+ "fmla z31.s, p3/M, z4.s, z20.s\n"
+ "ldr x21, [x15, #0x70]\n"
+ "fmla z28.s, p3/M, z2.s, z17.s\n"
+ "fmla z29.s, p3/M, z1.s, z17.s\n"
+ "ld1w { z18.s }, p2/Z, [x20, x28, LSL #2]\n"
+ "ldr x20, [x15, #0x78]\n"
+ "fmla z30.s, p3/M, z0.s, z16.s\n"
+ "fmla z31.s, p3/M, z2.s, z19.s\n"
+ "fmla z28.s, p3/M, z8.s, z20.s\n"
+ "fmla z29.s, p3/M, z7.s, z20.s\n"
+ "ld1w { z17.s }, p2/Z, [x22, x28, LSL #2]\n"
+ "fmla z30.s, p3/M, z3.s, z18.s\n"
+ "fmla z31.s, p3/M, z5.s, z17.s\n"
+ "fmla z28.s, p3/M, z3.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x21, x28, LSL #2]\n"
+ "fmla z29.s, p3/M, z5.s, z19.s\n"
+ "fmla z30.s, p3/M, z7.s, z16.s\n"
+ "fmla z31.s, p3/M, z6.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x20, x28, LSL #2]\n"
+ "fmla z28.s, p3/M, z6.s, z18.s\n"
+ "fmla z29.s, p3/M, z8.s, z17.s\n"
+ "fmla z30.s, p3/M, z8.s, z16.s\n"
+ "fmla z31.s, p3/M, z7.s, z16.s\n"
+ ".inst 0xc1afcadc // fclamp { z28.s-z31.s }, z22.s, z15.s\n"
+ "st1w { z28.s }, p0, [x13, x27, LSL #2]\n"
+ "st1w { z29.s }, p0, [x12, x27, LSL #2]\n"
+ "st1w { z30.s }, p0, [x10, x27, LSL #2]\n"
+ "st1w { z31.s }, p0, [x9, x27, LSL #2]\n"
+ ".inst 0xd503467f // SMSTOP\n"
+ :
+ : [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME2)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp
new file mode 100644
index 0000000000..bd330dc21e
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(ARM_COMPUTE_ENABLE_SME2)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sme2_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(const float *const *const input_ptrs, float *const *const outptrs, const void *params, unsigned int n_channels, const float activation_min, const float activation_max);
+void sme2_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl(const unsigned int n_tile_rows, const unsigned int n_tile_cols, const float *inptr, int64_t ld_input_row, int64_t ld_input_col, float *outptr, int64_t ld_output_row, int64_t ld_output_col, const void *params, unsigned int n_channels, const float activation_min, const float activation_max);
+
+class sme2_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst : public DepthwiseDepthfirstStrategy<float, float, float, float>
+{
+ private:
+ using Parent = DepthwiseDepthfirstStrategy<float, float, float, float>;
+ Parent::IndirectKernelType m_indirect_kernel = sme2_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl;
+ Parent::DirectKernelType m_direct_kernel = sme2_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl;
+
+ public:
+ using return_type = float;
+ constexpr static auto vl_type = arm_gemm::VLType::SME;
+
+ constexpr static unsigned int kernel_rows = 3;
+ constexpr static unsigned int kernel_cols = 3;
+
+ constexpr static unsigned int stride_rows = 1;
+ constexpr static unsigned int stride_cols = 1;
+
+ constexpr static unsigned int output_rows = 3;
+ constexpr static unsigned int output_cols = 3;
+
+ sme2_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst(const CPUInfo *)
+ : Parent(output_rows, output_cols, kernel_rows, kernel_cols, stride_rows, stride_cols) {}
+
+ arm_gemm::VLType get_vl_type(void) const override { return vl_type; }
+
+ Parent::IndirectKernelType get_indirect_kernel() const override { return m_indirect_kernel; }
+ Parent::DirectKernelType get_direct_kernel() const override { return m_direct_kernel; }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME2)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp
new file mode 100644
index 0000000000..d15a3a8377
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp
@@ -0,0 +1,483 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(ARM_COMPUTE_ENABLE_SME2)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sme2_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl(
+ const unsigned int n_tile_rows,
+ const unsigned int n_tile_cols,
+ const float *inptr,
+ int64_t ld_input_row,
+ int64_t ld_input_col,
+ float *outptr,
+ int64_t ld_output_row,
+ int64_t ld_output_col,
+ const void *params,
+ unsigned int n_channels,
+ const float activation_min,
+ const float activation_max
+)
+{
+ struct Args
+ {
+ const uint64_t n_tile_rows, n_tile_cols;
+ const float *inptr;
+ const uint64_t ld_input_row;
+ const uint64_t ld_input_col;
+ float *outptr;
+ const uint64_t ld_output_row;
+ const uint64_t ld_output_col;
+ const void *params;
+ const float min, max;
+
+ uint64_t tile_i = 0, tile_j = 0;
+
+ Args(
+ const unsigned int n_tile_rows,
+ const unsigned int n_tile_cols,
+ const float *inptr,
+ int64_t ld_input_row,
+ int64_t ld_input_col,
+ float *outptr,
+ int64_t ld_output_row,
+ int64_t ld_output_col,
+ const void *params,
+ const float activation_min,
+ const float activation_max
+ ) : n_tile_rows(n_tile_rows), n_tile_cols(n_tile_cols), inptr(inptr),
+ ld_input_row(ld_input_row), ld_input_col(ld_input_col), outptr(outptr),
+ ld_output_row(ld_output_row), ld_output_col(ld_output_col),
+ params(params), min(activation_min), max(activation_max)
+ {
+ }
+ };
+
+ Args params_struct(
+ n_tile_rows, n_tile_cols,
+ inptr, ld_input_row, ld_input_col,
+ outptr, ld_output_row, ld_output_col,
+ params, activation_min, activation_max
+ );
+
+ __asm__ __volatile__(
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "ptrue p3.b\n"
+ ".inst 0x25207810 // ptrue pn8.b\n"
+ "mov x2, #0x0\n"
+ "mov x3, #0x0\n"
+ "1:" // Tile loop
+ "str x2, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "mov x22, #0x3\n"
+ "str x3, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "ldr x21, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
+ "mul x20, x2, x21\n" // offset = tile_i * ld_input_row
+ "ldr x4, [%x[params_struct], %[offsetof_args_ld_input_col]]\n"
+ "madd x20, x3, x4, x20\n" // offset += tile_j * ld_input_col
+ "mul x20, x20, x22\n" // offset *= kernel_stride * output_size
+ "ldr x5, [%x[params_struct], %[offsetof_args_inptr]]\n"
+ "add x5, x5, x20, LSL #2\n" // inptr[0] += offset * sizeof(float)
+ "add x6, x5, x21, LSL #2\n"
+ "add x7, x6, x21, LSL #2\n"
+ "add x8, x4, x4\n"
+ "ldr x17, [%x[params_struct], %[offsetof_args_params]]\n"
+ "add x16, x7, x21, LSL #2\n"
+ "add x15, x8, x4\n"
+ "add x14, x16, x21, LSL #2\n"
+ "add x13, x15, x4\n"
+ "cbnz x3, 2f\n"
+ "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+ "sub x21, x20, x3\n"
+ "sub x21, x21, #0x1\n"
+ "lsl x12, %x[n_channels], #0x2\n"
+ "mov x20, #0xc\n"
+ "and x21, x21, #0x3fffff\n"
+ "mul x20, x20, x4\n"
+ "orr x12, x12, x21, LSL #22\n"
+ "orr x12, x12, x20, LSL #38\n"
+ "add x27, x7, x8, LSL #2\n"
+ "add x26, x5, x13, LSL #2\n"
+ "add x25, x6, x8, LSL #2\n"
+ "add x24, x14, x13, LSL #2\n"
+ "add x23, x7, x4, LSL #2\n"
+ "add x22, x5, x4, LSL #2\n"
+ "add x21, x5, x15, LSL #2\n"
+ "add x20, x7, x15, LSL #2\n"
+ "add x11, x6, x13, LSL #2\n"
+ "add x10, x16, x8, LSL #2\n"
+ "add x9, x16, x13, LSL #2\n"
+ "add x28, x14, x4, LSL #2\n"
+ ".inst 0xf8ac4b7a // rprfm pldonce, x12, [x27]\n"
+ "add x27, x6, x4, LSL #2\n"
+ ".inst 0xf8ac48ba // rprfm pldonce, x12, [x5]\n"
+ ".inst 0xf8ac4b5a // rprfm pldonce, x12, [x26]\n"
+ "add x26, x6, x15, LSL #2\n"
+ ".inst 0xf8ac49da // rprfm pldonce, x12, [x14]\n"
+ ".inst 0xf8ac4b3a // rprfm pldonce, x12, [x25]\n"
+ "add x25, x14, x15, LSL #2\n"
+ ".inst 0xf8ac4b1a // rprfm pldonce, x12, [x24]\n"
+ "add x24, x16, x4, LSL #2\n"
+ ".inst 0xf8ac4afa // rprfm pldonce, x12, [x23]\n"
+ "add x23, x5, x8, LSL #2\n"
+ ".inst 0xf8ac4ada // rprfm pldonce, x12, [x22]\n"
+ "add x22, x16, x15, LSL #2\n"
+ ".inst 0xf8ac4aba // rprfm pldonce, x12, [x21]\n"
+ "add x21, x7, x13, LSL #2\n"
+ ".inst 0xf8ac4a9a // rprfm pldonce, x12, [x20]\n"
+ "add x20, x14, x8, LSL #2\n"
+ ".inst 0xf8ac48da // rprfm pldonce, x12, [x6]\n"
+ ".inst 0xf8ac497a // rprfm pldonce, x12, [x11]\n"
+ ".inst 0xf8ac4a1a // rprfm pldonce, x12, [x16]\n"
+ ".inst 0xf8ac495a // rprfm pldonce, x12, [x10]\n"
+ ".inst 0xf8ac493a // rprfm pldonce, x12, [x9]\n"
+ ".inst 0xf8ac4b9a // rprfm pldonce, x12, [x28]\n"
+ ".inst 0xf8ac4b7a // rprfm pldonce, x12, [x27]\n"
+ ".inst 0xf8ac4b5a // rprfm pldonce, x12, [x26]\n"
+ ".inst 0xf8ac4b3a // rprfm pldonce, x12, [x25]\n"
+ ".inst 0xf8ac4b1a // rprfm pldonce, x12, [x24]\n"
+ ".inst 0xf8ac4afa // rprfm pldonce, x12, [x23]\n"
+ ".inst 0xf8ac4ada // rprfm pldonce, x12, [x22]\n"
+ ".inst 0xf8ac48fa // rprfm pldonce, x12, [x7]\n"
+ ".inst 0xf8ac4aba // rprfm pldonce, x12, [x21]\n"
+ ".inst 0xf8ac4a9a // rprfm pldonce, x12, [x20]\n"
+ "2:" // Tile loop: Prefetch input rows: End
+ "ldr x22, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
+ "mul x21, x2, x22\n" // offset = tile_i * ld_output_row
+ "mov x20, #0x3\n"
+ "ld1w { z24.s }, p3/Z, [x17]\n"
+ "ldr x27, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
+ "madd x21, x3, x27, x21\n" // offset += tile_j * ld_output_col
+ "mul x21, x21, x20\n" // offset *= output_tile_size
+ "ld1rw { z26.s }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+ "ldr x26, [%x[params_struct], %[offsetof_args_outptr]]\n"
+ "addvl x17, x17, #1\n"
+ "add x26, x26, x21, LSL #2\n" // outptrs[0] += offset * sizeof(float)
+ ".inst 0xa040c220 // ld1w { z0.s-z3.s }, pn8.b/Z, [x17]\n"
+ "cntw x25\n"
+ "addvl x17, x17, #4\n"
+ ".inst 0xa040c224 // ld1w { z4.s-z7.s }, pn8.b/Z, [x17]\n"
+ "add x24, x26, x22, LSL #2\n"
+ "whilelt p2.s, XZR, %x[n_channels]\n"
+ "ld1rw { z14.s }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+ "addvl x17, x17, #4\n"
+ "cmp x25, %x[n_channels]\n"
+ "ld1w { z8.s }, p3/Z, [x17]\n"
+ "add x23, x24, x22, LSL #2\n"
+ "add x22, x27, x27\n"
+ "ld1w { z9.s }, p2/Z, [x7, x8, LSL #2]\n"
+ "mov x21, #0x0\n"
+ "sub x20, XZR, x25\n"
+ "ld1w { z10.s }, p2/Z, [x5]\n"
+ "ld1w { z11.s }, p2/Z, [x5, x13, LSL #2]\n"
+ "addvl x17, x17, #1\n"
+ "ld1w { z12.s }, p2/Z, [x14]\n"
+ "ld1w { z13.s }, p2/Z, [x6, x8, LSL #2]\n"
+ "bge 4f\n"
+ "3:" // Tile loop: Channel loop
+ "movprfx z28, z24\n fmla z28.s, p3/M, z7.s, z9.s\n"
+ "movprfx z27, z24\n fmla z27.s, p3/M, z8.s, z9.s\n"
+ "whilelt p1.s, x25, %x[n_channels]\n"
+ "incw x21\n"
+ "movprfx z29, z24\n fmla z29.s, p3/M, z6.s, z9.s\n"
+ "fmla z28.s, p3/M, z4.s, z13.s\n"
+ "incw x25\n"
+ "mov p0.b, p2.b\n"
+ "movprfx z30, z24\n fmla z30.s, p3/M, z5.s, z9.s\n"
+ "movprfx z31, z24\n fmla z31.s, p3/M, z4.s, z9.s\n"
+ "incw x20\n"
+ "movprfx z20, z24\n fmla z20.s, p3/M, z3.s, z9.s\n"
+ "fmla z27.s, p3/M, z0.s, z10.s\n"
+ "ld1w { z10.s }, p2/Z, [x7, x15, LSL #2]\n"
+ "fmla z29.s, p3/M, z2.s, z11.s\n"
+ "ld1w { z19.s }, p2/Z, [x7, x4, LSL #2]\n"
+ "movprfx z21, z24\n fmla z21.s, p3/M, z2.s, z9.s\n"
+ "fmla z28.s, p3/M, z6.s, z19.s\n"
+ "movprfx z23, z24\n fmla z23.s, p3/M, z0.s, z9.s\n"
+ "fmla z27.s, p3/M, z5.s, z13.s\n"
+ "fmla z29.s, p3/M, z3.s, z13.s\n"
+ "fmla z30.s, p3/M, z2.s, z13.s\n"
+ "fmla z31.s, p3/M, z1.s, z13.s\n"
+ "fmla z20.s, p3/M, z0.s, z13.s\n"
+ "ld1w { z18.s }, p2/Z, [x5, x4, LSL #2]\n"
+ "fmla z21.s, p3/M, z6.s, z12.s\n"
+ "ld1w { z15.s }, p2/Z, [x14, x13, LSL #2]\n"
+ "movprfx z22, z24\n fmla z22.s, p3/M, z1.s, z9.s\n"
+ "fmla z28.s, p3/M, z0.s, z18.s\n"
+ "fmla z23.s, p3/M, z8.s, z15.s\n"
+ "fmla z27.s, p3/M, z7.s, z19.s\n"
+ "ld1w { z16.s }, p2/Z, [x5, x15, LSL #2]\n"
+ "fmla z22.s, p3/M, z0.s, z19.s\n"
+ "fmla z30.s, p3/M, z4.s, z19.s\n"
+ "ld1w { z24.s }, p3/Z, [x17]\n"
+ "addvl x17, x17, #1\n"
+ "fmla z31.s, p3/M, z3.s, z19.s\n"
+ "fmla z21.s, p3/M, z1.s, z19.s\n"
+ "ld1w { z17.s }, p2/Z, [x6]\n"
+ "fmla z28.s, p3/M, z2.s, z16.s\n"
+ "fmla z29.s, p3/M, z1.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x16]\n"
+ "fmla z20.s, p3/M, z4.s, z10.s\n"
+ "fmla z27.s, p3/M, z1.s, z18.s\n"
+ "ld1w { z9.s }, p2/Z, [x6, x13, LSL #2]\n"
+ "fmla z22.s, p3/M, z2.s, z10.s\n"
+ "fmla z23.s, p3/M, z1.s, z10.s\n"
+ "fmla z28.s, p3/M, z8.s, z10.s\n"
+ "fmla z29.s, p3/M, z7.s, z10.s\n"
+ "fmla z31.s, p3/M, z5.s, z10.s\n"
+ "fmla z30.s, p3/M, z0.s, z17.s\n"
+ "ld1w { z19.s }, p2/Z, [x16, x8, LSL #2]\n"
+ "fmla z20.s, p3/M, z2.s, z9.s\n"
+ "fmla z21.s, p3/M, z3.s, z16.s\n"
+ "fmla z22.s, p3/M, z4.s, z19.s\n"
+ "fmla z23.s, p3/M, z3.s, z19.s\n"
+ "fmla z27.s, p3/M, z3.s, z17.s\n"
+ "fmla z29.s, p3/M, z5.s, z9.s\n"
+ "ld1w { z17.s }, p2/Z, [x16, x13, LSL #2]\n"
+ "fmla z30.s, p3/M, z6.s, z16.s\n"
+ "fmla z31.s, p3/M, z7.s, z19.s\n"
+ "ld1w { z16.s }, p2/Z, [x14, x4, LSL #2]\n"
+ "fmla z20.s, p3/M, z6.s, z19.s\n"
+ "fmla z21.s, p3/M, z5.s, z19.s\n"
+ "ld1w { z18.s }, p2/Z, [x6, x4, LSL #2]\n"
+ "fmla z23.s, p3/M, z5.s, z17.s\n"
+ "fmla z22.s, p3/M, z6.s, z16.s\n"
+ "fmla z30.s, p3/M, z8.s, z19.s\n"
+ "fmla z20.s, p3/M, z8.s, z17.s\n"
+ "ld1w { z17.s }, p2/Z, [x6, x15, LSL #2]\n"
+ "addvl x6, x6, #1\n"
+ "fmla z21.s, p3/M, z7.s, z16.s\n"
+ "fmla z28.s, p3/M, z3.s, z18.s\n"
+ "ld1w { z16.s }, p2/Z, [x14, x15, LSL #2]\n"
+ "fmla z31.s, p3/M, z0.s, z18.s\n"
+ "fmla z27.s, p3/M, z4.s, z18.s\n"
+ "fmla z22.s, p3/M, z8.s, z16.s\n"
+ "fmla z23.s, p3/M, z7.s, z16.s\n"
+ "ld1w { z19.s }, p2/Z, [x16, x15, LSL #2]\n"
+ "fmla z30.s, p3/M, z1.s, z18.s\n"
+ "fmla z28.s, p3/M, z5.s, z17.s\n"
+ "ld1w { z11.s }, p2/Z, [x16, x4, LSL #2]\n"
+ "addvl x16, x16, #1\n"
+ "fmla z29.s, p3/M, z4.s, z17.s\n"
+ "fmla z31.s, p3/M, z2.s, z17.s\n"
+ "fmla z20.s, p3/M, z1.s, z17.s\n"
+ "ld1w { z16.s }, p2/Z, [x5, x8, LSL #2]\n"
+ "fmla z21.s, p3/M, z4.s, z11.s\n"
+ "addvl x5, x5, #1\n"
+ "fmla z22.s, p3/M, z3.s, z11.s\n"
+ "fmla z27.s, p3/M, z2.s, z16.s\n"
+ "ld1w { z10.s }, p1/Z, [x5]\n"
+ "fmla z23.s, p3/M, z4.s, z19.s\n"
+ "fmla z30.s, p3/M, z7.s, z11.s\n"
+ "fmla z31.s, p3/M, z6.s, z11.s\n"
+ "fmla z28.s, p3/M, z1.s, z16.s\n"
+ "ld1w { z18.s }, p2/Z, [x7]\n"
+ "fmla z29.s, p3/M, z0.s, z16.s\n"
+ "ld1w { z17.s }, p2/Z, [x7, x13, LSL #2]\n"
+ "fmla z20.s, p3/M, z7.s, z19.s\n"
+ "addvl x7, x7, #1\n"
+ "fmla z22.s, p3/M, z5.s, z19.s\n"
+ "fmla z27.s, p3/M, z6.s, z18.s\n"
+ "ld1w { z9.s }, p1/Z, [x7, x8, LSL #2]\n"
+ "fmla z21.s, p3/M, z0.s, z18.s\n"
+ "fmla z23.s, p3/M, z2.s, z17.s\n"
+ "fmla z31.s, p3/M, z8.s, z19.s\n"
+ "ld1w { z16.s }, p2/Z, [x14, x8, LSL #2]\n"
+ "fmla z30.s, p3/M, z3.s, z18.s\n"
+ "whilelt p2.s, x21, %x[n_channels]\n"
+ "fmla z29.s, p3/M, z8.s, z17.s\n"
+ "fmla z20.s, p3/M, z5.s, z17.s\n"
+ ".inst 0xa040c220 // ld1w { z0.s-z3.s }, pn8.b/Z, [x17]\n"
+ "addvl x17, x17, #4\n"
+ "fmla z21.s, p3/M, z8.s, z16.s\n"
+ "fmla z22.s, p3/M, z7.s, z16.s\n"
+ "addvl x14, x14, #1\n"
+ "cmp x25, %x[n_channels]\n"
+ "fmla z23.s, p3/M, z6.s, z16.s\n"
+ "fmax z27.s, p3/M, z27.s, z26.s\n"
+ ".inst 0xa040c224 // ld1w { z4.s-z7.s }, pn8.b/Z, [x17]\n"
+ "addvl x17, x17, #4\n"
+ "fmin z27.s, p3/M, z27.s, z14.s\n"
+ ".inst 0xc1aecb5c // fclamp { z28.s-z31.s }, z26.s, z14.s\n"
+ "ld1w { z11.s }, p1/Z, [x5, x13, LSL #2]\n"
+ ".inst 0xc1aecb54 // fclamp { z20.s-z23.s }, z26.s, z14.s\n"
+ "ld1w { z12.s }, p1/Z, [x14]\n"
+ "st1w { z27.s }, p0, [x26]\n"
+ "ld1w { z13.s }, p1/Z, [x6, x8, LSL #2]\n"
+ "st1w { z28.s }, p0, [x26, x27, LSL #2]\n"
+ "st1w { z29.s }, p0, [x26, x22, LSL #2]\n"
+ "addvl x26, x26, #1\n"
+ "ld1w { z8.s }, p3/Z, [x17]\n"
+ "addvl x17, x17, #1\n"
+ "st1w { z30.s }, p0, [x24]\n"
+ "st1w { z31.s }, p0, [x24, x27, LSL #2]\n"
+ "st1w { z20.s }, p0, [x24, x22, LSL #2]\n"
+ "addvl x24, x24, #1\n"
+ "st1w { z21.s }, p0, [x23]\n"
+ "st1w { z22.s }, p0, [x23, x27, LSL #2]\n"
+ "st1w { z23.s }, p0, [x23, x22, LSL #2]\n"
+ "addvl x23, x23, #1\n"
+ "blt 3b\n"
+ "4:" // Tile loop: Channel tail
+ "movprfx z28, z24\n fmla z28.s, p3/M, z7.s, z9.s\n"
+ "movprfx z25, z24\n fmla z25.s, p3/M, z8.s, z9.s\n"
+ "ldr x3, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "add x3, x3, #0x1\n"
+ "movprfx z29, z24\n fmla z29.s, p3/M, z6.s, z9.s\n"
+ "fmla z28.s, p3/M, z4.s, z13.s\n"
+ "ldr x2, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "add x21, x2, #0x1\n"
+ "movprfx z30, z24\n fmla z30.s, p3/M, z5.s, z9.s\n"
+ "movprfx z31, z24\n fmla z31.s, p3/M, z4.s, z9.s\n"
+ "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+ "cmp x3, x20\n"
+ "movprfx z20, z24\n fmla z20.s, p3/M, z3.s, z9.s\n"
+ "fmla z25.s, p3/M, z0.s, z10.s\n"
+ "ld1w { z27.s }, p2/Z, [x7, x15, LSL #2]\n"
+ "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
+ "fmla z29.s, p3/M, z2.s, z11.s\n"
+ "ld1w { z17.s }, p2/Z, [x7, x4, LSL #2]\n"
+ "movprfx z21, z24\n fmla z21.s, p3/M, z2.s, z9.s\n"
+ "csel x2, x2, x21, LT\n"
+ "fmla z28.s, p3/M, z6.s, z17.s\n"
+ "movprfx z23, z24\n fmla z23.s, p3/M, z0.s, z9.s\n"
+ "mov p0.b, p2.b\n"
+ "csel x3, x3, XZR, LT\n"
+ "fmla z25.s, p3/M, z5.s, z13.s\n"
+ "fmla z29.s, p3/M, z3.s, z13.s\n"
+ "cmp x2, x20\n"
+ "fmla z30.s, p3/M, z2.s, z13.s\n"
+ "fmla z31.s, p3/M, z1.s, z13.s\n"
+ "fmla z20.s, p3/M, z0.s, z13.s\n"
+ "ld1w { z19.s }, p2/Z, [x5, x4, LSL #2]\n"
+ "fmla z21.s, p3/M, z6.s, z12.s\n"
+ "ld1w { z16.s }, p2/Z, [x14, x13, LSL #2]\n"
+ "movprfx z22, z24\n fmla z22.s, p3/M, z1.s, z9.s\n"
+ "fmla z28.s, p3/M, z0.s, z19.s\n"
+ "fmla z23.s, p3/M, z8.s, z16.s\n"
+ "fmla z25.s, p3/M, z7.s, z17.s\n"
+ "ld1w { z16.s }, p2/Z, [x5, x15, LSL #2]\n"
+ "fmla z22.s, p3/M, z0.s, z17.s\n"
+ "fmla z30.s, p3/M, z4.s, z17.s\n"
+ "fmla z31.s, p3/M, z3.s, z17.s\n"
+ "fmla z21.s, p3/M, z1.s, z17.s\n"
+ "ld1w { z18.s }, p2/Z, [x6]\n"
+ "fmla z28.s, p3/M, z2.s, z16.s\n"
+ "fmla z29.s, p3/M, z1.s, z16.s\n"
+ "ld1w { z17.s }, p2/Z, [x16]\n"
+ "fmla z20.s, p3/M, z4.s, z27.s\n"
+ "fmla z25.s, p3/M, z1.s, z19.s\n"
+ "ld1w { z16.s }, p2/Z, [x6, x13, LSL #2]\n"
+ "fmla z22.s, p3/M, z2.s, z27.s\n"
+ "fmla z23.s, p3/M, z1.s, z27.s\n"
+ "fmla z28.s, p3/M, z8.s, z27.s\n"
+ "fmla z29.s, p3/M, z7.s, z27.s\n"
+ "fmla z31.s, p3/M, z5.s, z27.s\n"
+ "fmla z30.s, p3/M, z0.s, z18.s\n"
+ "ld1w { z19.s }, p2/Z, [x16, x8, LSL #2]\n"
+ "fmla z20.s, p3/M, z2.s, z16.s\n"
+ "fmla z21.s, p3/M, z3.s, z17.s\n"
+ "fmla z22.s, p3/M, z4.s, z19.s\n"
+ "fmla z23.s, p3/M, z3.s, z19.s\n"
+ "fmla z25.s, p3/M, z3.s, z18.s\n"
+ "fmla z29.s, p3/M, z5.s, z16.s\n"
+ "ld1w { z18.s }, p2/Z, [x16, x13, LSL #2]\n"
+ "fmla z30.s, p3/M, z6.s, z17.s\n"
+ "fmla z31.s, p3/M, z7.s, z19.s\n"
+ "ld1w { z16.s }, p2/Z, [x14, x4, LSL #2]\n"
+ "fmla z20.s, p3/M, z6.s, z19.s\n"
+ "fmla z21.s, p3/M, z5.s, z19.s\n"
+ "ld1w { z17.s }, p2/Z, [x6, x4, LSL #2]\n"
+ "fmla z23.s, p3/M, z5.s, z18.s\n"
+ "fmla z22.s, p3/M, z6.s, z16.s\n"
+ "fmla z30.s, p3/M, z8.s, z19.s\n"
+ "fmla z20.s, p3/M, z8.s, z18.s\n"
+ "ld1w { z18.s }, p2/Z, [x6, x15, LSL #2]\n"
+ "fmla z21.s, p3/M, z7.s, z16.s\n"
+ "fmla z28.s, p3/M, z3.s, z17.s\n"
+ "ld1w { z16.s }, p2/Z, [x14, x15, LSL #2]\n"
+ "fmla z31.s, p3/M, z0.s, z17.s\n"
+ "fmla z25.s, p3/M, z4.s, z17.s\n"
+ "fmla z22.s, p3/M, z8.s, z16.s\n"
+ "fmla z23.s, p3/M, z7.s, z16.s\n"
+ "ld1w { z19.s }, p2/Z, [x16, x15, LSL #2]\n"
+ "fmla z30.s, p3/M, z1.s, z17.s\n"
+ "fmla z28.s, p3/M, z5.s, z18.s\n"
+ "ld1w { z17.s }, p2/Z, [x16, x4, LSL #2]\n"
+ "fmla z29.s, p3/M, z4.s, z18.s\n"
+ "fmla z31.s, p3/M, z2.s, z18.s\n"
+ "fmla z20.s, p3/M, z1.s, z18.s\n"
+ "ld1w { z16.s }, p2/Z, [x5, x8, LSL #2]\n"
+ "fmla z21.s, p3/M, z4.s, z17.s\n"
+ "fmla z22.s, p3/M, z3.s, z17.s\n"
+ "fmla z25.s, p3/M, z2.s, z16.s\n"
+ "fmla z23.s, p3/M, z4.s, z19.s\n"
+ "fmla z30.s, p3/M, z7.s, z17.s\n"
+ "fmla z31.s, p3/M, z6.s, z17.s\n"
+ "fmla z28.s, p3/M, z1.s, z16.s\n"
+ "ld1w { z18.s }, p2/Z, [x7]\n"
+ "fmla z29.s, p3/M, z0.s, z16.s\n"
+ "ld1w { z17.s }, p2/Z, [x7, x13, LSL #2]\n"
+ "fmla z20.s, p3/M, z7.s, z19.s\n"
+ "fmla z22.s, p3/M, z5.s, z19.s\n"
+ "fmla z25.s, p3/M, z6.s, z18.s\n"
+ "fmla z21.s, p3/M, z0.s, z18.s\n"
+ "fmla z23.s, p3/M, z2.s, z17.s\n"
+ "fmla z31.s, p3/M, z8.s, z19.s\n"
+ "ld1w { z16.s }, p2/Z, [x14, x8, LSL #2]\n"
+ "fmla z30.s, p3/M, z3.s, z18.s\n"
+ "fmla z29.s, p3/M, z8.s, z17.s\n"
+ "fmla z20.s, p3/M, z5.s, z17.s\n"
+ "fmla z21.s, p3/M, z8.s, z16.s\n"
+ "fmla z22.s, p3/M, z7.s, z16.s\n"
+ "fmla z23.s, p3/M, z6.s, z16.s\n"
+ "fmax z25.s, p3/M, z25.s, z26.s\n"
+ "fmin z25.s, p3/M, z25.s, z14.s\n"
+ ".inst 0xc1aecb5c // fclamp { z28.s-z31.s }, z26.s, z14.s\n"
+ "st1w { z25.s }, p0, [x26]\n"
+ ".inst 0xc1aecb54 // fclamp { z20.s-z23.s }, z26.s, z14.s\n"
+ "st1w { z28.s }, p0, [x26, x27, LSL #2]\n"
+ "st1w { z29.s }, p0, [x26, x22, LSL #2]\n"
+ "st1w { z30.s }, p0, [x24]\n"
+ "st1w { z31.s }, p0, [x24, x27, LSL #2]\n"
+ "st1w { z20.s }, p0, [x24, x22, LSL #2]\n"
+ "st1w { z21.s }, p0, [x23]\n"
+ "st1w { z22.s }, p0, [x23, x27, LSL #2]\n"
+ "st1w { z23.s }, p0, [x23, x22, LSL #2]\n"
+ "blt 1b\n"
+ ".inst 0xd503467f // SMSTOP\n"
+ :
+ : [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME2)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp
new file mode 100644
index 0000000000..2c868b6cf3
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp
@@ -0,0 +1,444 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(ARM_COMPUTE_ENABLE_SME2)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sme2_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(
+ const float *const *const input_ptrs,
+ float *const *const outptrs,
+ const void *params,
+ unsigned int n_channels,
+ const float activation_min,
+ const float activation_max
+)
+{
+ struct Args
+ {
+ float *const *outptrs;
+ const void *params;
+ const float min, max;
+ const float *inptrs[25];
+
+ Args(
+ const float *const *const input_ptrs,
+ float *const *const outptrs,
+ const void *const params,
+ const float min,
+ const float max
+ ) : outptrs(outptrs), params(params), min(min), max(max)
+ {
+ inptrs[0] = input_ptrs[12];
+ inptrs[1] = input_ptrs[0];
+ inptrs[2] = input_ptrs[4];
+ inptrs[3] = input_ptrs[20];
+ inptrs[4] = input_ptrs[7];
+ inptrs[5] = input_ptrs[24];
+ inptrs[6] = input_ptrs[11];
+ inptrs[7] = input_ptrs[1];
+ inptrs[8] = input_ptrs[3];
+ inptrs[9] = input_ptrs[13];
+ inptrs[10] = input_ptrs[5];
+ inptrs[11] = input_ptrs[9];
+ inptrs[12] = input_ptrs[15];
+ inptrs[13] = input_ptrs[17];
+ inptrs[14] = input_ptrs[19];
+ inptrs[15] = input_ptrs[21];
+ inptrs[16] = input_ptrs[6];
+ inptrs[17] = input_ptrs[8];
+ inptrs[18] = input_ptrs[23];
+ inptrs[19] = input_ptrs[16];
+ inptrs[20] = input_ptrs[2];
+ inptrs[21] = input_ptrs[18];
+ inptrs[22] = input_ptrs[10];
+ inptrs[23] = input_ptrs[14];
+ inptrs[24] = input_ptrs[22];
+
+ }
+ };
+
+ Args params_struct(input_ptrs, outptrs, params,
+ activation_min, activation_max);
+
+ __asm__ __volatile__(
+ "ldr x8, [%x[params_struct], %[offsetof_args_params]]\n"
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "add x17, %x[params_struct], %[offsetof_Args_inptrs]\n"
+ "ptrue p3.b\n"
+ ".inst 0x25207810 // ptrue pn8.b\n"
+ "ld1w { z20.s }, p3/Z, [x8]\n"
+ "addvl x8, x8, #1\n"
+ "ldp x24, x23, [x17, #0x0]\n"
+ "ldp x22, x21, [x17, #0x10]\n"
+ "cntw x16\n"
+ ".inst 0xa040c100 // ld1w { z0.s-z3.s }, pn8.b/Z, [x8]\n"
+ "addvl x8, x8, #4\n"
+ "ldr x20, [x17, #0x20]\n"
+ "mov x15, #0x0\n"
+ "whilelt p2.s, XZR, %x[n_channels]\n"
+ ".inst 0xa040c104 // ld1w { z4.s-z7.s }, pn8.b/Z, [x8]\n"
+ "addvl x8, x8, #4\n"
+ "cmp x16, %x[n_channels]\n"
+ "ldr x14, [%x[params_struct], %[offsetof_args_outptrs]]\n"
+ "ld1rw { z22.s }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+ "ld1rw { z14.s }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+ "sub x13, XZR, x16\n"
+ "ld1w { z8.s }, p3/Z, [x8]\n"
+ "addvl x8, x8, #1\n"
+ "ld1w { z9.s }, p2/Z, [x24, x15, LSL #2]\n"
+ "ld1w { z10.s }, p2/Z, [x23, x15, LSL #2]\n"
+ "ld1w { z11.s }, p2/Z, [x22, x15, LSL #2]\n"
+ "ld1w { z12.s }, p2/Z, [x21, x15, LSL #2]\n"
+ "ld1w { z13.s }, p2/Z, [x20, x15, LSL #2]\n"
+ "bge 2f\n"
+ "1:" // Channel loop
+ "movprfx z21, z20\n fmla z21.s, p3/M, z8.s, z9.s\n"
+ "movprfx z24, z20\n fmla z24.s, p3/M, z7.s, z9.s\n"
+ "ldr x22, [x17, #0x30]\n"
+ "incw x13\n"
+ "movprfx z25, z20\n fmla z25.s, p3/M, z6.s, z9.s\n"
+ "fmla z21.s, p3/M, z0.s, z10.s\n"
+ "ldr x25, [x17, #0x38]\n"
+ "mov p1.b, p2.b\n"
+ "fmla z24.s, p3/M, z4.s, z13.s\n"
+ "movprfx z26, z20\n fmla z26.s, p3/M, z5.s, z9.s\n"
+ "ldr x21, [x17, #0x28]\n"
+ "whilelt p0.s, x16, %x[n_channels]\n"
+ "movprfx z27, z20\n fmla z27.s, p3/M, z4.s, z9.s\n"
+ "movprfx z28, z20\n fmla z28.s, p3/M, z3.s, z9.s\n"
+ "ldr x20, [x17, #0x48]\n"
+ "ld1w { z19.s }, p2/Z, [x20, x15, LSL #2]\n"
+ "fmla z25.s, p3/M, z2.s, z11.s\n"
+ "ld1w { z23.s }, p2/Z, [x22, x15, LSL #2]\n"
+ "movprfx z29, z20\n fmla z29.s, p3/M, z2.s, z9.s\n"
+ "ldr x20, [x17, #0x40]\n"
+ "fmla z21.s, p3/M, z5.s, z13.s\n"
+ "fmla z24.s, p3/M, z6.s, z23.s\n"
+ "ldr x24, [x17, #0x50]\n"
+ "movprfx z31, z20\n fmla z31.s, p3/M, z0.s, z9.s\n"
+ "fmla z25.s, p3/M, z3.s, z13.s\n"
+ "ldr x23, [x17, #0x58]\n"
+ "fmla z26.s, p3/M, z2.s, z13.s\n"
+ "fmla z27.s, p3/M, z1.s, z13.s\n"
+ "ldr x22, [x17, #0x60]\n"
+ "fmla z28.s, p3/M, z0.s, z13.s\n"
+ "ld1w { z17.s }, p2/Z, [x25, x15, LSL #2]\n"
+ "fmla z29.s, p3/M, z6.s, z12.s\n"
+ "ldr x12, [x17, #0x70]\n"
+ "ld1w { z16.s }, p2/Z, [x21, x15, LSL #2]\n"
+ "movprfx z30, z20\n fmla z30.s, p3/M, z1.s, z9.s\n"
+ "fmla z21.s, p3/M, z7.s, z23.s\n"
+ "ldr x21, [x17, #0x68]\n"
+ "fmla z24.s, p3/M, z0.s, z17.s\n"
+ "fmla z31.s, p3/M, z8.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x20, x15, LSL #2]\n"
+ "ldr x27, [x17, #0x78]\n"
+ "fmla z26.s, p3/M, z4.s, z23.s\n"
+ "fmla z27.s, p3/M, z3.s, z23.s\n"
+ "ldr x20, [x17, #0x80]\n"
+ "ld1w { z20.s }, p3/Z, [x8]\n"
+ "fmla z30.s, p3/M, z0.s, z23.s\n"
+ "fmla z28.s, p3/M, z4.s, z19.s\n"
+ "ldr x11, [x17, #0x88]\n"
+ "addvl x8, x8, #1\n"
+ "fmla z29.s, p3/M, z1.s, z23.s\n"
+ "fmla z21.s, p3/M, z1.s, z17.s\n"
+ "ld1w { z18.s }, p2/Z, [x24, x15, LSL #2]\n"
+ "ldr x26, [x17, #0x90]\n"
+ "fmla z24.s, p3/M, z2.s, z16.s\n"
+ "fmla z25.s, p3/M, z1.s, z16.s\n"
+ "ld1w { z11.s }, p2/Z, [x23, x15, LSL #2]\n"
+ "ldr x25, [x17, #0x98]\n"
+ "ld1w { z17.s }, p2/Z, [x22, x15, LSL #2]\n"
+ "fmla z27.s, p3/M, z5.s, z19.s\n"
+ "fmla z30.s, p3/M, z2.s, z19.s\n"
+ "ldr x24, [x17, #0xa0]\n"
+ "fmla z26.s, p3/M, z0.s, z18.s\n"
+ "fmla z28.s, p3/M, z2.s, z11.s\n"
+ "ldr x10, [x14, #0x0]\n"
+ "fmla z24.s, p3/M, z8.s, z19.s\n"
+ "fmla z25.s, p3/M, z7.s, z19.s\n"
+ "ldr x9, [x14, #0x8]\n"
+ "fmla z31.s, p3/M, z1.s, z19.s\n"
+ "fmla z29.s, p3/M, z3.s, z17.s\n"
+ "ld1w { z16.s }, p2/Z, [x21, x15, LSL #2]\n"
+ "ldr x23, [x17, #0xa8]\n"
+ "fmla z26.s, p3/M, z6.s, z17.s\n"
+ "fmla z27.s, p3/M, z7.s, z16.s\n"
+ "ld1w { z23.s }, p2/Z, [x20, x15, LSL #2]\n"
+ "ldr x22, [x17, #0xc0]\n"
+ "fmla z28.s, p3/M, z6.s, z16.s\n"
+ "fmla z30.s, p3/M, z4.s, z16.s\n"
+ "ldr x28, [x14, #0x10]\n"
+ "fmla z21.s, p3/M, z3.s, z18.s\n"
+ "fmla z25.s, p3/M, z5.s, z11.s\n"
+ "ld1w { z15.s }, p2/Z, [x12, x15, LSL #2]\n"
+ "ldr x21, [x17, #0xb0]\n"
+ "fmla z29.s, p3/M, z5.s, z16.s\n"
+ "fmla z31.s, p3/M, z3.s, z16.s\n"
+ "ld1w { z19.s }, p2/Z, [x27, x15, LSL #2]\n"
+ "ldr x20, [x17, #0xb8]\n"
+ "fmla z26.s, p3/M, z8.s, z16.s\n"
+ "fmla z28.s, p3/M, z8.s, z15.s\n"
+ "ldr x27, [x14, #0x18]\n"
+ "fmla z30.s, p3/M, z6.s, z19.s\n"
+ "fmla z24.s, p3/M, z3.s, z23.s\n"
+ "fmla z27.s, p3/M, z0.s, z23.s\n"
+ "fmla z31.s, p3/M, z5.s, z15.s\n"
+ "ld1w { z17.s }, p2/Z, [x11, x15, LSL #2]\n"
+ "fmla z29.s, p3/M, z7.s, z19.s\n"
+ "ld1w { z19.s }, p2/Z, [x26, x15, LSL #2]\n"
+ "fmla z21.s, p3/M, z4.s, z23.s\n"
+ "fmla z26.s, p3/M, z1.s, z23.s\n"
+ "fmla z24.s, p3/M, z5.s, z17.s\n"
+ "ld1w { z16.s }, p2/Z, [x25, x15, LSL #2]\n"
+ "fmla z25.s, p3/M, z4.s, z17.s\n"
+ "fmla z27.s, p3/M, z2.s, z17.s\n"
+ "fmla z28.s, p3/M, z1.s, z17.s\n"
+ "fmla z30.s, p3/M, z8.s, z19.s\n"
+ "ld1w { z17.s }, p2/Z, [x24, x15, LSL #2]\n"
+ "ldr x26, [x17, #0x20]\n"
+ "fmla z21.s, p3/M, z2.s, z17.s\n"
+ "fmla z26.s, p3/M, z7.s, z16.s\n"
+ "fmla z27.s, p3/M, z6.s, z16.s\n"
+ "fmla z29.s, p3/M, z4.s, z16.s\n"
+ "fmla z30.s, p3/M, z3.s, z16.s\n"
+ "ld1w { z18.s }, p2/Z, [x21, x15, LSL #2]\n"
+ "fmla z31.s, p3/M, z7.s, z19.s\n"
+ "ld1w { z16.s }, p2/Z, [x23, x15, LSL #2]\n"
+ "fmla z21.s, p3/M, z6.s, z18.s\n"
+ "fmla z31.s, p3/M, z4.s, z16.s\n"
+ "fmla z24.s, p3/M, z1.s, z17.s\n"
+ "fmla z25.s, p3/M, z0.s, z17.s\n"
+ "ld1w { z17.s }, p2/Z, [x20, x15, LSL #2]\n"
+ "fmax z21.s, p3/M, z21.s, z22.s\n"
+ "fmla z28.s, p3/M, z7.s, z16.s\n"
+ "fmla z30.s, p3/M, z5.s, z16.s\n"
+ "fmla z29.s, p3/M, z0.s, z18.s\n"
+ "fmla z31.s, p3/M, z2.s, z17.s\n"
+ "fmla z27.s, p3/M, z8.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x22, x15, LSL #2]\n"
+ "ldp x22, x21, [x17, #0x0]\n"
+ "fmla z26.s, p3/M, z3.s, z18.s\n"
+ "fmla z25.s, p3/M, z8.s, z17.s\n"
+ "ldp x25, x24, [x17, #0x10]\n"
+ "incw x15\n"
+ "fmin z21.s, p3/M, z21.s, z14.s\n"
+ "st1w { z21.s }, p1, [x10, x13, LSL #2]\n"
+ "ldr x20, [x14, #0x20]\n"
+ "fmla z28.s, p3/M, z5.s, z17.s\n"
+ "fmla z29.s, p3/M, z8.s, z16.s\n"
+ "fmla z30.s, p3/M, z7.s, z16.s\n"
+ "ld1w { z9.s }, p0/Z, [x22, x16, LSL #2]\n"
+ "whilelt p2.s, x15, %x[n_channels]\n"
+ "fmla z31.s, p3/M, z6.s, z16.s\n"
+ ".inst 0xc1aecad8 // fclamp { z24.s-z27.s }, z22.s, z14.s\n"
+ "st1w { z24.s }, p1, [x9, x13, LSL #2]\n"
+ "ldr x23, [x14, #0x28]\n"
+ "st1w { z25.s }, p1, [x28, x13, LSL #2]\n"
+ "ldr x22, [x14, #0x30]\n"
+ "ld1w { z10.s }, p0/Z, [x21, x16, LSL #2]\n"
+ ".inst 0xc1aecadc // fclamp { z28.s-z31.s }, z22.s, z14.s\n"
+ "st1w { z26.s }, p1, [x27, x13, LSL #2]\n"
+ "ldr x21, [x14, #0x38]\n"
+ "ld1w { z11.s }, p0/Z, [x25, x16, LSL #2]\n"
+ "st1w { z27.s }, p1, [x20, x13, LSL #2]\n"
+ "ldr x20, [x14, #0x40]\n"
+ "ld1w { z12.s }, p0/Z, [x24, x16, LSL #2]\n"
+ "ld1w { z13.s }, p0/Z, [x26, x16, LSL #2]\n"
+ "incw x16\n"
+ "cmp x16, %x[n_channels]\n"
+ "st1w { z28.s }, p1, [x23, x13, LSL #2]\n"
+ ".inst 0xa040c100 // ld1w { z0.s-z3.s }, pn8.b/Z, [x8]\n"
+ "addvl x8, x8, #4\n"
+ "st1w { z29.s }, p1, [x22, x13, LSL #2]\n"
+ ".inst 0xa040c104 // ld1w { z4.s-z7.s }, pn8.b/Z, [x8]\n"
+ "addvl x8, x8, #4\n"
+ "st1w { z30.s }, p1, [x21, x13, LSL #2]\n"
+ "st1w { z31.s }, p1, [x20, x13, LSL #2]\n"
+ "ld1w { z8.s }, p3/Z, [x8]\n"
+ "addvl x8, x8, #1\n"
+ "blt 1b\n"
+ "2:" // Channel tail
+ "movprfx z21, z20\n fmla z21.s, p3/M, z8.s, z9.s\n"
+ "movprfx z24, z20\n fmla z24.s, p3/M, z7.s, z9.s\n"
+ "ldr x23, [x17, #0x30]\n"
+ "incw x13\n"
+ "movprfx z25, z20\n fmla z25.s, p3/M, z6.s, z9.s\n"
+ "fmla z21.s, p3/M, z0.s, z10.s\n"
+ "ldr x22, [x17, #0x38]\n"
+ "mov p0.b, p2.b\n"
+ "fmla z24.s, p3/M, z4.s, z13.s\n"
+ "movprfx z26, z20\n fmla z26.s, p3/M, z5.s, z9.s\n"
+ "ldr x21, [x17, #0x28]\n"
+ "movprfx z27, z20\n fmla z27.s, p3/M, z4.s, z9.s\n"
+ "movprfx z28, z20\n fmla z28.s, p3/M, z3.s, z9.s\n"
+ "ldr x20, [x17, #0x48]\n"
+ "ld1w { z19.s }, p2/Z, [x20, x15, LSL #2]\n"
+ "fmla z25.s, p3/M, z2.s, z11.s\n"
+ "ld1w { z18.s }, p2/Z, [x23, x15, LSL #2]\n"
+ "movprfx z29, z20\n fmla z29.s, p3/M, z2.s, z9.s\n"
+ "ldr x20, [x17, #0x40]\n"
+ "fmla z21.s, p3/M, z5.s, z13.s\n"
+ "fmla z24.s, p3/M, z6.s, z18.s\n"
+ "ldr x25, [x17, #0x50]\n"
+ "movprfx z31, z20\n fmla z31.s, p3/M, z0.s, z9.s\n"
+ "fmla z25.s, p3/M, z3.s, z13.s\n"
+ "ldr x24, [x17, #0x58]\n"
+ "fmla z26.s, p3/M, z2.s, z13.s\n"
+ "fmla z27.s, p3/M, z1.s, z13.s\n"
+ "ldr x23, [x17, #0x60]\n"
+ "fmla z28.s, p3/M, z0.s, z13.s\n"
+ "ld1w { z17.s }, p2/Z, [x22, x15, LSL #2]\n"
+ "fmla z29.s, p3/M, z6.s, z12.s\n"
+ "ldr x12, [x17, #0x70]\n"
+ "ld1w { z16.s }, p2/Z, [x21, x15, LSL #2]\n"
+ "movprfx z30, z20\n fmla z30.s, p3/M, z1.s, z9.s\n"
+ "fmla z21.s, p3/M, z7.s, z18.s\n"
+ "ldr x22, [x17, #0x68]\n"
+ "fmla z24.s, p3/M, z0.s, z17.s\n"
+ "fmla z31.s, p3/M, z8.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x20, x15, LSL #2]\n"
+ "ldr x21, [x17, #0x78]\n"
+ "fmla z26.s, p3/M, z4.s, z18.s\n"
+ "fmla z27.s, p3/M, z3.s, z18.s\n"
+ "ldr x20, [x17, #0x80]\n"
+ "fmla z30.s, p3/M, z0.s, z18.s\n"
+ "fmla z28.s, p3/M, z4.s, z19.s\n"
+ "ldr x11, [x17, #0x88]\n"
+ "fmla z29.s, p3/M, z1.s, z18.s\n"
+ "fmla z21.s, p3/M, z1.s, z17.s\n"
+ "ld1w { z20.s }, p2/Z, [x25, x15, LSL #2]\n"
+ "ldr x10, [x17, #0x90]\n"
+ "fmla z24.s, p3/M, z2.s, z16.s\n"
+ "fmla z25.s, p3/M, z1.s, z16.s\n"
+ "ld1w { z17.s }, p2/Z, [x24, x15, LSL #2]\n"
+ "ldr x9, [x17, #0x98]\n"
+ "ld1w { z16.s }, p2/Z, [x23, x15, LSL #2]\n"
+ "fmla z27.s, p3/M, z5.s, z19.s\n"
+ "fmla z30.s, p3/M, z2.s, z19.s\n"
+ "ldr x28, [x17, #0xa0]\n"
+ "fmla z26.s, p3/M, z0.s, z20.s\n"
+ "fmla z28.s, p3/M, z2.s, z17.s\n"
+ "ldr x27, [x14, #0x0]\n"
+ "fmla z24.s, p3/M, z8.s, z19.s\n"
+ "fmla z25.s, p3/M, z7.s, z19.s\n"
+ "ldr x26, [x14, #0x8]\n"
+ "fmla z31.s, p3/M, z1.s, z19.s\n"
+ "fmla z29.s, p3/M, z3.s, z16.s\n"
+ "ld1w { z19.s }, p2/Z, [x22, x15, LSL #2]\n"
+ "ldr x25, [x17, #0xa8]\n"
+ "fmla z26.s, p3/M, z6.s, z16.s\n"
+ "fmla z27.s, p3/M, z7.s, z19.s\n"
+ "ld1w { z18.s }, p2/Z, [x20, x15, LSL #2]\n"
+ "ldr x23, [x17, #0xc0]\n"
+ "fmla z28.s, p3/M, z6.s, z19.s\n"
+ "fmla z30.s, p3/M, z4.s, z19.s\n"
+ "ldr x24, [x14, #0x10]\n"
+ "fmla z21.s, p3/M, z3.s, z20.s\n"
+ "fmla z25.s, p3/M, z5.s, z17.s\n"
+ "ld1w { z17.s }, p2/Z, [x12, x15, LSL #2]\n"
+ "ldr x22, [x17, #0xb0]\n"
+ "fmla z29.s, p3/M, z5.s, z19.s\n"
+ "fmla z31.s, p3/M, z3.s, z19.s\n"
+ "ld1w { z16.s }, p2/Z, [x21, x15, LSL #2]\n"
+ "ldr x20, [x17, #0xb8]\n"
+ "fmla z26.s, p3/M, z8.s, z19.s\n"
+ "fmla z28.s, p3/M, z8.s, z17.s\n"
+ "ldr x21, [x14, #0x18]\n"
+ "fmla z30.s, p3/M, z6.s, z16.s\n"
+ "fmla z24.s, p3/M, z3.s, z18.s\n"
+ "fmla z27.s, p3/M, z0.s, z18.s\n"
+ "fmla z31.s, p3/M, z5.s, z17.s\n"
+ "ld1w { z17.s }, p2/Z, [x11, x15, LSL #2]\n"
+ "fmla z29.s, p3/M, z7.s, z16.s\n"
+ "ld1w { z19.s }, p2/Z, [x10, x15, LSL #2]\n"
+ "fmla z21.s, p3/M, z4.s, z18.s\n"
+ "fmla z26.s, p3/M, z1.s, z18.s\n"
+ "fmla z24.s, p3/M, z5.s, z17.s\n"
+ "ld1w { z16.s }, p2/Z, [x9, x15, LSL #2]\n"
+ "fmla z25.s, p3/M, z4.s, z17.s\n"
+ "fmla z27.s, p3/M, z2.s, z17.s\n"
+ "fmla z28.s, p3/M, z1.s, z17.s\n"
+ "fmla z30.s, p3/M, z8.s, z19.s\n"
+ "ld1w { z17.s }, p2/Z, [x28, x15, LSL #2]\n"
+ "fmla z21.s, p3/M, z2.s, z17.s\n"
+ "fmla z26.s, p3/M, z7.s, z16.s\n"
+ "fmla z27.s, p3/M, z6.s, z16.s\n"
+ "fmla z29.s, p3/M, z4.s, z16.s\n"
+ "fmla z30.s, p3/M, z3.s, z16.s\n"
+ "ld1w { z18.s }, p2/Z, [x22, x15, LSL #2]\n"
+ "fmla z31.s, p3/M, z7.s, z19.s\n"
+ "ld1w { z16.s }, p2/Z, [x25, x15, LSL #2]\n"
+ "fmla z21.s, p3/M, z6.s, z18.s\n"
+ "fmla z31.s, p3/M, z4.s, z16.s\n"
+ "fmla z24.s, p3/M, z1.s, z17.s\n"
+ "fmla z25.s, p3/M, z0.s, z17.s\n"
+ "ld1w { z17.s }, p2/Z, [x20, x15, LSL #2]\n"
+ "fmax z21.s, p3/M, z21.s, z22.s\n"
+ "fmla z28.s, p3/M, z7.s, z16.s\n"
+ "fmla z30.s, p3/M, z5.s, z16.s\n"
+ "fmla z29.s, p3/M, z0.s, z18.s\n"
+ "fmla z31.s, p3/M, z2.s, z17.s\n"
+ "fmla z27.s, p3/M, z8.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x23, x15, LSL #2]\n"
+ "fmla z26.s, p3/M, z3.s, z18.s\n"
+ "fmla z25.s, p3/M, z8.s, z17.s\n"
+ "fmin z21.s, p3/M, z21.s, z14.s\n"
+ "st1w { z21.s }, p0, [x27, x13, LSL #2]\n"
+ "ldr x20, [x14, #0x20]\n"
+ "fmla z28.s, p3/M, z5.s, z17.s\n"
+ "fmla z29.s, p3/M, z8.s, z16.s\n"
+ "fmla z30.s, p3/M, z7.s, z16.s\n"
+ "fmla z31.s, p3/M, z6.s, z16.s\n"
+ ".inst 0xc1aecad8 // fclamp { z24.s-z27.s }, z22.s, z14.s\n"
+ "st1w { z24.s }, p0, [x26, x13, LSL #2]\n"
+ "ldr x23, [x14, #0x28]\n"
+ "st1w { z25.s }, p0, [x24, x13, LSL #2]\n"
+ "ldr x22, [x14, #0x30]\n"
+ ".inst 0xc1aecadc // fclamp { z28.s-z31.s }, z22.s, z14.s\n"
+ "st1w { z26.s }, p0, [x21, x13, LSL #2]\n"
+ "ldr x21, [x14, #0x38]\n"
+ "st1w { z27.s }, p0, [x20, x13, LSL #2]\n"
+ "ldr x20, [x14, #0x40]\n"
+ "st1w { z28.s }, p0, [x23, x13, LSL #2]\n"
+ "st1w { z29.s }, p0, [x22, x13, LSL #2]\n"
+ "st1w { z30.s }, p0, [x21, x13, LSL #2]\n"
+ "st1w { z31.s }, p0, [x20, x13, LSL #2]\n"
+ ".inst 0xd503467f // SMSTOP\n"
+ :
+ : [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME2)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp
new file mode 100644
index 0000000000..add666e14e
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(ARM_COMPUTE_ENABLE_SME2)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sme2_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(const float *const *const input_ptrs, float *const *const outptrs, const void *params, unsigned int n_channels, const float activation_min, const float activation_max);
+void sme2_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(const unsigned int n_tile_rows, const unsigned int n_tile_cols, const float *inptr, int64_t ld_input_row, int64_t ld_input_col, float *outptr, int64_t ld_output_row, int64_t ld_output_col, const void *params, unsigned int n_channels, const float activation_min, const float activation_max);
+
+class sme2_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst : public DepthwiseDepthfirstStrategy<float, float, float, float>
+{
+ private:
+ using Parent = DepthwiseDepthfirstStrategy<float, float, float, float>;
+ Parent::IndirectKernelType m_indirect_kernel = sme2_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl;
+ Parent::DirectKernelType m_direct_kernel = sme2_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl;
+
+ public:
+ using return_type = float;
+ constexpr static auto vl_type = arm_gemm::VLType::SME;
+
+ constexpr static unsigned int kernel_rows = 3;
+ constexpr static unsigned int kernel_cols = 3;
+
+ constexpr static unsigned int stride_rows = 1;
+ constexpr static unsigned int stride_cols = 1;
+
+ constexpr static unsigned int output_rows = 4;
+ constexpr static unsigned int output_cols = 4;
+
+ sme2_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst(const CPUInfo *)
+ : Parent(output_rows, output_cols, kernel_rows, kernel_cols, stride_rows, stride_cols) {}
+
+ arm_gemm::VLType get_vl_type(void) const override { return vl_type; }
+
+ Parent::IndirectKernelType get_indirect_kernel() const override { return m_indirect_kernel; }
+ Parent::DirectKernelType get_direct_kernel() const override { return m_direct_kernel; }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME2)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp
new file mode 100644
index 0000000000..efd37c38ec
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp
@@ -0,0 +1,672 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(ARM_COMPUTE_ENABLE_SME2)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sme2_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(
+ const unsigned int n_tile_rows,
+ const unsigned int n_tile_cols,
+ const float *inptr,
+ int64_t ld_input_row,
+ int64_t ld_input_col,
+ float *outptr,
+ int64_t ld_output_row,
+ int64_t ld_output_col,
+ const void *params,
+ unsigned int n_channels,
+ const float activation_min,
+ const float activation_max
+)
+{
+ struct Args
+ {
+ const uint64_t n_tile_rows, n_tile_cols;
+ const float *inptr;
+ const uint64_t ld_input_row;
+ const uint64_t ld_input_col;
+ float *outptr;
+ const uint64_t ld_output_row;
+ const uint64_t ld_output_col;
+ const void *params;
+ const float min, max;
+
+ uint64_t tile_i = 0, tile_j = 0;
+
+ Args(
+ const unsigned int n_tile_rows,
+ const unsigned int n_tile_cols,
+ const float *inptr,
+ int64_t ld_input_row,
+ int64_t ld_input_col,
+ float *outptr,
+ int64_t ld_output_row,
+ int64_t ld_output_col,
+ const void *params,
+ const float activation_min,
+ const float activation_max
+ ) : n_tile_rows(n_tile_rows), n_tile_cols(n_tile_cols), inptr(inptr),
+ ld_input_row(ld_input_row), ld_input_col(ld_input_col), outptr(outptr),
+ ld_output_row(ld_output_row), ld_output_col(ld_output_col),
+ params(params), min(activation_min), max(activation_max)
+ {
+ }
+ };
+
+ Args params_struct(
+ n_tile_rows, n_tile_cols,
+ inptr, ld_input_row, ld_input_col,
+ outptr, ld_output_row, ld_output_col,
+ params, activation_min, activation_max
+ );
+
+ __asm__ __volatile__(
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "ptrue p3.b\n"
+ ".inst 0x25207810 // ptrue pn8.b\n"
+ "mov x2, #0x0\n"
+ "mov x3, #0x0\n"
+ "1:" // Tile loop
+ "str x2, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "mov x22, #0x4\n"
+ "str x3, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "ldr x21, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
+ "mul x20, x2, x21\n" // offset = tile_i * ld_input_row
+ "ldr x4, [%x[params_struct], %[offsetof_args_ld_input_col]]\n"
+ "madd x20, x3, x4, x20\n" // offset += tile_j * ld_input_col
+ "mul x20, x20, x22\n" // offset *= kernel_stride * output_size
+ "ldr x5, [%x[params_struct], %[offsetof_args_inptr]]\n"
+ "add x5, x5, x20, LSL #2\n" // inptr[0] += offset * sizeof(float)
+ "add x6, x5, x21, LSL #2\n"
+ "add x7, x6, x21, LSL #2\n"
+ "add x8, x4, x4\n"
+ "ldr x17, [%x[params_struct], %[offsetof_args_params]]\n"
+ "add x16, x7, x21, LSL #2\n"
+ "add x15, x8, x4\n"
+ "add x14, x16, x21, LSL #2\n"
+ "add x13, x15, x4\n"
+ "add x12, x14, x21, LSL #2\n"
+ "add x11, x13, x4\n"
+ "cbnz x3, 2f\n"
+ "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+ "sub x21, x20, x3\n"
+ "sub x21, x21, #0x1\n"
+ "lsl x10, %x[n_channels], #0x2\n"
+ "mov x20, #0x10\n"
+ "and x21, x21, #0x3fffff\n"
+ "mul x20, x20, x4\n"
+ "orr x10, x10, x21, LSL #22\n"
+ "orr x10, x10, x20, LSL #38\n"
+ "add x9, x7, x8, LSL #2\n"
+ "add x28, x5, x11, LSL #2\n"
+ "add x27, x7, x15, LSL #2\n"
+ "add x26, x12, x11, LSL #2\n"
+ "add x25, x16, x8, LSL #2\n"
+ "add x24, x5, x4, LSL #2\n"
+ "add x23, x5, x13, LSL #2\n"
+ "add x22, x16, x15, LSL #2\n"
+ "add x21, x6, x11, LSL #2\n"
+ "add x20, x6, x8, LSL #2\n"
+ ".inst 0xf8aa493a // rprfm pldonce, x10, [x9]\n"
+ "add x9, x14, x11, LSL #2\n"
+ ".inst 0xf8aa48ba // rprfm pldonce, x10, [x5]\n"
+ ".inst 0xf8aa4b9a // rprfm pldonce, x10, [x28]\n"
+ "add x28, x6, x15, LSL #2\n"
+ ".inst 0xf8aa4b7a // rprfm pldonce, x10, [x27]\n"
+ "add x27, x12, x4, LSL #2\n"
+ ".inst 0xf8aa499a // rprfm pldonce, x10, [x12]\n"
+ ".inst 0xf8aa4b5a // rprfm pldonce, x10, [x26]\n"
+ "add x26, x7, x4, LSL #2\n"
+ ".inst 0xf8aa4b3a // rprfm pldonce, x10, [x25]\n"
+ "add x25, x12, x13, LSL #2\n"
+ ".inst 0xf8aa4b1a // rprfm pldonce, x10, [x24]\n"
+ "add x24, x7, x13, LSL #2\n"
+ ".inst 0xf8aa4afa // rprfm pldonce, x10, [x23]\n"
+ "add x23, x5, x8, LSL #2\n"
+ ".inst 0xf8aa4ada // rprfm pldonce, x10, [x22]\n"
+ "add x22, x16, x4, LSL #2\n"
+ ".inst 0xf8aa48da // rprfm pldonce, x10, [x6]\n"
+ ".inst 0xf8aa4aba // rprfm pldonce, x10, [x21]\n"
+ "add x21, x5, x15, LSL #2\n"
+ ".inst 0xf8aa49da // rprfm pldonce, x10, [x14]\n"
+ ".inst 0xf8aa4a9a // rprfm pldonce, x10, [x20]\n"
+ "add x20, x16, x13, LSL #2\n"
+ ".inst 0xf8aa493a // rprfm pldonce, x10, [x9]\n"
+ "add x9, x7, x11, LSL #2\n"
+ ".inst 0xf8aa4b9a // rprfm pldonce, x10, [x28]\n"
+ "add x28, x14, x8, LSL #2\n"
+ ".inst 0xf8aa4b7a // rprfm pldonce, x10, [x27]\n"
+ "add x27, x16, x11, LSL #2\n"
+ ".inst 0xf8aa4b5a // rprfm pldonce, x10, [x26]\n"
+ "add x26, x12, x8, LSL #2\n"
+ ".inst 0xf8aa4b3a // rprfm pldonce, x10, [x25]\n"
+ "add x25, x14, x15, LSL #2\n"
+ ".inst 0xf8aa4b1a // rprfm pldonce, x10, [x24]\n"
+ "add x24, x12, x15, LSL #2\n"
+ ".inst 0xf8aa4afa // rprfm pldonce, x10, [x23]\n"
+ "add x23, x6, x4, LSL #2\n"
+ ".inst 0xf8aa4ada // rprfm pldonce, x10, [x22]\n"
+ "add x22, x6, x13, LSL #2\n"
+ ".inst 0xf8aa4aba // rprfm pldonce, x10, [x21]\n"
+ "add x21, x14, x4, LSL #2\n"
+ ".inst 0xf8aa48fa // rprfm pldonce, x10, [x7]\n"
+ ".inst 0xf8aa4a9a // rprfm pldonce, x10, [x20]\n"
+ "add x20, x14, x13, LSL #2\n"
+ ".inst 0xf8aa493a // rprfm pldonce, x10, [x9]\n"
+ ".inst 0xf8aa4a1a // rprfm pldonce, x10, [x16]\n"
+ ".inst 0xf8aa4b9a // rprfm pldonce, x10, [x28]\n"
+ ".inst 0xf8aa4b7a // rprfm pldonce, x10, [x27]\n"
+ ".inst 0xf8aa4b5a // rprfm pldonce, x10, [x26]\n"
+ ".inst 0xf8aa4b3a // rprfm pldonce, x10, [x25]\n"
+ ".inst 0xf8aa4b1a // rprfm pldonce, x10, [x24]\n"
+ ".inst 0xf8aa4afa // rprfm pldonce, x10, [x23]\n"
+ ".inst 0xf8aa4ada // rprfm pldonce, x10, [x22]\n"
+ ".inst 0xf8aa4aba // rprfm pldonce, x10, [x21]\n"
+ ".inst 0xf8aa4a9a // rprfm pldonce, x10, [x20]\n"
+ "2:" // Tile loop: Prefetch input rows: End
+ "ldr x22, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
+ "mul x21, x2, x22\n" // offset = tile_i * ld_output_row
+ "mov x20, #0x4\n"
+ "ld1w { z14.s }, p3/Z, [x17]\n"
+ "ldr x9, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
+ "madd x21, x3, x9, x21\n" // offset += tile_j * ld_output_col
+ "mul x21, x21, x20\n" // offset *= output_tile_size
+ "ld1rw { z13.s }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+ "ldr x28, [%x[params_struct], %[offsetof_args_outptr]]\n"
+ "add x28, x28, x21, LSL #2\n" // outptrs[0] += offset * sizeof(float)
+ "addvl x17, x17, #1\n"
+ ".inst 0xa040c220 // ld1w { z0.s-z3.s }, pn8.b/Z, [x17]\n"
+ "add x27, x28, x22, LSL #2\n"
+ "cntw x26\n"
+ "ld1rw { z15.s }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+ "addvl x17, x17, #4\n"
+ "add x25, x27, x22, LSL #2\n"
+ ".inst 0xa040c224 // ld1w { z4.s-z7.s }, pn8.b/Z, [x17]\n"
+ "add x24, x9, x9\n"
+ "whilelt p2.s, XZR, %x[n_channels]\n"
+ "ld1w { z9.s }, p2/Z, [x7, x8, LSL #2]\n"
+ "addvl x17, x17, #4\n"
+ "cmp x26, %x[n_channels]\n"
+ "ld1w { z8.s }, p3/Z, [x17]\n"
+ "add x23, x25, x22, LSL #2\n"
+ "add x22, x24, x9\n"
+ "ld1w { z10.s }, p2/Z, [x5]\n"
+ "mov x21, #0x0\n"
+ "sub x20, XZR, x26\n"
+ "ld1w { z11.s }, p2/Z, [x5, x11, LSL #2]\n"
+ "ld1w { z12.s }, p2/Z, [x7, x15, LSL #2]\n"
+ "addvl x17, x17, #1\n"
+ "bge 4f\n"
+ "3:" // Tile loop: Channel loop
+ "movprfx z25, z14\n fmla z25.s, p3/M, z4.s, z9.s\n"
+ "movprfx z28, z14\n fmla z28.s, p3/M, z8.s, z9.s\n"
+ "whilelt p1.s, x26, %x[n_channels]\n"
+ "incw x21\n"
+ "movprfx z26, z14\n fmla z26.s, p3/M, z3.s, z9.s\n"
+ "movprfx z17, z14\n fmla z17.s, p3/M, z1.s, z9.s\n"
+ "incw x26\n"
+ "mov p0.b, p2.b\n"
+ "movprfx z18, z14\n fmla z18.s, p3/M, z0.s, z9.s\n"
+ "fmla z25.s, p3/M, z5.s, z12.s\n"
+ "incw x20\n"
+ "movprfx z29, z14\n fmla z29.s, p3/M, z7.s, z9.s\n"
+ "movprfx z30, z14\n fmla z30.s, p3/M, z6.s, z9.s\n"
+ "movprfx z24, z14\n fmla z24.s, p3/M, z5.s, z9.s\n"
+ "movprfx z16, z14\n fmla z16.s, p3/M, z2.s, z9.s\n"
+ "ld1w { z9.s }, p2/Z, [x16, x8, LSL #2]\n"
+ "fmla z28.s, p3/M, z0.s, z10.s\n"
+ "movprfx z31, z14\n fmla z31.s, p3/M, z2.s, z11.s\n"
+ "ld1w { z19.s }, p2/Z, [x12]\n"
+ "fmla z26.s, p3/M, z4.s, z12.s\n"
+ "fmla z17.s, p3/M, z2.s, z12.s\n"
+ "ld1w { z22.s }, p2/Z, [x12, x11, LSL #2]\n"
+ "fmla z18.s, p3/M, z1.s, z12.s\n"
+ "movprfx z20, z14\n fmla z20.s, p3/M, z6.s, z19.s\n"
+ "ld1w { z11.s }, p2/Z, [x16, x15, LSL #2]\n"
+ "fmla z25.s, p3/M, z7.s, z9.s\n"
+ "fmla z29.s, p3/M, z8.s, z12.s\n"
+ "fmla z30.s, p3/M, z7.s, z12.s\n"
+ "fmla z31.s, p3/M, z6.s, z12.s\n"
+ "movprfx z27, z14\n fmla z27.s, p3/M, z3.s, z12.s\n"
+ "movprfx z19, z14\n fmla z19.s, p3/M, z0.s, z12.s\n"
+ "ld1w { z10.s }, p2/Z, [x5, x4, LSL #2]\n"
+ "movprfx z23, z14\n fmla z23.s, p3/M, z8.s, z22.s\n"
+ "fmla z26.s, p3/M, z6.s, z9.s\n"
+ "ld1w { z12.s }, p2/Z, [x5, x13, LSL #2]\n"
+ "fmla z17.s, p3/M, z4.s, z9.s\n"
+ "fmla z18.s, p3/M, z3.s, z9.s\n"
+ "movprfx z21, z14\n fmla z21.s, p3/M, z1.s, z9.s\n"
+ "movprfx z22, z14\n fmla z22.s, p3/M, z0.s, z9.s\n"
+ "ld1w { z14.s }, p3/Z, [x17]\n"
+ "addvl x17, x17, #1\n"
+ "fmla z24.s, p3/M, z8.s, z9.s\n"
+ "fmla z16.s, p3/M, z5.s, z9.s\n"
+ "fmla z20.s, p3/M, z2.s, z9.s\n"
+ "fmla z25.s, p3/M, z8.s, z11.s\n"
+ "ld1w { z9.s }, p2/Z, [x6]\n"
+ "fmla z28.s, p3/M, z1.s, z10.s\n"
+ "fmla z29.s, p3/M, z0.s, z10.s\n"
+ "ld1w { z10.s }, p2/Z, [x6, x11, LSL #2]\n"
+ "fmla z30.s, p3/M, z2.s, z12.s\n"
+ "fmla z31.s, p3/M, z1.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x14]\n"
+ "fmla z26.s, p3/M, z7.s, z11.s\n"
+ "fmla z27.s, p3/M, z6.s, z11.s\n"
+ "fmla z17.s, p3/M, z5.s, z11.s\n"
+ "fmla z18.s, p3/M, z4.s, z11.s\n"
+ "fmla z19.s, p3/M, z3.s, z11.s\n"
+ "fmla z21.s, p3/M, z2.s, z11.s\n"
+ "fmla z22.s, p3/M, z1.s, z11.s\n"
+ "fmla z23.s, p3/M, z0.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x6, x8, LSL #2]\n"
+ "fmla z24.s, p3/M, z0.s, z9.s\n"
+ "fmla z16.s, p3/M, z6.s, z12.s\n"
+ "fmla z20.s, p3/M, z3.s, z12.s\n"
+ "fmla z25.s, p3/M, z1.s, z11.s\n"
+ "ld1w { z12.s }, p2/Z, [x14, x11, LSL #2]\n"
+ "fmla z28.s, p3/M, z3.s, z9.s\n"
+ "fmla z31.s, p3/M, z5.s, z10.s\n"
+ "fmla z27.s, p3/M, z2.s, z10.s\n"
+ "fmla z29.s, p3/M, z4.s, z11.s\n"
+ "ld1w { z10.s }, p2/Z, [x6, x15, LSL #2]\n"
+ "fmla z30.s, p3/M, z3.s, z11.s\n"
+ "fmla z26.s, p3/M, z0.s, z11.s\n"
+ "fmla z19.s, p3/M, z8.s, z12.s\n"
+ "fmla z23.s, p3/M, z5.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x12, x4, LSL #2]\n"
+ "fmla z24.s, p3/M, z2.s, z11.s\n"
+ "fmla z25.s, p3/M, z2.s, z10.s\n"
+ "fmla z28.s, p3/M, z5.s, z11.s\n"
+ "fmla z29.s, p3/M, z5.s, z10.s\n"
+ "ld1w { z9.s }, p2/Z, [x7, x4, LSL #2]\n"
+ "fmla z30.s, p3/M, z4.s, z10.s\n"
+ "fmla z31.s, p3/M, z3.s, z10.s\n"
+ "fmla z26.s, p3/M, z1.s, z10.s\n"
+ "fmla z27.s, p3/M, z0.s, z10.s\n"
+ "ld1w { z10.s }, p2/Z, [x7, x13, LSL #2]\n"
+ "fmla z20.s, p3/M, z7.s, z12.s\n"
+ "fmla z21.s, p3/M, z6.s, z12.s\n"
+ "ld1w { z11.s }, p2/Z, [x12, x13, LSL #2]\n"
+ "fmla z24.s, p3/M, z4.s, z9.s\n"
+ "fmla z25.s, p3/M, z3.s, z9.s\n"
+ "fmla z16.s, p3/M, z1.s, z9.s\n"
+ "fmla z17.s, p3/M, z0.s, z9.s\n"
+ "fmla z28.s, p3/M, z7.s, z9.s\n"
+ "fmla z29.s, p3/M, z6.s, z9.s\n"
+ "ld1w { z12.s }, p2/Z, [x5, x8, LSL #2]\n"
+ "fmla z22.s, p3/M, z8.s, z11.s\n"
+ "fmla z23.s, p3/M, z7.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x16, x4, LSL #2]\n"
+ "fmla z30.s, p3/M, z8.s, z10.s\n"
+ "fmla z31.s, p3/M, z7.s, z10.s\n"
+ "fmla z26.s, p3/M, z5.s, z10.s\n"
+ "fmla z27.s, p3/M, z4.s, z10.s\n"
+ "fmla z18.s, p3/M, z2.s, z10.s\n"
+ "fmla z19.s, p3/M, z1.s, z10.s\n"
+ "ld1w { z9.s }, p2/Z, [x5, x15, LSL #2]\n"
+ "addvl x5, x5, #1\n"
+ "fmla z24.s, p3/M, z7.s, z11.s\n"
+ "fmla z25.s, p3/M, z6.s, z11.s\n"
+ "fmla z16.s, p3/M, z4.s, z11.s\n"
+ "fmla z17.s, p3/M, z3.s, z11.s\n"
+ "fmla z20.s, p3/M, z1.s, z11.s\n"
+ "fmla z21.s, p3/M, z0.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x16, x13, LSL #2]\n"
+ "fmla z28.s, p3/M, z2.s, z12.s\n"
+ "fmla z29.s, p3/M, z1.s, z12.s\n"
+ "fmla z30.s, p3/M, z0.s, z12.s\n"
+ "ld1w { z10.s }, p2/Z, [x7]\n"
+ "fmla z22.s, p3/M, z2.s, z11.s\n"
+ "fmla z31.s, p3/M, z0.s, z9.s\n"
+ "fmla z24.s, p3/M, z3.s, z10.s\n"
+ "fmla z16.s, p3/M, z0.s, z10.s\n"
+ "fmla z26.s, p3/M, z8.s, z11.s\n"
+ "fmla z27.s, p3/M, z7.s, z11.s\n"
+ "fmla z18.s, p3/M, z5.s, z11.s\n"
+ "fmla z19.s, p3/M, z4.s, z11.s\n"
+ "fmla z23.s, p3/M, z1.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x14, x8, LSL #2]\n"
+ "fmla z29.s, p3/M, z2.s, z9.s\n"
+ "fmla z30.s, p3/M, z1.s, z9.s\n"
+ "ld1w { z12.s }, p2/Z, [x7, x11, LSL #2]\n"
+ "addvl x7, x7, #1\n"
+ "fmla z28.s, p3/M, z6.s, z10.s\n"
+ "ld1w { z10.s }, p2/Z, [x16]\n"
+ "fmla z21.s, p3/M, z4.s, z11.s\n"
+ "fmla z22.s, p3/M, z3.s, z11.s\n"
+ "fmla z31.s, p3/M, z8.s, z12.s\n"
+ "ld1w { z9.s }, p1/Z, [x7, x8, LSL #2]\n"
+ "fmla z27.s, p3/M, z5.s, z12.s\n"
+ "fmla z19.s, p3/M, z2.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x16, x11, LSL #2]\n"
+ "addvl x16, x16, #1\n"
+ "fmla z24.s, p3/M, z6.s, z10.s\n"
+ "fmla z16.s, p3/M, z3.s, z10.s\n"
+ "fmla z20.s, p3/M, z0.s, z10.s\n"
+ "ld1w { z10.s }, p2/Z, [x12, x8, LSL #2]\n"
+ "fmla z23.s, p3/M, z2.s, z12.s\n"
+ "fmla z21.s, p3/M, z7.s, z10.s\n"
+ "fmla z22.s, p3/M, z6.s, z10.s\n"
+ "fmla z16.s, p3/M, z8.s, z11.s\n"
+ "fmla z17.s, p3/M, z7.s, z11.s\n"
+ "fmla z18.s, p3/M, z6.s, z11.s\n"
+ "fmla z20.s, p3/M, z5.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x14, x15, LSL #2]\n"
+ "fmla z19.s, p3/M, z5.s, z12.s\n"
+ "fmla z21.s, p3/M, z5.s, z11.s\n"
+ "fmla z22.s, p3/M, z4.s, z11.s\n"
+ "fmla z23.s, p3/M, z3.s, z11.s\n"
+ "fmla z27.s, p3/M, z8.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x12, x15, LSL #2]\n"
+ "fmla z20.s, p3/M, z8.s, z10.s\n"
+ "addvl x12, x12, #1\n"
+ "ld1w { z10.s }, p2/Z, [x6, x4, LSL #2]\n"
+ "fmla z17.s, p3/M, z8.s, z11.s\n"
+ "fmla z18.s, p3/M, z7.s, z11.s\n"
+ "fmla z19.s, p3/M, z6.s, z11.s\n"
+ "fmla z21.s, p3/M, z8.s, z12.s\n"
+ "ld1w { z11.s }, p2/Z, [x6, x13, LSL #2]\n"
+ "addvl x6, x6, #1\n"
+ "fmla z22.s, p3/M, z7.s, z12.s\n"
+ "fmla z23.s, p3/M, z6.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x14, x4, LSL #2]\n"
+ "fmla z28.s, p3/M, z4.s, z10.s\n"
+ "fmla z29.s, p3/M, z3.s, z10.s\n"
+ "fmla z24.s, p3/M, z1.s, z10.s\n"
+ "fmla z25.s, p3/M, z0.s, z10.s\n"
+ "ld1w { z10.s }, p2/Z, [x14, x13, LSL #2]\n"
+ "whilelt p2.s, x21, %x[n_channels]\n"
+ "fmla z30.s, p3/M, z5.s, z11.s\n"
+ "fmla z31.s, p3/M, z4.s, z11.s\n"
+ "cmp x26, %x[n_channels]\n"
+ "addvl x14, x14, #1\n"
+ "fmla z26.s, p3/M, z2.s, z11.s\n"
+ "fmla z27.s, p3/M, z1.s, z11.s\n"
+ "ld1w { z11.s }, p1/Z, [x5, x11, LSL #2]\n"
+ "fmla z16.s, p3/M, z7.s, z12.s\n"
+ "fmla z17.s, p3/M, z6.s, z12.s\n"
+ "fmla z20.s, p3/M, z4.s, z12.s\n"
+ "fmla z21.s, p3/M, z3.s, z12.s\n"
+ ".inst 0xa040c220 // ld1w { z0.s-z3.s }, pn8.b/Z, [x17]\n"
+ "addvl x17, x17, #4\n"
+ "fmla z18.s, p3/M, z8.s, z10.s\n"
+ "fmla z19.s, p3/M, z7.s, z10.s\n"
+ "ld1w { z12.s }, p1/Z, [x7, x15, LSL #2]\n"
+ "fmla z22.s, p3/M, z5.s, z10.s\n"
+ "fmla z23.s, p3/M, z4.s, z10.s\n"
+ ".inst 0xa040c224 // ld1w { z4.s-z7.s }, pn8.b/Z, [x17]\n"
+ "addvl x17, x17, #4\n"
+ ".inst 0xc1afc9bc // fclamp { z28.s-z31.s }, z13.s, z15.s\n"
+ ".inst 0xc1afc9b8 // fclamp { z24.s-z27.s }, z13.s, z15.s\n"
+ "ld1w { z10.s }, p1/Z, [x5]\n"
+ ".inst 0xc1afc9b0 // fclamp { z16.s-z19.s }, z13.s, z15.s\n"
+ ".inst 0xc1afc9b4 // fclamp { z20.s-z23.s }, z13.s, z15.s\n"
+ "st1w { z28.s }, p0, [x28]\n"
+ "st1w { z29.s }, p0, [x28, x9, LSL #2]\n"
+ "ld1w { z8.s }, p3/Z, [x17]\n"
+ "addvl x17, x17, #1\n"
+ "st1w { z30.s }, p0, [x28, x24, LSL #2]\n"
+ "st1w { z31.s }, p0, [x28, x22, LSL #2]\n"
+ "addvl x28, x28, #1\n"
+ "st1w { z24.s }, p0, [x27]\n"
+ "st1w { z25.s }, p0, [x27, x9, LSL #2]\n"
+ "st1w { z26.s }, p0, [x27, x24, LSL #2]\n"
+ "st1w { z27.s }, p0, [x27, x22, LSL #2]\n"
+ "addvl x27, x27, #1\n"
+ "st1w { z16.s }, p0, [x25]\n"
+ "st1w { z17.s }, p0, [x25, x9, LSL #2]\n"
+ "st1w { z18.s }, p0, [x25, x24, LSL #2]\n"
+ "st1w { z19.s }, p0, [x25, x22, LSL #2]\n"
+ "addvl x25, x25, #1\n"
+ "st1w { z20.s }, p0, [x23]\n"
+ "st1w { z21.s }, p0, [x23, x9, LSL #2]\n"
+ "st1w { z22.s }, p0, [x23, x24, LSL #2]\n"
+ "st1w { z23.s }, p0, [x23, x22, LSL #2]\n"
+ "addvl x23, x23, #1\n"
+ "blt 3b\n"
+ "4:" // Tile loop: Channel tail
+ "movprfx z21, z14\n fmla z21.s, p3/M, z4.s, z9.s\n"
+ "movprfx z24, z14\n fmla z24.s, p3/M, z8.s, z9.s\n"
+ "ldr x3, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "add x3, x3, #0x1\n"
+ "movprfx z22, z14\n fmla z22.s, p3/M, z3.s, z9.s\n"
+ "movprfx z29, z14\n fmla z29.s, p3/M, z1.s, z9.s\n"
+ "ldr x2, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "add x21, x2, #0x1\n"
+ "movprfx z30, z14\n fmla z30.s, p3/M, z0.s, z9.s\n"
+ "fmla z21.s, p3/M, z5.s, z12.s\n"
+ "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+ "cmp x3, x20\n"
+ "movprfx z25, z14\n fmla z25.s, p3/M, z7.s, z9.s\n"
+ "movprfx z26, z14\n fmla z26.s, p3/M, z6.s, z9.s\n"
+ "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
+ "csel x2, x2, x21, LT\n"
+ "movprfx z20, z14\n fmla z20.s, p3/M, z5.s, z9.s\n"
+ "movprfx z28, z14\n fmla z28.s, p3/M, z2.s, z9.s\n"
+ "ld1w { z9.s }, p2/Z, [x16, x8, LSL #2]\n"
+ "mov p0.b, p2.b\n"
+ "fmla z24.s, p3/M, z0.s, z10.s\n"
+ "movprfx z27, z14\n fmla z27.s, p3/M, z2.s, z11.s\n"
+ "ld1w { z17.s }, p2/Z, [x12]\n"
+ "csel x3, x3, XZR, LT\n"
+ "fmla z22.s, p3/M, z4.s, z12.s\n"
+ "fmla z29.s, p3/M, z2.s, z12.s\n"
+ "ld1w { z18.s }, p2/Z, [x12, x11, LSL #2]\n"
+ "cmp x2, x20\n"
+ "fmla z30.s, p3/M, z1.s, z12.s\n"
+ "movprfx z16, z14\n fmla z16.s, p3/M, z6.s, z17.s\n"
+ "ld1w { z11.s }, p2/Z, [x16, x15, LSL #2]\n"
+ "fmla z21.s, p3/M, z7.s, z9.s\n"
+ "fmla z25.s, p3/M, z8.s, z12.s\n"
+ "fmla z26.s, p3/M, z7.s, z12.s\n"
+ "fmla z27.s, p3/M, z6.s, z12.s\n"
+ "movprfx z23, z14\n fmla z23.s, p3/M, z3.s, z12.s\n"
+ "movprfx z31, z14\n fmla z31.s, p3/M, z0.s, z12.s\n"
+ "ld1w { z10.s }, p2/Z, [x5, x4, LSL #2]\n"
+ "movprfx z19, z14\n fmla z19.s, p3/M, z8.s, z18.s\n"
+ "fmla z22.s, p3/M, z6.s, z9.s\n"
+ "ld1w { z12.s }, p2/Z, [x5, x13, LSL #2]\n"
+ "fmla z29.s, p3/M, z4.s, z9.s\n"
+ "fmla z30.s, p3/M, z3.s, z9.s\n"
+ "movprfx z17, z14\n fmla z17.s, p3/M, z1.s, z9.s\n"
+ "movprfx z18, z14\n fmla z18.s, p3/M, z0.s, z9.s\n"
+ "fmla z20.s, p3/M, z8.s, z9.s\n"
+ "fmla z28.s, p3/M, z5.s, z9.s\n"
+ "fmla z16.s, p3/M, z2.s, z9.s\n"
+ "fmla z21.s, p3/M, z8.s, z11.s\n"
+ "ld1w { z14.s }, p2/Z, [x6]\n"
+ "fmla z24.s, p3/M, z1.s, z10.s\n"
+ "fmla z25.s, p3/M, z0.s, z10.s\n"
+ "ld1w { z10.s }, p2/Z, [x6, x11, LSL #2]\n"
+ "fmla z26.s, p3/M, z2.s, z12.s\n"
+ "fmla z27.s, p3/M, z1.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x14]\n"
+ "fmla z22.s, p3/M, z7.s, z11.s\n"
+ "fmla z23.s, p3/M, z6.s, z11.s\n"
+ "fmla z29.s, p3/M, z5.s, z11.s\n"
+ "fmla z30.s, p3/M, z4.s, z11.s\n"
+ "fmla z31.s, p3/M, z3.s, z11.s\n"
+ "fmla z17.s, p3/M, z2.s, z11.s\n"
+ "fmla z18.s, p3/M, z1.s, z11.s\n"
+ "fmla z19.s, p3/M, z0.s, z11.s\n"
+ "ld1w { z9.s }, p2/Z, [x6, x8, LSL #2]\n"
+ "fmla z20.s, p3/M, z0.s, z14.s\n"
+ "fmla z28.s, p3/M, z6.s, z12.s\n"
+ "fmla z16.s, p3/M, z3.s, z12.s\n"
+ "fmla z21.s, p3/M, z1.s, z9.s\n"
+ "ld1w { z11.s }, p2/Z, [x14, x11, LSL #2]\n"
+ "fmla z24.s, p3/M, z3.s, z14.s\n"
+ "fmla z27.s, p3/M, z5.s, z10.s\n"
+ "fmla z23.s, p3/M, z2.s, z10.s\n"
+ "fmla z25.s, p3/M, z4.s, z9.s\n"
+ "ld1w { z12.s }, p2/Z, [x6, x15, LSL #2]\n"
+ "fmla z26.s, p3/M, z3.s, z9.s\n"
+ "fmla z22.s, p3/M, z0.s, z9.s\n"
+ "fmla z31.s, p3/M, z8.s, z11.s\n"
+ "fmla z19.s, p3/M, z5.s, z11.s\n"
+ "ld1w { z10.s }, p2/Z, [x12, x4, LSL #2]\n"
+ "fmla z20.s, p3/M, z2.s, z9.s\n"
+ "fmla z21.s, p3/M, z2.s, z12.s\n"
+ "fmla z24.s, p3/M, z5.s, z9.s\n"
+ "fmla z25.s, p3/M, z5.s, z12.s\n"
+ "ld1w { z9.s }, p2/Z, [x7, x4, LSL #2]\n"
+ "fmla z26.s, p3/M, z4.s, z12.s\n"
+ "fmla z27.s, p3/M, z3.s, z12.s\n"
+ "fmla z22.s, p3/M, z1.s, z12.s\n"
+ "fmla z23.s, p3/M, z0.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x7, x13, LSL #2]\n"
+ "fmla z16.s, p3/M, z7.s, z10.s\n"
+ "fmla z17.s, p3/M, z6.s, z10.s\n"
+ "ld1w { z11.s }, p2/Z, [x12, x13, LSL #2]\n"
+ "fmla z20.s, p3/M, z4.s, z9.s\n"
+ "fmla z21.s, p3/M, z3.s, z9.s\n"
+ "fmla z28.s, p3/M, z1.s, z9.s\n"
+ "fmla z29.s, p3/M, z0.s, z9.s\n"
+ "fmla z24.s, p3/M, z7.s, z9.s\n"
+ "fmla z25.s, p3/M, z6.s, z9.s\n"
+ "ld1w { z10.s }, p2/Z, [x5, x8, LSL #2]\n"
+ "fmla z18.s, p3/M, z8.s, z11.s\n"
+ "fmla z19.s, p3/M, z7.s, z11.s\n"
+ "ld1w { z14.s }, p2/Z, [x16, x4, LSL #2]\n"
+ "fmla z26.s, p3/M, z8.s, z12.s\n"
+ "fmla z27.s, p3/M, z7.s, z12.s\n"
+ "fmla z22.s, p3/M, z5.s, z12.s\n"
+ "fmla z23.s, p3/M, z4.s, z12.s\n"
+ "fmla z30.s, p3/M, z2.s, z12.s\n"
+ "fmla z31.s, p3/M, z1.s, z12.s\n"
+ "ld1w { z9.s }, p2/Z, [x5, x15, LSL #2]\n"
+ "fmla z20.s, p3/M, z7.s, z14.s\n"
+ "fmla z21.s, p3/M, z6.s, z14.s\n"
+ "fmla z28.s, p3/M, z4.s, z14.s\n"
+ "fmla z29.s, p3/M, z3.s, z14.s\n"
+ "fmla z16.s, p3/M, z1.s, z14.s\n"
+ "fmla z17.s, p3/M, z0.s, z14.s\n"
+ "ld1w { z14.s }, p2/Z, [x16, x13, LSL #2]\n"
+ "fmla z24.s, p3/M, z2.s, z10.s\n"
+ "fmla z25.s, p3/M, z1.s, z10.s\n"
+ "fmla z26.s, p3/M, z0.s, z10.s\n"
+ "ld1w { z10.s }, p2/Z, [x7]\n"
+ "fmla z18.s, p3/M, z2.s, z14.s\n"
+ "fmla z27.s, p3/M, z0.s, z9.s\n"
+ "fmla z20.s, p3/M, z3.s, z10.s\n"
+ "fmla z28.s, p3/M, z0.s, z10.s\n"
+ "fmla z22.s, p3/M, z8.s, z14.s\n"
+ "fmla z23.s, p3/M, z7.s, z14.s\n"
+ "fmla z30.s, p3/M, z5.s, z14.s\n"
+ "fmla z31.s, p3/M, z4.s, z14.s\n"
+ "fmla z19.s, p3/M, z1.s, z14.s\n"
+ "ld1w { z11.s }, p2/Z, [x14, x8, LSL #2]\n"
+ "fmla z25.s, p3/M, z2.s, z9.s\n"
+ "fmla z26.s, p3/M, z1.s, z9.s\n"
+ "ld1w { z12.s }, p2/Z, [x7, x11, LSL #2]\n"
+ "fmla z24.s, p3/M, z6.s, z10.s\n"
+ "ld1w { z14.s }, p2/Z, [x16]\n"
+ "fmla z17.s, p3/M, z4.s, z11.s\n"
+ "fmla z18.s, p3/M, z3.s, z11.s\n"
+ "fmla z27.s, p3/M, z8.s, z12.s\n"
+ "fmla z23.s, p3/M, z5.s, z12.s\n"
+ "fmla z31.s, p3/M, z2.s, z12.s\n"
+ "ld1w { z9.s }, p2/Z, [x16, x11, LSL #2]\n"
+ "fmla z20.s, p3/M, z6.s, z14.s\n"
+ "fmla z28.s, p3/M, z3.s, z14.s\n"
+ "fmla z16.s, p3/M, z0.s, z14.s\n"
+ "ld1w { z12.s }, p2/Z, [x12, x8, LSL #2]\n"
+ "fmla z19.s, p3/M, z2.s, z9.s\n"
+ "fmla z17.s, p3/M, z7.s, z12.s\n"
+ "fmla z18.s, p3/M, z6.s, z12.s\n"
+ "fmla z28.s, p3/M, z8.s, z11.s\n"
+ "fmla z29.s, p3/M, z7.s, z11.s\n"
+ "fmla z30.s, p3/M, z6.s, z11.s\n"
+ "fmla z16.s, p3/M, z5.s, z11.s\n"
+ "ld1w { z10.s }, p2/Z, [x14, x15, LSL #2]\n"
+ "fmla z31.s, p3/M, z5.s, z9.s\n"
+ "fmla z17.s, p3/M, z5.s, z10.s\n"
+ "fmla z18.s, p3/M, z4.s, z10.s\n"
+ "fmla z19.s, p3/M, z3.s, z10.s\n"
+ "fmla z23.s, p3/M, z8.s, z9.s\n"
+ "ld1w { z14.s }, p2/Z, [x12, x15, LSL #2]\n"
+ "fmla z16.s, p3/M, z8.s, z12.s\n"
+ "ld1w { z9.s }, p2/Z, [x6, x4, LSL #2]\n"
+ "fmla z29.s, p3/M, z8.s, z10.s\n"
+ "fmla z30.s, p3/M, z7.s, z10.s\n"
+ "fmla z31.s, p3/M, z6.s, z10.s\n"
+ "fmla z17.s, p3/M, z8.s, z14.s\n"
+ "ld1w { z11.s }, p2/Z, [x6, x13, LSL #2]\n"
+ "fmla z18.s, p3/M, z7.s, z14.s\n"
+ "fmla z19.s, p3/M, z6.s, z14.s\n"
+ "ld1w { z10.s }, p2/Z, [x14, x4, LSL #2]\n"
+ "fmla z24.s, p3/M, z4.s, z9.s\n"
+ "fmla z25.s, p3/M, z3.s, z9.s\n"
+ "fmla z20.s, p3/M, z1.s, z9.s\n"
+ "fmla z21.s, p3/M, z0.s, z9.s\n"
+ "ld1w { z12.s }, p2/Z, [x14, x13, LSL #2]\n"
+ "fmla z26.s, p3/M, z5.s, z11.s\n"
+ "fmla z27.s, p3/M, z4.s, z11.s\n"
+ "fmla z22.s, p3/M, z2.s, z11.s\n"
+ "fmla z23.s, p3/M, z1.s, z11.s\n"
+ "fmla z28.s, p3/M, z7.s, z10.s\n"
+ "fmla z29.s, p3/M, z6.s, z10.s\n"
+ "fmla z16.s, p3/M, z4.s, z10.s\n"
+ "fmla z17.s, p3/M, z3.s, z10.s\n"
+ "fmla z30.s, p3/M, z8.s, z12.s\n"
+ "fmla z31.s, p3/M, z7.s, z12.s\n"
+ "fmla z18.s, p3/M, z5.s, z12.s\n"
+ "fmla z19.s, p3/M, z4.s, z12.s\n"
+ ".inst 0xc1afc9b8 // fclamp { z24.s-z27.s }, z13.s, z15.s\n"
+ ".inst 0xc1afc9b4 // fclamp { z20.s-z23.s }, z13.s, z15.s\n"
+ "st1w { z24.s }, p0, [x28]\n"
+ ".inst 0xc1afc9bc // fclamp { z28.s-z31.s }, z13.s, z15.s\n"
+ ".inst 0xc1afc9b0 // fclamp { z16.s-z19.s }, z13.s, z15.s\n"
+ "st1w { z25.s }, p0, [x28, x9, LSL #2]\n"
+ "st1w { z26.s }, p0, [x28, x24, LSL #2]\n"
+ "st1w { z27.s }, p0, [x28, x22, LSL #2]\n"
+ "st1w { z20.s }, p0, [x27]\n"
+ "st1w { z21.s }, p0, [x27, x9, LSL #2]\n"
+ "st1w { z22.s }, p0, [x27, x24, LSL #2]\n"
+ "st1w { z23.s }, p0, [x27, x22, LSL #2]\n"
+ "st1w { z28.s }, p0, [x25]\n"
+ "st1w { z29.s }, p0, [x25, x9, LSL #2]\n"
+ "st1w { z30.s }, p0, [x25, x24, LSL #2]\n"
+ "st1w { z31.s }, p0, [x25, x22, LSL #2]\n"
+ "st1w { z16.s }, p0, [x23]\n"
+ "st1w { z17.s }, p0, [x23, x9, LSL #2]\n"
+ "st1w { z18.s }, p0, [x23, x24, LSL #2]\n"
+ "st1w { z19.s }, p0, [x23, x22, LSL #2]\n"
+ "blt 1b\n"
+ ".inst 0xd503467f // SMSTOP\n"
+ :
+ : [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME2)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp
new file mode 100644
index 0000000000..2e2a45bab0
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp
@@ -0,0 +1,653 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(ARM_COMPUTE_ENABLE_SME2)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sme2_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
+ const float *const *const input_ptrs,
+ float *const *const outptrs,
+ const void *params,
+ unsigned int n_channels,
+ const float activation_min,
+ const float activation_max
+)
+{
+ struct Args
+ {
+ float *const *outptrs;
+ const void *params;
+ const float min, max;
+ const float *inptrs[36];
+
+ Args(
+ const float *const *const input_ptrs,
+ float *const *const outptrs,
+ const void *const params,
+ const float min,
+ const float max
+ ) : outptrs(outptrs), params(params), min(min), max(max)
+ {
+ inptrs[0] = input_ptrs[14];
+ inptrs[1] = input_ptrs[0];
+ inptrs[2] = input_ptrs[5];
+ inptrs[3] = input_ptrs[15];
+ inptrs[4] = input_ptrs[30];
+ inptrs[5] = input_ptrs[35];
+ inptrs[6] = input_ptrs[20];
+ inptrs[7] = input_ptrs[1];
+ inptrs[8] = input_ptrs[4];
+ inptrs[9] = input_ptrs[21];
+ inptrs[10] = input_ptrs[6];
+ inptrs[11] = input_ptrs[11];
+ inptrs[12] = input_ptrs[24];
+ inptrs[13] = input_ptrs[8];
+ inptrs[14] = input_ptrs[29];
+ inptrs[15] = input_ptrs[9];
+ inptrs[16] = input_ptrs[31];
+ inptrs[17] = input_ptrs[13];
+ inptrs[18] = input_ptrs[34];
+ inptrs[19] = input_ptrs[16];
+ inptrs[20] = input_ptrs[2];
+ inptrs[21] = input_ptrs[19];
+ inptrs[22] = input_ptrs[3];
+ inptrs[23] = input_ptrs[12];
+ inptrs[24] = input_ptrs[22];
+ inptrs[25] = input_ptrs[17];
+ inptrs[26] = input_ptrs[18];
+ inptrs[27] = input_ptrs[26];
+ inptrs[28] = input_ptrs[23];
+ inptrs[29] = input_ptrs[32];
+ inptrs[30] = input_ptrs[27];
+ inptrs[31] = input_ptrs[33];
+ inptrs[32] = input_ptrs[7];
+ inptrs[33] = input_ptrs[10];
+ inptrs[34] = input_ptrs[25];
+ inptrs[35] = input_ptrs[28];
+
+ }
+ };
+
+ Args params_struct(input_ptrs, outptrs, params,
+ activation_min, activation_max);
+
+ __asm__ __volatile__(
+ "ldr x8, [%x[params_struct], %[offsetof_args_params]]\n"
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "add x17, %x[params_struct], %[offsetof_Args_inptrs]\n"
+ "ptrue p3.b\n"
+ ".inst 0x25207810 // ptrue pn8.b\n"
+ "ld1w { z13.s }, p3/Z, [x8]\n"
+ "addvl x8, x8, #1\n"
+ "ldp x23, x22, [x17, #0x0]\n"
+ "ldp x21, x20, [x17, #0x10]\n"
+ "cntw x16\n"
+ ".inst 0xa040c100 // ld1w { z0.s-z3.s }, pn8.b/Z, [x8]\n"
+ "addvl x8, x8, #4\n"
+ "mov x15, #0x0\n"
+ "whilelt p2.s, XZR, %x[n_channels]\n"
+ ".inst 0xa040c104 // ld1w { z4.s-z7.s }, pn8.b/Z, [x8]\n"
+ "ldr x14, [%x[params_struct], %[offsetof_args_outptrs]]\n"
+ "addvl x8, x8, #4\n"
+ "cmp x16, %x[n_channels]\n"
+ "ld1rw { z14.s }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+ "ld1rw { z15.s }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+ "sub x13, XZR, x16\n"
+ "ld1w { z8.s }, p3/Z, [x8]\n"
+ "addvl x8, x8, #1\n"
+ "ld1w { z9.s }, p2/Z, [x23, x15, LSL #2]\n"
+ "ld1w { z10.s }, p2/Z, [x22, x15, LSL #2]\n"
+ "ld1w { z11.s }, p2/Z, [x21, x15, LSL #2]\n"
+ "ld1w { z12.s }, p2/Z, [x20, x15, LSL #2]\n"
+ "bge 2f\n"
+ "1:" // Channel loop
+ "movprfx z29, z13\n fmla z29.s, p3/M, z4.s, z9.s\n"
+ "movprfx z16, z13\n fmla z16.s, p3/M, z8.s, z9.s\n"
+ "ldr x24, [x17, #0x20]\n"
+ "incw x13\n"
+ "movprfx z30, z13\n fmla z30.s, p3/M, z3.s, z9.s\n"
+ "movprfx z25, z13\n fmla z25.s, p3/M, z1.s, z9.s\n"
+ "ldr x20, [x17, #0x30]\n"
+ "mov p1.b, p2.b\n"
+ "movprfx z26, z13\n fmla z26.s, p3/M, z0.s, z9.s\n"
+ "ldr x21, [x17, #0x28]\n"
+ "movprfx z17, z13\n fmla z17.s, p3/M, z7.s, z9.s\n"
+ "whilelt p0.s, x16, %x[n_channels]\n"
+ "movprfx z18, z13\n fmla z18.s, p3/M, z6.s, z9.s\n"
+ "fmla z29.s, p3/M, z5.s, z12.s\n"
+ "ldr x23, [x17, #0x38]\n"
+ "movprfx z28, z13\n fmla z28.s, p3/M, z5.s, z9.s\n"
+ "movprfx z24, z13\n fmla z24.s, p3/M, z2.s, z9.s\n"
+ "ld1w { z9.s }, p2/Z, [x20, x15, LSL #2]\n"
+ "ldr x22, [x17, #0x40]\n"
+ "fmla z16.s, p3/M, z0.s, z10.s\n"
+ "movprfx z19, z13\n fmla z19.s, p3/M, z2.s, z11.s\n"
+ "ld1w { z22.s }, p2/Z, [x24, x15, LSL #2]\n"
+ "ldr x20, [x17, #0x48]\n"
+ "fmla z30.s, p3/M, z4.s, z12.s\n"
+ "fmla z25.s, p3/M, z2.s, z12.s\n"
+ "ld1w { z21.s }, p2/Z, [x21, x15, LSL #2]\n"
+ "ldr x27, [x17, #0x50]\n"
+ "fmla z26.s, p3/M, z1.s, z12.s\n"
+ "fmla z17.s, p3/M, z8.s, z12.s\n"
+ "ldr x26, [x17, #0x60]\n"
+ "fmla z18.s, p3/M, z7.s, z12.s\n"
+ "movprfx z20, z13\n fmla z20.s, p3/M, z6.s, z22.s\n"
+ "ld1w { z11.s }, p2/Z, [x20, x15, LSL #2]\n"
+ "ldr x25, [x17, #0x68]\n"
+ "fmla z29.s, p3/M, z7.s, z9.s\n"
+ "fmla z19.s, p3/M, z6.s, z12.s\n"
+ "ldr x21, [x17, #0x58]\n"
+ "movprfx z31, z13\n fmla z31.s, p3/M, z3.s, z12.s\n"
+ "movprfx z27, z13\n fmla z27.s, p3/M, z0.s, z12.s\n"
+ "ld1w { z10.s }, p2/Z, [x23, x15, LSL #2]\n"
+ "ldr x24, [x17, #0x70]\n"
+ "movprfx z23, z13\n fmla z23.s, p3/M, z8.s, z21.s\n"
+ "fmla z30.s, p3/M, z6.s, z9.s\n"
+ "ld1w { z12.s }, p2/Z, [x22, x15, LSL #2]\n"
+ "ldr x23, [x17, #0x78]\n"
+ "fmla z25.s, p3/M, z4.s, z9.s\n"
+ "fmla z26.s, p3/M, z3.s, z9.s\n"
+ "ldr x22, [x17, #0x80]\n"
+ "movprfx z21, z13\n fmla z21.s, p3/M, z1.s, z9.s\n"
+ "movprfx z22, z13\n fmla z22.s, p3/M, z0.s, z9.s\n"
+ "ldr x20, [x17, #0x88]\n"
+ "ld1w { z13.s }, p3/Z, [x8]\n"
+ "fmla z28.s, p3/M, z8.s, z9.s\n"
+ "fmla z24.s, p3/M, z5.s, z9.s\n"
+ "ldr x12, [x14, #0x0]\n"
+ "addvl x8, x8, #1\n"
+ "fmla z20.s, p3/M, z2.s, z9.s\n"
+ "fmla z16.s, p3/M, z1.s, z10.s\n"
+ "ld1w { z9.s }, p2/Z, [x27, x15, LSL #2]\n"
+ "ldr x27, [x17, #0x90]\n"
+ "fmla z17.s, p3/M, z0.s, z10.s\n"
+ "fmla z18.s, p3/M, z2.s, z12.s\n"
+ "ld1w { z10.s }, p2/Z, [x21, x15, LSL #2]\n"
+ "ldr x21, [x17, #0x98]\n"
+ "fmla z29.s, p3/M, z8.s, z11.s\n"
+ "fmla z19.s, p3/M, z1.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x26, x15, LSL #2]\n"
+ "ldr x26, [x17, #0xa0]\n"
+ "fmla z30.s, p3/M, z7.s, z11.s\n"
+ "fmla z31.s, p3/M, z6.s, z11.s\n"
+ "ldr x11, [x14, #0x8]\n"
+ "fmla z25.s, p3/M, z5.s, z11.s\n"
+ "fmla z26.s, p3/M, z4.s, z11.s\n"
+ "ldr x10, [x14, #0x10]\n"
+ "fmla z27.s, p3/M, z3.s, z11.s\n"
+ "fmla z21.s, p3/M, z2.s, z11.s\n"
+ "ldr x9, [x14, #0x18]\n"
+ "fmla z22.s, p3/M, z1.s, z11.s\n"
+ "fmla z23.s, p3/M, z0.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x25, x15, LSL #2]\n"
+ "ldr x25, [x17, #0xa8]\n"
+ "fmla z16.s, p3/M, z3.s, z9.s\n"
+ "fmla z28.s, p3/M, z0.s, z9.s\n"
+ "fmla z24.s, p3/M, z6.s, z12.s\n"
+ "fmla z20.s, p3/M, z3.s, z12.s\n"
+ "ld1w { z9.s }, p2/Z, [x24, x15, LSL #2]\n"
+ "ldr x24, [x17, #0xb0]\n"
+ "fmla z17.s, p3/M, z4.s, z11.s\n"
+ "fmla z18.s, p3/M, z3.s, z11.s\n"
+ "fmla z29.s, p3/M, z1.s, z11.s\n"
+ "fmla z19.s, p3/M, z5.s, z10.s\n"
+ "fmla z31.s, p3/M, z2.s, z10.s\n"
+ "fmla z30.s, p3/M, z0.s, z11.s\n"
+ "ld1w { z12.s }, p2/Z, [x23, x15, LSL #2]\n"
+ "ldr x23, [x17, #0xb8]\n"
+ "fmla z27.s, p3/M, z8.s, z9.s\n"
+ "fmla z23.s, p3/M, z5.s, z9.s\n"
+ "ld1w { z10.s }, p2/Z, [x22, x15, LSL #2]\n"
+ "ldr x22, [x17, #0xc0]\n"
+ "fmla z16.s, p3/M, z5.s, z11.s\n"
+ "fmla z28.s, p3/M, z2.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x20, x15, LSL #2]\n"
+ "ldr x20, [x17, #0xc8]\n"
+ "fmla z17.s, p3/M, z5.s, z12.s\n"
+ "fmla z18.s, p3/M, z4.s, z12.s\n"
+ "fmla z29.s, p3/M, z2.s, z12.s\n"
+ "fmla z19.s, p3/M, z3.s, z12.s\n"
+ "fmla z30.s, p3/M, z1.s, z12.s\n"
+ "fmla z31.s, p3/M, z0.s, z12.s\n"
+ "ld1w { z9.s }, p2/Z, [x21, x15, LSL #2]\n"
+ "ldr x28, [x17, #0xd8]\n"
+ "fmla z20.s, p3/M, z7.s, z10.s\n"
+ "fmla z21.s, p3/M, z6.s, z10.s\n"
+ "ld1w { z10.s }, p2/Z, [x27, x15, LSL #2]\n"
+ "ldr x21, [x17, #0xd0]\n"
+ "fmla z16.s, p3/M, z7.s, z11.s\n"
+ "fmla z17.s, p3/M, z6.s, z11.s\n"
+ "fmla z28.s, p3/M, z4.s, z11.s\n"
+ "fmla z29.s, p3/M, z3.s, z11.s\n"
+ "fmla z24.s, p3/M, z1.s, z11.s\n"
+ "fmla z25.s, p3/M, z0.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x26, x15, LSL #2]\n"
+ "ldr x27, [x17, #0xe0]\n"
+ "fmla z18.s, p3/M, z8.s, z9.s\n"
+ "fmla z22.s, p3/M, z8.s, z10.s\n"
+ "fmla z23.s, p3/M, z7.s, z10.s\n"
+ "ld1w { z10.s }, p2/Z, [x25, x15, LSL #2]\n"
+ "fmla z27.s, p3/M, z1.s, z9.s\n"
+ "ldr x26, [x17, #0xe8]\n"
+ "fmla z19.s, p3/M, z7.s, z9.s\n"
+ "fmla z30.s, p3/M, z5.s, z9.s\n"
+ "fmla z31.s, p3/M, z4.s, z9.s\n"
+ "fmla z26.s, p3/M, z2.s, z9.s\n"
+ "ld1w { z9.s }, p2/Z, [x24, x15, LSL #2]\n"
+ "ldr x25, [x17, #0xf0]\n"
+ "fmla z16.s, p3/M, z2.s, z11.s\n"
+ "fmla z17.s, p3/M, z1.s, z11.s\n"
+ "fmla z18.s, p3/M, z0.s, z11.s\n"
+ "fmla z28.s, p3/M, z7.s, z10.s\n"
+ "ld1w { z11.s }, p2/Z, [x23, x15, LSL #2]\n"
+ "ldr x24, [x17, #0xf8]\n"
+ "fmla z29.s, p3/M, z6.s, z10.s\n"
+ "fmla z24.s, p3/M, z4.s, z10.s\n"
+ "fmla z25.s, p3/M, z3.s, z10.s\n"
+ "fmla z20.s, p3/M, z1.s, z10.s\n"
+ "fmla z21.s, p3/M, z0.s, z10.s\n"
+ "ld1w { z10.s }, p2/Z, [x22, x15, LSL #2]\n"
+ "fmla z27.s, p3/M, z4.s, z10.s\n"
+ "ldr x23, [x17, #0x100]\n"
+ "fmla z22.s, p3/M, z2.s, z10.s\n"
+ "fmla z17.s, p3/M, z2.s, z9.s\n"
+ "fmla z18.s, p3/M, z1.s, z9.s\n"
+ "fmla z19.s, p3/M, z0.s, z9.s\n"
+ "ld1w { z9.s }, p2/Z, [x20, x15, LSL #2]\n"
+ "ldr x20, [x17, #0x108]\n"
+ "fmla z16.s, p3/M, z6.s, z11.s\n"
+ "fmla z28.s, p3/M, z3.s, z11.s\n"
+ "fmla z24.s, p3/M, z0.s, z11.s\n"
+ "fmla z30.s, p3/M, z8.s, z10.s\n"
+ "ld1w { z11.s }, p2/Z, [x21, x15, LSL #2]\n"
+ "ldr x22, [x17, #0x110]\n"
+ "fmla z31.s, p3/M, z7.s, z10.s\n"
+ "fmla z26.s, p3/M, z5.s, z10.s\n"
+ "fmla z23.s, p3/M, z1.s, z10.s\n"
+ "ld1w { z10.s }, p2/Z, [x28, x15, LSL #2]\n"
+ "fmla z27.s, p3/M, z2.s, z9.s\n"
+ "ldr x21, [x17, #0x118]\n"
+ "fmla z20.s, p3/M, z0.s, z11.s\n"
+ "fmla z21.s, p3/M, z4.s, z10.s\n"
+ "fmla z22.s, p3/M, z3.s, z10.s\n"
+ "fmla z19.s, p3/M, z8.s, z9.s\n"
+ "fmla z31.s, p3/M, z5.s, z9.s\n"
+ "fmla z28.s, p3/M, z6.s, z11.s\n"
+ "ld1w { z9.s }, p2/Z, [x27, x15, LSL #2]\n"
+ "fmla z24.s, p3/M, z3.s, z11.s\n"
+ "ld1w { z12.s }, p2/Z, [x26, x15, LSL #2]\n"
+ "fmla z25.s, p3/M, z7.s, z10.s\n"
+ "fmla z26.s, p3/M, z6.s, z10.s\n"
+ "fmla z20.s, p3/M, z5.s, z10.s\n"
+ "fmla z27.s, p3/M, z5.s, z9.s\n"
+ "fmla z23.s, p3/M, z2.s, z9.s\n"
+ "fmla z21.s, p3/M, z7.s, z12.s\n"
+ "fmla z22.s, p3/M, z6.s, z12.s\n"
+ "fmla z24.s, p3/M, z8.s, z10.s\n"
+ "ld1w { z11.s }, p2/Z, [x25, x15, LSL #2]\n"
+ "fmla z20.s, p3/M, z8.s, z12.s\n"
+ "fmla z25.s, p3/M, z8.s, z11.s\n"
+ "fmla z26.s, p3/M, z7.s, z11.s\n"
+ "ld1w { z10.s }, p2/Z, [x23, x15, LSL #2]\n"
+ "fmla z27.s, p3/M, z6.s, z11.s\n"
+ "fmla z21.s, p3/M, z5.s, z11.s\n"
+ "fmla z22.s, p3/M, z4.s, z11.s\n"
+ "fmla z23.s, p3/M, z3.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x20, x15, LSL #2]\n"
+ "ldp x20, x25, [x17, #0x0]\n"
+ "fmla z31.s, p3/M, z8.s, z9.s\n"
+ "ld1w { z12.s }, p2/Z, [x24, x15, LSL #2]\n"
+ "fmla z16.s, p3/M, z4.s, z10.s\n"
+ "fmla z17.s, p3/M, z3.s, z10.s\n"
+ "fmla z18.s, p3/M, z5.s, z11.s\n"
+ "ld1w { z9.s }, p0/Z, [x20, x16, LSL #2]\n"
+ "fmla z19.s, p3/M, z4.s, z11.s\n"
+ "fmla z21.s, p3/M, z8.s, z12.s\n"
+ "fmla z22.s, p3/M, z7.s, z12.s\n"
+ "fmla z23.s, p3/M, z6.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x22, x15, LSL #2]\n"
+ "fmla z28.s, p3/M, z1.s, z10.s\n"
+ "fmla z29.s, p3/M, z0.s, z10.s\n"
+ "ld1w { z0.s }, p2/Z, [x21, x15, LSL #2]\n"
+ "ldp x20, x24, [x17, #0x10]\n"
+ "fmla z30.s, p3/M, z2.s, z11.s\n"
+ "fmla z31.s, p3/M, z1.s, z11.s\n"
+ "incw x15\n"
+ "ld1w { z11.s }, p0/Z, [x20, x16, LSL #2]\n"
+ ".inst 0xc1afc9d0 // fclamp { z16.s-z19.s }, z14.s, z15.s\n"
+ "st1w { z16.s }, p1, [x12, x13, LSL #2]\n"
+ "ldr x23, [x14, #0x20]\n"
+ "fmla z24.s, p3/M, z7.s, z12.s\n"
+ "st1w { z17.s }, p1, [x11, x13, LSL #2]\n"
+ "ldr x22, [x14, #0x28]\n"
+ "fmla z25.s, p3/M, z6.s, z12.s\n"
+ "fmla z26.s, p3/M, z8.s, z0.s\n"
+ "st1w { z18.s }, p1, [x10, x13, LSL #2]\n"
+ "ldr x21, [x14, #0x30]\n"
+ "fmla z27.s, p3/M, z7.s, z0.s\n"
+ ".inst 0xc1afc9dc // fclamp { z28.s-z31.s }, z14.s, z15.s\n"
+ "st1w { z19.s }, p1, [x9, x13, LSL #2]\n"
+ "ldr x20, [x14, #0x38]\n"
+ "fmla z20.s, p3/M, z4.s, z12.s\n"
+ "fmla z21.s, p3/M, z3.s, z12.s\n"
+ "st1w { z28.s }, p1, [x23, x13, LSL #2]\n"
+ "ldr x23, [x14, #0x40]\n"
+ "fmla z22.s, p3/M, z5.s, z0.s\n"
+ "fmla z23.s, p3/M, z4.s, z0.s\n"
+ "st1w { z29.s }, p1, [x22, x13, LSL #2]\n"
+ "ldr x22, [x14, #0x48]\n"
+ ".inst 0xc1afc9d8 // fclamp { z24.s-z27.s }, z14.s, z15.s\n"
+ "ld1w { z10.s }, p0/Z, [x25, x16, LSL #2]\n"
+ "st1w { z30.s }, p1, [x21, x13, LSL #2]\n"
+ "ldr x21, [x14, #0x50]\n"
+ "ld1w { z12.s }, p0/Z, [x24, x16, LSL #2]\n"
+ "incw x16\n"
+ "st1w { z31.s }, p1, [x20, x13, LSL #2]\n"
+ "ldr x20, [x14, #0x58]\n"
+ ".inst 0xa040c100 // ld1w { z0.s-z3.s }, pn8.b/Z, [x8]\n"
+ "addvl x8, x8, #4\n"
+ "st1w { z24.s }, p1, [x23, x13, LSL #2]\n"
+ "ldr x23, [x14, #0x60]\n"
+ "whilelt p2.s, x15, %x[n_channels]\n"
+ ".inst 0xa040c104 // ld1w { z4.s-z7.s }, pn8.b/Z, [x8]\n"
+ "st1w { z25.s }, p1, [x22, x13, LSL #2]\n"
+ "ldr x22, [x14, #0x68]\n"
+ "addvl x8, x8, #4\n"
+ "cmp x16, %x[n_channels]\n"
+ "st1w { z26.s }, p1, [x21, x13, LSL #2]\n"
+ "ldr x21, [x14, #0x70]\n"
+ ".inst 0xc1afc9d4 // fclamp { z20.s-z23.s }, z14.s, z15.s\n"
+ "ld1w { z8.s }, p3/Z, [x8]\n"
+ "st1w { z27.s }, p1, [x20, x13, LSL #2]\n"
+ "ldr x20, [x14, #0x78]\n"
+ "addvl x8, x8, #1\n"
+ "st1w { z20.s }, p1, [x23, x13, LSL #2]\n"
+ "st1w { z21.s }, p1, [x22, x13, LSL #2]\n"
+ "st1w { z22.s }, p1, [x21, x13, LSL #2]\n"
+ "st1w { z23.s }, p1, [x20, x13, LSL #2]\n"
+ "blt 1b\n"
+ "2:" // Channel tail
+ "movprfx z29, z13\n fmla z29.s, p3/M, z4.s, z9.s\n"
+ "movprfx z20, z13\n fmla z20.s, p3/M, z8.s, z9.s\n"
+ "ldr x24, [x17, #0x20]\n"
+ "incw x13\n"
+ "movprfx z30, z13\n fmla z30.s, p3/M, z3.s, z9.s\n"
+ "movprfx z25, z13\n fmla z25.s, p3/M, z1.s, z9.s\n"
+ "ldr x20, [x17, #0x30]\n"
+ "mov p0.b, p2.b\n"
+ "movprfx z26, z13\n fmla z26.s, p3/M, z0.s, z9.s\n"
+ "ldr x23, [x17, #0x28]\n"
+ "movprfx z21, z13\n fmla z21.s, p3/M, z7.s, z9.s\n"
+ "movprfx z22, z13\n fmla z22.s, p3/M, z6.s, z9.s\n"
+ "fmla z29.s, p3/M, z5.s, z12.s\n"
+ "ldr x22, [x17, #0x38]\n"
+ "movprfx z28, z13\n fmla z28.s, p3/M, z5.s, z9.s\n"
+ "movprfx z24, z13\n fmla z24.s, p3/M, z2.s, z9.s\n"
+ "ld1w { z9.s }, p2/Z, [x20, x15, LSL #2]\n"
+ "ldr x21, [x17, #0x40]\n"
+ "fmla z20.s, p3/M, z0.s, z10.s\n"
+ "movprfx z23, z13\n fmla z23.s, p3/M, z2.s, z11.s\n"
+ "ld1w { z19.s }, p2/Z, [x24, x15, LSL #2]\n"
+ "ldr x20, [x17, #0x48]\n"
+ "fmla z30.s, p3/M, z4.s, z12.s\n"
+ "fmla z25.s, p3/M, z2.s, z12.s\n"
+ "ld1w { z17.s }, p2/Z, [x23, x15, LSL #2]\n"
+ "ldr x27, [x17, #0x50]\n"
+ "fmla z26.s, p3/M, z1.s, z12.s\n"
+ "fmla z21.s, p3/M, z8.s, z12.s\n"
+ "ldr x26, [x17, #0x60]\n"
+ "fmla z22.s, p3/M, z7.s, z12.s\n"
+ "movprfx z16, z13\n fmla z16.s, p3/M, z6.s, z19.s\n"
+ "ld1w { z11.s }, p2/Z, [x20, x15, LSL #2]\n"
+ "ldr x25, [x17, #0x68]\n"
+ "fmla z29.s, p3/M, z7.s, z9.s\n"
+ "fmla z23.s, p3/M, z6.s, z12.s\n"
+ "ldr x20, [x17, #0x58]\n"
+ "movprfx z31, z13\n fmla z31.s, p3/M, z3.s, z12.s\n"
+ "movprfx z27, z13\n fmla z27.s, p3/M, z0.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x22, x15, LSL #2]\n"
+ "ldr x24, [x17, #0x70]\n"
+ "movprfx z19, z13\n fmla z19.s, p3/M, z8.s, z17.s\n"
+ "fmla z30.s, p3/M, z6.s, z9.s\n"
+ "ld1w { z10.s }, p2/Z, [x21, x15, LSL #2]\n"
+ "ldr x23, [x17, #0x78]\n"
+ "fmla z25.s, p3/M, z4.s, z9.s\n"
+ "fmla z26.s, p3/M, z3.s, z9.s\n"
+ "ldr x22, [x17, #0x80]\n"
+ "movprfx z17, z13\n fmla z17.s, p3/M, z1.s, z9.s\n"
+ "movprfx z18, z13\n fmla z18.s, p3/M, z0.s, z9.s\n"
+ "ldr x21, [x17, #0x88]\n"
+ "fmla z28.s, p3/M, z8.s, z9.s\n"
+ "fmla z24.s, p3/M, z5.s, z9.s\n"
+ "ldr x12, [x14, #0x0]\n"
+ "fmla z16.s, p3/M, z2.s, z9.s\n"
+ "fmla z20.s, p3/M, z1.s, z12.s\n"
+ "ld1w { z9.s }, p2/Z, [x27, x15, LSL #2]\n"
+ "ldr x27, [x17, #0x90]\n"
+ "fmla z21.s, p3/M, z0.s, z12.s\n"
+ "fmla z22.s, p3/M, z2.s, z10.s\n"
+ "ld1w { z13.s }, p2/Z, [x20, x15, LSL #2]\n"
+ "ldr x20, [x17, #0x98]\n"
+ "fmla z29.s, p3/M, z8.s, z11.s\n"
+ "fmla z23.s, p3/M, z1.s, z10.s\n"
+ "ld1w { z12.s }, p2/Z, [x26, x15, LSL #2]\n"
+ "ldr x26, [x17, #0xa0]\n"
+ "fmla z30.s, p3/M, z7.s, z11.s\n"
+ "fmla z31.s, p3/M, z6.s, z11.s\n"
+ "ldr x11, [x14, #0x8]\n"
+ "fmla z25.s, p3/M, z5.s, z11.s\n"
+ "fmla z26.s, p3/M, z4.s, z11.s\n"
+ "ldr x10, [x14, #0x10]\n"
+ "fmla z27.s, p3/M, z3.s, z11.s\n"
+ "fmla z17.s, p3/M, z2.s, z11.s\n"
+ "ldr x9, [x14, #0x18]\n"
+ "fmla z18.s, p3/M, z1.s, z11.s\n"
+ "fmla z19.s, p3/M, z0.s, z11.s\n"
+ "ld1w { z10.s }, p2/Z, [x25, x15, LSL #2]\n"
+ "ldr x25, [x17, #0xa8]\n"
+ "fmla z20.s, p3/M, z3.s, z9.s\n"
+ "fmla z28.s, p3/M, z0.s, z9.s\n"
+ "fmla z24.s, p3/M, z6.s, z12.s\n"
+ "fmla z16.s, p3/M, z3.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x24, x15, LSL #2]\n"
+ "ldr x24, [x17, #0xb0]\n"
+ "fmla z21.s, p3/M, z4.s, z10.s\n"
+ "fmla z22.s, p3/M, z3.s, z10.s\n"
+ "fmla z29.s, p3/M, z1.s, z10.s\n"
+ "fmla z23.s, p3/M, z5.s, z13.s\n"
+ "fmla z31.s, p3/M, z2.s, z13.s\n"
+ "fmla z30.s, p3/M, z0.s, z10.s\n"
+ "ld1w { z13.s }, p2/Z, [x23, x15, LSL #2]\n"
+ "ldr x23, [x17, #0xb8]\n"
+ "fmla z27.s, p3/M, z8.s, z12.s\n"
+ "fmla z19.s, p3/M, z5.s, z12.s\n"
+ "ld1w { z9.s }, p2/Z, [x22, x15, LSL #2]\n"
+ "ldr x22, [x17, #0xc0]\n"
+ "fmla z20.s, p3/M, z5.s, z10.s\n"
+ "fmla z28.s, p3/M, z2.s, z10.s\n"
+ "ld1w { z12.s }, p2/Z, [x21, x15, LSL #2]\n"
+ "ldr x21, [x17, #0xc8]\n"
+ "fmla z21.s, p3/M, z5.s, z13.s\n"
+ "fmla z22.s, p3/M, z4.s, z13.s\n"
+ "fmla z29.s, p3/M, z2.s, z13.s\n"
+ "fmla z23.s, p3/M, z3.s, z13.s\n"
+ "fmla z30.s, p3/M, z1.s, z13.s\n"
+ "fmla z31.s, p3/M, z0.s, z13.s\n"
+ "ld1w { z10.s }, p2/Z, [x20, x15, LSL #2]\n"
+ "ldr x28, [x17, #0xd8]\n"
+ "fmla z16.s, p3/M, z7.s, z9.s\n"
+ "fmla z17.s, p3/M, z6.s, z9.s\n"
+ "ld1w { z11.s }, p2/Z, [x27, x15, LSL #2]\n"
+ "ldr x20, [x17, #0xd0]\n"
+ "fmla z20.s, p3/M, z7.s, z12.s\n"
+ "fmla z21.s, p3/M, z6.s, z12.s\n"
+ "fmla z28.s, p3/M, z4.s, z12.s\n"
+ "fmla z29.s, p3/M, z3.s, z12.s\n"
+ "fmla z24.s, p3/M, z1.s, z12.s\n"
+ "fmla z25.s, p3/M, z0.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x26, x15, LSL #2]\n"
+ "ldr x27, [x17, #0xe0]\n"
+ "fmla z22.s, p3/M, z8.s, z10.s\n"
+ "fmla z18.s, p3/M, z8.s, z11.s\n"
+ "fmla z19.s, p3/M, z7.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x25, x15, LSL #2]\n"
+ "fmla z27.s, p3/M, z1.s, z10.s\n"
+ "ldr x26, [x17, #0xe8]\n"
+ "fmla z23.s, p3/M, z7.s, z10.s\n"
+ "fmla z30.s, p3/M, z5.s, z10.s\n"
+ "fmla z31.s, p3/M, z4.s, z10.s\n"
+ "fmla z26.s, p3/M, z2.s, z10.s\n"
+ "ld1w { z9.s }, p2/Z, [x24, x15, LSL #2]\n"
+ "ldr x25, [x17, #0xf0]\n"
+ "fmla z20.s, p3/M, z2.s, z12.s\n"
+ "fmla z21.s, p3/M, z1.s, z12.s\n"
+ "fmla z22.s, p3/M, z0.s, z12.s\n"
+ "fmla z28.s, p3/M, z7.s, z11.s\n"
+ "ld1w { z12.s }, p2/Z, [x23, x15, LSL #2]\n"
+ "ldr x24, [x17, #0xf8]\n"
+ "fmla z29.s, p3/M, z6.s, z11.s\n"
+ "fmla z24.s, p3/M, z4.s, z11.s\n"
+ "fmla z25.s, p3/M, z3.s, z11.s\n"
+ "fmla z16.s, p3/M, z1.s, z11.s\n"
+ "fmla z17.s, p3/M, z0.s, z11.s\n"
+ "ld1w { z10.s }, p2/Z, [x22, x15, LSL #2]\n"
+ "fmla z27.s, p3/M, z4.s, z10.s\n"
+ "ldr x23, [x17, #0x100]\n"
+ "fmla z18.s, p3/M, z2.s, z10.s\n"
+ "fmla z21.s, p3/M, z2.s, z9.s\n"
+ "fmla z22.s, p3/M, z1.s, z9.s\n"
+ "fmla z23.s, p3/M, z0.s, z9.s\n"
+ "ld1w { z11.s }, p2/Z, [x21, x15, LSL #2]\n"
+ "ldr x22, [x17, #0x108]\n"
+ "fmla z20.s, p3/M, z6.s, z12.s\n"
+ "fmla z28.s, p3/M, z3.s, z12.s\n"
+ "fmla z24.s, p3/M, z0.s, z12.s\n"
+ "fmla z30.s, p3/M, z8.s, z10.s\n"
+ "ld1w { z12.s }, p2/Z, [x20, x15, LSL #2]\n"
+ "ldr x21, [x17, #0x110]\n"
+ "fmla z31.s, p3/M, z7.s, z10.s\n"
+ "fmla z26.s, p3/M, z5.s, z10.s\n"
+ "fmla z19.s, p3/M, z1.s, z10.s\n"
+ "ld1w { z9.s }, p2/Z, [x28, x15, LSL #2]\n"
+ "fmla z27.s, p3/M, z2.s, z11.s\n"
+ "ldr x20, [x17, #0x118]\n"
+ "fmla z16.s, p3/M, z0.s, z12.s\n"
+ "fmla z17.s, p3/M, z4.s, z9.s\n"
+ "fmla z18.s, p3/M, z3.s, z9.s\n"
+ "fmla z23.s, p3/M, z8.s, z11.s\n"
+ "fmla z31.s, p3/M, z5.s, z11.s\n"
+ "fmla z28.s, p3/M, z6.s, z12.s\n"
+ "ld1w { z10.s }, p2/Z, [x27, x15, LSL #2]\n"
+ "fmla z24.s, p3/M, z3.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x26, x15, LSL #2]\n"
+ "fmla z25.s, p3/M, z7.s, z9.s\n"
+ "fmla z26.s, p3/M, z6.s, z9.s\n"
+ "fmla z16.s, p3/M, z5.s, z9.s\n"
+ "fmla z27.s, p3/M, z5.s, z10.s\n"
+ "fmla z19.s, p3/M, z2.s, z10.s\n"
+ "fmla z17.s, p3/M, z7.s, z12.s\n"
+ "fmla z18.s, p3/M, z6.s, z12.s\n"
+ "fmla z24.s, p3/M, z8.s, z9.s\n"
+ "ld1w { z11.s }, p2/Z, [x25, x15, LSL #2]\n"
+ "fmla z16.s, p3/M, z8.s, z12.s\n"
+ "fmla z25.s, p3/M, z8.s, z11.s\n"
+ "fmla z26.s, p3/M, z7.s, z11.s\n"
+ "ld1w { z9.s }, p2/Z, [x23, x15, LSL #2]\n"
+ "fmla z27.s, p3/M, z6.s, z11.s\n"
+ "fmla z17.s, p3/M, z5.s, z11.s\n"
+ "fmla z18.s, p3/M, z4.s, z11.s\n"
+ "fmla z19.s, p3/M, z3.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x22, x15, LSL #2]\n"
+ "fmla z31.s, p3/M, z8.s, z10.s\n"
+ "ld1w { z12.s }, p2/Z, [x24, x15, LSL #2]\n"
+ "fmla z20.s, p3/M, z4.s, z9.s\n"
+ "fmla z21.s, p3/M, z3.s, z9.s\n"
+ "fmla z22.s, p3/M, z5.s, z11.s\n"
+ "fmla z23.s, p3/M, z4.s, z11.s\n"
+ "fmla z17.s, p3/M, z8.s, z12.s\n"
+ "fmla z18.s, p3/M, z7.s, z12.s\n"
+ "fmla z19.s, p3/M, z6.s, z12.s\n"
+ "ld1w { z13.s }, p2/Z, [x21, x15, LSL #2]\n"
+ "fmla z28.s, p3/M, z1.s, z9.s\n"
+ "fmla z29.s, p3/M, z0.s, z9.s\n"
+ "ld1w { z0.s }, p2/Z, [x20, x15, LSL #2]\n"
+ "fmla z30.s, p3/M, z2.s, z11.s\n"
+ "fmla z31.s, p3/M, z1.s, z11.s\n"
+ ".inst 0xc1afc9d4 // fclamp { z20.s-z23.s }, z14.s, z15.s\n"
+ "st1w { z20.s }, p0, [x12, x13, LSL #2]\n"
+ "ldr x23, [x14, #0x20]\n"
+ "fmla z24.s, p3/M, z7.s, z13.s\n"
+ "st1w { z21.s }, p0, [x11, x13, LSL #2]\n"
+ "ldr x22, [x14, #0x28]\n"
+ "fmla z25.s, p3/M, z6.s, z13.s\n"
+ "fmla z26.s, p3/M, z8.s, z0.s\n"
+ "st1w { z22.s }, p0, [x10, x13, LSL #2]\n"
+ "ldr x21, [x14, #0x30]\n"
+ "fmla z27.s, p3/M, z7.s, z0.s\n"
+ ".inst 0xc1afc9dc // fclamp { z28.s-z31.s }, z14.s, z15.s\n"
+ "st1w { z23.s }, p0, [x9, x13, LSL #2]\n"
+ "ldr x20, [x14, #0x38]\n"
+ "fmla z16.s, p3/M, z4.s, z13.s\n"
+ "fmla z17.s, p3/M, z3.s, z13.s\n"
+ "st1w { z28.s }, p0, [x23, x13, LSL #2]\n"
+ "ldr x23, [x14, #0x40]\n"
+ "fmla z18.s, p3/M, z5.s, z0.s\n"
+ "fmla z19.s, p3/M, z4.s, z0.s\n"
+ "st1w { z29.s }, p0, [x22, x13, LSL #2]\n"
+ "ldr x22, [x14, #0x48]\n"
+ ".inst 0xc1afc9d8 // fclamp { z24.s-z27.s }, z14.s, z15.s\n"
+ ".inst 0xc1afc9d0 // fclamp { z16.s-z19.s }, z14.s, z15.s\n"
+ "st1w { z30.s }, p0, [x21, x13, LSL #2]\n"
+ "ldr x21, [x14, #0x50]\n"
+ "st1w { z31.s }, p0, [x20, x13, LSL #2]\n"
+ "ldr x20, [x14, #0x58]\n"
+ "st1w { z24.s }, p0, [x23, x13, LSL #2]\n"
+ "ldr x23, [x14, #0x60]\n"
+ "st1w { z25.s }, p0, [x22, x13, LSL #2]\n"
+ "ldr x22, [x14, #0x68]\n"
+ "st1w { z26.s }, p0, [x21, x13, LSL #2]\n"
+ "ldr x21, [x14, #0x70]\n"
+ "st1w { z27.s }, p0, [x20, x13, LSL #2]\n"
+ "ldr x20, [x14, #0x78]\n"
+ "st1w { z16.s }, p0, [x23, x13, LSL #2]\n"
+ "st1w { z17.s }, p0, [x22, x13, LSL #2]\n"
+ "st1w { z18.s }, p0, [x21, x13, LSL #2]\n"
+ "st1w { z19.s }, p0, [x20, x13, LSL #2]\n"
+ ".inst 0xd503467f // SMSTOP\n"
+ :
+ : [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME2)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
new file mode 100644
index 0000000000..dcffffeb21
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(ARM_COMPUTE_ENABLE_SME2)
+
+namespace arm_conv {
+namespace depthwise {
+
+// Raw kernel entry points for the SME2 fp32 NHWC 3x3 stride-2 depthwise
+// strategy; the indirect form consumes an array of input pointers, the direct
+// form walks a tiled input via row/column strides.  Implementations live in
+// the matching generic_indirect.cpp / generic_direct.cpp files.
+void sme2_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl(const float *const *const input_ptrs, float *const *const outptrs, const void *params, unsigned int n_channels, const float activation_min, const float activation_max);
+void sme2_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl(const unsigned int n_tile_rows, const unsigned int n_tile_cols, const float *inptr, int64_t ld_input_row, int64_t ld_input_col, float *outptr, int64_t ld_output_row, int64_t ld_output_col, const void *params, unsigned int n_channels, const float activation_min, const float activation_max);
+
+// Strategy descriptor binding the SME2 fp32 3x3/stride-2 depthwise kernels
+// (2x2 output tile) into the depthfirst driver framework.
+class sme2_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy<float, float, float, float>
+{
+  using Parent = DepthwiseDepthfirstStrategy<float, float, float, float>;
+
+  public:
+  using return_type = float;
+  constexpr static auto vl_type = arm_gemm::VLType::SME;
+
+  // 3x3 filter applied with stride 2 in both dimensions, emitting a 2x2
+  // output tile per invocation.
+  constexpr static unsigned int kernel_rows = 3;
+  constexpr static unsigned int kernel_cols = 3;
+
+  constexpr static unsigned int stride_rows = 2;
+  constexpr static unsigned int stride_cols = 2;
+
+  constexpr static unsigned int output_rows = 2;
+  constexpr static unsigned int output_cols = 2;
+
+  sme2_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst(const CPUInfo *)
+  : Parent(output_rows, output_cols, kernel_rows, kernel_cols, stride_rows, stride_cols) {}
+
+  arm_gemm::VLType get_vl_type(void) const override { return vl_type; }
+
+  // The kernels are stateless free functions, so hand their addresses back
+  // directly instead of caching them in data members.
+  Parent::IndirectKernelType get_indirect_kernel() const override { return sme2_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl; }
+  Parent::DirectKernelType get_direct_kernel() const override { return sme2_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl; }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME2)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp
new file mode 100644
index 0000000000..066b935486
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp
@@ -0,0 +1,374 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(ARM_COMPUTE_ENABLE_SME2)
+
+namespace arm_conv {
+namespace depthwise {
+
+// Direct (tiled) entry point for the SME2 fp32 NHWC 3x3 stride-2 depthwise
+// kernel producing a 2x2 output tile per iteration.  The computation is one
+// hand-scheduled SME2 inline-assembly blob; this C++ wrapper only packs the
+// arguments into a struct whose fields the assembly locates via offsetof().
+void sme2_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl(
+  const unsigned int n_tile_rows,
+  const unsigned int n_tile_cols,
+  const float *inptr,
+  int64_t ld_input_row,
+  int64_t ld_input_col,
+  float *outptr,
+  int64_t ld_output_row,
+  int64_t ld_output_col,
+  const void *params,
+  unsigned int n_channels,
+  const float activation_min,
+  const float activation_max
+)
+{
+  // Argument block consumed by the assembly.  The asm reads each field
+  // through an offsetof(Args, ...) operand, so this layout and the operand
+  // list below must stay in sync.  tile_i/tile_j are scratch state the asm
+  // both stores and reloads across tile-loop iterations.
+  struct Args
+  {
+    const uint64_t n_tile_rows, n_tile_cols;
+    const float *inptr;
+    const uint64_t ld_input_row;
+    const uint64_t ld_input_col;
+    float *outptr;
+    const uint64_t ld_output_row;
+    const uint64_t ld_output_col;
+    const void *params;
+    const float min, max;
+
+    uint64_t tile_i = 0, tile_j = 0;
+
+    Args(
+      const unsigned int n_tile_rows,
+      const unsigned int n_tile_cols,
+      const float *inptr,
+      int64_t ld_input_row,
+      int64_t ld_input_col,
+      float *outptr,
+      int64_t ld_output_row,
+      int64_t ld_output_col,
+      const void *params,
+      const float activation_min,
+      const float activation_max
+    ) : n_tile_rows(n_tile_rows), n_tile_cols(n_tile_cols), inptr(inptr),
+      ld_input_row(ld_input_row), ld_input_col(ld_input_col), outptr(outptr),
+      ld_output_row(ld_output_row), ld_output_col(ld_output_col),
+      params(params), min(activation_min), max(activation_max)
+    {
+    }
+  };
+
+  Args params_struct(
+    n_tile_rows, n_tile_cols,
+    inptr, ld_input_row, ld_input_col,
+    outptr, ld_output_row, ld_output_col,
+    params, activation_min, activation_max
+  );
+
+  // Enters streaming SVE mode (SMSTART ZA) for the duration of the kernel
+  // and leaves it (SMSTOP) before returning.
+  __asm__ __volatile__(
+    ".inst 0xd503477f // SMSTART ZA\n"
+    "ptrue p3.b\n"
+    ".inst 0x25207810 // ptrue pn8.b\n"
+    "mov x2, #0x0\n"
+    "mov x3, #0x0\n"
+    "1:" // Tile loop
+    "str x2, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+    "mov x22, #0x4\n"
+    "str x3, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+    "ldr x21, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
+    "mul x20, x2, x21\n" // offset = tile_i * ld_input_row
+    "ldr x4, [%x[params_struct], %[offsetof_args_ld_input_col]]\n"
+    "madd x20, x3, x4, x20\n" // offset += tile_j * ld_input_col
+    "mul x20, x20, x22\n" // offset *= kernel_stride * output_size
+    "ldr x5, [%x[params_struct], %[offsetof_args_inptr]]\n"
+    "add x5, x5, x20, LSL #2\n" // inptr[0] += offset * sizeof(float)
+    "add x6, x5, x21, LSL #2\n"
+    "add x7, x6, x21, LSL #2\n"
+    "add x8, x4, x4\n"
+    "ldr x17, [%x[params_struct], %[offsetof_args_params]]\n"
+    "add x16, x7, x21, LSL #2\n"
+    "add x15, x8, x4\n"
+    "add x14, x16, x21, LSL #2\n"
+    "add x13, x15, x4\n"
+    // One-shot input prefetch (rprfm pldonce) is only issued at the start of
+    // a tile row, i.e. when tile_j (x3) is still zero; otherwise fall through
+    // to label 2.
+    "cbnz x3, 2f\n"
+    "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+    "sub x21, x20, x3\n"
+    "sub x21, x21, #0x1\n"
+    "lsl x12, %x[n_channels], #0x2\n"
+    "mov x20, #0x10\n"
+    "and x21, x21, #0x3fffff\n"
+    "mul x20, x20, x4\n"
+    "orr x12, x12, x21, LSL #22\n"
+    "orr x12, x12, x20, LSL #38\n"
+    "add x27, x7, x8, LSL #2\n"
+    "add x26, x5, x4, LSL #2\n"
+    "add x25, x5, x15, LSL #2\n"
+    "add x24, x5, x13, LSL #2\n"
+    "add x23, x6, x4, LSL #2\n"
+    "add x22, x5, x8, LSL #2\n"
+    "add x21, x6, x15, LSL #2\n"
+    "add x20, x6, x13, LSL #2\n"
+    "add x11, x6, x8, LSL #2\n"
+    "add x10, x16, x4, LSL #2\n"
+    "add x9, x7, x4, LSL #2\n"
+    "add x28, x16, x15, LSL #2\n"
+    ".inst 0xf8ac4b7a // rprfm pldonce, x12, [x27]\n"
+    "add x27, x7, x15, LSL #2\n"
+    ".inst 0xf8ac48ba // rprfm pldonce, x12, [x5]\n"
+    ".inst 0xf8ac4b5a // rprfm pldonce, x12, [x26]\n"
+    "add x26, x16, x13, LSL #2\n"
+    ".inst 0xf8ac4b3a // rprfm pldonce, x12, [x25]\n"
+    "add x25, x7, x13, LSL #2\n"
+    ".inst 0xf8ac4b1a // rprfm pldonce, x12, [x24]\n"
+    "add x24, x14, x4, LSL #2\n"
+    ".inst 0xf8ac48da // rprfm pldonce, x12, [x6]\n"
+    ".inst 0xf8ac4afa // rprfm pldonce, x12, [x23]\n"
+    "add x23, x16, x8, LSL #2\n"
+    ".inst 0xf8ac4ada // rprfm pldonce, x12, [x22]\n"
+    "add x22, x14, x15, LSL #2\n"
+    ".inst 0xf8ac4aba // rprfm pldonce, x12, [x21]\n"
+    "add x21, x14, x8, LSL #2\n"
+    ".inst 0xf8ac4a9a // rprfm pldonce, x12, [x20]\n"
+    "add x20, x14, x13, LSL #2\n"
+    ".inst 0xf8ac497a // rprfm pldonce, x12, [x11]\n"
+    ".inst 0xf8ac4a1a // rprfm pldonce, x12, [x16]\n"
+    ".inst 0xf8ac48fa // rprfm pldonce, x12, [x7]\n"
+    ".inst 0xf8ac495a // rprfm pldonce, x12, [x10]\n"
+    ".inst 0xf8ac493a // rprfm pldonce, x12, [x9]\n"
+    ".inst 0xf8ac4b9a // rprfm pldonce, x12, [x28]\n"
+    ".inst 0xf8ac4b7a // rprfm pldonce, x12, [x27]\n"
+    ".inst 0xf8ac4b5a // rprfm pldonce, x12, [x26]\n"
+    ".inst 0xf8ac49da // rprfm pldonce, x12, [x14]\n"
+    ".inst 0xf8ac4b3a // rprfm pldonce, x12, [x25]\n"
+    ".inst 0xf8ac4b1a // rprfm pldonce, x12, [x24]\n"
+    ".inst 0xf8ac4afa // rprfm pldonce, x12, [x23]\n"
+    ".inst 0xf8ac4ada // rprfm pldonce, x12, [x22]\n"
+    ".inst 0xf8ac4aba // rprfm pldonce, x12, [x21]\n"
+    ".inst 0xf8ac4a9a // rprfm pldonce, x12, [x20]\n"
+    "2:" // Tile loop: Prefetch input rows: End
+    // Load bias (z22), the nine 3x3 weights (z0-z8) and the activation
+    // bounds (z26 = min, z24 = max) from the params/min/max fields, then
+    // preload the first set of input vectors.
+    "ldr x22, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
+    "mul x21, x2, x22\n" // offset = tile_i * ld_output_row
+    "mov x20, #0x2\n"
+    "ld1w { z22.s }, p3/Z, [x17]\n"
+    "ldr x25, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
+    "madd x21, x3, x25, x21\n" // offset += tile_j * ld_output_col
+    "addvl x17, x17, #1\n"
+    ".inst 0xa040c220 // ld1w { z0.s-z3.s }, pn8.b/Z, [x17]\n"
+    "ldr x24, [%x[params_struct], %[offsetof_args_outptr]]\n"
+    "mul x21, x21, x20\n" // offset *= output_tile_size
+    "cntw x23\n"
+    "ld1rw { z26.s }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+    "addvl x17, x17, #4\n"
+    "add x24, x24, x21, LSL #2\n" // outptrs[0] += offset * sizeof(float)
+    ".inst 0xa040c224 // ld1w { z4.s-z7.s }, pn8.b/Z, [x17]\n"
+    "whilelt p2.s, XZR, %x[n_channels]\n"
+    "addvl x17, x17, #4\n"
+    "ld1rw { z24.s }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+    "cmp x23, %x[n_channels]\n"
+    "add x22, x24, x22, LSL #2\n"
+    "ld1w { z8.s }, p3/Z, [x17]\n"
+    "mov x21, #0x0\n"
+    "sub x20, XZR, x23\n"
+    "ld1w { z9.s }, p2/Z, [x7, x8, LSL #2]\n"
+    "ld1w { z10.s }, p2/Z, [x5]\n"
+    "addvl x17, x17, #1\n"
+    "ld1w { z11.s }, p2/Z, [x5, x4, LSL #2]\n"
+    "ld1w { z12.s }, p2/Z, [x5, x15, LSL #2]\n"
+    "ld1w { z13.s }, p2/Z, [x5, x13, LSL #2]\n"
+    "ld1w { z14.s }, p2/Z, [x6]\n"
+    "ld1w { z15.s }, p2/Z, [x6, x4, LSL #2]\n"
+    "ld1w { z16.s }, p2/Z, [x5, x8, LSL #2]\n"
+    // Fewer than one full vector of channels left -> go straight to the tail.
+    "bge 4f\n"
+    "3:" // Tile loop: Channel loop
+    // Main per-channel loop: accumulate the four output accumulators
+    // z28-z31 (one per output pixel of the 2x2 tile) from bias z22.
+    "movprfx z28, z22\n fmla z28.s, p3/M, z8.s, z9.s\n"
+    "movprfx z29, z22\n fmla z29.s, p3/M, z6.s, z9.s\n"
+    "whilelt p1.s, x23, %x[n_channels]\n"
+    "incw x21\n"
+    "fmla z28.s, p3/M, z0.s, z10.s\n"
+    "fmla z29.s, p3/M, z1.s, z12.s\n"
+    "ld1w { z18.s }, p2/Z, [x6, x13, LSL #2]\n"
+    "incw x23\n"
+    "fmla z28.s, p3/M, z1.s, z11.s\n"
+    "fmla z29.s, p3/M, z2.s, z13.s\n"
+    "ld1w { z27.s }, p2/Z, [x6, x15, LSL #2]\n"
+    "mov p0.b, p2.b\n"
+    "fmla z28.s, p3/M, z3.s, z14.s\n"
+    "fmla z29.s, p3/M, z0.s, z16.s\n"
+    "ld1w { z17.s }, p2/Z, [x6, x8, LSL #2]\n"
+    "addvl x5, x5, #1\n"
+    "fmla z28.s, p3/M, z4.s, z15.s\n"
+    "fmla z29.s, p3/M, z4.s, z27.s\n"
+    "ld1w { z25.s }, p2/Z, [x16]\n"
+    "addvl x6, x6, #1\n"
+    "fmla z28.s, p3/M, z2.s, z16.s\n"
+    "fmla z29.s, p3/M, z5.s, z18.s\n"
+    "ld1w { z12.s }, p2/Z, [x7]\n"
+    "incw x20\n"
+    "movprfx z30, z22\n fmla z30.s, p3/M, z2.s, z9.s\n"
+    "movprfx z31, z22\n fmla z31.s, p3/M, z0.s, z9.s\n"
+    "ld1w { z18.s }, p2/Z, [x7, x15, LSL #2]\n"
+    "fmla z28.s, p3/M, z5.s, z17.s\n"
+    "fmla z29.s, p3/M, z3.s, z17.s\n"
+    "ld1w { z16.s }, p2/Z, [x16, x15, LSL #2]\n"
+    "fmla z30.s, p3/M, z3.s, z25.s\n"
+    "fmla z31.s, p3/M, z4.s, z16.s\n"
+    "ld1w { z10.s }, p2/Z, [x16, x4, LSL #2]\n"
+    "fmla z30.s, p3/M, z0.s, z12.s\n"
+    "fmla z31.s, p3/M, z1.s, z18.s\n"
+    "ld1w { z16.s }, p2/Z, [x16, x13, LSL #2]\n"
+    "fmla z30.s, p3/M, z4.s, z10.s\n"
+    "fmla z31.s, p3/M, z5.s, z16.s\n"
+    "ld1w { z16.s }, p2/Z, [x7, x4, LSL #2]\n"
+    "fmla z28.s, p3/M, z6.s, z12.s\n"
+    "ld1w { z22.s }, p2/Z, [x7, x13, LSL #2]\n"
+    "fmla z30.s, p3/M, z1.s, z16.s\n"
+    "addvl x7, x7, #1\n"
+    "fmla z31.s, p3/M, z2.s, z22.s\n"
+    "fmla z28.s, p3/M, z7.s, z16.s\n"
+    "ld1w { z16.s }, p2/Z, [x14]\n"
+    "ld1w { z17.s }, p2/Z, [x16, x8, LSL #2]\n"
+    "fmla z30.s, p3/M, z6.s, z16.s\n"
+    "fmla z31.s, p3/M, z3.s, z17.s\n"
+    "addvl x16, x16, #1\n"
+    "ld1w { z16.s }, p2/Z, [x14, x4, LSL #2]\n"
+    "fmla z30.s, p3/M, z7.s, z16.s\n"
+    "fmla z29.s, p3/M, z7.s, z18.s\n"
+    "ld1w { z16.s }, p2/Z, [x14, x15, LSL #2]\n"
+    "fmla z31.s, p3/M, z7.s, z16.s\n"
+    "fmla z30.s, p3/M, z5.s, z17.s\n"
+    "ld1w { z17.s }, p2/Z, [x14, x8, LSL #2]\n"
+    "fmla z31.s, p3/M, z6.s, z17.s\n"
+    "fmla z29.s, p3/M, z8.s, z22.s\n"
+    "ld1w { z16.s }, p2/Z, [x14, x13, LSL #2]\n"
+    "fmla z30.s, p3/M, z8.s, z17.s\n"
+    "fmla z31.s, p3/M, z8.s, z16.s\n"
+    "whilelt p2.s, x21, %x[n_channels]\n"
+    "ld1w { z22.s }, p3/Z, [x17]\n"
+    "addvl x17, x17, #1\n"
+    "cmp x23, %x[n_channels]\n"
+    // Activation: clamp all four accumulators to [min, max].
+    ".inst 0xc1b8cb5c // fclamp { z28.s-z31.s }, z26.s, z24.s\n"
+    ".inst 0xa040c220 // ld1w { z0.s-z3.s }, pn8.b/Z, [x17]\n"
+    "addvl x17, x17, #4\n"
+    "addvl x14, x14, #1\n"
+    "st1w { z28.s }, p0, [x24]\n"
+    ".inst 0xa040c224 // ld1w { z4.s-z7.s }, pn8.b/Z, [x17]\n"
+    "addvl x17, x17, #4\n"
+    "st1w { z29.s }, p0, [x24, x25, LSL #2]\n"
+    "addvl x24, x24, #1\n"
+    "ld1w { z9.s }, p1/Z, [x7, x8, LSL #2]\n"
+    "st1w { z30.s }, p0, [x22]\n"
+    "ld1w { z10.s }, p1/Z, [x5]\n"
+    "st1w { z31.s }, p0, [x22, x25, LSL #2]\n"
+    "addvl x22, x22, #1\n"
+    "ld1w { z11.s }, p1/Z, [x5, x4, LSL #2]\n"
+    "ld1w { z12.s }, p1/Z, [x5, x15, LSL #2]\n"
+    "ld1w { z13.s }, p1/Z, [x5, x13, LSL #2]\n"
+    "ld1w { z14.s }, p1/Z, [x6]\n"
+    "ld1w { z15.s }, p1/Z, [x6, x4, LSL #2]\n"
+    "ld1w { z16.s }, p1/Z, [x5, x8, LSL #2]\n"
+    "ld1w { z8.s }, p3/Z, [x17]\n"
+    "addvl x17, x17, #1\n"
+    "blt 3b\n"
+    "4:" // Tile loop: Channel tail
+    // Tail: same MLA schedule as the channel loop for the final (possibly
+    // partial) vector, interleaved with advancing tile_i/tile_j for the next
+    // trip round the tile loop (label 1).
+    "movprfx z28, z22\n fmla z28.s, p3/M, z8.s, z9.s\n"
+    "movprfx z29, z22\n fmla z29.s, p3/M, z6.s, z9.s\n"
+    "ldr x3, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+    "add x3, x3, #0x1\n"
+    "fmla z28.s, p3/M, z0.s, z10.s\n"
+    "fmla z29.s, p3/M, z1.s, z12.s\n"
+    "ld1w { z18.s }, p2/Z, [x6, x13, LSL #2]\n"
+    "ldr x2, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+    "fmla z28.s, p3/M, z1.s, z11.s\n"
+    "fmla z29.s, p3/M, z2.s, z13.s\n"
+    "ld1w { z17.s }, p2/Z, [x6, x15, LSL #2]\n"
+    "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+    "fmla z28.s, p3/M, z3.s, z14.s\n"
+    "fmla z29.s, p3/M, z0.s, z16.s\n"
+    "ld1w { z20.s }, p2/Z, [x6, x8, LSL #2]\n"
+    "ldr x21, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
+    "fmla z28.s, p3/M, z4.s, z15.s\n"
+    "fmla z29.s, p3/M, z4.s, z17.s\n"
+    "ld1w { z17.s }, p2/Z, [x16]\n"
+    "cmp x3, x20\n"
+    "fmla z28.s, p3/M, z2.s, z16.s\n"
+    "fmla z29.s, p3/M, z5.s, z18.s\n"
+    "ld1w { z18.s }, p2/Z, [x7]\n"
+    "add x20, x2, #0x1\n"
+    "movprfx z30, z22\n fmla z30.s, p3/M, z2.s, z9.s\n"
+    "movprfx z31, z22\n fmla z31.s, p3/M, z0.s, z9.s\n"
+    "ld1w { z19.s }, p2/Z, [x7, x15, LSL #2]\n"
+    "csel x2, x2, x20, LT\n"
+    "fmla z28.s, p3/M, z5.s, z20.s\n"
+    "fmla z29.s, p3/M, z3.s, z20.s\n"
+    "ld1w { z16.s }, p2/Z, [x16, x15, LSL #2]\n"
+    "mov p0.b, p2.b\n"
+    "fmla z30.s, p3/M, z3.s, z17.s\n"
+    "fmla z31.s, p3/M, z4.s, z16.s\n"
+    "ld1w { z17.s }, p2/Z, [x16, x4, LSL #2]\n"
+    "csel x3, x3, XZR, LT\n"
+    "fmla z30.s, p3/M, z0.s, z18.s\n"
+    "fmla z31.s, p3/M, z1.s, z19.s\n"
+    "ld1w { z16.s }, p2/Z, [x16, x13, LSL #2]\n"
+    "cmp x2, x21\n"
+    "fmla z30.s, p3/M, z4.s, z17.s\n"
+    "fmla z31.s, p3/M, z5.s, z16.s\n"
+    "ld1w { z16.s }, p2/Z, [x7, x4, LSL #2]\n"
+    "fmla z28.s, p3/M, z6.s, z18.s\n"
+    "ld1w { z18.s }, p2/Z, [x7, x13, LSL #2]\n"
+    "fmla z30.s, p3/M, z1.s, z16.s\n"
+    "fmla z31.s, p3/M, z2.s, z18.s\n"
+    "fmla z28.s, p3/M, z7.s, z16.s\n"
+    "ld1w { z16.s }, p2/Z, [x14]\n"
+    "ld1w { z17.s }, p2/Z, [x16, x8, LSL #2]\n"
+    "fmla z30.s, p3/M, z6.s, z16.s\n"
+    "fmla z31.s, p3/M, z3.s, z17.s\n"
+    "ld1w { z16.s }, p2/Z, [x14, x4, LSL #2]\n"
+    "fmla z30.s, p3/M, z7.s, z16.s\n"
+    "fmla z29.s, p3/M, z7.s, z19.s\n"
+    "ld1w { z16.s }, p2/Z, [x14, x15, LSL #2]\n"
+    "fmla z31.s, p3/M, z7.s, z16.s\n"
+    "fmla z30.s, p3/M, z5.s, z17.s\n"
+    "ld1w { z17.s }, p2/Z, [x14, x8, LSL #2]\n"
+    "fmla z31.s, p3/M, z6.s, z17.s\n"
+    "fmla z29.s, p3/M, z8.s, z18.s\n"
+    "ld1w { z16.s }, p2/Z, [x14, x13, LSL #2]\n"
+    "fmla z30.s, p3/M, z8.s, z17.s\n"
+    "fmla z31.s, p3/M, z8.s, z16.s\n"
+    // Activation clamp, store the 2x2 tile, then loop while tiles remain.
+    ".inst 0xc1b8cb5c // fclamp { z28.s-z31.s }, z26.s, z24.s\n"
+    "st1w { z28.s }, p0, [x24]\n"
+    "st1w { z29.s }, p0, [x24, x25, LSL #2]\n"
+    "st1w { z30.s }, p0, [x22]\n"
+    "st1w { z31.s }, p0, [x22, x25, LSL #2]\n"
+    "blt 1b\n"
+    ".inst 0xd503467f // SMSTOP\n"
+    :
+    : [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct)
+    : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+  );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME2)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp
new file mode 100644
index 0000000000..dc7a40ff54
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp
@@ -0,0 +1,318 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(ARM_COMPUTE_ENABLE_SME2)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sme2_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl(
+ const float *const *const input_ptrs,
+ float *const *const outptrs,
+ const void *params,
+ unsigned int n_channels,
+ const float activation_min,
+ const float activation_max
+)
+{
+ struct Args
+ {
+ float *const *outptrs;
+ const void *params;
+ const float min, max;
+ const float *inptrs[25];
+
+ Args(
+ const float *const *const input_ptrs,
+ float *const *const outptrs,
+ const void *const params,
+ const float min,
+ const float max
+ ) : outptrs(outptrs), params(params), min(min), max(max)
+ {
+ inptrs[0] = input_ptrs[12];
+ inptrs[1] = input_ptrs[0];
+ inptrs[2] = input_ptrs[1];
+ inptrs[3] = input_ptrs[3];
+ inptrs[4] = input_ptrs[4];
+ inptrs[5] = input_ptrs[5];
+ inptrs[6] = input_ptrs[6];
+ inptrs[7] = input_ptrs[2];
+ inptrs[8] = input_ptrs[8];
+ inptrs[9] = input_ptrs[9];
+ inptrs[10] = input_ptrs[7];
+ inptrs[11] = input_ptrs[15];
+ inptrs[12] = input_ptrs[10];
+ inptrs[13] = input_ptrs[16];
+ inptrs[14] = input_ptrs[11];
+ inptrs[15] = input_ptrs[18];
+ inptrs[16] = input_ptrs[13];
+ inptrs[17] = input_ptrs[19];
+ inptrs[18] = input_ptrs[20];
+ inptrs[19] = input_ptrs[14];
+ inptrs[20] = input_ptrs[21];
+ inptrs[21] = input_ptrs[17];
+ inptrs[22] = input_ptrs[23];
+ inptrs[23] = input_ptrs[22];
+ inptrs[24] = input_ptrs[24];
+
+ }
+ };
+
+ Args params_struct(input_ptrs, outptrs, params,
+ activation_min, activation_max);
+
+ __asm__ __volatile__(
+ "ldr x20, [%x[params_struct], %[offsetof_args_outptrs]]\n"
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "add x16, %x[params_struct], %[offsetof_Args_inptrs]\n"
+ "ptrue p3.b\n"
+ "ldr x15, [%x[params_struct], %[offsetof_args_params]]\n"
+ ".inst 0x25207810 // ptrue pn8.b\n"
+ "ld1w { z26.s }, p3/Z, [x15]\n"
+ "addvl x15, x15, #1\n"
+ "ldp x14, x13, [x20, #0x0]\n"
+ "cntw x12\n"
+ ".inst 0xa040c1e0 // ld1w { z0.s-z3.s }, pn8.b/Z, [x15]\n"
+ "addvl x15, x15, #4\n"
+ "ldp x11, x10, [x20, #0x10]\n"
+ "mov x9, #0x0\n"
+ "whilelt p2.s, XZR, %x[n_channels]\n"
+ ".inst 0xa040c1e4 // ld1w { z4.s-z7.s }, pn8.b/Z, [x15]\n"
+ "ldp x28, x26, [x16, #0x0]\n"
+ "addvl x15, x15, #4\n"
+ "cmp x12, %x[n_channels]\n"
+ "ld1rw { z25.s }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+ "ldp x25, x24, [x16, #0x10]\n"
+ "ld1rw { z24.s }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+ "sub x27, XZR, x12\n"
+ "ldp x23, x22, [x16, #0x20]\n"
+ "ld1w { z8.s }, p3/Z, [x15]\n"
+ "addvl x15, x15, #1\n"
+ "ldp x21, x20, [x16, #0x30]\n"
+ "ld1w { z9.s }, p2/Z, [x28, x9, LSL #2]\n"
+ "ld1w { z10.s }, p2/Z, [x26, x9, LSL #2]\n"
+ "ld1w { z11.s }, p2/Z, [x25, x9, LSL #2]\n"
+ "ld1w { z12.s }, p2/Z, [x24, x9, LSL #2]\n"
+ "ld1w { z13.s }, p2/Z, [x23, x9, LSL #2]\n"
+ "ld1w { z14.s }, p2/Z, [x22, x9, LSL #2]\n"
+ "ld1w { z15.s }, p2/Z, [x21, x9, LSL #2]\n"
+ "ld1w { z16.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "bge 2f\n"
+ "1:" // Channel loop
+ "movprfx z28, z26\n fmla z28.s, p3/M, z8.s, z9.s\n"
+ "movprfx z29, z26\n fmla z29.s, p3/M, z6.s, z9.s\n"
+ "ldr x21, [x16, #0x40]\n"
+ "whilelt p1.s, x12, %x[n_channels]\n"
+ "fmla z28.s, p3/M, z0.s, z10.s\n"
+ "fmla z29.s, p3/M, z1.s, z12.s\n"
+ "ldr x20, [x16, #0x48]\n"
+ "ld1w { z18.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "fmla z28.s, p3/M, z1.s, z11.s\n"
+ "fmla z29.s, p3/M, z2.s, z13.s\n"
+ "ld1w { z22.s }, p2/Z, [x21, x9, LSL #2]\n"
+ "ldr x20, [x16, #0x50]\n"
+ "fmla z28.s, p3/M, z3.s, z14.s\n"
+ "fmla z29.s, p3/M, z0.s, z16.s\n"
+ "ld1w { z17.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "ldr x20, [x16, #0x58]\n"
+ "fmla z28.s, p3/M, z4.s, z15.s\n"
+ "fmla z29.s, p3/M, z4.s, z22.s\n"
+ "ldr x21, [x16, #0x78]\n"
+ "ld1w { z23.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "fmla z28.s, p3/M, z2.s, z16.s\n"
+ "fmla z29.s, p3/M, z5.s, z18.s\n"
+ "ldr x20, [x16, #0x60]\n"
+ "ld1w { z13.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "movprfx z30, z26\n fmla z30.s, p3/M, z2.s, z9.s\n"
+ "movprfx z31, z26\n fmla z31.s, p3/M, z0.s, z9.s\n"
+ "ldr x20, [x16, #0x80]\n"
+ "ld1w { z18.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "fmla z28.s, p3/M, z5.s, z17.s\n"
+ "fmla z29.s, p3/M, z3.s, z17.s\n"
+ "ld1w { z16.s }, p2/Z, [x21, x9, LSL #2]\n"
+ "ldr x21, [x16, #0x68]\n"
+ "fmla z30.s, p3/M, z3.s, z23.s\n"
+ "fmla z31.s, p3/M, z4.s, z16.s\n"
+ "ldr x20, [x16, #0x88]\n"
+ "ld1w { z17.s }, p2/Z, [x21, x9, LSL #2]\n"
+ "fmla z30.s, p3/M, z0.s, z13.s\n"
+ "fmla z31.s, p3/M, z1.s, z18.s\n"
+ "ld1w { z16.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "ldr x21, [x16, #0x70]\n"
+ "ldr x20, [x16, #0x98]\n"
+ "fmla z30.s, p3/M, z4.s, z17.s\n"
+ "fmla z31.s, p3/M, z5.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x21, x9, LSL #2]\n"
+ "fmla z28.s, p3/M, z6.s, z13.s\n"
+ "ld1w { z4.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "ldr x21, [x16, #0x90]\n"
+ "fmla z30.s, p3/M, z1.s, z16.s\n"
+ "ldr x20, [x16, #0xa8]\n"
+ "fmla z31.s, p3/M, z2.s, z4.s\n"
+ "fmla z28.s, p3/M, z7.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x21, x9, LSL #2]\n"
+ "ld1w { z17.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "ldr x21, [x16, #0xa0]\n"
+ "fmla z30.s, p3/M, z6.s, z16.s\n"
+ "fmla z31.s, p3/M, z3.s, z17.s\n"
+ "ldr x20, [x16, #0xb0]\n"
+ "ld1w { z16.s }, p2/Z, [x21, x9, LSL #2]\n"
+ "fmla z30.s, p3/M, z7.s, z16.s\n"
+ "fmla z29.s, p3/M, z7.s, z18.s\n"
+ "ld1w { z16.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "ldr x20, [x16, #0xb8]\n"
+ "fmla z31.s, p3/M, z7.s, z16.s\n"
+ "fmla z30.s, p3/M, z5.s, z17.s\n"
+ "ld1w { z17.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "ldr x20, [x16, #0xc0]\n"
+ "fmla z31.s, p3/M, z6.s, z17.s\n"
+ "fmla z29.s, p3/M, z8.s, z4.s\n"
+ "ld1w { z16.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "ldp x20, x26, [x16, #0x0]\n"
+ "fmla z30.s, p3/M, z8.s, z17.s\n"
+ "fmla z31.s, p3/M, z8.s, z16.s\n"
+ "ldp x25, x24, [x16, #0x10]\n"
+ "ld1w { z26.s }, p3/Z, [x15]\n"
+ "addvl x15, x15, #1\n"
+ "incw x9\n"
+ "ldp x23, x22, [x16, #0x20]\n"
+ "ld1w { z9.s }, p1/Z, [x20, x12, LSL #2]\n"
+ "incw x27\n"
+ "mov p0.b, p2.b\n"
+ "ldp x21, x20, [x16, #0x30]\n"
+ "ld1w { z10.s }, p1/Z, [x26, x12, LSL #2]\n"
+ "whilelt p2.s, x9, %x[n_channels]\n"
+ ".inst 0xc1b8cb3c // fclamp { z28.s-z31.s }, z25.s, z24.s\n"
+ "ld1w { z11.s }, p1/Z, [x25, x12, LSL #2]\n"
+ "st1w { z28.s }, p0, [x14, x27, LSL #2]\n"
+ "ld1w { z12.s }, p1/Z, [x24, x12, LSL #2]\n"
+ "st1w { z29.s }, p0, [x13, x27, LSL #2]\n"
+ "ld1w { z13.s }, p1/Z, [x23, x12, LSL #2]\n"
+ "st1w { z30.s }, p0, [x11, x27, LSL #2]\n"
+ "ld1w { z14.s }, p1/Z, [x22, x12, LSL #2]\n"
+ "st1w { z31.s }, p0, [x10, x27, LSL #2]\n"
+ "ld1w { z15.s }, p1/Z, [x21, x12, LSL #2]\n"
+ "ld1w { z16.s }, p1/Z, [x20, x12, LSL #2]\n"
+ "incw x12\n"
+ "cmp x12, %x[n_channels]\n"
+ ".inst 0xa040c1e0 // ld1w { z0.s-z3.s }, pn8.b/Z, [x15]\n"
+ "addvl x15, x15, #4\n"
+ ".inst 0xa040c1e4 // ld1w { z4.s-z7.s }, pn8.b/Z, [x15]\n"
+ "addvl x15, x15, #4\n"
+ "ld1w { z8.s }, p3/Z, [x15]\n"
+ "addvl x15, x15, #1\n"
+ "blt 1b\n"
+ "2:" // Channel tail
+ "movprfx z28, z26\n fmla z28.s, p3/M, z8.s, z9.s\n"
+ "movprfx z29, z26\n fmla z29.s, p3/M, z6.s, z9.s\n"
+ "ldr x21, [x16, #0x40]\n"
+ "incw x27\n"
+ "fmla z28.s, p3/M, z0.s, z10.s\n"
+ "fmla z29.s, p3/M, z1.s, z12.s\n"
+ "ldr x20, [x16, #0x48]\n"
+ "ld1w { z18.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "fmla z28.s, p3/M, z1.s, z11.s\n"
+ "fmla z29.s, p3/M, z2.s, z13.s\n"
+ "ld1w { z17.s }, p2/Z, [x21, x9, LSL #2]\n"
+ "ldr x20, [x16, #0x50]\n"
+ "fmla z28.s, p3/M, z3.s, z14.s\n"
+ "fmla z29.s, p3/M, z0.s, z16.s\n"
+ "ld1w { z20.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "ldr x20, [x16, #0x58]\n"
+ "fmla z28.s, p3/M, z4.s, z15.s\n"
+ "fmla z29.s, p3/M, z4.s, z17.s\n"
+ "ldr x21, [x16, #0x78]\n"
+ "ld1w { z17.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "fmla z28.s, p3/M, z2.s, z16.s\n"
+ "fmla z29.s, p3/M, z5.s, z18.s\n"
+ "ldr x20, [x16, #0x60]\n"
+ "ld1w { z18.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "movprfx z30, z26\n fmla z30.s, p3/M, z2.s, z9.s\n"
+ "movprfx z31, z26\n fmla z31.s, p3/M, z0.s, z9.s\n"
+ "ldr x20, [x16, #0x80]\n"
+ "ld1w { z19.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "fmla z28.s, p3/M, z5.s, z20.s\n"
+ "fmla z29.s, p3/M, z3.s, z20.s\n"
+ "ld1w { z16.s }, p2/Z, [x21, x9, LSL #2]\n"
+ "ldr x21, [x16, #0x68]\n"
+ "fmla z30.s, p3/M, z3.s, z17.s\n"
+ "fmla z31.s, p3/M, z4.s, z16.s\n"
+ "ldr x20, [x16, #0x88]\n"
+ "ld1w { z17.s }, p2/Z, [x21, x9, LSL #2]\n"
+ "fmla z30.s, p3/M, z0.s, z18.s\n"
+ "fmla z31.s, p3/M, z1.s, z19.s\n"
+ "ld1w { z16.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "ldr x21, [x16, #0x70]\n"
+ "ldr x20, [x16, #0x98]\n"
+ "fmla z30.s, p3/M, z4.s, z17.s\n"
+ "fmla z31.s, p3/M, z5.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x21, x9, LSL #2]\n"
+ "fmla z28.s, p3/M, z6.s, z18.s\n"
+ "ld1w { z18.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "ldr x21, [x16, #0x90]\n"
+ "fmla z30.s, p3/M, z1.s, z16.s\n"
+ "ldr x20, [x16, #0xa8]\n"
+ "fmla z31.s, p3/M, z2.s, z18.s\n"
+ "fmla z28.s, p3/M, z7.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x21, x9, LSL #2]\n"
+ "ld1w { z17.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "ldr x21, [x16, #0xa0]\n"
+ "fmla z30.s, p3/M, z6.s, z16.s\n"
+ "fmla z31.s, p3/M, z3.s, z17.s\n"
+ "ldr x20, [x16, #0xb0]\n"
+ "ld1w { z16.s }, p2/Z, [x21, x9, LSL #2]\n"
+ "fmla z30.s, p3/M, z7.s, z16.s\n"
+ "fmla z29.s, p3/M, z7.s, z19.s\n"
+ "ld1w { z16.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "ldr x20, [x16, #0xb8]\n"
+ "fmla z31.s, p3/M, z7.s, z16.s\n"
+ "fmla z30.s, p3/M, z5.s, z17.s\n"
+ "ld1w { z17.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "ldr x20, [x16, #0xc0]\n"
+ "fmla z31.s, p3/M, z6.s, z17.s\n"
+ "fmla z29.s, p3/M, z8.s, z18.s\n"
+ "ld1w { z16.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "fmla z30.s, p3/M, z8.s, z17.s\n"
+ "fmla z31.s, p3/M, z8.s, z16.s\n"
+ "mov p0.b, p2.b\n"
+ ".inst 0xc1b8cb3c // fclamp { z28.s-z31.s }, z25.s, z24.s\n"
+ "st1w { z28.s }, p0, [x14, x27, LSL #2]\n"
+ "st1w { z29.s }, p0, [x13, x27, LSL #2]\n"
+ "st1w { z30.s }, p0, [x11, x27, LSL #2]\n"
+ "st1w { z31.s }, p0, [x10, x27, LSL #2]\n"
+ ".inst 0xd503467f // SMSTOP\n"
+ :
+ : [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME2)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_planar_3x3_s1_4rows_mla_za.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_planar_3x3_s1_4rows_mla_za.hpp
new file mode 100644
index 0000000000..061b0a1e2e
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_planar_3x3_s1_4rows_mla_za.hpp
@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_conv/depthwise/depthwise_planar.hpp"
+
+namespace arm_conv {
+namespace depthwise {
+
+void sme2_fp32_planar_3x3_s1_4rows_mla_za_impl(
+ const float *inptr,
+ size_t ld_in_row,
+ size_t ld_in_col,
+ size_t ld_in_vl,
+ unsigned int pad_top,
+ unsigned int valid_input_rows,
+ unsigned int pad_left,
+ unsigned int valid_input_cols,
+ const float *weights,
+ const float *bias,
+ float **outptrs,
+ const size_t *outlds,
+ const size_t *outvllds,
+ unsigned int output_cols,
+ unsigned int start_channel,
+ unsigned int valid_channels,
+ float act_min,
+ float act_max
+);
+
+class sme2_fp32_planar_3x3_s1_4rows_mla_za : public PlanarStrategy<float, float>
+{
+ using Parent = PlanarStrategy<float, float>;
+
+ public:
+ using return_type = float;
+ constexpr static auto output_rows = 4u;
+ constexpr static auto kernel_rows = 3u, kernel_cols = 3u;
+ constexpr static auto stride_rows = 1u, stride_cols = 1u;
+ constexpr static auto vl_type = arm_gemm::VLType::SME;
+
+ sme2_fp32_planar_3x3_s1_4rows_mla_za(const CPUInfo *)
+ : Parent(kernel_rows, kernel_cols, stride_rows, stride_cols, output_rows, vl_type)
+ {
+ }
+
+ typename Parent::KernelType get_kernel(void) const override
+ {
+ return sme2_fp32_planar_3x3_s1_4rows_mla_za_impl;
+ }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_planar_3x3_s1_4rows_mla_za/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_planar_3x3_s1_4rows_mla_za/generic.cpp
new file mode 100644
index 0000000000..a385893146
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_planar_3x3_s1_4rows_mla_za/generic.cpp
@@ -0,0 +1,455 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if defined(ARM_COMPUTE_ENABLE_SME2)
+
+#include <algorithm>
+#include <cstddef>
+
+namespace arm_conv {
+namespace depthwise {
+
+void sme2_fp32_planar_3x3_s1_4rows_mla_za_impl(
+ const float *inptr,
+ size_t ld_in_row,
+ size_t ld_in_col,
+ size_t ld_in_vl,
+ unsigned int pad_top,
+ unsigned int valid_input_rows,
+ unsigned int pad_left,
+ unsigned int valid_input_cols,
+ const float *weights,
+ const float *bias,
+ float **outptrs,
+ const size_t *outlds,
+ const size_t *outvllds,
+ unsigned int output_cols,
+ unsigned int start_channel,
+ unsigned int valid_channels,
+ float act_min,
+ float act_max
+)
+{
+ struct Args
+ {
+ const float *inptr;
+ size_t ld_in_vl;
+ long unsigned int pad_top, pad_bottom, pad_left;
+ const float *weights;
+ const float *bias;
+ long unsigned int input_cols, output_cols;
+ float **outptrs;
+ const size_t *ld_out_cols;
+ const size_t *ld_out_vls;
+ long unsigned int current_channel, n_channels;
+ float clamp_min, clamp_max;
+ };
+
+ Args args = { inptr, ld_in_vl, pad_top, 6u - std::min(6u, pad_top + valid_input_rows), pad_left, weights, bias, valid_input_cols, output_cols, outptrs, outlds, outvllds, start_channel, valid_channels, act_min, act_max };
+
+ __asm__ __volatile__(
+ "ldr x7, [%x[args], %[offsetof_Args_pad_bottom]]\n"
+ "mov x20, #0x6\n"
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "sub x20, x20, x7\n"
+ "ldr x17, [%x[args], %[offsetof_Args_pad_top]]\n"
+ "ptrue p2.b\n"
+ ".inst 0x25207812 // ptrue pn10.b\n"
+ "ld1rw { z2.s }, p2/Z, [%x[args], %[offsetof_Args_clamp_min]]\n"
+ "ldr x16, [%x[args], %[offsetof_Args_n_channels]]\n"
+ "whilelt p1.s, XZR, x16\n"
+ "whilelt p9.s, XZR, x20\n"
+ "ld1rw { z24.s }, p2/Z, [%x[args], %[offsetof_Args_clamp_max]]\n"
+ "whilelt p8.s, XZR, x17\n"
+ "eor p8.b, p2/Z, p8.b, p9.b\n"
+ "ldr x15, [%x[args], %[offsetof_Args_current_channel]]\n"
+ "1:" // Channel loop
+ "ldr x20, [%x[args], %[offsetof_Args_bias]]\n"
+ "fmov z20.s, #0x0\n"
+ "cbz x20, 2f\n"
+ "ld1w { z20.s }, p1/Z, [x20, x15, LSL #2]\n"
+ "2:" // Load bias: Done
+ "ldr x14, [%x[args], %[offsetof_Args_input_cols]]\n"
+ "sub x20, x14, #0x1\n"
+ "orr x24, x20, %x[ld_in_col], LSL #18\n"
+ "mov z21.d, z20.d\n"
+ "ldr x23, [%x[args], %[offsetof_Args_weights]]\n"
+ ".inst 0xa0404ae6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x23]\n"
+ "orr x24, x16, x24, LSL #20\n"
+ "mov x22, #0x6\n"
+ "ldr x13, [%x[args], %[offsetof_Args_inptr]]\n"
+ "ld1w { z10.s }, p2/Z, [x23, #2, MUL VL]\n"
+ "addvl x23, x23, #3\n"
+ "add x21, x17, x7\n"
+ ".inst 0xa1404ae0 // ld1w { z0.s, z8.s }, pn10.b/Z, [x23]\n"
+ "lsl x20, %x[ld_in_row], #0x2\n"
+ "mov z22.d, z20.d\n"
+ "mov z23.d, z20.d\n"
+ "ld1w { z9.s }, p2/Z, [x23, #2, MUL VL]\n"
+ "addvl x23, x23, #3\n"
+ "mov x8, #0x0\n"
+ "ldr x11, [%x[args], %[offsetof_Args_output_cols]]\n"
+ ".inst 0xa0404ae4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x23]\n"
+ "lsl x24, x24, #0x2\n"
+ "sub x22, x22, x21\n"
+ "ld1w { z1.s }, p2/Z, [x23, #2, MUL VL]\n"
+ "madd x20, x20, x17, x13\n"
+ "3:" // Issue prefetches
+ "subs x22, x22, #0x1\n"
+ ".inst 0xf8b84a9c // rprfm pldstrm, x24, [x20]\n"
+ "add x20, x20, %x[ld_in_col], LSL #2\n"
+ "bgt 3b\n"
+ "ldr x22, [%x[args], %[offsetof_Args_outptrs]]\n"
+ "lsl x20, %x[ld_in_row], #0x2\n"
+ "msub x13, x17, x20, x13\n"
+ ".inst 0xc0040e80 // mova za.d[x8, #0], { z20.d-z23.d }\n"
+ "ldr x20, [%x[args], %[offsetof_Args_ld_out_cols]]\n"
+ ".inst 0xc0040e81 // mova za.d[x8, #1], { z20.d-z23.d }\n"
+ "mov x10, #0x2\n"
+ "ldp x9, x28, [x22], #0x10\n"
+ ".inst 0xc0040e82 // mova za.d[x8, #2], { z20.d-z23.d }\n"
+ "ldp x27, x26, [x20], #0x10\n"
+ "ldr x21, [%x[args], %[offsetof_Args_pad_left]]\n"
+ "ldp x25, x24, [x22], #0x10\n"
+ "ldp x23, x22, [x20], #0x10\n"
+ "cbz x21, 5f\n"
+ "cmp x21, x10\n"
+ "csel x20, x21, x10, LT\n"
+ "sub x21, x21, x20\n"
+ "sub x10, x10, x20\n"
+ "cbz x21, 5f\n"
+ ".inst 0xc0060c0c // mova { z12.d-z15.d }, za.d[x8, #0]\n"
+ "sub x11, x11, x21\n"
+ ".inst 0xc1b8c84c // fclamp { z12.s-z15.s }, z2.s, z24.s\n"
+ "4:" // Left padding
+ "subs x21, x21, #0x1\n"
+ "st1w { z12.s }, p1, [x9]\n"
+ "add x9, x9, x27, LSL #2\n"
+ "st1w { z13.s }, p1, [x28]\n"
+ "add x28, x28, x26, LSL #2\n"
+ "st1w { z14.s }, p1, [x25]\n"
+ "add x25, x25, x23, LSL #2\n"
+ "st1w { z15.s }, p1, [x24]\n"
+ "add x24, x24, x22, LSL #2\n"
+ "bgt 4b\n"
+ "5:" // Left padding: End
+ "adds XZR, x17, x7\n"
+ "bne 10f\n"
+ "cbz x10, 8f\n"
+ "cmp x10, #0x1\n"
+ "sub x14, x14, x10\n"
+ "beq 7f\n"
+ "6:" // Unpadded: 2 priming loads
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
+ "ld1w { z14.s }, p1/Z, [x13]\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ "ld1w { z15.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z16.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z17.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc13619c0 // fmla za.s[x8, 0], { z14.s-z17.s }, z6.s\n"
+ "ld1w { z18.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc13019e0 // fmla za.s[x8, 0], { z15.s-z18.s }, z0.s\n"
+ "ld1w { z19.s }, p1/Z, [x20]\n"
+ ".inst 0xc1341a00 // fmla za.s[x8, 0], { z16.s-z19.s }, z4.s\n"
+ "7:" // Unpadded: 1 priming loads
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
+ "ld1w { z13.s }, p1/Z, [x13]\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ "ld1w { z14.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z15.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z16.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc13719a0 // fmla za.s[x8, 0], { z13.s-z16.s }, z7.s\n"
+ ".inst 0xc13619a1 // fmla za.s[x8, 1], { z13.s-z16.s }, z6.s\n"
+ "ld1w { z17.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc13819c0 // fmla za.s[x8, 0], { z14.s-z17.s }, z8.s\n"
+ "ld1w { z18.s }, p1/Z, [x20]\n"
+ ".inst 0xc13019c1 // fmla za.s[x8, 1], { z14.s-z17.s }, z0.s\n"
+ ".inst 0xc13519e0 // fmla za.s[x8, 0], { z15.s-z18.s }, z5.s\n"
+ ".inst 0xc13419e1 // fmla za.s[x8, 1], { z15.s-z18.s }, z4.s\n"
+ "8:" // Unpadded: 0 priming loads
+ "cbz x14, 16f\n"
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
+ "ld1w { z25.s }, p1/Z, [x13]\n"
+ "sub x14, x14, #0x1\n"
+ "ld1w { z26.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "sub x11, x11, #0x1\n"
+ "ld1w { z27.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "cmp x14, x11\n"
+ "ld1w { z28.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "csel x21, x14, x11, LT\n"
+ "ld1w { z29.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ "ld1w { z30.s }, p1/Z, [x20]\n"
+ "sub x11, x11, x21\n"
+ "cbz x21, 15f\n"
+ "9:" // Unpadded: Main loop
+ ".inst 0xc13a1b20 // fmla za.s[x8, 0], { z25.s-z28.s }, z10.s\n"
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
+ "subs x21, x21, #0x1\n"
+ ".inst 0xc1391b40 // fmla za.s[x8, 0], { z26.s-z29.s }, z9.s\n"
+ ".inst 0xc1371b21 // fmla za.s[x8, 1], { z25.s-z28.s }, z7.s\n"
+ ".inst 0xc1361b22 // fmla za.s[x8, 2], { z25.s-z28.s }, z6.s\n"
+ "ld1w { z25.s }, p1/Z, [x13]\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ ".inst 0xc1311b60 // fmla za.s[x8, 0], { z27.s-z30.s }, z1.s\n"
+ ".inst 0xc1381b41 // fmla za.s[x8, 1], { z26.s-z29.s }, z8.s\n"
+ ".inst 0xc1301b42 // fmla za.s[x8, 2], { z26.s-z29.s }, z0.s\n"
+ "ld1w { z26.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc0060c0c // mova { z12.d-z15.d }, za.d[x8, #0]\n"
+ ".inst 0xc1b8c84c // fclamp { z12.s-z15.s }, z2.s, z24.s\n"
+ "st1w { z12.s }, p1, [x9]\n"
+ "add x9, x9, x27, LSL #2\n"
+ ".inst 0xc1351b61 // fmla za.s[x8, 1], { z27.s-z30.s }, z5.s\n"
+ "st1w { z13.s }, p1, [x28]\n"
+ "add x28, x28, x26, LSL #2\n"
+ ".inst 0xc1341b62 // fmla za.s[x8, 2], { z27.s-z30.s }, z4.s\n"
+ "ld1w { z27.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "add x8, x8, #0x1\n"
+ "ld1w { z28.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "st1w { z14.s }, p1, [x25]\n"
+ "add x25, x25, x23, LSL #2\n"
+ "ld1w { z29.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "st1w { z15.s }, p1, [x24]\n"
+ "add x24, x24, x22, LSL #2\n"
+ ".inst 0xc0040e82 // mova za.d[x8, #2], { z20.d-z23.d }\n"
+ "ld1w { z30.s }, p1/Z, [x20]\n"
+ "bgt 9b\n"
+ "b 15f\n"
+ "10:" // Padded
+ "cbz x10, 13f\n"
+ "cmp x10, #0x1\n"
+ "sub x14, x14, x10\n"
+ "beq 12f\n"
+ "11:" // Padded: 2 priming loads
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z11.s }, p0/Z, [x13]\n"
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z12.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z13.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1w { z14.s }, p0/Z, [x20]\n"
+ "mov x12, #0x4\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc1361960 // fmla za.s[x8, 0], { z11.s-z14.s }, z6.s\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z15.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc1301980 // fmla za.s[x8, 0], { z12.s-z15.s }, z0.s\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ ".inst 0xc13419a0 // fmla za.s[x8, 0], { z13.s-z16.s }, z4.s\n"
+ "12:" // Padded: 1 priming loads
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z11.s }, p0/Z, [x13]\n"
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z12.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z13.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1w { z14.s }, p0/Z, [x20]\n"
+ "mov x12, #0x4\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc1371960 // fmla za.s[x8, 0], { z11.s-z14.s }, z7.s\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ ".inst 0xc1361961 // fmla za.s[x8, 1], { z11.s-z14.s }, z6.s\n"
+ "ld1w { z15.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ ".inst 0xc1381980 // fmla za.s[x8, 0], { z12.s-z15.s }, z8.s\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ ".inst 0xc1301981 // fmla za.s[x8, 1], { z12.s-z15.s }, z0.s\n"
+ ".inst 0xc13519a0 // fmla za.s[x8, 0], { z13.s-z16.s }, z5.s\n"
+ ".inst 0xc13419a1 // fmla za.s[x8, 1], { z13.s-z16.s }, z4.s\n"
+ "13:" // Padded: 0 priming loads
+ "cbz x14, 16f\n"
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z25.s }, p0/Z, [x13]\n"
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z26.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z27.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1w { z28.s }, p0/Z, [x20]\n"
+ "mov x12, #0x4\n"
+ "sub x14, x14, #0x1\n"
+ "sub x11, x11, #0x1\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "cmp x14, x11\n"
+ "ld1w { z29.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z30.s }, p0/Z, [x20]\n"
+ "csel x21, x14, x11, LT\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ "sub x11, x11, x21\n"
+ "cbz x21, 15f\n"
+ "14:" // Padded: Main loop
+ ".inst 0xc13a1b20 // fmla za.s[x8, 0], { z25.s-z28.s }, z10.s\n"
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ ".inst 0xc1391b40 // fmla za.s[x8, 0], { z26.s-z29.s }, z9.s\n"
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
+ "subs x21, x21, #0x1\n"
+ ".inst 0xc1371b21 // fmla za.s[x8, 1], { z25.s-z28.s }, z7.s\n"
+ ".inst 0xc1361b22 // fmla za.s[x8, 2], { z25.s-z28.s }, z6.s\n"
+ "ld1w { z25.s }, p0/Z, [x13]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ ".inst 0xc1311b60 // fmla za.s[x8, 0], { z27.s-z30.s }, z1.s\n"
+ ".inst 0xc1381b41 // fmla za.s[x8, 1], { z26.s-z29.s }, z8.s\n"
+ ".inst 0xc1301b42 // fmla za.s[x8, 2], { z26.s-z29.s }, z0.s\n"
+ "ld1w { z26.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ ".inst 0xc0060c10 // mova { z16.d-z19.d }, za.d[x8, #0]\n"
+ ".inst 0xc1b8c850 // fclamp { z16.s-z19.s }, z2.s, z24.s\n"
+ "st1w { z16.s }, p1, [x9]\n"
+ "add x9, x9, x27, LSL #2\n"
+ ".inst 0xc1351b61 // fmla za.s[x8, 1], { z27.s-z30.s }, z5.s\n"
+ "st1w { z17.s }, p1, [x28]\n"
+ "add x28, x28, x26, LSL #2\n"
+ ".inst 0xc1341b62 // fmla za.s[x8, 2], { z27.s-z30.s }, z4.s\n"
+ "ld1w { z27.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "mov x12, #0x4\n"
+ "ld1w { z28.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "st1w { z18.s }, p1, [x25]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "add x8, x8, #0x1\n"
+ "ld1w { z29.s }, p0/Z, [x20]\n"
+ "st1w { z19.s }, p1, [x24]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ ".inst 0xc0040e82 // mova za.d[x8, #2], { z20.d-z23.d }\n"
+ "ld1w { z30.s }, p0/Z, [x20]\n"
+ "add x25, x25, x23, LSL #2\n"
+ "add x24, x24, x22, LSL #2\n"
+ "bgt 14b\n"
+ "15:" // Main loop tail
+ ".inst 0xc13a1b20 // fmla za.s[x8, 0], { z25.s-z28.s }, z10.s\n"
+ ".inst 0xc1391b40 // fmla za.s[x8, 0], { z26.s-z29.s }, z9.s\n"
+ ".inst 0xc1371b21 // fmla za.s[x8, 1], { z25.s-z28.s }, z7.s\n"
+ ".inst 0xc1361b22 // fmla za.s[x8, 2], { z25.s-z28.s }, z6.s\n"
+ ".inst 0xc1311b60 // fmla za.s[x8, 0], { z27.s-z30.s }, z1.s\n"
+ ".inst 0xc1381b41 // fmla za.s[x8, 1], { z26.s-z29.s }, z8.s\n"
+ ".inst 0xc1301b42 // fmla za.s[x8, 2], { z26.s-z29.s }, z0.s\n"
+ ".inst 0xc0060c10 // mova { z16.d-z19.d }, za.d[x8, #0]\n"
+ ".inst 0xc1b8c850 // fclamp { z16.s-z19.s }, z2.s, z24.s\n"
+ "st1w { z16.s }, p1, [x9]\n"
+ "add x9, x9, x27, LSL #2\n"
+ ".inst 0xc1351b61 // fmla za.s[x8, 1], { z27.s-z30.s }, z5.s\n"
+ "st1w { z17.s }, p1, [x28]\n"
+ "add x28, x28, x26, LSL #2\n"
+ ".inst 0xc1341b62 // fmla za.s[x8, 2], { z27.s-z30.s }, z4.s\n"
+ "add x8, x8, #0x1\n"
+ "st1w { z18.s }, p1, [x25]\n"
+ "add x25, x25, x23, LSL #2\n"
+ "st1w { z19.s }, p1, [x24]\n"
+ "add x24, x24, x22, LSL #2\n"
+ ".inst 0xc0040e82 // mova za.d[x8, #2], { z20.d-z23.d }\n"
+ "16:" // Main loop skip tail
+ "cbz x11, 18f\n"
+ "17:" // Right padding loop
+ ".inst 0xc0060c08 // mova { z8.d-z11.d }, za.d[x8, #0]\n"
+ "add x8, x8, #0x1\n"
+ "subs x11, x11, #0x1\n"
+ ".inst 0xc1b8c848 // fclamp { z8.s-z11.s }, z2.s, z24.s\n"
+ "st1w { z8.s }, p1, [x9]\n"
+ "add x9, x9, x27, LSL #2\n"
+ ".inst 0xc0040e82 // mova za.d[x8, #2], { z20.d-z23.d }\n"
+ "st1w { z9.s }, p1, [x28]\n"
+ "add x28, x28, x26, LSL #2\n"
+ "st1w { z10.s }, p1, [x25]\n"
+ "add x25, x25, x23, LSL #2\n"
+ "st1w { z11.s }, p1, [x24]\n"
+ "add x24, x24, x22, LSL #2\n"
+ "bgt 17b\n"
+ "18:" // End
+ "ldr x20, [%x[args], %[offsetof_Args_weights]]\n"
+ "incb x20, ALL, MUL #9\n"
+ "str x20, [%x[args], %[offsetof_Args_weights]]\n"
+ "incw x15\n"
+ "ldr x21, [%x[args], %[offsetof_Args_ld_in_vl]]\n"
+ "whilelt p1.s, x15, x16\n"
+ "ldr x20, [%x[args], %[offsetof_Args_inptr]]\n"
+ "add x20, x20, x21, LSL #2\n"
+ "str x20, [%x[args], %[offsetof_Args_inptr]]\n"
+ "ldr x25, [%x[args], %[offsetof_Args_outptrs]]\n"
+ "ldr x24, [%x[args], %[offsetof_Args_ld_out_vls]]\n"
+ "ldp x23, x22, [x25, #0x0]\n"
+ "ldp x21, x20, [x24, #0x0]\n"
+ "add x23, x23, x21, LSL #2\n"
+ "add x22, x22, x20, LSL #2\n"
+ "stp x23, x22, [x25, #0x0]\n"
+ "ldp x23, x22, [x25, #0x10]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x23, x23, x21, LSL #2\n"
+ "add x22, x22, x20, LSL #2\n"
+ "stp x23, x22, [x25, #0x10]\n"
+ "b.any 1b\n"
+ ".inst 0xd503467f // SMSTOP\n"
+ :
+ : [args] "r" (&args), [ld_in_col] "r" (ld_in_col), [ld_in_row] "r" (ld_in_row), [offsetof_Args_bias] "I" (offsetof(Args, bias)), [offsetof_Args_clamp_max] "I" (offsetof(Args, clamp_max)), [offsetof_Args_clamp_min] "I" (offsetof(Args, clamp_min)), [offsetof_Args_current_channel] "I" (offsetof(Args, current_channel)), [offsetof_Args_inptr] "I" (offsetof(Args, inptr)), [offsetof_Args_input_cols] "I" (offsetof(Args, input_cols)), [offsetof_Args_ld_in_vl] "I" (offsetof(Args, ld_in_vl)), [offsetof_Args_ld_out_cols] "I" (offsetof(Args, ld_out_cols)), [offsetof_Args_ld_out_vls] "I" (offsetof(Args, ld_out_vls)), [offsetof_Args_n_channels] "I" (offsetof(Args, n_channels)), [offsetof_Args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_Args_output_cols] "I" (offsetof(Args, output_cols)), [offsetof_Args_pad_bottom] "I" (offsetof(Args, pad_bottom)), [offsetof_Args_pad_left] "I" (offsetof(Args, pad_left)), [offsetof_Args_pad_top] "I" (offsetof(Args, pad_top)), [offsetof_Args_weights] "I" (offsetof(Args, weights))
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME2)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_planar_3x3_s2_4rows_mla_za.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_planar_3x3_s2_4rows_mla_za.hpp
new file mode 100644
index 0000000000..711f7f479a
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_planar_3x3_s2_4rows_mla_za.hpp
@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_conv/depthwise/depthwise_planar.hpp"
+
+namespace arm_conv {
+namespace depthwise {
+
+void sme2_fp32_planar_3x3_s2_4rows_mla_za_impl(
+ const float *inptr,
+ size_t ld_in_row,
+ size_t ld_in_col,
+ size_t ld_in_vl,
+ unsigned int pad_top,
+ unsigned int valid_input_rows,
+ unsigned int pad_left,
+ unsigned int valid_input_cols,
+ const float *weights,
+ const float *bias,
+ float **outptrs,
+ const size_t *outlds,
+ const size_t *outvllds,
+ unsigned int output_cols,
+ unsigned int start_channel,
+ unsigned int valid_channels,
+ float act_min,
+ float act_max
+);
+
+class sme2_fp32_planar_3x3_s2_4rows_mla_za : public PlanarStrategy<float, float>
+{
+ using Parent = PlanarStrategy<float, float>;
+
+ public:
+ using return_type = float;
+ constexpr static auto output_rows = 4u;
+ constexpr static auto kernel_rows = 3u, kernel_cols = 3u;
+ constexpr static auto stride_rows = 2u, stride_cols = 2u;
+ constexpr static auto vl_type = arm_gemm::VLType::SME;
+
+ sme2_fp32_planar_3x3_s2_4rows_mla_za(const CPUInfo *)
+ : Parent(kernel_rows, kernel_cols, stride_rows, stride_cols, output_rows, vl_type)
+ {
+ }
+
+ typename Parent::KernelType get_kernel(void) const override
+ {
+ return sme2_fp32_planar_3x3_s2_4rows_mla_za_impl;
+ }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_planar_3x3_s2_4rows_mla_za/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_planar_3x3_s2_4rows_mla_za/generic.cpp
new file mode 100644
index 0000000000..26315101b4
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_planar_3x3_s2_4rows_mla_za/generic.cpp
@@ -0,0 +1,650 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if defined(ARM_COMPUTE_ENABLE_SME2)
+
+#include <algorithm>
+#include <cstddef>
+
+namespace arm_conv {
+namespace depthwise {
+
+void sme2_fp32_planar_3x3_s2_4rows_mla_za_impl(
+ const float *inptr,
+ size_t ld_in_row,
+ size_t ld_in_col,
+ size_t ld_in_vl,
+ unsigned int pad_top,
+ unsigned int valid_input_rows,
+ unsigned int pad_left,
+ unsigned int valid_input_cols,
+ const float *weights,
+ const float *bias,
+ float **outptrs,
+ const size_t *outlds,
+ const size_t *outvllds,
+ unsigned int output_cols,
+ unsigned int start_channel,
+ unsigned int valid_channels,
+ float act_min,
+ float act_max
+)
+{
+ struct Args
+ {
+ const float *inptr;
+ size_t ld_in_vl;
+ long unsigned int pad_top, pad_bottom, pad_left;
+ const float *weights;
+ const float *bias;
+ long unsigned int input_cols, output_cols;
+ float **outptrs;
+ const size_t *ld_out_cols;
+ const size_t *ld_out_vls;
+ long unsigned int current_channel, n_channels;
+ float clamp_min, clamp_max;
+ };
+
+ Args args = { inptr, ld_in_vl, pad_top, 9u - std::min(9u, pad_top + valid_input_rows), pad_left, weights, bias, valid_input_cols, output_cols, outptrs, outlds, outvllds, start_channel, valid_channels, act_min, act_max };
+
+ __asm__ __volatile__(
+ "ldr x7, [%x[args], %[offsetof_Args_pad_bottom]]\n"
+ "mov x20, #0x9\n"
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "sub x20, x20, x7\n"
+ "ldr x17, [%x[args], %[offsetof_Args_pad_top]]\n"
+ "ptrue p2.b\n"
+ ".inst 0x25207812 // ptrue pn10.b\n"
+ "ld1rw { z7.s }, p2/Z, [%x[args], %[offsetof_Args_clamp_min]]\n"
+ "ldr x16, [%x[args], %[offsetof_Args_n_channels]]\n"
+ "whilelt p1.s, XZR, x16\n"
+ "whilelt p9.s, XZR, x20\n"
+ "ld1rw { z9.s }, p2/Z, [%x[args], %[offsetof_Args_clamp_max]]\n"
+ "whilelt p8.s, XZR, x17\n"
+ "eor p8.b, p2/Z, p8.b, p9.b\n"
+ "ldr x15, [%x[args], %[offsetof_Args_current_channel]]\n"
+ "1:" // Channel loop
+ "ldr x20, [%x[args], %[offsetof_Args_bias]]\n"
+ "fmov z12.s, #0x0\n"
+ "cbz x20, 2f\n"
+ "ld1w { z12.s }, p1/Z, [x20, x15, LSL #2]\n"
+ "2:" // Load bias: Done
+ "ldr x14, [%x[args], %[offsetof_Args_input_cols]]\n"
+ "sub x20, x14, #0x1\n"
+ "orr x24, x20, %x[ld_in_col], LSL #18\n"
+ "mov z13.d, z12.d\n"
+ "ldr x23, [%x[args], %[offsetof_Args_weights]]\n"
+ ".inst 0xa1404ae2 // ld1w { z2.s, z10.s }, pn10.b/Z, [x23]\n"
+ "orr x24, x16, x24, LSL #20\n"
+ "mov x22, #0x9\n"
+ "ldr x13, [%x[args], %[offsetof_Args_inptr]]\n"
+ "ld1w { z8.s }, p2/Z, [x23, #2, MUL VL]\n"
+ "addvl x23, x23, #3\n"
+ "add x21, x17, x7\n"
+ ".inst 0xa0404ae0 // ld1w { z0.s-z1.s }, pn10.b/Z, [x23]\n"
+ "lsl x20, %x[ld_in_row], #0x2\n"
+ "mov z14.d, z12.d\n"
+ "mov z15.d, z12.d\n"
+ "ld1w { z5.s }, p2/Z, [x23, #2, MUL VL]\n"
+ "addvl x23, x23, #3\n"
+ "mov x8, #0x0\n"
+ "ldr x11, [%x[args], %[offsetof_Args_output_cols]]\n"
+ ".inst 0xa1404ae3 // ld1w { z3.s, z11.s }, pn10.b/Z, [x23]\n"
+ "lsl x24, x24, #0x2\n"
+ "sub x22, x22, x21\n"
+ "ld1w { z6.s }, p2/Z, [x23, #2, MUL VL]\n"
+ "madd x20, x20, x17, x13\n"
+ "3:" // Issue prefetches
+ "subs x22, x22, #0x1\n"
+ ".inst 0xf8b84a9c // rprfm pldstrm, x24, [x20]\n"
+ "add x20, x20, %x[ld_in_col], LSL #2\n"
+ "bgt 3b\n"
+ "ldr x23, [%x[args], %[offsetof_Args_outptrs]]\n"
+ "lsl x20, %x[ld_in_row], #0x2\n"
+ "msub x13, x17, x20, x13\n"
+ ".inst 0xc0040d80 // mova za.d[x8, #0], { z12.d-z15.d }\n"
+ "ldr x20, [%x[args], %[offsetof_Args_ld_out_cols]]\n"
+ ".inst 0xc0040d81 // mova za.d[x8, #1], { z12.d-z15.d }\n"
+ "mov x22, #0x2\n"
+ "ldp x10, x9, [x23], #0x10\n"
+ ".inst 0xc0040d82 // mova za.d[x8, #2], { z12.d-z15.d }\n"
+ "ldp x28, x27, [x20], #0x10\n"
+ "ldr x21, [%x[args], %[offsetof_Args_pad_left]]\n"
+ "ldp x26, x25, [x23], #0x10\n"
+ "ldp x24, x23, [x20], #0x10\n"
+ "cbz x21, 5f\n"
+ "cmp x21, x22\n"
+ "csel x20, x21, x22, LT\n"
+ "sub x21, x21, x20\n"
+ "sub x22, x22, x20\n"
+ "cbz x21, 5f\n"
+ ".inst 0xc0060c14 // mova { z20.d-z23.d }, za.d[x8, #0]\n"
+ "and x22, x21, #0x1\n"
+ "add x21, x21, #0x1\n"
+ ".inst 0xc1a9c8f4 // fclamp { z20.s-z23.s }, z7.s, z9.s\n"
+ "lsr x21, x21, #0x1\n"
+ "sub x11, x11, x21\n"
+ "4:" // Left padding
+ "subs x21, x21, #0x1\n"
+ "st1w { z20.s }, p1, [x10]\n"
+ "add x10, x10, x28, LSL #2\n"
+ "st1w { z21.s }, p1, [x9]\n"
+ "add x9, x9, x27, LSL #2\n"
+ "st1w { z22.s }, p1, [x26]\n"
+ "add x26, x26, x24, LSL #2\n"
+ "st1w { z23.s }, p1, [x25]\n"
+ "add x25, x25, x23, LSL #2\n"
+ "bgt 4b\n"
+ "5:" // Left padding: End
+ "adds XZR, x17, x7\n"
+ "bne 10f\n"
+ "cbz x22, 8f\n"
+ "cmp x22, #0x1\n"
+ "sub x14, x14, x22\n"
+ "beq 7f\n"
+ "6:" // Unpadded: 2 priming loads
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
+ "ld1w { z19.s }, p1/Z, [x13]\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ "ld1w { z24.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z20.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z25.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z21.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z26.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z22.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc1321a60 // fmla za.s[x8, 0], { z19.s-z22.s }, z2.s\n"
+ "ld1w { z27.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc1301b00 // fmla za.s[x8, 0], { z24.s-z27.s }, z0.s\n"
+ "ld1w { z23.s }, p1/Z, [x20]\n"
+ ".inst 0xc1331a80 // fmla za.s[x8, 0], { z20.s-z23.s }, z3.s\n"
+ "7:" // Unpadded: 1 priming loads
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
+ "ld1w { z26.s }, p1/Z, [x13]\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ "ld1w { z19.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z27.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z20.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z28.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z21.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z29.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc13a1b40 // fmla za.s[x8, 0], { z26.s-z29.s }, z10.s\n"
+ "ld1w { z22.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc1311a60 // fmla za.s[x8, 0], { z19.s-z22.s }, z1.s\n"
+ "ld1w { z30.s }, p1/Z, [x20]\n"
+ ".inst 0xc13b1b60 // fmla za.s[x8, 0], { z27.s-z30.s }, z11.s\n"
+ "8:" // Unpadded: 0 priming loads
+ "cmp x14, #0x2\n"
+ "blt 16f\n"
+ "add x21, x13, %x[ld_in_row], LSL #2\n"
+ "ld1w { z25.s }, p1/Z, [x13]\n"
+ "sub x14, x14, #0x2\n"
+ "ld1w { z19.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ "sub x11, x11, #0x1\n"
+ "ld1w { z26.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ "lsr x20, x14, #0x1\n"
+ "ld1w { z20.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ "cmp x20, x11\n"
+ "ld1w { z27.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ "csel x22, x20, x11, LT\n"
+ "ld1w { z21.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ "ld1w { z28.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ "and x14, x14, #0x1\n"
+ "ld1w { z22.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ "sub x11, x11, x22\n"
+ "ld1w { z29.s }, p1/Z, [x21]\n"
+ "cbz x22, 15f\n"
+ "9:" // Unpadded: Main loop
+ ".inst 0xc1381b20 // fmla za.s[x8, 0], { z25.s-z28.s }, z8.s\n"
+ "add x21, x13, %x[ld_in_row], LSL #2\n"
+ "subs x22, x22, #0x1\n"
+ ".inst 0xc1321b21 // fmla za.s[x8, 1], { z25.s-z28.s }, z2.s\n"
+ "ld1w { z25.s }, p1/Z, [x13]\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc1351a60 // fmla za.s[x8, 0], { z19.s-z22.s }, z5.s\n"
+ ".inst 0xc1301a61 // fmla za.s[x8, 1], { z19.s-z22.s }, z0.s\n"
+ "ld1w { z18.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc1361b40 // fmla za.s[x8, 0], { z26.s-z29.s }, z6.s\n"
+ ".inst 0xc1331b41 // fmla za.s[x8, 1], { z26.s-z29.s }, z3.s\n"
+ "ld1w { z26.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ "ld1w { z19.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc0060c14 // mova { z20.d-z23.d }, za.d[x8, #0]\n"
+ "add x8, x8, #0x1\n"
+ "ld1w { z27.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc1a9c8f4 // fclamp { z20.s-z23.s }, z7.s, z9.s\n"
+ "st1w { z20.s }, p1, [x10]\n"
+ "ld1w { z20.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ "add x10, x10, x28, LSL #2\n"
+ "st1w { z21.s }, p1, [x9]\n"
+ "ld1w { z28.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc13a1b20 // fmla za.s[x8, 0], { z25.s-z28.s }, z10.s\n"
+ "add x9, x9, x27, LSL #2\n"
+ "ld1w { z21.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc1311a40 // fmla za.s[x8, 0], { z18.s-z21.s }, z1.s\n"
+ "st1w { z22.s }, p1, [x26]\n"
+ "ld1w { z29.s }, p1/Z, [x21]\n"
+ ".inst 0xc13b1b40 // fmla za.s[x8, 0], { z26.s-z29.s }, z11.s\n"
+ "add x26, x26, x24, LSL #2\n"
+ "st1w { z23.s }, p1, [x25]\n"
+ "ld1w { z25.s }, p1/Z, [x13]\n"
+ "add x25, x25, x23, LSL #2\n"
+ ".inst 0xc0040d82 // mova za.d[x8, #2], { z12.d-z15.d }\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ "ld1w { z19.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z26.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z20.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z27.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z21.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z28.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z22.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z29.s }, p1/Z, [x20]\n"
+ "bgt 9b\n"
+ "b 15f\n"
+ "10:" // Padded
+ "cbz x22, 13f\n"
+ "cmp x22, #0x1\n"
+ "sub x14, x14, x22\n"
+ "beq 12f\n"
+ "11:" // Padded: 2 priming loads
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z27.s }, p0/Z, [x13]\n"
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z23.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z28.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1w { z24.s }, p0/Z, [x20]\n"
+ "mov x12, #0x4\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z29.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z25.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z30.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "mov x12, #0x8\n"
+ ".inst 0xc1321b60 // fmla za.s[x8, 0], { z27.s-z30.s }, z2.s\n"
+ "ld1w { z26.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ ".inst 0xc1301ae0 // fmla za.s[x8, 0], { z23.s-z26.s }, z0.s\n"
+ "ld1w { z31.s }, p0/Z, [x20]\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ ".inst 0xc1331b80 // fmla za.s[x8, 0], { z28.s-z31.s }, z3.s\n"
+ "12:" // Padded: 1 priming loads
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z22.s }, p0/Z, [x13]\n"
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z27.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z23.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1w { z28.s }, p0/Z, [x20]\n"
+ "mov x12, #0x4\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z24.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z29.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z25.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "mov x12, #0x8\n"
+ ".inst 0xc13a1ac0 // fmla za.s[x8, 0], { z22.s-z25.s }, z10.s\n"
+ "ld1w { z30.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ ".inst 0xc1311b60 // fmla za.s[x8, 0], { z27.s-z30.s }, z1.s\n"
+ "ld1w { z26.s }, p0/Z, [x20]\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ ".inst 0xc13b1ae0 // fmla za.s[x8, 0], { z23.s-z26.s }, z11.s\n"
+ "13:" // Padded: 0 priming loads
+ "cmp x14, #0x2\n"
+ "blt 16f\n"
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z25.s }, p0/Z, [x13]\n"
+ "add x21, x13, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z19.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z26.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1w { z20.s }, p0/Z, [x21]\n"
+ "mov x12, #0x4\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z27.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "sub x14, x14, #0x2\n"
+ "ld1w { z21.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z28.s }, p0/Z, [x21]\n"
+ "sub x11, x11, #0x1\n"
+ "lsr x20, x14, #0x1\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1w { z22.s }, p0/Z, [x21]\n"
+ "mov x12, #0x8\n"
+ "cmp x20, x11\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z29.s }, p0/Z, [x21]\n"
+ "csel x22, x20, x11, LT\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ "and x14, x14, #0x1\n"
+ "sub x11, x11, x22\n"
+ "cbz x22, 15f\n"
+ "14:" // Padded: Main loop
+ ".inst 0xc1381b20 // fmla za.s[x8, 0], { z25.s-z28.s }, z8.s\n"
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ ".inst 0xc1321b21 // fmla za.s[x8, 1], { z25.s-z28.s }, z2.s\n"
+ "ld1w { z18.s }, p0/Z, [x13]\n"
+ "add x21, x13, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ ".inst 0xc1351a60 // fmla za.s[x8, 0], { z19.s-z22.s }, z5.s\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc1301a61 // fmla za.s[x8, 1], { z19.s-z22.s }, z0.s\n"
+ "ld1w { z25.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ ".inst 0xc1361b40 // fmla za.s[x8, 0], { z26.s-z29.s }, z6.s\n"
+ "subs x22, x22, #0x1\n"
+ ".inst 0xc1331b41 // fmla za.s[x8, 1], { z26.s-z29.s }, z3.s\n"
+ "ld1w { z19.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "mov x12, #0x4\n"
+ "ld1w { z26.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc0060c1c // mova { z28.d-z31.d }, za.d[x8, #0]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z20.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc1a9c8fc // fclamp { z28.s-z31.s }, z7.s, z9.s\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z27.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ "st1w { z28.s }, p1, [x10]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z21.s }, p0/Z, [x21]\n"
+ "add x8, x8, #0x1\n"
+ "st1w { z29.s }, p1, [x9]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1w { z28.s }, p0/Z, [x21]\n"
+ "st1w { z30.s }, p1, [x26]\n"
+ "mov x12, #0x8\n"
+ ".inst 0xc13a1a40 // fmla za.s[x8, 0], { z18.s-z21.s }, z10.s\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ "st1w { z31.s }, p1, [x25]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ ".inst 0xc1311b20 // fmla za.s[x8, 0], { z25.s-z28.s }, z1.s\n"
+ "mov x12, #0x0\n"
+ "ld1w { z22.s }, p0/Z, [x21]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z25.s }, p0/Z, [x13]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ ".inst 0xc13b1a60 // fmla za.s[x8, 0], { z19.s-z22.s }, z11.s\n"
+ "ld1w { z19.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ ".inst 0xc0040d82 // mova za.d[x8, #2], { z12.d-z15.d }\n"
+ "ld1w { z26.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "mov x12, #0x4\n"
+ "ld1w { z20.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z27.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z21.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z28.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "mov x12, #0x8\n"
+ "ld1w { z22.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z29.s }, p0/Z, [x20]\n"
+ "add x10, x10, x28, LSL #2\n"
+ "add x9, x9, x27, LSL #2\n"
+ "add x26, x26, x24, LSL #2\n"
+ "add x25, x25, x23, LSL #2\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ "bgt 14b\n"
+ "15:" // Main loop tail
+ ".inst 0xc1381b20 // fmla za.s[x8, 0], { z25.s-z28.s }, z8.s\n"
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ ".inst 0xc1321b21 // fmla za.s[x8, 1], { z25.s-z28.s }, z2.s\n"
+ "ld1w { z18.s }, p0/Z, [x13]\n"
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ ".inst 0xc1351a60 // fmla za.s[x8, 0], { z19.s-z22.s }, z5.s\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ ".inst 0xc1301a61 // fmla za.s[x8, 1], { z19.s-z22.s }, z0.s\n"
+ "ld1w { z22.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ ".inst 0xc1361b40 // fmla za.s[x8, 0], { z26.s-z29.s }, z6.s\n"
+ ".inst 0xc1331b41 // fmla za.s[x8, 1], { z26.s-z29.s }, z3.s\n"
+ "ld1w { z19.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "mov x12, #0x4\n"
+ "ld1w { z23.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc0060c1c // mova { z28.d-z31.d }, za.d[x8, #0]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z20.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc1a9c8fc // fclamp { z28.s-z31.s }, z7.s, z9.s\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z24.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "st1w { z28.s }, p1, [x10]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z21.s }, p0/Z, [x20]\n"
+ "add x8, x8, #0x1\n"
+ "st1w { z29.s }, p1, [x9]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1w { z25.s }, p0/Z, [x20]\n"
+ "st1w { z30.s }, p1, [x26]\n"
+ "mov x12, #0x8\n"
+ ".inst 0xc13a1a40 // fmla za.s[x8, 0], { z18.s-z21.s }, z10.s\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "st1w { z31.s }, p1, [x25]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ ".inst 0xc1311ac0 // fmla za.s[x8, 0], { z22.s-z25.s }, z1.s\n"
+ "ld1w { z22.s }, p0/Z, [x20]\n"
+ "add x10, x10, x28, LSL #2\n"
+ "add x9, x9, x27, LSL #2\n"
+ "add x26, x26, x24, LSL #2\n"
+ ".inst 0xc0040d82 // mova za.d[x8, #2], { z12.d-z15.d }\n"
+ "add x25, x25, x23, LSL #2\n"
+ ".inst 0xc13b1a60 // fmla za.s[x8, 0], { z19.s-z22.s }, z11.s\n"
+ "16:" // Main loop skip tail
+ "cbz x14, 17f\n" // Skip remainder inputs
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z21.s }, p0/Z, [x13]\n"
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z28.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z22.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1w { z29.s }, p0/Z, [x20]\n"
+ "mov x12, #0x4\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z23.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z30.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z24.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "mov x12, #0x8\n"
+ ".inst 0xc1381aa0 // fmla za.s[x8, 0], { z21.s-z24.s }, z8.s\n"
+ "ld1w { z31.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ ".inst 0xc1351b80 // fmla za.s[x8, 0], { z28.s-z31.s }, z5.s\n"
+ "ld1w { z25.s }, p0/Z, [x20]\n"
+ ".inst 0xc1321aa1 // fmla za.s[x8, 1], { z21.s-z24.s }, z2.s\n"
+ "sub x11, x11, #0x1\n"
+ ".inst 0xc1361ac0 // fmla za.s[x8, 0], { z22.s-z25.s }, z6.s\n"
+ ".inst 0xc1301b81 // fmla za.s[x8, 1], { z28.s-z31.s }, z0.s\n"
+ ".inst 0xc0060c10 // mova { z16.d-z19.d }, za.d[x8, #0]\n"
+ ".inst 0xc1a9c8f0 // fclamp { z16.s-z19.s }, z7.s, z9.s\n"
+ "st1w { z16.s }, p1, [x10]\n"
+ "add x10, x10, x28, LSL #2\n"
+ ".inst 0xc1331ac1 // fmla za.s[x8, 1], { z22.s-z25.s }, z3.s\n"
+ "add x8, x8, #0x1\n"
+ "st1w { z17.s }, p1, [x9]\n"
+ "add x9, x9, x27, LSL #2\n"
+ "st1w { z18.s }, p1, [x26]\n"
+ "add x26, x26, x24, LSL #2\n"
+ ".inst 0xc0040d82 // mova za.d[x8, #2], { z12.d-z15.d }\n"
+ "st1w { z19.s }, p1, [x25]\n"
+ "add x25, x25, x23, LSL #2\n"
+ "17:" // Tail input: End
+ "cbz x11, 19f\n"
+ "18:" // Right padding loop
+ ".inst 0xc0060c00 // mova { z0.d-z3.d }, za.d[x8, #0]\n"
+ "add x8, x8, #0x1\n"
+ "subs x11, x11, #0x1\n"
+ ".inst 0xc1a9c8e0 // fclamp { z0.s-z3.s }, z7.s, z9.s\n"
+ "st1w { z0.s }, p1, [x10]\n"
+ "add x10, x10, x28, LSL #2\n"
+ ".inst 0xc0040d82 // mova za.d[x8, #2], { z12.d-z15.d }\n"
+ "st1w { z1.s }, p1, [x9]\n"
+ "add x9, x9, x27, LSL #2\n"
+ "st1w { z2.s }, p1, [x26]\n"
+ "add x26, x26, x24, LSL #2\n"
+ "st1w { z3.s }, p1, [x25]\n"
+ "add x25, x25, x23, LSL #2\n"
+ "bgt 18b\n"
+ "19:" // End
+ "ldr x20, [%x[args], %[offsetof_Args_weights]]\n"
+ "incb x20, ALL, MUL #9\n"
+ "str x20, [%x[args], %[offsetof_Args_weights]]\n"
+ "incw x15\n"
+ "ldr x21, [%x[args], %[offsetof_Args_ld_in_vl]]\n"
+ "whilelt p1.s, x15, x16\n"
+ "ldr x20, [%x[args], %[offsetof_Args_inptr]]\n"
+ "add x20, x20, x21, LSL #2\n"
+ "str x20, [%x[args], %[offsetof_Args_inptr]]\n"
+ "ldr x25, [%x[args], %[offsetof_Args_outptrs]]\n"
+ "ldr x24, [%x[args], %[offsetof_Args_ld_out_vls]]\n"
+ "ldp x23, x22, [x25, #0x0]\n"
+ "ldp x21, x20, [x24, #0x0]\n"
+ "add x23, x23, x21, LSL #2\n"
+ "add x22, x22, x20, LSL #2\n"
+ "stp x23, x22, [x25, #0x0]\n"
+ "ldp x23, x22, [x25, #0x10]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x23, x23, x21, LSL #2\n"
+ "add x22, x22, x20, LSL #2\n"
+ "stp x23, x22, [x25, #0x10]\n"
+ "b.any 1b\n"
+ ".inst 0xd503467f // SMSTOP\n"
+ :
+ : [args] "r" (&args), [ld_in_col] "r" (ld_in_col), [ld_in_row] "r" (ld_in_row), [offsetof_Args_bias] "I" (offsetof(Args, bias)), [offsetof_Args_clamp_max] "I" (offsetof(Args, clamp_max)), [offsetof_Args_clamp_min] "I" (offsetof(Args, clamp_min)), [offsetof_Args_current_channel] "I" (offsetof(Args, current_channel)), [offsetof_Args_inptr] "I" (offsetof(Args, inptr)), [offsetof_Args_input_cols] "I" (offsetof(Args, input_cols)), [offsetof_Args_ld_in_vl] "I" (offsetof(Args, ld_in_vl)), [offsetof_Args_ld_out_cols] "I" (offsetof(Args, ld_out_cols)), [offsetof_Args_ld_out_vls] "I" (offsetof(Args, ld_out_vls)), [offsetof_Args_n_channels] "I" (offsetof(Args, n_channels)), [offsetof_Args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_Args_output_cols] "I" (offsetof(Args, output_cols)), [offsetof_Args_pad_bottom] "I" (offsetof(Args, pad_bottom)), [offsetof_Args_pad_left] "I" (offsetof(Args, pad_left)), [offsetof_Args_pad_top] "I" (offsetof(Args, pad_top)), [offsetof_Args_weights] "I" (offsetof(Args, weights))
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME2)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_planar_5x5_s1_4rows_mla_za.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_planar_5x5_s1_4rows_mla_za.hpp
new file mode 100644
index 0000000000..71487e08b6
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_planar_5x5_s1_4rows_mla_za.hpp
@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_conv/depthwise/depthwise_planar.hpp"
+
+namespace arm_conv {
+namespace depthwise {
+
+void sme2_fp32_planar_5x5_s1_4rows_mla_za_impl(
+ const float *inptr,
+ size_t ld_in_row,
+ size_t ld_in_col,
+ size_t ld_in_vl,
+ unsigned int pad_top,
+ unsigned int valid_input_rows,
+ unsigned int pad_left,
+ unsigned int valid_input_cols,
+ const float *weights,
+ const float *bias,
+ float **outptrs,
+ const size_t *outlds,
+ const size_t *outvllds,
+ unsigned int output_cols,
+ unsigned int start_channel,
+ unsigned int valid_channels,
+ float act_min,
+ float act_max
+);
+
+class sme2_fp32_planar_5x5_s1_4rows_mla_za : public PlanarStrategy<float, float>
+{
+ using Parent = PlanarStrategy<float, float>;
+
+ public:
+ using return_type = float;
+ constexpr static auto output_rows = 4u;
+ constexpr static auto kernel_rows = 5u, kernel_cols = 5u;
+ constexpr static auto stride_rows = 1u, stride_cols = 1u;
+ constexpr static auto vl_type = arm_gemm::VLType::SME;
+
+ sme2_fp32_planar_5x5_s1_4rows_mla_za(const CPUInfo *)
+ : Parent(kernel_rows, kernel_cols, stride_rows, stride_cols, output_rows, vl_type)
+ {
+ }
+
+ typename Parent::KernelType get_kernel(void) const override
+ {
+ return sme2_fp32_planar_5x5_s1_4rows_mla_za_impl;
+ }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_planar_5x5_s1_4rows_mla_za/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_planar_5x5_s1_4rows_mla_za/generic.cpp
new file mode 100644
index 0000000000..3741b973b4
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_planar_5x5_s1_4rows_mla_za/generic.cpp
@@ -0,0 +1,883 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if defined(ARM_COMPUTE_ENABLE_SME2)
+
+#include <algorithm>
+#include <cstddef>
+
+namespace arm_conv {
+namespace depthwise {
+
+void sme2_fp32_planar_5x5_s1_4rows_mla_za_impl(
+ const float *inptr,
+ size_t ld_in_row,
+ size_t ld_in_col,
+ size_t ld_in_vl,
+ unsigned int pad_top,
+ unsigned int valid_input_rows,
+ unsigned int pad_left,
+ unsigned int valid_input_cols,
+ const float *weights,
+ const float *bias,
+ float **outptrs,
+ const size_t *outlds,
+ const size_t *outvllds,
+ unsigned int output_cols,
+ unsigned int start_channel,
+ unsigned int valid_channels,
+ float act_min,
+ float act_max
+)
+{
+ struct Args
+ {
+ const float *inptr;
+ size_t ld_in_vl;
+ long unsigned int pad_top, pad_bottom, pad_left;
+ const float *weights;
+ const float *bias;
+ long unsigned int input_cols, output_cols;
+ float **outptrs;
+ const size_t *ld_out_cols;
+ const size_t *ld_out_vls;
+ long unsigned int current_channel, n_channels;
+ float clamp_min, clamp_max;
+ };
+
+ Args args = { inptr, ld_in_vl, pad_top, 8u - std::min(8u, pad_top + valid_input_rows), pad_left, weights, bias, valid_input_cols, output_cols, outptrs, outlds, outvllds, start_channel, valid_channels, act_min, act_max };
+
+ __asm__ __volatile__(
+ "ldr x6, [%x[args], %[offsetof_Args_pad_bottom]]\n"
+ "mov x20, #0x8\n"
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "sub x20, x20, x6\n"
+ "ldr x7, [%x[args], %[offsetof_Args_pad_top]]\n"
+ "ptrue p2.b\n"
+ ".inst 0x25207812 // ptrue pn10.b\n"
+ "ld1rw { z16.s }, p2/Z, [%x[args], %[offsetof_Args_clamp_min]]\n"
+ "ldr x17, [%x[args], %[offsetof_Args_n_channels]]\n"
+ "whilelt p1.s, XZR, x17\n"
+ "whilelt p9.s, XZR, x20\n"
+ "ld1rw { z17.s }, p2/Z, [%x[args], %[offsetof_Args_clamp_max]]\n"
+ "whilelt p8.s, XZR, x7\n"
+ "eor p8.b, p2/Z, p8.b, p9.b\n"
+ "ldr x16, [%x[args], %[offsetof_Args_current_channel]]\n"
+ "1:" // Channel loop
+ "ldr x20, [%x[args], %[offsetof_Args_bias]]\n"
+ "fmov z28.s, #0x0\n"
+ "cbz x20, 2f\n"
+ "ld1w { z28.s }, p1/Z, [x20, x16, LSL #2]\n"
+ "2:" // Load bias: Done
+ "ldr x15, [%x[args], %[offsetof_Args_input_cols]]\n"
+ "sub x20, x15, #0x1\n"
+ "orr x23, x20, %x[ld_in_col], LSL #18\n"
+ "mov z29.d, z28.d\n"
+ "ldr x14, [%x[args], %[offsetof_Args_weights]]\n"
+ ".inst 0xa04049ce // ld1w { z14.s-z15.s }, pn10.b/Z, [x14]\n"
+ "orr x23, x17, x23, LSL #20\n"
+ "mov x22, #0x8\n"
+ "ldr x13, [%x[args], %[offsetof_Args_inptr]]\n"
+ ".inst 0xa04149cc // ld1w { z12.s-z13.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ "add x21, x7, x6\n"
+ "lsl x20, %x[ld_in_row], #0x2\n"
+ "ld1w { z2.s }, p2/Z, [x14, #4, MUL VL]\n"
+ "addvl x14, x14, #5\n"
+ "mov z30.d, z28.d\n"
+ "mov z31.d, z28.d\n"
+ ".inst 0xa04049ca // ld1w { z10.s-z11.s }, pn10.b/Z, [x14]\n"
+ "mov x8, #0x0\n"
+ "ldr x11, [%x[args], %[offsetof_Args_output_cols]]\n"
+ "lsl x23, x23, #0x2\n"
+ ".inst 0xa04149c8 // ld1w { z8.s-z9.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ "sub x22, x22, x21\n"
+ "madd x20, x20, x7, x13\n"
+ "ld1w { z3.s }, p2/Z, [x14, #4, MUL VL]\n"
+ "addvl x14, x14, #5\n"
+ "3:" // Issue prefetches
+ "subs x22, x22, #0x1\n"
+ ".inst 0xf8b74a9c // rprfm pldstrm, x23, [x20]\n"
+ "add x20, x20, %x[ld_in_col], LSL #2\n"
+ "bgt 3b\n"
+ "ldr x22, [%x[args], %[offsetof_Args_outptrs]]\n"
+ "lsl x20, %x[ld_in_row], #0x2\n"
+ "msub x13, x7, x20, x13\n"
+ ".inst 0xc0040f80 // mova za.d[x8, #0], { z28.d-z31.d }\n"
+ "ldr x20, [%x[args], %[offsetof_Args_ld_out_cols]]\n"
+ ".inst 0xc0040f81 // mova za.d[x8, #1], { z28.d-z31.d }\n"
+ "mov x10, #0x4\n"
+ "ldp x9, x28, [x22], #0x10\n"
+ ".inst 0xc0040f82 // mova za.d[x8, #2], { z28.d-z31.d }\n"
+ "ldp x27, x26, [x20], #0x10\n"
+ ".inst 0xc0040f83 // mova za.d[x8, #3], { z28.d-z31.d }\n"
+ "ldr x21, [%x[args], %[offsetof_Args_pad_left]]\n"
+ ".inst 0xc0040f84 // mova za.d[x8, #4], { z28.d-z31.d }\n"
+ "ldp x25, x24, [x22], #0x10\n"
+ "ldp x23, x22, [x20], #0x10\n"
+ "cbz x21, 5f\n"
+ "cmp x21, x10\n"
+ "csel x20, x21, x10, LT\n"
+ "sub x21, x21, x20\n"
+ "sub x10, x10, x20\n"
+ "cbz x21, 5f\n"
+ ".inst 0xc0060c04 // mova { z4.d-z7.d }, za.d[x8, #0]\n"
+ "sub x11, x11, x21\n"
+ ".inst 0xc1b1ca04 // fclamp { z4.s-z7.s }, z16.s, z17.s\n"
+ "4:" // Left padding
+ "subs x21, x21, #0x1\n"
+ "st1w { z4.s }, p1, [x9]\n"
+ "add x9, x9, x27, LSL #2\n"
+ "st1w { z5.s }, p1, [x28]\n"
+ "add x28, x28, x26, LSL #2\n"
+ "st1w { z6.s }, p1, [x25]\n"
+ "add x25, x25, x23, LSL #2\n"
+ "st1w { z7.s }, p1, [x24]\n"
+ "add x24, x24, x22, LSL #2\n"
+ "bgt 4b\n"
+ "5:" // Left padding: End
+ "adds XZR, x7, x6\n"
+ "bne 12f\n"
+ "cbz x10, 10f\n"
+ "cmp x10, #0x1\n"
+ "sub x15, x15, x10\n"
+ "beq 9f\n"
+ "cmp x10, #0x2\n"
+ "beq 8f\n"
+ "cmp x10, #0x3\n"
+ "beq 7f\n"
+ "6:" // Unpadded: 4 priming loads
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
+ "ld1w { z18.s }, p1/Z, [x13]\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ "ld1w { z19.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z20.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z21.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc13e1a40 // fmla za.s[x8, 0], { z18.s-z21.s }, z14.s\n"
+ "ld1w { z22.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc13a1a60 // fmla za.s[x8, 0], { z19.s-z22.s }, z10.s\n"
+ "ld1w { z23.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xa14049c5 // ld1w { z5.s, z13.s }, pn10.b/Z, [x14]\n"
+ "addvl x14, x14, #5\n"
+ ".inst 0xc1351a80 // fmla za.s[x8, 0], { z20.s-z23.s }, z5.s\n"
+ "ld1w { z24.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xa04049cc // ld1w { z12.s-z13.s }, pn10.b/Z, [x14]\n"
+ "addvl x14, x14, #5\n"
+ ".inst 0xc13c1aa0 // fmla za.s[x8, 0], { z21.s-z24.s }, z12.s\n"
+ "ld1w { z25.s }, p1/Z, [x20]\n"
+ ".inst 0xa14049c1 // ld1w { z1.s, z9.s }, pn10.b/Z, [x14]\n"
+ "ldr x14, [%x[args], %[offsetof_Args_weights]]\n"
+ ".inst 0xc1311ac0 // fmla za.s[x8, 0], { z22.s-z25.s }, z1.s\n"
+ ".inst 0xa04049ce // ld1w { z14.s-z15.s }, pn10.b/Z, [x14]\n"
+ "addvl x14, x14, #5\n"
+ ".inst 0xa04049ca // ld1w { z10.s-z11.s }, pn10.b/Z, [x14]\n"
+ "addvl x14, x14, #5\n"
+ "7:" // Unpadded: 3 priming loads
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
+ "ld1w { z20.s }, p1/Z, [x13]\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ "ld1w { z21.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z22.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z23.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc13f1a80 // fmla za.s[x8, 0], { z20.s-z23.s }, z15.s\n"
+ ".inst 0xc13e1a81 // fmla za.s[x8, 1], { z20.s-z23.s }, z14.s\n"
+ "ld1w { z24.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc13b1aa0 // fmla za.s[x8, 0], { z21.s-z24.s }, z11.s\n"
+ "ld1w { z25.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc13a1aa1 // fmla za.s[x8, 1], { z21.s-z24.s }, z10.s\n"
+ ".inst 0xa04049c6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x14]\n"
+ "addvl x14, x14, #5\n"
+ ".inst 0xc1371ac0 // fmla za.s[x8, 0], { z22.s-z25.s }, z7.s\n"
+ "ld1w { z26.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc1361ac1 // fmla za.s[x8, 1], { z22.s-z25.s }, z6.s\n"
+ ".inst 0xa14049c5 // ld1w { z5.s, z13.s }, pn10.b/Z, [x14]\n"
+ "addvl x14, x14, #5\n"
+ ".inst 0xc13d1ae0 // fmla za.s[x8, 0], { z23.s-z26.s }, z13.s\n"
+ "ld1w { z27.s }, p1/Z, [x20]\n"
+ ".inst 0xc1351ae1 // fmla za.s[x8, 1], { z23.s-z26.s }, z5.s\n"
+ ".inst 0xa04049c6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x14]\n"
+ "ldr x14, [%x[args], %[offsetof_Args_weights]]\n"
+ ".inst 0xc1371b00 // fmla za.s[x8, 0], { z24.s-z27.s }, z7.s\n"
+ ".inst 0xa04149cc // ld1w { z12.s-z13.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ ".inst 0xc1361b01 // fmla za.s[x8, 1], { z24.s-z27.s }, z6.s\n"
+ ".inst 0xa04049ce // ld1w { z14.s-z15.s }, pn10.b/Z, [x14]\n"
+ "addvl x14, x14, #5\n"
+ ".inst 0xa04049ca // ld1w { z10.s-z11.s }, pn10.b/Z, [x14]\n"
+ ".inst 0xa04149c8 // ld1w { z8.s-z9.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ "addvl x14, x14, #5\n"
+ "8:" // Unpadded: 2 priming loads
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
+ "ld1w { z1.s }, p1/Z, [x13]\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ "ld1w { z2.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z3.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z4.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc13c1820 // fmla za.s[x8, 0], { z1.s-z4.s }, z12.s\n"
+ ".inst 0xc13f1821 // fmla za.s[x8, 1], { z1.s-z4.s }, z15.s\n"
+ "ld1w { z5.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc13e1822 // fmla za.s[x8, 2], { z1.s-z4.s }, z14.s\n"
+ "ld1w { z6.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc1381840 // fmla za.s[x8, 0], { z2.s-z5.s }, z8.s\n"
+ ".inst 0xa04049cc // ld1w { z12.s-z13.s }, pn10.b/Z, [x14]\n"
+ ".inst 0xc13b1841 // fmla za.s[x8, 1], { z2.s-z5.s }, z11.s\n"
+ ".inst 0xa04149ce // ld1w { z14.s-z15.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ "addvl x14, x14, #5\n"
+ ".inst 0xc13a1842 // fmla za.s[x8, 2], { z2.s-z5.s }, z10.s\n"
+ "ld1w { z7.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc13e1860 // fmla za.s[x8, 0], { z3.s-z6.s }, z14.s\n"
+ ".inst 0xa04049ce // ld1w { z14.s-z15.s }, pn10.b/Z, [x14]\n"
+ ".inst 0xc13d1861 // fmla za.s[x8, 1], { z3.s-z6.s }, z13.s\n"
+ ".inst 0xa14149c0 // ld1w { z0.s, z8.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ "addvl x14, x14, #5\n"
+ ".inst 0xc13c1862 // fmla za.s[x8, 2], { z3.s-z6.s }, z12.s\n"
+ "ld1w { z8.s }, p1/Z, [x20]\n"
+ ".inst 0xc1301880 // fmla za.s[x8, 0], { z4.s-z7.s }, z0.s\n"
+ ".inst 0xa04049c0 // ld1w { z0.s-z1.s }, pn10.b/Z, [x14]\n"
+ ".inst 0xc13f1881 // fmla za.s[x8, 1], { z4.s-z7.s }, z15.s\n"
+ ".inst 0xa04149cc // ld1w { z12.s-z13.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ "ldr x14, [%x[args], %[offsetof_Args_weights]]\n"
+ ".inst 0xc13e1882 // fmla za.s[x8, 2], { z4.s-z7.s }, z14.s\n"
+ ".inst 0xc13c18a0 // fmla za.s[x8, 0], { z5.s-z8.s }, z12.s\n"
+ ".inst 0xa04149cc // ld1w { z12.s-z13.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ ".inst 0xc13118a1 // fmla za.s[x8, 1], { z5.s-z8.s }, z1.s\n"
+ ".inst 0xc13018a2 // fmla za.s[x8, 2], { z5.s-z8.s }, z0.s\n"
+ ".inst 0xa04049ce // ld1w { z14.s-z15.s }, pn10.b/Z, [x14]\n"
+ "addvl x14, x14, #5\n"
+ ".inst 0xa04049ca // ld1w { z10.s-z11.s }, pn10.b/Z, [x14]\n"
+ ".inst 0xa04149c8 // ld1w { z8.s-z9.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ "addvl x14, x14, #5\n"
+ "9:" // Unpadded: 1 priming loads
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
+ "ld1w { z20.s }, p1/Z, [x13]\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ "ld1w { z21.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z22.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z23.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc13d1a80 // fmla za.s[x8, 0], { z20.s-z23.s }, z13.s\n"
+ ".inst 0xc13c1a81 // fmla za.s[x8, 1], { z20.s-z23.s }, z12.s\n"
+ "ld1w { z24.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc13f1a82 // fmla za.s[x8, 2], { z20.s-z23.s }, z15.s\n"
+ "ld1w { z25.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc13e1a83 // fmla za.s[x8, 3], { z20.s-z23.s }, z14.s\n"
+ ".inst 0xa04049c4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x14]\n"
+ ".inst 0xc1391aa0 // fmla za.s[x8, 0], { z21.s-z24.s }, z9.s\n"
+ ".inst 0xa04149cc // ld1w { z12.s-z13.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ "addvl x14, x14, #5\n"
+ ".inst 0xc1381aa1 // fmla za.s[x8, 1], { z21.s-z24.s }, z8.s\n"
+ "ld1w { z26.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc13b1aa2 // fmla za.s[x8, 2], { z21.s-z24.s }, z11.s\n"
+ ".inst 0xa14149c6 // ld1w { z6.s, z14.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ ".inst 0xc13a1aa3 // fmla za.s[x8, 3], { z21.s-z24.s }, z10.s\n"
+ ".inst 0xa14049c1 // ld1w { z1.s, z9.s }, pn10.b/Z, [x14]\n"
+ "addvl x14, x14, #5\n"
+ ".inst 0xc13d1ac0 // fmla za.s[x8, 0], { z22.s-z25.s }, z13.s\n"
+ "ld1w { z27.s }, p1/Z, [x20]\n"
+ ".inst 0xc13c1ac1 // fmla za.s[x8, 1], { z22.s-z25.s }, z12.s\n"
+ ".inst 0xa04149cc // ld1w { z12.s-z13.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ ".inst 0xc1351ac2 // fmla za.s[x8, 2], { z22.s-z25.s }, z5.s\n"
+ ".inst 0xc1341ac3 // fmla za.s[x8, 3], { z22.s-z25.s }, z4.s\n"
+ ".inst 0xa04049c4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x14]\n"
+ "ldr x14, [%x[args], %[offsetof_Args_weights]]\n"
+ ".inst 0xc13e1ae0 // fmla za.s[x8, 0], { z23.s-z26.s }, z14.s\n"
+ "ld1w { z2.s }, p2/Z, [x14, #4, MUL VL]\n"
+ ".inst 0xc1361ae1 // fmla za.s[x8, 1], { z23.s-z26.s }, z6.s\n"
+ ".inst 0xc1391ae2 // fmla za.s[x8, 2], { z23.s-z26.s }, z9.s\n"
+ ".inst 0xc1311ae3 // fmla za.s[x8, 3], { z23.s-z26.s }, z1.s\n"
+ ".inst 0xc13d1b00 // fmla za.s[x8, 0], { z24.s-z27.s }, z13.s\n"
+ ".inst 0xc13c1b01 // fmla za.s[x8, 1], { z24.s-z27.s }, z12.s\n"
+ ".inst 0xa04149cc // ld1w { z12.s-z13.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ ".inst 0xc1351b02 // fmla za.s[x8, 2], { z24.s-z27.s }, z5.s\n"
+ ".inst 0xc1341b03 // fmla za.s[x8, 3], { z24.s-z27.s }, z4.s\n"
+ ".inst 0xa04049ce // ld1w { z14.s-z15.s }, pn10.b/Z, [x14]\n"
+ "addvl x14, x14, #5\n"
+ ".inst 0xa04049ca // ld1w { z10.s-z11.s }, pn10.b/Z, [x14]\n"
+ ".inst 0xa04149c8 // ld1w { z8.s-z9.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ "ld1w { z3.s }, p2/Z, [x14, #4, MUL VL]\n"
+ "addvl x14, x14, #5\n"
+ "10:" // Unpadded: 0 priming loads
+ "cbz x15, 20f\n"
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
+ "ld1w { z18.s }, p1/Z, [x13]\n"
+ "sub x15, x15, #0x1\n"
+ "ld1w { z19.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "sub x11, x11, #0x1\n"
+ "ld1w { z20.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "cmp x15, x11\n"
+ "ld1w { z21.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "csel x21, x15, x11, LT\n"
+ "ld1w { z22.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ "ld1w { z23.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "sub x11, x11, x21\n"
+ "ld1w { z24.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z25.s }, p1/Z, [x20]\n"
+ "cbz x21, 19f\n"
+ "11:" // Unpadded: Main loop
+ ".inst 0xc1321a40 // fmla za.s[x8, 0], { z18.s-z21.s }, z2.s\n"
+ "ld1w { z6.s }, p2/Z, [x14, #4, MUL VL]\n"
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
+ "subs x21, x21, #0x1\n"
+ ".inst 0xc1331a60 // fmla za.s[x8, 0], { z19.s-z22.s }, z3.s\n"
+ ".inst 0xc13d1a41 // fmla za.s[x8, 1], { z18.s-z21.s }, z13.s\n"
+ ".inst 0xc13c1a42 // fmla za.s[x8, 2], { z18.s-z21.s }, z12.s\n"
+ ".inst 0xa04149cc // ld1w { z12.s-z13.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ ".inst 0xc13f1a43 // fmla za.s[x8, 3], { z18.s-z21.s }, z15.s\n"
+ ".inst 0xc13e1a44 // fmla za.s[x8, 4], { z18.s-z21.s }, z14.s\n"
+ ".inst 0xa04049c4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x14]\n"
+ "addvl x14, x14, #5\n"
+ ".inst 0xc1361a80 // fmla za.s[x8, 0], { z20.s-z23.s }, z6.s\n"
+ "ld1w { z6.s }, p2/Z, [x14, #4, MUL VL]\n"
+ ".inst 0xc1391a61 // fmla za.s[x8, 1], { z19.s-z22.s }, z9.s\n"
+ "ld1w { z18.s }, p1/Z, [x13]\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ ".inst 0xc1381a62 // fmla za.s[x8, 2], { z19.s-z22.s }, z8.s\n"
+ ".inst 0xa04149ce // ld1w { z14.s-z15.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ ".inst 0xc13b1a63 // fmla za.s[x8, 3], { z19.s-z22.s }, z11.s\n"
+ ".inst 0xc13a1a64 // fmla za.s[x8, 4], { z19.s-z22.s }, z10.s\n"
+ ".inst 0xa14049c0 // ld1w { z0.s, z8.s }, pn10.b/Z, [x14]\n"
+ "addvl x14, x14, #5\n"
+ ".inst 0xc1361aa0 // fmla za.s[x8, 0], { z21.s-z24.s }, z6.s\n"
+ "ld1w { z2.s }, p2/Z, [x14, #4, MUL VL]\n"
+ ".inst 0xc13d1a81 // fmla za.s[x8, 1], { z20.s-z23.s }, z13.s\n"
+ "ld1w { z19.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc13c1a82 // fmla za.s[x8, 2], { z20.s-z23.s }, z12.s\n"
+ ".inst 0xa04149c6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ ".inst 0xc1351a83 // fmla za.s[x8, 3], { z20.s-z23.s }, z5.s\n"
+ ".inst 0xc1341a84 // fmla za.s[x8, 4], { z20.s-z23.s }, z4.s\n"
+ ".inst 0xa04049c4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x14]\n"
+ "ldr x14, [%x[args], %[offsetof_Args_weights]]\n"
+ ".inst 0xc1321ac0 // fmla za.s[x8, 0], { z22.s-z25.s }, z2.s\n"
+ "ld1w { z20.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc13f1aa1 // fmla za.s[x8, 1], { z21.s-z24.s }, z15.s\n"
+ "ld1w { z2.s }, p2/Z, [x14, #4, MUL VL]\n"
+ ".inst 0xc13e1aa2 // fmla za.s[x8, 2], { z21.s-z24.s }, z14.s\n"
+ ".inst 0xc1381aa3 // fmla za.s[x8, 3], { z21.s-z24.s }, z8.s\n"
+ ".inst 0xc1301aa4 // fmla za.s[x8, 4], { z21.s-z24.s }, z0.s\n"
+ "ld1w { z21.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc0060c0c // mova { z12.d-z15.d }, za.d[x8, #0]\n"
+ ".inst 0xc1b1ca0c // fclamp { z12.s-z15.s }, z16.s, z17.s\n"
+ "st1w { z12.s }, p1, [x9]\n"
+ "add x9, x9, x27, LSL #2\n"
+ ".inst 0xc1371ac1 // fmla za.s[x8, 1], { z22.s-z25.s }, z7.s\n"
+ "st1w { z13.s }, p1, [x28]\n"
+ "add x28, x28, x26, LSL #2\n"
+ ".inst 0xc1361ac2 // fmla za.s[x8, 2], { z22.s-z25.s }, z6.s\n"
+ ".inst 0xa04149cc // ld1w { z12.s-z13.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ "st1w { z14.s }, p1, [x25]\n"
+ "add x25, x25, x23, LSL #2\n"
+ ".inst 0xc1351ac3 // fmla za.s[x8, 3], { z22.s-z25.s }, z5.s\n"
+ "st1w { z15.s }, p1, [x24]\n"
+ "add x24, x24, x22, LSL #2\n"
+ ".inst 0xc1341ac4 // fmla za.s[x8, 4], { z22.s-z25.s }, z4.s\n"
+ "ld1w { z22.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "add x8, x8, #0x1\n"
+ "ld1w { z23.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc0040f84 // mova za.d[x8, #4], { z28.d-z31.d }\n"
+ ".inst 0xa04049ce // ld1w { z14.s-z15.s }, pn10.b/Z, [x14]\n"
+ "addvl x14, x14, #5\n"
+ "ld1w { z24.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xa04049ca // ld1w { z10.s-z11.s }, pn10.b/Z, [x14]\n"
+ ".inst 0xa04149c8 // ld1w { z8.s-z9.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ "ld1w { z3.s }, p2/Z, [x14, #4, MUL VL]\n"
+ "addvl x14, x14, #5\n"
+ "ld1w { z25.s }, p1/Z, [x20]\n"
+ "bgt 11b\n"
+ "b 19f\n"
+ "12:" // Padded
+ "cbz x10, 17f\n"
+ "cmp x10, #0x1\n"
+ "sub x15, x15, x10\n"
+ "beq 16f\n"
+ "cmp x10, #0x2\n"
+ "beq 15f\n"
+ "cmp x10, #0x3\n"
+ "beq 14f\n"
+ "13:" // Padded: 4 priming loads
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z19.s }, p0/Z, [x13]\n"
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z20.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z21.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1w { z22.s }, p0/Z, [x20]\n"
+ "mov x12, #0x4\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc13e1a60 // fmla za.s[x8, 0], { z19.s-z22.s }, z14.s\n"
+ ".inst 0xa14049c1 // ld1w { z1.s, z9.s }, pn10.b/Z, [x14]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z23.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc13a1a80 // fmla za.s[x8, 0], { z20.s-z23.s }, z10.s\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z24.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc1311aa0 // fmla za.s[x8, 0], { z21.s-z24.s }, z1.s\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "addvl x14, x14, #5\n"
+ "ld1w { z25.s }, p0/Z, [x20]\n"
+ ".inst 0xa04049c6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x14]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ ".inst 0xc1361ac0 // fmla za.s[x8, 0], { z22.s-z25.s }, z6.s\n"
+ "addvl x14, x14, #5\n"
+ "ld1w { z26.s }, p0/Z, [x20]\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ ".inst 0xa14049c0 // ld1w { z0.s, z8.s }, pn10.b/Z, [x14]\n"
+ "ldr x14, [%x[args], %[offsetof_Args_weights]]\n"
+ ".inst 0xc1301ae0 // fmla za.s[x8, 0], { z23.s-z26.s }, z0.s\n"
+ ".inst 0xa04049ce // ld1w { z14.s-z15.s }, pn10.b/Z, [x14]\n"
+ "addvl x14, x14, #5\n"
+ ".inst 0xa04049ca // ld1w { z10.s-z11.s }, pn10.b/Z, [x14]\n"
+ "addvl x14, x14, #5\n"
+ "14:" // Padded: 3 priming loads
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z0.s }, p0/Z, [x13]\n"
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z1.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z2.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1w { z3.s }, p0/Z, [x20]\n"
+ "mov x12, #0x4\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc13f1800 // fmla za.s[x8, 0], { z0.s-z3.s }, z15.s\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ ".inst 0xc13e1801 // fmla za.s[x8, 1], { z0.s-z3.s }, z14.s\n"
+ "ld1w { z4.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ ".inst 0xc13b1820 // fmla za.s[x8, 0], { z1.s-z4.s }, z11.s\n"
+ "ld1w { z5.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc13a1821 // fmla za.s[x8, 1], { z1.s-z4.s }, z10.s\n"
+ ".inst 0xa04049c8 // ld1w { z8.s-z9.s }, pn10.b/Z, [x14]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "addvl x14, x14, #5\n"
+ ".inst 0xc1391840 // fmla za.s[x8, 0], { z2.s-z5.s }, z9.s\n"
+ "ld1w { z6.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ ".inst 0xc1381841 // fmla za.s[x8, 1], { z2.s-z5.s }, z8.s\n"
+ ".inst 0xa04049ce // ld1w { z14.s-z15.s }, pn10.b/Z, [x14]\n"
+ "addvl x14, x14, #5\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ ".inst 0xc13f1860 // fmla za.s[x8, 0], { z3.s-z6.s }, z15.s\n"
+ "ld1w { z7.s }, p0/Z, [x20]\n"
+ ".inst 0xc13e1861 // fmla za.s[x8, 1], { z3.s-z6.s }, z14.s\n"
+ ".inst 0xa14049c3 // ld1w { z3.s, z11.s }, pn10.b/Z, [x14]\n"
+ "ldr x14, [%x[args], %[offsetof_Args_weights]]\n"
+ ".inst 0xc13b1880 // fmla za.s[x8, 0], { z4.s-z7.s }, z11.s\n"
+ ".inst 0xa04149cc // ld1w { z12.s-z13.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ ".inst 0xc1331881 // fmla za.s[x8, 1], { z4.s-z7.s }, z3.s\n"
+ ".inst 0xa04049ce // ld1w { z14.s-z15.s }, pn10.b/Z, [x14]\n"
+ "addvl x14, x14, #5\n"
+ ".inst 0xa04049ca // ld1w { z10.s-z11.s }, pn10.b/Z, [x14]\n"
+ ".inst 0xa04149c8 // ld1w { z8.s-z9.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ "addvl x14, x14, #5\n"
+ "15:" // Padded: 2 priming loads
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z19.s }, p0/Z, [x13]\n"
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z20.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z21.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1w { z22.s }, p0/Z, [x20]\n"
+ "mov x12, #0x4\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc13c1a60 // fmla za.s[x8, 0], { z19.s-z22.s }, z12.s\n"
+ ".inst 0xa04149c6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ ".inst 0xc13f1a61 // fmla za.s[x8, 1], { z19.s-z22.s }, z15.s\n"
+ "ld1w { z23.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc13e1a62 // fmla za.s[x8, 2], { z19.s-z22.s }, z14.s\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z24.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc1381a80 // fmla za.s[x8, 0], { z20.s-z23.s }, z8.s\n"
+ ".inst 0xa14049c0 // ld1w { z0.s, z8.s }, pn10.b/Z, [x14]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "addvl x14, x14, #5\n"
+ ".inst 0xc13b1a81 // fmla za.s[x8, 1], { z20.s-z23.s }, z11.s\n"
+ "ld1w { z25.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ ".inst 0xc13a1a82 // fmla za.s[x8, 2], { z20.s-z23.s }, z10.s\n"
+ ".inst 0xa14049c2 // ld1w { z2.s, z10.s }, pn10.b/Z, [x14]\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ ".inst 0xc1361aa0 // fmla za.s[x8, 0], { z21.s-z24.s }, z6.s\n"
+ ".inst 0xa14149c4 // ld1w { z4.s, z12.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ "addvl x14, x14, #5\n"
+ ".inst 0xc1381aa1 // fmla za.s[x8, 1], { z21.s-z24.s }, z8.s\n"
+ "ld1w { z26.s }, p0/Z, [x20]\n"
+ ".inst 0xc1301aa2 // fmla za.s[x8, 2], { z21.s-z24.s }, z0.s\n"
+ ".inst 0xa04049c6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x14]\n"
+ ".inst 0xc1341ac0 // fmla za.s[x8, 0], { z22.s-z25.s }, z4.s\n"
+ ".inst 0xa14149c3 // ld1w { z3.s, z11.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ "ldr x14, [%x[args], %[offsetof_Args_weights]]\n"
+ ".inst 0xc13a1ac1 // fmla za.s[x8, 1], { z22.s-z25.s }, z10.s\n"
+ ".inst 0xc1321ac2 // fmla za.s[x8, 2], { z22.s-z25.s }, z2.s\n"
+ ".inst 0xc1331ae0 // fmla za.s[x8, 0], { z23.s-z26.s }, z3.s\n"
+ ".inst 0xa04149cc // ld1w { z12.s-z13.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ ".inst 0xc1371ae1 // fmla za.s[x8, 1], { z23.s-z26.s }, z7.s\n"
+ ".inst 0xc1361ae2 // fmla za.s[x8, 2], { z23.s-z26.s }, z6.s\n"
+ ".inst 0xa04049ce // ld1w { z14.s-z15.s }, pn10.b/Z, [x14]\n"
+ "addvl x14, x14, #5\n"
+ ".inst 0xa04049ca // ld1w { z10.s-z11.s }, pn10.b/Z, [x14]\n"
+ ".inst 0xa04149c8 // ld1w { z8.s-z9.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ "addvl x14, x14, #5\n"
+ "16:" // Padded: 1 priming loads
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z18.s }, p0/Z, [x13]\n"
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z19.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z20.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1w { z21.s }, p0/Z, [x20]\n"
+ "mov x12, #0x4\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc13d1a40 // fmla za.s[x8, 0], { z18.s-z21.s }, z13.s\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ ".inst 0xc13c1a41 // fmla za.s[x8, 1], { z18.s-z21.s }, z12.s\n"
+ "ld1w { z22.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc13f1a42 // fmla za.s[x8, 2], { z18.s-z21.s }, z15.s\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z23.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc13e1a43 // fmla za.s[x8, 3], { z18.s-z21.s }, z14.s\n"
+ ".inst 0xa04049ce // ld1w { z14.s-z15.s }, pn10.b/Z, [x14]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ ".inst 0xc1391a60 // fmla za.s[x8, 0], { z19.s-z22.s }, z9.s\n"
+ ".inst 0xa04149cc // ld1w { z12.s-z13.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ "addvl x14, x14, #5\n"
+ ".inst 0xc1381a61 // fmla za.s[x8, 1], { z19.s-z22.s }, z8.s\n"
+ "ld1w { z24.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ ".inst 0xc13b1a62 // fmla za.s[x8, 2], { z19.s-z22.s }, z11.s\n"
+ ".inst 0xa14149c0 // ld1w { z0.s, z8.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ ".inst 0xc13a1a63 // fmla za.s[x8, 3], { z19.s-z22.s }, z10.s\n"
+ ".inst 0xa14049c1 // ld1w { z1.s, z9.s }, pn10.b/Z, [x14]\n"
+ "addvl x14, x14, #5\n"
+ ".inst 0xc13d1a80 // fmla za.s[x8, 0], { z20.s-z23.s }, z13.s\n"
+ "ld1w { z25.s }, p0/Z, [x20]\n"
+ ".inst 0xc13c1a81 // fmla za.s[x8, 1], { z20.s-z23.s }, z12.s\n"
+ ".inst 0xa04149cc // ld1w { z12.s-z13.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ ".inst 0xc13f1a82 // fmla za.s[x8, 2], { z20.s-z23.s }, z15.s\n"
+ ".inst 0xc13e1a83 // fmla za.s[x8, 3], { z20.s-z23.s }, z14.s\n"
+ ".inst 0xa04049ca // ld1w { z10.s-z11.s }, pn10.b/Z, [x14]\n"
+ "ldr x14, [%x[args], %[offsetof_Args_weights]]\n"
+ ".inst 0xc1381aa0 // fmla za.s[x8, 0], { z21.s-z24.s }, z8.s\n"
+ "ld1w { z2.s }, p2/Z, [x14, #4, MUL VL]\n"
+ ".inst 0xc1301aa1 // fmla za.s[x8, 1], { z21.s-z24.s }, z0.s\n"
+ ".inst 0xc1391aa2 // fmla za.s[x8, 2], { z21.s-z24.s }, z9.s\n"
+ ".inst 0xc1311aa3 // fmla za.s[x8, 3], { z21.s-z24.s }, z1.s\n"
+ ".inst 0xc13d1ac0 // fmla za.s[x8, 0], { z22.s-z25.s }, z13.s\n"
+ ".inst 0xc13c1ac1 // fmla za.s[x8, 1], { z22.s-z25.s }, z12.s\n"
+ ".inst 0xa04149cc // ld1w { z12.s-z13.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ ".inst 0xc13b1ac2 // fmla za.s[x8, 2], { z22.s-z25.s }, z11.s\n"
+ ".inst 0xc13a1ac3 // fmla za.s[x8, 3], { z22.s-z25.s }, z10.s\n"
+ ".inst 0xa04049ce // ld1w { z14.s-z15.s }, pn10.b/Z, [x14]\n"
+ "addvl x14, x14, #5\n"
+ ".inst 0xa04049ca // ld1w { z10.s-z11.s }, pn10.b/Z, [x14]\n"
+ ".inst 0xa04149c8 // ld1w { z8.s-z9.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ "ld1w { z3.s }, p2/Z, [x14, #4, MUL VL]\n"
+ "addvl x14, x14, #5\n"
+ "17:" // Padded: 0 priming loads
+ "cbz x15, 20f\n"
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z18.s }, p0/Z, [x13]\n"
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z19.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z20.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1w { z21.s }, p0/Z, [x20]\n"
+ "mov x12, #0x4\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z22.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "sub x15, x15, #0x1\n"
+ "ld1w { z23.s }, p0/Z, [x20]\n"
+ "sub x11, x11, #0x1\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "cmp x15, x11\n"
+ "ld1w { z24.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1w { z25.s }, p0/Z, [x20]\n"
+ "csel x21, x15, x11, LT\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ "sub x11, x11, x21\n"
+ "cbz x21, 19f\n"
+ "18:" // Padded: Main loop
+ ".inst 0xc1321a40 // fmla za.s[x8, 0], { z18.s-z21.s }, z2.s\n"
+ "ld1w { z0.s }, p2/Z, [x14, #4, MUL VL]\n"
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ ".inst 0xc1331a60 // fmla za.s[x8, 0], { z19.s-z22.s }, z3.s\n"
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
+ "subs x21, x21, #0x1\n"
+ ".inst 0xc13d1a41 // fmla za.s[x8, 1], { z18.s-z21.s }, z13.s\n"
+ ".inst 0xc13c1a42 // fmla za.s[x8, 2], { z18.s-z21.s }, z12.s\n"
+ ".inst 0xa04149c2 // ld1w { z2.s-z3.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ ".inst 0xc13f1a43 // fmla za.s[x8, 3], { z18.s-z21.s }, z15.s\n"
+ ".inst 0xc13e1a44 // fmla za.s[x8, 4], { z18.s-z21.s }, z14.s\n"
+ ".inst 0xa04049c4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x14]\n"
+ "addvl x14, x14, #5\n"
+ ".inst 0xc1301a80 // fmla za.s[x8, 0], { z20.s-z23.s }, z0.s\n"
+ "ld1w { z12.s }, p2/Z, [x14, #4, MUL VL]\n"
+ ".inst 0xc1391a61 // fmla za.s[x8, 1], { z19.s-z22.s }, z9.s\n"
+ "ld1w { z18.s }, p0/Z, [x13]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ ".inst 0xc1381a62 // fmla za.s[x8, 2], { z19.s-z22.s }, z8.s\n"
+ ".inst 0xa14149c0 // ld1w { z0.s, z8.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ ".inst 0xc13b1a63 // fmla za.s[x8, 3], { z19.s-z22.s }, z11.s\n"
+ ".inst 0xc13a1a64 // fmla za.s[x8, 4], { z19.s-z22.s }, z10.s\n"
+ ".inst 0xa04049c6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x14]\n"
+ "addvl x14, x14, #5\n"
+ ".inst 0xc13c1aa0 // fmla za.s[x8, 0], { z21.s-z24.s }, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x14, #4, MUL VL]\n"
+ ".inst 0xc1331a81 // fmla za.s[x8, 1], { z20.s-z23.s }, z3.s\n"
+ "ld1w { z19.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ ".inst 0xc1321a82 // fmla za.s[x8, 2], { z20.s-z23.s }, z2.s\n"
+ ".inst 0xa14149c3 // ld1w { z3.s, z11.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ ".inst 0xc1351a83 // fmla za.s[x8, 3], { z20.s-z23.s }, z5.s\n"
+ ".inst 0xc1341a84 // fmla za.s[x8, 4], { z20.s-z23.s }, z4.s\n"
+ "ld1w { z20.s }, p0/Z, [x20]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "mov x12, #0x4\n"
+ ".inst 0xc13c1ac0 // fmla za.s[x8, 0], { z22.s-z25.s }, z12.s\n"
+ ".inst 0xa04049ce // ld1w { z14.s-z15.s }, pn10.b/Z, [x14]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ldr x14, [%x[args], %[offsetof_Args_weights]]\n"
+ ".inst 0xc1381aa1 // fmla za.s[x8, 1], { z21.s-z24.s }, z8.s\n"
+ "ld1w { z2.s }, p2/Z, [x14, #4, MUL VL]\n"
+ ".inst 0xc1301aa2 // fmla za.s[x8, 2], { z21.s-z24.s }, z0.s\n"
+ ".inst 0xc1371aa3 // fmla za.s[x8, 3], { z21.s-z24.s }, z7.s\n"
+ ".inst 0xc1361aa4 // fmla za.s[x8, 4], { z21.s-z24.s }, z6.s\n"
+ "ld1w { z21.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ ".inst 0xc0060c04 // mova { z4.d-z7.d }, za.d[x8, #0]\n"
+ ".inst 0xc1b1ca04 // fclamp { z4.s-z7.s }, z16.s, z17.s\n"
+ "st1w { z4.s }, p1, [x9]\n"
+ "add x9, x9, x27, LSL #2\n"
+ ".inst 0xc13b1ac1 // fmla za.s[x8, 1], { z22.s-z25.s }, z11.s\n"
+ "st1w { z5.s }, p1, [x28]\n"
+ "add x28, x28, x26, LSL #2\n"
+ ".inst 0xc1331ac2 // fmla za.s[x8, 2], { z22.s-z25.s }, z3.s\n"
+ ".inst 0xa04149cc // ld1w { z12.s-z13.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ "st1w { z6.s }, p1, [x25]\n"
+ "add x25, x25, x23, LSL #2\n"
+ ".inst 0xc13f1ac3 // fmla za.s[x8, 3], { z22.s-z25.s }, z15.s\n"
+ "st1w { z7.s }, p1, [x24]\n"
+ "add x24, x24, x22, LSL #2\n"
+ ".inst 0xc13e1ac4 // fmla za.s[x8, 4], { z22.s-z25.s }, z14.s\n"
+ "ld1w { z22.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z23.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ ".inst 0xa04049ce // ld1w { z14.s-z15.s }, pn10.b/Z, [x14]\n"
+ "addvl x14, x14, #5\n"
+ "add x8, x8, #0x1\n"
+ ".inst 0xc0040f84 // mova za.d[x8, #4], { z28.d-z31.d }\n"
+ "ld1w { z24.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ ".inst 0xa04049ca // ld1w { z10.s-z11.s }, pn10.b/Z, [x14]\n"
+ ".inst 0xa04149c8 // ld1w { z8.s-z9.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ "ld1w { z3.s }, p2/Z, [x14, #4, MUL VL]\n"
+ "addvl x14, x14, #5\n"
+ "ld1w { z25.s }, p0/Z, [x20]\n"
+ "bgt 18b\n"
+ "19:" // Main loop tail
+ ".inst 0xc1321a40 // fmla za.s[x8, 0], { z18.s-z21.s }, z2.s\n"
+ "ld1w { z6.s }, p2/Z, [x14, #4, MUL VL]\n"
+ ".inst 0xc1331a60 // fmla za.s[x8, 0], { z19.s-z22.s }, z3.s\n"
+ ".inst 0xc13d1a41 // fmla za.s[x8, 1], { z18.s-z21.s }, z13.s\n"
+ ".inst 0xc13c1a42 // fmla za.s[x8, 2], { z18.s-z21.s }, z12.s\n"
+ ".inst 0xa04149c4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ ".inst 0xc13f1a43 // fmla za.s[x8, 3], { z18.s-z21.s }, z15.s\n"
+ ".inst 0xc13e1a44 // fmla za.s[x8, 4], { z18.s-z21.s }, z14.s\n"
+ ".inst 0xa04049c2 // ld1w { z2.s-z3.s }, pn10.b/Z, [x14]\n"
+ "addvl x14, x14, #5\n"
+ ".inst 0xc1361a80 // fmla za.s[x8, 0], { z20.s-z23.s }, z6.s\n"
+ "ld1w { z7.s }, p2/Z, [x14, #4, MUL VL]\n"
+ ".inst 0xc1391a61 // fmla za.s[x8, 1], { z19.s-z22.s }, z9.s\n"
+ ".inst 0xc1381a62 // fmla za.s[x8, 2], { z19.s-z22.s }, z8.s\n"
+ ".inst 0xa14149c0 // ld1w { z0.s, z8.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ ".inst 0xc13b1a63 // fmla za.s[x8, 3], { z19.s-z22.s }, z11.s\n"
+ ".inst 0xc13a1a64 // fmla za.s[x8, 4], { z19.s-z22.s }, z10.s\n"
+ ".inst 0xa04049ca // ld1w { z10.s-z11.s }, pn10.b/Z, [x14]\n"
+ "addvl x14, x14, #5\n"
+ ".inst 0xc1371aa0 // fmla za.s[x8, 0], { z21.s-z24.s }, z7.s\n"
+ "ld1w { z1.s }, p2/Z, [x14, #4, MUL VL]\n"
+ ".inst 0xc1351a81 // fmla za.s[x8, 1], { z20.s-z23.s }, z5.s\n"
+ ".inst 0xc1341a82 // fmla za.s[x8, 2], { z20.s-z23.s }, z4.s\n"
+ ".inst 0xa04149cc // ld1w { z12.s-z13.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ ".inst 0xc1331a83 // fmla za.s[x8, 3], { z20.s-z23.s }, z3.s\n"
+ ".inst 0xc1321a84 // fmla za.s[x8, 4], { z20.s-z23.s }, z2.s\n"
+ ".inst 0xa04049c2 // ld1w { z2.s-z3.s }, pn10.b/Z, [x14]\n"
+ ".inst 0xc1311ac0 // fmla za.s[x8, 0], { z22.s-z25.s }, z1.s\n"
+ ".inst 0xc1381aa1 // fmla za.s[x8, 1], { z21.s-z24.s }, z8.s\n"
+ ".inst 0xc1301aa2 // fmla za.s[x8, 2], { z21.s-z24.s }, z0.s\n"
+ ".inst 0xc13b1aa3 // fmla za.s[x8, 3], { z21.s-z24.s }, z11.s\n"
+ ".inst 0xc13a1aa4 // fmla za.s[x8, 4], { z21.s-z24.s }, z10.s\n"
+ ".inst 0xc0060c04 // mova { z4.d-z7.d }, za.d[x8, #0]\n"
+ ".inst 0xc1b1ca04 // fclamp { z4.s-z7.s }, z16.s, z17.s\n"
+ "st1w { z4.s }, p1, [x9]\n"
+ "add x9, x9, x27, LSL #2\n"
+ ".inst 0xc13d1ac1 // fmla za.s[x8, 1], { z22.s-z25.s }, z13.s\n"
+ "st1w { z5.s }, p1, [x28]\n"
+ "add x28, x28, x26, LSL #2\n"
+ ".inst 0xc13c1ac2 // fmla za.s[x8, 2], { z22.s-z25.s }, z12.s\n"
+ "st1w { z6.s }, p1, [x25]\n"
+ "add x25, x25, x23, LSL #2\n"
+ ".inst 0xc1331ac3 // fmla za.s[x8, 3], { z22.s-z25.s }, z3.s\n"
+ "st1w { z7.s }, p1, [x24]\n"
+ "add x24, x24, x22, LSL #2\n"
+ ".inst 0xc1321ac4 // fmla za.s[x8, 4], { z22.s-z25.s }, z2.s\n"
+ "add x8, x8, #0x1\n"
+ ".inst 0xc0040f84 // mova za.d[x8, #4], { z28.d-z31.d }\n"
+ "20:" // Main loop skip tail
+ "cbz x11, 22f\n"
+ "21:" // Right padding loop
+ ".inst 0xc0060c00 // mova { z0.d-z3.d }, za.d[x8, #0]\n"
+ "add x8, x8, #0x1\n"
+ "subs x11, x11, #0x1\n"
+ ".inst 0xc1b1ca00 // fclamp { z0.s-z3.s }, z16.s, z17.s\n"
+ "st1w { z0.s }, p1, [x9]\n"
+ "add x9, x9, x27, LSL #2\n"
+ ".inst 0xc0040f84 // mova za.d[x8, #4], { z28.d-z31.d }\n"
+ "st1w { z1.s }, p1, [x28]\n"
+ "add x28, x28, x26, LSL #2\n"
+ "st1w { z2.s }, p1, [x25]\n"
+ "add x25, x25, x23, LSL #2\n"
+ "st1w { z3.s }, p1, [x24]\n"
+ "add x24, x24, x22, LSL #2\n"
+ "bgt 21b\n"
+ "22:" // End
+ "ldr x20, [%x[args], %[offsetof_Args_weights]]\n"
+ "incb x20, ALL, MUL #16\n"
+ "incb x20, ALL, MUL #9\n"
+ "str x20, [%x[args], %[offsetof_Args_weights]]\n"
+ "ldr x21, [%x[args], %[offsetof_Args_ld_in_vl]]\n"
+ "incw x16\n"
+ "whilelt p1.s, x16, x17\n"
+ "ldr x20, [%x[args], %[offsetof_Args_inptr]]\n"
+ "add x20, x20, x21, LSL #2\n"
+ "str x20, [%x[args], %[offsetof_Args_inptr]]\n"
+ "ldr x25, [%x[args], %[offsetof_Args_outptrs]]\n"
+ "ldr x24, [%x[args], %[offsetof_Args_ld_out_vls]]\n"
+ "ldp x23, x22, [x25, #0x0]\n"
+ "ldp x21, x20, [x24, #0x0]\n"
+ "add x23, x23, x21, LSL #2\n"
+ "add x22, x22, x20, LSL #2\n"
+ "stp x23, x22, [x25, #0x0]\n"
+ "ldp x23, x22, [x25, #0x10]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x23, x23, x21, LSL #2\n"
+ "add x22, x22, x20, LSL #2\n"
+ "stp x23, x22, [x25, #0x10]\n"
+ "b.any 1b\n"
+ ".inst 0xd503467f // SMSTOP\n"
+ :
+ : [args] "r" (&args), [ld_in_col] "r" (ld_in_col), [ld_in_row] "r" (ld_in_row), [offsetof_Args_bias] "I" (offsetof(Args, bias)), [offsetof_Args_clamp_max] "I" (offsetof(Args, clamp_max)), [offsetof_Args_clamp_min] "I" (offsetof(Args, clamp_min)), [offsetof_Args_current_channel] "I" (offsetof(Args, current_channel)), [offsetof_Args_inptr] "I" (offsetof(Args, inptr)), [offsetof_Args_input_cols] "I" (offsetof(Args, input_cols)), [offsetof_Args_ld_in_vl] "I" (offsetof(Args, ld_in_vl)), [offsetof_Args_ld_out_cols] "I" (offsetof(Args, ld_out_cols)), [offsetof_Args_ld_out_vls] "I" (offsetof(Args, ld_out_vls)), [offsetof_Args_n_channels] "I" (offsetof(Args, n_channels)), [offsetof_Args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_Args_output_cols] "I" (offsetof(Args, output_cols)), [offsetof_Args_pad_bottom] "I" (offsetof(Args, pad_bottom)), [offsetof_Args_pad_left] "I" (offsetof(Args, pad_left)), [offsetof_Args_pad_top] "I" (offsetof(Args, pad_top)), [offsetof_Args_weights] "I" (offsetof(Args, weights))
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME2)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_planar_5x5_s2_4rows_mla_za.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_planar_5x5_s2_4rows_mla_za.hpp
new file mode 100644
index 0000000000..7412c7b57c
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_planar_5x5_s2_4rows_mla_za.hpp
@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_conv/depthwise/depthwise_planar.hpp"
+
+namespace arm_conv {
+namespace depthwise {
+
+void sme2_fp32_planar_5x5_s2_4rows_mla_za_impl(  // SME2 assembly kernel: FP32 planar depthwise conv, 5x5 window, stride 2, 4 output rows per pass (defined in generic.cpp)
+  const float *inptr,  // base pointer of the input tile
+  size_t ld_in_row,  // input row stride, in float elements (kernel scales by LSL #2 to bytes)
+  size_t ld_in_col,  // input column stride, in float elements
+  size_t ld_in_vl,  // input stride between vector-length channel blocks, in elements; advances inptr each channel-loop iteration
+  unsigned int pad_top,  // rows of implicit zero padding above the input
+  unsigned int valid_input_rows,  // readable input rows; kernel derives bottom padding from pad_top + this
+  unsigned int pad_left,  // columns of implicit zero padding to the left
+  unsigned int valid_input_cols,  // readable input columns
+  const float *weights,  // 5x5 per-channel filter weights (packed layout expected by this kernel)
+  const float *bias,  // per-channel bias; may be nullptr, in which case the kernel substitutes zero
+  float **outptrs,  // one output pointer per output row
+  const size_t *outlds,  // per-output-row column strides, in elements
+  const size_t *outvllds,  // per-output-row strides between vector-length blocks, in elements
+  unsigned int output_cols,  // number of output columns to produce
+  unsigned int start_channel,  // index of the first channel handled by this call; used to offset the bias load
+  unsigned int valid_channels,  // total channel count bounding the channel loop (whilelt predicate)
+  float act_min,  // lower activation clamp bound, applied via fclamp before each store
+  float act_max  // upper activation clamp bound
+);
+
+class sme2_fp32_planar_5x5_s2_4rows_mla_za : public PlanarStrategy<float, float>  // strategy wrapper exposing the kernel above to the planar depthwise dispatcher
+{
+  using Parent = PlanarStrategy<float, float>;
+
+  public:
+  using return_type = float;
+  constexpr static auto output_rows = 4u;  // output rows produced per kernel invocation
+  constexpr static auto kernel_rows = 5u, kernel_cols = 5u;  // 5x5 depthwise filter
+  constexpr static auto stride_rows = 2u, stride_cols = 2u;  // stride-2 in both dimensions
+  constexpr static auto vl_type = arm_gemm::VLType::SME;  // kernel operates at the SME vector length
+
+  sme2_fp32_planar_5x5_s2_4rows_mla_za(const CPUInfo *)  // CPUInfo unused: shape parameters are fixed at compile time
+  : Parent(kernel_rows, kernel_cols, stride_rows, stride_cols, output_rows, vl_type)
+  {
+  }
+
+  typename Parent::KernelType get_kernel(void) const override
+  {
+    return sme2_fp32_planar_5x5_s2_4rows_mla_za_impl;  // hand the dispatcher the asm routine declared above
+  }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_planar_5x5_s2_4rows_mla_za/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_planar_5x5_s2_4rows_mla_za/generic.cpp
new file mode 100644
index 0000000000..81ad8e5833
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_planar_5x5_s2_4rows_mla_za/generic.cpp
@@ -0,0 +1,1172 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if defined(ARM_COMPUTE_ENABLE_SME2)
+
+#include <algorithm>
+#include <cstddef>
+
+namespace arm_conv {
+namespace depthwise {
+
+void sme2_fp32_planar_5x5_s2_4rows_mla_za_impl(
+ const float *inptr,
+ size_t ld_in_row,
+ size_t ld_in_col,
+ size_t ld_in_vl,
+ unsigned int pad_top,
+ unsigned int valid_input_rows,
+ unsigned int pad_left,
+ unsigned int valid_input_cols,
+ const float *weights,
+ const float *bias,
+ float **outptrs,
+ const size_t *outlds,
+ const size_t *outvllds,
+ unsigned int output_cols,
+ unsigned int start_channel,
+ unsigned int valid_channels,
+ float act_min,
+ float act_max
+)
+{
+ struct Args
+ {
+ const float *inptr;
+ size_t ld_in_vl;
+ long unsigned int pad_top, pad_bottom, pad_left;
+ const float *weights;
+ const float *bias;
+ long unsigned int input_cols, output_cols;
+ float **outptrs;
+ const size_t *ld_out_cols;
+ const size_t *ld_out_vls;
+ long unsigned int current_channel, n_channels;
+ float clamp_min, clamp_max;
+ };
+
+ Args args = { inptr, ld_in_vl, pad_top, 11u - std::min(11u, pad_top + valid_input_rows), pad_left, weights, bias, valid_input_cols, output_cols, outptrs, outlds, outvllds, start_channel, valid_channels, act_min, act_max };
+
+ __asm__ __volatile__(
+ "ldr x5, [%x[args], %[offsetof_Args_pad_bottom]]\n"
+ "mov x20, #0xb\n"
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "sub x20, x20, x5\n"
+ "ldr x6, [%x[args], %[offsetof_Args_pad_top]]\n"
+ "ptrue p2.b\n"
+ ".inst 0x25207812 // ptrue pn10.b\n"
+ "ld1rw { z2.s }, p2/Z, [%x[args], %[offsetof_Args_clamp_min]]\n"
+ "ldr x7, [%x[args], %[offsetof_Args_n_channels]]\n"
+ "whilelt p1.s, XZR, x7\n"
+ "whilelt p9.s, XZR, x20\n"
+ "ld1rw { z3.s }, p2/Z, [%x[args], %[offsetof_Args_clamp_max]]\n"
+ "whilelt p8.s, XZR, x6\n"
+ "eor p8.b, p2/Z, p8.b, p9.b\n"
+ "ldr x17, [%x[args], %[offsetof_Args_current_channel]]\n"
+ "1:" // Channel loop
+ "ldr x20, [%x[args], %[offsetof_Args_bias]]\n"
+ "fmov z28.s, #0x0\n"
+ "cbz x20, 2f\n"
+ "ld1w { z28.s }, p1/Z, [x20, x17, LSL #2]\n"
+ "2:" // Load bias: Done
+ "ldr x16, [%x[args], %[offsetof_Args_input_cols]]\n"
+ "sub x20, x16, #0x1\n"
+ "orr x23, x20, %x[ld_in_col], LSL #18\n"
+ "mov z29.d, z28.d\n"
+ "ldr x15, [%x[args], %[offsetof_Args_weights]]\n"
+ ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
+ "orr x23, x7, x23, LSL #20\n"
+ "mov x22, #0xb\n"
+ "ldr x14, [%x[args], %[offsetof_Args_inptr]]\n"
+ ".inst 0xa04149ea // ld1w { z10.s-z11.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ "add x21, x6, x5\n"
+ "lsl x20, %x[ld_in_row], #0x2\n"
+ "ld1w { z9.s }, p2/Z, [x15, #4, MUL VL]\n"
+ "addvl x15, x15, #5\n"
+ "mov z30.d, z28.d\n"
+ "mov z31.d, z28.d\n"
+ ".inst 0xa14049e7 // ld1w { z7.s, z15.s }, pn10.b/Z, [x15]\n"
+ "mov x8, #0x0\n"
+ "ldr x13, [%x[args], %[offsetof_Args_output_cols]]\n"
+ "lsl x23, x23, #0x2\n"
+ ".inst 0xa04149e0 // ld1w { z0.s-z1.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ "sub x22, x22, x21\n"
+ "madd x20, x20, x6, x14\n"
+ "ld1w { z6.s }, p2/Z, [x15, #4, MUL VL]\n"
+ "addvl x15, x15, #5\n"
+ "3:" // Issue prefetches
+ "subs x22, x22, #0x1\n"
+ ".inst 0xf8b74a9c // rprfm pldstrm, x23, [x20]\n"
+ "add x20, x20, %x[ld_in_col], LSL #2\n"
+ "bgt 3b\n"
+ "ldr x23, [%x[args], %[offsetof_Args_outptrs]]\n"
+ "lsl x20, %x[ld_in_row], #0x2\n"
+ "msub x14, x6, x20, x14\n"
+ ".inst 0xc0040f80 // mova za.d[x8, #0], { z28.d-z31.d }\n"
+ "ldr x20, [%x[args], %[offsetof_Args_ld_out_cols]]\n"
+ ".inst 0xc0040f81 // mova za.d[x8, #1], { z28.d-z31.d }\n"
+ "mov x22, #0x4\n"
+ "ldp x11, x10, [x23], #0x10\n"
+ ".inst 0xc0040f82 // mova za.d[x8, #2], { z28.d-z31.d }\n"
+ "ldp x9, x28, [x20], #0x10\n"
+ ".inst 0xc0040f83 // mova za.d[x8, #3], { z28.d-z31.d }\n"
+ "ldr x21, [%x[args], %[offsetof_Args_pad_left]]\n"
+ ".inst 0xc0040f84 // mova za.d[x8, #4], { z28.d-z31.d }\n"
+ "ldp x27, x26, [x23], #0x10\n"
+ "ldp x25, x24, [x20], #0x10\n"
+ "cbz x21, 5f\n"
+ "cmp x21, x22\n"
+ "csel x20, x21, x22, LT\n"
+ "sub x21, x21, x20\n"
+ "sub x22, x22, x20\n"
+ "cbz x21, 5f\n"
+ ".inst 0xc0060c10 // mova { z16.d-z19.d }, za.d[x8, #0]\n"
+ "and x22, x21, #0x1\n"
+ "add x21, x21, #0x1\n"
+ ".inst 0xc1a3c850 // fclamp { z16.s-z19.s }, z2.s, z3.s\n"
+ "lsr x21, x21, #0x1\n"
+ "sub x13, x13, x21\n"
+ "4:" // Left padding
+ "subs x21, x21, #0x1\n"
+ "st1w { z16.s }, p1, [x11]\n"
+ "add x11, x11, x9, LSL #2\n"
+ "st1w { z17.s }, p1, [x10]\n"
+ "add x10, x10, x28, LSL #2\n"
+ "st1w { z18.s }, p1, [x27]\n"
+ "add x27, x27, x25, LSL #2\n"
+ "st1w { z19.s }, p1, [x26]\n"
+ "add x26, x26, x24, LSL #2\n"
+ "bgt 4b\n"
+ "5:" // Left padding: End
+ "adds XZR, x6, x5\n"
+ "bne 12f\n"
+ "cbz x22, 10f\n"
+ "cmp x22, #0x1\n"
+ "sub x16, x16, x22\n"
+ "beq 9f\n"
+ "cmp x22, #0x2\n"
+ "beq 8f\n"
+ "cmp x22, #0x3\n"
+ "beq 7f\n"
+ "6:" // Unpadded: 4 priming loads
+ "add x20, x14, %x[ld_in_row], LSL #2\n"
+ "ld1w { z9.s }, p1/Z, [x14]\n"
+ "add x14, x14, %x[ld_in_col], LSL #2\n"
+ "ld1w { z20.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z10.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z21.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z11.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z22.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z12.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc1341920 // fmla za.s[x8, 0], { z9.s-z12.s }, z4.s\n"
+ "ld1w { z23.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc1371a80 // fmla za.s[x8, 0], { z20.s-z23.s }, z7.s\n"
+ "ld1w { z13.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
+ "addvl x15, x15, #5\n"
+ ".inst 0xc1341940 // fmla za.s[x8, 0], { z10.s-z13.s }, z4.s\n"
+ "ld1w { z24.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xa04049e0 // ld1w { z0.s-z1.s }, pn10.b/Z, [x15]\n"
+ "addvl x15, x15, #5\n"
+ ".inst 0xc1301aa0 // fmla za.s[x8, 0], { z21.s-z24.s }, z0.s\n"
+ "ld1w { z14.s }, p1/Z, [x20]\n"
+ ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
+ "ldr x15, [%x[args], %[offsetof_Args_weights]]\n"
+ ".inst 0xc1341960 // fmla za.s[x8, 0], { z11.s-z14.s }, z4.s\n"
+ ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
+ "addvl x15, x15, #5\n"
+ ".inst 0xa14049e7 // ld1w { z7.s, z15.s }, pn10.b/Z, [x15]\n"
+ "addvl x15, x15, #5\n"
+ "7:" // Unpadded: 3 priming loads
+ "add x20, x14, %x[ld_in_row], LSL #2\n"
+ "ld1w { z22.s }, p1/Z, [x14]\n"
+ "add x14, x14, %x[ld_in_col], LSL #2\n"
+ "ld1w { z7.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z23.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z8.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z24.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z9.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z25.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc1351ac0 // fmla za.s[x8, 0], { z22.s-z25.s }, z5.s\n"
+ "ld1w { z10.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc13f18e0 // fmla za.s[x8, 0], { z7.s-z10.s }, z15.s\n"
+ "ld1w { z26.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xa04049ee // ld1w { z14.s-z15.s }, pn10.b/Z, [x15]\n"
+ "addvl x15, x15, #5\n"
+ ".inst 0xc13f1ae0 // fmla za.s[x8, 0], { z23.s-z26.s }, z15.s\n"
+ "ld1w { z11.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xa04049e6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x15]\n"
+ "addvl x15, x15, #5\n"
+ ".inst 0xc1371900 // fmla za.s[x8, 0], { z8.s-z11.s }, z7.s\n"
+ "ld1w { z27.s }, p1/Z, [x20]\n"
+ ".inst 0xa04049ea // ld1w { z10.s-z11.s }, pn10.b/Z, [x15]\n"
+ "ldr x15, [%x[args], %[offsetof_Args_weights]]\n"
+ ".inst 0xc13b1b00 // fmla za.s[x8, 0], { z24.s-z27.s }, z11.s\n"
+ ".inst 0xa14049e4 // ld1w { z4.s, z12.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xa04149ea // ld1w { z10.s-z11.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ "addvl x15, x15, #5\n"
+ ".inst 0xa14049e7 // ld1w { z7.s, z15.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xa04149e0 // ld1w { z0.s-z1.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ "addvl x15, x15, #5\n"
+ "8:" // Unpadded: 2 priming loads
+ "add x20, x14, %x[ld_in_row], LSL #2\n"
+ "ld1w { z19.s }, p1/Z, [x14]\n"
+ "add x14, x14, %x[ld_in_col], LSL #2\n"
+ "ld1w { z14.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z20.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z15.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z21.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z16.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z22.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc13a1a60 // fmla za.s[x8, 0], { z19.s-z22.s }, z10.s\n"
+ ".inst 0xc1341a61 // fmla za.s[x8, 1], { z19.s-z22.s }, z4.s\n"
+ "ld1w { z17.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc13019c0 // fmla za.s[x8, 0], { z14.s-z17.s }, z0.s\n"
+ "ld1w { z23.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc13719c1 // fmla za.s[x8, 1], { z14.s-z17.s }, z7.s\n"
+ ".inst 0xa04049e8 // ld1w { z8.s-z9.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xa04149ea // ld1w { z10.s-z11.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ "addvl x15, x15, #5\n"
+ ".inst 0xc13a1a80 // fmla za.s[x8, 0], { z20.s-z23.s }, z10.s\n"
+ ".inst 0xc1381a81 // fmla za.s[x8, 1], { z20.s-z23.s }, z8.s\n"
+ "ld1w { z18.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xa04049e6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xc13619e1 // fmla za.s[x8, 1], { z15.s-z18.s }, z6.s\n"
+ ".inst 0xa04149e8 // ld1w { z8.s-z9.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ "addvl x15, x15, #5\n"
+ ".inst 0xc13819e0 // fmla za.s[x8, 0], { z15.s-z18.s }, z8.s\n"
+ "ld1w { z24.s }, p1/Z, [x20]\n"
+ ".inst 0xa04049ee // ld1w { z14.s-z15.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xc13e1aa1 // fmla za.s[x8, 1], { z21.s-z24.s }, z14.s\n"
+ ".inst 0xa14149e7 // ld1w { z7.s, z15.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ "ldr x15, [%x[args], %[offsetof_Args_weights]]\n"
+ ".inst 0xc1371aa0 // fmla za.s[x8, 0], { z21.s-z24.s }, z7.s\n"
+ ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xa04149ea // ld1w { z10.s-z11.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ "addvl x15, x15, #5\n"
+ ".inst 0xa04049ee // ld1w { z14.s-z15.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xa04149e0 // ld1w { z0.s-z1.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ "addvl x15, x15, #5\n"
+ "9:" // Unpadded: 1 priming loads
+ "add x20, x14, %x[ld_in_row], LSL #2\n"
+ "ld1w { z7.s }, p1/Z, [x14]\n"
+ "add x14, x14, %x[ld_in_col], LSL #2\n"
+ "ld1w { z18.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z8.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z19.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z9.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z20.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z10.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc13b18e0 // fmla za.s[x8, 0], { z7.s-z10.s }, z11.s\n"
+ ".inst 0xc13518e1 // fmla za.s[x8, 1], { z7.s-z10.s }, z5.s\n"
+ "ld1w { z21.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc1311a40 // fmla za.s[x8, 0], { z18.s-z21.s }, z1.s\n"
+ "ld1w { z11.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc13f1a41 // fmla za.s[x8, 1], { z18.s-z21.s }, z15.s\n"
+ ".inst 0xa04049e0 // ld1w { z0.s-z1.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xa04149ec // ld1w { z12.s-z13.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ "addvl x15, x15, #5\n"
+ ".inst 0xc13d1900 // fmla za.s[x8, 0], { z8.s-z11.s }, z13.s\n"
+ ".inst 0xc1311901 // fmla za.s[x8, 1], { z8.s-z11.s }, z1.s\n"
+ "ld1w { z22.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xa14049e6 // ld1w { z6.s, z14.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xc13e1a61 // fmla za.s[x8, 1], { z19.s-z22.s }, z14.s\n"
+ ".inst 0xa14149e6 // ld1w { z6.s, z14.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ "addvl x15, x15, #5\n"
+ ".inst 0xc13e1a60 // fmla za.s[x8, 0], { z19.s-z22.s }, z14.s\n"
+ "ld1w { z12.s }, p1/Z, [x20]\n"
+ ".inst 0xa04049ee // ld1w { z14.s-z15.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xc13f1921 // fmla za.s[x8, 1], { z9.s-z12.s }, z15.s\n"
+ ".inst 0xa04149ee // ld1w { z14.s-z15.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ "ldr x15, [%x[args], %[offsetof_Args_weights]]\n"
+ ".inst 0xc13f1920 // fmla za.s[x8, 0], { z9.s-z12.s }, z15.s\n"
+ ".inst 0xa14049e4 // ld1w { z4.s, z12.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xa04149ea // ld1w { z10.s-z11.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ "ld1w { z9.s }, p2/Z, [x15, #4, MUL VL]\n"
+ "addvl x15, x15, #5\n"
+ ".inst 0xa14049e7 // ld1w { z7.s, z15.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xa04149e0 // ld1w { z0.s-z1.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ "ld1w { z6.s }, p2/Z, [x15, #4, MUL VL]\n"
+ "addvl x15, x15, #5\n"
+ "10:" // Unpadded: 0 priming loads
+ "cmp x16, #0x2\n"
+ "blt 20f\n"
+ "add x21, x14, %x[ld_in_row], LSL #2\n"
+ "ld1w { z22.s }, p1/Z, [x14]\n"
+ "sub x16, x16, #0x2\n"
+ "ld1w { z16.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ "sub x13, x13, #0x1\n"
+ "ld1w { z23.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ "lsr x20, x16, #0x1\n"
+ "ld1w { z17.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ "cmp x20, x13\n"
+ "ld1w { z24.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ "csel x23, x20, x13, LT\n"
+ "ld1w { z18.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ "add x14, x14, %x[ld_in_col], LSL #2\n"
+ "ld1w { z25.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ "and x16, x16, #0x1\n"
+ "ld1w { z19.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ "sub x13, x13, x23\n"
+ "ld1w { z26.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ "ld1w { z20.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ "ld1w { z27.s }, p1/Z, [x21]\n"
+ "cbz x23, 19f\n"
+ "11:" // Unpadded: Main loop
+ ".inst 0xc1391ac0 // fmla za.s[x8, 0], { z22.s-z25.s }, z9.s\n"
+ "ld1w { z13.s }, p2/Z, [x15, #4, MUL VL]\n"
+ "add x22, x14, %x[ld_in_row], LSL #2\n"
+ "subs x23, x23, #0x1\n"
+ ".inst 0xc13a1ac1 // fmla za.s[x8, 1], { z22.s-z25.s }, z10.s\n"
+ ".inst 0xa14149e1 // ld1w { z1.s, z9.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ ".inst 0xc1341ac2 // fmla za.s[x8, 2], { z22.s-z25.s }, z4.s\n"
+ ".inst 0xa04049e8 // ld1w { z8.s-z9.s }, pn10.b/Z, [x15]\n"
+ "addvl x15, x15, #5\n"
+ ".inst 0xc1361a00 // fmla za.s[x8, 0], { z16.s-z19.s }, z6.s\n"
+ "ld1w { z11.s }, p2/Z, [x15, #4, MUL VL]\n"
+ ".inst 0xc1301a01 // fmla za.s[x8, 1], { z16.s-z19.s }, z0.s\n"
+ ".inst 0xa04149ee // ld1w { z14.s-z15.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ ".inst 0xc1371a02 // fmla za.s[x8, 2], { z16.s-z19.s }, z7.s\n"
+ ".inst 0xa04049e6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x15]\n"
+ "addvl x15, x15, #5\n"
+ ".inst 0xc13d1ae0 // fmla za.s[x8, 0], { z23.s-z26.s }, z13.s\n"
+ "ld1w { z4.s }, p2/Z, [x15, #4, MUL VL]\n"
+ ".inst 0xc1311ae1 // fmla za.s[x8, 1], { z23.s-z26.s }, z1.s\n"
+ ".inst 0xa04149e0 // ld1w { z0.s-z1.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ ".inst 0xc1381ae2 // fmla za.s[x8, 2], { z23.s-z26.s }, z8.s\n"
+ ".inst 0xa04049ec // ld1w { z12.s-z13.s }, pn10.b/Z, [x15]\n"
+ "ldr x21, [%x[args], %[offsetof_Args_weights]]\n"
+ ".inst 0xc13b1a20 // fmla za.s[x8, 0], { z17.s-z20.s }, z11.s\n"
+ "ld1w { z15.s }, p1/Z, [x14]\n"
+ "add x14, x14, %x[ld_in_col], LSL #2\n"
+ "add x20, x14, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc13e1a21 // fmla za.s[x8, 1], { z17.s-z20.s }, z14.s\n"
+ "ld1w { z22.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc1361a22 // fmla za.s[x8, 2], { z17.s-z20.s }, z6.s\n"
+ "ld1w { z16.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ "ld1w { z23.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc1341b00 // fmla za.s[x8, 0], { z24.s-z27.s }, z4.s\n"
+ ".inst 0xc1301b01 // fmla za.s[x8, 1], { z24.s-z27.s }, z0.s\n"
+ ".inst 0xa0414aa6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc13c1b02 // fmla za.s[x8, 2], { z24.s-z27.s }, z12.s\n"
+ "ld1w { z17.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ "ld1w { z24.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc0060c08 // mova { z8.d-z11.d }, za.d[x8, #0]\n"
+ "add x8, x8, #0x1\n"
+ ".inst 0xa1404aa4 // ld1w { z4.s, z12.s }, pn10.b/Z, [x21]\n"
+ "addvl x21, x21, #5\n"
+ ".inst 0xc1a3c848 // fclamp { z8.s-z11.s }, z2.s, z3.s\n"
+ "st1w { z8.s }, p1, [x11]\n"
+ "ld1w { z18.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc13719e0 // fmla za.s[x8, 0], { z15.s-z18.s }, z7.s\n"
+ "add x11, x11, x9, LSL #2\n"
+ ".inst 0xc13c19e1 // fmla za.s[x8, 1], { z15.s-z18.s }, z12.s\n"
+ ".inst 0xa1404aa7 // ld1w { z7.s, z15.s }, pn10.b/Z, [x21]\n"
+ "st1w { z9.s }, p1, [x10]\n"
+ "add x10, x10, x28, LSL #2\n"
+ ".inst 0xa1414aa6 // ld1w { z6.s, z14.s }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ "addvl x21, x21, #5\n"
+ "st1w { z10.s }, p1, [x27]\n"
+ "add x27, x27, x25, LSL #2\n"
+ "ld1w { z25.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc13e1ac0 // fmla za.s[x8, 0], { z22.s-z25.s }, z14.s\n"
+ "st1w { z11.s }, p1, [x26]\n"
+ ".inst 0xc13f1ac1 // fmla za.s[x8, 1], { z22.s-z25.s }, z15.s\n"
+ "ld1w { z19.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ "add x26, x26, x24, LSL #2\n"
+ ".inst 0xa0404aae // ld1w { z14.s-z15.s }, pn10.b/Z, [x21]\n"
+ ".inst 0xc13f1a01 // fmla za.s[x8, 1], { z16.s-z19.s }, z15.s\n"
+ ".inst 0xa1414aa4 // ld1w { z4.s, z12.s }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ "addvl x21, x21, #5\n"
+ ".inst 0xc13c1a00 // fmla za.s[x8, 0], { z16.s-z19.s }, z12.s\n"
+ "ld1w { z26.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc0040f84 // mova za.d[x8, #4], { z28.d-z31.d }\n"
+ ".inst 0xa0404aac // ld1w { z12.s-z13.s }, pn10.b/Z, [x21]\n"
+ ".inst 0xc13d1ae1 // fmla za.s[x8, 1], { z23.s-z26.s }, z13.s\n"
+ ".inst 0xa1414aa4 // ld1w { z4.s, z12.s }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ "addvl x21, x21, #5\n"
+ ".inst 0xc13c1ae0 // fmla za.s[x8, 0], { z23.s-z26.s }, z12.s\n"
+ "ld1w { z20.s }, p1/Z, [x22]\n"
+ ".inst 0xa1404aa7 // ld1w { z7.s, z15.s }, pn10.b/Z, [x21]\n"
+ ".inst 0xc13f1a21 // fmla za.s[x8, 1], { z17.s-z20.s }, z15.s\n"
+ ".inst 0xa0414aaa // ld1w { z10.s-z11.s }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc13b1a20 // fmla za.s[x8, 0], { z17.s-z20.s }, z11.s\n"
+ "ldr x15, [%x[args], %[offsetof_Args_weights]]\n"
+ "ld1w { z22.s }, p1/Z, [x14]\n"
+ "add x14, x14, %x[ld_in_col], LSL #2\n"
+ "ld1w { z16.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z23.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z17.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z24.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z18.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z25.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z19.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z26.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xa04149ea // ld1w { z10.s-z11.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ "ld1w { z9.s }, p2/Z, [x15, #4, MUL VL]\n"
+ "addvl x15, x15, #5\n"
+ "ld1w { z20.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xa14049e7 // ld1w { z7.s, z15.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xa04149e0 // ld1w { z0.s-z1.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ "ld1w { z6.s }, p2/Z, [x15, #4, MUL VL]\n"
+ "addvl x15, x15, #5\n"
+ "ld1w { z27.s }, p1/Z, [x20]\n"
+ "bgt 11b\n"
+ "b 19f\n"
+ "12:" // Padded
+ "cbz x22, 17f\n"
+ "cmp x22, #0x1\n"
+ "sub x16, x16, x22\n"
+ "beq 16f\n"
+ "cmp x22, #0x2\n"
+ "beq 15f\n"
+ "cmp x22, #0x3\n"
+ "beq 14f\n"
+ "13:" // Padded: 4 priming loads
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z9.s }, p0/Z, [x14]\n"
+ "add x20, x14, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z23.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z10.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1w { z24.s }, p0/Z, [x20]\n"
+ "mov x12, #0x4\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z11.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z25.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z12.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "mov x12, #0x8\n"
+ ".inst 0xc1341920 // fmla za.s[x8, 0], { z9.s-z12.s }, z4.s\n"
+ "ld1w { z26.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ ".inst 0xc1371ae0 // fmla za.s[x8, 0], { z23.s-z26.s }, z7.s\n"
+ "ld1w { z13.s }, p0/Z, [x20]\n"
+ ".inst 0xa04049e6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x15]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ ".inst 0xc1361940 // fmla za.s[x8, 0], { z10.s-z13.s }, z6.s\n"
+ "addvl x15, x15, #5\n"
+ "ld1w { z27.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xa14049e6 // ld1w { z6.s, z14.s }, pn10.b/Z, [x15]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "addvl x15, x15, #5\n"
+ ".inst 0xc1361b00 // fmla za.s[x8, 0], { z24.s-z27.s }, z6.s\n"
+ "ld1w { z14.s }, p0/Z, [x20]\n"
+ "add x14, x14, %x[ld_in_col], LSL #2\n"
+ ".inst 0xa04049e6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x15]\n"
+ "ldr x15, [%x[args], %[offsetof_Args_weights]]\n"
+ ".inst 0xc1361960 // fmla za.s[x8, 0], { z11.s-z14.s }, z6.s\n"
+ ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
+ "addvl x15, x15, #5\n"
+ ".inst 0xa14049e7 // ld1w { z7.s, z15.s }, pn10.b/Z, [x15]\n"
+ "addvl x15, x15, #5\n"
+ "14:" // Padded: 3 priming loads
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z22.s }, p0/Z, [x14]\n"
+ "add x20, x14, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z9.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z23.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1w { z10.s }, p0/Z, [x20]\n"
+ "mov x12, #0x4\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z24.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z11.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z25.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "mov x12, #0x8\n"
+ ".inst 0xc1351ac0 // fmla za.s[x8, 0], { z22.s-z25.s }, z5.s\n"
+ "ld1w { z12.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ ".inst 0xc13f1920 // fmla za.s[x8, 0], { z9.s-z12.s }, z15.s\n"
+ "ld1w { z26.s }, p0/Z, [x20]\n"
+ ".inst 0xa14049e0 // ld1w { z0.s, z8.s }, pn10.b/Z, [x15]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ ".inst 0xc1381ae0 // fmla za.s[x8, 0], { z23.s-z26.s }, z8.s\n"
+ "addvl x15, x15, #5\n"
+ "ld1w { z13.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xa14049e7 // ld1w { z7.s, z15.s }, pn10.b/Z, [x15]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "addvl x15, x15, #5\n"
+ ".inst 0xc13f1940 // fmla za.s[x8, 0], { z10.s-z13.s }, z15.s\n"
+ "ld1w { z27.s }, p0/Z, [x20]\n"
+ "add x14, x14, %x[ld_in_col], LSL #2\n"
+ ".inst 0xa04049ee // ld1w { z14.s-z15.s }, pn10.b/Z, [x15]\n"
+ "ldr x15, [%x[args], %[offsetof_Args_weights]]\n"
+ ".inst 0xc13f1b00 // fmla za.s[x8, 0], { z24.s-z27.s }, z15.s\n"
+ ".inst 0xa14049e4 // ld1w { z4.s, z12.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xa04149ea // ld1w { z10.s-z11.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ "addvl x15, x15, #5\n"
+ ".inst 0xa14049e7 // ld1w { z7.s, z15.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xa04149e0 // ld1w { z0.s-z1.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ "addvl x15, x15, #5\n"
+ "15:" // Padded: 2 priming loads
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z16.s }, p0/Z, [x14]\n"
+ "add x20, x14, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z23.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z17.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1w { z24.s }, p0/Z, [x20]\n"
+ "mov x12, #0x4\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z18.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z25.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z19.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "mov x12, #0x8\n"
+ ".inst 0xc13a1a00 // fmla za.s[x8, 0], { z16.s-z19.s }, z10.s\n"
+ "ld1w { z26.s }, p0/Z, [x20]\n"
+ ".inst 0xc1341a01 // fmla za.s[x8, 1], { z16.s-z19.s }, z4.s\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z20.s }, p0/Z, [x20]\n"
+ ".inst 0xc1301ae0 // fmla za.s[x8, 0], { z23.s-z26.s }, z0.s\n"
+ ".inst 0xa14049e0 // ld1w { z0.s, z8.s }, pn10.b/Z, [x15]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ ".inst 0xc1371ae1 // fmla za.s[x8, 1], { z23.s-z26.s }, z7.s\n"
+ ".inst 0xa14149e7 // ld1w { z7.s, z15.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ "addvl x15, x15, #5\n"
+ "add x14, x14, %x[ld_in_col], LSL #2\n"
+ ".inst 0xc1371a20 // fmla za.s[x8, 0], { z17.s-z20.s }, z7.s\n"
+ "ld1w { z27.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ ".inst 0xc1301a21 // fmla za.s[x8, 1], { z17.s-z20.s }, z0.s\n"
+ ".inst 0xa14049e5 // ld1w { z5.s, z13.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xa04149ea // ld1w { z10.s-z11.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ "addvl x15, x15, #5\n"
+ ".inst 0xc13a1b00 // fmla za.s[x8, 0], { z24.s-z27.s }, z10.s\n"
+ ".inst 0xc1351b01 // fmla za.s[x8, 1], { z24.s-z27.s }, z5.s\n"
+ "ld1w { z21.s }, p0/Z, [x20]\n"
+ ".inst 0xa14049e5 // ld1w { z5.s, z13.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xc1351a41 // fmla za.s[x8, 1], { z18.s-z21.s }, z5.s\n"
+ ".inst 0xa04149e0 // ld1w { z0.s-z1.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ "ldr x15, [%x[args], %[offsetof_Args_weights]]\n"
+ ".inst 0xc1301a40 // fmla za.s[x8, 0], { z18.s-z21.s }, z0.s\n"
+ ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xa04149ea // ld1w { z10.s-z11.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ "addvl x15, x15, #5\n"
+ ".inst 0xa14049e7 // ld1w { z7.s, z15.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xa04149e0 // ld1w { z0.s-z1.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ "addvl x15, x15, #5\n"
+ "16:" // Padded: 1 priming loads
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z19.s }, p0/Z, [x14]\n"
+ "add x20, x14, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z8.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z20.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1w { z9.s }, p0/Z, [x20]\n"
+ "mov x12, #0x4\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z21.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z10.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z22.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "mov x12, #0x8\n"
+ ".inst 0xc13b1a60 // fmla za.s[x8, 0], { z19.s-z22.s }, z11.s\n"
+ "ld1w { z11.s }, p0/Z, [x20]\n"
+ ".inst 0xc1351a61 // fmla za.s[x8, 1], { z19.s-z22.s }, z5.s\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z23.s }, p0/Z, [x20]\n"
+ ".inst 0xc1311900 // fmla za.s[x8, 0], { z8.s-z11.s }, z1.s\n"
+ ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ ".inst 0xc13f1901 // fmla za.s[x8, 1], { z8.s-z11.s }, z15.s\n"
+ ".inst 0xa14149e6 // ld1w { z6.s, z14.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ "addvl x15, x15, #5\n"
+ "add x14, x14, %x[ld_in_col], LSL #2\n"
+ ".inst 0xc13e1a80 // fmla za.s[x8, 0], { z20.s-z23.s }, z14.s\n"
+ "ld1w { z12.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ ".inst 0xc1351a81 // fmla za.s[x8, 1], { z20.s-z23.s }, z5.s\n"
+ ".inst 0xa04049e6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xa14149e0 // ld1w { z0.s, z8.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ "addvl x15, x15, #5\n"
+ ".inst 0xc1381920 // fmla za.s[x8, 0], { z9.s-z12.s }, z8.s\n"
+ ".inst 0xc1371921 // fmla za.s[x8, 1], { z9.s-z12.s }, z7.s\n"
+ "ld1w { z24.s }, p0/Z, [x20]\n"
+ ".inst 0xa14049e0 // ld1w { z0.s, z8.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xc1381aa1 // fmla za.s[x8, 1], { z21.s-z24.s }, z8.s\n"
+ ".inst 0xa04149ec // ld1w { z12.s-z13.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ "ldr x15, [%x[args], %[offsetof_Args_weights]]\n"
+ ".inst 0xc13d1aa0 // fmla za.s[x8, 0], { z21.s-z24.s }, z13.s\n"
+ ".inst 0xa14049e4 // ld1w { z4.s, z12.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xa04149ea // ld1w { z10.s-z11.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ "ld1w { z9.s }, p2/Z, [x15, #4, MUL VL]\n"
+ "addvl x15, x15, #5\n"
+ ".inst 0xa14049e7 // ld1w { z7.s, z15.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xa04149e0 // ld1w { z0.s-z1.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ "ld1w { z6.s }, p2/Z, [x15, #4, MUL VL]\n"
+ "addvl x15, x15, #5\n"
+ "17:" // Padded: 0 priming loads
+ "cmp x16, #0x2\n"
+ "blt 20f\n"
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z22.s }, p0/Z, [x14]\n"
+ "add x21, x14, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z16.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z23.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1w { z17.s }, p0/Z, [x21]\n"
+ "mov x12, #0x4\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z24.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z18.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z25.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "mov x12, #0x8\n"
+ "ld1w { z19.s }, p0/Z, [x21]\n"
+ "sub x16, x16, #0x2\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "sub x13, x13, #0x1\n"
+ "ld1w { z26.s }, p0/Z, [x21]\n"
+ "lsr x20, x16, #0x1\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "cmp x20, x13\n"
+ "ld1w { z20.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z27.s }, p0/Z, [x21]\n"
+ "csel x23, x20, x13, LT\n"
+ "add x14, x14, %x[ld_in_col], LSL #2\n"
+ "and x16, x16, #0x1\n"
+ "sub x13, x13, x23\n"
+ "cbz x23, 19f\n"
+ "18:" // Padded: Main loop
+ ".inst 0xc1391ac0 // fmla za.s[x8, 0], { z22.s-z25.s }, z9.s\n"
+ "ld1w { z15.s }, p2/Z, [x15, #4, MUL VL]\n"
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ ".inst 0xc13a1ac1 // fmla za.s[x8, 1], { z22.s-z25.s }, z10.s\n"
+ ".inst 0xa04149ea // ld1w { z10.s-z11.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ "add x22, x14, %x[ld_in_row], LSL #2\n"
+ "subs x23, x23, #0x1\n"
+ ".inst 0xc1341ac2 // fmla za.s[x8, 2], { z22.s-z25.s }, z4.s\n"
+ ".inst 0xa14049e5 // ld1w { z5.s, z13.s }, pn10.b/Z, [x15]\n"
+ "addvl x15, x15, #5\n"
+ ".inst 0xc1361a00 // fmla za.s[x8, 0], { z16.s-z19.s }, z6.s\n"
+ "ld1w { z1.s }, p2/Z, [x15, #4, MUL VL]\n"
+ ".inst 0xc1301a01 // fmla za.s[x8, 1], { z16.s-z19.s }, z0.s\n"
+ ".inst 0xa04149ec // ld1w { z12.s-z13.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ ".inst 0xc1371a02 // fmla za.s[x8, 2], { z16.s-z19.s }, z7.s\n"
+ ".inst 0xa14049e6 // ld1w { z6.s, z14.s }, pn10.b/Z, [x15]\n"
+ "addvl x15, x15, #5\n"
+ ".inst 0xc13f1ae0 // fmla za.s[x8, 0], { z23.s-z26.s }, z15.s\n"
+ "ld1w { z16.s }, p0/Z, [x14]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "add x14, x14, %x[ld_in_col], LSL #2\n"
+ ".inst 0xc13a1ae1 // fmla za.s[x8, 1], { z23.s-z26.s }, z10.s\n"
+ ".inst 0xa04149ea // ld1w { z10.s-z11.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ "add x21, x14, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc1351ae2 // fmla za.s[x8, 2], { z23.s-z26.s }, z5.s\n"
+ ".inst 0xa04049ee // ld1w { z14.s-z15.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xc1311a20 // fmla za.s[x8, 0], { z17.s-z20.s }, z1.s\n"
+ "ld1w { z0.s }, p2/Z, [x15, #4, MUL VL]\n"
+ "ldr x20, [%x[args], %[offsetof_Args_weights]]\n"
+ ".inst 0xc13c1a21 // fmla za.s[x8, 1], { z17.s-z20.s }, z12.s\n"
+ "ld1w { z12.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ ".inst 0xc1361a22 // fmla za.s[x8, 2], { z17.s-z20.s }, z6.s\n"
+ "ld1w { z17.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "mov x12, #0x4\n"
+ "ld1w { z13.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc1301b00 // fmla za.s[x8, 0], { z24.s-z27.s }, z0.s\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ ".inst 0xc13a1b01 // fmla za.s[x8, 1], { z24.s-z27.s }, z10.s\n"
+ ".inst 0xa1414a81 // ld1w { z1.s, z9.s }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc13e1b02 // fmla za.s[x8, 2], { z24.s-z27.s }, z14.s\n"
+ "ld1w { z18.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z14.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ ".inst 0xc0060c18 // mova { z24.d-z27.d }, za.d[x8, #0]\n"
+ ".inst 0xa0404a80 // ld1w { z0.s-z1.s }, pn10.b/Z, [x20]\n"
+ "add x8, x8, #0x1\n"
+ "addvl x20, x20, #5\n"
+ ".inst 0xc1a3c858 // fclamp { z24.s-z27.s }, z2.s, z3.s\n"
+ "ld1w { z19.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "st1w { z24.s }, p1, [x11]\n"
+ "mov x12, #0x8\n"
+ ".inst 0xc1391a00 // fmla za.s[x8, 0], { z16.s-z19.s }, z9.s\n"
+ ".inst 0xa0404a88 // ld1w { z8.s-z9.s }, pn10.b/Z, [x20]\n"
+ "add x11, x11, x9, LSL #2\n"
+ ".inst 0xc1311a01 // fmla za.s[x8, 1], { z16.s-z19.s }, z1.s\n"
+ ".inst 0xa0414a80 // ld1w { z0.s-z1.s }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ "addvl x20, x20, #5\n"
+ "st1w { z25.s }, p1, [x10]\n"
+ "ld1w { z15.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ ".inst 0xc1311980 // fmla za.s[x8, 0], { z12.s-z15.s }, z1.s\n"
+ ".inst 0xc1391981 // fmla za.s[x8, 1], { z12.s-z15.s }, z9.s\n"
+ "ld1w { z20.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ ".inst 0xa0404a8a // ld1w { z10.s-z11.s }, pn10.b/Z, [x20]\n"
+ ".inst 0xc13b1a21 // fmla za.s[x8, 1], { z17.s-z20.s }, z11.s\n"
+ "add x10, x10, x28, LSL #2\n"
+ "st1w { z26.s }, p1, [x27]\n"
+ ".inst 0xa1414a80 // ld1w { z0.s, z8.s }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ "addvl x20, x20, #5\n"
+ ".inst 0xc1381a20 // fmla za.s[x8, 0], { z17.s-z20.s }, z8.s\n"
+ "add x27, x27, x25, LSL #2\n"
+ "ld1w { z16.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "st1w { z27.s }, p1, [x26]\n"
+ ".inst 0xa0404a88 // ld1w { z8.s-z9.s }, pn10.b/Z, [x20]\n"
+ "mov x12, #0x0\n"
+ ".inst 0xc13919a1 // fmla za.s[x8, 1], { z13.s-z16.s }, z9.s\n"
+ "add x26, x26, x24, LSL #2\n"
+ ".inst 0xa1414a81 // ld1w { z1.s, z9.s }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ "addvl x20, x20, #5\n"
+ ".inst 0xc13919a0 // fmla za.s[x8, 0], { z13.s-z16.s }, z9.s\n"
+ "ld1w { z21.s }, p0/Z, [x22]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ ".inst 0xc0040f84 // mova za.d[x8, #4], { z28.d-z31.d }\n"
+ "ld1w { z22.s }, p0/Z, [x14]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "add x14, x14, %x[ld_in_col], LSL #2\n"
+ ".inst 0xa0404a8e // ld1w { z14.s-z15.s }, pn10.b/Z, [x20]\n"
+ ".inst 0xc13f1a41 // fmla za.s[x8, 1], { z18.s-z21.s }, z15.s\n"
+ ".inst 0xa0414a80 // ld1w { z0.s-z1.s }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc1311a40 // fmla za.s[x8, 0], { z18.s-z21.s }, z1.s\n"
+ "ldr x15, [%x[args], %[offsetof_Args_weights]]\n"
+ "ld1w { z16.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z23.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "mov x12, #0x4\n"
+ "ld1w { z17.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z24.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z18.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z25.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "mov x12, #0x8\n"
+ "ld1w { z19.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z26.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xa04149ea // ld1w { z10.s-z11.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ "ld1w { z9.s }, p2/Z, [x15, #4, MUL VL]\n"
+ "addvl x15, x15, #5\n"
+ "ld1w { z20.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ ".inst 0xa14049e7 // ld1w { z7.s, z15.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xa04149e0 // ld1w { z0.s-z1.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ "ld1w { z6.s }, p2/Z, [x15, #4, MUL VL]\n"
+ "addvl x15, x15, #5\n"
+ "ld1w { z27.s }, p0/Z, [x21]\n"
+ "bgt 18b\n"
+ "19:" // Main loop tail
+ ".inst 0xc1391ac0 // fmla za.s[x8, 0], { z22.s-z25.s }, z9.s\n"
+ "ld1w { z8.s }, p2/Z, [x15, #4, MUL VL]\n"
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ ".inst 0xc13a1ac1 // fmla za.s[x8, 1], { z22.s-z25.s }, z10.s\n"
+ ".inst 0xa04149ea // ld1w { z10.s-z11.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ "add x21, x14, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc1341ac2 // fmla za.s[x8, 2], { z22.s-z25.s }, z4.s\n"
+ ".inst 0xa14049e1 // ld1w { z1.s, z9.s }, pn10.b/Z, [x15]\n"
+ "addvl x15, x15, #5\n"
+ ".inst 0xc1361a00 // fmla za.s[x8, 0], { z16.s-z19.s }, z6.s\n"
+ "ld1w { z9.s }, p2/Z, [x15, #4, MUL VL]\n"
+ ".inst 0xc1301a01 // fmla za.s[x8, 1], { z16.s-z19.s }, z0.s\n"
+ ".inst 0xa04149ee // ld1w { z14.s-z15.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ ".inst 0xc1371a02 // fmla za.s[x8, 2], { z16.s-z19.s }, z7.s\n"
+ ".inst 0xa04049ec // ld1w { z12.s-z13.s }, pn10.b/Z, [x15]\n"
+ "addvl x15, x15, #5\n"
+ ".inst 0xc1381ae0 // fmla za.s[x8, 0], { z23.s-z26.s }, z8.s\n"
+ "ld1w { z16.s }, p0/Z, [x14]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "add x14, x14, %x[ld_in_col], LSL #2\n"
+ ".inst 0xc13a1ae1 // fmla za.s[x8, 1], { z23.s-z26.s }, z10.s\n"
+ ".inst 0xa14149e5 // ld1w { z5.s, z13.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ ".inst 0xc1311ae2 // fmla za.s[x8, 2], { z23.s-z26.s }, z1.s\n"
+ ".inst 0xa14049e7 // ld1w { z7.s, z15.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xc1391a20 // fmla za.s[x8, 0], { z17.s-z20.s }, z9.s\n"
+ "ld1w { z1.s }, p2/Z, [x15, #4, MUL VL]\n"
+ "ldr x20, [%x[args], %[offsetof_Args_weights]]\n"
+ ".inst 0xc13e1a21 // fmla za.s[x8, 1], { z17.s-z20.s }, z14.s\n"
+ "ld1w { z22.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ ".inst 0xc13c1a22 // fmla za.s[x8, 2], { z17.s-z20.s }, z12.s\n"
+ "ld1w { z17.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "mov x12, #0x4\n"
+ "ld1w { z23.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc1311b00 // fmla za.s[x8, 0], { z24.s-z27.s }, z1.s\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ ".inst 0xc1351b01 // fmla za.s[x8, 1], { z24.s-z27.s }, z5.s\n"
+ ".inst 0xa0414a8e // ld1w { z14.s-z15.s }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc1371b02 // fmla za.s[x8, 2], { z24.s-z27.s }, z7.s\n"
+ "ld1w { z18.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z24.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ ".inst 0xc0060c08 // mova { z8.d-z11.d }, za.d[x8, #0]\n"
+ ".inst 0xa0404a84 // ld1w { z4.s-z5.s }, pn10.b/Z, [x20]\n"
+ "add x8, x8, #0x1\n"
+ "addvl x20, x20, #5\n"
+ ".inst 0xc1a3c848 // fclamp { z8.s-z11.s }, z2.s, z3.s\n"
+ "ld1w { z19.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "st1w { z8.s }, p1, [x11]\n"
+ "mov x12, #0x8\n"
+ ".inst 0xc13f1a00 // fmla za.s[x8, 0], { z16.s-z19.s }, z15.s\n"
+ ".inst 0xa0404a80 // ld1w { z0.s-z1.s }, pn10.b/Z, [x20]\n"
+ "add x11, x11, x9, LSL #2\n"
+ ".inst 0xc1351a01 // fmla za.s[x8, 1], { z16.s-z19.s }, z5.s\n"
+ ".inst 0xa1414a80 // ld1w { z0.s, z8.s }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ "addvl x20, x20, #5\n"
+ "st1w { z9.s }, p1, [x10]\n"
+ "ld1w { z25.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ ".inst 0xc1381ac0 // fmla za.s[x8, 0], { z22.s-z25.s }, z8.s\n"
+ ".inst 0xc1311ac1 // fmla za.s[x8, 1], { z22.s-z25.s }, z1.s\n"
+ "ld1w { z20.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ ".inst 0xa0404a86 // ld1w { z6.s-z7.s }, pn10.b/Z, [x20]\n"
+ ".inst 0xc1371a21 // fmla za.s[x8, 1], { z17.s-z20.s }, z7.s\n"
+ "add x10, x10, x28, LSL #2\n"
+ "st1w { z10.s }, p1, [x27]\n"
+ ".inst 0xa1414a81 // ld1w { z1.s, z9.s }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ "addvl x20, x20, #5\n"
+ ".inst 0xc1391a20 // fmla za.s[x8, 0], { z17.s-z20.s }, z9.s\n"
+ "add x27, x27, x25, LSL #2\n"
+ "ld1w { z26.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "st1w { z11.s }, p1, [x26]\n"
+ ".inst 0xa1404a84 // ld1w { z4.s, z12.s }, pn10.b/Z, [x20]\n"
+ ".inst 0xc13c1ae1 // fmla za.s[x8, 1], { z23.s-z26.s }, z12.s\n"
+ "add x26, x26, x24, LSL #2\n"
+ ".inst 0xa1414a84 // ld1w { z4.s, z12.s }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ "addvl x20, x20, #5\n"
+ ".inst 0xc13c1ae0 // fmla za.s[x8, 0], { z23.s-z26.s }, z12.s\n"
+ "ld1w { z21.s }, p0/Z, [x21]\n"
+ ".inst 0xc0040f84 // mova za.d[x8, #4], { z28.d-z31.d }\n"
+ ".inst 0xa0404a80 // ld1w { z0.s-z1.s }, pn10.b/Z, [x20]\n"
+ ".inst 0xc1311a41 // fmla za.s[x8, 1], { z18.s-z21.s }, z1.s\n"
+ ".inst 0xa0414a80 // ld1w { z0.s-z1.s }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ "ldr x15, [%x[args], %[offsetof_Args_weights]]\n"
+ ".inst 0xc1311a40 // fmla za.s[x8, 0], { z18.s-z21.s }, z1.s\n"
+ ".inst 0xa14049e4 // ld1w { z4.s, z12.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xa04149ea // ld1w { z10.s-z11.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ "ld1w { z9.s }, p2/Z, [x15, #4, MUL VL]\n"
+ "addvl x15, x15, #5\n"
+ ".inst 0xa14049e7 // ld1w { z7.s, z15.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xa04149e0 // ld1w { z0.s-z1.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ "ld1w { z6.s }, p2/Z, [x15, #4, MUL VL]\n"
+ "addvl x15, x15, #5\n"
+ "20:" // Main loop skip tail
+ "cbz x16, 21f\n" // Skip remainder inputs
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z16.s }, p0/Z, [x14]\n"
+ "add x20, x14, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z23.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z17.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1w { z24.s }, p0/Z, [x20]\n"
+ "mov x12, #0x4\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z18.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z25.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z19.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "mov x12, #0x8\n"
+ ".inst 0xc1391a00 // fmla za.s[x8, 0], { z16.s-z19.s }, z9.s\n"
+ "ld1w { z26.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ ".inst 0xc1361ae0 // fmla za.s[x8, 0], { z23.s-z26.s }, z6.s\n"
+ "ld1w { z20.s }, p0/Z, [x20]\n"
+ "ld1w { z8.s }, p2/Z, [x15, #4, MUL VL]\n"
+ ".inst 0xc13a1a01 // fmla za.s[x8, 1], { z16.s-z19.s }, z10.s\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ ".inst 0xc1341a02 // fmla za.s[x8, 2], { z16.s-z19.s }, z4.s\n"
+ ".inst 0xa04049ea // ld1w { z10.s-z11.s }, pn10.b/Z, [x15]\n"
+ "sub x13, x13, #0x1\n"
+ ".inst 0xa04149ee // ld1w { z14.s-z15.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ "addvl x15, x15, #5\n"
+ ".inst 0xc1381a20 // fmla za.s[x8, 0], { z17.s-z20.s }, z8.s\n"
+ "ld1w { z27.s }, p0/Z, [x20]\n"
+ ".inst 0xc1301ae1 // fmla za.s[x8, 1], { z23.s-z26.s }, z0.s\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z9.s }, p2/Z, [x15, #4, MUL VL]\n"
+ ".inst 0xc1371ae2 // fmla za.s[x8, 2], { z23.s-z26.s }, z7.s\n"
+ ".inst 0xa04049e0 // ld1w { z0.s-z1.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xc1391b00 // fmla za.s[x8, 0], { z24.s-z27.s }, z9.s\n"
+ ".inst 0xa14149e5 // ld1w { z5.s, z13.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ "addvl x15, x15, #5\n"
+ ".inst 0xc13e1a21 // fmla za.s[x8, 1], { z17.s-z20.s }, z14.s\n"
+ "ld1w { z21.s }, p0/Z, [x20]\n"
+ ".inst 0xc13a1a22 // fmla za.s[x8, 2], { z17.s-z20.s }, z10.s\n"
+ "ld1w { z8.s }, p2/Z, [x15, #4, MUL VL]\n"
+ ".inst 0xc1381a40 // fmla za.s[x8, 0], { z18.s-z21.s }, z8.s\n"
+ ".inst 0xc1351b01 // fmla za.s[x8, 1], { z24.s-z27.s }, z5.s\n"
+ ".inst 0xa04049e8 // ld1w { z8.s-z9.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xc1301b02 // fmla za.s[x8, 2], { z24.s-z27.s }, z0.s\n"
+ ".inst 0xa04149e0 // ld1w { z0.s-z1.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ ".inst 0xc0060c18 // mova { z24.d-z27.d }, za.d[x8, #0]\n"
+ ".inst 0xc1a3c858 // fclamp { z24.s-z27.s }, z2.s, z3.s\n"
+ "st1w { z24.s }, p1, [x11]\n"
+ "add x11, x11, x9, LSL #2\n"
+ ".inst 0xc1301a41 // fmla za.s[x8, 1], { z18.s-z21.s }, z0.s\n"
+ "st1w { z25.s }, p1, [x10]\n"
+ "add x10, x10, x28, LSL #2\n"
+ ".inst 0xc1381a42 // fmla za.s[x8, 2], { z18.s-z21.s }, z8.s\n"
+ "add x8, x8, #0x1\n"
+ "st1w { z26.s }, p1, [x27]\n"
+ "add x27, x27, x25, LSL #2\n"
+ "st1w { z27.s }, p1, [x26]\n"
+ "add x26, x26, x24, LSL #2\n"
+ ".inst 0xc0040f84 // mova za.d[x8, #4], { z28.d-z31.d }\n"
+ "21:" // Tail input: End
+ "cbz x13, 23f\n"
+ "22:" // Right padding loop
+ ".inst 0xc0060c08 // mova { z8.d-z11.d }, za.d[x8, #0]\n"
+ "add x8, x8, #0x1\n"
+ "subs x13, x13, #0x1\n"
+ ".inst 0xc1a3c848 // fclamp { z8.s-z11.s }, z2.s, z3.s\n"
+ "st1w { z8.s }, p1, [x11]\n"
+ "add x11, x11, x9, LSL #2\n"
+ ".inst 0xc0040f84 // mova za.d[x8, #4], { z28.d-z31.d }\n"
+ "st1w { z9.s }, p1, [x10]\n"
+ "add x10, x10, x28, LSL #2\n"
+ "st1w { z10.s }, p1, [x27]\n"
+ "add x27, x27, x25, LSL #2\n"
+ "st1w { z11.s }, p1, [x26]\n"
+ "add x26, x26, x24, LSL #2\n"
+ "bgt 22b\n"
+ "23:" // End
+ "ldr x20, [%x[args], %[offsetof_Args_weights]]\n"
+ "incb x20, ALL, MUL #16\n"
+ "incb x20, ALL, MUL #9\n"
+ "str x20, [%x[args], %[offsetof_Args_weights]]\n"
+ "ldr x21, [%x[args], %[offsetof_Args_ld_in_vl]]\n"
+ "incw x17\n"
+ "whilelt p1.s, x17, x7\n"
+ "ldr x20, [%x[args], %[offsetof_Args_inptr]]\n"
+ "add x20, x20, x21, LSL #2\n"
+ "str x20, [%x[args], %[offsetof_Args_inptr]]\n"
+ "ldr x25, [%x[args], %[offsetof_Args_outptrs]]\n"
+ "ldr x24, [%x[args], %[offsetof_Args_ld_out_vls]]\n"
+ "ldp x23, x22, [x25, #0x0]\n"
+ "ldp x21, x20, [x24, #0x0]\n"
+ "add x23, x23, x21, LSL #2\n"
+ "add x22, x22, x20, LSL #2\n"
+ "stp x23, x22, [x25, #0x0]\n"
+ "ldp x23, x22, [x25, #0x10]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x23, x23, x21, LSL #2\n"
+ "add x22, x22, x20, LSL #2\n"
+ "stp x23, x22, [x25, #0x10]\n"
+ "b.any 1b\n"
+ ".inst 0xd503467f // SMSTOP\n"
+ :
+ : [args] "r" (&args), [ld_in_col] "r" (ld_in_col), [ld_in_row] "r" (ld_in_row), [offsetof_Args_bias] "I" (offsetof(Args, bias)), [offsetof_Args_clamp_max] "I" (offsetof(Args, clamp_max)), [offsetof_Args_clamp_min] "I" (offsetof(Args, clamp_min)), [offsetof_Args_current_channel] "I" (offsetof(Args, current_channel)), [offsetof_Args_inptr] "I" (offsetof(Args, inptr)), [offsetof_Args_input_cols] "I" (offsetof(Args, input_cols)), [offsetof_Args_ld_in_vl] "I" (offsetof(Args, ld_in_vl)), [offsetof_Args_ld_out_cols] "I" (offsetof(Args, ld_out_cols)), [offsetof_Args_ld_out_vls] "I" (offsetof(Args, ld_out_vls)), [offsetof_Args_n_channels] "I" (offsetof(Args, n_channels)), [offsetof_Args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_Args_output_cols] "I" (offsetof(Args, output_cols)), [offsetof_Args_pad_bottom] "I" (offsetof(Args, pad_bottom)), [offsetof_Args_pad_left] "I" (offsetof(Args, pad_left)), [offsetof_Args_pad_top] "I" (offsetof(Args, pad_top)), [offsetof_Args_weights] "I" (offsetof(Args, weights))
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME2)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32bf16fp32_planar_3x3_s1_4rows_dot_za.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32bf16fp32_planar_3x3_s1_4rows_dot_za.hpp
new file mode 100644
index 0000000000..50ef6c3815
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32bf16fp32_planar_3x3_s1_4rows_dot_za.hpp
@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_conv/depthwise/depthwise_planar.hpp"
+
+namespace arm_conv {
+namespace depthwise {
+
+void sme2_fp32bf16fp32_planar_3x3_s1_4rows_dot_za_impl(  // SME2 planar depthwise kernel, 3x3, stride 1, 4 output rows/pass; defined in the matching generic.cpp
+ const float *inptr,  // first input element for this tile
+ size_t ld_in_row,  // input stride between rows, in elements
+ size_t ld_in_col,  // input stride between columns, in elements
+ size_t ld_in_vl,  // input stride between vector-length channel blocks, in elements
+ unsigned int pad_top,  // implicit zero-padding rows above the input
+ unsigned int valid_input_rows,  // number of readable input rows
+ unsigned int pad_left,  // implicit zero-padding columns left of the input
+ unsigned int valid_input_cols,  // number of readable input columns
+ const float *weights,  // packed 3x3 kernel weights
+ const float *bias,  // per-channel bias; may be nullptr (kernel tests for null)
+ float **outptrs,  // one output pointer per output row
+ const size_t *outlds,  // per-row output column strides
+ const size_t *outvllds,  // per-row output vector-length strides
+ unsigned int output_cols,  // number of output columns to produce
+ unsigned int start_channel,  // first channel index handled by this call
+ unsigned int valid_channels,  // total channel count (loop bound)
+ float act_min,  // activation clamp lower bound
+ float act_max  // activation clamp upper bound
+);
+
+class sme2_fp32bf16fp32_planar_3x3_s1_4rows_dot_za : public PlanarStrategy<float, float>  // Strategy wrapper exposing the SME2 s1 kernel to the planar depthwise framework
+{
+ using Parent = PlanarStrategy<float, float>;  // fp32 input, fp32 output (bf16 used internally by the kernel)
+
+ public:
+ using return_type = float;
+ constexpr static auto output_rows = 4u;  // rows produced per kernel invocation
+ constexpr static auto kernel_rows = 3u, kernel_cols = 3u;  // 3x3 depthwise filter
+ constexpr static auto stride_rows = 1u, stride_cols = 1u;  // unit stride variant
+ constexpr static auto vl_type = arm_gemm::VLType::SME;  // vector length governed by SME streaming mode
+
+ sme2_fp32bf16fp32_planar_3x3_s1_4rows_dot_za(const CPUInfo *)  // CPUInfo unused: geometry is fixed for this specialisation
+ : Parent(kernel_rows, kernel_cols, stride_rows, stride_cols, output_rows, vl_type)
+ {
+ }
+
+ typename Parent::KernelType get_kernel(void) const override  // hand the framework the raw assembly entry point
+ {
+ return sme2_fp32bf16fp32_planar_3x3_s1_4rows_dot_za_impl;
+ }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32bf16fp32_planar_3x3_s1_4rows_dot_za/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32bf16fp32_planar_3x3_s1_4rows_dot_za/generic.cpp
new file mode 100644
index 0000000000..be82e04613
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32bf16fp32_planar_3x3_s1_4rows_dot_za/generic.cpp
@@ -0,0 +1,560 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if defined(ARM_COMPUTE_ENABLE_SME2)
+
+#include <algorithm>
+#include <cstddef>
+
+namespace arm_conv {
+namespace depthwise {
+
+void sme2_fp32bf16fp32_planar_3x3_s1_4rows_dot_za_impl(  // SME2 kernel: 3x3 stride-1 planar depthwise conv, fp32 I/O, bf16 BFDOT accumulation into ZA, 4 output rows per pass
+ const float *inptr,
+ size_t ld_in_row,
+ size_t ld_in_col,
+ size_t ld_in_vl,
+ unsigned int pad_top,
+ unsigned int valid_input_rows,
+ unsigned int pad_left,
+ unsigned int valid_input_cols,
+ const float *weights,
+ const float *bias,
+ float **outptrs,
+ const size_t *outlds,
+ const size_t *outvllds,
+ unsigned int output_cols,
+ unsigned int start_channel,
+ unsigned int valid_channels,
+ float act_min,
+ float act_max
+)
+{
+ struct Args  // argument block; the assembly reads fields via offsetof() constants bound below
+ {
+ const float *inptr;  // updated in-place by the asm epilogue to step to the next channel block
+ size_t ld_in_vl;
+ long unsigned int pad_top, pad_bottom, pad_left;
+ const float *weights;  // also advanced by the asm epilogue (incb MUL #9 = 9 kernel taps)
+ const float *bias;  // nullptr => no bias (asm: cbz)
+ long unsigned int input_cols, output_cols;
+ float **outptrs;
+ const size_t *ld_out_cols;
+ const size_t *ld_out_vls;
+ long unsigned int current_channel, n_channels;
+ float clamp_min, clamp_max;
+ };
+
+ Args args = { inptr, ld_in_vl, pad_top, 6u - std::min(6u, pad_top + valid_input_rows), pad_left, weights, bias, valid_input_cols, output_cols, outptrs, outlds, outvllds, start_channel, valid_channels, act_min, act_max };  // pad_bottom = of the 6 rows consumed (4 outputs + 2 halo), those covered neither by top padding nor by valid input
+
+ __asm__ __volatile__(
+ "ldr x7, [%x[args], %[offsetof_Args_pad_bottom]]\n"  // x7 = pad_bottom
+ "mov x20, #0x6\n"
+ ".inst 0xd503477f // SMSTART ZA\n"  // enter streaming mode with ZA enabled
+ "sub x20, x20, x7\n"
+ "ldr x17, [%x[args], %[offsetof_Args_pad_top]]\n"  // x17 = pad_top
+ "ptrue p2.b\n"
+ "ld1rw { z25.s }, p2/Z, [%x[args], %[offsetof_Args_clamp_min]]\n"  // z25 = broadcast clamp_min
+ "ldr x16, [%x[args], %[offsetof_Args_n_channels]]\n"
+ "whilelt p1.s, XZR, x16\n"  // p1 = active-channel predicate for this vector block
+ "whilelt p9.s, XZR, x20\n"
+ "ld1rw { z13.s }, p2/Z, [%x[args], %[offsetof_Args_clamp_max]]\n"  // z13 = broadcast clamp_max
+ "whilelt p8.s, XZR, x17\n"
+ "eor p8.b, p2/Z, p8.b, p9.b\n"  // p8 = rows that are valid (not in top/bottom padding)
+ "ldr x15, [%x[args], %[offsetof_Args_current_channel]]\n"
+ "1:" // Channel loop
+ "ldr x20, [%x[args], %[offsetof_Args_bias]]\n"
+ "fmov z26.s, #0x0\n"
+ "cbz x20, 2f\n"  // skip bias load when bias == nullptr
+ "ld1w { z26.s }, p1/Z, [x20, x15, LSL #2]\n"
+ "2:" // Load bias: Done
+ "ldr x21, [%x[args], %[offsetof_Args_weights]]\n"  // load and bf16-convert the 3x3 weights into paired z registers for BFDOT
+ "mov x20, x21\n"
+ "fmov z6.s, #0x0\n"
+ "ld1w { z15.s }, p2/Z, [x20]\n"
+ "incb x20, ALL, MUL #3\n"
+ "incb x21\n"
+ "ld1w { z29.s }, p2/Z, [x20]\n"
+ ".inst 0x648aa9e6 // bfcvtnt z6.h, p2/M, z15.s\n"
+ "incb x20, ALL, MUL #3\n"
+ "ld1w { z30.s }, p2/Z, [x20]\n"
+ "mov x20, x21\n"
+ ".inst 0x658aa9e5 // bfcvt z5.h, p2/M, z15.s\n"
+ "ld1w { z14.s }, p2/Z, [x20]\n"
+ ".inst 0x658aaba8 // bfcvt z8.h, p2/M, z29.s\n"
+ "fmov z11.s, #0x0\n"
+ "incb x20, ALL, MUL #3\n"
+ ".inst 0x658aa9ca // bfcvt z10.h, p2/M, z14.s\n"
+ ".inst 0x648aaba5 // bfcvtnt z5.h, p2/M, z29.s\n"
+ "incb x21\n"
+ "ld1w { z24.s }, p2/Z, [x20]\n"
+ "incb x20, ALL, MUL #3\n"
+ ".inst 0x648aabc8 // bfcvtnt z8.h, p2/M, z30.s\n"
+ ".inst 0x658aabcc // bfcvt z12.h, p2/M, z30.s\n"
+ "ld1w { z28.s }, p2/Z, [x20]\n"
+ "mov x21, x21\n"  // NOTE(review): self-move is a no-op; appears to be a code-generator artifact — confirm against the kernel generator
+ ".inst 0x648aa9cb // bfcvtnt z11.h, p2/M, z14.s\n"
+ "ld1w { z20.s }, p2/Z, [x21]\n"
+ "incb x21, ALL, MUL #3\n"
+ ".inst 0x648aab0a // bfcvtnt z10.h, p2/M, z24.s\n"
+ ".inst 0x658aab09 // bfcvt z9.h, p2/M, z24.s\n"
+ "ld1w { z15.s }, p2/Z, [x21]\n"
+ "ldr x14, [%x[args], %[offsetof_Args_input_cols]]\n"
+ "incb x21, ALL, MUL #3\n"
+ "fmov z14.s, #0x0\n"
+ ".inst 0x658aaa81 // bfcvt z1.h, p2/M, z20.s\n"
+ "ldr x13, [%x[args], %[offsetof_Args_inptr]]\n"
+ ".inst 0x658aa9e7 // bfcvt z7.h, p2/M, z15.s\n"
+ ".inst 0x648aab89 // bfcvtnt z9.h, p2/M, z28.s\n"
+ "sub x20, x14, #0x1\n"
+ "orr x23, x20, %x[ld_in_col], LSL #18\n"  // pack prefetch stream descriptor (count | stride) for RPRFM
+ ".inst 0x658aab84 // bfcvt z4.h, p2/M, z28.s\n"
+ "ld1w { z29.s }, p2/Z, [x21]\n"
+ "orr x23, x16, x23, LSL #20\n"
+ "mov x22, #0x6\n"
+ "add x21, x17, x7\n"
+ "lsl x20, %x[ld_in_row], #0x2\n"
+ "mov z27.d, z26.d\n"  // duplicate bias accumulator for the second ZA pair
+ ".inst 0x648aaa8e // bfcvtnt z14.h, p2/M, z20.s\n"
+ ".inst 0x648aa9e1 // bfcvtnt z1.h, p2/M, z15.s\n"
+ ".inst 0x648aaba7 // bfcvtnt z7.h, p2/M, z29.s\n"
+ "mov x8, #0x0\n"
+ "ldr x11, [%x[args], %[offsetof_Args_output_cols]]\n"
+ ".inst 0x658aaba2 // bfcvt z2.h, p2/M, z29.s\n"
+ "lsl x23, x23, #0x2\n"
+ "sub x22, x22, x21\n"
+ "madd x20, x20, x17, x13\n"
+ "3:" // Issue prefetches
+ "subs x22, x22, #0x1\n"
+ ".inst 0xf8b74a9c // rprfm pldstrm, x23, [x20]\n"  // software-programmed streaming prefetch over the input rows
+ "add x20, x20, %x[ld_in_col], LSL #2\n"
+ "bgt 3b\n"
+ "ldr x22, [%x[args], %[offsetof_Args_outptrs]]\n"
+ "lsl x20, %x[ld_in_row], #0x2\n"
+ "msub x13, x17, x20, x13\n"  // rewind input pointer by pad_top rows
+ ".inst 0xc0040b40 // mova za.d[x8, #0], { z26.d-z27.d }\n"  // seed all six ZA accumulator slices with the bias
+ "ldr x20, [%x[args], %[offsetof_Args_ld_out_cols]]\n"
+ ".inst 0xc0040b41 // mova za.d[x8, #1], { z26.d-z27.d }\n"
+ "mov x10, #0x2\n"
+ "ldp x9, x28, [x22], #0x10\n"  // x9/x28/x25/x24 = the four output row pointers
+ ".inst 0xc0040b42 // mova za.d[x8, #2], { z26.d-z27.d }\n"
+ "ldp x27, x26, [x20], #0x10\n"
+ ".inst 0xc0040b43 // mova za.d[x8, #3], { z26.d-z27.d }\n"
+ "ldr x21, [%x[args], %[offsetof_Args_pad_left]]\n"
+ ".inst 0xc0040b44 // mova za.d[x8, #4], { z26.d-z27.d }\n"
+ "ldp x25, x24, [x22], #0x10\n"
+ ".inst 0xc0040b45 // mova za.d[x8, #5], { z26.d-z27.d }\n"
+ "ldp x23, x22, [x20], #0x10\n"
+ "cbz x21, 5f\n"
+ "cmp x21, x10\n"
+ "csel x20, x21, x10, LT\n"
+ "sub x21, x21, x20\n"
+ "sub x10, x10, x20\n"
+ "cbz x21, 5f\n"
+ ".inst 0xc0060814 // mova { z20.d-z21.d }, za.d[x8, #0]\n"  // left padding: outputs are just clamped bias
+ "sub x11, x11, x21\n"
+ ".inst 0xc0060836 // mova { z22.d-z23.d }, za.d[x8, #1]\n"
+ ".inst 0xc1adcb34 // fclamp { z20.s-z23.s }, z25.s, z13.s\n"
+ "4:" // Left padding
+ "subs x21, x21, #0x1\n"
+ "st1w { z20.s }, p1, [x9]\n"
+ "add x9, x9, x27, LSL #2\n"
+ "st1w { z22.s }, p1, [x28]\n"
+ "add x28, x28, x26, LSL #2\n"
+ "st1w { z21.s }, p1, [x25]\n"
+ "add x25, x25, x23, LSL #2\n"
+ "st1w { z23.s }, p1, [x24]\n"
+ "add x24, x24, x22, LSL #2\n"
+ "bgt 4b\n"
+ "5:" // Left padding: End
+ "adds XZR, x17, x7\n"
+ "bne 10f\n"  // any vertical padding => take the padded path
+ "cbz x10, 8f\n"
+ "cmp x10, #0x1\n"
+ "sub x14, x14, x10\n"
+ "beq 7f\n"
+ "6:" // Unpadded: 2 priming loads
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
+ "ld1w { z17.s }, p1/Z, [x13]\n"
+ ".inst 0x658aaa3e // bfcvt z30.h, p2/M, z17.s\n"  // bfcvt/bfcvtnt interleave two fp32 rows into one bf16 vector
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ "ld1w { z28.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aab9e // bfcvtnt z30.h, p2/M, z28.s\n"
+ "ld1w { z16.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aaa1f // bfcvt z31.h, p2/M, z16.s\n"
+ "ld1w { z15.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aa9ff // bfcvtnt z31.h, p2/M, z15.s\n"
+ ".inst 0xc12513d0 // bfdot za.s[x8, 0], { z30.h-z31.h }, z5.h\n"
+ "ld1w { z16.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aaa00 // bfcvt z0.h, p2/M, z16.s\n"
+ ".inst 0xc12613d1 // bfdot za.s[x8, 1], { z30.h-z31.h }, z6.h\n"
+ "ld1w { z15.s }, p1/Z, [x20]\n"
+ ".inst 0x648aa9e0 // bfcvtnt z0.h, p2/M, z15.s\n"
+ ".inst 0xc12c13f0 // bfdot za.s[x8, 0], { z31.h-z0.h }, z12.h\n"
+ ".inst 0xc12813f1 // bfdot za.s[x8, 1], { z31.h-z0.h }, z8.h\n"
+ "7:" // Unpadded: 1 priming loads
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
+ "ld1w { z31.s }, p1/Z, [x13]\n"
+ ".inst 0x658aabef // bfcvt z15.h, p2/M, z31.s\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ "ld1w { z16.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aaa0f // bfcvtnt z15.h, p2/M, z16.s\n"
+ "ld1w { z16.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aaa10 // bfcvt z16.h, p2/M, z16.s\n"
+ "ld1w { z17.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aaa30 // bfcvtnt z16.h, p2/M, z17.s\n"
+ ".inst 0xc12a11f0 // bfdot za.s[x8, 0], { z15.h-z16.h }, z10.h\n"
+ "ld1w { z22.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aaad1 // bfcvt z17.h, p2/M, z22.s\n"
+ ".inst 0xc12b11f1 // bfdot za.s[x8, 1], { z15.h-z16.h }, z11.h\n"
+ "ld1w { z18.s }, p1/Z, [x20]\n"
+ ".inst 0x648aaa51 // bfcvtnt z17.h, p2/M, z18.s\n"
+ ".inst 0xc12511f2 // bfdot za.s[x8, 2], { z15.h-z16.h }, z5.h\n"
+ ".inst 0xc12611f3 // bfdot za.s[x8, 3], { z15.h-z16.h }, z6.h\n"
+ ".inst 0xc1241210 // bfdot za.s[x8, 0], { z16.h-z17.h }, z4.h\n"
+ ".inst 0xc1291211 // bfdot za.s[x8, 1], { z16.h-z17.h }, z9.h\n"
+ ".inst 0xc12c1212 // bfdot za.s[x8, 2], { z16.h-z17.h }, z12.h\n"
+ ".inst 0xc1281213 // bfdot za.s[x8, 3], { z16.h-z17.h }, z8.h\n"
+ "8:" // Unpadded: 0 priming loads
+ "cbz x14, 16f\n"
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
+ "ld1w { z16.s }, p1/Z, [x13]\n"
+ ".inst 0x658aaa16 // bfcvt z22.h, p2/M, z16.s\n"
+ "sub x14, x14, #0x1\n"
+ "ld1w { z16.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "sub x11, x11, #0x1\n"
+ ".inst 0x648aaa16 // bfcvtnt z22.h, p2/M, z16.s\n"
+ "ld1w { z0.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aa817 // bfcvt z23.h, p2/M, z0.s\n"
+ "cmp x14, x11\n"
+ "ld1w { z24.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "csel x21, x14, x11, LT\n"  // x21 = min(remaining input cols, remaining output cols)
+ ".inst 0x648aab17 // bfcvtnt z23.h, p2/M, z24.s\n"
+ "ld1w { z0.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aa818 // bfcvt z24.h, p2/M, z0.s\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ "ld1w { z16.s }, p1/Z, [x20]\n"
+ ".inst 0x648aaa18 // bfcvtnt z24.h, p2/M, z16.s\n"
+ "sub x11, x11, x21\n"
+ "cbz x21, 15f\n"
+ "9:" // Unpadded: Main loop
+ ".inst 0xc12112d0 // bfdot za.s[x8, 0], { z22.h-z23.h }, z1.h\n"
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
+ "ld1w { z0.s }, p1/Z, [x13]\n"
+ "subs x21, x21, #0x1\n"
+ ".inst 0xc12e12d1 // bfdot za.s[x8, 1], { z22.h-z23.h }, z14.h\n"
+ "ld1w { z20.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ "ld1w { z19.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc12212f0 // bfdot za.s[x8, 0], { z23.h-z24.h }, z2.h\n"
+ ".inst 0xc12712f1 // bfdot za.s[x8, 1], { z23.h-z24.h }, z7.h\n"
+ "ld1w { z18.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc12a12d2 // bfdot za.s[x8, 2], { z22.h-z23.h }, z10.h\n"
+ "ld1w { z17.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc12b12d3 // bfdot za.s[x8, 3], { z22.h-z23.h }, z11.h\n"
+ "ld1w { z28.s }, p1/Z, [x20]\n"
+ ".inst 0xc12512d4 // bfdot za.s[x8, 4], { z22.h-z23.h }, z5.h\n"
+ ".inst 0xc12612d5 // bfdot za.s[x8, 5], { z22.h-z23.h }, z6.h\n"
+ ".inst 0x658aa816 // bfcvt z22.h, p2/M, z0.s\n"
+ ".inst 0x648aaa96 // bfcvtnt z22.h, p2/M, z20.s\n"
+ ".inst 0xc12412f2 // bfdot za.s[x8, 2], { z23.h-z24.h }, z4.h\n"
+ ".inst 0xc12912f3 // bfdot za.s[x8, 3], { z23.h-z24.h }, z9.h\n"
+ ".inst 0xc12c12f4 // bfdot za.s[x8, 4], { z23.h-z24.h }, z12.h\n"
+ ".inst 0xc12812f5 // bfdot za.s[x8, 5], { z23.h-z24.h }, z8.h\n"
+ ".inst 0x658aaa77 // bfcvt z23.h, p2/M, z19.s\n"
+ ".inst 0x658aaa38 // bfcvt z24.h, p2/M, z17.s\n"
+ ".inst 0xc0060810 // mova { z16.d-z17.d }, za.d[x8, #0]\n"  // drain two finished ZA slices for this output column
+ ".inst 0x648aaa57 // bfcvtnt z23.h, p2/M, z18.s\n"
+ ".inst 0x648aab98 // bfcvtnt z24.h, p2/M, z28.s\n"
+ ".inst 0xc0060832 // mova { z18.d-z19.d }, za.d[x8, #1]\n"
+ "add x8, x8, #0x2\n"
+ ".inst 0xc1adcb30 // fclamp { z16.s-z19.s }, z25.s, z13.s\n"  // apply activation clamp [act_min, act_max]
+ "st1w { z16.s }, p1, [x9]\n"
+ "add x9, x9, x27, LSL #2\n"
+ "st1w { z18.s }, p1, [x28]\n"
+ "add x28, x28, x26, LSL #2\n"
+ ".inst 0xc0040b44 // mova za.d[x8, #4], { z26.d-z27.d }\n"  // re-seed the freed slices with bias
+ "st1w { z17.s }, p1, [x25]\n"
+ "add x25, x25, x23, LSL #2\n"
+ ".inst 0xc0040b45 // mova za.d[x8, #5], { z26.d-z27.d }\n"
+ "st1w { z19.s }, p1, [x24]\n"
+ "add x24, x24, x22, LSL #2\n"
+ "bgt 9b\n"
+ "b 15f\n"
+ "10:" // Padded
+ "cbz x10, 13f\n"
+ "cmp x10, #0x1\n"
+ "sub x14, x14, x10\n"
+ "beq 12f\n"
+ "11:" // Padded: 2 priming loads
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"  // p0 = channel predicate masked by row-validity bit w12 of p8
+ "ld1w { z16.s }, p0/Z, [x13]\n"
+ ".inst 0x658aaa14 // bfcvt z20.h, p2/M, z16.s\n"
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
+ ".inst 0x648aaa14 // bfcvtnt z20.h, p2/M, z16.s\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
+ ".inst 0x658aaa15 // bfcvt z21.h, p2/M, z16.s\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
+ ".inst 0x648aaa15 // bfcvtnt z21.h, p2/M, z16.s\n"
+ "mov x12, #0x4\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc1251290 // bfdot za.s[x8, 0], { z20.h-z21.h }, z5.h\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z23.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aaaf6 // bfcvt z22.h, p2/M, z23.s\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
+ ".inst 0x648aaa16 // bfcvtnt z22.h, p2/M, z16.s\n"
+ ".inst 0xc1261291 // bfdot za.s[x8, 1], { z20.h-z21.h }, z6.h\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ ".inst 0xc12c12b0 // bfdot za.s[x8, 0], { z21.h-z22.h }, z12.h\n"
+ ".inst 0xc12812b1 // bfdot za.s[x8, 1], { z21.h-z22.h }, z8.h\n"
+ "12:" // Padded: 1 priming loads
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z16.s }, p0/Z, [x13]\n"
+ ".inst 0x658aaa13 // bfcvt z19.h, p2/M, z16.s\n"
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
+ ".inst 0x648aaa13 // bfcvtnt z19.h, p2/M, z16.s\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
+ ".inst 0x658aaa14 // bfcvt z20.h, p2/M, z16.s\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
+ ".inst 0x648aaa14 // bfcvtnt z20.h, p2/M, z16.s\n"
+ "mov x12, #0x4\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc12a1270 // bfdot za.s[x8, 0], { z19.h-z20.h }, z10.h\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z15.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aa9f5 // bfcvt z21.h, p2/M, z15.s\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
+ ".inst 0x648aaa15 // bfcvtnt z21.h, p2/M, z16.s\n"
+ ".inst 0xc12b1271 // bfdot za.s[x8, 1], { z19.h-z20.h }, z11.h\n"
+ ".inst 0xc1251272 // bfdot za.s[x8, 2], { z19.h-z20.h }, z5.h\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ ".inst 0xc1261273 // bfdot za.s[x8, 3], { z19.h-z20.h }, z6.h\n"
+ ".inst 0xc1241290 // bfdot za.s[x8, 0], { z20.h-z21.h }, z4.h\n"
+ ".inst 0xc1291291 // bfdot za.s[x8, 1], { z20.h-z21.h }, z9.h\n"
+ ".inst 0xc12c1292 // bfdot za.s[x8, 2], { z20.h-z21.h }, z12.h\n"
+ ".inst 0xc1281293 // bfdot za.s[x8, 3], { z20.h-z21.h }, z8.h\n"
+ "13:" // Padded: 0 priming loads
+ "cbz x14, 16f\n"
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z16.s }, p0/Z, [x13]\n"
+ ".inst 0x658aaa16 // bfcvt z22.h, p2/M, z16.s\n"
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
+ ".inst 0x648aaa16 // bfcvtnt z22.h, p2/M, z16.s\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
+ ".inst 0x658aaa17 // bfcvt z23.h, p2/M, z16.s\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
+ ".inst 0x648aaa17 // bfcvtnt z23.h, p2/M, z16.s\n"
+ "mov x12, #0x4\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aaa18 // bfcvt z24.h, p2/M, z16.s\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
+ "sub x14, x14, #0x1\n"
+ ".inst 0x648aaa18 // bfcvtnt z24.h, p2/M, z16.s\n"
+ "sub x11, x11, #0x1\n"
+ "cmp x14, x11\n"
+ "csel x21, x14, x11, LT\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ "sub x11, x11, x21\n"
+ "cbz x21, 15f\n"
+ "14:" // Padded: Main loop
+ "mov x12, #0x0\n"
+ ".inst 0xc12112d0 // bfdot za.s[x8, 0], { z22.h-z23.h }, z1.h\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z20.s }, p0/Z, [x13]\n"
+ ".inst 0xc12e12d1 // bfdot za.s[x8, 1], { z22.h-z23.h }, z14.h\n"
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z19.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z17.s }, p0/Z, [x20]\n"
+ ".inst 0xc12212f0 // bfdot za.s[x8, 0], { z23.h-z24.h }, z2.h\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ ".inst 0xc12712f1 // bfdot za.s[x8, 1], { z23.h-z24.h }, z7.h\n"
+ "ld1w { z18.s }, p0/Z, [x20]\n"
+ "mov x12, #0x4\n"
+ ".inst 0xc12a12d2 // bfdot za.s[x8, 2], { z22.h-z23.h }, z10.h\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc12b12d3 // bfdot za.s[x8, 3], { z22.h-z23.h }, z11.h\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc12512d4 // bfdot za.s[x8, 4], { z22.h-z23.h }, z5.h\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z15.s }, p0/Z, [x20]\n"
+ "subs x21, x21, #0x1\n"
+ ".inst 0xc12612d5 // bfdot za.s[x8, 5], { z22.h-z23.h }, z6.h\n"
+ ".inst 0x658aaa96 // bfcvt z22.h, p2/M, z20.s\n"
+ ".inst 0x648aaa76 // bfcvtnt z22.h, p2/M, z19.s\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ ".inst 0xc12412f2 // bfdot za.s[x8, 2], { z23.h-z24.h }, z4.h\n"
+ ".inst 0xc12912f3 // bfdot za.s[x8, 3], { z23.h-z24.h }, z9.h\n"
+ ".inst 0xc12c12f4 // bfdot za.s[x8, 4], { z23.h-z24.h }, z12.h\n"
+ ".inst 0xc12812f5 // bfdot za.s[x8, 5], { z23.h-z24.h }, z8.h\n"
+ ".inst 0x658aaa37 // bfcvt z23.h, p2/M, z17.s\n"
+ ".inst 0x658aaa18 // bfcvt z24.h, p2/M, z16.s\n"
+ ".inst 0xc0060810 // mova { z16.d-z17.d }, za.d[x8, #0]\n"
+ ".inst 0x648aaa57 // bfcvtnt z23.h, p2/M, z18.s\n"
+ ".inst 0x648aa9f8 // bfcvtnt z24.h, p2/M, z15.s\n"
+ ".inst 0xc0060832 // mova { z18.d-z19.d }, za.d[x8, #1]\n"
+ "add x8, x8, #0x2\n"
+ ".inst 0xc1adcb30 // fclamp { z16.s-z19.s }, z25.s, z13.s\n"
+ "st1w { z16.s }, p1, [x9]\n"
+ "add x9, x9, x27, LSL #2\n"
+ "st1w { z18.s }, p1, [x28]\n"
+ "add x28, x28, x26, LSL #2\n"
+ ".inst 0xc0040b44 // mova za.d[x8, #4], { z26.d-z27.d }\n"
+ "st1w { z17.s }, p1, [x25]\n"
+ "add x25, x25, x23, LSL #2\n"
+ ".inst 0xc0040b45 // mova za.d[x8, #5], { z26.d-z27.d }\n"
+ "st1w { z19.s }, p1, [x24]\n"
+ "add x24, x24, x22, LSL #2\n"
+ "bgt 14b\n"
+ "15:" // Main loop tail
+ ".inst 0xc12112d0 // bfdot za.s[x8, 0], { z22.h-z23.h }, z1.h\n"
+ ".inst 0xc12e12d1 // bfdot za.s[x8, 1], { z22.h-z23.h }, z14.h\n"
+ ".inst 0xc12212f0 // bfdot za.s[x8, 0], { z23.h-z24.h }, z2.h\n"
+ ".inst 0xc12712f1 // bfdot za.s[x8, 1], { z23.h-z24.h }, z7.h\n"
+ ".inst 0xc12a12d2 // bfdot za.s[x8, 2], { z22.h-z23.h }, z10.h\n"
+ ".inst 0xc12b12d3 // bfdot za.s[x8, 3], { z22.h-z23.h }, z11.h\n"
+ ".inst 0xc12512d4 // bfdot za.s[x8, 4], { z22.h-z23.h }, z5.h\n"
+ ".inst 0xc12612d5 // bfdot za.s[x8, 5], { z22.h-z23.h }, z6.h\n"
+ ".inst 0xc0060810 // mova { z16.d-z17.d }, za.d[x8, #0]\n"
+ ".inst 0xc0060832 // mova { z18.d-z19.d }, za.d[x8, #1]\n"
+ ".inst 0xc1adcb30 // fclamp { z16.s-z19.s }, z25.s, z13.s\n"
+ "st1w { z16.s }, p1, [x9]\n"
+ "add x9, x9, x27, LSL #2\n"
+ ".inst 0xc12412f2 // bfdot za.s[x8, 2], { z23.h-z24.h }, z4.h\n"
+ "st1w { z18.s }, p1, [x28]\n"
+ "add x28, x28, x26, LSL #2\n"
+ ".inst 0xc12912f3 // bfdot za.s[x8, 3], { z23.h-z24.h }, z9.h\n"
+ "st1w { z17.s }, p1, [x25]\n"
+ "add x25, x25, x23, LSL #2\n"
+ ".inst 0xc12c12f4 // bfdot za.s[x8, 4], { z23.h-z24.h }, z12.h\n"
+ "st1w { z19.s }, p1, [x24]\n"
+ "add x24, x24, x22, LSL #2\n"
+ ".inst 0xc12812f5 // bfdot za.s[x8, 5], { z23.h-z24.h }, z8.h\n"
+ "add x8, x8, #0x2\n"
+ ".inst 0xc0040b44 // mova za.d[x8, #4], { z26.d-z27.d }\n"
+ ".inst 0xc0040b45 // mova za.d[x8, #5], { z26.d-z27.d }\n"
+ "16:" // Main loop skip tail
+ "cbz x11, 18f\n"
+ "17:" // Right padding loop
+ ".inst 0xc006081c // mova { z28.d-z29.d }, za.d[x8, #0]\n"  // flush remaining accumulated columns past the valid input
+ "subs x11, x11, #0x1\n"
+ ".inst 0xc006083e // mova { z30.d-z31.d }, za.d[x8, #1]\n"
+ "add x8, x8, #0x2\n"
+ ".inst 0xc1adcb3c // fclamp { z28.s-z31.s }, z25.s, z13.s\n"
+ "st1w { z28.s }, p1, [x9]\n"
+ "add x9, x9, x27, LSL #2\n"
+ "st1w { z30.s }, p1, [x28]\n"
+ "add x28, x28, x26, LSL #2\n"
+ ".inst 0xc0040b44 // mova za.d[x8, #4], { z26.d-z27.d }\n"
+ "st1w { z29.s }, p1, [x25]\n"
+ "add x25, x25, x23, LSL #2\n"
+ ".inst 0xc0040b45 // mova za.d[x8, #5], { z26.d-z27.d }\n"
+ "st1w { z31.s }, p1, [x24]\n"
+ "add x24, x24, x22, LSL #2\n"
+ "bgt 17b\n"
+ "18:" // End
+ "ldr x20, [%x[args], %[offsetof_Args_weights]]\n"  // epilogue: advance weights/input/output pointers in the Args block for the next channel block
+ "incb x20, ALL, MUL #9\n"
+ "str x20, [%x[args], %[offsetof_Args_weights]]\n"
+ "incw x15\n"
+ "ldr x21, [%x[args], %[offsetof_Args_ld_in_vl]]\n"
+ "whilelt p1.s, x15, x16\n"  // refresh channel predicate; drives the b.any loop-back below
+ "ldr x20, [%x[args], %[offsetof_Args_inptr]]\n"
+ "add x20, x20, x21, LSL #2\n"
+ "str x20, [%x[args], %[offsetof_Args_inptr]]\n"
+ "ldr x25, [%x[args], %[offsetof_Args_outptrs]]\n"
+ "ldr x24, [%x[args], %[offsetof_Args_ld_out_vls]]\n"
+ "ldp x23, x22, [x25, #0x0]\n"
+ "ldp x21, x20, [x24, #0x0]\n"
+ "add x23, x23, x21, LSL #2\n"
+ "add x22, x22, x20, LSL #2\n"
+ "stp x23, x22, [x25, #0x0]\n"
+ "ldp x23, x22, [x25, #0x10]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x23, x23, x21, LSL #2\n"
+ "add x22, x22, x20, LSL #2\n"
+ "stp x23, x22, [x25, #0x10]\n"
+ "b.any 1b\n"  // loop while any channel remains active
+ ".inst 0xd503467f // SMSTOP\n"  // leave streaming mode / disable ZA
+ :
+ : [args] "r" (&args), [ld_in_col] "r" (ld_in_col), [ld_in_row] "r" (ld_in_row), [offsetof_Args_bias] "I" (offsetof(Args, bias)), [offsetof_Args_clamp_max] "I" (offsetof(Args, clamp_max)), [offsetof_Args_clamp_min] "I" (offsetof(Args, clamp_min)), [offsetof_Args_current_channel] "I" (offsetof(Args, current_channel)), [offsetof_Args_inptr] "I" (offsetof(Args, inptr)), [offsetof_Args_input_cols] "I" (offsetof(Args, input_cols)), [offsetof_Args_ld_in_vl] "I" (offsetof(Args, ld_in_vl)), [offsetof_Args_ld_out_cols] "I" (offsetof(Args, ld_out_cols)), [offsetof_Args_ld_out_vls] "I" (offsetof(Args, ld_out_vls)), [offsetof_Args_n_channels] "I" (offsetof(Args, n_channels)), [offsetof_Args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_Args_output_cols] "I" (offsetof(Args, output_cols)), [offsetof_Args_pad_bottom] "I" (offsetof(Args, pad_bottom)), [offsetof_Args_pad_left] "I" (offsetof(Args, pad_left)), [offsetof_Args_pad_top] "I" (offsetof(Args, pad_top)), [offsetof_Args_weights] "I" (offsetof(Args, weights))
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME2)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32bf16fp32_planar_3x3_s2_4rows_dot_za.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32bf16fp32_planar_3x3_s2_4rows_dot_za.hpp
new file mode 100644
index 0000000000..e685884762
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32bf16fp32_planar_3x3_s2_4rows_dot_za.hpp
@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_conv/depthwise/depthwise_planar.hpp"
+
+namespace arm_conv {
+namespace depthwise {
+
+void sme2_fp32bf16fp32_planar_3x3_s2_4rows_dot_za_impl(  // SME2 planar depthwise kernel, 3x3, stride 2, 4 output rows/pass; defined in the matching generic.cpp
+ const float *inptr,  // first input element for this tile
+ size_t ld_in_row,  // input stride between rows, in elements
+ size_t ld_in_col,  // input stride between columns, in elements
+ size_t ld_in_vl,  // input stride between vector-length channel blocks, in elements
+ unsigned int pad_top,  // implicit zero-padding rows above the input
+ unsigned int valid_input_rows,  // number of readable input rows
+ unsigned int pad_left,  // implicit zero-padding columns left of the input
+ unsigned int valid_input_cols,  // number of readable input columns
+ const float *weights,  // packed 3x3 kernel weights
+ const float *bias,  // per-channel bias; may be nullptr
+ float **outptrs,  // one output pointer per output row
+ const size_t *outlds,  // per-row output column strides
+ const size_t *outvllds,  // per-row output vector-length strides
+ unsigned int output_cols,  // number of output columns to produce
+ unsigned int start_channel,  // first channel index handled by this call
+ unsigned int valid_channels,  // total channel count (loop bound)
+ float act_min,  // activation clamp lower bound
+ float act_max  // activation clamp upper bound
+);
+
+class sme2_fp32bf16fp32_planar_3x3_s2_4rows_dot_za : public PlanarStrategy<float, float>  // Strategy wrapper exposing the SME2 s2 kernel to the planar depthwise framework
+{
+ using Parent = PlanarStrategy<float, float>;  // fp32 input, fp32 output (bf16 used internally by the kernel)
+
+ public:
+ using return_type = float;
+ constexpr static auto output_rows = 4u;  // rows produced per kernel invocation
+ constexpr static auto kernel_rows = 3u, kernel_cols = 3u;  // 3x3 depthwise filter
+ constexpr static auto stride_rows = 2u, stride_cols = 2u;  // stride-2 variant
+ constexpr static auto vl_type = arm_gemm::VLType::SME;  // vector length governed by SME streaming mode
+
+ sme2_fp32bf16fp32_planar_3x3_s2_4rows_dot_za(const CPUInfo *)  // CPUInfo unused: geometry is fixed for this specialisation
+ : Parent(kernel_rows, kernel_cols, stride_rows, stride_cols, output_rows, vl_type)
+ {
+ }
+
+ typename Parent::KernelType get_kernel(void) const override  // hand the framework the raw assembly entry point
+ {
+ return sme2_fp32bf16fp32_planar_3x3_s2_4rows_dot_za_impl;
+ }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32bf16fp32_planar_3x3_s2_4rows_dot_za/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32bf16fp32_planar_3x3_s2_4rows_dot_za/generic.cpp
new file mode 100644
index 0000000000..a3b9ca402a
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32bf16fp32_planar_3x3_s2_4rows_dot_za/generic.cpp
@@ -0,0 +1,763 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if defined(ARM_COMPUTE_ENABLE_SME2)
+
+#include <algorithm>
+#include <cstddef>
+
+namespace arm_conv {
+namespace depthwise {
+
+void sme2_fp32bf16fp32_planar_3x3_s2_4rows_dot_za_impl(
+ const float *inptr,
+ size_t ld_in_row,
+ size_t ld_in_col,
+ size_t ld_in_vl,
+ unsigned int pad_top,
+ unsigned int valid_input_rows,
+ unsigned int pad_left,
+ unsigned int valid_input_cols,
+ const float *weights,
+ const float *bias,
+ float **outptrs,
+ const size_t *outlds,
+ const size_t *outvllds,
+ unsigned int output_cols,
+ unsigned int start_channel,
+ unsigned int valid_channels,
+ float act_min,
+ float act_max
+)
+{
+ struct Args
+ {
+ const float *inptr;
+ size_t ld_in_vl;
+ long unsigned int pad_top, pad_bottom, pad_left;
+ const float *weights;
+ const float *bias;
+ long unsigned int input_cols, output_cols;
+ float **outptrs;
+ const size_t *ld_out_cols;
+ const size_t *ld_out_vls;
+ long unsigned int current_channel, n_channels;
+ float clamp_min, clamp_max;
+ };
+
+ Args args = { inptr, ld_in_vl, pad_top, 9u - std::min(9u, pad_top + valid_input_rows), pad_left, weights, bias, valid_input_cols, output_cols, outptrs, outlds, outvllds, start_channel, valid_channels, act_min, act_max };
+
+ __asm__ __volatile__(
+ "ldr x7, [%x[args], %[offsetof_Args_pad_bottom]]\n"
+ "mov x20, #0x9\n"
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "sub x20, x20, x7\n"
+ "ldr x17, [%x[args], %[offsetof_Args_pad_top]]\n"
+ "ptrue p2.b\n"
+ "ld1rw { z4.s }, p2/Z, [%x[args], %[offsetof_Args_clamp_min]]\n"
+ "ldr x16, [%x[args], %[offsetof_Args_n_channels]]\n"
+ "whilelt p1.s, XZR, x16\n"
+ "whilelt p9.s, XZR, x20\n"
+ "ld1rw { z1.s }, p2/Z, [%x[args], %[offsetof_Args_clamp_max]]\n"
+ "whilelt p8.s, XZR, x17\n"
+ "eor p8.b, p2/Z, p8.b, p9.b\n"
+ "ldr x15, [%x[args], %[offsetof_Args_current_channel]]\n"
+ "1:" // Channel loop
+ "ldr x20, [%x[args], %[offsetof_Args_bias]]\n"
+ "fmov z24.s, #0x0\n"
+ "cbz x20, 2f\n"
+ "ld1w { z24.s }, p1/Z, [x20, x15, LSL #2]\n"
+ "2:" // Load bias: Done
+ "ldr x21, [%x[args], %[offsetof_Args_weights]]\n"
+ "mov x20, x21\n"
+ "ld1w { z18.s }, p2/Z, [x20]\n"
+ "incb x20, ALL, MUL #3\n"
+ "incb x21\n"
+ "ld1w { z23.s }, p2/Z, [x20]\n"
+ "incb x20, ALL, MUL #3\n"
+ ".inst 0x658aaa4e // bfcvt z14.h, p2/M, z18.s\n"
+ "ld1w { z6.s }, p2/Z, [x20]\n"
+ "mov x20, x21\n"
+ ".inst 0x648aaaee // bfcvtnt z14.h, p2/M, z23.s\n"
+ "incb x21\n"
+ "ld1w { z28.s }, p2/Z, [x20]\n"
+ "incb x20, ALL, MUL #3\n"
+ ".inst 0x658aa8c3 // bfcvt z3.h, p2/M, z6.s\n"
+ ".inst 0x658aab88 // bfcvt z8.h, p2/M, z28.s\n"
+ "ld1w { z10.s }, p2/Z, [x20]\n"
+ "incb x20, ALL, MUL #3\n"
+ "ldr x14, [%x[args], %[offsetof_Args_input_cols]]\n"
+ ".inst 0x648aa948 // bfcvtnt z8.h, p2/M, z10.s\n"
+ "ld1w { z2.s }, p2/Z, [x20]\n"
+ "mov x21, x21\n"
+ ".inst 0x658aa847 // bfcvt z7.h, p2/M, z2.s\n"
+ "ldr x13, [%x[args], %[offsetof_Args_inptr]]\n"
+ "ld1w { z9.s }, p2/Z, [x21]\n"
+ "incb x21, ALL, MUL #3\n"
+ ".inst 0x658aa920 // bfcvt z0.h, p2/M, z9.s\n"
+ "sub x20, x14, #0x1\n"
+ "ld1w { z6.s }, p2/Z, [x21]\n"
+ "incb x21, ALL, MUL #3\n"
+ "orr x23, x20, %x[ld_in_col], LSL #18\n"
+ "mov z25.d, z24.d\n"
+ "ld1w { z17.s }, p2/Z, [x21]\n"
+ "orr x23, x16, x23, LSL #20\n"
+ "mov x22, #0x9\n"
+ "mov z26.d, z24.d\n"
+ "add x21, x17, x7\n"
+ "lsl x20, %x[ld_in_row], #0x2\n"
+ "mov z27.d, z24.d\n"
+ ".inst 0x648aa8c0 // bfcvtnt z0.h, p2/M, z6.s\n"
+ ".inst 0x658aaa26 // bfcvt z6.h, p2/M, z17.s\n"
+ "mov x8, #0x0\n"
+ "ldr x11, [%x[args], %[offsetof_Args_output_cols]]\n"
+ "lsl x23, x23, #0x2\n"
+ "sub x22, x22, x21\n"
+ "madd x20, x20, x17, x13\n"
+ "3:" // Issue prefetches
+ "subs x22, x22, #0x1\n"
+ ".inst 0xf8b74a9c // rprfm pldstrm, x23, [x20]\n"
+ "add x20, x20, %x[ld_in_col], LSL #2\n"
+ "bgt 3b\n"
+ "ldr x23, [%x[args], %[offsetof_Args_outptrs]]\n"
+ "lsl x20, %x[ld_in_row], #0x2\n"
+ "msub x13, x17, x20, x13\n"
+ ".inst 0xc0040f00 // mova za.d[x8, #0], { z24.d-z27.d }\n"
+ "ldr x20, [%x[args], %[offsetof_Args_ld_out_cols]]\n"
+ ".inst 0xc0040f01 // mova za.d[x8, #1], { z24.d-z27.d }\n"
+ "mov x22, #0x2\n"
+ "ldp x10, x9, [x23], #0x10\n"
+ ".inst 0xc0040f02 // mova za.d[x8, #2], { z24.d-z27.d }\n"
+ "ldp x28, x27, [x20], #0x10\n"
+ "ldr x21, [%x[args], %[offsetof_Args_pad_left]]\n"
+ "ldp x26, x25, [x23], #0x10\n"
+ "ldp x24, x23, [x20], #0x10\n"
+ "cbz x21, 5f\n"
+ "cmp x21, x22\n"
+ "csel x20, x21, x22, LT\n"
+ "sub x21, x21, x20\n"
+ "sub x22, x22, x20\n"
+ "cbz x21, 5f\n"
+ ".inst 0xc0060c10 // mova { z16.d-z19.d }, za.d[x8, #0]\n"
+ "and x22, x21, #0x1\n"
+ "add x21, x21, #0x1\n"
+ ".inst 0xc1a1c890 // fclamp { z16.s-z19.s }, z4.s, z1.s\n"
+ "lsr x21, x21, #0x1\n"
+ "sub x11, x11, x21\n"
+ "4:" // Left padding
+ "subs x21, x21, #0x1\n"
+ "st1w { z16.s }, p1, [x10]\n"
+ "add x10, x10, x28, LSL #2\n"
+ "st1w { z17.s }, p1, [x9]\n"
+ "add x9, x9, x27, LSL #2\n"
+ "st1w { z18.s }, p1, [x26]\n"
+ "add x26, x26, x24, LSL #2\n"
+ "st1w { z19.s }, p1, [x25]\n"
+ "add x25, x25, x23, LSL #2\n"
+ "bgt 4b\n"
+ "5:" // Left padding: End
+ "adds XZR, x17, x7\n"
+ "bne 10f\n"
+ "cbz x22, 8f\n"
+ "cmp x22, #0x1\n"
+ "sub x14, x14, x22\n"
+ "beq 7f\n"
+ "6:" // Unpadded: 2 priming loads
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
+ "ld1w { z18.s }, p1/Z, [x13]\n"
+ ".inst 0x658aaa53 // bfcvt z19.h, p2/M, z18.s\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ "ld1w { z12.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aa993 // bfcvtnt z19.h, p2/M, z12.s\n"
+ "ld1w { z23.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aaaf4 // bfcvt z20.h, p2/M, z23.s\n"
+ "ld1w { z2.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aa854 // bfcvtnt z20.h, p2/M, z2.s\n"
+ "ld1w { z15.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aa9f5 // bfcvt z21.h, p2/M, z15.s\n"
+ "ld1w { z22.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aaad5 // bfcvtnt z21.h, p2/M, z22.s\n"
+ "ld1w { z30.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aabd6 // bfcvt z22.h, p2/M, z30.s\n"
+ "ld1w { z12.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aa996 // bfcvtnt z22.h, p2/M, z12.s\n"
+ ".inst 0xc13e1270 // bfdot za.s[x8, 0], { z19.h-z22.h }, z14.h\n"
+ "ld1w { z31.s }, p1/Z, [x20]\n"
+ ".inst 0x658aabf7 // bfcvt z23.h, p2/M, z31.s\n"
+ ".inst 0xc1331290 // bfdot za.s[x8, 0], { z20.h-z23.h }, z3.h\n"
+ "7:" // Unpadded: 1 priming loads
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
+ "ld1w { z17.s }, p1/Z, [x13]\n"
+ ".inst 0x658aaa30 // bfcvt z16.h, p2/M, z17.s\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ "ld1w { z22.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aaad0 // bfcvtnt z16.h, p2/M, z22.s\n"
+ "ld1w { z28.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aab91 // bfcvt z17.h, p2/M, z28.s\n"
+ "ld1w { z18.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aaa51 // bfcvtnt z17.h, p2/M, z18.s\n"
+ "ld1w { z2.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aa852 // bfcvt z18.h, p2/M, z2.s\n"
+ "ld1w { z19.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aaa72 // bfcvtnt z18.h, p2/M, z19.s\n"
+ "ld1w { z2.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aa853 // bfcvt z19.h, p2/M, z2.s\n"
+ "ld1w { z23.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aaaf3 // bfcvtnt z19.h, p2/M, z23.s\n"
+ ".inst 0xc1381210 // bfdot za.s[x8, 0], { z16.h-z19.h }, z8.h\n"
+ "ld1w { z10.s }, p1/Z, [x20]\n"
+ ".inst 0x658aa954 // bfcvt z20.h, p2/M, z10.s\n"
+ ".inst 0xc1371230 // bfdot za.s[x8, 0], { z17.h-z20.h }, z7.h\n"
+ "8:" // Unpadded: 0 priming loads
+ "cmp x14, #0x2\n"
+ "blt 16f\n"
+ "add x21, x13, %x[ld_in_row], LSL #2\n"
+ "ld1w { z16.s }, p1/Z, [x13]\n"
+ ".inst 0x658aaa09 // bfcvt z9.h, p2/M, z16.s\n"
+ "sub x14, x14, #0x2\n"
+ "ld1w { z16.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ "sub x11, x11, #0x1\n"
+ ".inst 0x648aaa09 // bfcvtnt z9.h, p2/M, z16.s\n"
+ "ld1w { z16.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aaa0a // bfcvt z10.h, p2/M, z16.s\n"
+ "lsr x20, x14, #0x1\n"
+ "ld1w { z16.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ "cmp x20, x11\n"
+ ".inst 0x648aaa0a // bfcvtnt z10.h, p2/M, z16.s\n"
+ "ld1w { z16.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aaa0b // bfcvt z11.h, p2/M, z16.s\n"
+ "csel x22, x20, x11, LT\n"
+ "ld1w { z16.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aaa0b // bfcvtnt z11.h, p2/M, z16.s\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ "ld1w { z16.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aaa0c // bfcvt z12.h, p2/M, z16.s\n"
+ "and x14, x14, #0x1\n"
+ "ld1w { z16.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aaa0c // bfcvtnt z12.h, p2/M, z16.s\n"
+ "sub x11, x11, x22\n"
+ "ld1w { z16.s }, p1/Z, [x21]\n"
+ ".inst 0x658aaa0d // bfcvt z13.h, p2/M, z16.s\n"
+ "cbz x22, 15f\n"
+ "9:" // Unpadded: Main loop
+ "add x21, x13, %x[ld_in_row], LSL #2\n"
+ "ld1w { z16.s }, p1/Z, [x13]\n"
+ ".inst 0xc1301130 // bfdot za.s[x8, 0], { z9.h-z12.h }, z0.h\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ "ld1w { z15.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc13e1131 // bfdot za.s[x8, 1], { z9.h-z12.h }, z14.h\n"
+ ".inst 0x658aaa09 // bfcvt z9.h, p2/M, z16.s\n"
+ "ld1w { z18.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc1361150 // bfdot za.s[x8, 0], { z10.h-z13.h }, z6.h\n"
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
+ "ld1w { z17.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc1331151 // bfdot za.s[x8, 1], { z10.h-z13.h }, z3.h\n"
+ ".inst 0x658aaa4a // bfcvt z10.h, p2/M, z18.s\n"
+ "ld1w { z30.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aabcb // bfcvt z11.h, p2/M, z30.s\n"
+ ".inst 0x648aa9e9 // bfcvtnt z9.h, p2/M, z15.s\n"
+ "ld1w { z19.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aaa2a // bfcvtnt z10.h, p2/M, z17.s\n"
+ ".inst 0x648aaa6b // bfcvtnt z11.h, p2/M, z19.s\n"
+ "ld1w { z16.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aaa0c // bfcvt z12.h, p2/M, z16.s\n"
+ ".inst 0xc0060c10 // mova { z16.d-z19.d }, za.d[x8, #0]\n"
+ "ld1w { z2.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aa84c // bfcvtnt z12.h, p2/M, z2.s\n"
+ "add x8, x8, #0x1\n"
+ "ld1w { z29.s }, p1/Z, [x13]\n"
+ ".inst 0xc1381130 // bfdot za.s[x8, 0], { z9.h-z12.h }, z8.h\n"
+ ".inst 0x658aaba9 // bfcvt z9.h, p2/M, z29.s\n"
+ "subs x22, x22, #0x1\n"
+ "ld1w { z22.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc1a1c890 // fclamp { z16.s-z19.s }, z4.s, z1.s\n"
+ "st1w { z16.s }, p1, [x10]\n"
+ "ld1w { z16.s }, p1/Z, [x21]\n"
+ ".inst 0x658aaa0d // bfcvt z13.h, p2/M, z16.s\n"
+ ".inst 0xc1371150 // bfdot za.s[x8, 0], { z10.h-z13.h }, z7.h\n"
+ "add x10, x10, x28, LSL #2\n"
+ "ld1w { z28.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aab8a // bfcvt z10.h, p2/M, z28.s\n"
+ "st1w { z17.s }, p1, [x9]\n"
+ "ld1w { z31.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "add x9, x9, x27, LSL #2\n"
+ "st1w { z18.s }, p1, [x26]\n"
+ "ld1w { z16.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aaa0b // bfcvt z11.h, p2/M, z16.s\n"
+ "add x26, x26, x24, LSL #2\n"
+ "ld1w { z17.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "st1w { z19.s }, p1, [x25]\n"
+ "add x25, x25, x23, LSL #2\n"
+ "ld1w { z16.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aaa0c // bfcvt z12.h, p2/M, z16.s\n"
+ ".inst 0xc0040f02 // mova za.d[x8, #2], { z24.d-z27.d }\n"
+ "ld1w { z16.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aaac9 // bfcvtnt z9.h, p2/M, z22.s\n"
+ ".inst 0x648aabea // bfcvtnt z10.h, p2/M, z31.s\n"
+ "ld1w { z31.s }, p1/Z, [x20]\n"
+ ".inst 0x648aaa2b // bfcvtnt z11.h, p2/M, z17.s\n"
+ ".inst 0x648aaa0c // bfcvtnt z12.h, p2/M, z16.s\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ ".inst 0x658aabed // bfcvt z13.h, p2/M, z31.s\n"
+ "bgt 9b\n"
+ "b 15f\n"
+ "10:" // Padded
+ "cbz x22, 13f\n"
+ "cmp x22, #0x1\n"
+ "sub x14, x14, x22\n"
+ "beq 12f\n"
+ "11:" // Padded: 2 priming loads
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z16.s }, p0/Z, [x13]\n"
+ ".inst 0x658aaa09 // bfcvt z9.h, p2/M, z16.s\n"
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z18.s }, p0/Z, [x20]\n"
+ ".inst 0x648aaa49 // bfcvtnt z9.h, p2/M, z18.s\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z12.s }, p0/Z, [x20]\n"
+ ".inst 0x658aa98a // bfcvt z10.h, p2/M, z12.s\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1w { z12.s }, p0/Z, [x20]\n"
+ ".inst 0x648aa98a // bfcvtnt z10.h, p2/M, z12.s\n"
+ "mov x12, #0x4\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z18.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aaa4b // bfcvt z11.h, p2/M, z18.s\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aaa0b // bfcvtnt z11.h, p2/M, z16.s\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aaa0c // bfcvt z12.h, p2/M, z16.s\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
+ "mov x12, #0x8\n"
+ ".inst 0x648aaa0c // bfcvtnt z12.h, p2/M, z16.s\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
+ ".inst 0x658aaa0d // bfcvt z13.h, p2/M, z16.s\n"
+ ".inst 0xc13e1130 // bfdot za.s[x8, 0], { z9.h-z12.h }, z14.h\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ ".inst 0xc1331150 // bfdot za.s[x8, 0], { z10.h-z13.h }, z3.h\n"
+ "12:" // Padded: 1 priming loads
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z16.s }, p0/Z, [x13]\n"
+ ".inst 0x658aaa0f // bfcvt z15.h, p2/M, z16.s\n"
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
+ ".inst 0x648aaa0f // bfcvtnt z15.h, p2/M, z16.s\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
+ ".inst 0x658aaa10 // bfcvt z16.h, p2/M, z16.s\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1w { z19.s }, p0/Z, [x20]\n"
+ ".inst 0x648aaa70 // bfcvtnt z16.h, p2/M, z19.s\n"
+ "mov x12, #0x4\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z13.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aa9b1 // bfcvt z17.h, p2/M, z13.s\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z12.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aa991 // bfcvtnt z17.h, p2/M, z12.s\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z9.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aa932 // bfcvt z18.h, p2/M, z9.s\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1w { z11.s }, p0/Z, [x20]\n"
+ "mov x12, #0x8\n"
+ ".inst 0x648aa972 // bfcvtnt z18.h, p2/M, z11.s\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z21.s }, p0/Z, [x20]\n"
+ ".inst 0x658aaab3 // bfcvt z19.h, p2/M, z21.s\n"
+ ".inst 0xc13811f0 // bfdot za.s[x8, 0], { z15.h-z18.h }, z8.h\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ ".inst 0xc1371210 // bfdot za.s[x8, 0], { z16.h-z19.h }, z7.h\n"
+ "13:" // Padded: 0 priming loads
+ "cmp x14, #0x2\n"
+ "blt 16f\n"
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z16.s }, p0/Z, [x13]\n"
+ ".inst 0x658aaa09 // bfcvt z9.h, p2/M, z16.s\n"
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
+ ".inst 0x648aaa09 // bfcvtnt z9.h, p2/M, z16.s\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
+ ".inst 0x658aaa0a // bfcvt z10.h, p2/M, z16.s\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
+ ".inst 0x648aaa0a // bfcvtnt z10.h, p2/M, z16.s\n"
+ "mov x12, #0x4\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aaa0b // bfcvt z11.h, p2/M, z16.s\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aaa0b // bfcvtnt z11.h, p2/M, z16.s\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aaa0c // bfcvt z12.h, p2/M, z16.s\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "mov x12, #0x8\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
+ ".inst 0x648aaa0c // bfcvtnt z12.h, p2/M, z16.s\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
+ ".inst 0x658aaa0d // bfcvt z13.h, p2/M, z16.s\n"
+ "sub x14, x14, #0x2\n"
+ "sub x11, x11, #0x1\n"
+ "lsr x20, x14, #0x1\n"
+ "cmp x20, x11\n"
+ "csel x21, x20, x11, LT\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ "and x14, x14, #0x1\n"
+ "sub x11, x11, x21\n"
+ "cbz x21, 15f\n"
+ "14:" // Padded: Main loop
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z18.s }, p0/Z, [x13]\n"
+ ".inst 0xc1301130 // bfdot za.s[x8, 0], { z9.h-z12.h }, z0.h\n"
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z17.s }, p0/Z, [x20]\n"
+ ".inst 0xc13e1131 // bfdot za.s[x8, 1], { z9.h-z12.h }, z14.h\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
+ ".inst 0xc1361150 // bfdot za.s[x8, 0], { z10.h-z13.h }, z6.h\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1w { z19.s }, p0/Z, [x20]\n"
+ ".inst 0xc1331151 // bfdot za.s[x8, 1], { z10.h-z13.h }, z3.h\n"
+ "mov x12, #0x4\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aaa49 // bfcvt z9.h, p2/M, z18.s\n"
+ ".inst 0x658aaa0a // bfcvt z10.h, p2/M, z16.s\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z2.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aa84b // bfcvt z11.h, p2/M, z2.s\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z15.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aaa29 // bfcvtnt z9.h, p2/M, z17.s\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z28.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aab8c // bfcvt z12.h, p2/M, z28.s\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "mov x12, #0x8\n"
+ "ld1w { z17.s }, p0/Z, [x20]\n"
+ ".inst 0x648aaa6a // bfcvtnt z10.h, p2/M, z19.s\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z13.s }, p0/Z, [x20]\n"
+ ".inst 0x648aa9eb // bfcvtnt z11.h, p2/M, z15.s\n"
+ "mov x12, #0x0\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ ".inst 0x648aaa2c // bfcvtnt z12.h, p2/M, z17.s\n"
+ ".inst 0xc0060c1c // mova { z28.d-z31.d }, za.d[x8, #0]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z16.s }, p0/Z, [x13]\n"
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aa9ad // bfcvt z13.h, p2/M, z13.s\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z21.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc1a1c89c // fclamp { z28.s-z31.s }, z4.s, z1.s\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z17.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "st1w { z28.s }, p1, [x10]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "mov x12, #0x4\n"
+ "ld1w { z20.s }, p0/Z, [x20]\n"
+ "st1w { z29.s }, p1, [x9]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z19.s }, p0/Z, [x20]\n"
+ "st1w { z30.s }, p1, [x26]\n"
+ "add x8, x8, #0x1\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc1381130 // bfdot za.s[x8, 0], { z9.h-z12.h }, z8.h\n"
+ ".inst 0x658aaa09 // bfcvt z9.h, p2/M, z16.s\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z18.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc1371150 // bfdot za.s[x8, 0], { z10.h-z13.h }, z7.h\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aaa2a // bfcvt z10.h, p2/M, z17.s\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "mov x12, #0x8\n"
+ "ld1w { z17.s }, p0/Z, [x20]\n"
+ ".inst 0x658aaa6b // bfcvt z11.h, p2/M, z19.s\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ ".inst 0x658aaa0c // bfcvt z12.h, p2/M, z16.s\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
+ "subs x21, x21, #0x1\n"
+ "add x10, x10, x28, LSL #2\n"
+ "st1w { z31.s }, p1, [x25]\n"
+ ".inst 0xc0040f02 // mova za.d[x8, #2], { z24.d-z27.d }\n"
+ "add x9, x9, x27, LSL #2\n"
+ "add x26, x26, x24, LSL #2\n"
+ ".inst 0x648aaaa9 // bfcvtnt z9.h, p2/M, z21.s\n"
+ ".inst 0x648aaa8a // bfcvtnt z10.h, p2/M, z20.s\n"
+ "add x25, x25, x23, LSL #2\n"
+ ".inst 0x648aaa4b // bfcvtnt z11.h, p2/M, z18.s\n"
+ ".inst 0x648aaa2c // bfcvtnt z12.h, p2/M, z17.s\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ ".inst 0x658aaa0d // bfcvt z13.h, p2/M, z16.s\n"
+ "bgt 14b\n"
+ "15:" // Main loop tail
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z17.s }, p0/Z, [x13]\n"
+ ".inst 0xc1301130 // bfdot za.s[x8, 0], { z9.h-z12.h }, z0.h\n"
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z2.s }, p0/Z, [x20]\n"
+ ".inst 0xc13e1131 // bfdot za.s[x8, 1], { z9.h-z12.h }, z14.h\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
+ ".inst 0xc1361150 // bfdot za.s[x8, 0], { z10.h-z13.h }, z6.h\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1w { z23.s }, p0/Z, [x20]\n"
+ ".inst 0xc1331151 // bfdot za.s[x8, 1], { z10.h-z13.h }, z3.h\n"
+ "mov x12, #0x4\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aaa32 // bfcvt z18.h, p2/M, z17.s\n"
+ ".inst 0x658aaa13 // bfcvt z19.h, p2/M, z16.s\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aaa14 // bfcvt z20.h, p2/M, z16.s\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z15.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aa852 // bfcvtnt z18.h, p2/M, z2.s\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aaa15 // bfcvt z21.h, p2/M, z16.s\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
+ "mov x12, #0x8\n"
+ ".inst 0x648aaaf3 // bfcvtnt z19.h, p2/M, z23.s\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ ".inst 0x648aa9f4 // bfcvtnt z20.h, p2/M, z15.s\n"
+ ".inst 0x648aaa15 // bfcvtnt z21.h, p2/M, z16.s\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
+ ".inst 0xc0060c1c // mova { z28.d-z31.d }, za.d[x8, #0]\n"
+ "add x8, x8, #0x1\n"
+ ".inst 0x658aaa16 // bfcvt z22.h, p2/M, z16.s\n"
+ ".inst 0xc1381250 // bfdot za.s[x8, 0], { z18.h-z21.h }, z8.h\n"
+ ".inst 0xc1a1c89c // fclamp { z28.s-z31.s }, z4.s, z1.s\n"
+ "st1w { z28.s }, p1, [x10]\n"
+ "add x10, x10, x28, LSL #2\n"
+ "st1w { z29.s }, p1, [x9]\n"
+ "add x9, x9, x27, LSL #2\n"
+ ".inst 0xc0040f02 // mova za.d[x8, #2], { z24.d-z27.d }\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ "st1w { z30.s }, p1, [x26]\n"
+ "add x26, x26, x24, LSL #2\n"
+ ".inst 0xc1371270 // bfdot za.s[x8, 0], { z19.h-z22.h }, z7.h\n"
+ "st1w { z31.s }, p1, [x25]\n"
+ "add x25, x25, x23, LSL #2\n"
+ "16:" // Main loop skip tail
+ "cbz x14, 17f\n" // Skip remainder inputs
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z16.s }, p0/Z, [x13]\n"
+ ".inst 0x658aaa0f // bfcvt z15.h, p2/M, z16.s\n"
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
+ ".inst 0x648aaa0f // bfcvtnt z15.h, p2/M, z16.s\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
+ ".inst 0x658aaa10 // bfcvt z16.h, p2/M, z16.s\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1w { z2.s }, p0/Z, [x20]\n"
+ ".inst 0x648aa850 // bfcvtnt z16.h, p2/M, z2.s\n"
+ "mov x12, #0x4\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z10.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aa951 // bfcvt z17.h, p2/M, z10.s\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z30.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aabd1 // bfcvtnt z17.h, p2/M, z30.s\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z19.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aaa72 // bfcvt z18.h, p2/M, z19.s\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1w { z19.s }, p0/Z, [x20]\n"
+ "mov x12, #0x8\n"
+ ".inst 0x648aaa72 // bfcvtnt z18.h, p2/M, z19.s\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z19.s }, p0/Z, [x20]\n"
+ ".inst 0x658aaa73 // bfcvt z19.h, p2/M, z19.s\n"
+ ".inst 0xc13011f0 // bfdot za.s[x8, 0], { z15.h-z18.h }, z0.h\n"
+ "sub x11, x11, #0x1\n"
+ ".inst 0xc1361210 // bfdot za.s[x8, 0], { z16.h-z19.h }, z6.h\n"
+ ".inst 0xc13e11f1 // bfdot za.s[x8, 1], { z15.h-z18.h }, z14.h\n"
+ ".inst 0xc0060c08 // mova { z8.d-z11.d }, za.d[x8, #0]\n"
+ ".inst 0xc1a1c888 // fclamp { z8.s-z11.s }, z4.s, z1.s\n"
+ "st1w { z8.s }, p1, [x10]\n"
+ "add x10, x10, x28, LSL #2\n"
+ ".inst 0xc1331211 // bfdot za.s[x8, 1], { z16.h-z19.h }, z3.h\n"
+ "add x8, x8, #0x1\n"
+ "st1w { z9.s }, p1, [x9]\n"
+ "add x9, x9, x27, LSL #2\n"
+ "st1w { z10.s }, p1, [x26]\n"
+ "add x26, x26, x24, LSL #2\n"
+ ".inst 0xc0040f02 // mova za.d[x8, #2], { z24.d-z27.d }\n"
+ "st1w { z11.s }, p1, [x25]\n"
+ "add x25, x25, x23, LSL #2\n"
+ "17:" // Tail input: End
+ "cbz x11, 19f\n"
+ "18:" // Right padding loop
+ ".inst 0xc0060c08 // mova { z8.d-z11.d }, za.d[x8, #0]\n"
+ "add x8, x8, #0x1\n"
+ "subs x11, x11, #0x1\n"
+ ".inst 0xc1a1c888 // fclamp { z8.s-z11.s }, z4.s, z1.s\n"
+ "st1w { z8.s }, p1, [x10]\n"
+ "add x10, x10, x28, LSL #2\n"
+ ".inst 0xc0040f02 // mova za.d[x8, #2], { z24.d-z27.d }\n"
+ "st1w { z9.s }, p1, [x9]\n"
+ "add x9, x9, x27, LSL #2\n"
+ "st1w { z10.s }, p1, [x26]\n"
+ "add x26, x26, x24, LSL #2\n"
+ "st1w { z11.s }, p1, [x25]\n"
+ "add x25, x25, x23, LSL #2\n"
+ "bgt 18b\n"
+ "19:" // End
+ "ldr x20, [%x[args], %[offsetof_Args_weights]]\n"
+ "incb x20, ALL, MUL #9\n"
+ "str x20, [%x[args], %[offsetof_Args_weights]]\n"
+ "incw x15\n"
+ "ldr x21, [%x[args], %[offsetof_Args_ld_in_vl]]\n"
+ "whilelt p1.s, x15, x16\n"
+ "ldr x20, [%x[args], %[offsetof_Args_inptr]]\n"
+ "add x20, x20, x21, LSL #2\n"
+ "str x20, [%x[args], %[offsetof_Args_inptr]]\n"
+ "ldr x25, [%x[args], %[offsetof_Args_outptrs]]\n"
+ "ldr x24, [%x[args], %[offsetof_Args_ld_out_vls]]\n"
+ "ldp x23, x22, [x25, #0x0]\n"
+ "ldp x21, x20, [x24, #0x0]\n"
+ "add x23, x23, x21, LSL #2\n"
+ "add x22, x22, x20, LSL #2\n"
+ "stp x23, x22, [x25, #0x0]\n"
+ "ldp x23, x22, [x25, #0x10]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x23, x23, x21, LSL #2\n"
+ "add x22, x22, x20, LSL #2\n"
+ "stp x23, x22, [x25, #0x10]\n"
+ "b.any 1b\n"
+ ".inst 0xd503467f // SMSTOP\n"
+ :
+ : [args] "r" (&args), [ld_in_col] "r" (ld_in_col), [ld_in_row] "r" (ld_in_row), [offsetof_Args_bias] "I" (offsetof(Args, bias)), [offsetof_Args_clamp_max] "I" (offsetof(Args, clamp_max)), [offsetof_Args_clamp_min] "I" (offsetof(Args, clamp_min)), [offsetof_Args_current_channel] "I" (offsetof(Args, current_channel)), [offsetof_Args_inptr] "I" (offsetof(Args, inptr)), [offsetof_Args_input_cols] "I" (offsetof(Args, input_cols)), [offsetof_Args_ld_in_vl] "I" (offsetof(Args, ld_in_vl)), [offsetof_Args_ld_out_cols] "I" (offsetof(Args, ld_out_cols)), [offsetof_Args_ld_out_vls] "I" (offsetof(Args, ld_out_vls)), [offsetof_Args_n_channels] "I" (offsetof(Args, n_channels)), [offsetof_Args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_Args_output_cols] "I" (offsetof(Args, output_cols)), [offsetof_Args_pad_bottom] "I" (offsetof(Args, pad_bottom)), [offsetof_Args_pad_left] "I" (offsetof(Args, pad_left)), [offsetof_Args_pad_top] "I" (offsetof(Args, pad_top)), [offsetof_Args_weights] "I" (offsetof(Args, weights))
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME2)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32bf16fp32_planar_5x5_s1_4rows_dot_za.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32bf16fp32_planar_5x5_s1_4rows_dot_za.hpp
new file mode 100644
index 0000000000..5215ccaf39
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32bf16fp32_planar_5x5_s1_4rows_dot_za.hpp
@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_conv/depthwise/depthwise_planar.hpp"
+
+namespace arm_conv {
+namespace depthwise {
+
+// Forward declaration of the generated SME2 kernel (defined in the
+// accompanying generic.cpp).  Planar depthwise convolution, 5x5 kernel,
+// stride 1, producing 4 output rows per invocation; fp32 in/out with
+// bf16 intermediate dot products (the implementation converts via
+// BFCVT and accumulates with BFDOT into the ZA array).
+void sme2_fp32bf16fp32_planar_5x5_s1_4rows_dot_za_impl(
+  const float *inptr,            // Base pointer to the input tensor tile
+  size_t ld_in_row,              // Input stride between rows (in elements)
+  size_t ld_in_col,              // Input stride between columns (in elements)
+  size_t ld_in_vl,               // Input stride between vector-length channel blocks
+  unsigned int pad_top,          // Rows of implicit zero padding above the input
+  unsigned int valid_input_rows, // Number of readable input rows
+  unsigned int pad_left,         // Columns of implicit zero padding left of the input
+  unsigned int valid_input_cols, // Number of readable input columns
+  const float *weights,          // Packed 5x5 depthwise weights (fp32; converted to bf16 internally)
+  const float *bias,             // Per-channel bias, or nullptr for no bias
+  float **outptrs,               // One output row pointer per output row produced
+  const size_t *outlds,          // Per-output-row column strides
+  const size_t *outvllds,        // Per-output-row vector-length block strides
+  unsigned int output_cols,      // Number of output columns to compute
+  unsigned int start_channel,    // Index of the first channel handled by this call
+  unsigned int valid_channels,   // Number of valid channels from start_channel
+  float act_min,                 // Activation clamp lower bound
+  float act_max                  // Activation clamp upper bound
+);
+
+// Strategy wrapper exposing the generated SME2 kernel to the planar
+// depthwise framework.  The class only records the kernel's static shape
+// parameters and hands back the implementation function pointer; all real
+// work happens in sme2_fp32bf16fp32_planar_5x5_s1_4rows_dot_za_impl.
+class sme2_fp32bf16fp32_planar_5x5_s1_4rows_dot_za : public PlanarStrategy<float, float>
+{
+  using Parent = PlanarStrategy<float, float>;
+
+  public:
+  using return_type = float;
+  // Static kernel geometry: 5x5 window, unit stride, 4 output rows per call.
+  constexpr static auto output_rows = 4u;
+  constexpr static auto kernel_rows = 5u, kernel_cols = 5u;
+  constexpr static auto stride_rows = 1u, stride_cols = 1u;
+  // Vector length is determined by the SME streaming vector length at runtime.
+  constexpr static auto vl_type = arm_gemm::VLType::SME;
+
+  // CPUInfo is unused: kernel selection happens before construction.
+  sme2_fp32bf16fp32_planar_5x5_s1_4rows_dot_za(const CPUInfo *)
+  : Parent(kernel_rows, kernel_cols, stride_rows, stride_cols, output_rows, vl_type)
+  {
+  }
+
+  // Return the generated assembly kernel entry point.
+  typename Parent::KernelType get_kernel(void) const override
+  {
+    return sme2_fp32bf16fp32_planar_5x5_s1_4rows_dot_za_impl;
+  }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32bf16fp32_planar_5x5_s1_4rows_dot_za/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32bf16fp32_planar_5x5_s1_4rows_dot_za/generic.cpp
new file mode 100644
index 0000000000..b72042558d
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32bf16fp32_planar_5x5_s1_4rows_dot_za/generic.cpp
@@ -0,0 +1,1151 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if defined(ARM_COMPUTE_ENABLE_SME2)
+
+#include <algorithm>
+#include <cstddef>
+
+namespace arm_conv {
+namespace depthwise {
+
+void sme2_fp32bf16fp32_planar_5x5_s1_4rows_dot_za_impl(
+ const float *inptr,
+ size_t ld_in_row,
+ size_t ld_in_col,
+ size_t ld_in_vl,
+ unsigned int pad_top,
+ unsigned int valid_input_rows,
+ unsigned int pad_left,
+ unsigned int valid_input_cols,
+ const float *weights,
+ const float *bias,
+ float **outptrs,
+ const size_t *outlds,
+ const size_t *outvllds,
+ unsigned int output_cols,
+ unsigned int start_channel,
+ unsigned int valid_channels,
+ float act_min,
+ float act_max
+)
+{
+ struct Args
+ {
+ const float *inptr;
+ size_t ld_in_vl;
+ long unsigned int pad_top, pad_bottom, pad_left;
+ const float *weights;
+ const float *bias;
+ long unsigned int input_cols, output_cols;
+ float **outptrs;
+ const size_t *ld_out_cols;
+ const size_t *ld_out_vls;
+ long unsigned int current_channel, n_channels;
+ float clamp_min, clamp_max;
+ };
+
+ Args args = { inptr, ld_in_vl, pad_top, 8u - std::min(8u, pad_top + valid_input_rows), pad_left, weights, bias, valid_input_cols, output_cols, outptrs, outlds, outvllds, start_channel, valid_channels, act_min, act_max };
+
+ __asm__ __volatile__(
+ "ldr x4, [%x[args], %[offsetof_Args_pad_bottom]]\n"
+ "mov x20, #0x8\n"
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "sub x20, x20, x4\n"
+ "ldr x6, [%x[args], %[offsetof_Args_pad_top]]\n"
+ "ptrue p2.b\n"
+ ".inst 0x25207812 // ptrue pn10.b\n"
+ "ld1rw { z29.s }, p2/Z, [%x[args], %[offsetof_Args_clamp_min]]\n"
+ "ldr x7, [%x[args], %[offsetof_Args_n_channels]]\n"
+ "whilelt p1.s, XZR, x7\n"
+ "whilelt p9.s, XZR, x20\n"
+ "ld1rw { z28.s }, p2/Z, [%x[args], %[offsetof_Args_clamp_max]]\n"
+ "whilelt p8.s, XZR, x6\n"
+ "addvl SP, SP, #-30\n"
+ "ldr x17, [%x[args], %[offsetof_Args_current_channel]]\n"
+ "eor p8.b, p2/Z, p8.b, p9.b\n"
+ "1:" // Channel loop
+ "ldr x21, [%x[args], %[offsetof_Args_bias]]\n"
+ "fmov z30.s, #0x0\n"
+ "cbz x21, 2f\n"
+ "ld1w { z30.s }, p1/Z, [x21, x17, LSL #2]\n"
+ "2:" // Load bias: Done
+ "ldr x21, [%x[args], %[offsetof_Args_weights]]\n"
+ "mov x20, x21\n"
+ "ld1w { z12.s }, p2/Z, [x20]\n"
+ "incb x20, ALL, MUL #5\n"
+ "ld1w { z24.s }, p2/Z, [x20]\n"
+ "incb x20, ALL, MUL #5\n"
+ "fmov z11.s, #0x0\n"
+ "incb x21\n"
+ "ld1w { z3.s }, p2/Z, [x20]\n"
+ "incb x20, ALL, MUL #5\n"
+ ".inst 0x658aa99a // bfcvt z26.h, p2/M, z12.s\n"
+ ".inst 0x658aab10 // bfcvt z16.h, p2/M, z24.s\n"
+ "ld1w { z20.s }, p2/Z, [x20]\n"
+ "incb x20, ALL, MUL #5\n"
+ "addvl x24, SP, #30\n"
+ ".inst 0x648aa98b // bfcvtnt z11.h, p2/M, z12.s\n"
+ "ld1w { z25.s }, p2/Z, [x20]\n"
+ "mov x20, x21\n"
+ ".inst 0x658aa875 // bfcvt z21.h, p2/M, z3.s\n"
+ "addvl x24, x24, #-6\n"
+ "ld1w { z6.s }, p2/Z, [x20]\n"
+ ".inst 0x658aaa9b // bfcvt z27.h, p2/M, z20.s\n"
+ "incb x20, ALL, MUL #5\n"
+ "st1h { z11.h }, p2, [x24]\n"
+ ".inst 0x648aab1a // bfcvtnt z26.h, p2/M, z24.s\n"
+ "ld1w { z14.s }, p2/Z, [x20]\n"
+ "incb x20, ALL, MUL #5\n"
+ "fmov z11.s, #0x0\n"
+ "st1h { z26.h }, p2, [x24, #1, MUL VL]\n"
+ ".inst 0x648aa870 // bfcvtnt z16.h, p2/M, z3.s\n"
+ "ld1w { z19.s }, p2/Z, [x20]\n"
+ "incb x20, ALL, MUL #5\n"
+ ".inst 0x658aa8c9 // bfcvt z9.h, p2/M, z6.s\n"
+ ".inst 0x648aaa95 // bfcvtnt z21.h, p2/M, z20.s\n"
+ "incb x21\n"
+ "ld1w { z12.s }, p2/Z, [x20]\n"
+ "incb x20, ALL, MUL #5\n"
+ "st1h { z16.h }, p2, [x24, #2, MUL VL]\n"
+ ".inst 0x648aab3b // bfcvtnt z27.h, p2/M, z25.s\n"
+ ".inst 0x658aab37 // bfcvt z23.h, p2/M, z25.s\n"
+ "ld1w { z5.s }, p2/Z, [x20]\n"
+ ".inst 0x658aa9c8 // bfcvt z8.h, p2/M, z14.s\n"
+ "mov x23, x21\n"
+ "st1h { z21.h }, p2, [x24, #3, MUL VL]\n"
+ ".inst 0x648aa8cb // bfcvtnt z11.h, p2/M, z6.s\n"
+ ".inst 0x658aaa79 // bfcvt z25.h, p2/M, z19.s\n"
+ "ld1w { z4.s }, p2/Z, [x23]\n"
+ "incb x23, ALL, MUL #5\n"
+ "st1h { z27.h }, p2, [x24, #4, MUL VL]\n"
+ ".inst 0x648aa9c9 // bfcvtnt z9.h, p2/M, z14.s\n"
+ ".inst 0x658aa991 // bfcvt z17.h, p2/M, z12.s\n"
+ "incb x21\n"
+ "st1h { z23.h }, p2, [x24, #5, MUL VL]\n"
+ "addvl x24, x24, #-6\n"
+ "ld1w { z26.s }, p2/Z, [x23]\n"
+ "incb x23, ALL, MUL #5\n"
+ "st1h { z11.h }, p2, [x24]\n"
+ "fmov z2.s, #0x0\n"
+ ".inst 0x648aaa68 // bfcvtnt z8.h, p2/M, z19.s\n"
+ "ldr x25, [%x[args], %[offsetof_Args_input_cols]]\n"
+ "st1h { z9.h }, p2, [x24, #1, MUL VL]\n"
+ "ld1w { z27.s }, p2/Z, [x23]\n"
+ "incb x23, ALL, MUL #5\n"
+ ".inst 0x658aa893 // bfcvt z19.h, p2/M, z4.s\n"
+ "st1h { z8.h }, p2, [x24, #2, MUL VL]\n"
+ ".inst 0x648aa999 // bfcvtnt z25.h, p2/M, z12.s\n"
+ "ld1w { z7.s }, p2/Z, [x23]\n"
+ "incb x23, ALL, MUL #5\n"
+ ".inst 0x658aab4e // bfcvt z14.h, p2/M, z26.s\n"
+ ".inst 0x648aa8b1 // bfcvtnt z17.h, p2/M, z5.s\n"
+ "st1h { z25.h }, p2, [x24, #3, MUL VL]\n"
+ "ldr x16, [%x[args], %[offsetof_Args_inptr]]\n"
+ ".inst 0x658aa8ab // bfcvt z11.h, p2/M, z5.s\n"
+ "ld1w { z18.s }, p2/Z, [x23]\n"
+ "mov x20, x21\n"
+ ".inst 0x648aa882 // bfcvtnt z2.h, p2/M, z4.s\n"
+ ".inst 0x658aab66 // bfcvt z6.h, p2/M, z27.s\n"
+ "ld1w { z15.s }, p2/Z, [x20]\n"
+ "incb x20, ALL, MUL #5\n"
+ "st1h { z17.h }, p2, [x24, #4, MUL VL]\n"
+ "st1h { z11.h }, p2, [x24, #5, MUL VL]\n"
+ "addvl x24, x24, #-6\n"
+ ".inst 0x648aab53 // bfcvtnt z19.h, p2/M, z26.s\n"
+ ".inst 0x658aa8fa // bfcvt z26.h, p2/M, z7.s\n"
+ "ld1w { z11.s }, p2/Z, [x20]\n"
+ "incb x20, ALL, MUL #5\n"
+ "st1h { z2.h }, p2, [x24]\n"
+ ".inst 0x648aab6e // bfcvtnt z14.h, p2/M, z27.s\n"
+ "ld1w { z4.s }, p2/Z, [x20]\n"
+ "fmov z21.s, #0x0\n"
+ "st1h { z19.h }, p2, [x24, #1, MUL VL]\n"
+ "incb x20, ALL, MUL #5\n"
+ ".inst 0x658aa9ea // bfcvt z10.h, p2/M, z15.s\n"
+ "st1h { z14.h }, p2, [x24, #2, MUL VL]\n"
+ ".inst 0x648aa8e6 // bfcvtnt z6.h, p2/M, z7.s\n"
+ "incb x21\n"
+ "ld1w { z17.s }, p2/Z, [x20]\n"
+ "incb x20, ALL, MUL #5\n"
+ ".inst 0x658aa973 // bfcvt z19.h, p2/M, z11.s\n"
+ "st1h { z6.h }, p2, [x24, #3, MUL VL]\n"
+ ".inst 0x648aaa5a // bfcvtnt z26.h, p2/M, z18.s\n"
+ ".inst 0x658aaa45 // bfcvt z5.h, p2/M, z18.s\n"
+ "ld1w { z12.s }, p2/Z, [x20]\n"
+ "mov x21, x21\n"
+ ".inst 0x658aa897 // bfcvt z23.h, p2/M, z4.s\n"
+ ".inst 0x648aa9f5 // bfcvtnt z21.h, p2/M, z15.s\n"
+ "ld1w { z24.s }, p2/Z, [x21]\n"
+ "incb x21, ALL, MUL #5\n"
+ ".inst 0x648aa96a // bfcvtnt z10.h, p2/M, z11.s\n"
+ "ld1w { z3.s }, p2/Z, [x21]\n"
+ "incb x21, ALL, MUL #5\n"
+ "st1h { z26.h }, p2, [x24, #4, MUL VL]\n"
+ ".inst 0x648aa893 // bfcvtnt z19.h, p2/M, z4.s\n"
+ ".inst 0x658aaa30 // bfcvt z16.h, p2/M, z17.s\n"
+ "ld1w { z2.s }, p2/Z, [x21]\n"
+ "incb x21, ALL, MUL #5\n"
+ ".inst 0x648aaa37 // bfcvtnt z23.h, p2/M, z17.s\n"
+ "ld1w { z26.s }, p2/Z, [x21]\n"
+ "st1h { z5.h }, p2, [x24, #5, MUL VL]\n"
+ "addvl x24, x24, #-6\n"
+ "st1h { z21.h }, p2, [x24]\n"
+ ".inst 0x648aa990 // bfcvtnt z16.h, p2/M, z12.s\n"
+ "incb x21, ALL, MUL #5\n"
+ "fmov z8.s, #0x0\n"
+ "st1h { z10.h }, p2, [x24, #1, MUL VL]\n"
+ ".inst 0x658aab04 // bfcvt z4.h, p2/M, z24.s\n"
+ ".inst 0x658aa985 // bfcvt z5.h, p2/M, z12.s\n"
+ "sub x20, x25, #0x1\n"
+ "st1h { z19.h }, p2, [x24, #2, MUL VL]\n"
+ ".inst 0x658aa871 // bfcvt z17.h, p2/M, z3.s\n"
+ "ld1w { z25.s }, p2/Z, [x21]\n"
+ "orr x23, x20, %x[ld_in_col], LSL #18\n"
+ "st1h { z23.h }, p2, [x24, #3, MUL VL]\n"
+ ".inst 0x658aa857 // bfcvt z23.h, p2/M, z2.s\n"
+ "orr x23, x7, x23, LSL #20\n"
+ "mov x22, #0x8\n"
+ "st1h { z16.h }, p2, [x24, #4, MUL VL]\n"
+ ".inst 0x658aab4e // bfcvt z14.h, p2/M, z26.s\n"
+ "add x21, x6, x4\n"
+ "lsl x20, %x[ld_in_row], #0x2\n"
+ "st1h { z5.h }, p2, [x24, #5, MUL VL]\n"
+ "addvl x24, x24, #-6\n"
+ "mov z31.d, z30.d\n"
+ ".inst 0x648aab08 // bfcvtnt z8.h, p2/M, z24.s\n"
+ "st1h { z8.h }, p2, [x24]\n"
+ ".inst 0x648aa864 // bfcvtnt z4.h, p2/M, z3.s\n"
+ ".inst 0x648aa851 // bfcvtnt z17.h, p2/M, z2.s\n"
+ "mov x11, #0x0\n"
+ "st1h { z4.h }, p2, [x24, #1, MUL VL]\n"
+ ".inst 0x648aab57 // bfcvtnt z23.h, p2/M, z26.s\n"
+ ".inst 0x648aab2e // bfcvtnt z14.h, p2/M, z25.s\n"
+ "mov x8, #0x8\n"
+ "st1h { z17.h }, p2, [x24, #2, MUL VL]\n"
+ ".inst 0x658aab26 // bfcvt z6.h, p2/M, z25.s\n"
+ "ldr x15, [%x[args], %[offsetof_Args_output_cols]]\n"
+ "lsl x23, x23, #0x2\n"
+ "st1h { z23.h }, p2, [x24, #3, MUL VL]\n"
+ "sub x22, x22, x21\n"
+ "madd x20, x20, x6, x16\n"
+ "st1h { z14.h }, p2, [x24, #4, MUL VL]\n"
+ "st1h { z6.h }, p2, [x24, #5, MUL VL]\n"
+ "3:" // Issue prefetches
+ "subs x22, x22, #0x1\n"
+ ".inst 0xf8b74a9c // rprfm pldstrm, x23, [x20]\n"
+ "add x20, x20, %x[ld_in_col], LSL #2\n"
+ "bgt 3b\n"
+ "ldr x23, [%x[args], %[offsetof_Args_outptrs]]\n"
+ "lsl x20, %x[ld_in_row], #0x2\n"
+ "msub x16, x6, x20, x16\n"
+ ".inst 0xc0046bc0 // mova za.d[x11, #0], { z30.d-z31.d }\n"
+ "ldr x20, [%x[args], %[offsetof_Args_ld_out_cols]]\n"
+ ".inst 0xc0046bc1 // mova za.d[x11, #1], { z30.d-z31.d }\n"
+ "mov x22, #0x4\n"
+ "ldp x14, x13, [x23], #0x10\n"
+ ".inst 0xc0046bc2 // mova za.d[x11, #2], { z30.d-z31.d }\n"
+ "ldp x5, x10, [x20], #0x10\n"
+ ".inst 0xc0046bc3 // mova za.d[x11, #3], { z30.d-z31.d }\n"
+ "ldr x21, [%x[args], %[offsetof_Args_pad_left]]\n"
+ ".inst 0xc0046bc4 // mova za.d[x11, #4], { z30.d-z31.d }\n"
+ "ldp x9, x28, [x23], #0x10\n"
+ ".inst 0xc0046bc5 // mova za.d[x11, #5], { z30.d-z31.d }\n"
+ "ldp x27, x26, [x20], #0x10\n"
+ ".inst 0xc0046bc6 // mova za.d[x11, #6], { z30.d-z31.d }\n"
+ ".inst 0xc0046bc7 // mova za.d[x11, #7], { z30.d-z31.d }\n"
+ ".inst 0xc0040bc0 // mova za.d[x8, #0], { z30.d-z31.d }\n"
+ ".inst 0xc0040bc1 // mova za.d[x8, #1], { z30.d-z31.d }\n"
+ "cbz x21, 5f\n"
+ "cmp x21, x22\n"
+ "csel x20, x21, x22, LT\n"
+ "sub x21, x21, x20\n"
+ "sub x22, x22, x20\n"
+ "cbz x21, 5f\n"
+ ".inst 0xc0066804 // mova { z4.d-z5.d }, za.d[x11, #0]\n"
+ "sub x15, x15, x21\n"
+ ".inst 0xc0066826 // mova { z6.d-z7.d }, za.d[x11, #1]\n"
+ ".inst 0xc1bccba4 // fclamp { z4.s-z7.s }, z29.s, z28.s\n"
+ "4:" // Left padding
+ "subs x21, x21, #0x1\n"
+ "st1w { z4.s }, p1, [x14]\n"
+ "add x14, x14, x5, LSL #2\n"
+ "st1w { z6.s }, p1, [x13]\n"
+ "add x13, x13, x10, LSL #2\n"
+ "st1w { z5.s }, p1, [x9]\n"
+ "add x9, x9, x27, LSL #2\n"
+ "st1w { z7.s }, p1, [x28]\n"
+ "add x28, x28, x26, LSL #2\n"
+ "bgt 4b\n"
+ "5:" // Left padding: End
+ "adds XZR, x6, x4\n"
+ "bne 12f\n"
+ "cbz x22, 10f\n"
+ "cmp x22, #0x1\n"
+ "sub x25, x25, x22\n"
+ "beq 9f\n"
+ "cmp x22, #0x2\n"
+ "beq 8f\n"
+ "cmp x22, #0x3\n"
+ "beq 7f\n"
+ "6:" // Unpadded: 4 priming loads
+ "add x21, x16, %x[ld_in_row], LSL #2\n"
+ "ld1w { z21.s }, p1/Z, [x16]\n"
+ ".inst 0x658aaab2 // bfcvt z18.h, p2/M, z21.s\n"
+ "addvl x20, SP, #24\n"
+ "ld1w { z11.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aa972 // bfcvtnt z18.h, p2/M, z11.s\n"
+ "add x16, x16, %x[ld_in_col], LSL #2\n"
+ "ld1w { z17.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aaa33 // bfcvt z19.h, p2/M, z17.s\n"
+ "ld1w { z12.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aa993 // bfcvtnt z19.h, p2/M, z12.s\n"
+ "ld1w { z7.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aa8f4 // bfcvt z20.h, p2/M, z7.s\n"
+ "ld1w { z12.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aa994 // bfcvtnt z20.h, p2/M, z12.s\n"
+ ".inst 0xa0402a8c // ld1h { z12.h-z13.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc12d7250 // bfdot za.s[x11, 0], { z18.h-z19.h }, z13.h\n"
+ "ld1w { z6.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aa8d5 // bfcvt z21.h, p2/M, z6.s\n"
+ ".inst 0xc12c7251 // bfdot za.s[x11, 1], { z18.h-z19.h }, z12.h\n"
+ ".inst 0xa0412a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc12b7270 // bfdot za.s[x11, 0], { z19.h-z20.h }, z11.h\n"
+ "ld1w { z27.s }, p1/Z, [x21]\n"
+ ".inst 0x648aab75 // bfcvtnt z21.h, p2/M, z27.s\n"
+ ".inst 0xc12a7271 // bfdot za.s[x11, 1], { z19.h-z20.h }, z10.h\n"
+ ".inst 0xa0422a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xc12b7290 // bfdot za.s[x11, 0], { z20.h-z21.h }, z11.h\n"
+ ".inst 0xc12a7291 // bfdot za.s[x11, 1], { z20.h-z21.h }, z10.h\n"
+ "7:" // Unpadded: 3 priming loads
+ "add x22, x16, %x[ld_in_row], LSL #2\n"
+ "ld1w { z6.s }, p1/Z, [x16]\n"
+ ".inst 0x658aa8d7 // bfcvt z23.h, p2/M, z6.s\n"
+ "addvl x21, SP, #18\n"
+ "ld1w { z1.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aa837 // bfcvtnt z23.h, p2/M, z1.s\n"
+ "addvl x20, SP, #24\n"
+ "ld1w { z15.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aa9f8 // bfcvt z24.h, p2/M, z15.s\n"
+ "add x16, x16, %x[ld_in_col], LSL #2\n"
+ "ld1w { z16.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aaa18 // bfcvtnt z24.h, p2/M, z16.s\n"
+ "ld1w { z1.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aa839 // bfcvt z25.h, p2/M, z1.s\n"
+ "ld1w { z9.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aa939 // bfcvtnt z25.h, p2/M, z9.s\n"
+ ".inst 0xa1402aa1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc12972f0 // bfdot za.s[x11, 0], { z23.h-z24.h }, z9.h\n"
+ "ld1w { z16.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aaa1a // bfcvt z26.h, p2/M, z16.s\n"
+ ".inst 0xc12172f1 // bfdot za.s[x11, 1], { z23.h-z24.h }, z1.h\n"
+ ".inst 0xa1402a87 // ld1h { z7.h, z15.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc12f72f2 // bfdot za.s[x11, 2], { z23.h-z24.h }, z15.h\n"
+ ".inst 0xa1412aa1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc12772f3 // bfdot za.s[x11, 3], { z23.h-z24.h }, z7.h\n"
+ "ld1w { z16.s }, p1/Z, [x22]\n"
+ ".inst 0x648aaa1a // bfcvtnt z26.h, p2/M, z16.s\n"
+ ".inst 0xc1297310 // bfdot za.s[x11, 0], { z24.h-z25.h }, z9.h\n"
+ ".inst 0xc1217311 // bfdot za.s[x11, 1], { z24.h-z25.h }, z1.h\n"
+ ".inst 0xa1412a87 // ld1h { z7.h, z15.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xa1422aa3 // ld1h { z3.h, z11.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
+ ".inst 0xc12f7312 // bfdot za.s[x11, 2], { z24.h-z25.h }, z15.h\n"
+ ".inst 0xc1277313 // bfdot za.s[x11, 3], { z24.h-z25.h }, z7.h\n"
+ ".inst 0xc12b7330 // bfdot za.s[x11, 0], { z25.h-z26.h }, z11.h\n"
+ ".inst 0xc1237331 // bfdot za.s[x11, 1], { z25.h-z26.h }, z3.h\n"
+ ".inst 0xa0422a82 // ld1h { z2.h-z3.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xc1237332 // bfdot za.s[x11, 2], { z25.h-z26.h }, z3.h\n"
+ ".inst 0xc1227333 // bfdot za.s[x11, 3], { z25.h-z26.h }, z2.h\n"
+ "8:" // Unpadded: 2 priming loads
+ "add x23, x16, %x[ld_in_row], LSL #2\n"
+ "ld1w { z24.s }, p1/Z, [x16]\n"
+ ".inst 0x658aab02 // bfcvt z2.h, p2/M, z24.s\n"
+ "addvl x22, SP, #12\n"
+ "ld1w { z16.s }, p1/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aaa02 // bfcvtnt z2.h, p2/M, z16.s\n"
+ "addvl x21, SP, #18\n"
+ "ld1w { z16.s }, p1/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aaa03 // bfcvt z3.h, p2/M, z16.s\n"
+ "addvl x20, SP, #24\n"
+ "ld1w { z16.s }, p1/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aaa03 // bfcvtnt z3.h, p2/M, z16.s\n"
+ "add x16, x16, %x[ld_in_col], LSL #2\n"
+ "ld1w { z1.s }, p1/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aa824 // bfcvt z4.h, p2/M, z1.s\n"
+ "ld1w { z19.s }, p1/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aaa64 // bfcvtnt z4.h, p2/M, z19.s\n"
+ ".inst 0xa1402ac7 // ld1h { z7.h, z15.h }, pn10.b/Z, [x22]\n"
+ ".inst 0xc12f7050 // bfdot za.s[x11, 0], { z2.h-z3.h }, z15.h\n"
+ "ld1w { z0.s }, p1/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aa805 // bfcvt z5.h, p2/M, z0.s\n"
+ ".inst 0xc1277051 // bfdot za.s[x11, 1], { z2.h-z3.h }, z7.h\n"
+ ".inst 0xa1402aa7 // ld1h { z7.h, z15.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc12f7052 // bfdot za.s[x11, 2], { z2.h-z3.h }, z15.h\n"
+ ".inst 0xa1412ac6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
+ ".inst 0xc1277053 // bfdot za.s[x11, 3], { z2.h-z3.h }, z7.h\n"
+ "ld1w { z10.s }, p1/Z, [x23]\n"
+ ".inst 0x648aa945 // bfcvtnt z5.h, p2/M, z10.s\n"
+ ".inst 0xc12e7070 // bfdot za.s[x11, 0], { z3.h-z4.h }, z14.h\n"
+ ".inst 0xa1402a87 // ld1h { z7.h, z15.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc1267071 // bfdot za.s[x11, 1], { z3.h-z4.h }, z6.h\n"
+ ".inst 0xa0412aac // ld1h { z12.h-z13.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc12f7054 // bfdot za.s[x11, 4], { z2.h-z3.h }, z15.h\n"
+ ".inst 0xa1422ac0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
+ ".inst 0xc1277055 // bfdot za.s[x11, 5], { z2.h-z3.h }, z7.h\n"
+ ".inst 0xc12d7072 // bfdot za.s[x11, 2], { z3.h-z4.h }, z13.h\n"
+ ".inst 0xc12c7073 // bfdot za.s[x11, 3], { z3.h-z4.h }, z12.h\n"
+ ".inst 0xa0412a8e // ld1h { z14.h-z15.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc1287090 // bfdot za.s[x11, 0], { z4.h-z5.h }, z8.h\n"
+ ".inst 0xc1207091 // bfdot za.s[x11, 1], { z4.h-z5.h }, z0.h\n"
+ ".inst 0xa0422aa6 // ld1h { z6.h-z7.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
+ ".inst 0xc12f7074 // bfdot za.s[x11, 4], { z3.h-z4.h }, z15.h\n"
+ ".inst 0xc12e7075 // bfdot za.s[x11, 5], { z3.h-z4.h }, z14.h\n"
+ ".inst 0xc1277092 // bfdot za.s[x11, 2], { z4.h-z5.h }, z7.h\n"
+ ".inst 0xc1267093 // bfdot za.s[x11, 3], { z4.h-z5.h }, z6.h\n"
+ ".inst 0xa1422a80 // ld1h { z0.h, z8.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xc1287094 // bfdot za.s[x11, 4], { z4.h-z5.h }, z8.h\n"
+ ".inst 0xc1207095 // bfdot za.s[x11, 5], { z4.h-z5.h }, z0.h\n"
+ "9:" // Unpadded: 1 priming loads
+ "add x24, x16, %x[ld_in_row], LSL #2\n"
+ "ld1w { z18.s }, p1/Z, [x16]\n"
+ ".inst 0x658aaa4c // bfcvt z12.h, p2/M, z18.s\n"
+ "addvl x23, SP, #6\n"
+ "ld1w { z7.s }, p1/Z, [x24]\n"
+ "add x24, x24, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aa8ec // bfcvtnt z12.h, p2/M, z7.s\n"
+ "addvl x22, SP, #12\n"
+ "ld1w { z20.s }, p1/Z, [x24]\n"
+ "add x24, x24, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aaa8d // bfcvt z13.h, p2/M, z20.s\n"
+ "addvl x21, SP, #18\n"
+ "ld1w { z0.s }, p1/Z, [x24]\n"
+ "add x24, x24, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aa80d // bfcvtnt z13.h, p2/M, z0.s\n"
+ "addvl x20, SP, #24\n"
+ "ld1w { z10.s }, p1/Z, [x24]\n"
+ "add x24, x24, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aa94e // bfcvt z14.h, p2/M, z10.s\n"
+ "add x16, x16, %x[ld_in_col], LSL #2\n"
+ "ld1w { z0.s }, p1/Z, [x24]\n"
+ "add x24, x24, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aa80e // bfcvtnt z14.h, p2/M, z0.s\n"
+ ".inst 0xa0402ae0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x23]\n"
+ ".inst 0xc1217190 // bfdot za.s[x11, 0], { z12.h-z13.h }, z1.h\n"
+ "ld1w { z17.s }, p1/Z, [x24]\n"
+ "add x24, x24, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aaa2f // bfcvt z15.h, p2/M, z17.s\n"
+ ".inst 0xc1207191 // bfdot za.s[x11, 1], { z12.h-z13.h }, z0.h\n"
+ ".inst 0xa0402aca // ld1h { z10.h-z11.h }, pn10.b/Z, [x22]\n"
+ ".inst 0xc12b7192 // bfdot za.s[x11, 2], { z12.h-z13.h }, z11.h\n"
+ ".inst 0xa0412ae0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x23, #0x2, MUL VL]\n"
+ ".inst 0xc12a7193 // bfdot za.s[x11, 3], { z12.h-z13.h }, z10.h\n"
+ "ld1w { z18.s }, p1/Z, [x24]\n"
+ ".inst 0x648aaa4f // bfcvtnt z15.h, p2/M, z18.s\n"
+ ".inst 0xc12171b0 // bfdot za.s[x11, 0], { z13.h-z14.h }, z1.h\n"
+ ".inst 0xa1402aa2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc12071b1 // bfdot za.s[x11, 1], { z13.h-z14.h }, z0.h\n"
+ ".inst 0xa0412ac6 // ld1h { z6.h-z7.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
+ ".inst 0xc12a7194 // bfdot za.s[x11, 4], { z12.h-z13.h }, z10.h\n"
+ ".inst 0xa0422aea // ld1h { z10.h-z11.h }, pn10.b/Z, [x23, #0x4, MUL VL]\n"
+ ".inst 0xc1227195 // bfdot za.s[x11, 5], { z12.h-z13.h }, z2.h\n"
+ ".inst 0xa0402a88 // ld1h { z8.h-z9.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc12771b2 // bfdot za.s[x11, 2], { z13.h-z14.h }, z7.h\n"
+ ".inst 0xc12671b3 // bfdot za.s[x11, 3], { z13.h-z14.h }, z6.h\n"
+ ".inst 0xa0412aa0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc12b71d0 // bfdot za.s[x11, 0], { z14.h-z15.h }, z11.h\n"
+ ".inst 0xc12a71d1 // bfdot za.s[x11, 1], { z14.h-z15.h }, z10.h\n"
+ ".inst 0xa1422ac2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
+ ".inst 0xc1297196 // bfdot za.s[x11, 6], { z12.h-z13.h }, z9.h\n"
+ ".inst 0xc1287197 // bfdot za.s[x11, 7], { z12.h-z13.h }, z8.h\n"
+ ".inst 0xc12171b4 // bfdot za.s[x11, 4], { z13.h-z14.h }, z1.h\n"
+ ".inst 0xc12071b5 // bfdot za.s[x11, 5], { z13.h-z14.h }, z0.h\n"
+ ".inst 0xa1412a83 // ld1h { z3.h, z11.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc12a71d2 // bfdot za.s[x11, 2], { z14.h-z15.h }, z10.h\n"
+ ".inst 0xc12271d3 // bfdot za.s[x11, 3], { z14.h-z15.h }, z2.h\n"
+ ".inst 0xa0422aa6 // ld1h { z6.h-z7.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
+ ".inst 0xc12b71b6 // bfdot za.s[x11, 6], { z13.h-z14.h }, z11.h\n"
+ ".inst 0xc12371b7 // bfdot za.s[x11, 7], { z13.h-z14.h }, z3.h\n"
+ ".inst 0xc12771d4 // bfdot za.s[x11, 4], { z14.h-z15.h }, z7.h\n"
+ ".inst 0xc12671d5 // bfdot za.s[x11, 5], { z14.h-z15.h }, z6.h\n"
+ ".inst 0xa0422a86 // ld1h { z6.h-z7.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xc12771d6 // bfdot za.s[x11, 6], { z14.h-z15.h }, z7.h\n"
+ ".inst 0xc12671d7 // bfdot za.s[x11, 7], { z14.h-z15.h }, z6.h\n"
+ "10:" // Unpadded: 0 priming loads
+ ".inst 0xa1402be6 // ld1h { z6.h, z14.h }, pn10.b/Z, [SP]\n"
+ ".inst 0xa1412be5 // ld1h { z5.h, z13.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
+ ".inst 0xa1422be4 // ld1h { z4.h, z12.h }, pn10.b/Z, [SP, #0x4, MUL VL]\n"
+ "cbz x25, 20f\n"
+ "add x20, x16, %x[ld_in_row], LSL #2\n"
+ "ld1w { z1.s }, p1/Z, [x16]\n"
+ ".inst 0x658aa834 // bfcvt z20.h, p2/M, z1.s\n"
+ "sub x25, x25, #0x1\n"
+ "ld1w { z10.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "sub x15, x15, #0x1\n"
+ ".inst 0x648aa954 // bfcvtnt z20.h, p2/M, z10.s\n"
+ "ld1w { z16.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aaa15 // bfcvt z21.h, p2/M, z16.s\n"
+ "cmp x25, x15\n"
+ "ld1w { z19.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "csel x25, x25, x15, LT\n"
+ ".inst 0x648aaa75 // bfcvtnt z21.h, p2/M, z19.s\n"
+ "ld1w { z23.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aaaf6 // bfcvt z22.h, p2/M, z23.s\n"
+ "add x16, x16, %x[ld_in_col], LSL #2\n"
+ "ld1w { z27.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aab76 // bfcvtnt z22.h, p2/M, z27.s\n"
+ "sub x15, x15, x25\n"
+ "ld1w { z15.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aa9f7 // bfcvt z23.h, p2/M, z15.s\n"
+ "ld1w { z16.s }, p1/Z, [x20]\n"
+ ".inst 0x648aaa17 // bfcvtnt z23.h, p2/M, z16.s\n"
+ "cbz x25, 19f\n"
+ "11:" // Unpadded: Main loop
+ "addvl x24, SP, #6\n"
+ ".inst 0xc12e7290 // bfdot za.s[x11, 0], { z20.h-z21.h }, z14.h\n"
+ "addvl x23, SP, #12\n"
+ "ld1w { z27.s }, p1/Z, [x16]\n"
+ ".inst 0xc1267291 // bfdot za.s[x11, 1], { z20.h-z21.h }, z6.h\n"
+ ".inst 0xa1402b01 // ld1h { z1.h, z9.h }, pn10.b/Z, [x24]\n"
+ "addvl x22, SP, #18\n"
+ "addvl x21, SP, #24\n"
+ ".inst 0xc1297292 // bfdot za.s[x11, 2], { z20.h-z21.h }, z9.h\n"
+ "add x20, x16, %x[ld_in_row], LSL #2\n"
+ "ld1w { z26.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc1217293 // bfdot za.s[x11, 3], { z20.h-z21.h }, z1.h\n"
+ ".inst 0xa1402ae6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x23]\n"
+ "subs x25, x25, #0x1\n"
+ "add x16, x16, %x[ld_in_col], LSL #2\n"
+ ".inst 0xc12d72b0 // bfdot za.s[x11, 0], { z21.h-z22.h }, z13.h\n"
+ "ld1w { z25.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc12572b1 // bfdot za.s[x11, 1], { z21.h-z22.h }, z5.h\n"
+ ".inst 0xa1412b07 // ld1h { z7.h, z15.h }, pn10.b/Z, [x24, #0x2, MUL VL]\n"
+ ".inst 0xc12e7294 // bfdot za.s[x11, 4], { z20.h-z21.h }, z14.h\n"
+ "ld1w { z24.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc1267295 // bfdot za.s[x11, 5], { z20.h-z21.h }, z6.h\n"
+ ".inst 0xa1402ac5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x22]\n"
+ ".inst 0xc12f72b2 // bfdot za.s[x11, 2], { z21.h-z22.h }, z15.h\n"
+ "ld1w { z19.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc12772b3 // bfdot za.s[x11, 3], { z21.h-z22.h }, z7.h\n"
+ ".inst 0xa1412ae6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x23, #0x2, MUL VL]\n"
+ ".inst 0xc12c72d0 // bfdot za.s[x11, 0], { z22.h-z23.h }, z12.h\n"
+ "ld1w { z18.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc12472d1 // bfdot za.s[x11, 1], { z22.h-z23.h }, z4.h\n"
+ ".inst 0xa1422b07 // ld1h { z7.h, z15.h }, pn10.b/Z, [x24, #0x4, MUL VL]\n"
+ ".inst 0xc12d7296 // bfdot za.s[x11, 6], { z20.h-z21.h }, z13.h\n"
+ "ld1w { z17.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc1257297 // bfdot za.s[x11, 7], { z20.h-z21.h }, z5.h\n"
+ ".inst 0xa1402aa4 // ld1h { z4.h, z12.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc12e72b4 // bfdot za.s[x11, 4], { z21.h-z22.h }, z14.h\n"
+ "ld1w { z16.s }, p1/Z, [x20]\n"
+ ".inst 0xc12672b5 // bfdot za.s[x11, 5], { z21.h-z22.h }, z6.h\n"
+ ".inst 0xa1412ac6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
+ ".inst 0xc12f72d2 // bfdot za.s[x11, 2], { z22.h-z23.h }, z15.h\n"
+ ".inst 0xc12772d3 // bfdot za.s[x11, 3], { z22.h-z23.h }, z7.h\n"
+ ".inst 0xa1422ae7 // ld1h { z7.h, z15.h }, pn10.b/Z, [x23, #0x4, MUL VL]\n"
+ ".inst 0xc12e72b6 // bfdot za.s[x11, 6], { z21.h-z22.h }, z14.h\n"
+ ".inst 0xc12672b7 // bfdot za.s[x11, 7], { z21.h-z22.h }, z6.h\n"
+ ".inst 0xa1412aa5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc12f72d4 // bfdot za.s[x11, 4], { z22.h-z23.h }, z15.h\n"
+ ".inst 0xc12772d5 // bfdot za.s[x11, 5], { z22.h-z23.h }, z7.h\n"
+ ".inst 0xa0422ace // ld1h { z14.h-z15.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
+ ".inst 0xc12f72d6 // bfdot za.s[x11, 6], { z22.h-z23.h }, z15.h\n"
+ ".inst 0xc12e72d7 // bfdot za.s[x11, 7], { z22.h-z23.h }, z14.h\n"
+ ".inst 0xa1422aa1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
+ ".inst 0xc12c1290 // bfdot za.s[x8, 0], { z20.h-z21.h }, z12.h\n"
+ ".inst 0xc1241291 // bfdot za.s[x8, 1], { z20.h-z21.h }, z4.h\n"
+ ".inst 0x658aab74 // bfcvt z20.h, p2/M, z27.s\n"
+ ".inst 0xa1402be6 // ld1h { z6.h, z14.h }, pn10.b/Z, [SP]\n"
+ ".inst 0xc12d12b0 // bfdot za.s[x8, 0], { z21.h-z22.h }, z13.h\n"
+ ".inst 0x648aab54 // bfcvtnt z20.h, p2/M, z26.s\n"
+ ".inst 0xc12512b1 // bfdot za.s[x8, 1], { z21.h-z22.h }, z5.h\n"
+ ".inst 0x658aab35 // bfcvt z21.h, p2/M, z25.s\n"
+ ".inst 0xa1412be5 // ld1h { z5.h, z13.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
+ ".inst 0xc12912d0 // bfdot za.s[x8, 0], { z22.h-z23.h }, z9.h\n"
+ ".inst 0x648aab15 // bfcvtnt z21.h, p2/M, z24.s\n"
+ ".inst 0xc12112d1 // bfdot za.s[x8, 1], { z22.h-z23.h }, z1.h\n"
+ ".inst 0x658aaa76 // bfcvt z22.h, p2/M, z19.s\n"
+ ".inst 0x658aaa37 // bfcvt z23.h, p2/M, z17.s\n"
+ "add x8, x8, #0x2\n"
+ ".inst 0xc0066808 // mova { z8.d-z9.d }, za.d[x11, #0]\n"
+ ".inst 0xa1422be4 // ld1h { z4.h, z12.h }, pn10.b/Z, [SP, #0x4, MUL VL]\n"
+ ".inst 0x648aaa56 // bfcvtnt z22.h, p2/M, z18.s\n"
+ ".inst 0xc006682a // mova { z10.d-z11.d }, za.d[x11, #1]\n"
+ ".inst 0xc1bccba8 // fclamp { z8.s-z11.s }, z29.s, z28.s\n"
+ "st1w { z8.s }, p1, [x14]\n"
+ "add x14, x14, x5, LSL #2\n"
+ "st1w { z10.s }, p1, [x13]\n"
+ "add x13, x13, x10, LSL #2\n"
+ "add x11, x11, #0x2\n"
+ ".inst 0xc0040bc0 // mova za.d[x8, #0], { z30.d-z31.d }\n"
+ "st1w { z9.s }, p1, [x9]\n"
+ "add x9, x9, x27, LSL #2\n"
+ ".inst 0xc0040bc1 // mova za.d[x8, #1], { z30.d-z31.d }\n"
+ ".inst 0x648aaa17 // bfcvtnt z23.h, p2/M, z16.s\n"
+ "st1w { z11.s }, p1, [x28]\n"
+ "add x28, x28, x26, LSL #2\n"
+ "bgt 11b\n"
+ "b 19f\n"
+ "12:" // Padded
+ "cbz x22, 17f\n"
+ "cmp x22, #0x1\n"
+ "sub x25, x25, x22\n"
+ "beq 16f\n"
+ "cmp x22, #0x2\n"
+ "beq 15f\n"
+ "cmp x22, #0x3\n"
+ "beq 14f\n"
+ "13:" // Padded: 4 priming loads
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z16.s }, p0/Z, [x16]\n"
+ ".inst 0x658aaa06 // bfcvt z6.h, p2/M, z16.s\n"
+ "add x21, x16, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z16.s }, p0/Z, [x21]\n"
+ ".inst 0x648aaa06 // bfcvtnt z6.h, p2/M, z16.s\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z16.s }, p0/Z, [x21]\n"
+ ".inst 0x658aaa07 // bfcvt z7.h, p2/M, z16.s\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1w { z16.s }, p0/Z, [x21]\n"
+ ".inst 0x648aaa07 // bfcvtnt z7.h, p2/M, z16.s\n"
+ "mov x12, #0x4\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z16.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aaa08 // bfcvt z8.h, p2/M, z16.s\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "addvl x20, SP, #24\n"
+ "ld1w { z16.s }, p0/Z, [x21]\n"
+ ".inst 0x648aaa08 // bfcvtnt z8.h, p2/M, z16.s\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ ".inst 0xa0402a8e // ld1h { z14.h-z15.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc12f70d0 // bfdot za.s[x11, 0], { z6.h-z7.h }, z15.h\n"
+ "ld1w { z9.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ ".inst 0x658aa929 // bfcvt z9.h, p2/M, z9.s\n"
+ ".inst 0xc12e70d1 // bfdot za.s[x11, 1], { z6.h-z7.h }, z14.h\n"
+ ".inst 0xa0412a8e // ld1h { z14.h-z15.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ "add x16, x16, %x[ld_in_col], LSL #2\n"
+ "ld1w { z16.s }, p0/Z, [x21]\n"
+ ".inst 0x648aaa09 // bfcvtnt z9.h, p2/M, z16.s\n"
+ ".inst 0xc12f70f0 // bfdot za.s[x11, 0], { z7.h-z8.h }, z15.h\n"
+ ".inst 0xc12e70f1 // bfdot za.s[x11, 1], { z7.h-z8.h }, z14.h\n"
+ ".inst 0xa0422a82 // ld1h { z2.h-z3.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xc1237110 // bfdot za.s[x11, 0], { z8.h-z9.h }, z3.h\n"
+ ".inst 0xc1227111 // bfdot za.s[x11, 1], { z8.h-z9.h }, z2.h\n"
+ "14:" // Padded: 3 priming loads
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z16.s }, p0/Z, [x16]\n"
+ ".inst 0x658aaa09 // bfcvt z9.h, p2/M, z16.s\n"
+ "add x22, x16, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z16.s }, p0/Z, [x22]\n"
+ ".inst 0x648aaa09 // bfcvtnt z9.h, p2/M, z16.s\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z16.s }, p0/Z, [x22]\n"
+ ".inst 0x658aaa0a // bfcvt z10.h, p2/M, z16.s\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1w { z16.s }, p0/Z, [x22]\n"
+ ".inst 0x648aaa0a // bfcvtnt z10.h, p2/M, z16.s\n"
+ "mov x12, #0x4\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z16.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aaa0b // bfcvt z11.h, p2/M, z16.s\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "addvl x21, SP, #18\n"
+ "ld1w { z16.s }, p0/Z, [x22]\n"
+ ".inst 0x648aaa0b // bfcvtnt z11.h, p2/M, z16.s\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ ".inst 0xa1402aa7 // ld1h { z7.h, z15.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc12f7130 // bfdot za.s[x11, 0], { z9.h-z10.h }, z15.h\n"
+ "ld1w { z16.s }, p0/Z, [x22]\n"
+ "addvl x20, SP, #24\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aaa0c // bfcvt z12.h, p2/M, z16.s\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ ".inst 0xc1277131 // bfdot za.s[x11, 1], { z9.h-z10.h }, z7.h\n"
+ ".inst 0xa1402a86 // ld1h { z6.h, z14.h }, pn10.b/Z, [x20]\n"
+ "add x16, x16, %x[ld_in_col], LSL #2\n"
+ ".inst 0xa1412aa7 // ld1h { z7.h, z15.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc12e7132 // bfdot za.s[x11, 2], { z9.h-z10.h }, z14.h\n"
+ "ld1w { z16.s }, p0/Z, [x22]\n"
+ ".inst 0xc1267133 // bfdot za.s[x11, 3], { z9.h-z10.h }, z6.h\n"
+ ".inst 0x648aaa0c // bfcvtnt z12.h, p2/M, z16.s\n"
+ ".inst 0xc12f7150 // bfdot za.s[x11, 0], { z10.h-z11.h }, z15.h\n"
+ ".inst 0xa1422aa5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
+ ".inst 0xc1277151 // bfdot za.s[x11, 1], { z10.h-z11.h }, z7.h\n"
+ ".inst 0xa0412a8e // ld1h { z14.h-z15.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc12f7152 // bfdot za.s[x11, 2], { z10.h-z11.h }, z15.h\n"
+ ".inst 0xc12e7153 // bfdot za.s[x11, 3], { z10.h-z11.h }, z14.h\n"
+ ".inst 0xc12d7170 // bfdot za.s[x11, 0], { z11.h-z12.h }, z13.h\n"
+ ".inst 0xc1257171 // bfdot za.s[x11, 1], { z11.h-z12.h }, z5.h\n"
+ ".inst 0xa0422a8e // ld1h { z14.h-z15.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xc12f7172 // bfdot za.s[x11, 2], { z11.h-z12.h }, z15.h\n"
+ ".inst 0xc12e7173 // bfdot za.s[x11, 3], { z11.h-z12.h }, z14.h\n"
+ "15:" // Padded: 2 priming loads
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z16.s }, p0/Z, [x16]\n"
+ ".inst 0x658aaa12 // bfcvt z18.h, p2/M, z16.s\n"
+ "add x23, x16, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z16.s }, p0/Z, [x23]\n"
+ ".inst 0x648aaa12 // bfcvtnt z18.h, p2/M, z16.s\n"
+ "add x23, x23, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z16.s }, p0/Z, [x23]\n"
+ ".inst 0x658aaa13 // bfcvt z19.h, p2/M, z16.s\n"
+ "add x23, x23, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1w { z16.s }, p0/Z, [x23]\n"
+ ".inst 0x648aaa13 // bfcvtnt z19.h, p2/M, z16.s\n"
+ "mov x12, #0x4\n"
+ "add x23, x23, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z16.s }, p0/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aaa14 // bfcvt z20.h, p2/M, z16.s\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "addvl x22, SP, #12\n"
+ "ld1w { z16.s }, p0/Z, [x23]\n"
+ ".inst 0x648aaa14 // bfcvtnt z20.h, p2/M, z16.s\n"
+ "add x23, x23, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ ".inst 0xa1402ac1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x22]\n"
+ ".inst 0xc1297250 // bfdot za.s[x11, 0], { z18.h-z19.h }, z9.h\n"
+ "ld1w { z26.s }, p0/Z, [x23]\n"
+ "addvl x21, SP, #18\n"
+ "add x23, x23, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aab55 // bfcvt z21.h, p2/M, z26.s\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ ".inst 0xc1217251 // bfdot za.s[x11, 1], { z18.h-z19.h }, z1.h\n"
+ ".inst 0xa1402aa6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x21]\n"
+ "addvl x20, SP, #24\n"
+ ".inst 0xa1412ac7 // ld1h { z7.h, z15.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
+ ".inst 0xc12e7252 // bfdot za.s[x11, 2], { z18.h-z19.h }, z14.h\n"
+ "add x16, x16, %x[ld_in_col], LSL #2\n"
+ "ld1w { z16.s }, p0/Z, [x23]\n"
+ ".inst 0xc1267253 // bfdot za.s[x11, 3], { z18.h-z19.h }, z6.h\n"
+ ".inst 0x648aaa15 // bfcvtnt z21.h, p2/M, z16.s\n"
+ ".inst 0xa1402a85 // ld1h { z5.h, z13.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc12f7270 // bfdot za.s[x11, 0], { z19.h-z20.h }, z15.h\n"
+ ".inst 0xc1277271 // bfdot za.s[x11, 1], { z19.h-z20.h }, z7.h\n"
+ ".inst 0xa1412aa6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xa1422ac7 // ld1h { z7.h, z15.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
+ ".inst 0xc12d7254 // bfdot za.s[x11, 4], { z18.h-z19.h }, z13.h\n"
+ ".inst 0xc1257255 // bfdot za.s[x11, 5], { z18.h-z19.h }, z5.h\n"
+ ".inst 0xc12e7272 // bfdot za.s[x11, 2], { z19.h-z20.h }, z14.h\n"
+ ".inst 0xc1267273 // bfdot za.s[x11, 3], { z19.h-z20.h }, z6.h\n"
+ ".inst 0xa1412a85 // ld1h { z5.h, z13.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc12f7290 // bfdot za.s[x11, 0], { z20.h-z21.h }, z15.h\n"
+ ".inst 0xc1277291 // bfdot za.s[x11, 1], { z20.h-z21.h }, z7.h\n"
+ ".inst 0xa0422aae // ld1h { z14.h-z15.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
+ ".inst 0xc12d7274 // bfdot za.s[x11, 4], { z19.h-z20.h }, z13.h\n"
+ ".inst 0xc1257275 // bfdot za.s[x11, 5], { z19.h-z20.h }, z5.h\n"
+ ".inst 0xc12f7292 // bfdot za.s[x11, 2], { z20.h-z21.h }, z15.h\n"
+ ".inst 0xc12e7293 // bfdot za.s[x11, 3], { z20.h-z21.h }, z14.h\n"
+ ".inst 0xa0422a82 // ld1h { z2.h-z3.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xc1237294 // bfdot za.s[x11, 4], { z20.h-z21.h }, z3.h\n"
+ ".inst 0xc1227295 // bfdot za.s[x11, 5], { z20.h-z21.h }, z2.h\n"
+ "16:" // Padded: 1 priming loads
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z16.s }, p0/Z, [x16]\n"
+ ".inst 0x658aaa09 // bfcvt z9.h, p2/M, z16.s\n"
+ "add x24, x16, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z16.s }, p0/Z, [x24]\n"
+ ".inst 0x648aaa09 // bfcvtnt z9.h, p2/M, z16.s\n"
+ "add x24, x24, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z16.s }, p0/Z, [x24]\n"
+ ".inst 0x658aaa0a // bfcvt z10.h, p2/M, z16.s\n"
+ "add x24, x24, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1w { z16.s }, p0/Z, [x24]\n"
+ ".inst 0x648aaa0a // bfcvtnt z10.h, p2/M, z16.s\n"
+ "mov x12, #0x4\n"
+ "add x24, x24, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z16.s }, p0/Z, [x24]\n"
+ "add x24, x24, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aaa0b // bfcvt z11.h, p2/M, z16.s\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "addvl x23, SP, #6\n"
+ "ld1w { z16.s }, p0/Z, [x24]\n"
+ ".inst 0x648aaa0b // bfcvtnt z11.h, p2/M, z16.s\n"
+ "add x24, x24, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ ".inst 0xa1402ae7 // ld1h { z7.h, z15.h }, pn10.b/Z, [x23]\n"
+ ".inst 0xc12f7130 // bfdot za.s[x11, 0], { z9.h-z10.h }, z15.h\n"
+ "ld1w { z16.s }, p0/Z, [x24]\n"
+ "addvl x22, SP, #12\n"
+ "add x24, x24, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aaa0c // bfcvt z12.h, p2/M, z16.s\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ ".inst 0xc1277131 // bfdot za.s[x11, 1], { z9.h-z10.h }, z7.h\n"
+ ".inst 0xa1402ac6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x22]\n"
+ "addvl x21, SP, #18\n"
+ ".inst 0xa1412ae7 // ld1h { z7.h, z15.h }, pn10.b/Z, [x23, #0x2, MUL VL]\n"
+ ".inst 0xc12e7132 // bfdot za.s[x11, 2], { z9.h-z10.h }, z14.h\n"
+ "addvl x20, SP, #24\n"
+ "add x16, x16, %x[ld_in_col], LSL #2\n"
+ "ld1w { z16.s }, p0/Z, [x24]\n"
+ ".inst 0xc1267133 // bfdot za.s[x11, 3], { z9.h-z10.h }, z6.h\n"
+ ".inst 0x648aaa0c // bfcvtnt z12.h, p2/M, z16.s\n"
+ ".inst 0xa1402aa5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc12f7150 // bfdot za.s[x11, 0], { z10.h-z11.h }, z15.h\n"
+ ".inst 0xc1277151 // bfdot za.s[x11, 1], { z10.h-z11.h }, z7.h\n"
+ ".inst 0xa1412ac7 // ld1h { z7.h, z15.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
+ ".inst 0xa1422ae6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x23, #0x4, MUL VL]\n"
+ ".inst 0xc12d7134 // bfdot za.s[x11, 4], { z9.h-z10.h }, z13.h\n"
+ ".inst 0xc1257135 // bfdot za.s[x11, 5], { z9.h-z10.h }, z5.h\n"
+ ".inst 0xa1402a85 // ld1h { z5.h, z13.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc12f7152 // bfdot za.s[x11, 2], { z10.h-z11.h }, z15.h\n"
+ ".inst 0xc1277153 // bfdot za.s[x11, 3], { z10.h-z11.h }, z7.h\n"
+ ".inst 0xa1412aa7 // ld1h { z7.h, z15.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc12e7170 // bfdot za.s[x11, 0], { z11.h-z12.h }, z14.h\n"
+ ".inst 0xc1267171 // bfdot za.s[x11, 1], { z11.h-z12.h }, z6.h\n"
+ ".inst 0xa1422ac6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
+ ".inst 0xc12d7136 // bfdot za.s[x11, 6], { z9.h-z10.h }, z13.h\n"
+ ".inst 0xc1257137 // bfdot za.s[x11, 7], { z9.h-z10.h }, z5.h\n"
+ ".inst 0xc12f7154 // bfdot za.s[x11, 4], { z10.h-z11.h }, z15.h\n"
+ ".inst 0xc1277155 // bfdot za.s[x11, 5], { z10.h-z11.h }, z7.h\n"
+ ".inst 0xa1412a87 // ld1h { z7.h, z15.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc12e7172 // bfdot za.s[x11, 2], { z11.h-z12.h }, z14.h\n"
+ ".inst 0xc1267173 // bfdot za.s[x11, 3], { z11.h-z12.h }, z6.h\n"
+ ".inst 0xa1422aa1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
+ ".inst 0xc12f7156 // bfdot za.s[x11, 6], { z10.h-z11.h }, z15.h\n"
+ ".inst 0xc1277157 // bfdot za.s[x11, 7], { z10.h-z11.h }, z7.h\n"
+ ".inst 0xc1297174 // bfdot za.s[x11, 4], { z11.h-z12.h }, z9.h\n"
+ ".inst 0xc1217175 // bfdot za.s[x11, 5], { z11.h-z12.h }, z1.h\n"
+ ".inst 0xa0422a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xc1217176 // bfdot za.s[x11, 6], { z11.h-z12.h }, z1.h\n"
+ ".inst 0xc1207177 // bfdot za.s[x11, 7], { z11.h-z12.h }, z0.h\n"
+ "17:" // Padded: 0 priming loads
+ ".inst 0xa1402be6 // ld1h { z6.h, z14.h }, pn10.b/Z, [SP]\n"
+ ".inst 0xa1412be5 // ld1h { z5.h, z13.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
+ ".inst 0xa1422be4 // ld1h { z4.h, z12.h }, pn10.b/Z, [SP, #0x4, MUL VL]\n"
+ "cbz x25, 20f\n"
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z16.s }, p0/Z, [x16]\n"
+ ".inst 0x658aaa14 // bfcvt z20.h, p2/M, z16.s\n"
+ "add x20, x16, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
+ ".inst 0x648aaa14 // bfcvtnt z20.h, p2/M, z16.s\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
+ ".inst 0x658aaa15 // bfcvt z21.h, p2/M, z16.s\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
+ ".inst 0x648aaa15 // bfcvtnt z21.h, p2/M, z16.s\n"
+ "mov x12, #0x4\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aaa16 // bfcvt z22.h, p2/M, z16.s\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aaa16 // bfcvtnt z22.h, p2/M, z16.s\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aaa17 // bfcvt z23.h, p2/M, z16.s\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
+ "sub x25, x25, #0x1\n"
+ ".inst 0x648aaa17 // bfcvtnt z23.h, p2/M, z16.s\n"
+ "sub x15, x15, #0x1\n"
+ "cmp x25, x15\n"
+ "csel x25, x25, x15, LT\n"
+ "add x16, x16, %x[ld_in_col], LSL #2\n"
+ "sub x15, x15, x25\n"
+ "cbz x25, 19f\n"
+ "18:" // Padded: Main loop
+ "addvl x24, SP, #6\n"
+ ".inst 0xc12e7290 // bfdot za.s[x11, 0], { z20.h-z21.h }, z14.h\n"
+ "addvl x23, SP, #12\n"
+ ".inst 0xc1267291 // bfdot za.s[x11, 1], { z20.h-z21.h }, z6.h\n"
+ ".inst 0xa0402b02 // ld1h { z2.h-z3.h }, pn10.b/Z, [x24]\n"
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ ".inst 0xc1237292 // bfdot za.s[x11, 2], { z20.h-z21.h }, z3.h\n"
+ "ld1w { z16.s }, p0/Z, [x16]\n"
+ "add x22, x16, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ ".inst 0xc1227293 // bfdot za.s[x11, 3], { z20.h-z21.h }, z2.h\n"
+ ".inst 0xa1402ae6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x23]\n"
+ "addvl x21, SP, #18\n"
+ "addvl x20, SP, #24\n"
+ ".inst 0xc12d72b0 // bfdot za.s[x11, 0], { z21.h-z22.h }, z13.h\n"
+ "ld1w { z19.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ ".inst 0xc12572b1 // bfdot za.s[x11, 1], { z21.h-z22.h }, z5.h\n"
+ ".inst 0xa1412b07 // ld1h { z7.h, z15.h }, pn10.b/Z, [x24, #0x2, MUL VL]\n"
+ "subs x25, x25, #0x1\n"
+ "add x16, x16, %x[ld_in_col], LSL #2\n"
+ ".inst 0xc12e7294 // bfdot za.s[x11, 4], { z20.h-z21.h }, z14.h\n"
+ "ld1w { z17.s }, p0/Z, [x22]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "mov x12, #0x4\n"
+ ".inst 0xc1267295 // bfdot za.s[x11, 5], { z20.h-z21.h }, z6.h\n"
+ ".inst 0xa1402aa6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x21]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc12f72b2 // bfdot za.s[x11, 2], { z21.h-z22.h }, z15.h\n"
+ "ld1w { z27.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ ".inst 0xc12772b3 // bfdot za.s[x11, 3], { z21.h-z22.h }, z7.h\n"
+ ".inst 0xa1412ae7 // ld1h { z7.h, z15.h }, pn10.b/Z, [x23, #0x2, MUL VL]\n"
+ ".inst 0xc12c72d0 // bfdot za.s[x11, 0], { z22.h-z23.h }, z12.h\n"
+ "ld1w { z10.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ ".inst 0xc12472d1 // bfdot za.s[x11, 1], { z22.h-z23.h }, z4.h\n"
+ ".inst 0xa1422b04 // ld1h { z4.h, z12.h }, pn10.b/Z, [x24, #0x4, MUL VL]\n"
+ ".inst 0xc12e7296 // bfdot za.s[x11, 6], { z20.h-z21.h }, z14.h\n"
+ "ld1w { z8.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ ".inst 0xc1267297 // bfdot za.s[x11, 7], { z20.h-z21.h }, z6.h\n"
+ ".inst 0xa1402a85 // ld1h { z5.h, z13.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc12f72b4 // bfdot za.s[x11, 4], { z21.h-z22.h }, z15.h\n"
+ "ld1w { z11.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ ".inst 0xc12772b5 // bfdot za.s[x11, 5], { z21.h-z22.h }, z7.h\n"
+ ".inst 0xa0412aae // ld1h { z14.h-z15.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc12c72d2 // bfdot za.s[x11, 2], { z22.h-z23.h }, z12.h\n"
+ "ld1w { z18.s }, p0/Z, [x22]\n"
+ ".inst 0xc12472d3 // bfdot za.s[x11, 3], { z22.h-z23.h }, z4.h\n"
+ ".inst 0xa1422ae4 // ld1h { z4.h, z12.h }, pn10.b/Z, [x23, #0x4, MUL VL]\n"
+ ".inst 0xc12f72b6 // bfdot za.s[x11, 6], { z21.h-z22.h }, z15.h\n"
+ ".inst 0xc12e72b7 // bfdot za.s[x11, 7], { z21.h-z22.h }, z14.h\n"
+ ".inst 0xa1412a87 // ld1h { z7.h, z15.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc12c72d4 // bfdot za.s[x11, 4], { z22.h-z23.h }, z12.h\n"
+ ".inst 0xc12472d5 // bfdot za.s[x11, 5], { z22.h-z23.h }, z4.h\n"
+ ".inst 0xa0422aa0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
+ ".inst 0xc12172d6 // bfdot za.s[x11, 6], { z22.h-z23.h }, z1.h\n"
+ ".inst 0xc12072d7 // bfdot za.s[x11, 7], { z22.h-z23.h }, z0.h\n"
+ ".inst 0xa0422a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xc12d1290 // bfdot za.s[x8, 0], { z20.h-z21.h }, z13.h\n"
+ ".inst 0xc1251291 // bfdot za.s[x8, 1], { z20.h-z21.h }, z5.h\n"
+ ".inst 0x658aaa14 // bfcvt z20.h, p2/M, z16.s\n"
+ ".inst 0xa1402be6 // ld1h { z6.h, z14.h }, pn10.b/Z, [SP]\n"
+ ".inst 0xc12f12b0 // bfdot za.s[x8, 0], { z21.h-z22.h }, z15.h\n"
+ ".inst 0x648aaa74 // bfcvtnt z20.h, p2/M, z19.s\n"
+ ".inst 0xc12712b1 // bfdot za.s[x8, 1], { z21.h-z22.h }, z7.h\n"
+ ".inst 0x658aaa35 // bfcvt z21.h, p2/M, z17.s\n"
+ ".inst 0xa1412be5 // ld1h { z5.h, z13.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
+ ".inst 0xc12112d0 // bfdot za.s[x8, 0], { z22.h-z23.h }, z1.h\n"
+ ".inst 0x648aab75 // bfcvtnt z21.h, p2/M, z27.s\n"
+ ".inst 0xc12012d1 // bfdot za.s[x8, 1], { z22.h-z23.h }, z0.h\n"
+ ".inst 0x658aa956 // bfcvt z22.h, p2/M, z10.s\n"
+ ".inst 0x658aa977 // bfcvt z23.h, p2/M, z11.s\n"
+ "add x8, x8, #0x2\n"
+ ".inst 0xc0066800 // mova { z0.d-z1.d }, za.d[x11, #0]\n"
+ ".inst 0xa1422be4 // ld1h { z4.h, z12.h }, pn10.b/Z, [SP, #0x4, MUL VL]\n"
+ ".inst 0x648aa916 // bfcvtnt z22.h, p2/M, z8.s\n"
+ ".inst 0xc0066822 // mova { z2.d-z3.d }, za.d[x11, #1]\n"
+ ".inst 0xc1bccba0 // fclamp { z0.s-z3.s }, z29.s, z28.s\n"
+ "st1w { z0.s }, p1, [x14]\n"
+ "add x14, x14, x5, LSL #2\n"
+ "st1w { z2.s }, p1, [x13]\n"
+ "add x13, x13, x10, LSL #2\n"
+ "add x11, x11, #0x2\n"
+ ".inst 0xc0040bc0 // mova za.d[x8, #0], { z30.d-z31.d }\n"
+ "st1w { z1.s }, p1, [x9]\n"
+ "add x9, x9, x27, LSL #2\n"
+ ".inst 0xc0040bc1 // mova za.d[x8, #1], { z30.d-z31.d }\n"
+ ".inst 0x648aaa57 // bfcvtnt z23.h, p2/M, z18.s\n"
+ "st1w { z3.s }, p1, [x28]\n"
+ "add x28, x28, x26, LSL #2\n"
+ "bgt 18b\n"
+ "19:" // Main loop tail
+ "addvl x23, SP, #6\n"
+ ".inst 0xc12e7290 // bfdot za.s[x11, 0], { z20.h-z21.h }, z14.h\n"
+ "addvl x22, SP, #12\n"
+ ".inst 0xc1267291 // bfdot za.s[x11, 1], { z20.h-z21.h }, z6.h\n"
+ ".inst 0xa0402ae0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x23]\n"
+ "addvl x21, SP, #18\n"
+ "addvl x20, SP, #24\n"
+ ".inst 0xc1217292 // bfdot za.s[x11, 2], { z20.h-z21.h }, z1.h\n"
+ ".inst 0xc1207293 // bfdot za.s[x11, 3], { z20.h-z21.h }, z0.h\n"
+ ".inst 0xa1402ac6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x22]\n"
+ ".inst 0xc12d72b0 // bfdot za.s[x11, 0], { z21.h-z22.h }, z13.h\n"
+ ".inst 0xc12572b1 // bfdot za.s[x11, 1], { z21.h-z22.h }, z5.h\n"
+ ".inst 0xa1412ae7 // ld1h { z7.h, z15.h }, pn10.b/Z, [x23, #0x2, MUL VL]\n"
+ ".inst 0xc12e7294 // bfdot za.s[x11, 4], { z20.h-z21.h }, z14.h\n"
+ ".inst 0xc1267295 // bfdot za.s[x11, 5], { z20.h-z21.h }, z6.h\n"
+ ".inst 0xa1402aa5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc12f72b2 // bfdot za.s[x11, 2], { z21.h-z22.h }, z15.h\n"
+ ".inst 0xc12772b3 // bfdot za.s[x11, 3], { z21.h-z22.h }, z7.h\n"
+ ".inst 0xa1412ac7 // ld1h { z7.h, z15.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
+ ".inst 0xc12c72d0 // bfdot za.s[x11, 0], { z22.h-z23.h }, z12.h\n"
+ ".inst 0xc12472d1 // bfdot za.s[x11, 1], { z22.h-z23.h }, z4.h\n"
+ ".inst 0xa1422ae6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x23, #0x4, MUL VL]\n"
+ ".inst 0xc12d7296 // bfdot za.s[x11, 6], { z20.h-z21.h }, z13.h\n"
+ ".inst 0xc1257297 // bfdot za.s[x11, 7], { z20.h-z21.h }, z5.h\n"
+ ".inst 0xa1402a85 // ld1h { z5.h, z13.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc12f72b4 // bfdot za.s[x11, 4], { z21.h-z22.h }, z15.h\n"
+ ".inst 0xc12772b5 // bfdot za.s[x11, 5], { z21.h-z22.h }, z7.h\n"
+ ".inst 0xa1412aa7 // ld1h { z7.h, z15.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc12e72d2 // bfdot za.s[x11, 2], { z22.h-z23.h }, z14.h\n"
+ ".inst 0xc12672d3 // bfdot za.s[x11, 3], { z22.h-z23.h }, z6.h\n"
+ ".inst 0xa1422ac6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
+ ".inst 0xc12f72b6 // bfdot za.s[x11, 6], { z21.h-z22.h }, z15.h\n"
+ ".inst 0xc12772b7 // bfdot za.s[x11, 7], { z21.h-z22.h }, z7.h\n"
+ ".inst 0xa1412a87 // ld1h { z7.h, z15.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc12e72d4 // bfdot za.s[x11, 4], { z22.h-z23.h }, z14.h\n"
+ ".inst 0xc12672d5 // bfdot za.s[x11, 5], { z22.h-z23.h }, z6.h\n"
+ ".inst 0xa1422aa3 // ld1h { z3.h, z11.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
+ ".inst 0xc12b72d6 // bfdot za.s[x11, 6], { z22.h-z23.h }, z11.h\n"
+ ".inst 0xc12372d7 // bfdot za.s[x11, 7], { z22.h-z23.h }, z3.h\n"
+ ".inst 0xa0422a82 // ld1h { z2.h-z3.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xc12d1290 // bfdot za.s[x8, 0], { z20.h-z21.h }, z13.h\n"
+ ".inst 0xc1251291 // bfdot za.s[x8, 1], { z20.h-z21.h }, z5.h\n"
+ ".inst 0xc12f12b0 // bfdot za.s[x8, 0], { z21.h-z22.h }, z15.h\n"
+ ".inst 0xc12712b1 // bfdot za.s[x8, 1], { z21.h-z22.h }, z7.h\n"
+ ".inst 0xc12312d0 // bfdot za.s[x8, 0], { z22.h-z23.h }, z3.h\n"
+ ".inst 0xc12212d1 // bfdot za.s[x8, 1], { z22.h-z23.h }, z2.h\n"
+ "add x8, x8, #0x2\n"
+ ".inst 0xc0066814 // mova { z20.d-z21.d }, za.d[x11, #0]\n"
+ ".inst 0xc0066836 // mova { z22.d-z23.d }, za.d[x11, #1]\n"
+ ".inst 0xc1bccbb4 // fclamp { z20.s-z23.s }, z29.s, z28.s\n"
+ "st1w { z20.s }, p1, [x14]\n"
+ "add x14, x14, x5, LSL #2\n"
+ "st1w { z22.s }, p1, [x13]\n"
+ "add x13, x13, x10, LSL #2\n"
+ "add x11, x11, #0x2\n"
+ ".inst 0xc0040bc0 // mova za.d[x8, #0], { z30.d-z31.d }\n"
+ "st1w { z21.s }, p1, [x9]\n"
+ "add x9, x9, x27, LSL #2\n"
+ ".inst 0xc0040bc1 // mova za.d[x8, #1], { z30.d-z31.d }\n"
+ "st1w { z23.s }, p1, [x28]\n"
+ "add x28, x28, x26, LSL #2\n"
+ "20:" // Main loop skip tail
+ "cbz x15, 22f\n"
+ "21:" // Right padding loop
+ ".inst 0xc0066800 // mova { z0.d-z1.d }, za.d[x11, #0]\n"
+ "add x8, x8, #0x2\n"
+ "subs x15, x15, #0x1\n"
+ ".inst 0xc0066822 // mova { z2.d-z3.d }, za.d[x11, #1]\n"
+ ".inst 0xc1bccba0 // fclamp { z0.s-z3.s }, z29.s, z28.s\n"
+ "st1w { z0.s }, p1, [x14]\n"
+ "add x14, x14, x5, LSL #2\n"
+ "st1w { z2.s }, p1, [x13]\n"
+ "add x13, x13, x10, LSL #2\n"
+ "add x11, x11, #0x2\n"
+ ".inst 0xc0040bc0 // mova za.d[x8, #0], { z30.d-z31.d }\n"
+ "st1w { z1.s }, p1, [x9]\n"
+ "add x9, x9, x27, LSL #2\n"
+ ".inst 0xc0040bc1 // mova za.d[x8, #1], { z30.d-z31.d }\n"
+ "st1w { z3.s }, p1, [x28]\n"
+ "add x28, x28, x26, LSL #2\n"
+ "bgt 21b\n"
+ "22:" // End
+ "ldr x20, [%x[args], %[offsetof_Args_weights]]\n"
+ "incb x20, ALL, MUL #16\n"
+ "incb x20, ALL, MUL #9\n"
+ "str x20, [%x[args], %[offsetof_Args_weights]]\n"
+ "ldr x21, [%x[args], %[offsetof_Args_ld_in_vl]]\n"
+ "incw x17\n"
+ "whilelt p1.s, x17, x7\n"
+ "ldr x20, [%x[args], %[offsetof_Args_inptr]]\n"
+ "add x20, x20, x21, LSL #2\n"
+ "str x20, [%x[args], %[offsetof_Args_inptr]]\n"
+ "ldr x25, [%x[args], %[offsetof_Args_outptrs]]\n"
+ "ldr x24, [%x[args], %[offsetof_Args_ld_out_vls]]\n"
+ "ldp x23, x22, [x25, #0x0]\n"
+ "ldp x21, x20, [x24, #0x0]\n"
+ "add x23, x23, x21, LSL #2\n"
+ "add x22, x22, x20, LSL #2\n"
+ "stp x23, x22, [x25, #0x0]\n"
+ "ldp x23, x22, [x25, #0x10]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x23, x23, x21, LSL #2\n"
+ "add x22, x22, x20, LSL #2\n"
+ "stp x23, x22, [x25, #0x10]\n"
+ "b.any 1b\n"
+ "addvl SP, SP, #30\n"
+ ".inst 0xd503467f // SMSTOP\n"
+ :
+ : [args] "r" (&args), [ld_in_col] "r" (ld_in_col), [ld_in_row] "r" (ld_in_row), [offsetof_Args_bias] "I" (offsetof(Args, bias)), [offsetof_Args_clamp_max] "I" (offsetof(Args, clamp_max)), [offsetof_Args_clamp_min] "I" (offsetof(Args, clamp_min)), [offsetof_Args_current_channel] "I" (offsetof(Args, current_channel)), [offsetof_Args_inptr] "I" (offsetof(Args, inptr)), [offsetof_Args_input_cols] "I" (offsetof(Args, input_cols)), [offsetof_Args_ld_in_vl] "I" (offsetof(Args, ld_in_vl)), [offsetof_Args_ld_out_cols] "I" (offsetof(Args, ld_out_cols)), [offsetof_Args_ld_out_vls] "I" (offsetof(Args, ld_out_vls)), [offsetof_Args_n_channels] "I" (offsetof(Args, n_channels)), [offsetof_Args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_Args_output_cols] "I" (offsetof(Args, output_cols)), [offsetof_Args_pad_bottom] "I" (offsetof(Args, pad_bottom)), [offsetof_Args_pad_left] "I" (offsetof(Args, pad_left)), [offsetof_Args_pad_top] "I" (offsetof(Args, pad_top)), [offsetof_Args_weights] "I" (offsetof(Args, weights))
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME2)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32bf16fp32_planar_5x5_s2_4rows_dot_za.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32bf16fp32_planar_5x5_s2_4rows_dot_za.hpp
new file mode 100644
index 0000000000..53e596418b
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32bf16fp32_planar_5x5_s2_4rows_dot_za.hpp
@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_conv/depthwise/depthwise_planar.hpp"
+
+namespace arm_conv {
+namespace depthwise {
+
+// Entry point for the SME2 FP32 planar depthwise convolution kernel:
+// 5x5 filter, stride 2, producing 4 output rows per pass.  Inputs are
+// converted to BF16 on the fly (BFCVT) and accumulated with BFDOT into ZA,
+// with FCLAMP applied before the FP32 results are stored (see the inline-asm
+// definition in the accompanying generic.cpp).
+void sme2_fp32bf16fp32_planar_5x5_s2_4rows_dot_za_impl(
+  const float *inptr,              // base pointer of the input tensor
+  size_t ld_in_row,                // element stride between input rows (scaled by 4 bytes in the kernel)
+  size_t ld_in_col,                // element stride between input columns
+  size_t ld_in_vl,                 // element stride between vector-length channel blocks -- assumed from usage, confirm against caller
+  unsigned int pad_top,            // rows of implicit zero padding above the input
+  unsigned int valid_input_rows,   // number of readable input rows
+  unsigned int pad_left,           // columns of implicit zero padding to the left
+  unsigned int valid_input_cols,   // number of readable input columns
+  const float *weights,            // packed depthwise weights (5x5 per channel, per the kernel name)
+  const float *bias,               // per-channel bias values -- NOTE(review): null handling not visible here, confirm
+  float **outptrs,                 // one output pointer per output row
+  const size_t *outlds,            // per-output-row column strides
+  const size_t *outvllds,          // per-output-row vector-length strides
+  unsigned int output_cols,        // number of output columns to compute
+  unsigned int start_channel,      // index of the first channel handled by this call
+  unsigned int valid_channels,     // total number of channels to process
+  float act_min,                   // lower activation clamp bound (FCLAMP)
+  float act_max                    // upper activation clamp bound (FCLAMP)
+);
+
+// Strategy descriptor exposing the 5x5, stride-2, 4-output-row SME2 planar
+// depthwise kernel to the framework.  Template arguments are the external
+// input/output element types -- both FP32; the BF16 arithmetic is internal
+// to the kernel implementation.
+class sme2_fp32bf16fp32_planar_5x5_s2_4rows_dot_za : public PlanarStrategy<float, float>
+{
+  using Parent = PlanarStrategy<float, float>;
+
+  public:
+  using return_type = float;
+  // Compile-time shape parameters of this kernel variant; forwarded to the
+  // PlanarStrategy base in the constructor below.
+  constexpr static auto output_rows = 4u;
+  constexpr static auto kernel_rows = 5u, kernel_cols = 5u;
+  constexpr static auto stride_rows = 2u, stride_cols = 2u;
+  constexpr static auto vl_type = arm_gemm::VLType::SME;  // kernel requires the SME vector length
+
+  // CPUInfo is accepted for interface uniformity with other strategies but is
+  // not needed by this fixed-configuration kernel.
+  sme2_fp32bf16fp32_planar_5x5_s2_4rows_dot_za(const CPUInfo *)
+  : Parent(kernel_rows, kernel_cols, stride_rows, stride_cols, output_rows, vl_type)
+  {
+  }
+
+  // Returns the function pointer of the generated SME2 kernel implementation.
+  typename Parent::KernelType get_kernel(void) const override
+  {
+    return sme2_fp32bf16fp32_planar_5x5_s2_4rows_dot_za_impl;
+  }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32bf16fp32_planar_5x5_s2_4rows_dot_za/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32bf16fp32_planar_5x5_s2_4rows_dot_za/generic.cpp
new file mode 100644
index 0000000000..3a56e69d26
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32bf16fp32_planar_5x5_s2_4rows_dot_za/generic.cpp
@@ -0,0 +1,1246 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if defined(ARM_COMPUTE_ENABLE_SME2)
+
+#include <algorithm>
+#include <cstddef>
+
+namespace arm_conv {
+namespace depthwise {
+
+void sme2_fp32bf16fp32_planar_5x5_s2_4rows_dot_za_impl(
+ const float *inptr,
+ size_t ld_in_row,
+ size_t ld_in_col,
+ size_t ld_in_vl,
+ unsigned int pad_top,
+ unsigned int valid_input_rows,
+ unsigned int pad_left,
+ unsigned int valid_input_cols,
+ const float *weights,
+ const float *bias,
+ float **outptrs,
+ const size_t *outlds,
+ const size_t *outvllds,
+ unsigned int output_cols,
+ unsigned int start_channel,
+ unsigned int valid_channels,
+ float act_min,
+ float act_max
+)
+{
+ struct Args
+ {
+ const float *inptr;
+ size_t ld_in_vl;
+ long unsigned int pad_top, pad_bottom, pad_left;
+ const float *weights;
+ const float *bias;
+ long unsigned int input_cols, output_cols;
+ float **outptrs;
+ const size_t *ld_out_cols;
+ const size_t *ld_out_vls;
+ long unsigned int current_channel, n_channels;
+ float clamp_min, clamp_max;
+ };
+
+ Args args = { inptr, ld_in_vl, pad_top, 11u - std::min(11u, pad_top + valid_input_rows), pad_left, weights, bias, valid_input_cols, output_cols, outptrs, outlds, outvllds, start_channel, valid_channels, act_min, act_max };
+
+ __asm__ __volatile__(
+ "ldr x3, [%x[args], %[offsetof_Args_pad_bottom]]\n"
+ "mov x20, #0xb\n"
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "sub x20, x20, x3\n"
+ "ldr x4, [%x[args], %[offsetof_Args_pad_top]]\n"
+ "ptrue p2.b\n"
+ ".inst 0x25207812 // ptrue pn10.b\n"
+ "ld1rw { z13.s }, p2/Z, [%x[args], %[offsetof_Args_clamp_min]]\n"
+ "ldr x5, [%x[args], %[offsetof_Args_n_channels]]\n"
+ "whilelt p1.s, XZR, x5\n"
+ "whilelt p9.s, XZR, x20\n"
+ "ld1rw { z12.s }, p2/Z, [%x[args], %[offsetof_Args_clamp_max]]\n"
+ "whilelt p8.s, XZR, x4\n"
+ "addvl SP, SP, #-15\n"
+ "ldr x6, [%x[args], %[offsetof_Args_current_channel]]\n"
+ "eor p8.b, p2/Z, p8.b, p9.b\n"
+ "1:" // Channel loop
+ "ldr x20, [%x[args], %[offsetof_Args_bias]]\n"
+ "fmov z16.s, #0x0\n"
+ "cbz x20, 2f\n"
+ "ld1w { z16.s }, p1/Z, [x20, x6, LSL #2]\n"
+ "2:" // Load bias: Done
+ "ldr x21, [%x[args], %[offsetof_Args_weights]]\n"
+ "mov x20, x21\n"
+ "ld1w { z31.s }, p2/Z, [x20]\n"
+ "incb x20, ALL, MUL #5\n"
+ "ld1w { z8.s }, p2/Z, [x20]\n"
+ "incb x20, ALL, MUL #5\n"
+ ".inst 0x658aabef // bfcvt z15.h, p2/M, z31.s\n"
+ "incb x21\n"
+ "ld1w { z18.s }, p2/Z, [x20]\n"
+ "incb x20, ALL, MUL #5\n"
+ ".inst 0x658aaa4e // bfcvt z14.h, p2/M, z18.s\n"
+ "addvl x24, SP, #15\n"
+ "ld1w { z17.s }, p2/Z, [x20]\n"
+ "incb x20, ALL, MUL #5\n"
+ ".inst 0x648aa90f // bfcvtnt z15.h, p2/M, z8.s\n"
+ "addvl x24, x24, #-3\n"
+ "ld1w { z18.s }, p2/Z, [x20]\n"
+ "mov x20, x21\n"
+ "st1h { z15.h }, p2, [x24]\n"
+ ".inst 0x648aaa2e // bfcvtnt z14.h, p2/M, z17.s\n"
+ "ld1w { z29.s }, p2/Z, [x20]\n"
+ "incb x20, ALL, MUL #5\n"
+ ".inst 0x658aabb5 // bfcvt z21.h, p2/M, z29.s\n"
+ "incb x21\n"
+ "ld1w { z17.s }, p2/Z, [x20]\n"
+ "incb x20, ALL, MUL #5\n"
+ "st1h { z14.h }, p2, [x24, #1, MUL VL]\n"
+ ".inst 0x658aaa58 // bfcvt z24.h, p2/M, z18.s\n"
+ "ld1w { z26.s }, p2/Z, [x20]\n"
+ "incb x20, ALL, MUL #5\n"
+ ".inst 0x658aab41 // bfcvt z1.h, p2/M, z26.s\n"
+ ".inst 0x648aaa35 // bfcvtnt z21.h, p2/M, z17.s\n"
+ "ld1w { z17.s }, p2/Z, [x20]\n"
+ "incb x20, ALL, MUL #5\n"
+ "st1h { z24.h }, p2, [x24, #2, MUL VL]\n"
+ "addvl x24, x24, #-3\n"
+ "ld1w { z9.s }, p2/Z, [x20]\n"
+ "mov x20, x21\n"
+ "st1h { z21.h }, p2, [x24]\n"
+ ".inst 0x648aaa21 // bfcvtnt z1.h, p2/M, z17.s\n"
+ "ld1w { z3.s }, p2/Z, [x20]\n"
+ "incb x20, ALL, MUL #5\n"
+ "incb x21\n"
+ ".inst 0x658aa864 // bfcvt z4.h, p2/M, z3.s\n"
+ "ld1w { z31.s }, p2/Z, [x20]\n"
+ "incb x20, ALL, MUL #5\n"
+ ".inst 0x658aa92b // bfcvt z11.h, p2/M, z9.s\n"
+ "st1h { z1.h }, p2, [x24, #1, MUL VL]\n"
+ "ld1w { z18.s }, p2/Z, [x20]\n"
+ "incb x20, ALL, MUL #5\n"
+ ".inst 0x658aaa46 // bfcvt z6.h, p2/M, z18.s\n"
+ "st1h { z11.h }, p2, [x24, #2, MUL VL]\n"
+ "ld1w { z5.s }, p2/Z, [x20]\n"
+ "incb x20, ALL, MUL #5\n"
+ "addvl x24, x24, #-3\n"
+ ".inst 0x648aabe4 // bfcvtnt z4.h, p2/M, z31.s\n"
+ "ld1w { z27.s }, p2/Z, [x20]\n"
+ "mov x20, x21\n"
+ "st1h { z4.h }, p2, [x24]\n"
+ ".inst 0x648aa8a6 // bfcvtnt z6.h, p2/M, z5.s\n"
+ "ld1w { z9.s }, p2/Z, [x20]\n"
+ "incb x20, ALL, MUL #5\n"
+ ".inst 0x658aa938 // bfcvt z24.h, p2/M, z9.s\n"
+ "incb x21\n"
+ "ld1w { z17.s }, p2/Z, [x20]\n"
+ "incb x20, ALL, MUL #5\n"
+ ".inst 0x658aab75 // bfcvt z21.h, p2/M, z27.s\n"
+ "st1h { z6.h }, p2, [x24, #1, MUL VL]\n"
+ "ld1w { z31.s }, p2/Z, [x20]\n"
+ "incb x20, ALL, MUL #5\n"
+ ".inst 0x648aaa38 // bfcvtnt z24.h, p2/M, z17.s\n"
+ ".inst 0x658aabf9 // bfcvt z25.h, p2/M, z31.s\n"
+ "ld1w { z18.s }, p2/Z, [x20]\n"
+ "incb x20, ALL, MUL #5\n"
+ "ldr x7, [%x[args], %[offsetof_Args_input_cols]]\n"
+ "st1h { z21.h }, p2, [x24, #2, MUL VL]\n"
+ "ld1w { z11.s }, p2/Z, [x20]\n"
+ "mov x21, x21\n"
+ "addvl x24, x24, #-3\n"
+ "st1h { z24.h }, p2, [x24]\n"
+ "ld1w { z17.s }, p2/Z, [x21]\n"
+ "incb x21, ALL, MUL #5\n"
+ ".inst 0x648aaa59 // bfcvtnt z25.h, p2/M, z18.s\n"
+ "st1h { z25.h }, p2, [x24, #1, MUL VL]\n"
+ "ld1w { z8.s }, p2/Z, [x21]\n"
+ "incb x21, ALL, MUL #5\n"
+ ".inst 0x658aaa29 // bfcvt z9.h, p2/M, z17.s\n"
+ ".inst 0x658aa976 // bfcvt z22.h, p2/M, z11.s\n"
+ "ld1w { z28.s }, p2/Z, [x21]\n"
+ "incb x21, ALL, MUL #5\n"
+ ".inst 0x658aab85 // bfcvt z5.h, p2/M, z28.s\n"
+ "ldr x17, [%x[args], %[offsetof_Args_inptr]]\n"
+ "ld1w { z25.s }, p2/Z, [x21]\n"
+ "incb x21, ALL, MUL #5\n"
+ "sub x20, x7, #0x1\n"
+ "st1h { z22.h }, p2, [x24, #2, MUL VL]\n"
+ "ld1w { z11.s }, p2/Z, [x21]\n"
+ "orr x23, x20, %x[ld_in_col], LSL #18\n"
+ "addvl x24, x24, #-3\n"
+ "mov z17.d, z16.d\n"
+ "orr x23, x5, x23, LSL #20\n"
+ "mov x22, #0xb\n"
+ "mov z18.d, z16.d\n"
+ "mov z19.d, z16.d\n"
+ "add x21, x4, x3\n"
+ "lsl x20, %x[ld_in_row], #0x2\n"
+ ".inst 0x648aa909 // bfcvtnt z9.h, p2/M, z8.s\n"
+ "st1h { z9.h }, p2, [x24]\n"
+ ".inst 0x648aab25 // bfcvtnt z5.h, p2/M, z25.s\n"
+ "st1h { z5.h }, p2, [x24, #1, MUL VL]\n"
+ ".inst 0x658aa97b // bfcvt z27.h, p2/M, z11.s\n"
+ "mov x8, #0x0\n"
+ "st1h { z27.h }, p2, [x24, #2, MUL VL]\n"
+ "ldr x16, [%x[args], %[offsetof_Args_output_cols]]\n"
+ "lsl x23, x23, #0x2\n"
+ "sub x22, x22, x21\n"
+ "madd x20, x20, x4, x17\n"
+ "3:" // Issue prefetches
+ "subs x22, x22, #0x1\n"
+ ".inst 0xf8b74a9c // rprfm pldstrm, x23, [x20]\n"
+ "add x20, x20, %x[ld_in_col], LSL #2\n"
+ "bgt 3b\n"
+ "ldr x23, [%x[args], %[offsetof_Args_outptrs]]\n"
+ "lsl x20, %x[ld_in_row], #0x2\n"
+ "msub x17, x4, x20, x17\n"
+ ".inst 0xc0040e00 // mova za.d[x8, #0], { z16.d-z19.d }\n"
+ "ldr x20, [%x[args], %[offsetof_Args_ld_out_cols]]\n"
+ ".inst 0xc0040e01 // mova za.d[x8, #1], { z16.d-z19.d }\n"
+ "mov x22, #0x4\n"
+ "ldp x15, x14, [x23], #0x10\n"
+ ".inst 0xc0040e02 // mova za.d[x8, #2], { z16.d-z19.d }\n"
+ "ldp x13, x11, [x20], #0x10\n"
+ ".inst 0xc0040e03 // mova za.d[x8, #3], { z16.d-z19.d }\n"
+ "ldr x21, [%x[args], %[offsetof_Args_pad_left]]\n"
+ ".inst 0xc0040e04 // mova za.d[x8, #4], { z16.d-z19.d }\n"
+ "ldp x10, x9, [x23], #0x10\n"
+ "ldp x28, x27, [x20], #0x10\n"
+ "cbz x21, 5f\n"
+ "cmp x21, x22\n"
+ "csel x20, x21, x22, LT\n"
+ "sub x21, x21, x20\n"
+ "sub x22, x22, x20\n"
+ "cbz x21, 5f\n"
+ ".inst 0xc0060c04 // mova { z4.d-z7.d }, za.d[x8, #0]\n"
+ "and x22, x21, #0x1\n"
+ "add x21, x21, #0x1\n"
+ ".inst 0xc1acc9a4 // fclamp { z4.s-z7.s }, z13.s, z12.s\n"
+ "lsr x21, x21, #0x1\n"
+ "sub x16, x16, x21\n"
+ "4:" // Left padding
+ "subs x21, x21, #0x1\n"
+ "st1w { z4.s }, p1, [x15]\n"
+ "add x15, x15, x13, LSL #2\n"
+ "st1w { z5.s }, p1, [x14]\n"
+ "add x14, x14, x11, LSL #2\n"
+ "st1w { z6.s }, p1, [x10]\n"
+ "add x10, x10, x28, LSL #2\n"
+ "st1w { z7.s }, p1, [x9]\n"
+ "add x9, x9, x27, LSL #2\n"
+ "bgt 4b\n"
+ "5:" // Left padding: End
+ "adds XZR, x4, x3\n"
+ "bne 12f\n"
+ "cbz x22, 10f\n"
+ "cmp x22, #0x1\n"
+ "sub x7, x7, x22\n"
+ "beq 9f\n"
+ "cmp x22, #0x2\n"
+ "beq 8f\n"
+ "cmp x22, #0x3\n"
+ "beq 7f\n"
+ "6:" // Unpadded: 4 priming loads
+ "add x21, x17, %x[ld_in_row], LSL #2\n"
+ "ld1w { z0.s }, p1/Z, [x17]\n"
+ ".inst 0x658aa816 // bfcvt z22.h, p2/M, z0.s\n"
+ "addvl x20, SP, #12\n"
+ "ld1w { z9.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aa936 // bfcvtnt z22.h, p2/M, z9.s\n"
+ "add x17, x17, %x[ld_in_col], LSL #2\n"
+ "ld1w { z28.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aab97 // bfcvt z23.h, p2/M, z28.s\n"
+ "ld1w { z20.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aaa97 // bfcvtnt z23.h, p2/M, z20.s\n"
+ "ld1w { z20.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aaa98 // bfcvt z24.h, p2/M, z20.s\n"
+ "ld1w { z29.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aabb8 // bfcvtnt z24.h, p2/M, z29.s\n"
+ "ld1w { z30.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aabd9 // bfcvt z25.h, p2/M, z30.s\n"
+ "ld1w { z9.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aa939 // bfcvtnt z25.h, p2/M, z9.s\n"
+ "ld1w { z26.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aab5a // bfcvt z26.h, p2/M, z26.s\n"
+ ".inst 0xa1402a83 // ld1h { z3.h, z11.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc13312d0 // bfdot za.s[x8, 0], { z22.h-z25.h }, z3.h\n"
+ "ld1w { z9.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aa93a // bfcvtnt z26.h, p2/M, z9.s\n"
+ ".inst 0xc13b12f0 // bfdot za.s[x8, 0], { z23.h-z26.h }, z11.h\n"
+ "ld1w { z9.s }, p1/Z, [x21]\n"
+ ".inst 0x658aa93b // bfcvt z27.h, p2/M, z9.s\n"
+ "ld1h { z9.h }, p2/Z, [x20, #2, MUL VL]\n"
+ ".inst 0xc1391310 // bfdot za.s[x8, 0], { z24.h-z27.h }, z9.h\n"
+ "7:" // Unpadded: 3 priming loads
+ "add x21, x17, %x[ld_in_row], LSL #2\n"
+ "ld1w { z27.s }, p1/Z, [x17]\n"
+ ".inst 0x658aab7d // bfcvt z29.h, p2/M, z27.s\n"
+ "addvl x20, SP, #9\n"
+ "ld1w { z26.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aab5d // bfcvtnt z29.h, p2/M, z26.s\n"
+ "add x17, x17, %x[ld_in_col], LSL #2\n"
+ "ld1w { z9.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aa93e // bfcvt z30.h, p2/M, z9.s\n"
+ "ld1w { z20.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aaa9e // bfcvtnt z30.h, p2/M, z20.s\n"
+ "ld1w { z25.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aab3f // bfcvt z31.h, p2/M, z25.s\n"
+ "ld1w { z26.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aab5f // bfcvtnt z31.h, p2/M, z26.s\n"
+ "ld1w { z27.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aab60 // bfcvt z0.h, p2/M, z27.s\n"
+ "ld1w { z9.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aa920 // bfcvtnt z0.h, p2/M, z9.s\n"
+ "ld1w { z23.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aaae1 // bfcvt z1.h, p2/M, z23.s\n"
+ ".inst 0xa0402a84 // ld1h { z4.h-z5.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc13413b0 // bfdot za.s[x8, 0], { z29.h-z0.h }, z4.h\n"
+ "ld1w { z9.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aa921 // bfcvtnt z1.h, p2/M, z9.s\n"
+ ".inst 0xc13513d0 // bfdot za.s[x8, 0], { z30.h-z1.h }, z5.h\n"
+ "ld1w { z29.s }, p1/Z, [x21]\n"
+ ".inst 0x658aaba2 // bfcvt z2.h, p2/M, z29.s\n"
+ "ld1h { z9.h }, p2/Z, [x20, #2, MUL VL]\n"
+ ".inst 0xc13913f0 // bfdot za.s[x8, 0], { z31.h-z2.h }, z9.h\n"
+ "8:" // Unpadded: 2 priming loads
+ "add x22, x17, %x[ld_in_row], LSL #2\n"
+ "ld1w { z27.s }, p1/Z, [x17]\n"
+ ".inst 0x658aab7a // bfcvt z26.h, p2/M, z27.s\n"
+ "addvl x21, SP, #6\n"
+ "ld1w { z21.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aaaba // bfcvtnt z26.h, p2/M, z21.s\n"
+ "addvl x20, SP, #12\n"
+ "ld1w { z25.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aab3b // bfcvt z27.h, p2/M, z25.s\n"
+ "add x17, x17, %x[ld_in_col], LSL #2\n"
+ "ld1w { z4.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aa89b // bfcvtnt z27.h, p2/M, z4.s\n"
+ "ld1w { z10.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aa95c // bfcvt z28.h, p2/M, z10.s\n"
+ "ld1w { z4.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aa89c // bfcvtnt z28.h, p2/M, z4.s\n"
+ "ld1w { z5.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aa8bd // bfcvt z29.h, p2/M, z5.s\n"
+ "ld1w { z5.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aa8bd // bfcvtnt z29.h, p2/M, z5.s\n"
+ "ld1w { z5.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aa8be // bfcvt z30.h, p2/M, z5.s\n"
+ ".inst 0xa0402aae // ld1h { z14.h-z15.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc13e1350 // bfdot za.s[x8, 0], { z26.h-z29.h }, z14.h\n"
+ "ld1w { z5.s }, p1/Z, [x22]\n"
+ ".inst 0x648aa8be // bfcvtnt z30.h, p2/M, z5.s\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc13f1370 // bfdot za.s[x8, 0], { z27.h-z30.h }, z15.h\n"
+ ".inst 0xa0402a88 // ld1h { z8.h-z9.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc1381351 // bfdot za.s[x8, 1], { z26.h-z29.h }, z8.h\n"
+ "ld1w { z23.s }, p1/Z, [x22]\n"
+ ".inst 0x658aaaff // bfcvt z31.h, p2/M, z23.s\n"
+ ".inst 0xc1391371 // bfdot za.s[x8, 1], { z27.h-z30.h }, z9.h\n"
+ "ld1h { z0.h }, p2/Z, [x21, #2, MUL VL]\n"
+ ".inst 0xc1301390 // bfdot za.s[x8, 0], { z28.h-z31.h }, z0.h\n"
+ "ld1h { z0.h }, p2/Z, [x20, #2, MUL VL]\n"
+ ".inst 0xc1301391 // bfdot za.s[x8, 1], { z28.h-z31.h }, z0.h\n"
+ "9:" // Unpadded: 1 priming loads
+ "add x22, x17, %x[ld_in_row], LSL #2\n"
+ "ld1w { z27.s }, p1/Z, [x17]\n"
+ ".inst 0x658aab77 // bfcvt z23.h, p2/M, z27.s\n"
+ "addvl x21, SP, #3\n"
+ "ld1w { z24.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aab17 // bfcvtnt z23.h, p2/M, z24.s\n"
+ "addvl x20, SP, #9\n"
+ "ld1w { z31.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aabf8 // bfcvt z24.h, p2/M, z31.s\n"
+ "add x17, x17, %x[ld_in_col], LSL #2\n"
+ "ld1w { z6.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aa8d8 // bfcvtnt z24.h, p2/M, z6.s\n"
+ "ld1w { z28.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aab99 // bfcvt z25.h, p2/M, z28.s\n"
+ "ld1w { z26.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aab59 // bfcvtnt z25.h, p2/M, z26.s\n"
+ "ld1w { z28.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aab9a // bfcvt z26.h, p2/M, z28.s\n"
+ "ld1w { z4.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aa89a // bfcvtnt z26.h, p2/M, z4.s\n"
+ "ld1w { z20.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aaa9b // bfcvt z27.h, p2/M, z20.s\n"
+ ".inst 0xa1402aa0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc13012f0 // bfdot za.s[x8, 0], { z23.h-z26.h }, z0.h\n"
+ "ld1w { z20.s }, p1/Z, [x22]\n"
+ ".inst 0x648aaa9b // bfcvtnt z27.h, p2/M, z20.s\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc1381310 // bfdot za.s[x8, 0], { z24.h-z27.h }, z8.h\n"
+ ".inst 0xa0402a82 // ld1h { z2.h-z3.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc13212f1 // bfdot za.s[x8, 1], { z23.h-z26.h }, z2.h\n"
+ "ld1w { z11.s }, p1/Z, [x22]\n"
+ ".inst 0x658aa97c // bfcvt z28.h, p2/M, z11.s\n"
+ ".inst 0xc1331311 // bfdot za.s[x8, 1], { z24.h-z27.h }, z3.h\n"
+ "ld1h { z4.h }, p2/Z, [x21, #2, MUL VL]\n"
+ ".inst 0xc1341330 // bfdot za.s[x8, 0], { z25.h-z28.h }, z4.h\n"
+ "ld1h { z0.h }, p2/Z, [x20, #2, MUL VL]\n"
+ ".inst 0xc1301331 // bfdot za.s[x8, 1], { z25.h-z28.h }, z0.h\n"
+ "10:" // Unpadded: 0 priming loads
+ "cmp x7, #0x2\n"
+ ".inst 0xa1402be3 // ld1h { z3.h, z11.h }, pn10.b/Z, [SP]\n"
+ "ld1h { z7.h }, p2/Z, [SP, #2, MUL VL]\n"
+ "blt 20f\n"
+ "add x21, x17, %x[ld_in_row], LSL #2\n"
+ "ld1w { z27.s }, p1/Z, [x17]\n"
+ ".inst 0x658aab75 // bfcvt z21.h, p2/M, z27.s\n"
+ "sub x7, x7, #0x2\n"
+ "ld1w { z26.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ "sub x16, x16, #0x1\n"
+ ".inst 0x648aab55 // bfcvtnt z21.h, p2/M, z26.s\n"
+ "ld1w { z26.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aab56 // bfcvt z22.h, p2/M, z26.s\n"
+ "lsr x20, x7, #0x1\n"
+ "ld1w { z26.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ "cmp x20, x16\n"
+ ".inst 0x648aab56 // bfcvtnt z22.h, p2/M, z26.s\n"
+ "ld1w { z8.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aa917 // bfcvt z23.h, p2/M, z8.s\n"
+ "csel x26, x20, x16, LT\n"
+ "ld1w { z2.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aa857 // bfcvtnt z23.h, p2/M, z2.s\n"
+ "add x17, x17, %x[ld_in_col], LSL #2\n"
+ "ld1w { z6.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aa8d8 // bfcvt z24.h, p2/M, z6.s\n"
+ "and x7, x7, #0x1\n"
+ "ld1w { z15.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aa9f8 // bfcvtnt z24.h, p2/M, z15.s\n"
+ "sub x16, x16, x26\n"
+ "ld1w { z27.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aab79 // bfcvt z25.h, p2/M, z27.s\n"
+ "ld1w { z26.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aab59 // bfcvtnt z25.h, p2/M, z26.s\n"
+ "ld1w { z27.s }, p1/Z, [x21]\n"
+ ".inst 0x658aab7a // bfcvt z26.h, p2/M, z27.s\n"
+ "cbz x26, 19f\n"
+ "11:" // Unpadded: Main loop
+ ".inst 0xc13312b0 // bfdot za.s[x8, 0], { z21.h-z24.h }, z3.h\n"
+ "addvl x25, SP, #6\n"
+ "addvl x24, SP, #12\n"
+ "ld1w { z14.s }, p1/Z, [x17]\n"
+ ".inst 0xc13b12d0 // bfdot za.s[x8, 0], { z22.h-z25.h }, z11.h\n"
+ ".inst 0xa1402b20 // ld1h { z0.h, z8.h }, pn10.b/Z, [x25]\n"
+ "add x23, x17, %x[ld_in_row], LSL #2\n"
+ "addvl x22, SP, #3\n"
+ ".inst 0xc13012b1 // bfdot za.s[x8, 1], { z21.h-z24.h }, z0.h\n"
+ "ld1w { z27.s }, p1/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row], LSL #2\n"
+ "add x17, x17, %x[ld_in_col], LSL #2\n"
+ ".inst 0xc13812d1 // bfdot za.s[x8, 1], { z22.h-z25.h }, z8.h\n"
+ ".inst 0xa1402b00 // ld1h { z0.h, z8.h }, pn10.b/Z, [x24]\n"
+ "addvl x21, SP, #9\n"
+ "add x20, x17, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc13012b2 // bfdot za.s[x8, 2], { z21.h-z24.h }, z0.h\n"
+ "ld1w { z2.s }, p1/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aa9d5 // bfcvt z21.h, p2/M, z14.s\n"
+ ".inst 0xc13712f0 // bfdot za.s[x8, 0], { z23.h-z26.h }, z7.h\n"
+ "ld1h { z11.h }, p2/Z, [x25, #2, MUL VL]\n"
+ ".inst 0x648aab75 // bfcvtnt z21.h, p2/M, z27.s\n"
+ "subs x26, x26, #0x1\n"
+ "ld1w { z14.s }, p1/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc13812d2 // bfdot za.s[x8, 2], { z22.h-z25.h }, z8.h\n"
+ ".inst 0x658aa856 // bfcvt z22.h, p2/M, z2.s\n"
+ "ld1w { z7.s }, p1/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc13b12f1 // bfdot za.s[x8, 1], { z23.h-z26.h }, z11.h\n"
+ ".inst 0x648aa9d6 // bfcvtnt z22.h, p2/M, z14.s\n"
+ "ld1w { z31.s }, p1/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc0060c08 // mova { z8.d-z11.d }, za.d[x8, #0]\n"
+ ".inst 0xc1acc9a8 // fclamp { z8.s-z11.s }, z13.s, z12.s\n"
+ "ld1h { z0.h }, p2/Z, [x24, #2, MUL VL]\n"
+ ".inst 0xc13012f2 // bfdot za.s[x8, 2], { z23.h-z26.h }, z0.h\n"
+ ".inst 0x658aa8f7 // bfcvt z23.h, p2/M, z7.s\n"
+ "add x8, x8, #0x1\n"
+ "ld1w { z26.s }, p1/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aab58 // bfcvt z24.h, p2/M, z26.s\n"
+ ".inst 0x648aabf7 // bfcvtnt z23.h, p2/M, z31.s\n"
+ "ld1w { z2.s }, p1/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aa858 // bfcvtnt z24.h, p2/M, z2.s\n"
+ "st1w { z8.s }, p1, [x15]\n"
+ "ld1w { z0.s }, p1/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aa819 // bfcvt z25.h, p2/M, z0.s\n"
+ "add x15, x15, x13, LSL #2\n"
+ ".inst 0xa0402ac2 // ld1h { z2.h-z3.h }, pn10.b/Z, [x22]\n"
+ ".inst 0xc13212b0 // bfdot za.s[x8, 0], { z21.h-z24.h }, z2.h\n"
+ "st1w { z9.s }, p1, [x14]\n"
+ "add x14, x14, x11, LSL #2\n"
+ "ld1w { z26.s }, p1/Z, [x23]\n"
+ ".inst 0x648aab59 // bfcvtnt z25.h, p2/M, z26.s\n"
+ "add x23, x23, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc13312d0 // bfdot za.s[x8, 0], { z22.h-z25.h }, z3.h\n"
+ ".inst 0xa1402aa1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc13112b1 // bfdot za.s[x8, 1], { z21.h-z24.h }, z1.h\n"
+ "st1w { z10.s }, p1, [x10]\n"
+ "add x10, x10, x28, LSL #2\n"
+ "ld1w { z26.s }, p1/Z, [x23]\n"
+ ".inst 0x658aab5a // bfcvt z26.h, p2/M, z26.s\n"
+ ".inst 0xc13912d1 // bfdot za.s[x8, 1], { z22.h-z25.h }, z9.h\n"
+ "ld1w { z31.s }, p1/Z, [x17]\n"
+ ".inst 0x658aabf5 // bfcvt z21.h, p2/M, z31.s\n"
+ "st1w { z11.s }, p1, [x9]\n"
+ "add x9, x9, x27, LSL #2\n"
+ "ld1w { z30.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc0040e04 // mova za.d[x8, #4], { z16.d-z19.d }\n"
+ ".inst 0x648aabd5 // bfcvtnt z21.h, p2/M, z30.s\n"
+ "ld1w { z0.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aa816 // bfcvt z22.h, p2/M, z0.s\n"
+ "add x17, x17, %x[ld_in_col], LSL #2\n"
+ "ld1w { z1.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aa836 // bfcvtnt z22.h, p2/M, z1.s\n"
+ "ld1w { z11.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1h { z2.h }, p2/Z, [x22, #2, MUL VL]\n"
+ ".inst 0xc13212f0 // bfdot za.s[x8, 0], { z23.h-z26.h }, z2.h\n"
+ "ld1w { z28.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z14.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z27.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1h { z4.h }, p2/Z, [x21, #2, MUL VL]\n"
+ ".inst 0xc13412f1 // bfdot za.s[x8, 1], { z23.h-z26.h }, z4.h\n"
+ ".inst 0x658aa977 // bfcvt z23.h, p2/M, z11.s\n"
+ "ld1w { z29.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aa9d8 // bfcvt z24.h, p2/M, z14.s\n"
+ ".inst 0x658aabb9 // bfcvt z25.h, p2/M, z29.s\n"
+ "ld1w { z5.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aab97 // bfcvtnt z23.h, p2/M, z28.s\n"
+ ".inst 0x648aab78 // bfcvtnt z24.h, p2/M, z27.s\n"
+ "ld1w { z11.s }, p1/Z, [x20]\n"
+ ".inst 0x648aa8b9 // bfcvtnt z25.h, p2/M, z5.s\n"
+ ".inst 0x658aa97a // bfcvt z26.h, p2/M, z11.s\n"
+ ".inst 0xa1402be3 // ld1h { z3.h, z11.h }, pn10.b/Z, [SP]\n"
+ "ld1h { z7.h }, p2/Z, [SP, #2, MUL VL]\n"
+ "bgt 11b\n"
+ "b 19f\n"
+ "12:" // Padded
+ "cbz x22, 17f\n"
+ "cmp x22, #0x1\n"
+ "sub x7, x7, x22\n"
+ "beq 16f\n"
+ "cmp x22, #0x2\n"
+ "beq 15f\n"
+ "cmp x22, #0x3\n"
+ "beq 14f\n"
+ "13:" // Padded: 4 priming loads
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z1.s }, p0/Z, [x17]\n"
+ ".inst 0x658aa837 // bfcvt z23.h, p2/M, z1.s\n"
+ "add x21, x17, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z29.s }, p0/Z, [x21]\n"
+ ".inst 0x648aabb7 // bfcvtnt z23.h, p2/M, z29.s\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z30.s }, p0/Z, [x21]\n"
+ ".inst 0x658aabd8 // bfcvt z24.h, p2/M, z30.s\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1w { z15.s }, p0/Z, [x21]\n"
+ ".inst 0x648aa9f8 // bfcvtnt z24.h, p2/M, z15.s\n"
+ "mov x12, #0x4\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z27.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aab79 // bfcvt z25.h, p2/M, z27.s\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z20.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aaa99 // bfcvtnt z25.h, p2/M, z20.s\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z10.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aa95a // bfcvt z26.h, p2/M, z10.s\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "mov x12, #0x8\n"
+ "ld1w { z8.s }, p0/Z, [x21]\n"
+ ".inst 0x648aa91a // bfcvtnt z26.h, p2/M, z8.s\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z28.s }, p0/Z, [x21]\n"
+ ".inst 0x658aab9b // bfcvt z27.h, p2/M, z28.s\n"
+ "addvl x20, SP, #12\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0xa1402a81 // ld1h { z1.h, z9.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc13112f0 // bfdot za.s[x8, 0], { z23.h-z26.h }, z1.h\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z28.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aab9b // bfcvtnt z27.h, p2/M, z28.s\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z0.s }, p0/Z, [x21]\n"
+ ".inst 0x658aa81c // bfcvt z28.h, p2/M, z0.s\n"
+ ".inst 0xc1391310 // bfdot za.s[x8, 0], { z24.h-z27.h }, z9.h\n"
+ "ld1h { z0.h }, p2/Z, [x20, #2, MUL VL]\n"
+ "add x17, x17, %x[ld_in_col], LSL #2\n"
+ ".inst 0xc1301330 // bfdot za.s[x8, 0], { z25.h-z28.h }, z0.h\n"
+ "14:" // Padded: 3 priming loads
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z21.s }, p0/Z, [x17]\n"
+ ".inst 0x658aaab4 // bfcvt z20.h, p2/M, z21.s\n"
+ "add x21, x17, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z27.s }, p0/Z, [x21]\n"
+ ".inst 0x648aab74 // bfcvtnt z20.h, p2/M, z27.s\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z27.s }, p0/Z, [x21]\n"
+ ".inst 0x658aab75 // bfcvt z21.h, p2/M, z27.s\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1w { z27.s }, p0/Z, [x21]\n"
+ ".inst 0x648aab75 // bfcvtnt z21.h, p2/M, z27.s\n"
+ "mov x12, #0x4\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z29.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aabb6 // bfcvt z22.h, p2/M, z29.s\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z27.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aab76 // bfcvtnt z22.h, p2/M, z27.s\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z27.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aab77 // bfcvt z23.h, p2/M, z27.s\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "mov x12, #0x8\n"
+ "ld1w { z8.s }, p0/Z, [x21]\n"
+ ".inst 0x648aa917 // bfcvtnt z23.h, p2/M, z8.s\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z28.s }, p0/Z, [x21]\n"
+ ".inst 0x658aab98 // bfcvt z24.h, p2/M, z28.s\n"
+ "addvl x20, SP, #9\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0xa1402a81 // ld1h { z1.h, z9.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc1311290 // bfdot za.s[x8, 0], { z20.h-z23.h }, z1.h\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z0.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aa818 // bfcvtnt z24.h, p2/M, z0.s\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z1.s }, p0/Z, [x21]\n"
+ ".inst 0x658aa839 // bfcvt z25.h, p2/M, z1.s\n"
+ ".inst 0xc13912b0 // bfdot za.s[x8, 0], { z21.h-z24.h }, z9.h\n"
+ "ld1h { z0.h }, p2/Z, [x20, #2, MUL VL]\n"
+ "add x17, x17, %x[ld_in_col], LSL #2\n"
+ ".inst 0xc13012d0 // bfdot za.s[x8, 0], { z22.h-z25.h }, z0.h\n"
+ "15:" // Padded: 2 priming loads
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z6.s }, p0/Z, [x17]\n"
+ ".inst 0x658aa8da // bfcvt z26.h, p2/M, z6.s\n"
+ "add x22, x17, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z29.s }, p0/Z, [x22]\n"
+ ".inst 0x648aabba // bfcvtnt z26.h, p2/M, z29.s\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z28.s }, p0/Z, [x22]\n"
+ ".inst 0x658aab9b // bfcvt z27.h, p2/M, z28.s\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1w { z14.s }, p0/Z, [x22]\n"
+ ".inst 0x648aa9db // bfcvtnt z27.h, p2/M, z14.s\n"
+ "mov x12, #0x4\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z24.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aab1c // bfcvt z28.h, p2/M, z24.s\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z1.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aa83c // bfcvtnt z28.h, p2/M, z1.s\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z3.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aa87d // bfcvt z29.h, p2/M, z3.s\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "mov x12, #0x8\n"
+ "ld1w { z0.s }, p0/Z, [x22]\n"
+ ".inst 0x648aa81d // bfcvtnt z29.h, p2/M, z0.s\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z24.s }, p0/Z, [x22]\n"
+ ".inst 0x658aab1e // bfcvt z30.h, p2/M, z24.s\n"
+ "addvl x21, SP, #6\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0xa1402aa1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc1311350 // bfdot za.s[x8, 0], { z26.h-z29.h }, z1.h\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z23.s }, p0/Z, [x22]\n"
+ ".inst 0x648aaafe // bfcvtnt z30.h, p2/M, z23.s\n"
+ "addvl x20, SP, #12\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ ".inst 0xc1391370 // bfdot za.s[x8, 0], { z27.h-z30.h }, z9.h\n"
+ ".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
+ "ld1w { z31.s }, p0/Z, [x22]\n"
+ ".inst 0xc1301351 // bfdot za.s[x8, 1], { z26.h-z29.h }, z0.h\n"
+ ".inst 0x658aabff // bfcvt z31.h, p2/M, z31.s\n"
+ "add x17, x17, %x[ld_in_col], LSL #2\n"
+ "ld1h { z0.h }, p2/Z, [x21, #2, MUL VL]\n"
+ ".inst 0xc1311371 // bfdot za.s[x8, 1], { z27.h-z30.h }, z1.h\n"
+ ".inst 0xc1301390 // bfdot za.s[x8, 0], { z28.h-z31.h }, z0.h\n"
+ "ld1h { z0.h }, p2/Z, [x20, #2, MUL VL]\n"
+ ".inst 0xc1301391 // bfdot za.s[x8, 1], { z28.h-z31.h }, z0.h\n"
+ "16:" // Padded: 1 priming loads
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z22.s }, p0/Z, [x17]\n"
+ ".inst 0x658aaad5 // bfcvt z21.h, p2/M, z22.s\n"
+ "add x22, x17, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z3.s }, p0/Z, [x22]\n"
+ ".inst 0x648aa875 // bfcvtnt z21.h, p2/M, z3.s\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z20.s }, p0/Z, [x22]\n"
+ ".inst 0x658aaa96 // bfcvt z22.h, p2/M, z20.s\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1w { z25.s }, p0/Z, [x22]\n"
+ ".inst 0x648aab36 // bfcvtnt z22.h, p2/M, z25.s\n"
+ "mov x12, #0x4\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z24.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aab17 // bfcvt z23.h, p2/M, z24.s\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z0.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aa817 // bfcvtnt z23.h, p2/M, z0.s\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z7.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aa8f8 // bfcvt z24.h, p2/M, z7.s\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "mov x12, #0x8\n"
+ "ld1w { z28.s }, p0/Z, [x22]\n"
+ ".inst 0x648aab98 // bfcvtnt z24.h, p2/M, z28.s\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z6.s }, p0/Z, [x22]\n"
+ ".inst 0x658aa8d9 // bfcvt z25.h, p2/M, z6.s\n"
+ "addvl x21, SP, #3\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0xa1402aa1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc13112b0 // bfdot za.s[x8, 0], { z21.h-z24.h }, z1.h\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z6.s }, p0/Z, [x22]\n"
+ ".inst 0x648aa8d9 // bfcvtnt z25.h, p2/M, z6.s\n"
+ "addvl x20, SP, #9\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ ".inst 0xc13912d0 // bfdot za.s[x8, 0], { z22.h-z25.h }, z9.h\n"
+ ".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
+ "ld1w { z3.s }, p0/Z, [x22]\n"
+ ".inst 0xc13012b1 // bfdot za.s[x8, 1], { z21.h-z24.h }, z0.h\n"
+ ".inst 0x658aa87a // bfcvt z26.h, p2/M, z3.s\n"
+ "add x17, x17, %x[ld_in_col], LSL #2\n"
+ "ld1h { z0.h }, p2/Z, [x21, #2, MUL VL]\n"
+ ".inst 0xc13112d1 // bfdot za.s[x8, 1], { z22.h-z25.h }, z1.h\n"
+ ".inst 0xc13012f0 // bfdot za.s[x8, 0], { z23.h-z26.h }, z0.h\n"
+ "ld1h { z0.h }, p2/Z, [x20, #2, MUL VL]\n"
+ ".inst 0xc13012f1 // bfdot za.s[x8, 1], { z23.h-z26.h }, z0.h\n"
+ "17:" // Padded: 0 priming loads
+ "cmp x7, #0x2\n"
+ ".inst 0xa1402be3 // ld1h { z3.h, z11.h }, pn10.b/Z, [SP]\n"
+ "ld1h { z7.h }, p2/Z, [SP, #2, MUL VL]\n"
+ "blt 20f\n"
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z25.s }, p0/Z, [x17]\n"
+ ".inst 0x658aab35 // bfcvt z21.h, p2/M, z25.s\n"
+ "add x20, x17, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z27.s }, p0/Z, [x20]\n"
+ ".inst 0x648aab75 // bfcvtnt z21.h, p2/M, z27.s\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z27.s }, p0/Z, [x20]\n"
+ ".inst 0x658aab76 // bfcvt z22.h, p2/M, z27.s\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1w { z27.s }, p0/Z, [x20]\n"
+ ".inst 0x648aab76 // bfcvtnt z22.h, p2/M, z27.s\n"
+ "mov x12, #0x4\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z27.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aab77 // bfcvt z23.h, p2/M, z27.s\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z25.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aab37 // bfcvtnt z23.h, p2/M, z25.s\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z26.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aab58 // bfcvt z24.h, p2/M, z26.s\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "mov x12, #0x8\n"
+ "ld1w { z27.s }, p0/Z, [x20]\n"
+ ".inst 0x648aab78 // bfcvtnt z24.h, p2/M, z27.s\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z27.s }, p0/Z, [x20]\n"
+ ".inst 0x658aab79 // bfcvt z25.h, p2/M, z27.s\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z26.s }, p0/Z, [x20]\n"
+ ".inst 0x648aab59 // bfcvtnt z25.h, p2/M, z26.s\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z27.s }, p0/Z, [x20]\n"
+ ".inst 0x658aab7a // bfcvt z26.h, p2/M, z27.s\n"
+ "sub x7, x7, #0x2\n"
+ "sub x16, x16, #0x1\n"
+ "lsr x20, x7, #0x1\n"
+ "cmp x20, x16\n"
+ "csel x24, x20, x16, LT\n"
+ "add x17, x17, %x[ld_in_col], LSL #2\n"
+ "and x7, x7, #0x1\n"
+ "sub x16, x16, x24\n"
+ "cbz x24, 19f\n"
+ "18:" // Padded: Main loop
+ ".inst 0xc13312b0 // bfdot za.s[x8, 0], { z21.h-z24.h }, z3.h\n"
+ "addvl x23, SP, #6\n"
+ "addvl x21, SP, #12\n"
+ ".inst 0xc13b12d0 // bfdot za.s[x8, 0], { z22.h-z25.h }, z11.h\n"
+ ".inst 0xa0402ae0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x23]\n"
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ ".inst 0xc13012b1 // bfdot za.s[x8, 1], { z21.h-z24.h }, z0.h\n"
+ "ld1w { z9.s }, p0/Z, [x17]\n"
+ "add x20, x17, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ ".inst 0xc13112d1 // bfdot za.s[x8, 1], { z22.h-z25.h }, z1.h\n"
+ ".inst 0xa0402aa0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x21]\n"
+ "addvl x22, SP, #3\n"
+ "add x17, x17, %x[ld_in_col], LSL #2\n"
+ ".inst 0xc13012b2 // bfdot za.s[x8, 2], { z21.h-z24.h }, z0.h\n"
+ "ld1w { z14.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z27.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ ".inst 0xc13712f0 // bfdot za.s[x8, 0], { z23.h-z26.h }, z7.h\n"
+ "mov x12, #0x4\n"
+ "ld1h { z0.h }, p2/Z, [x23, #2, MUL VL]\n"
+ ".inst 0xc13112d2 // bfdot za.s[x8, 2], { z22.h-z25.h }, z1.h\n"
+ ".inst 0x658aa921 // bfcvt z1.h, p2/M, z9.s\n"
+ "ld1w { z15.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ ".inst 0xc13012f1 // bfdot za.s[x8, 1], { z23.h-z26.h }, z0.h\n"
+ "ld1w { z9.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ ".inst 0x658aab62 // bfcvt z2.h, p2/M, z27.s\n"
+ "ld1w { z27.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ ".inst 0x648aa9c1 // bfcvtnt z1.h, p2/M, z14.s\n"
+ "ld1h { z0.h }, p2/Z, [x21, #2, MUL VL]\n"
+ ".inst 0xc13012f2 // bfdot za.s[x8, 2], { z23.h-z26.h }, z0.h\n"
+ ".inst 0x658aa923 // bfcvt z3.h, p2/M, z9.s\n"
+ "addvl x21, SP, #9\n"
+ "ld1w { z9.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ ".inst 0x658aa924 // bfcvt z4.h, p2/M, z9.s\n"
+ "mov x12, #0x8\n"
+ "ld1w { z24.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aa9e2 // bfcvtnt z2.h, p2/M, z15.s\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z9.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aab63 // bfcvtnt z3.h, p2/M, z27.s\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ ".inst 0x648aab04 // bfcvtnt z4.h, p2/M, z24.s\n"
+ ".inst 0x658aa925 // bfcvt z5.h, p2/M, z9.s\n"
+ ".inst 0xa1402ac0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x22]\n"
+ "ld1w { z30.s }, p0/Z, [x20]\n"
+ ".inst 0xc0060c18 // mova { z24.d-z27.d }, za.d[x8, #0]\n"
+ "add x8, x8, #0x1\n"
+ ".inst 0x648aabc5 // bfcvtnt z5.h, p2/M, z30.s\n"
+ ".inst 0xc1301030 // bfdot za.s[x8, 0], { z1.h-z4.h }, z0.h\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z29.s }, p0/Z, [x20]\n"
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ ".inst 0xc1381050 // bfdot za.s[x8, 0], { z2.h-z5.h }, z8.h\n"
+ ".inst 0xa1402aa6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x21]\n"
+ "ld1w { z0.s }, p0/Z, [x17]\n"
+ "add x20, x17, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ ".inst 0xc1361031 // bfdot za.s[x8, 1], { z1.h-z4.h }, z6.h\n"
+ "ld1w { z10.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ ".inst 0x658aaba6 // bfcvt z6.h, p2/M, z29.s\n"
+ "ld1w { z9.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ ".inst 0xc13e1051 // bfdot za.s[x8, 1], { z2.h-z5.h }, z14.h\n"
+ "mov x12, #0x4\n"
+ "ld1w { z29.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aa815 // bfcvt z21.h, p2/M, z0.s\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z31.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aa936 // bfcvt z22.h, p2/M, z9.s\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1h { z0.h }, p2/Z, [x22, #2, MUL VL]\n"
+ ".inst 0xc1301070 // bfdot za.s[x8, 0], { z3.h-z6.h }, z0.h\n"
+ "subs x24, x24, #0x1\n"
+ "ld1w { z15.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ ".inst 0xc1acc9b8 // fclamp { z24.s-z27.s }, z13.s, z12.s\n"
+ "ld1w { z30.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "st1w { z24.s }, p1, [x15]\n"
+ "mov x12, #0x8\n"
+ "ld1w { z14.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "st1w { z25.s }, p1, [x14]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1h { z0.h }, p2/Z, [x21, #2, MUL VL]\n"
+ ".inst 0xc1301071 // bfdot za.s[x8, 1], { z3.h-z6.h }, z0.h\n"
+ ".inst 0x658aabf7 // bfcvt z23.h, p2/M, z31.s\n"
+ "ld1w { z8.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ ".inst 0x658aabd8 // bfcvt z24.h, p2/M, z30.s\n"
+ "ld1w { z4.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ ".inst 0x658aa919 // bfcvt z25.h, p2/M, z8.s\n"
+ "ld1w { z5.s }, p0/Z, [x20]\n"
+ "add x15, x15, x13, LSL #2\n"
+ "add x14, x14, x11, LSL #2\n"
+ "st1w { z26.s }, p1, [x10]\n"
+ "add x10, x10, x28, LSL #2\n"
+ "st1w { z27.s }, p1, [x9]\n"
+ "add x9, x9, x27, LSL #2\n"
+ ".inst 0xc0040e04 // mova za.d[x8, #4], { z16.d-z19.d }\n"
+ ".inst 0xa1402be3 // ld1h { z3.h, z11.h }, pn10.b/Z, [SP]\n"
+ ".inst 0x648aa955 // bfcvtnt z21.h, p2/M, z10.s\n"
+ ".inst 0x648aabb6 // bfcvtnt z22.h, p2/M, z29.s\n"
+ "add x17, x17, %x[ld_in_col], LSL #2\n"
+ "ld1h { z7.h }, p2/Z, [SP, #2, MUL VL]\n"
+ ".inst 0x648aa9f7 // bfcvtnt z23.h, p2/M, z15.s\n"
+ ".inst 0x648aa9d8 // bfcvtnt z24.h, p2/M, z14.s\n"
+ ".inst 0x648aa899 // bfcvtnt z25.h, p2/M, z4.s\n"
+ ".inst 0x658aa8ba // bfcvt z26.h, p2/M, z5.s\n"
+ "bgt 18b\n"
+ "19:" // Main loop tail
+ ".inst 0xc13312b0 // bfdot za.s[x8, 0], { z21.h-z24.h }, z3.h\n"
+ "addvl x24, SP, #6\n"
+ "addvl x23, SP, #12\n"
+ ".inst 0xc13b12d0 // bfdot za.s[x8, 0], { z22.h-z25.h }, z11.h\n"
+ ".inst 0xa0402b00 // ld1h { z0.h-z1.h }, pn10.b/Z, [x24]\n"
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ ".inst 0xc13012b1 // bfdot za.s[x8, 1], { z21.h-z24.h }, z0.h\n"
+ "ld1w { z5.s }, p0/Z, [x17]\n"
+ "add x22, x17, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ ".inst 0xc13112d1 // bfdot za.s[x8, 1], { z22.h-z25.h }, z1.h\n"
+ ".inst 0xa0402ae0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x23]\n"
+ "addvl x21, SP, #3\n"
+ "addvl x20, SP, #9\n"
+ ".inst 0xc13012b2 // bfdot za.s[x8, 2], { z21.h-z24.h }, z0.h\n"
+ "ld1w { z29.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z2.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ ".inst 0xc13712f0 // bfdot za.s[x8, 0], { z23.h-z26.h }, z7.h\n"
+ "mov x12, #0x4\n"
+ "ld1h { z0.h }, p2/Z, [x24, #2, MUL VL]\n"
+ ".inst 0xc13112d2 // bfdot za.s[x8, 2], { z22.h-z25.h }, z1.h\n"
+ ".inst 0x658aa8bb // bfcvt z27.h, p2/M, z5.s\n"
+ "ld1w { z20.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ ".inst 0xc13012f1 // bfdot za.s[x8, 1], { z23.h-z26.h }, z0.h\n"
+ "ld1w { z1.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ ".inst 0x658aa85c // bfcvt z28.h, p2/M, z2.s\n"
+ "ld1w { z14.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ ".inst 0x648aabbb // bfcvtnt z27.h, p2/M, z29.s\n"
+ "ld1h { z0.h }, p2/Z, [x23, #2, MUL VL]\n"
+ ".inst 0xc13012f2 // bfdot za.s[x8, 2], { z23.h-z26.h }, z0.h\n"
+ ".inst 0x658aa83d // bfcvt z29.h, p2/M, z1.s\n"
+ "add x17, x17, %x[ld_in_col], LSL #2\n"
+ "ld1w { z1.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ ".inst 0x658aa83e // bfcvt z30.h, p2/M, z1.s\n"
+ "mov x12, #0x8\n"
+ "ld1w { z31.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aaa9c // bfcvtnt z28.h, p2/M, z20.s\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z26.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aa9dd // bfcvtnt z29.h, p2/M, z14.s\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ ".inst 0x648aabfe // bfcvtnt z30.h, p2/M, z31.s\n"
+ ".inst 0x658aab5f // bfcvt z31.h, p2/M, z26.s\n"
+ ".inst 0xa1402aa2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x21]\n"
+ "ld1w { z9.s }, p0/Z, [x22]\n"
+ ".inst 0xc0060c04 // mova { z4.d-z7.d }, za.d[x8, #0]\n"
+ "add x8, x8, #0x1\n"
+ ".inst 0x648aa93f // bfcvtnt z31.h, p2/M, z9.s\n"
+ ".inst 0xc1321370 // bfdot za.s[x8, 0], { z27.h-z30.h }, z2.h\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z26.s }, p0/Z, [x22]\n"
+ ".inst 0xc13a1390 // bfdot za.s[x8, 0], { z28.h-z31.h }, z10.h\n"
+ ".inst 0xa1402a82 // ld1h { z2.h, z10.h }, pn10.b/Z, [x20]\n"
+ ".inst 0x658aab40 // bfcvt z0.h, p2/M, z26.s\n"
+ ".inst 0xc1321371 // bfdot za.s[x8, 1], { z27.h-z30.h }, z2.h\n"
+ "ld1h { z9.h }, p2/Z, [x21, #2, MUL VL]\n"
+ ".inst 0xc1acc9a4 // fclamp { z4.s-z7.s }, z13.s, z12.s\n"
+ ".inst 0xc13a1391 // bfdot za.s[x8, 1], { z28.h-z31.h }, z10.h\n"
+ "st1w { z4.s }, p1, [x15]\n"
+ "add x15, x15, x13, LSL #2\n"
+ ".inst 0xa1402be3 // ld1h { z3.h, z11.h }, pn10.b/Z, [SP]\n"
+ ".inst 0xc13913b0 // bfdot za.s[x8, 0], { z29.h-z0.h }, z9.h\n"
+ "ld1h { z9.h }, p2/Z, [x20, #2, MUL VL]\n"
+ "st1w { z5.s }, p1, [x14]\n"
+ "add x14, x14, x11, LSL #2\n"
+ "st1w { z6.s }, p1, [x10]\n"
+ "add x10, x10, x28, LSL #2\n"
+ ".inst 0xc0040e04 // mova za.d[x8, #4], { z16.d-z19.d }\n"
+ "st1w { z7.s }, p1, [x9]\n"
+ "add x9, x9, x27, LSL #2\n"
+ ".inst 0xc13913b1 // bfdot za.s[x8, 1], { z29.h-z0.h }, z9.h\n"
+ "ld1h { z7.h }, p2/Z, [SP, #2, MUL VL]\n"
+ "20:" // Main loop skip tail
+ "cbz x7, 21f\n" // Skip remainder inputs
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z25.s }, p0/Z, [x17]\n"
+ ".inst 0x658aab3d // bfcvt z29.h, p2/M, z25.s\n"
+ "add x22, x17, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z26.s }, p0/Z, [x22]\n"
+ ".inst 0x648aab5d // bfcvtnt z29.h, p2/M, z26.s\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z25.s }, p0/Z, [x22]\n"
+ ".inst 0x658aab3e // bfcvt z30.h, p2/M, z25.s\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1w { z24.s }, p0/Z, [x22]\n"
+ ".inst 0x648aab1e // bfcvtnt z30.h, p2/M, z24.s\n"
+ "mov x12, #0x4\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z26.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aab5f // bfcvt z31.h, p2/M, z26.s\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z9.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aa93f // bfcvtnt z31.h, p2/M, z9.s\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z9.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aa920 // bfcvt z0.h, p2/M, z9.s\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "mov x12, #0x8\n"
+ "ld1w { z24.s }, p0/Z, [x22]\n"
+ ".inst 0x648aab00 // bfcvtnt z0.h, p2/M, z24.s\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z9.s }, p0/Z, [x22]\n"
+ ".inst 0x658aa921 // bfcvt z1.h, p2/M, z9.s\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z25.s }, p0/Z, [x22]\n"
+ ".inst 0x648aab21 // bfcvtnt z1.h, p2/M, z25.s\n"
+ ".inst 0xc13313b0 // bfdot za.s[x8, 0], { z29.h-z0.h }, z3.h\n"
+ "addvl x21, SP, #6\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc13b13d0 // bfdot za.s[x8, 0], { z30.h-z1.h }, z11.h\n"
+ ".inst 0xa0402aae // ld1h { z14.h-z15.h }, pn10.b/Z, [x21]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "addvl x20, SP, #12\n"
+ ".inst 0xc13e13b1 // bfdot za.s[x8, 1], { z29.h-z0.h }, z14.h\n"
+ "ld1w { z25.s }, p0/Z, [x22]\n"
+ ".inst 0x658aab22 // bfcvt z2.h, p2/M, z25.s\n"
+ "sub x16, x16, #0x1\n"
+ ".inst 0xc13f13d1 // bfdot za.s[x8, 1], { z30.h-z1.h }, z15.h\n"
+ ".inst 0xa0402a8e // ld1h { z14.h-z15.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc13e13b2 // bfdot za.s[x8, 2], { z29.h-z0.h }, z14.h\n"
+ ".inst 0xc13713f0 // bfdot za.s[x8, 0], { z31.h-z2.h }, z7.h\n"
+ "ld1h { z4.h }, p2/Z, [x21, #2, MUL VL]\n"
+ ".inst 0xc13f13d2 // bfdot za.s[x8, 2], { z30.h-z1.h }, z15.h\n"
+ ".inst 0xc13413f1 // bfdot za.s[x8, 1], { z31.h-z2.h }, z4.h\n"
+ "ld1h { z9.h }, p2/Z, [x20, #2, MUL VL]\n"
+ ".inst 0xc0060c04 // mova { z4.d-z7.d }, za.d[x8, #0]\n"
+ ".inst 0xc1acc9a4 // fclamp { z4.s-z7.s }, z13.s, z12.s\n"
+ "st1w { z4.s }, p1, [x15]\n"
+ "add x15, x15, x13, LSL #2\n"
+ ".inst 0xc13913f2 // bfdot za.s[x8, 2], { z31.h-z2.h }, z9.h\n"
+ "add x8, x8, #0x1\n"
+ "st1w { z5.s }, p1, [x14]\n"
+ "add x14, x14, x11, LSL #2\n"
+ "st1w { z6.s }, p1, [x10]\n"
+ "add x10, x10, x28, LSL #2\n"
+ ".inst 0xc0040e04 // mova za.d[x8, #4], { z16.d-z19.d }\n"
+ "st1w { z7.s }, p1, [x9]\n"
+ "add x9, x9, x27, LSL #2\n"
+ "21:" // Tail input: End
+ "cbz x16, 23f\n"
+ "22:" // Right padding loop
+ ".inst 0xc0060c04 // mova { z4.d-z7.d }, za.d[x8, #0]\n"
+ "add x8, x8, #0x1\n"
+ "subs x16, x16, #0x1\n"
+ ".inst 0xc1acc9a4 // fclamp { z4.s-z7.s }, z13.s, z12.s\n"
+ "st1w { z4.s }, p1, [x15]\n"
+ "add x15, x15, x13, LSL #2\n"
+ ".inst 0xc0040e04 // mova za.d[x8, #4], { z16.d-z19.d }\n"
+ "st1w { z5.s }, p1, [x14]\n"
+ "add x14, x14, x11, LSL #2\n"
+ "st1w { z6.s }, p1, [x10]\n"
+ "add x10, x10, x28, LSL #2\n"
+ "st1w { z7.s }, p1, [x9]\n"
+ "add x9, x9, x27, LSL #2\n"
+ "bgt 22b\n"
+ "23:" // End
+ "ldr x20, [%x[args], %[offsetof_Args_weights]]\n"
+ "incb x20, ALL, MUL #16\n"
+ "incb x20, ALL, MUL #9\n"
+ "str x20, [%x[args], %[offsetof_Args_weights]]\n"
+ "ldr x21, [%x[args], %[offsetof_Args_ld_in_vl]]\n"
+ "incw x6\n"
+ "whilelt p1.s, x6, x5\n"
+ "ldr x20, [%x[args], %[offsetof_Args_inptr]]\n"
+ "add x20, x20, x21, LSL #2\n"
+ "str x20, [%x[args], %[offsetof_Args_inptr]]\n"
+ "ldr x25, [%x[args], %[offsetof_Args_outptrs]]\n"
+ "ldr x24, [%x[args], %[offsetof_Args_ld_out_vls]]\n"
+ "ldp x23, x22, [x25, #0x0]\n"
+ "ldp x21, x20, [x24, #0x0]\n"
+ "add x23, x23, x21, LSL #2\n"
+ "add x22, x22, x20, LSL #2\n"
+ "stp x23, x22, [x25, #0x0]\n"
+ "ldp x23, x22, [x25, #0x10]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x23, x23, x21, LSL #2\n"
+ "add x22, x22, x20, LSL #2\n"
+ "stp x23, x22, [x25, #0x10]\n"
+ "b.any 1b\n"
+ "addvl SP, SP, #15\n"
+ ".inst 0xd503467f // SMSTOP\n"
+ :
+ : [args] "r" (&args), [ld_in_col] "r" (ld_in_col), [ld_in_row] "r" (ld_in_row), [offsetof_Args_bias] "I" (offsetof(Args, bias)), [offsetof_Args_clamp_max] "I" (offsetof(Args, clamp_max)), [offsetof_Args_clamp_min] "I" (offsetof(Args, clamp_min)), [offsetof_Args_current_channel] "I" (offsetof(Args, current_channel)), [offsetof_Args_inptr] "I" (offsetof(Args, inptr)), [offsetof_Args_input_cols] "I" (offsetof(Args, input_cols)), [offsetof_Args_ld_in_vl] "I" (offsetof(Args, ld_in_vl)), [offsetof_Args_ld_out_cols] "I" (offsetof(Args, ld_out_cols)), [offsetof_Args_ld_out_vls] "I" (offsetof(Args, ld_out_vls)), [offsetof_Args_n_channels] "I" (offsetof(Args, n_channels)), [offsetof_Args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_Args_output_cols] "I" (offsetof(Args, output_cols)), [offsetof_Args_pad_bottom] "I" (offsetof(Args, pad_bottom)), [offsetof_Args_pad_left] "I" (offsetof(Args, pad_left)), [offsetof_Args_pad_top] "I" (offsetof(Args, pad_top)), [offsetof_Args_weights] "I" (offsetof(Args, weights))
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME2)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_3x3_s1_4rows_dot_za.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_3x3_s1_4rows_dot_za.hpp
new file mode 100644
index 0000000000..de3eadac8a
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_3x3_s1_4rows_dot_za.hpp
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_conv/depthwise/depthwise_planar.hpp"
+
+namespace arm_conv {
+namespace depthwise {
+
+void sme2_s8q_planar_3x3_s1_4rows_dot_za_impl(
+ const int8_t *inptr,
+ size_t ld_in_row,
+ size_t ld_in_col,
+ size_t ld_in_vl,
+ unsigned int pad_top,
+ unsigned int valid_input_rows,
+ unsigned int pad_left,
+ unsigned int valid_input_cols,
+ const int8_t *weights,
+ int8_t **outptrs,
+ const size_t *outlds,
+ const size_t *outvllds,
+ unsigned int output_cols,
+ unsigned int start_channel,
+ unsigned int valid_channels,
+ const arm_gemm::Requantize32 &qp
+);
+
+class sme2_s8q_planar_3x3_s1_4rows_dot_za : public PlanarStrategy<int8_t, int8_t>
+{
+ using Parent = PlanarStrategy<int8_t, int8_t>;
+
+ public:
+ using return_type = int8_t;
+ constexpr static auto output_rows = 4u;
+ constexpr static auto kernel_rows = 3u, kernel_cols = 3u;
+ constexpr static auto stride_rows = 1u, stride_cols = 1u;
+ constexpr static auto vl_type = arm_gemm::VLType::SME;
+
+ sme2_s8q_planar_3x3_s1_4rows_dot_za(const CPUInfo *)
+ : Parent(kernel_rows, kernel_cols, stride_rows, stride_cols, output_rows, vl_type)
+ {
+ }
+
+ typename Parent::KernelType get_kernel(void) const override
+ {
+ return sme2_s8q_planar_3x3_s1_4rows_dot_za_impl;
+ }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_3x3_s1_4rows_dot_za/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_3x3_s1_4rows_dot_za/generic.cpp
new file mode 100644
index 0000000000..845f376926
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_3x3_s1_4rows_dot_za/generic.cpp
@@ -0,0 +1,664 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if defined(ARM_COMPUTE_ENABLE_SME2)
+
+#include <algorithm>
+#include <cstddef>
+#include "arm_gemm.hpp"
+
+using arm_gemm::Requantize32;
+
+namespace arm_conv {
+namespace depthwise {
+
+void sme2_s8q_planar_3x3_s1_4rows_dot_za_impl(
+ const int8_t *inptr,
+ size_t ld_in_row,
+ size_t ld_in_col,
+ size_t ld_in_vl,
+ unsigned int pad_top,
+ unsigned int valid_input_rows,
+ unsigned int pad_left,
+ unsigned int valid_input_cols,
+ const int8_t *weights,
+ int8_t **outptrs,
+ const size_t *outlds,
+ const size_t *outvllds,
+ unsigned int output_cols,
+ unsigned int start_channel,
+ unsigned int valid_channels,
+ const arm_gemm::Requantize32 &qp
+)
+{
+ struct Args
+ {
+ const int8_t *inptr;
+ size_t ld_in_vl;
+ long unsigned int pad_top, pad_bottom, pad_left;
+ const int8_t *weights;
+ long unsigned int input_cols, output_cols;
+ int8_t **outptrs;
+ const size_t *ld_out_cols;
+ const size_t *ld_out_vls;
+ long unsigned int current_channel, n_channels;
+ };
+
+ Args args = { inptr, ld_in_vl, pad_top, 6u - std::min(6u, pad_top + valid_input_rows), pad_left, weights, valid_input_cols, output_cols, outptrs, outlds, outvllds, start_channel, valid_channels };
+
+ __asm__ __volatile__(
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "ldr x6, [%x[args], %[offsetof_Args_pad_bottom]]\n"
+ "ptrue p2.b\n"
+ "mov x20, #0x6\n"
+ "ldr x7, [%x[args], %[offsetof_Args_pad_top]]\n"
+ "ld1rh { z21.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_a_offset]]\n"
+ "sub x20, x20, x6\n"
+ ".inst 0x25207812 // ptrue pn10.b\n"
+ "ldr x17, [%x[args], %[offsetof_Args_n_channels]]\n"
+ "whilelt p1.s, XZR, x17\n"
+ "whilelt p9.s, XZR, x20\n"
+ "ld1rw { z15.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_c_offset]]\n"
+ "whilelt p8.s, XZR, x7\n"
+ "addvl SP, SP, #-12\n"
+ "ldr x16, [%x[args], %[offsetof_Args_current_channel]]\n"
+ "neg z21.h, p2/M, z21.h\n"
+ "eor p8.b, p2/Z, p8.b, p9.b\n"
+ "ld1rw { z14.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_per_layer_mul]]\n"
+ "ld1rw { z12.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_per_layer_right_shift]]\n"
+ "ld1rw { z29.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_minval]]\n"
+ "ld1rw { z28.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_maxval]]\n"
+ "1:" // Channel loop
+ "ldr x20, [%x[qp], %[offsetof_Requantize32_bias]]\n"
+ "mov z30.s, #0x0\n"
+ "cbz x20, 2f\n"
+ "ld1w { z30.s }, p1/Z, [x20, x16, LSL #2]\n"
+ "2:" // Load bias: Done
+ "ldr x22, [%x[args], %[offsetof_Args_weights]]\n"
+ "mov x20, x22\n"
+ "ld1sb { z10.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #3\n"
+ "ld1rh { z31.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_b_offset]]\n"
+ "mov z7.h, #0x0\n"
+ "sub z10.h, z10.h, z31.h\n"
+ "incw x22\n"
+ "ld1sb { z16.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #3\n"
+ "sub z16.h, z16.h, z31.h\n"
+ "trn1 z20.h, z7.h, z10.h\n"
+ "ld1sb { z11.s }, p2/Z, [x20]\n"
+ "sub z11.h, z11.h, z31.h\n"
+ "mov x20, x22\n"
+ "trn1 z19.h, z10.h, z16.h\n"
+ "ld1sb { z24.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #3\n"
+ "trn1 z26.h, z16.h, z11.h\n"
+ "trn1 z13.h, z11.h, z7.h\n"
+ "ld1sb { z11.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #3\n"
+ "sub z24.h, z24.h, z31.h\n"
+ "sub z11.h, z11.h, z31.h\n"
+ "ld1sb { z2.s }, p2/Z, [x20]\n"
+ "sub z2.h, z2.h, z31.h\n"
+ "addvl x21, SP, #12\n"
+ "incw x22\n"
+ "addvl x21, x21, #-4\n"
+ "mov x20, x22\n"
+ "st1h { z20.h }, p2, [x21]\n"
+ "trn1 z22.h, z7.h, z24.h\n"
+ "st1h { z19.h }, p2, [x21, #1, MUL VL]\n"
+ "trn1 z1.h, z24.h, z11.h\n"
+ "ld1sb { z16.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #3\n"
+ "st1h { z26.h }, p2, [x21, #2, MUL VL]\n"
+ "trn1 z3.h, z11.h, z2.h\n"
+ "ld1sb { z0.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #3\n"
+ "st1h { z13.h }, p2, [x21, #3, MUL VL]\n"
+ "trn1 z25.h, z2.h, z7.h\n"
+ "ld1sb { z4.s }, p2/Z, [x20]\n"
+ "ldr x20, [%x[qp], %[offsetof_Requantize32_per_channel_muls]]\n"
+ "sub z16.h, z16.h, z31.h\n"
+ "sub z0.h, z0.h, z31.h\n"
+ "addvl x21, x21, #-4\n"
+ "st1h { z22.h }, p2, [x21]\n"
+ "sub z4.h, z4.h, z31.h\n"
+ "st1h { z1.h }, p2, [x21, #1, MUL VL]\n"
+ "mov z31.d, z30.d\n"
+ "st1h { z3.h }, p2, [x21, #2, MUL VL]\n"
+ "trn1 z24.h, z7.h, z16.h\n"
+ "trn1 z18.h, z16.h, z0.h\n"
+ "st1h { z25.h }, p2, [x21, #3, MUL VL]\n"
+ "addvl x21, x21, #-4\n"
+ "trn1 z0.h, z0.h, z4.h\n"
+ "trn1 z1.h, z4.h, z7.h\n"
+ "st1h { z24.h }, p2, [x21]\n"
+ "st1h { z18.h }, p2, [x21, #1, MUL VL]\n"
+ "st1h { z0.h }, p2, [x21, #2, MUL VL]\n"
+ "st1h { z1.h }, p2, [x21, #3, MUL VL]\n"
+ "cbz x20, 3f\n"
+ "ld1w { z14.s }, p1/Z, [x20, x16, LSL #2]\n"
+ "3:" // Load mul: End
+ "ldr x20, [%x[qp], %[offsetof_Requantize32_per_channel_right_shifts]]\n"
+ "cbz x20, 4f\n"
+ "ld1w { z12.s }, p1/Z, [x20, x16, LSL #2]\n"
+ "4:" // Load right_shift: End
+ "ldr x15, [%x[args], %[offsetof_Args_input_cols]]\n"
+ "sub x20, x15, #0x1\n"
+ "orr x23, x20, %x[ld_in_col], LSL #16\n"
+ "ldr x14, [%x[args], %[offsetof_Args_inptr]]\n"
+ "orr x23, x17, x23, LSL #22\n"
+ "mov x22, #0x6\n"
+ "add x21, x7, x6\n"
+ "lsl x20, %x[ld_in_row], #0x0\n"
+ "ldr x13, [%x[args], %[offsetof_Args_output_cols]]\n"
+ "mov x8, #0x0\n"
+ "lsl x23, x23, #0x0\n"
+ "sub x22, x22, x21\n"
+ "madd x20, x20, x7, x14\n"
+ "5:" // Issue prefetches
+ "subs x22, x22, #0x1\n"
+ ".inst 0xf8b74a9c // rprfm pldstrm, x23, [x20]\n"
+ "add x20, x20, %x[ld_in_col]\n"
+ "bgt 5b\n"
+ "ldr x23, [%x[args], %[offsetof_Args_outptrs]]\n"
+ "lsl x20, %x[ld_in_row], #0x0\n"
+ "msub x14, x7, x20, x14\n"
+ ".inst 0xc0040bc0 // mova za.d[x8, #0], { z30.d-z31.d }\n"
+ "ldr x20, [%x[args], %[offsetof_Args_ld_out_cols]]\n"
+ ".inst 0xc0040bc1 // mova za.d[x8, #1], { z30.d-z31.d }\n"
+ "mov x22, #0x2\n"
+ "ldp x11, x10, [x23], #0x10\n"
+ ".inst 0xc0040bc2 // mova za.d[x8, #2], { z30.d-z31.d }\n"
+ "ldp x9, x28, [x20], #0x10\n"
+ ".inst 0xc0040bc3 // mova za.d[x8, #3], { z30.d-z31.d }\n"
+ "ldr x21, [%x[args], %[offsetof_Args_pad_left]]\n"
+ ".inst 0xc0040bc4 // mova za.d[x8, #4], { z30.d-z31.d }\n"
+ "ldp x27, x26, [x23], #0x10\n"
+ ".inst 0xc0040bc5 // mova za.d[x8, #5], { z30.d-z31.d }\n"
+ "ldp x25, x24, [x20], #0x10\n"
+ "cbz x21, 7f\n"
+ "cmp x21, x22\n"
+ "csel x20, x21, x22, LT\n"
+ "sub x21, x21, x20\n"
+ "sub x22, x22, x20\n"
+ "cbz x21, 7f\n"
+ ".inst 0xc0060818 // mova { z24.d-z25.d }, za.d[x8, #0]\n"
+ "sub x13, x13, x21\n"
+ ".inst 0xc006083a // mova { z26.d-z27.d }, za.d[x8, #1]\n"
+ ".inst 0xc1aeac18 // sqdmulh { z24.s-z27.s }, { z24.s-z27.s }, z14.s\n"
+ ".inst 0xc1acaa38 // srshl { z24.s-z27.s }, { z24.s-z27.s }, z12.s\n"
+ ".inst 0xc1afab18 // add { z24.s-z27.s }, { z24.s-z27.s }, z15.s\n"
+ ".inst 0xc1bccfb8 // sclamp { z24.s-z27.s }, z29.s, z28.s\n"
+ "6:" // Left padding
+ "subs x21, x21, #0x1\n"
+ "st1b { z24.s }, p1, [x11]\n"
+ "add x11, x11, x9\n"
+ "st1b { z26.s }, p1, [x10]\n"
+ "add x10, x10, x28\n"
+ "st1b { z25.s }, p1, [x27]\n"
+ "add x27, x27, x25\n"
+ "st1b { z27.s }, p1, [x26]\n"
+ "add x26, x26, x24\n"
+ "bgt 6b\n"
+ "7:" // Left padding: End
+ "adds XZR, x7, x6\n"
+ "bne 12f\n"
+ "cbz x22, 10f\n"
+ "cmp x22, #0x1\n"
+ "sub x15, x15, x22\n"
+ "beq 9f\n"
+ "8:" // Unpadded: 2 priming loads
+ "add x21, x14, %x[ld_in_row]\n"
+ "ld1sb { z20.s }, p1/Z, [x14]\n"
+ "addvl x20, SP, #8\n"
+ "ld1sb { z16.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z4.h, z20.h, z16.h\n"
+ "add z4.h, z4.h, z21.h\n"
+ "ld1sb { z23.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "add x14, x14, %x[ld_in_col]\n"
+ "ld1sb { z22.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z5.h, z23.h, z22.h\n"
+ "add z5.h, z5.h, z21.h\n"
+ "ld1sb { z17.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "ld1sb { z16.s }, p1/Z, [x21]\n"
+ "trn1 z6.h, z17.h, z16.h\n"
+ "add z6.h, z6.h, z21.h\n"
+ ".inst 0xa1402a83 // ld1h { z3.h, z11.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc16b1488 // sdot za.s[x8, 0], { z4.h-z5.h }, z11.h\n"
+ ".inst 0xc1631489 // sdot za.s[x8, 1], { z4.h-z5.h }, z3.h\n"
+ ".inst 0xa1412a80 // ld1h { z0.h, z8.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc16814a8 // sdot za.s[x8, 0], { z5.h-z6.h }, z8.h\n"
+ ".inst 0xc16014a9 // sdot za.s[x8, 1], { z5.h-z6.h }, z0.h\n"
+ "9:" // Unpadded: 1 priming loads
+ "add x22, x14, %x[ld_in_row]\n"
+ "ld1sb { z25.s }, p1/Z, [x14]\n"
+ "addvl x21, SP, #4\n"
+ "ld1sb { z6.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "trn1 z3.h, z25.h, z6.h\n"
+ "add z3.h, z3.h, z21.h\n"
+ "ld1sb { z18.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "addvl x20, SP, #8\n"
+ "ld1sb { z26.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "trn1 z4.h, z18.h, z26.h\n"
+ "add z4.h, z4.h, z21.h\n"
+ "ld1sb { z2.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "add x14, x14, %x[ld_in_col]\n"
+ "ld1sb { z5.s }, p1/Z, [x22]\n"
+ "trn1 z5.h, z2.h, z5.h\n"
+ "add z5.h, z5.h, z21.h\n"
+ ".inst 0xa0402aa0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc1611468 // sdot za.s[x8, 0], { z3.h-z4.h }, z1.h\n"
+ ".inst 0xc1601469 // sdot za.s[x8, 1], { z3.h-z4.h }, z0.h\n"
+ ".inst 0xa1402a82 // ld1h { z2.h, z10.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xa0412aa8 // ld1h { z8.h-z9.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc16a146a // sdot za.s[x8, 2], { z3.h-z4.h }, z10.h\n"
+ ".inst 0xc162146b // sdot za.s[x8, 3], { z3.h-z4.h }, z2.h\n"
+ ".inst 0xc1691488 // sdot za.s[x8, 0], { z4.h-z5.h }, z9.h\n"
+ ".inst 0xc1681489 // sdot za.s[x8, 1], { z4.h-z5.h }, z8.h\n"
+ ".inst 0xa1412a82 // ld1h { z2.h, z10.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc16a148a // sdot za.s[x8, 2], { z4.h-z5.h }, z10.h\n"
+ ".inst 0xc162148b // sdot za.s[x8, 3], { z4.h-z5.h }, z2.h\n"
+ "10:" // Unpadded: 0 priming loads
+ ".inst 0xa1402be5 // ld1h { z5.h, z13.h }, pn10.b/Z, [SP]\n"
+ ".inst 0xa0412bea // ld1h { z10.h-z11.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
+ "cbz x15, 18f\n"
+ "add x20, x14, %x[ld_in_row]\n"
+ "ld1sb { z17.s }, p1/Z, [x14]\n"
+ "sub x15, x15, #0x1\n"
+ "ld1sb { z9.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z6.h, z17.h, z9.h\n"
+ "sub x13, x13, #0x1\n"
+ "ld1sb { z17.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "cmp x15, x13\n"
+ "add z6.h, z6.h, z21.h\n"
+ "ld1sb { z7.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z7.h, z17.h, z7.h\n"
+ "csel x23, x15, x13, LT\n"
+ "ld1sb { z17.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "add z7.h, z7.h, z21.h\n"
+ "add x14, x14, %x[ld_in_col]\n"
+ "ld1sb { z1.s }, p1/Z, [x20]\n"
+ "trn1 z8.h, z17.h, z1.h\n"
+ "add z8.h, z8.h, z21.h\n"
+ "sub x13, x13, x23\n"
+ "cbz x23, 17f\n"
+ "11:" // Unpadded: Main loop
+ ".inst 0xc16d14c8 // sdot za.s[x8, 0], { z6.h-z7.h }, z13.h\n"
+ "addvl x22, SP, #4\n"
+ "addvl x21, SP, #8\n"
+ "ld1sb { z2.s }, p1/Z, [x14]\n"
+ ".inst 0xc16514c9 // sdot za.s[x8, 1], { z6.h-z7.h }, z5.h\n"
+ ".inst 0xa1402ac5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x22]\n"
+ "add x20, x14, %x[ld_in_row]\n"
+ "subs x23, x23, #0x1\n"
+ ".inst 0xc16b14e8 // sdot za.s[x8, 0], { z7.h-z8.h }, z11.h\n"
+ "ld1sb { z19.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "add x14, x14, %x[ld_in_col]\n"
+ ".inst 0xc16a14e9 // sdot za.s[x8, 1], { z7.h-z8.h }, z10.h\n"
+ ".inst 0xa1412ac3 // ld1h { z3.h, z11.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
+ ".inst 0xc0060818 // mova { z24.d-z25.d }, za.d[x8, #0]\n"
+ "ld1sb { z23.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0xc006083a // mova { z26.d-z27.d }, za.d[x8, #1]\n"
+ ".inst 0xc1aeac18 // sqdmulh { z24.s-z27.s }, { z24.s-z27.s }, z14.s\n"
+ ".inst 0xc16d14ca // sdot za.s[x8, 2], { z6.h-z7.h }, z13.h\n"
+ "ld1sb { z18.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0xc16514cb // sdot za.s[x8, 3], { z6.h-z7.h }, z5.h\n"
+ ".inst 0xa1402aa1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc1acaa38 // srshl { z24.s-z27.s }, { z24.s-z27.s }, z12.s\n"
+ ".inst 0xc16914cc // sdot za.s[x8, 4], { z6.h-z7.h }, z9.h\n"
+ "ld1sb { z17.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0xc16114cd // sdot za.s[x8, 5], { z6.h-z7.h }, z1.h\n"
+ "ld1sb { z16.s }, p1/Z, [x20]\n"
+ ".inst 0xc1afab18 // add { z24.s-z27.s }, { z24.s-z27.s }, z15.s\n"
+ ".inst 0xc16b14ea // sdot za.s[x8, 2], { z7.h-z8.h }, z11.h\n"
+ "trn1 z6.h, z2.h, z19.h\n"
+ ".inst 0xa1402be5 // ld1h { z5.h, z13.h }, pn10.b/Z, [SP]\n"
+ ".inst 0xc16314eb // sdot za.s[x8, 3], { z7.h-z8.h }, z3.h\n"
+ ".inst 0xa1412aa1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc1bccfb8 // sclamp { z24.s-z27.s }, z29.s, z28.s\n"
+ ".inst 0xc16914ec // sdot za.s[x8, 4], { z7.h-z8.h }, z9.h\n"
+ "st1b { z24.s }, p1, [x11]\n"
+ "add x11, x11, x9\n"
+ "add z6.h, z6.h, z21.h\n"
+ ".inst 0xc16114ed // sdot za.s[x8, 5], { z7.h-z8.h }, z1.h\n"
+ "trn1 z7.h, z23.h, z18.h\n"
+ "trn1 z8.h, z17.h, z16.h\n"
+ "add x8, x8, #0x2\n"
+ ".inst 0xa0412bea // ld1h { z10.h-z11.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
+ "st1b { z26.s }, p1, [x10]\n"
+ "add x10, x10, x28\n"
+ ".inst 0xc0040bc4 // mova za.d[x8, #4], { z30.d-z31.d }\n"
+ "st1b { z25.s }, p1, [x27]\n"
+ "add x27, x27, x25\n"
+ ".inst 0xc0040bc5 // mova za.d[x8, #5], { z30.d-z31.d }\n"
+ "add z7.h, z7.h, z21.h\n"
+ "st1b { z27.s }, p1, [x26]\n"
+ "add x26, x26, x24\n"
+ "add z8.h, z8.h, z21.h\n"
+ "bgt 11b\n"
+ "b 17f\n"
+ "12:" // Padded
+ "cbz x22, 15f\n"
+ "cmp x22, #0x1\n"
+ "sub x15, x15, x22\n"
+ "beq 14f\n"
+ "13:" // Padded: 2 priming loads
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1sb { z19.s }, p0/Z, [x14]\n"
+ "add z19.h, p0/M, z19.h, z21.h\n"
+ "add x20, x14, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1sb { z18.s }, p0/Z, [x20]\n"
+ "add z18.h, p0/M, z18.h, z21.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1sb { z17.s }, p0/Z, [x20]\n"
+ "add z17.h, p0/M, z17.h, z21.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1sb { z16.s }, p0/Z, [x20]\n"
+ "add z16.h, p0/M, z16.h, z21.h\n"
+ "mov x12, #0x4\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z7.h, z19.h, z18.h\n"
+ "trn1 z8.h, z17.h, z16.h\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1sb { z17.s }, p0/Z, [x20]\n"
+ "add z17.h, p0/M, z17.h, z21.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1sb { z16.s }, p0/Z, [x20]\n"
+ "addvl x20, SP, #8\n"
+ "add z16.h, p0/M, z16.h, z21.h\n"
+ ".inst 0xa1402a82 // ld1h { z2.h, z10.h }, pn10.b/Z, [x20]\n"
+ "trn1 z9.h, z17.h, z16.h\n"
+ ".inst 0xc16a14e8 // sdot za.s[x8, 0], { z7.h-z8.h }, z10.h\n"
+ "add x14, x14, %x[ld_in_col]\n"
+ ".inst 0xc16214e9 // sdot za.s[x8, 1], { z7.h-z8.h }, z2.h\n"
+ ".inst 0xa1412a85 // ld1h { z5.h, z13.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc16d1508 // sdot za.s[x8, 0], { z8.h-z9.h }, z13.h\n"
+ ".inst 0xc1651509 // sdot za.s[x8, 1], { z8.h-z9.h }, z5.h\n"
+ "14:" // Padded: 1 priming loads
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1sb { z19.s }, p0/Z, [x14]\n"
+ "add z19.h, p0/M, z19.h, z21.h\n"
+ "add x20, x14, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1sb { z18.s }, p0/Z, [x20]\n"
+ "add z18.h, p0/M, z18.h, z21.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1sb { z17.s }, p0/Z, [x20]\n"
+ "add z17.h, p0/M, z17.h, z21.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1sb { z16.s }, p0/Z, [x20]\n"
+ "add z16.h, p0/M, z16.h, z21.h\n"
+ "mov x12, #0x4\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z22.h, z19.h, z18.h\n"
+ "trn1 z23.h, z17.h, z16.h\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1sb { z17.s }, p0/Z, [x20]\n"
+ "add z17.h, p0/M, z17.h, z21.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1sb { z16.s }, p0/Z, [x20]\n"
+ "addvl x21, SP, #4\n"
+ "add z16.h, p0/M, z16.h, z21.h\n"
+ ".inst 0xa0402aa0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x21]\n"
+ "addvl x20, SP, #8\n"
+ "trn1 z24.h, z17.h, z16.h\n"
+ ".inst 0xc16116c8 // sdot za.s[x8, 0], { z22.h-z23.h }, z1.h\n"
+ ".inst 0xc16016c9 // sdot za.s[x8, 1], { z22.h-z23.h }, z0.h\n"
+ ".inst 0xa1402a85 // ld1h { z5.h, z13.h }, pn10.b/Z, [x20]\n"
+ "add x14, x14, %x[ld_in_col]\n"
+ ".inst 0xa0412aa0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc16d16ca // sdot za.s[x8, 2], { z22.h-z23.h }, z13.h\n"
+ ".inst 0xc16516cb // sdot za.s[x8, 3], { z22.h-z23.h }, z5.h\n"
+ ".inst 0xc16116e8 // sdot za.s[x8, 0], { z23.h-z24.h }, z1.h\n"
+ ".inst 0xc16016e9 // sdot za.s[x8, 1], { z23.h-z24.h }, z0.h\n"
+ ".inst 0xa0412a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc16116ea // sdot za.s[x8, 2], { z23.h-z24.h }, z1.h\n"
+ ".inst 0xc16016eb // sdot za.s[x8, 3], { z23.h-z24.h }, z0.h\n"
+ "15:" // Padded: 0 priming loads
+ ".inst 0xa1402be5 // ld1h { z5.h, z13.h }, pn10.b/Z, [SP]\n"
+ ".inst 0xa0412bea // ld1h { z10.h-z11.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
+ "cbz x15, 18f\n"
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1sb { z19.s }, p0/Z, [x14]\n"
+ "add z19.h, p0/M, z19.h, z21.h\n"
+ "add x20, x14, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1sb { z18.s }, p0/Z, [x20]\n"
+ "add z18.h, p0/M, z18.h, z21.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1sb { z17.s }, p0/Z, [x20]\n"
+ "add z17.h, p0/M, z17.h, z21.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1sb { z16.s }, p0/Z, [x20]\n"
+ "add z16.h, p0/M, z16.h, z21.h\n"
+ "mov x12, #0x4\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z6.h, z19.h, z18.h\n"
+ "trn1 z7.h, z17.h, z16.h\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1sb { z17.s }, p0/Z, [x20]\n"
+ "add z17.h, p0/M, z17.h, z21.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1sb { z16.s }, p0/Z, [x20]\n"
+ "add z16.h, p0/M, z16.h, z21.h\n"
+ "sub x15, x15, #0x1\n"
+ "sub x13, x13, #0x1\n"
+ "cmp x15, x13\n"
+ "trn1 z8.h, z17.h, z16.h\n"
+ "csel x23, x15, x13, LT\n"
+ "add x14, x14, %x[ld_in_col]\n"
+ "sub x13, x13, x23\n"
+ "cbz x23, 17f\n"
+ "16:" // Padded: Main loop
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1sb { z9.s }, p0/Z, [x14]\n"
+ ".inst 0xc16d14c8 // sdot za.s[x8, 0], { z6.h-z7.h }, z13.h\n"
+ ".inst 0xc16514c9 // sdot za.s[x8, 1], { z6.h-z7.h }, z5.h\n"
+ "add z9.h, p0/M, z9.h, z21.h\n"
+ "add x22, x14, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1sb { z19.s }, p0/Z, [x22]\n"
+ ".inst 0xc16b14e8 // sdot za.s[x8, 0], { z7.h-z8.h }, z11.h\n"
+ "add z19.h, p0/M, z19.h, z21.h\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ ".inst 0xc16a14e9 // sdot za.s[x8, 1], { z7.h-z8.h }, z10.h\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1sb { z18.s }, p0/Z, [x22]\n"
+ "add z18.h, p0/M, z18.h, z21.h\n"
+ ".inst 0xc0060818 // mova { z24.d-z25.d }, za.d[x8, #0]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1sb { z16.s }, p0/Z, [x22]\n"
+ ".inst 0xc006083a // mova { z26.d-z27.d }, za.d[x8, #1]\n"
+ "mov x12, #0x4\n"
+ "addvl x21, SP, #4\n"
+ "add z16.h, p0/M, z16.h, z21.h\n"
+ ".inst 0xc1aeac18 // sqdmulh { z24.s-z27.s }, { z24.s-z27.s }, z14.s\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ ".inst 0xa1402aa3 // ld1h { z3.h, z11.h }, pn10.b/Z, [x21]\n"
+ "addvl x20, SP, #8\n"
+ ".inst 0xc16b14ca // sdot za.s[x8, 2], { z6.h-z7.h }, z11.h\n"
+ "subs x23, x23, #0x1\n"
+ "ld1sb { z17.s }, p0/Z, [x22]\n"
+ ".inst 0xc16314cb // sdot za.s[x8, 3], { z6.h-z7.h }, z3.h\n"
+ ".inst 0xc1acaa38 // srshl { z24.s-z27.s }, { z24.s-z27.s }, z12.s\n"
+ ".inst 0xa1402a85 // ld1h { z5.h, z13.h }, pn10.b/Z, [x20]\n"
+ "add z17.h, p0/M, z17.h, z21.h\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ ".inst 0xa0412aaa // ld1h { z10.h-z11.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc16d14cc // sdot za.s[x8, 4], { z6.h-z7.h }, z13.h\n"
+ ".inst 0xc1afab18 // add { z24.s-z27.s }, { z24.s-z27.s }, z15.s\n"
+ "ld1sb { z2.s }, p0/Z, [x22]\n"
+ ".inst 0xc16514cd // sdot za.s[x8, 5], { z6.h-z7.h }, z5.h\n"
+ "add z2.h, p0/M, z2.h, z21.h\n"
+ "add x14, x14, %x[ld_in_col]\n"
+ ".inst 0xc16b14ea // sdot za.s[x8, 2], { z7.h-z8.h }, z11.h\n"
+ ".inst 0xa1402be5 // ld1h { z5.h, z13.h }, pn10.b/Z, [SP]\n"
+ ".inst 0xc1bccfb8 // sclamp { z24.s-z27.s }, z29.s, z28.s\n"
+ ".inst 0xc16a14eb // sdot za.s[x8, 3], { z7.h-z8.h }, z10.h\n"
+ ".inst 0xa1412a83 // ld1h { z3.h, z11.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ "st1b { z24.s }, p1, [x11]\n"
+ "add x11, x11, x9\n"
+ ".inst 0xc16b14ec // sdot za.s[x8, 4], { z7.h-z8.h }, z11.h\n"
+ "st1b { z26.s }, p1, [x10]\n"
+ "add x10, x10, x28\n"
+ "trn1 z6.h, z9.h, z19.h\n"
+ ".inst 0xc16314ed // sdot za.s[x8, 5], { z7.h-z8.h }, z3.h\n"
+ "add x8, x8, #0x2\n"
+ ".inst 0xa0412bea // ld1h { z10.h-z11.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
+ "st1b { z25.s }, p1, [x27]\n"
+ "add x27, x27, x25\n"
+ "st1b { z27.s }, p1, [x26]\n"
+ "add x26, x26, x24\n"
+ ".inst 0xc0040bc4 // mova za.d[x8, #4], { z30.d-z31.d }\n"
+ ".inst 0xc0040bc5 // mova za.d[x8, #5], { z30.d-z31.d }\n"
+ "trn1 z7.h, z18.h, z16.h\n"
+ "trn1 z8.h, z17.h, z2.h\n"
+ "bgt 16b\n"
+ "17:" // Main loop tail
+ ".inst 0xc16d14c8 // sdot za.s[x8, 0], { z6.h-z7.h }, z13.h\n"
+ "addvl x21, SP, #4\n"
+ "addvl x20, SP, #8\n"
+ ".inst 0xc16514c9 // sdot za.s[x8, 1], { z6.h-z7.h }, z5.h\n"
+ ".inst 0xa0402aa0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc16b14e8 // sdot za.s[x8, 0], { z7.h-z8.h }, z11.h\n"
+ ".inst 0xc16a14e9 // sdot za.s[x8, 1], { z7.h-z8.h }, z10.h\n"
+ ".inst 0xa0412aa2 // ld1h { z2.h-z3.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc0060818 // mova { z24.d-z25.d }, za.d[x8, #0]\n"
+ ".inst 0xc006083a // mova { z26.d-z27.d }, za.d[x8, #1]\n"
+ ".inst 0xc1aeac18 // sqdmulh { z24.s-z27.s }, { z24.s-z27.s }, z14.s\n"
+ ".inst 0xc16114ca // sdot za.s[x8, 2], { z6.h-z7.h }, z1.h\n"
+ ".inst 0xc1acaa38 // srshl { z24.s-z27.s }, { z24.s-z27.s }, z12.s\n"
+ ".inst 0xc16014cb // sdot za.s[x8, 3], { z6.h-z7.h }, z0.h\n"
+ ".inst 0xa1402a81 // ld1h { z1.h, z9.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc1afab18 // add { z24.s-z27.s }, { z24.s-z27.s }, z15.s\n"
+ ".inst 0xc16914cc // sdot za.s[x8, 4], { z6.h-z7.h }, z9.h\n"
+ ".inst 0xc1bccfb8 // sclamp { z24.s-z27.s }, z29.s, z28.s\n"
+ "st1b { z24.s }, p1, [x11]\n"
+ "add x11, x11, x9\n"
+ ".inst 0xc16114cd // sdot za.s[x8, 5], { z6.h-z7.h }, z1.h\n"
+ "st1b { z26.s }, p1, [x10]\n"
+ "add x10, x10, x28\n"
+ ".inst 0xc16314ea // sdot za.s[x8, 2], { z7.h-z8.h }, z3.h\n"
+ "st1b { z25.s }, p1, [x27]\n"
+ "add x27, x27, x25\n"
+ ".inst 0xc16214eb // sdot za.s[x8, 3], { z7.h-z8.h }, z2.h\n"
+ ".inst 0xa0412a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ "st1b { z27.s }, p1, [x26]\n"
+ "add x26, x26, x24\n"
+ ".inst 0xc16114ec // sdot za.s[x8, 4], { z7.h-z8.h }, z1.h\n"
+ ".inst 0xc16014ed // sdot za.s[x8, 5], { z7.h-z8.h }, z0.h\n"
+ "add x8, x8, #0x2\n"
+ ".inst 0xc0040bc4 // mova za.d[x8, #4], { z30.d-z31.d }\n"
+ ".inst 0xc0040bc5 // mova za.d[x8, #5], { z30.d-z31.d }\n"
+ "18:" // Main loop skip tail
+ "cbz x13, 20f\n"
+ "19:" // Right padding loop
+ ".inst 0xc0060804 // mova { z4.d-z5.d }, za.d[x8, #0]\n"
+ "subs x13, x13, #0x1\n"
+ ".inst 0xc0060826 // mova { z6.d-z7.d }, za.d[x8, #1]\n"
+ ".inst 0xc1aeac04 // sqdmulh { z4.s-z7.s }, { z4.s-z7.s }, z14.s\n"
+ "add x8, x8, #0x2\n"
+ ".inst 0xc1acaa24 // srshl { z4.s-z7.s }, { z4.s-z7.s }, z12.s\n"
+ ".inst 0xc0040bc4 // mova za.d[x8, #4], { z30.d-z31.d }\n"
+ ".inst 0xc1afab04 // add { z4.s-z7.s }, { z4.s-z7.s }, z15.s\n"
+ ".inst 0xc0040bc5 // mova za.d[x8, #5], { z30.d-z31.d }\n"
+ ".inst 0xc1bccfa4 // sclamp { z4.s-z7.s }, z29.s, z28.s\n"
+ "st1b { z4.s }, p1, [x11]\n"
+ "add x11, x11, x9\n"
+ "st1b { z6.s }, p1, [x10]\n"
+ "add x10, x10, x28\n"
+ "st1b { z5.s }, p1, [x27]\n"
+ "add x27, x27, x25\n"
+ "st1b { z7.s }, p1, [x26]\n"
+ "add x26, x26, x24\n"
+ "bgt 19b\n"
+ "20:" // End
+ "ldr x20, [%x[args], %[offsetof_Args_weights]]\n"
+ "incw x20, ALL, MUL #9\n"
+ "str x20, [%x[args], %[offsetof_Args_weights]]\n"
+ "incw x16\n"
+ "ldr x21, [%x[args], %[offsetof_Args_ld_in_vl]]\n"
+ "whilelt p1.s, x16, x17\n"
+ "ldr x20, [%x[args], %[offsetof_Args_inptr]]\n"
+ "add x20, x20, x21\n"
+ "str x20, [%x[args], %[offsetof_Args_inptr]]\n"
+ "ldr x25, [%x[args], %[offsetof_Args_outptrs]]\n"
+ "ldr x24, [%x[args], %[offsetof_Args_ld_out_vls]]\n"
+ "ldp x23, x22, [x25, #0x0]\n"
+ "ldp x21, x20, [x24, #0x0]\n"
+ "add x23, x23, x21\n"
+ "add x22, x22, x20\n"
+ "stp x23, x22, [x25, #0x0]\n"
+ "ldp x23, x22, [x25, #0x10]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x23, x23, x21\n"
+ "add x22, x22, x20\n"
+ "stp x23, x22, [x25, #0x10]\n"
+ "b.any 1b\n"
+ "addvl SP, SP, #12\n"
+ ".inst 0xd503467f // SMSTOP\n"
+ :
+ : [args] "r" (&args), [ld_in_col] "r" (ld_in_col), [ld_in_row] "r" (ld_in_row), [offsetof_Args_current_channel] "I" (offsetof(Args, current_channel)), [offsetof_Args_inptr] "I" (offsetof(Args, inptr)), [offsetof_Args_input_cols] "I" (offsetof(Args, input_cols)), [offsetof_Args_ld_in_vl] "I" (offsetof(Args, ld_in_vl)), [offsetof_Args_ld_out_cols] "I" (offsetof(Args, ld_out_cols)), [offsetof_Args_ld_out_vls] "I" (offsetof(Args, ld_out_vls)), [offsetof_Args_n_channels] "I" (offsetof(Args, n_channels)), [offsetof_Args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_Args_output_cols] "I" (offsetof(Args, output_cols)), [offsetof_Args_pad_bottom] "I" (offsetof(Args, pad_bottom)), [offsetof_Args_pad_left] "I" (offsetof(Args, pad_left)), [offsetof_Args_pad_top] "I" (offsetof(Args, pad_top)), [offsetof_Args_weights] "I" (offsetof(Args, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_bias] "I" (offsetof(arm_gemm::Requantize32, bias)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [offsetof_Requantize32_per_channel_muls] "I" (offsetof(arm_gemm::Requantize32, per_channel_muls)), [offsetof_Requantize32_per_channel_right_shifts] "I" (offsetof(arm_gemm::Requantize32, per_channel_right_shifts)), [offsetof_Requantize32_per_layer_mul] "I" (offsetof(arm_gemm::Requantize32, per_layer_mul)), [offsetof_Requantize32_per_layer_right_shift] "I" (offsetof(arm_gemm::Requantize32, per_layer_right_shift)), [qp] "r" (&qp)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME2)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_3x3_s2_4rows_dot_za.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_3x3_s2_4rows_dot_za.hpp
new file mode 100644
index 0000000000..56fb127aa0
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_3x3_s2_4rows_dot_za.hpp
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_conv/depthwise/depthwise_planar.hpp"
+
+namespace arm_conv {
+namespace depthwise {
+
+void sme2_s8q_planar_3x3_s2_4rows_dot_za_impl(
+ const int8_t *inptr,
+ size_t ld_in_row,
+ size_t ld_in_col,
+ size_t ld_in_vl,
+ unsigned int pad_top,
+ unsigned int valid_input_rows,
+ unsigned int pad_left,
+ unsigned int valid_input_cols,
+ const int8_t *weights,
+ int8_t **outptrs,
+ const size_t *outlds,
+ const size_t *outvllds,
+ unsigned int output_cols,
+ unsigned int start_channel,
+ unsigned int valid_channels,
+ const arm_gemm::Requantize32 &qp
+);
+
+class sme2_s8q_planar_3x3_s2_4rows_dot_za : public PlanarStrategy<int8_t, int8_t>
+{
+ using Parent = PlanarStrategy<int8_t, int8_t>;
+
+ public:
+ using return_type = int8_t;
+ constexpr static auto output_rows = 4u;
+ constexpr static auto kernel_rows = 3u, kernel_cols = 3u;
+ constexpr static auto stride_rows = 2u, stride_cols = 2u;
+ constexpr static auto vl_type = arm_gemm::VLType::SME;
+
+ sme2_s8q_planar_3x3_s2_4rows_dot_za(const CPUInfo *)
+ : Parent(kernel_rows, kernel_cols, stride_rows, stride_cols, output_rows, vl_type)
+ {
+ }
+
+ typename Parent::KernelType get_kernel(void) const override
+ {
+ return sme2_s8q_planar_3x3_s2_4rows_dot_za_impl;
+ }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_3x3_s2_4rows_dot_za/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_3x3_s2_4rows_dot_za/generic.cpp
new file mode 100644
index 0000000000..1d0efc6bc1
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_3x3_s2_4rows_dot_za/generic.cpp
@@ -0,0 +1,881 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if defined(ARM_COMPUTE_ENABLE_SME2)
+
+#include <algorithm>
+#include <cstddef>
+#include "arm_gemm.hpp"
+
+using arm_gemm::Requantize32;
+
+namespace arm_conv {
+namespace depthwise {
+
+void sme2_s8q_planar_3x3_s2_4rows_dot_za_impl(
+ const int8_t *inptr,
+ size_t ld_in_row,
+ size_t ld_in_col,
+ size_t ld_in_vl,
+ unsigned int pad_top,
+ unsigned int valid_input_rows,
+ unsigned int pad_left,
+ unsigned int valid_input_cols,
+ const int8_t *weights,
+ int8_t **outptrs,
+ const size_t *outlds,
+ const size_t *outvllds,
+ unsigned int output_cols,
+ unsigned int start_channel,
+ unsigned int valid_channels,
+ const arm_gemm::Requantize32 &qp
+)
+{
+ struct Args
+ {
+ const int8_t *inptr;
+ size_t ld_in_vl;
+ long unsigned int pad_top, pad_bottom, pad_left;
+ const int8_t *weights;
+ long unsigned int input_cols, output_cols;
+ int8_t **outptrs;
+ const size_t *ld_out_cols;
+ const size_t *ld_out_vls;
+ long unsigned int current_channel, n_channels;
+ };
+
+ Args args = { inptr, ld_in_vl, pad_top, 9u - std::min(9u, pad_top + valid_input_rows), pad_left, weights, valid_input_cols, output_cols, outptrs, outlds, outvllds, start_channel, valid_channels };
+
+ __asm__ __volatile__(
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "ldr x6, [%x[args], %[offsetof_Args_pad_bottom]]\n"
+ "ptrue p2.b\n"
+ "mov x20, #0x9\n"
+ "ldr x7, [%x[args], %[offsetof_Args_pad_top]]\n"
+ "ld1rh { z11.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_a_offset]]\n"
+ "sub x20, x20, x6\n"
+ ".inst 0x25207812 // ptrue pn10.b\n"
+ "ldr x17, [%x[args], %[offsetof_Args_n_channels]]\n"
+ "whilelt p1.s, XZR, x17\n"
+ "whilelt p9.s, XZR, x20\n"
+ "ld1rw { z13.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_c_offset]]\n"
+ "whilelt p8.s, XZR, x7\n"
+ "addvl SP, SP, #-6\n"
+ "ldr x16, [%x[args], %[offsetof_Args_current_channel]]\n"
+ "neg z11.h, p2/M, z11.h\n"
+ "eor p8.b, p2/Z, p8.b, p9.b\n"
+ "ld1rw { z6.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_per_layer_mul]]\n"
+ "ld1rw { z9.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_per_layer_right_shift]]\n"
+ "ld1rw { z10.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_minval]]\n"
+ "ld1rw { z7.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_maxval]]\n"
+ "1:" // Channel loop
+ "ldr x20, [%x[qp], %[offsetof_Requantize32_bias]]\n"
+ "mov z28.s, #0x0\n"
+ "cbz x20, 2f\n"
+ "ld1w { z28.s }, p1/Z, [x20, x16, LSL #2]\n"
+ "2:" // Load bias: Done
+ "ldr x22, [%x[args], %[offsetof_Args_weights]]\n"
+ "mov x20, x22\n"
+ "ld1sb { z26.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #3\n"
+ "ld1rh { z16.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_b_offset]]\n"
+ "sub z26.h, z26.h, z16.h\n"
+ "incw x22\n"
+ "mov z24.h, #0x0\n"
+ "ld1sb { z3.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #3\n"
+ "sub z3.h, z3.h, z16.h\n"
+ "trn1 z31.h, z26.h, z3.h\n"
+ "ld1sb { z21.s }, p2/Z, [x20]\n"
+ "sub z21.h, z21.h, z16.h\n"
+ "mov x20, x22\n"
+ "trn1 z14.h, z21.h, z24.h\n"
+ "ld1sb { z2.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #3\n"
+ "sub z2.h, z2.h, z16.h\n"
+ "addvl x21, SP, #6\n"
+ "ld1sb { z25.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #3\n"
+ "sub z25.h, z25.h, z16.h\n"
+ "incw x22\n"
+ "ld1sb { z27.s }, p2/Z, [x20]\n"
+ "sub z27.h, z27.h, z16.h\n"
+ "addvl x21, x21, #-2\n"
+ "mov x20, x22\n"
+ "st1h { z31.h }, p2, [x21]\n"
+ "trn1 z4.h, z2.h, z25.h\n"
+ "ld1sb { z26.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #3\n"
+ "ld1sb { z23.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #3\n"
+ "st1h { z14.h }, p2, [x21, #1, MUL VL]\n"
+ "trn1 z12.h, z27.h, z24.h\n"
+ "ld1sb { z20.s }, p2/Z, [x20]\n"
+ "sub z26.h, z26.h, z16.h\n"
+ "sub z23.h, z23.h, z16.h\n"
+ "ldr x20, [%x[qp], %[offsetof_Requantize32_per_channel_muls]]\n"
+ "sub z20.h, z20.h, z16.h\n"
+ "addvl x21, x21, #-2\n"
+ "st1h { z4.h }, p2, [x21]\n"
+ "mov z29.d, z28.d\n"
+ "st1h { z12.h }, p2, [x21, #1, MUL VL]\n"
+ "addvl x21, x21, #-2\n"
+ "mov z30.d, z28.d\n"
+ "mov z31.d, z28.d\n"
+ "trn1 z25.h, z26.h, z23.h\n"
+ "st1h { z25.h }, p2, [x21]\n"
+ "trn1 z3.h, z20.h, z24.h\n"
+ "st1h { z3.h }, p2, [x21, #1, MUL VL]\n"
+ "cbz x20, 3f\n"
+ "ld1w { z6.s }, p1/Z, [x20, x16, LSL #2]\n"
+ "3:" // Load mul: End
+ "ldr x20, [%x[qp], %[offsetof_Requantize32_per_channel_right_shifts]]\n"
+ "cbz x20, 4f\n"
+ "ld1w { z9.s }, p1/Z, [x20, x16, LSL #2]\n"
+ "4:" // Load right_shift: End
+ "ldr x15, [%x[args], %[offsetof_Args_input_cols]]\n"
+ "sub x20, x15, #0x1\n"
+ "orr x23, x20, %x[ld_in_col], LSL #16\n"
+ "ldr x14, [%x[args], %[offsetof_Args_inptr]]\n"
+ "orr x23, x17, x23, LSL #22\n"
+ "mov x22, #0x9\n"
+ "add x21, x7, x6\n"
+ "lsl x20, %x[ld_in_row], #0x0\n"
+ "ldr x13, [%x[args], %[offsetof_Args_output_cols]]\n"
+ "mov x8, #0x0\n"
+ "lsl x23, x23, #0x0\n"
+ "sub x22, x22, x21\n"
+ "madd x20, x20, x7, x14\n"
+ "5:" // Issue prefetches
+ "subs x22, x22, #0x1\n"
+ ".inst 0xf8b74a9c // rprfm pldstrm, x23, [x20]\n"
+ "add x20, x20, %x[ld_in_col]\n"
+ "bgt 5b\n"
+ "ldr x23, [%x[args], %[offsetof_Args_outptrs]]\n"
+ "lsl x20, %x[ld_in_row], #0x0\n"
+ "msub x14, x7, x20, x14\n"
+ ".inst 0xc0040f80 // mova za.d[x8, #0], { z28.d-z31.d }\n"
+ "ldr x20, [%x[args], %[offsetof_Args_ld_out_cols]]\n"
+ ".inst 0xc0040f81 // mova za.d[x8, #1], { z28.d-z31.d }\n"
+ "mov x22, #0x2\n"
+ "ldp x11, x10, [x23], #0x10\n"
+ ".inst 0xc0040f82 // mova za.d[x8, #2], { z28.d-z31.d }\n"
+ "ldp x9, x28, [x20], #0x10\n"
+ "ldr x21, [%x[args], %[offsetof_Args_pad_left]]\n"
+ "ldp x27, x26, [x23], #0x10\n"
+ "ldp x25, x24, [x20], #0x10\n"
+ "cbz x21, 7f\n"
+ "cmp x21, x22\n"
+ "csel x20, x21, x22, LT\n"
+ "sub x21, x21, x20\n"
+ "sub x22, x22, x20\n"
+ "cbz x21, 7f\n"
+ ".inst 0xc0060c18 // mova { z24.d-z27.d }, za.d[x8, #0]\n"
+ ".inst 0xc1a6ac18 // sqdmulh { z24.s-z27.s }, { z24.s-z27.s }, z6.s\n"
+ "and x22, x21, #0x1\n"
+ ".inst 0xc1a9aa38 // srshl { z24.s-z27.s }, { z24.s-z27.s }, z9.s\n"
+ "add x21, x21, #0x1\n"
+ "lsr x21, x21, #0x1\n"
+ ".inst 0xc1adab18 // add { z24.s-z27.s }, { z24.s-z27.s }, z13.s\n"
+ "sub x13, x13, x21\n"
+ ".inst 0xc1a7cd58 // sclamp { z24.s-z27.s }, z10.s, z7.s\n"
+ "6:" // Left padding
+ "subs x21, x21, #0x1\n"
+ "st1b { z24.s }, p1, [x11]\n"
+ "add x11, x11, x9\n"
+ "st1b { z25.s }, p1, [x10]\n"
+ "add x10, x10, x28\n"
+ "st1b { z26.s }, p1, [x27]\n"
+ "add x27, x27, x25\n"
+ "st1b { z27.s }, p1, [x26]\n"
+ "add x26, x26, x24\n"
+ "bgt 6b\n"
+ "7:" // Left padding: End
+ "adds XZR, x7, x6\n"
+ "bne 12f\n"
+ "cbz x22, 10f\n"
+ "cmp x22, #0x1\n"
+ "sub x15, x15, x22\n"
+ "beq 9f\n"
+ "8:" // Unpadded: 2 priming loads
+ "add x21, x14, %x[ld_in_row]\n"
+ "ld1sb { z1.s }, p1/Z, [x14]\n"
+ "addvl x20, SP, #4\n"
+ "ld1sb { z21.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z1.h, z1.h, z21.h\n"
+ "add z1.h, z1.h, z11.h\n"
+ "ld1sb { z2.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "add x14, x14, %x[ld_in_col]\n"
+ "ld1sb { z15.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z2.h, z2.h, z15.h\n"
+ "add z2.h, z2.h, z11.h\n"
+ "ld1sb { z3.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "ld1sb { z21.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z3.h, z3.h, z21.h\n"
+ "add z3.h, z3.h, z11.h\n"
+ "ld1sb { z4.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "ld1sb { z19.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z4.h, z4.h, z19.h\n"
+ "add z4.h, z4.h, z11.h\n"
+ "ld1sb { z8.s }, p1/Z, [x21]\n"
+ "mov z5.d, z8.d\n"
+ "add z5.h, z5.h, z11.h\n"
+ ".inst 0xa1402a80 // ld1h { z0.h, z8.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc1701428 // sdot za.s[x8, 0], { z1.h-z4.h }, z0.h\n"
+ ".inst 0xc1781448 // sdot za.s[x8, 0], { z2.h-z5.h }, z8.h\n"
+ "9:" // Unpadded: 1 priming loads
+ "add x21, x14, %x[ld_in_row]\n"
+ "ld1sb { z1.s }, p1/Z, [x14]\n"
+ "addvl x20, SP, #2\n"
+ "ld1sb { z21.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z1.h, z1.h, z21.h\n"
+ "add z1.h, z1.h, z11.h\n"
+ "ld1sb { z2.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "add x14, x14, %x[ld_in_col]\n"
+ "ld1sb { z12.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z2.h, z2.h, z12.h\n"
+ "add z2.h, z2.h, z11.h\n"
+ "ld1sb { z3.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "ld1sb { z8.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z3.h, z3.h, z8.h\n"
+ "add z3.h, z3.h, z11.h\n"
+ "ld1sb { z4.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "ld1sb { z5.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z4.h, z4.h, z5.h\n"
+ "add z4.h, z4.h, z11.h\n"
+ "ld1sb { z5.s }, p1/Z, [x21]\n"
+ "mov z5.d, z5.d\n"
+ "add z5.h, z5.h, z11.h\n"
+ ".inst 0xa1402a80 // ld1h { z0.h, z8.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc1701428 // sdot za.s[x8, 0], { z1.h-z4.h }, z0.h\n"
+ ".inst 0xc1781448 // sdot za.s[x8, 0], { z2.h-z5.h }, z8.h\n"
+ "10:" // Unpadded: 0 priming loads
+ "cmp x15, #0x2\n"
+ ".inst 0xa0402bee // ld1h { z14.h-z15.h }, pn10.b/Z, [SP]\n"
+ "blt 18f\n"
+ "add x21, x14, %x[ld_in_row]\n"
+ "ld1sb { z21.s }, p1/Z, [x14]\n"
+ "sub x15, x15, #0x2\n"
+ "ld1sb { z8.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z21.h, z21.h, z8.h\n"
+ "sub x13, x13, #0x1\n"
+ "ld1sb { z22.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "lsr x20, x15, #0x1\n"
+ "add z21.h, z21.h, z11.h\n"
+ "ld1sb { z25.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z22.h, z22.h, z25.h\n"
+ "cmp x20, x13\n"
+ "ld1sb { z23.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "csel x23, x20, x13, LT\n"
+ "add z22.h, z22.h, z11.h\n"
+ "ld1sb { z18.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z23.h, z23.h, z18.h\n"
+ "add z23.h, z23.h, z11.h\n"
+ "ld1sb { z24.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "add x14, x14, %x[ld_in_col]\n"
+ "ld1sb { z19.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z24.h, z24.h, z19.h\n"
+ "add z24.h, z24.h, z11.h\n"
+ "ld1sb { z8.s }, p1/Z, [x21]\n"
+ "mov z25.d, z8.d\n"
+ "add z25.h, z25.h, z11.h\n"
+ "and x15, x15, #0x1\n"
+ "sub x13, x13, x23\n"
+ "cbz x23, 17f\n"
+ "11:" // Unpadded: Main loop
+ ".inst 0xc17e16a8 // sdot za.s[x8, 0], { z21.h-z24.h }, z14.h\n"
+ "addvl x20, SP, #4\n"
+ "add x22, x14, %x[ld_in_row]\n"
+ ".inst 0xc17f16c8 // sdot za.s[x8, 0], { z22.h-z25.h }, z15.h\n"
+ ".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
+ "addvl x21, SP, #2\n"
+ "subs x23, x23, #0x1\n"
+ ".inst 0xc17016a9 // sdot za.s[x8, 1], { z21.h-z24.h }, z0.h\n"
+ "ld1sb { z21.s }, p1/Z, [x14]\n"
+ "add x14, x14, %x[ld_in_col]\n"
+ "add x20, x14, %x[ld_in_row]\n"
+ "ld1sb { z18.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ ".inst 0xc17116c9 // sdot za.s[x8, 1], { z22.h-z25.h }, z1.h\n"
+ "trn1 z21.h, z21.h, z18.h\n"
+ "ld1sb { z22.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "add z21.h, z21.h, z11.h\n"
+ ".inst 0xc0060c00 // mova { z0.d-z3.d }, za.d[x8, #0]\n"
+ "ld1sb { z8.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "trn1 z22.h, z22.h, z8.h\n"
+ "add z22.h, z22.h, z11.h\n"
+ "ld1sb { z23.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "add x8, x8, #0x1\n"
+ ".inst 0xc0040f82 // mova za.d[x8, #2], { z28.d-z31.d }\n"
+ "ld1sb { z27.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "trn1 z23.h, z23.h, z27.h\n"
+ "add z23.h, z23.h, z11.h\n"
+ "ld1sb { z24.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ ".inst 0xc1a6ac00 // sqdmulh { z0.s-z3.s }, { z0.s-z3.s }, z6.s\n"
+ "ld1sb { z8.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "trn1 z24.h, z24.h, z8.h\n"
+ "add z24.h, z24.h, z11.h\n"
+ "ld1sb { z4.s }, p1/Z, [x22]\n"
+ "mov z25.d, z4.d\n"
+ "add z25.h, z25.h, z11.h\n"
+ ".inst 0xa1402aa4 // ld1h { z4.h, z12.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc17416a8 // sdot za.s[x8, 0], { z21.h-z24.h }, z4.h\n"
+ ".inst 0xc1a9aa20 // srshl { z0.s-z3.s }, { z0.s-z3.s }, z9.s\n"
+ "ld1sb { z21.s }, p1/Z, [x14]\n"
+ ".inst 0xc17c16c8 // sdot za.s[x8, 0], { z22.h-z25.h }, z12.h\n"
+ ".inst 0xc1adab00 // add { z0.s-z3.s }, { z0.s-z3.s }, z13.s\n"
+ "ld1sb { z12.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z21.h, z21.h, z12.h\n"
+ ".inst 0xc1a7cd40 // sclamp { z0.s-z3.s }, z10.s, z7.s\n"
+ "ld1sb { z22.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "st1b { z0.s }, p1, [x11]\n"
+ "add x11, x11, x9\n"
+ "ld1sb { z20.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z22.h, z22.h, z20.h\n"
+ "st1b { z1.s }, p1, [x10]\n"
+ "ld1sb { z23.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "add x10, x10, x28\n"
+ "st1b { z2.s }, p1, [x27]\n"
+ "ld1sb { z24.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z23.h, z23.h, z24.h\n"
+ "add x27, x27, x25\n"
+ "ld1sb { z24.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "st1b { z3.s }, p1, [x26]\n"
+ "add x26, x26, x24\n"
+ "ld1sb { z3.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z24.h, z24.h, z3.h\n"
+ "add z21.h, z21.h, z11.h\n"
+ "ld1sb { z3.s }, p1/Z, [x20]\n"
+ "mov z25.d, z3.d\n"
+ "add z22.h, z22.h, z11.h\n"
+ "add x14, x14, %x[ld_in_col]\n"
+ ".inst 0xa0402bee // ld1h { z14.h-z15.h }, pn10.b/Z, [SP]\n"
+ "add z23.h, z23.h, z11.h\n"
+ "add z24.h, z24.h, z11.h\n"
+ "add z25.h, z25.h, z11.h\n"
+ "bgt 11b\n"
+ "b 17f\n"
+ "12:" // Padded
+ "cbz x22, 15f\n"
+ "cmp x22, #0x1\n"
+ "sub x15, x15, x22\n"
+ "beq 14f\n"
+ "13:" // Padded: 2 priming loads
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1sb { z22.s }, p0/Z, [x14]\n"
+ "add z22.h, p0/M, z22.h, z11.h\n"
+ "add x20, x14, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1sb { z17.s }, p0/Z, [x20]\n"
+ "add z17.h, p0/M, z17.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1sb { z23.s }, p0/Z, [x20]\n"
+ "add z23.h, p0/M, z23.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1sb { z4.s }, p0/Z, [x20]\n"
+ "add z4.h, p0/M, z4.h, z11.h\n"
+ "mov x12, #0x4\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z22.h, z22.h, z17.h\n"
+ "trn1 z23.h, z23.h, z4.h\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1sb { z24.s }, p0/Z, [x20]\n"
+ "add z24.h, p0/M, z24.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1sb { z18.s }, p0/Z, [x20]\n"
+ "add z18.h, p0/M, z18.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1sb { z25.s }, p0/Z, [x20]\n"
+ "add z25.h, p0/M, z25.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1sb { z17.s }, p0/Z, [x20]\n"
+ "mov x12, #0x8\n"
+ "add z17.h, p0/M, z17.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1sb { z1.s }, p0/Z, [x20]\n"
+ "add z1.h, p0/M, z1.h, z11.h\n"
+ "addvl x20, SP, #4\n"
+ "trn1 z24.h, z24.h, z18.h\n"
+ "trn1 z25.h, z25.h, z17.h\n"
+ ".inst 0xa1402a84 // ld1h { z4.h, z12.h }, pn10.b/Z, [x20]\n"
+ "mov z26.d, z1.d\n"
+ ".inst 0xc17416c8 // sdot za.s[x8, 0], { z22.h-z25.h }, z4.h\n"
+ "add x14, x14, %x[ld_in_col]\n"
+ ".inst 0xc17c16e8 // sdot za.s[x8, 0], { z23.h-z26.h }, z12.h\n"
+ "14:" // Padded: 1 priming loads
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1sb { z22.s }, p0/Z, [x14]\n"
+ "add z22.h, p0/M, z22.h, z11.h\n"
+ "add x20, x14, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1sb { z17.s }, p0/Z, [x20]\n"
+ "add z17.h, p0/M, z17.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1sb { z23.s }, p0/Z, [x20]\n"
+ "add z23.h, p0/M, z23.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1sb { z5.s }, p0/Z, [x20]\n"
+ "add z5.h, p0/M, z5.h, z11.h\n"
+ "mov x12, #0x4\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z22.h, z22.h, z17.h\n"
+ "trn1 z23.h, z23.h, z5.h\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1sb { z24.s }, p0/Z, [x20]\n"
+ "add z24.h, p0/M, z24.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1sb { z18.s }, p0/Z, [x20]\n"
+ "add z18.h, p0/M, z18.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1sb { z25.s }, p0/Z, [x20]\n"
+ "add z25.h, p0/M, z25.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1sb { z17.s }, p0/Z, [x20]\n"
+ "mov x12, #0x8\n"
+ "add z17.h, p0/M, z17.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1sb { z15.s }, p0/Z, [x20]\n"
+ "add z15.h, p0/M, z15.h, z11.h\n"
+ "addvl x20, SP, #2\n"
+ "trn1 z24.h, z24.h, z18.h\n"
+ "trn1 z25.h, z25.h, z17.h\n"
+ ".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
+ "mov z26.d, z15.d\n"
+ ".inst 0xc17016c8 // sdot za.s[x8, 0], { z22.h-z25.h }, z0.h\n"
+ "add x14, x14, %x[ld_in_col]\n"
+ ".inst 0xc17116e8 // sdot za.s[x8, 0], { z23.h-z26.h }, z1.h\n"
+ "15:" // Padded: 0 priming loads
+ "cmp x15, #0x2\n"
+ ".inst 0xa0402bee // ld1h { z14.h-z15.h }, pn10.b/Z, [SP]\n"
+ "blt 18f\n"
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1sb { z21.s }, p0/Z, [x14]\n"
+ "add z21.h, p0/M, z21.h, z11.h\n"
+ "add x20, x14, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1sb { z18.s }, p0/Z, [x20]\n"
+ "add z18.h, p0/M, z18.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1sb { z22.s }, p0/Z, [x20]\n"
+ "add z22.h, p0/M, z22.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1sb { z3.s }, p0/Z, [x20]\n"
+ "add z3.h, p0/M, z3.h, z11.h\n"
+ "mov x12, #0x4\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z21.h, z21.h, z18.h\n"
+ "trn1 z22.h, z22.h, z3.h\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1sb { z23.s }, p0/Z, [x20]\n"
+ "add z23.h, p0/M, z23.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1sb { z19.s }, p0/Z, [x20]\n"
+ "add z19.h, p0/M, z19.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1sb { z24.s }, p0/Z, [x20]\n"
+ "add z24.h, p0/M, z24.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1sb { z20.s }, p0/Z, [x20]\n"
+ "mov x12, #0x8\n"
+ "add z20.h, p0/M, z20.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1sb { z3.s }, p0/Z, [x20]\n"
+ "add z3.h, p0/M, z3.h, z11.h\n"
+ "sub x15, x15, #0x2\n"
+ "sub x13, x13, #0x1\n"
+ "trn1 z23.h, z23.h, z19.h\n"
+ "trn1 z24.h, z24.h, z20.h\n"
+ "lsr x20, x15, #0x1\n"
+ "cmp x20, x13\n"
+ "mov z25.d, z3.d\n"
+ "csel x22, x20, x13, LT\n"
+ "add x14, x14, %x[ld_in_col]\n"
+ "and x15, x15, #0x1\n"
+ "sub x13, x13, x22\n"
+ "cbz x22, 17f\n"
+ "16:" // Padded: Main loop
+ ".inst 0xc17e16a8 // sdot za.s[x8, 0], { z21.h-z24.h }, z14.h\n"
+ "addvl x20, SP, #4\n"
+ "mov x12, #0x0\n"
+ ".inst 0xc17f16c8 // sdot za.s[x8, 0], { z22.h-z25.h }, z15.h\n"
+ ".inst 0xa1402a84 // ld1h { z4.h, z12.h }, pn10.b/Z, [x20]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "add x21, x14, %x[ld_in_row]\n"
+ ".inst 0xc17416a9 // sdot za.s[x8, 1], { z21.h-z24.h }, z4.h\n"
+ "ld1sb { z21.s }, p0/Z, [x14]\n"
+ "add z21.h, p0/M, z21.h, z11.h\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1sb { z14.s }, p0/Z, [x21]\n"
+ "add z14.h, p0/M, z14.h, z11.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ ".inst 0xc17c16c9 // sdot za.s[x8, 1], { z22.h-z25.h }, z12.h\n"
+ "ld1sb { z22.s }, p0/Z, [x21]\n"
+ "add z22.h, p0/M, z22.h, z11.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1sb { z15.s }, p0/Z, [x21]\n"
+ "mov x12, #0x4\n"
+ "add z15.h, p0/M, z15.h, z11.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1sb { z23.s }, p0/Z, [x21]\n"
+ "add z23.h, p0/M, z23.h, z11.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1sb { z17.s }, p0/Z, [x21]\n"
+ "add z17.h, p0/M, z17.h, z11.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1sb { z24.s }, p0/Z, [x21]\n"
+ "add z24.h, p0/M, z24.h, z11.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1sb { z4.s }, p0/Z, [x21]\n"
+ "add z4.h, p0/M, z4.h, z11.h\n"
+ "mov x12, #0x8\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z21.h, z21.h, z14.h\n"
+ "trn1 z22.h, z22.h, z15.h\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "addvl x20, SP, #2\n"
+ "ld1sb { z2.s }, p0/Z, [x21]\n"
+ "trn1 z23.h, z23.h, z17.h\n"
+ "trn1 z24.h, z24.h, z4.h\n"
+ ".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
+ "mov x12, #0x0\n"
+ ".inst 0xc0060c10 // mova { z16.d-z19.d }, za.d[x8, #0]\n"
+ "add x8, x8, #0x1\n"
+ "add z2.h, p0/M, z2.h, z11.h\n"
+ "add x14, x14, %x[ld_in_col]\n"
+ ".inst 0xc17016a8 // sdot za.s[x8, 0], { z21.h-z24.h }, z0.h\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1sb { z21.s }, p0/Z, [x14]\n"
+ "add z21.h, p0/M, z21.h, z11.h\n"
+ "add x20, x14, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "mov z25.d, z2.d\n"
+ "ld1sb { z20.s }, p0/Z, [x20]\n"
+ "add z20.h, p0/M, z20.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ ".inst 0xc17116c8 // sdot za.s[x8, 0], { z22.h-z25.h }, z1.h\n"
+ "ld1sb { z22.s }, p0/Z, [x20]\n"
+ "add z22.h, p0/M, z22.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1sb { z4.s }, p0/Z, [x20]\n"
+ "mov x12, #0x4\n"
+ "add z4.h, p0/M, z4.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0xc0040f82 // mova za.d[x8, #2], { z28.d-z31.d }\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1sb { z23.s }, p0/Z, [x20]\n"
+ "add z23.h, p0/M, z23.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1sb { z27.s }, p0/Z, [x20]\n"
+ "add z27.h, p0/M, z27.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1sb { z24.s }, p0/Z, [x20]\n"
+ ".inst 0xc1a6ac10 // sqdmulh { z16.s-z19.s }, { z16.s-z19.s }, z6.s\n"
+ "add z24.h, p0/M, z24.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1sb { z12.s }, p0/Z, [x20]\n"
+ "mov x12, #0x8\n"
+ ".inst 0xc1a9aa30 // srshl { z16.s-z19.s }, { z16.s-z19.s }, z9.s\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "add z12.h, p0/M, z12.h, z11.h\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1sb { z8.s }, p0/Z, [x20]\n"
+ "add z8.h, p0/M, z8.h, z11.h\n"
+ ".inst 0xc1adab10 // add { z16.s-z19.s }, { z16.s-z19.s }, z13.s\n"
+ "subs x22, x22, #0x1\n"
+ ".inst 0xa0402bee // ld1h { z14.h-z15.h }, pn10.b/Z, [SP]\n"
+ ".inst 0xc1a7cd50 // sclamp { z16.s-z19.s }, z10.s, z7.s\n"
+ "st1b { z16.s }, p1, [x11]\n"
+ "add x11, x11, x9\n"
+ "trn1 z21.h, z21.h, z20.h\n"
+ "st1b { z17.s }, p1, [x10]\n"
+ "add x10, x10, x28\n"
+ "trn1 z22.h, z22.h, z4.h\n"
+ "trn1 z23.h, z23.h, z27.h\n"
+ "st1b { z18.s }, p1, [x27]\n"
+ "add x27, x27, x25\n"
+ "trn1 z24.h, z24.h, z12.h\n"
+ "mov z25.d, z8.d\n"
+ "st1b { z19.s }, p1, [x26]\n"
+ "add x26, x26, x24\n"
+ "add x14, x14, %x[ld_in_col]\n"
+ "bgt 16b\n"
+ "17:" // Main loop tail
+ ".inst 0xc17e16a8 // sdot za.s[x8, 0], { z21.h-z24.h }, z14.h\n"
+ "addvl x20, SP, #4\n"
+ "mov x12, #0x0\n"
+ ".inst 0xc17f16c8 // sdot za.s[x8, 0], { z22.h-z25.h }, z15.h\n"
+ ".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "add x20, x14, %x[ld_in_row]\n"
+ ".inst 0xc17016a9 // sdot za.s[x8, 1], { z21.h-z24.h }, z0.h\n"
+ "ld1sb { z0.s }, p0/Z, [x14]\n"
+ "add z0.h, p0/M, z0.h, z11.h\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1sb { z14.s }, p0/Z, [x20]\n"
+ "add z14.h, p0/M, z14.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ ".inst 0xc17116c9 // sdot za.s[x8, 1], { z22.h-z25.h }, z1.h\n"
+ "ld1sb { z1.s }, p0/Z, [x20]\n"
+ "add z1.h, p0/M, z1.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1sb { z12.s }, p0/Z, [x20]\n"
+ "mov x12, #0x4\n"
+ "add z12.h, p0/M, z12.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1sb { z2.s }, p0/Z, [x20]\n"
+ "add z2.h, p0/M, z2.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1sb { z21.s }, p0/Z, [x20]\n"
+ "add z21.h, p0/M, z21.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1sb { z3.s }, p0/Z, [x20]\n"
+ ".inst 0xc0060c10 // mova { z16.d-z19.d }, za.d[x8, #0]\n"
+ "add z3.h, p0/M, z3.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1sb { z25.s }, p0/Z, [x20]\n"
+ "mov x12, #0x8\n"
+ ".inst 0xc1a6ac10 // sqdmulh { z16.s-z19.s }, { z16.s-z19.s }, z6.s\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "add z25.h, p0/M, z25.h, z11.h\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1sb { z27.s }, p0/Z, [x20]\n"
+ "addvl x20, SP, #2\n"
+ ".inst 0xc1a9aa30 // srshl { z16.s-z19.s }, { z16.s-z19.s }, z9.s\n"
+ "trn1 z0.h, z0.h, z14.h\n"
+ "add x8, x8, #0x1\n"
+ "add z27.h, p0/M, z27.h, z11.h\n"
+ "trn1 z1.h, z1.h, z12.h\n"
+ "trn1 z2.h, z2.h, z21.h\n"
+ "add x14, x14, %x[ld_in_col]\n"
+ "trn1 z3.h, z3.h, z25.h\n"
+ ".inst 0xa0402a8e // ld1h { z14.h-z15.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc1adab10 // add { z16.s-z19.s }, { z16.s-z19.s }, z13.s\n"
+ "mov z4.d, z27.d\n"
+ ".inst 0xc17e1408 // sdot za.s[x8, 0], { z0.h-z3.h }, z14.h\n"
+ ".inst 0xc1a7cd50 // sclamp { z16.s-z19.s }, z10.s, z7.s\n"
+ "st1b { z16.s }, p1, [x11]\n"
+ "add x11, x11, x9\n"
+ ".inst 0xc0040f82 // mova za.d[x8, #2], { z28.d-z31.d }\n"
+ "st1b { z17.s }, p1, [x10]\n"
+ "add x10, x10, x28\n"
+ ".inst 0xc17f1428 // sdot za.s[x8, 0], { z1.h-z4.h }, z15.h\n"
+ ".inst 0xa0402bee // ld1h { z14.h-z15.h }, pn10.b/Z, [SP]\n"
+ "st1b { z18.s }, p1, [x27]\n"
+ "add x27, x27, x25\n"
+ "st1b { z19.s }, p1, [x26]\n"
+ "add x26, x26, x24\n"
+ "18:" // Main loop skip tail
+ "cbz x15, 19f\n" // Skip remainder inputs
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1sb { z21.s }, p0/Z, [x14]\n"
+ "add z21.h, p0/M, z21.h, z11.h\n"
+ "add x20, x14, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1sb { z17.s }, p0/Z, [x20]\n"
+ "add z17.h, p0/M, z17.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1sb { z22.s }, p0/Z, [x20]\n"
+ "add z22.h, p0/M, z22.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1sb { z0.s }, p0/Z, [x20]\n"
+ "add z0.h, p0/M, z0.h, z11.h\n"
+ "mov x12, #0x4\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z21.h, z21.h, z17.h\n"
+ "trn1 z22.h, z22.h, z0.h\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1sb { z23.s }, p0/Z, [x20]\n"
+ "add z23.h, p0/M, z23.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1sb { z17.s }, p0/Z, [x20]\n"
+ "add z17.h, p0/M, z17.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1sb { z24.s }, p0/Z, [x20]\n"
+ "add z24.h, p0/M, z24.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1sb { z5.s }, p0/Z, [x20]\n"
+ "mov x12, #0x8\n"
+ "add z5.h, p0/M, z5.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1sb { z4.s }, p0/Z, [x20]\n"
+ "add z4.h, p0/M, z4.h, z11.h\n"
+ "trn1 z23.h, z23.h, z17.h\n"
+ "trn1 z24.h, z24.h, z5.h\n"
+ "mov z25.d, z4.d\n"
+ "addvl x20, SP, #4\n"
+ ".inst 0xc17e16a8 // sdot za.s[x8, 0], { z21.h-z24.h }, z14.h\n"
+ "sub x13, x13, #0x1\n"
+ ".inst 0xc17f16c8 // sdot za.s[x8, 0], { z22.h-z25.h }, z15.h\n"
+ ".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc0060c10 // mova { z16.d-z19.d }, za.d[x8, #0]\n"
+ ".inst 0xc1a6ac10 // sqdmulh { z16.s-z19.s }, { z16.s-z19.s }, z6.s\n"
+ ".inst 0xc1a9aa30 // srshl { z16.s-z19.s }, { z16.s-z19.s }, z9.s\n"
+ ".inst 0xc17016a9 // sdot za.s[x8, 1], { z21.h-z24.h }, z0.h\n"
+ ".inst 0xc1adab10 // add { z16.s-z19.s }, { z16.s-z19.s }, z13.s\n"
+ ".inst 0xc17116c9 // sdot za.s[x8, 1], { z22.h-z25.h }, z1.h\n"
+ "add x8, x8, #0x1\n"
+ ".inst 0xc1a7cd50 // sclamp { z16.s-z19.s }, z10.s, z7.s\n"
+ "st1b { z16.s }, p1, [x11]\n"
+ "add x11, x11, x9\n"
+ ".inst 0xc0040f82 // mova za.d[x8, #2], { z28.d-z31.d }\n"
+ "st1b { z17.s }, p1, [x10]\n"
+ "add x10, x10, x28\n"
+ "st1b { z18.s }, p1, [x27]\n"
+ "add x27, x27, x25\n"
+ "st1b { z19.s }, p1, [x26]\n"
+ "add x26, x26, x24\n"
+ "19:" // Tail input: End
+ "cbz x13, 21f\n"
+ "20:" // Right padding loop
+ ".inst 0xc0060c00 // mova { z0.d-z3.d }, za.d[x8, #0]\n"
+ ".inst 0xc1a6ac00 // sqdmulh { z0.s-z3.s }, { z0.s-z3.s }, z6.s\n"
+ "add x8, x8, #0x1\n"
+ ".inst 0xc1a9aa20 // srshl { z0.s-z3.s }, { z0.s-z3.s }, z9.s\n"
+ "subs x13, x13, #0x1\n"
+ ".inst 0xc0040f82 // mova za.d[x8, #2], { z28.d-z31.d }\n"
+ ".inst 0xc1adab00 // add { z0.s-z3.s }, { z0.s-z3.s }, z13.s\n"
+ ".inst 0xc1a7cd40 // sclamp { z0.s-z3.s }, z10.s, z7.s\n"
+ "st1b { z0.s }, p1, [x11]\n"
+ "add x11, x11, x9\n"
+ "st1b { z1.s }, p1, [x10]\n"
+ "add x10, x10, x28\n"
+ "st1b { z2.s }, p1, [x27]\n"
+ "add x27, x27, x25\n"
+ "st1b { z3.s }, p1, [x26]\n"
+ "add x26, x26, x24\n"
+ "bgt 20b\n"
+ "21:" // End
+ "ldr x20, [%x[args], %[offsetof_Args_weights]]\n"
+ "incw x20, ALL, MUL #9\n"
+ "str x20, [%x[args], %[offsetof_Args_weights]]\n"
+ "incw x16\n"
+ "ldr x21, [%x[args], %[offsetof_Args_ld_in_vl]]\n"
+ "whilelt p1.s, x16, x17\n"
+ "ldr x20, [%x[args], %[offsetof_Args_inptr]]\n"
+ "add x20, x20, x21\n"
+ "str x20, [%x[args], %[offsetof_Args_inptr]]\n"
+ "ldr x25, [%x[args], %[offsetof_Args_outptrs]]\n"
+ "ldr x24, [%x[args], %[offsetof_Args_ld_out_vls]]\n"
+ "ldp x23, x22, [x25, #0x0]\n"
+ "ldp x21, x20, [x24, #0x0]\n"
+ "add x23, x23, x21\n"
+ "add x22, x22, x20\n"
+ "stp x23, x22, [x25, #0x0]\n"
+ "ldp x23, x22, [x25, #0x10]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x23, x23, x21\n"
+ "add x22, x22, x20\n"
+ "stp x23, x22, [x25, #0x10]\n"
+ "b.any 1b\n"
+ "addvl SP, SP, #6\n"
+ ".inst 0xd503467f // SMSTOP\n"
+ :
+ : [args] "r" (&args), [ld_in_col] "r" (ld_in_col), [ld_in_row] "r" (ld_in_row), [offsetof_Args_current_channel] "I" (offsetof(Args, current_channel)), [offsetof_Args_inptr] "I" (offsetof(Args, inptr)), [offsetof_Args_input_cols] "I" (offsetof(Args, input_cols)), [offsetof_Args_ld_in_vl] "I" (offsetof(Args, ld_in_vl)), [offsetof_Args_ld_out_cols] "I" (offsetof(Args, ld_out_cols)), [offsetof_Args_ld_out_vls] "I" (offsetof(Args, ld_out_vls)), [offsetof_Args_n_channels] "I" (offsetof(Args, n_channels)), [offsetof_Args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_Args_output_cols] "I" (offsetof(Args, output_cols)), [offsetof_Args_pad_bottom] "I" (offsetof(Args, pad_bottom)), [offsetof_Args_pad_left] "I" (offsetof(Args, pad_left)), [offsetof_Args_pad_top] "I" (offsetof(Args, pad_top)), [offsetof_Args_weights] "I" (offsetof(Args, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_bias] "I" (offsetof(arm_gemm::Requantize32, bias)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [offsetof_Requantize32_per_channel_muls] "I" (offsetof(arm_gemm::Requantize32, per_channel_muls)), [offsetof_Requantize32_per_channel_right_shifts] "I" (offsetof(arm_gemm::Requantize32, per_channel_right_shifts)), [offsetof_Requantize32_per_layer_mul] "I" (offsetof(arm_gemm::Requantize32, per_layer_mul)), [offsetof_Requantize32_per_layer_right_shift] "I" (offsetof(arm_gemm::Requantize32, per_layer_right_shift)), [qp] "r" (&qp)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME2)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_5x5_s1_4rows_dot_za.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_5x5_s1_4rows_dot_za.hpp
new file mode 100644
index 0000000000..40fa718266
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_5x5_s1_4rows_dot_za.hpp
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_conv/depthwise/depthwise_planar.hpp"
+
+namespace arm_conv {
+namespace depthwise {
+
+// Entry point for the generated SME2 assembly kernel: planar depthwise
+// convolution, signed 8-bit quantized (s8q), 5x5 kernel, stride 1,
+// producing 4 output rows per invocation via ZA dot-product instructions
+// (kernel geometry per the strategy constants declared below).
+// The definition lives in the matching generic.cpp and is guarded by
+// ARM_COMPUTE_ENABLE_SME2.
+void sme2_s8q_planar_5x5_s1_4rows_dot_za_impl(
+  const int8_t *inptr,  // base pointer of the input tensor window
+  size_t ld_in_row,  // stride between input rows (units not shown here -- presumably elements; confirm against caller)
+  size_t ld_in_col,  // stride between input columns
+  size_t ld_in_vl,  // stride between vector-length channel blocks of input
+  unsigned int pad_top,  // rows of implicit zero padding above the input
+  unsigned int valid_input_rows,  // number of readable input rows (rest is padding)
+  unsigned int pad_left,  // columns of implicit padding to the left
+  unsigned int valid_input_cols,  // number of readable input columns
+  const int8_t *weights,  // packed/interleaved kernel weights
+  int8_t **outptrs,  // one output row pointer per output row produced
+  const size_t *outlds,  // per-output-row column strides
+  const size_t *outvllds,  // per-output-row vector-length (channel-block) strides
+  unsigned int output_cols,  // number of output columns to compute
+  unsigned int start_channel,  // index of the first channel handled by this call
+  unsigned int valid_channels,  // number of channels to process
+  const arm_gemm::Requantize32 &qp  // requantization parameters (offsets, muls, shifts, clamp bounds)
+);
+
+// Strategy class exposing the SME2 s8q planar 5x5/stride-1/4-row kernel to
+// the depthwise-convolution framework through the PlanarStrategy interface.
+// All geometry is fixed at compile time; instances only forward it to the
+// Parent constructor and hand out the assembly entry point.
+class sme2_s8q_planar_5x5_s1_4rows_dot_za : public PlanarStrategy<int8_t, int8_t>
+{
+  using Parent = PlanarStrategy<int8_t, int8_t>;
+
+  public:
+  using return_type = int8_t;
+  constexpr static auto output_rows = 4u;  // output rows produced per kernel call
+  constexpr static auto kernel_rows = 5u, kernel_cols = 5u;  // 5x5 filter
+  constexpr static auto stride_rows = 1u, stride_cols = 1u;  // unit stride
+  constexpr static auto vl_type = arm_gemm::VLType::SME;  // scalable SME vector length
+
+  // CPUInfo is accepted for interface uniformity but unused: nothing here
+  // depends on runtime CPU properties.
+  sme2_s8q_planar_5x5_s1_4rows_dot_za(const CPUInfo *)
+    : Parent(kernel_rows, kernel_cols, stride_rows, stride_cols, output_rows, vl_type)
+  {
+  }
+
+  // Returns the generated assembly kernel implementing this strategy.
+  typename Parent::KernelType get_kernel(void) const override
+  {
+    return sme2_s8q_planar_5x5_s1_4rows_dot_za_impl;
+  }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_5x5_s1_4rows_dot_za/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_5x5_s1_4rows_dot_za/generic.cpp
new file mode 100644
index 0000000000..bb68733a45
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_5x5_s1_4rows_dot_za/generic.cpp
@@ -0,0 +1,1204 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if defined(ARM_COMPUTE_ENABLE_SME2)
+
+#include <algorithm>
+#include <cstddef>
+#include "arm_gemm.hpp"
+
+using arm_gemm::Requantize32;
+
+namespace arm_conv {
+namespace depthwise {
+
+void sme2_s8q_planar_5x5_s1_4rows_dot_za_impl(
+ const int8_t *inptr,
+ size_t ld_in_row,
+ size_t ld_in_col,
+ size_t ld_in_vl,
+ unsigned int pad_top,
+ unsigned int valid_input_rows,
+ unsigned int pad_left,
+ unsigned int valid_input_cols,
+ const int8_t *weights,
+ int8_t **outptrs,
+ const size_t *outlds,
+ const size_t *outvllds,
+ unsigned int output_cols,
+ unsigned int start_channel,
+ unsigned int valid_channels,
+ const arm_gemm::Requantize32 &qp
+)
+{
+ struct Args
+ {
+ const int8_t *inptr;
+ size_t ld_in_vl;
+ long unsigned int pad_top, pad_bottom, pad_left;
+ const int8_t *weights;
+ long unsigned int input_cols, output_cols;
+ int8_t **outptrs;
+ const size_t *ld_out_cols;
+ const size_t *ld_out_vls;
+ long unsigned int current_channel, n_channels;
+ };
+
+ Args args = { inptr, ld_in_vl, pad_top, 8u - std::min(8u, pad_top + valid_input_rows), pad_left, weights, valid_input_cols, output_cols, outptrs, outlds, outvllds, start_channel, valid_channels };
+
+ __asm__ __volatile__(
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "ldr x5, [%x[args], %[offsetof_Args_pad_bottom]]\n"
+ "ptrue p2.b\n"
+ "mov x20, #0x8\n"
+ "ldr x6, [%x[args], %[offsetof_Args_pad_top]]\n"
+ "ld1rh { z17.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_a_offset]]\n"
+ "sub x20, x20, x5\n"
+ ".inst 0x25207812 // ptrue pn10.b\n"
+ "ldr x7, [%x[args], %[offsetof_Args_n_channels]]\n"
+ "whilelt p1.s, XZR, x7\n"
+ "whilelt p9.s, XZR, x20\n"
+ "ld1rw { z12.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_c_offset]]\n"
+ "whilelt p8.s, XZR, x6\n"
+ "addvl SP, SP, #-30\n"
+ "ldr x17, [%x[args], %[offsetof_Args_current_channel]]\n"
+ "neg z17.h, p2/M, z17.h\n"
+ "eor p8.b, p2/Z, p8.b, p9.b\n"
+ "ld1rw { z7.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_per_layer_mul]]\n"
+ "ld1rw { z4.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_per_layer_right_shift]]\n"
+ "ld1rw { z24.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_minval]]\n"
+ "ld1rw { z16.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_maxval]]\n"
+ "1:" // Channel loop
+ "ldr x20, [%x[qp], %[offsetof_Requantize32_bias]]\n"
+ "mov z18.s, #0x0\n"
+ "cbz x20, 2f\n"
+ "ld1w { z18.s }, p1/Z, [x20, x17, LSL #2]\n"
+ "2:" // Load bias: Done
+ "ldr x23, [%x[args], %[offsetof_Args_weights]]\n"
+ "mov x20, x23\n"
+ "ld1sb { z2.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "ld1rh { z3.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_b_offset]]\n"
+ "mov z15.h, #0x0\n"
+ "sub z2.h, z2.h, z3.h\n"
+ "incw x23\n"
+ "ld1sb { z13.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "sub z13.h, z13.h, z3.h\n"
+ "trn1 z11.h, z15.h, z2.h\n"
+ "ld1sb { z27.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "sub z27.h, z27.h, z3.h\n"
+ "trn1 z0.h, z2.h, z13.h\n"
+ "ld1sb { z19.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "sub z19.h, z19.h, z3.h\n"
+ "trn1 z26.h, z13.h, z27.h\n"
+ "ld1sb { z14.s }, p2/Z, [x20]\n"
+ "sub z14.h, z14.h, z3.h\n"
+ "mov x20, x23\n"
+ "trn1 z10.h, z27.h, z19.h\n"
+ "ld1sb { z9.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "trn1 z19.h, z19.h, z14.h\n"
+ "trn1 z1.h, z14.h, z15.h\n"
+ "ld1sb { z5.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "sub z9.h, z9.h, z3.h\n"
+ "sub z5.h, z5.h, z3.h\n"
+ "ld1sb { z29.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "sub z29.h, z29.h, z3.h\n"
+ "addvl x22, SP, #30\n"
+ "ld1sb { z2.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "incw x23\n"
+ "sub z2.h, z2.h, z3.h\n"
+ "ld1sb { z23.s }, p2/Z, [x20]\n"
+ "addvl x22, x22, #-6\n"
+ "sub z23.h, z23.h, z3.h\n"
+ "mov x20, x23\n"
+ "st1h { z11.h }, p2, [x22]\n"
+ "trn1 z20.h, z15.h, z9.h\n"
+ "incw x23\n"
+ "ldr x21, [%x[qp], %[offsetof_Requantize32_per_channel_muls]]\n"
+ "st1h { z0.h }, p2, [x22, #1, MUL VL]\n"
+ "trn1 z22.h, z9.h, z5.h\n"
+ "ld1sb { z25.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "st1h { z26.h }, p2, [x22, #2, MUL VL]\n"
+ "trn1 z9.h, z5.h, z29.h\n"
+ "ld1sb { z21.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "st1h { z10.h }, p2, [x22, #3, MUL VL]\n"
+ "trn1 z26.h, z29.h, z2.h\n"
+ "ld1sb { z0.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "st1h { z19.h }, p2, [x22, #4, MUL VL]\n"
+ "trn1 z28.h, z2.h, z23.h\n"
+ "ld1sb { z19.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "st1h { z1.h }, p2, [x22, #5, MUL VL]\n"
+ "trn1 z2.h, z23.h, z15.h\n"
+ "sub z25.h, z25.h, z3.h\n"
+ "addvl x22, x22, #-6\n"
+ "sub z21.h, z21.h, z3.h\n"
+ "ld1sb { z6.s }, p2/Z, [x20]\n"
+ "sub z0.h, z0.h, z3.h\n"
+ "mov x20, x23\n"
+ "sub z19.h, z19.h, z3.h\n"
+ "sub z6.h, z6.h, z3.h\n"
+ "st1h { z20.h }, p2, [x22]\n"
+ "incw x23\n"
+ "st1h { z22.h }, p2, [x22, #1, MUL VL]\n"
+ "trn1 z11.h, z15.h, z25.h\n"
+ "trn1 z10.h, z25.h, z21.h\n"
+ "ld1sb { z5.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "st1h { z9.h }, p2, [x22, #2, MUL VL]\n"
+ "trn1 z14.h, z21.h, z0.h\n"
+ "ld1sb { z23.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "st1h { z26.h }, p2, [x22, #3, MUL VL]\n"
+ "trn1 z21.h, z0.h, z19.h\n"
+ "ld1sb { z27.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "st1h { z28.h }, p2, [x22, #4, MUL VL]\n"
+ "trn1 z19.h, z19.h, z6.h\n"
+ "ld1sb { z29.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "st1h { z2.h }, p2, [x22, #5, MUL VL]\n"
+ "trn1 z13.h, z6.h, z15.h\n"
+ "sub z5.h, z5.h, z3.h\n"
+ "sub z23.h, z23.h, z3.h\n"
+ "ld1sb { z1.s }, p2/Z, [x20]\n"
+ "addvl x22, x22, #-6\n"
+ "sub z27.h, z27.h, z3.h\n"
+ "sub z29.h, z29.h, z3.h\n"
+ "mov x20, x23\n"
+ "st1h { z11.h }, p2, [x22]\n"
+ "sub z1.h, z1.h, z3.h\n"
+ "st1h { z10.h }, p2, [x22, #1, MUL VL]\n"
+ "trn1 z30.h, z15.h, z5.h\n"
+ "trn1 z26.h, z5.h, z23.h\n"
+ "ld1sb { z11.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "st1h { z14.h }, p2, [x22, #2, MUL VL]\n"
+ "trn1 z22.h, z23.h, z27.h\n"
+ "ld1sb { z5.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "st1h { z21.h }, p2, [x22, #3, MUL VL]\n"
+ "trn1 z28.h, z27.h, z29.h\n"
+ "ld1sb { z8.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "st1h { z19.h }, p2, [x22, #4, MUL VL]\n"
+ "trn1 z27.h, z29.h, z1.h\n"
+ "ld1sb { z9.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "st1h { z13.h }, p2, [x22, #5, MUL VL]\n"
+ "trn1 z2.h, z1.h, z15.h\n"
+ "ld1sb { z14.s }, p2/Z, [x20]\n"
+ "sub z11.h, z11.h, z3.h\n"
+ "addvl x22, x22, #-6\n"
+ "sub z5.h, z5.h, z3.h\n"
+ "sub z8.h, z8.h, z3.h\n"
+ "st1h { z30.h }, p2, [x22]\n"
+ "sub z9.h, z9.h, z3.h\n"
+ "sub z14.h, z14.h, z3.h\n"
+ "st1h { z26.h }, p2, [x22, #1, MUL VL]\n"
+ "st1h { z22.h }, p2, [x22, #2, MUL VL]\n"
+ "mov z19.d, z18.d\n"
+ "trn1 z22.h, z15.h, z11.h\n"
+ "st1h { z28.h }, p2, [x22, #3, MUL VL]\n"
+ "trn1 z1.h, z11.h, z5.h\n"
+ "trn1 z31.h, z5.h, z8.h\n"
+ "st1h { z27.h }, p2, [x22, #4, MUL VL]\n"
+ "trn1 z8.h, z8.h, z9.h\n"
+ "trn1 z21.h, z9.h, z14.h\n"
+ "st1h { z2.h }, p2, [x22, #5, MUL VL]\n"
+ "addvl x22, x22, #-6\n"
+ "trn1 z15.h, z14.h, z15.h\n"
+ "st1h { z22.h }, p2, [x22]\n"
+ "st1h { z1.h }, p2, [x22, #1, MUL VL]\n"
+ "st1h { z31.h }, p2, [x22, #2, MUL VL]\n"
+ "st1h { z8.h }, p2, [x22, #3, MUL VL]\n"
+ "st1h { z21.h }, p2, [x22, #4, MUL VL]\n"
+ "st1h { z15.h }, p2, [x22, #5, MUL VL]\n"
+ "cbz x21, 3f\n"
+ "ld1w { z7.s }, p1/Z, [x21, x17, LSL #2]\n"
+ "3:" // Load mul: End
+ "ldr x20, [%x[qp], %[offsetof_Requantize32_per_channel_right_shifts]]\n"
+ "cbz x20, 4f\n"
+ "ld1w { z4.s }, p1/Z, [x20, x17, LSL #2]\n"
+ "4:" // Load right_shift: End
+ "ldr x25, [%x[args], %[offsetof_Args_input_cols]]\n"
+ "sub x20, x25, #0x1\n"
+ "orr x23, x20, %x[ld_in_col], LSL #16\n"
+ "ldr x16, [%x[args], %[offsetof_Args_inptr]]\n"
+ "orr x23, x7, x23, LSL #22\n"
+ "mov x22, #0x8\n"
+ "add x21, x6, x5\n"
+ "lsl x20, %x[ld_in_row], #0x0\n"
+ "ldr x15, [%x[args], %[offsetof_Args_output_cols]]\n"
+ "mov x11, #0x0\n"
+ "mov x8, #0x8\n"
+ "lsl x23, x23, #0x0\n"
+ "sub x22, x22, x21\n"
+ "madd x20, x20, x6, x16\n"
+ "5:" // Issue prefetches
+ "subs x22, x22, #0x1\n"
+ ".inst 0xf8b74a9c // rprfm pldstrm, x23, [x20]\n"
+ "add x20, x20, %x[ld_in_col]\n"
+ "bgt 5b\n"
+ "ldr x23, [%x[args], %[offsetof_Args_outptrs]]\n"
+ "lsl x20, %x[ld_in_row], #0x0\n"
+ "msub x16, x6, x20, x16\n"
+ ".inst 0xc0046a40 // mova za.d[x11, #0], { z18.d-z19.d }\n"
+ "ldr x20, [%x[args], %[offsetof_Args_ld_out_cols]]\n"
+ ".inst 0xc0046a41 // mova za.d[x11, #1], { z18.d-z19.d }\n"
+ "mov x22, #0x4\n"
+ "ldp x14, x13, [x23], #0x10\n"
+ ".inst 0xc0046a42 // mova za.d[x11, #2], { z18.d-z19.d }\n"
+ "ldp x4, x10, [x20], #0x10\n"
+ ".inst 0xc0046a43 // mova za.d[x11, #3], { z18.d-z19.d }\n"
+ "ldr x21, [%x[args], %[offsetof_Args_pad_left]]\n"
+ ".inst 0xc0046a44 // mova za.d[x11, #4], { z18.d-z19.d }\n"
+ "ldp x9, x28, [x23], #0x10\n"
+ ".inst 0xc0046a45 // mova za.d[x11, #5], { z18.d-z19.d }\n"
+ "ldp x27, x26, [x20], #0x10\n"
+ ".inst 0xc0046a46 // mova za.d[x11, #6], { z18.d-z19.d }\n"
+ ".inst 0xc0046a47 // mova za.d[x11, #7], { z18.d-z19.d }\n"
+ ".inst 0xc0040a40 // mova za.d[x8, #0], { z18.d-z19.d }\n"
+ ".inst 0xc0040a41 // mova za.d[x8, #1], { z18.d-z19.d }\n"
+ "cbz x21, 7f\n"
+ "cmp x21, x22\n"
+ "csel x20, x21, x22, LT\n"
+ "sub x21, x21, x20\n"
+ "sub x22, x22, x20\n"
+ "cbz x21, 7f\n"
+ ".inst 0xc0066814 // mova { z20.d-z21.d }, za.d[x11, #0]\n"
+ "sub x15, x15, x21\n"
+ ".inst 0xc0066836 // mova { z22.d-z23.d }, za.d[x11, #1]\n"
+ ".inst 0xc1a7ac14 // sqdmulh { z20.s-z23.s }, { z20.s-z23.s }, z7.s\n"
+ ".inst 0xc1a4aa34 // srshl { z20.s-z23.s }, { z20.s-z23.s }, z4.s\n"
+ ".inst 0xc1acab14 // add { z20.s-z23.s }, { z20.s-z23.s }, z12.s\n"
+ ".inst 0xc1b0cf14 // sclamp { z20.s-z23.s }, z24.s, z16.s\n"
+ "6:" // Left padding
+ "subs x21, x21, #0x1\n"
+ "st1b { z20.s }, p1, [x14]\n"
+ "add x14, x14, x4\n"
+ "st1b { z22.s }, p1, [x13]\n"
+ "add x13, x13, x10\n"
+ "st1b { z21.s }, p1, [x9]\n"
+ "add x9, x9, x27\n"
+ "st1b { z23.s }, p1, [x28]\n"
+ "add x28, x28, x26\n"
+ "bgt 6b\n"
+ "7:" // Left padding: End
+ "adds XZR, x6, x5\n"
+ "bne 14f\n"
+ "cbz x22, 12f\n"
+ "cmp x22, #0x1\n"
+ "sub x25, x25, x22\n"
+ "beq 11f\n"
+ "cmp x22, #0x2\n"
+ "beq 10f\n"
+ "cmp x22, #0x3\n"
+ "beq 9f\n"
+ "8:" // Unpadded: 4 priming loads
+ "add x21, x16, %x[ld_in_row]\n"
+ "ld1sb { z1.s }, p1/Z, [x16]\n"
+ "addvl x20, SP, #24\n"
+ "ld1sb { z28.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z27.h, z1.h, z28.h\n"
+ "add z27.h, z27.h, z17.h\n"
+ "ld1sb { z1.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "add x16, x16, %x[ld_in_col]\n"
+ "ld1sb { z2.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z28.h, z1.h, z2.h\n"
+ "add z28.h, z28.h, z17.h\n"
+ "ld1sb { z13.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "ld1sb { z6.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z29.h, z13.h, z6.h\n"
+ "add z29.h, z29.h, z17.h\n"
+ "ld1sb { z30.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0xa1402a82 // ld1h { z2.h, z10.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc16a7768 // sdot za.s[x11, 0], { z27.h-z28.h }, z10.h\n"
+ "ld1sb { z20.s }, p1/Z, [x21]\n"
+ "trn1 z30.h, z30.h, z20.h\n"
+ ".inst 0xc1627769 // sdot za.s[x11, 1], { z27.h-z28.h }, z2.h\n"
+ ".inst 0xa1412a81 // ld1h { z1.h, z9.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ "add z30.h, z30.h, z17.h\n"
+ ".inst 0xc1697788 // sdot za.s[x11, 0], { z28.h-z29.h }, z9.h\n"
+ ".inst 0xc1617789 // sdot za.s[x11, 1], { z28.h-z29.h }, z1.h\n"
+ ".inst 0xa0422a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xc16b77a8 // sdot za.s[x11, 0], { z29.h-z30.h }, z11.h\n"
+ ".inst 0xc16a77a9 // sdot za.s[x11, 1], { z29.h-z30.h }, z10.h\n"
+ "9:" // Unpadded: 3 priming loads
+ "add x22, x16, %x[ld_in_row]\n"
+ "ld1sb { z2.s }, p1/Z, [x16]\n"
+ "addvl x21, SP, #18\n"
+ "ld1sb { z28.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "trn1 z20.h, z2.h, z28.h\n"
+ "add z20.h, z20.h, z17.h\n"
+ "ld1sb { z31.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "addvl x20, SP, #24\n"
+ "ld1sb { z11.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "trn1 z21.h, z31.h, z11.h\n"
+ "add z21.h, z21.h, z17.h\n"
+ "ld1sb { z25.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "add x16, x16, %x[ld_in_col]\n"
+ "ld1sb { z8.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "trn1 z22.h, z25.h, z8.h\n"
+ "add z22.h, z22.h, z17.h\n"
+ "ld1sb { z8.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ ".inst 0xa1402aa6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc16e7688 // sdot za.s[x11, 0], { z20.h-z21.h }, z14.h\n"
+ "ld1sb { z3.s }, p1/Z, [x22]\n"
+ "trn1 z23.h, z8.h, z3.h\n"
+ ".inst 0xc1667689 // sdot za.s[x11, 1], { z20.h-z21.h }, z6.h\n"
+ ".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc161768a // sdot za.s[x11, 2], { z20.h-z21.h }, z1.h\n"
+ "add z23.h, z23.h, z17.h\n"
+ ".inst 0xa1412aa1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc160768b // sdot za.s[x11, 3], { z20.h-z21.h }, z0.h\n"
+ ".inst 0xc16976a8 // sdot za.s[x11, 0], { z21.h-z22.h }, z9.h\n"
+ ".inst 0xa0422aae // ld1h { z14.h-z15.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
+ ".inst 0xc16176a9 // sdot za.s[x11, 1], { z21.h-z22.h }, z1.h\n"
+ ".inst 0xa1412a81 // ld1h { z1.h, z9.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc16976aa // sdot za.s[x11, 2], { z21.h-z22.h }, z9.h\n"
+ ".inst 0xc16176ab // sdot za.s[x11, 3], { z21.h-z22.h }, z1.h\n"
+ ".inst 0xc16f76c8 // sdot za.s[x11, 0], { z22.h-z23.h }, z15.h\n"
+ ".inst 0xc16e76c9 // sdot za.s[x11, 1], { z22.h-z23.h }, z14.h\n"
+ ".inst 0xa0422a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xc16b76ca // sdot za.s[x11, 2], { z22.h-z23.h }, z11.h\n"
+ ".inst 0xc16a76cb // sdot za.s[x11, 3], { z22.h-z23.h }, z10.h\n"
+ "10:" // Unpadded: 2 priming loads
+ "add x23, x16, %x[ld_in_row]\n"
+ "ld1sb { z2.s }, p1/Z, [x16]\n"
+ "addvl x22, SP, #12\n"
+ "ld1sb { z22.s }, p1/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row]\n"
+ "trn1 z0.h, z2.h, z22.h\n"
+ "add z0.h, z0.h, z17.h\n"
+ "ld1sb { z14.s }, p1/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row]\n"
+ "addvl x21, SP, #18\n"
+ "ld1sb { z6.s }, p1/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row]\n"
+ "trn1 z1.h, z14.h, z6.h\n"
+ "add z1.h, z1.h, z17.h\n"
+ "ld1sb { z15.s }, p1/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row]\n"
+ "addvl x20, SP, #24\n"
+ "ld1sb { z6.s }, p1/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row]\n"
+ "trn1 z2.h, z15.h, z6.h\n"
+ "add z2.h, z2.h, z17.h\n"
+ "ld1sb { z21.s }, p1/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row]\n"
+ "add x16, x16, %x[ld_in_col]\n"
+ ".inst 0xa0402ace // ld1h { z14.h-z15.h }, pn10.b/Z, [x22]\n"
+ ".inst 0xc16f7408 // sdot za.s[x11, 0], { z0.h-z1.h }, z15.h\n"
+ "ld1sb { z30.s }, p1/Z, [x23]\n"
+ "trn1 z3.h, z21.h, z30.h\n"
+ ".inst 0xc16e7409 // sdot za.s[x11, 1], { z0.h-z1.h }, z14.h\n"
+ ".inst 0xa1402aa5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc16d740a // sdot za.s[x11, 2], { z0.h-z1.h }, z13.h\n"
+ "add z3.h, z3.h, z17.h\n"
+ ".inst 0xa0412ace // ld1h { z14.h-z15.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
+ ".inst 0xc165740b // sdot za.s[x11, 3], { z0.h-z1.h }, z5.h\n"
+ ".inst 0xa0402a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc16f7428 // sdot za.s[x11, 0], { z1.h-z2.h }, z15.h\n"
+ ".inst 0xc16e7429 // sdot za.s[x11, 1], { z1.h-z2.h }, z14.h\n"
+ ".inst 0xa0412aae // ld1h { z14.h-z15.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xa0422ac8 // ld1h { z8.h-z9.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
+ ".inst 0xc16b740c // sdot za.s[x11, 4], { z0.h-z1.h }, z11.h\n"
+ ".inst 0xc16a740d // sdot za.s[x11, 5], { z0.h-z1.h }, z10.h\n"
+ ".inst 0xc16f742a // sdot za.s[x11, 2], { z1.h-z2.h }, z15.h\n"
+ ".inst 0xc16e742b // sdot za.s[x11, 3], { z1.h-z2.h }, z14.h\n"
+ ".inst 0xa0412a8e // ld1h { z14.h-z15.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc1697448 // sdot za.s[x11, 0], { z2.h-z3.h }, z9.h\n"
+ ".inst 0xc1687449 // sdot za.s[x11, 1], { z2.h-z3.h }, z8.h\n"
+ ".inst 0xa0422aaa // ld1h { z10.h-z11.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
+ ".inst 0xc16f742c // sdot za.s[x11, 4], { z1.h-z2.h }, z15.h\n"
+ ".inst 0xc16e742d // sdot za.s[x11, 5], { z1.h-z2.h }, z14.h\n"
+ ".inst 0xc16b744a // sdot za.s[x11, 2], { z2.h-z3.h }, z11.h\n"
+ ".inst 0xc16a744b // sdot za.s[x11, 3], { z2.h-z3.h }, z10.h\n"
+ ".inst 0xa0422a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xc161744c // sdot za.s[x11, 4], { z2.h-z3.h }, z1.h\n"
+ ".inst 0xc160744d // sdot za.s[x11, 5], { z2.h-z3.h }, z0.h\n"
+ "11:" // Unpadded: 1 priming loads
+ "add x24, x16, %x[ld_in_row]\n"
+ "ld1sb { z0.s }, p1/Z, [x16]\n"
+ "addvl x23, SP, #6\n"
+ "ld1sb { z3.s }, p1/Z, [x24]\n"
+ "add x24, x24, %x[ld_in_row]\n"
+ "trn1 z28.h, z0.h, z3.h\n"
+ "add z28.h, z28.h, z17.h\n"
+ "ld1sb { z6.s }, p1/Z, [x24]\n"
+ "add x24, x24, %x[ld_in_row]\n"
+ "addvl x22, SP, #12\n"
+ "ld1sb { z30.s }, p1/Z, [x24]\n"
+ "add x24, x24, %x[ld_in_row]\n"
+ "trn1 z29.h, z6.h, z30.h\n"
+ "add z29.h, z29.h, z17.h\n"
+ "ld1sb { z1.s }, p1/Z, [x24]\n"
+ "add x24, x24, %x[ld_in_row]\n"
+ "addvl x21, SP, #18\n"
+ "ld1sb { z25.s }, p1/Z, [x24]\n"
+ "add x24, x24, %x[ld_in_row]\n"
+ "trn1 z30.h, z1.h, z25.h\n"
+ "add z30.h, z30.h, z17.h\n"
+ "ld1sb { z3.s }, p1/Z, [x24]\n"
+ "add x24, x24, %x[ld_in_row]\n"
+ "addvl x20, SP, #24\n"
+ ".inst 0xa0402ae0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x23]\n"
+ ".inst 0xc1617788 // sdot za.s[x11, 0], { z28.h-z29.h }, z1.h\n"
+ "add x16, x16, %x[ld_in_col]\n"
+ "ld1sb { z5.s }, p1/Z, [x24]\n"
+ "trn1 z31.h, z3.h, z5.h\n"
+ ".inst 0xc1607789 // sdot za.s[x11, 1], { z28.h-z29.h }, z0.h\n"
+ ".inst 0xa1402ac6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x22]\n"
+ ".inst 0xc16e778a // sdot za.s[x11, 2], { z28.h-z29.h }, z14.h\n"
+ "add z31.h, z31.h, z17.h\n"
+ ".inst 0xa1412ae2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x23, #0x2, MUL VL]\n"
+ ".inst 0xc166778b // sdot za.s[x11, 3], { z28.h-z29.h }, z6.h\n"
+ ".inst 0xa0402aae // ld1h { z14.h-z15.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc16a77a8 // sdot za.s[x11, 0], { z29.h-z30.h }, z10.h\n"
+ ".inst 0xc16277a9 // sdot za.s[x11, 1], { z29.h-z30.h }, z2.h\n"
+ ".inst 0xa0412ac8 // ld1h { z8.h-z9.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
+ ".inst 0xa1422ae2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x23, #0x4, MUL VL]\n"
+ ".inst 0xc16f778c // sdot za.s[x11, 4], { z28.h-z29.h }, z15.h\n"
+ ".inst 0xc16e778d // sdot za.s[x11, 5], { z28.h-z29.h }, z14.h\n"
+ ".inst 0xa1402a86 // ld1h { z6.h, z14.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc16977aa // sdot za.s[x11, 2], { z29.h-z30.h }, z9.h\n"
+ ".inst 0xc16877ab // sdot za.s[x11, 3], { z29.h-z30.h }, z8.h\n"
+ ".inst 0xa1412aa5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc16a77c8 // sdot za.s[x11, 0], { z30.h-z31.h }, z10.h\n"
+ ".inst 0xc16277c9 // sdot za.s[x11, 1], { z30.h-z31.h }, z2.h\n"
+ ".inst 0xa1422ac2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
+ ".inst 0xc16e778e // sdot za.s[x11, 6], { z28.h-z29.h }, z14.h\n"
+ ".inst 0xc166778f // sdot za.s[x11, 7], { z28.h-z29.h }, z6.h\n"
+ ".inst 0xc16d77ac // sdot za.s[x11, 4], { z29.h-z30.h }, z13.h\n"
+ ".inst 0xc16577ad // sdot za.s[x11, 5], { z29.h-z30.h }, z5.h\n"
+ ".inst 0xa1412a86 // ld1h { z6.h, z14.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc16a77ca // sdot za.s[x11, 2], { z30.h-z31.h }, z10.h\n"
+ ".inst 0xc16277cb // sdot za.s[x11, 3], { z30.h-z31.h }, z2.h\n"
+ ".inst 0xa0422aa8 // ld1h { z8.h-z9.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
+ ".inst 0xc16e77ae // sdot za.s[x11, 6], { z29.h-z30.h }, z14.h\n"
+ ".inst 0xc16677af // sdot za.s[x11, 7], { z29.h-z30.h }, z6.h\n"
+ ".inst 0xc16977cc // sdot za.s[x11, 4], { z30.h-z31.h }, z9.h\n"
+ ".inst 0xc16877cd // sdot za.s[x11, 5], { z30.h-z31.h }, z8.h\n"
+ ".inst 0xa1422a86 // ld1h { z6.h, z14.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xc16e77ce // sdot za.s[x11, 6], { z30.h-z31.h }, z14.h\n"
+ ".inst 0xc16677cf // sdot za.s[x11, 7], { z30.h-z31.h }, z6.h\n"
+ "12:" // Unpadded: 0 priming loads
+ ".inst 0xa0402be0 // ld1h { z0.h-z1.h }, pn10.b/Z, [SP]\n"
+ ".inst 0xa1412be5 // ld1h { z5.h, z13.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
+ ".inst 0xa0422be2 // ld1h { z2.h-z3.h }, pn10.b/Z, [SP, #0x4, MUL VL]\n"
+ "cbz x25, 22f\n"
+ "add x20, x16, %x[ld_in_row]\n"
+ "ld1sb { z26.s }, p1/Z, [x16]\n"
+ "sub x25, x25, #0x1\n"
+ "ld1sb { z28.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z25.h, z26.h, z28.h\n"
+ "sub x15, x15, #0x1\n"
+ "ld1sb { z31.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "cmp x25, x15\n"
+ "add z25.h, z25.h, z17.h\n"
+ "ld1sb { z15.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z26.h, z31.h, z15.h\n"
+ "csel x25, x25, x15, LT\n"
+ "ld1sb { z22.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "add z26.h, z26.h, z17.h\n"
+ "add x16, x16, %x[ld_in_col]\n"
+ "ld1sb { z8.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z27.h, z22.h, z8.h\n"
+ "add z27.h, z27.h, z17.h\n"
+ "ld1sb { z21.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "sub x15, x15, x25\n"
+ "ld1sb { z20.s }, p1/Z, [x20]\n"
+ "trn1 z28.h, z21.h, z20.h\n"
+ "add z28.h, z28.h, z17.h\n"
+ "cbz x25, 21f\n"
+ "13:" // Unpadded: Main loop
+ "addvl x24, SP, #6\n"
+ ".inst 0xc1617728 // sdot za.s[x11, 0], { z25.h-z26.h }, z1.h\n"
+ "addvl x23, SP, #12\n"
+ "ld1sb { z21.s }, p1/Z, [x16]\n"
+ ".inst 0xc1607729 // sdot za.s[x11, 1], { z25.h-z26.h }, z0.h\n"
+ ".inst 0xa0402b0e // ld1h { z14.h-z15.h }, pn10.b/Z, [x24]\n"
+ "addvl x22, SP, #18\n"
+ "addvl x21, SP, #24\n"
+ ".inst 0xc16f772a // sdot za.s[x11, 2], { z25.h-z26.h }, z15.h\n"
+ "add x20, x16, %x[ld_in_row]\n"
+ "ld1sb { z0.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0xc16e772b // sdot za.s[x11, 3], { z25.h-z26.h }, z14.h\n"
+ ".inst 0xa1402ae6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x23]\n"
+ "subs x25, x25, #0x1\n"
+ "add x16, x16, %x[ld_in_col]\n"
+ ".inst 0xc16d7748 // sdot za.s[x11, 0], { z26.h-z27.h }, z13.h\n"
+ "ld1sb { z20.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0xc1657749 // sdot za.s[x11, 1], { z26.h-z27.h }, z5.h\n"
+ ".inst 0xa1412b05 // ld1h { z5.h, z13.h }, pn10.b/Z, [x24, #0x2, MUL VL]\n"
+ ".inst 0xc16e772c // sdot za.s[x11, 4], { z25.h-z26.h }, z14.h\n"
+ "ld1sb { z31.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0xc166772d // sdot za.s[x11, 5], { z25.h-z26.h }, z6.h\n"
+ ".inst 0xa0402ace // ld1h { z14.h-z15.h }, pn10.b/Z, [x22]\n"
+ ".inst 0xc16d774a // sdot za.s[x11, 2], { z26.h-z27.h }, z13.h\n"
+ "ld1sb { z29.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0xc165774b // sdot za.s[x11, 3], { z26.h-z27.h }, z5.h\n"
+ ".inst 0xa1412ae5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x23, #0x2, MUL VL]\n"
+ ".inst 0xc1637768 // sdot za.s[x11, 0], { z27.h-z28.h }, z3.h\n"
+ "ld1sb { z22.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0xc1627769 // sdot za.s[x11, 1], { z27.h-z28.h }, z2.h\n"
+ ".inst 0xa1422b02 // ld1h { z2.h, z10.h }, pn10.b/Z, [x24, #0x4, MUL VL]\n"
+ ".inst 0xc16f772e // sdot za.s[x11, 6], { z25.h-z26.h }, z15.h\n"
+ "ld1sb { z30.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0xc16e772f // sdot za.s[x11, 7], { z25.h-z26.h }, z14.h\n"
+ ".inst 0xa0402aae // ld1h { z14.h-z15.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc16d774c // sdot za.s[x11, 4], { z26.h-z27.h }, z13.h\n"
+ "ld1sb { z6.s }, p1/Z, [x20]\n"
+ ".inst 0xc165774d // sdot za.s[x11, 5], { z26.h-z27.h }, z5.h\n"
+ ".inst 0xa1412ac5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
+ ".inst 0xc16a776a // sdot za.s[x11, 2], { z27.h-z28.h }, z10.h\n"
+ ".inst 0xc162776b // sdot za.s[x11, 3], { z27.h-z28.h }, z2.h\n"
+ ".inst 0xa1422ae2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x23, #0x4, MUL VL]\n"
+ ".inst 0xc16d774e // sdot za.s[x11, 6], { z26.h-z27.h }, z13.h\n"
+ ".inst 0xc165774f // sdot za.s[x11, 7], { z26.h-z27.h }, z5.h\n"
+ ".inst 0xa1412aa5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc16a776c // sdot za.s[x11, 4], { z27.h-z28.h }, z10.h\n"
+ ".inst 0xc162776d // sdot za.s[x11, 5], { z27.h-z28.h }, z2.h\n"
+ ".inst 0xa1422ac1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
+ ".inst 0xc169776e // sdot za.s[x11, 6], { z27.h-z28.h }, z9.h\n"
+ ".inst 0xc161776f // sdot za.s[x11, 7], { z27.h-z28.h }, z1.h\n"
+ ".inst 0xa0422aaa // ld1h { z10.h-z11.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
+ ".inst 0xc16f1728 // sdot za.s[x8, 0], { z25.h-z26.h }, z15.h\n"
+ ".inst 0xc16e1729 // sdot za.s[x8, 1], { z25.h-z26.h }, z14.h\n"
+ "trn1 z25.h, z21.h, z0.h\n"
+ ".inst 0xa0402be0 // ld1h { z0.h-z1.h }, pn10.b/Z, [SP]\n"
+ ".inst 0xc16d1748 // sdot za.s[x8, 0], { z26.h-z27.h }, z13.h\n"
+ "add z25.h, z25.h, z17.h\n"
+ ".inst 0xc1651749 // sdot za.s[x8, 1], { z26.h-z27.h }, z5.h\n"
+ "trn1 z26.h, z20.h, z31.h\n"
+ ".inst 0xa1412be5 // ld1h { z5.h, z13.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
+ ".inst 0xc16b1768 // sdot za.s[x8, 0], { z27.h-z28.h }, z11.h\n"
+ "add z26.h, z26.h, z17.h\n"
+ ".inst 0xc16a1769 // sdot za.s[x8, 1], { z27.h-z28.h }, z10.h\n"
+ "trn1 z27.h, z29.h, z22.h\n"
+ "trn1 z28.h, z30.h, z6.h\n"
+ "add x8, x8, #0x2\n"
+ ".inst 0xc0066808 // mova { z8.d-z9.d }, za.d[x11, #0]\n"
+ ".inst 0xa0422be2 // ld1h { z2.h-z3.h }, pn10.b/Z, [SP, #0x4, MUL VL]\n"
+ "add z27.h, z27.h, z17.h\n"
+ ".inst 0xc006682a // mova { z10.d-z11.d }, za.d[x11, #1]\n"
+ ".inst 0xc1a7ac08 // sqdmulh { z8.s-z11.s }, { z8.s-z11.s }, z7.s\n"
+ "add x11, x11, #0x2\n"
+ ".inst 0xc1a4aa28 // srshl { z8.s-z11.s }, { z8.s-z11.s }, z4.s\n"
+ ".inst 0xc0040a40 // mova za.d[x8, #0], { z18.d-z19.d }\n"
+ ".inst 0xc1acab08 // add { z8.s-z11.s }, { z8.s-z11.s }, z12.s\n"
+ ".inst 0xc0040a41 // mova za.d[x8, #1], { z18.d-z19.d }\n"
+ ".inst 0xc1b0cf08 // sclamp { z8.s-z11.s }, z24.s, z16.s\n"
+ "st1b { z8.s }, p1, [x14]\n"
+ "add x14, x14, x4\n"
+ "add z28.h, z28.h, z17.h\n"
+ "st1b { z10.s }, p1, [x13]\n"
+ "add x13, x13, x10\n"
+ "st1b { z9.s }, p1, [x9]\n"
+ "add x9, x9, x27\n"
+ "st1b { z11.s }, p1, [x28]\n"
+ "add x28, x28, x26\n"
+ "bgt 13b\n"
+ "b 21f\n"
+ "14:" // Padded
+ "cbz x22, 19f\n"
+ "cmp x22, #0x1\n"
+ "sub x25, x25, x22\n"
+ "beq 18f\n"
+ "cmp x22, #0x2\n"
+ "beq 17f\n"
+ "cmp x22, #0x3\n"
+ "beq 16f\n"
+ "15:" // Padded: 4 priming loads
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1sb { z9.s }, p0/Z, [x16]\n"
+ "add z9.h, p0/M, z9.h, z17.h\n"
+ "add x21, x16, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1sb { z22.s }, p0/Z, [x21]\n"
+ "add z22.h, p0/M, z22.h, z17.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1sb { z21.s }, p0/Z, [x21]\n"
+ "add z21.h, p0/M, z21.h, z17.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1sb { z20.s }, p0/Z, [x21]\n"
+ "add z20.h, p0/M, z20.h, z17.h\n"
+ "mov x12, #0x4\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z31.h, z9.h, z22.h\n"
+ "trn1 z0.h, z21.h, z20.h\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1sb { z22.s }, p0/Z, [x21]\n"
+ "add z22.h, p0/M, z22.h, z17.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1sb { z20.s }, p0/Z, [x21]\n"
+ "add z20.h, p0/M, z20.h, z17.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1sb { z21.s }, p0/Z, [x21]\n"
+ "addvl x20, SP, #24\n"
+ "add z21.h, p0/M, z21.h, z17.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ ".inst 0xa1402a82 // ld1h { z2.h, z10.h }, pn10.b/Z, [x20]\n"
+ "trn1 z1.h, z22.h, z20.h\n"
+ "ld1sb { z20.s }, p0/Z, [x21]\n"
+ "add z20.h, p0/M, z20.h, z17.h\n"
+ ".inst 0xc16a77e8 // sdot za.s[x11, 0], { z31.h-z0.h }, z10.h\n"
+ "add x16, x16, %x[ld_in_col]\n"
+ ".inst 0xc16277e9 // sdot za.s[x11, 1], { z31.h-z0.h }, z2.h\n"
+ ".inst 0xa1412a85 // ld1h { z5.h, z13.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ "trn1 z2.h, z21.h, z20.h\n"
+ ".inst 0xc16d7408 // sdot za.s[x11, 0], { z0.h-z1.h }, z13.h\n"
+ ".inst 0xa0422a88 // ld1h { z8.h-z9.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xc1657409 // sdot za.s[x11, 1], { z0.h-z1.h }, z5.h\n"
+ ".inst 0xc1697428 // sdot za.s[x11, 0], { z1.h-z2.h }, z9.h\n"
+ ".inst 0xc1687429 // sdot za.s[x11, 1], { z1.h-z2.h }, z8.h\n"
+ "16:" // Padded: 3 priming loads
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1sb { z5.s }, p0/Z, [x16]\n"
+ "add z5.h, p0/M, z5.h, z17.h\n"
+ "add x20, x16, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1sb { z22.s }, p0/Z, [x20]\n"
+ "add z22.h, p0/M, z22.h, z17.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1sb { z21.s }, p0/Z, [x20]\n"
+ "add z21.h, p0/M, z21.h, z17.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1sb { z20.s }, p0/Z, [x20]\n"
+ "add z20.h, p0/M, z20.h, z17.h\n"
+ "mov x12, #0x4\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z28.h, z5.h, z22.h\n"
+ "trn1 z29.h, z21.h, z20.h\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1sb { z22.s }, p0/Z, [x20]\n"
+ "add z22.h, p0/M, z22.h, z17.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1sb { z20.s }, p0/Z, [x20]\n"
+ "add z20.h, p0/M, z20.h, z17.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1sb { z21.s }, p0/Z, [x20]\n"
+ "addvl x21, SP, #18\n"
+ "add z21.h, p0/M, z21.h, z17.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ ".inst 0xa0402aa0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x21]\n"
+ "trn1 z30.h, z22.h, z20.h\n"
+ "ld1sb { z20.s }, p0/Z, [x20]\n"
+ "addvl x20, SP, #24\n"
+ "add z20.h, p0/M, z20.h, z17.h\n"
+ ".inst 0xc1617788 // sdot za.s[x11, 0], { z28.h-z29.h }, z1.h\n"
+ ".inst 0xc1607789 // sdot za.s[x11, 1], { z28.h-z29.h }, z0.h\n"
+ ".inst 0xa1402a81 // ld1h { z1.h, z9.h }, pn10.b/Z, [x20]\n"
+ "trn1 z31.h, z21.h, z20.h\n"
+ "add x16, x16, %x[ld_in_col]\n"
+ ".inst 0xa0412aae // ld1h { z14.h-z15.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc169778a // sdot za.s[x11, 2], { z28.h-z29.h }, z9.h\n"
+ ".inst 0xc161778b // sdot za.s[x11, 3], { z28.h-z29.h }, z1.h\n"
+ ".inst 0xa1422aa3 // ld1h { z3.h, z11.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
+ ".inst 0xc16f77a8 // sdot za.s[x11, 0], { z29.h-z30.h }, z15.h\n"
+ ".inst 0xc16e77a9 // sdot za.s[x11, 1], { z29.h-z30.h }, z14.h\n"
+ ".inst 0xa1412a81 // ld1h { z1.h, z9.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc16977aa // sdot za.s[x11, 2], { z29.h-z30.h }, z9.h\n"
+ ".inst 0xc16177ab // sdot za.s[x11, 3], { z29.h-z30.h }, z1.h\n"
+ ".inst 0xc16b77c8 // sdot za.s[x11, 0], { z30.h-z31.h }, z11.h\n"
+ ".inst 0xc16377c9 // sdot za.s[x11, 1], { z30.h-z31.h }, z3.h\n"
+ ".inst 0xa0422a8e // ld1h { z14.h-z15.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xc16f77ca // sdot za.s[x11, 2], { z30.h-z31.h }, z15.h\n"
+ ".inst 0xc16e77cb // sdot za.s[x11, 3], { z30.h-z31.h }, z14.h\n"
+ "17:" // Padded: 2 priming loads
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1sb { z29.s }, p0/Z, [x16]\n"
+ "add z29.h, p0/M, z29.h, z17.h\n"
+ "add x20, x16, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1sb { z22.s }, p0/Z, [x20]\n"
+ "add z22.h, p0/M, z22.h, z17.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1sb { z21.s }, p0/Z, [x20]\n"
+ "add z21.h, p0/M, z21.h, z17.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1sb { z20.s }, p0/Z, [x20]\n"
+ "add z20.h, p0/M, z20.h, z17.h\n"
+ "mov x12, #0x4\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z8.h, z29.h, z22.h\n"
+ "trn1 z9.h, z21.h, z20.h\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1sb { z22.s }, p0/Z, [x20]\n"
+ "add z22.h, p0/M, z22.h, z17.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1sb { z20.s }, p0/Z, [x20]\n"
+ "add z20.h, p0/M, z20.h, z17.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1sb { z21.s }, p0/Z, [x20]\n"
+ "addvl x22, SP, #12\n"
+ "add z21.h, p0/M, z21.h, z17.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ ".inst 0xa0402ace // ld1h { z14.h-z15.h }, pn10.b/Z, [x22]\n"
+ "trn1 z10.h, z22.h, z20.h\n"
+ "ld1sb { z20.s }, p0/Z, [x20]\n"
+ "addvl x21, SP, #18\n"
+ "add z20.h, p0/M, z20.h, z17.h\n"
+ ".inst 0xc16f7508 // sdot za.s[x11, 0], { z8.h-z9.h }, z15.h\n"
+ ".inst 0xc16e7509 // sdot za.s[x11, 1], { z8.h-z9.h }, z14.h\n"
+ ".inst 0xa1402aa6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x21]\n"
+ "addvl x20, SP, #24\n"
+ "trn1 z11.h, z21.h, z20.h\n"
+ ".inst 0xa1412ac5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
+ ".inst 0xc16e750a // sdot za.s[x11, 2], { z8.h-z9.h }, z14.h\n"
+ "add x16, x16, %x[ld_in_col]\n"
+ ".inst 0xc166750b // sdot za.s[x11, 3], { z8.h-z9.h }, z6.h\n"
+ ".inst 0xa0402a8e // ld1h { z14.h-z15.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc16d7528 // sdot za.s[x11, 0], { z9.h-z10.h }, z13.h\n"
+ ".inst 0xa0422ac0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
+ ".inst 0xc1657529 // sdot za.s[x11, 1], { z9.h-z10.h }, z5.h\n"
+ ".inst 0xa1412aa5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc16f750c // sdot za.s[x11, 4], { z8.h-z9.h }, z15.h\n"
+ ".inst 0xc16e750d // sdot za.s[x11, 5], { z8.h-z9.h }, z14.h\n"
+ ".inst 0xc16d752a // sdot za.s[x11, 2], { z9.h-z10.h }, z13.h\n"
+ ".inst 0xc165752b // sdot za.s[x11, 3], { z9.h-z10.h }, z5.h\n"
+ ".inst 0xa1412a86 // ld1h { z6.h, z14.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc1617548 // sdot za.s[x11, 0], { z10.h-z11.h }, z1.h\n"
+ ".inst 0xc1607549 // sdot za.s[x11, 1], { z10.h-z11.h }, z0.h\n"
+ ".inst 0xa0422aa0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
+ ".inst 0xc16e752c // sdot za.s[x11, 4], { z9.h-z10.h }, z14.h\n"
+ ".inst 0xc166752d // sdot za.s[x11, 5], { z9.h-z10.h }, z6.h\n"
+ ".inst 0xc161754a // sdot za.s[x11, 2], { z10.h-z11.h }, z1.h\n"
+ ".inst 0xc160754b // sdot za.s[x11, 3], { z10.h-z11.h }, z0.h\n"
+ ".inst 0xa0422a8e // ld1h { z14.h-z15.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xc16f754c // sdot za.s[x11, 4], { z10.h-z11.h }, z15.h\n"
+ ".inst 0xc16e754d // sdot za.s[x11, 5], { z10.h-z11.h }, z14.h\n"
+ "18:" // Padded: 1 priming loads
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1sb { z1.s }, p0/Z, [x16]\n"
+ "add z1.h, p0/M, z1.h, z17.h\n"
+ "add x20, x16, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1sb { z22.s }, p0/Z, [x20]\n"
+ "add z22.h, p0/M, z22.h, z17.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1sb { z21.s }, p0/Z, [x20]\n"
+ "add z21.h, p0/M, z21.h, z17.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1sb { z20.s }, p0/Z, [x20]\n"
+ "add z20.h, p0/M, z20.h, z17.h\n"
+ "mov x12, #0x4\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z26.h, z1.h, z22.h\n"
+ "trn1 z27.h, z21.h, z20.h\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1sb { z22.s }, p0/Z, [x20]\n"
+ "add z22.h, p0/M, z22.h, z17.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1sb { z20.s }, p0/Z, [x20]\n"
+ "add z20.h, p0/M, z20.h, z17.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1sb { z21.s }, p0/Z, [x20]\n"
+ "addvl x23, SP, #6\n"
+ "add z21.h, p0/M, z21.h, z17.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ ".inst 0xa0402aee // ld1h { z14.h-z15.h }, pn10.b/Z, [x23]\n"
+ "trn1 z28.h, z22.h, z20.h\n"
+ "ld1sb { z20.s }, p0/Z, [x20]\n"
+ "addvl x22, SP, #12\n"
+ "add z20.h, p0/M, z20.h, z17.h\n"
+ ".inst 0xc16f7748 // sdot za.s[x11, 0], { z26.h-z27.h }, z15.h\n"
+ ".inst 0xc16e7749 // sdot za.s[x11, 1], { z26.h-z27.h }, z14.h\n"
+ ".inst 0xa0402ac0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x22]\n"
+ "addvl x21, SP, #18\n"
+ "trn1 z29.h, z21.h, z20.h\n"
+ ".inst 0xa0412aea // ld1h { z10.h-z11.h }, pn10.b/Z, [x23, #0x2, MUL VL]\n"
+ ".inst 0xc161774a // sdot za.s[x11, 2], { z26.h-z27.h }, z1.h\n"
+ "addvl x20, SP, #24\n"
+ "add x16, x16, %x[ld_in_col]\n"
+ ".inst 0xc160774b // sdot za.s[x11, 3], { z26.h-z27.h }, z0.h\n"
+ ".inst 0xa1402aa6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc16b7768 // sdot za.s[x11, 0], { z27.h-z28.h }, z11.h\n"
+ ".inst 0xa0422ae8 // ld1h { z8.h-z9.h }, pn10.b/Z, [x23, #0x4, MUL VL]\n"
+ ".inst 0xc16a7769 // sdot za.s[x11, 1], { z27.h-z28.h }, z10.h\n"
+ ".inst 0xa0412aca // ld1h { z10.h-z11.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
+ ".inst 0xc16e774c // sdot za.s[x11, 4], { z26.h-z27.h }, z14.h\n"
+ ".inst 0xc166774d // sdot za.s[x11, 5], { z26.h-z27.h }, z6.h\n"
+ ".inst 0xa1402a85 // ld1h { z5.h, z13.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc16b776a // sdot za.s[x11, 2], { z27.h-z28.h }, z11.h\n"
+ ".inst 0xc16a776b // sdot za.s[x11, 3], { z27.h-z28.h }, z10.h\n"
+ ".inst 0xa1412aa6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc1697788 // sdot za.s[x11, 0], { z28.h-z29.h }, z9.h\n"
+ ".inst 0xc1687789 // sdot za.s[x11, 1], { z28.h-z29.h }, z8.h\n"
+ ".inst 0xa1422ac2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
+ ".inst 0xc16d774e // sdot za.s[x11, 6], { z26.h-z27.h }, z13.h\n"
+ ".inst 0xc165774f // sdot za.s[x11, 7], { z26.h-z27.h }, z5.h\n"
+ ".inst 0xc16e776c // sdot za.s[x11, 4], { z27.h-z28.h }, z14.h\n"
+ ".inst 0xc166776d // sdot za.s[x11, 5], { z27.h-z28.h }, z6.h\n"
+ ".inst 0xa1412a86 // ld1h { z6.h, z14.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc16a778a // sdot za.s[x11, 2], { z28.h-z29.h }, z10.h\n"
+ ".inst 0xc162778b // sdot za.s[x11, 3], { z28.h-z29.h }, z2.h\n"
+ ".inst 0xa0422aa0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
+ ".inst 0xc16e776e // sdot za.s[x11, 6], { z27.h-z28.h }, z14.h\n"
+ ".inst 0xc166776f // sdot za.s[x11, 7], { z27.h-z28.h }, z6.h\n"
+ ".inst 0xc161778c // sdot za.s[x11, 4], { z28.h-z29.h }, z1.h\n"
+ ".inst 0xc160778d // sdot za.s[x11, 5], { z28.h-z29.h }, z0.h\n"
+ ".inst 0xa1422a82 // ld1h { z2.h, z10.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xc16a778e // sdot za.s[x11, 6], { z28.h-z29.h }, z10.h\n"
+ ".inst 0xc162778f // sdot za.s[x11, 7], { z28.h-z29.h }, z2.h\n"
+ "19:" // Padded: 0 priming loads
+ ".inst 0xa0402be0 // ld1h { z0.h-z1.h }, pn10.b/Z, [SP]\n"
+ ".inst 0xa1412be5 // ld1h { z5.h, z13.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
+ ".inst 0xa0422be2 // ld1h { z2.h-z3.h }, pn10.b/Z, [SP, #0x4, MUL VL]\n"
+ "cbz x25, 22f\n"
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1sb { z6.s }, p0/Z, [x16]\n"
+ "add z6.h, p0/M, z6.h, z17.h\n"
+ "add x20, x16, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1sb { z30.s }, p0/Z, [x20]\n"
+ "add z30.h, p0/M, z30.h, z17.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1sb { z27.s }, p0/Z, [x20]\n"
+ "add z27.h, p0/M, z27.h, z17.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1sb { z26.s }, p0/Z, [x20]\n"
+ "add z26.h, p0/M, z26.h, z17.h\n"
+ "mov x12, #0x4\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z25.h, z6.h, z30.h\n"
+ "trn1 z26.h, z27.h, z26.h\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1sb { z8.s }, p0/Z, [x20]\n"
+ "add z8.h, p0/M, z8.h, z17.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1sb { z9.s }, p0/Z, [x20]\n"
+ "add z9.h, p0/M, z9.h, z17.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1sb { z21.s }, p0/Z, [x20]\n"
+ "add z21.h, p0/M, z21.h, z17.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1sb { z29.s }, p0/Z, [x20]\n"
+ "add z29.h, p0/M, z29.h, z17.h\n"
+ "sub x25, x25, #0x1\n"
+ "sub x15, x15, #0x1\n"
+ "cmp x25, x15\n"
+ "trn1 z27.h, z8.h, z9.h\n"
+ "trn1 z28.h, z21.h, z29.h\n"
+ "csel x25, x25, x15, LT\n"
+ "add x16, x16, %x[ld_in_col]\n"
+ "sub x15, x15, x25\n"
+ "cbz x25, 21f\n"
+ "20:" // Padded: Main loop
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1sb { z8.s }, p0/Z, [x16]\n"
+ "add z8.h, p0/M, z8.h, z17.h\n"
+ "add x24, x16, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1sb { z21.s }, p0/Z, [x24]\n"
+ ".inst 0xc1617728 // sdot za.s[x11, 0], { z25.h-z26.h }, z1.h\n"
+ "addvl x23, SP, #6\n"
+ ".inst 0xc1607729 // sdot za.s[x11, 1], { z25.h-z26.h }, z0.h\n"
+ ".inst 0xa0402ae0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x23]\n"
+ "addvl x22, SP, #12\n"
+ "add z21.h, p0/M, z21.h, z17.h\n"
+ "add x24, x24, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ ".inst 0xc161772a // sdot za.s[x11, 2], { z25.h-z26.h }, z1.h\n"
+ ".inst 0xc160772b // sdot za.s[x11, 3], { z25.h-z26.h }, z0.h\n"
+ ".inst 0xa1402ac6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x22]\n"
+ "addvl x21, SP, #18\n"
+ "addvl x20, SP, #24\n"
+ "ld1sb { z29.s }, p0/Z, [x24]\n"
+ ".inst 0xc16d7748 // sdot za.s[x11, 0], { z26.h-z27.h }, z13.h\n"
+ "add z29.h, p0/M, z29.h, z17.h\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ ".inst 0xc1657749 // sdot za.s[x11, 1], { z26.h-z27.h }, z5.h\n"
+ ".inst 0xa1412ae5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x23, #0x2, MUL VL]\n"
+ "mov x12, #0x4\n"
+ "add x24, x24, %x[ld_in_row]\n"
+ ".inst 0xc16e772c // sdot za.s[x11, 4], { z25.h-z26.h }, z14.h\n"
+ "ld1sb { z30.s }, p0/Z, [x24]\n"
+ "add z30.h, p0/M, z30.h, z17.h\n"
+ "add x24, x24, %x[ld_in_row]\n"
+ ".inst 0xc166772d // sdot za.s[x11, 5], { z25.h-z26.h }, z6.h\n"
+ ".inst 0xa1402aa6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x21]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "subs x25, x25, #0x1\n"
+ ".inst 0xc16d774a // sdot za.s[x11, 2], { z26.h-z27.h }, z13.h\n"
+ "ld1sb { z15.s }, p0/Z, [x24]\n"
+ "add z15.h, p0/M, z15.h, z17.h\n"
+ "add x24, x24, %x[ld_in_row]\n"
+ ".inst 0xc165774b // sdot za.s[x11, 3], { z26.h-z27.h }, z5.h\n"
+ ".inst 0xa0412aca // ld1h { z10.h-z11.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "add x16, x16, %x[ld_in_col]\n"
+ ".inst 0xc1637768 // sdot za.s[x11, 0], { z27.h-z28.h }, z3.h\n"
+ "ld1sb { z20.s }, p0/Z, [x24]\n"
+ "add z20.h, p0/M, z20.h, z17.h\n"
+ "add x24, x24, %x[ld_in_row]\n"
+ ".inst 0xc1627769 // sdot za.s[x11, 1], { z27.h-z28.h }, z2.h\n"
+ ".inst 0xa1422ae1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x23, #0x4, MUL VL]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ ".inst 0xc16e772e // sdot za.s[x11, 6], { z25.h-z26.h }, z14.h\n"
+ "ld1sb { z31.s }, p0/Z, [x24]\n"
+ "add z31.h, p0/M, z31.h, z17.h\n"
+ "add x24, x24, %x[ld_in_row]\n"
+ ".inst 0xc166772f // sdot za.s[x11, 7], { z25.h-z26.h }, z6.h\n"
+ ".inst 0xa0402a82 // ld1h { z2.h-z3.h }, pn10.b/Z, [x20]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ ".inst 0xc16b774c // sdot za.s[x11, 4], { z26.h-z27.h }, z11.h\n"
+ "ld1sb { z22.s }, p0/Z, [x24]\n"
+ "add z22.h, p0/M, z22.h, z17.h\n"
+ ".inst 0xc16a774d // sdot za.s[x11, 5], { z26.h-z27.h }, z10.h\n"
+ ".inst 0xa1412aa6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc169776a // sdot za.s[x11, 2], { z27.h-z28.h }, z9.h\n"
+ ".inst 0xc161776b // sdot za.s[x11, 3], { z27.h-z28.h }, z1.h\n"
+ ".inst 0xa0422ac0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
+ ".inst 0xc16e774e // sdot za.s[x11, 6], { z26.h-z27.h }, z14.h\n"
+ ".inst 0xc166774f // sdot za.s[x11, 7], { z26.h-z27.h }, z6.h\n"
+ ".inst 0xa1412a86 // ld1h { z6.h, z14.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc161776c // sdot za.s[x11, 4], { z27.h-z28.h }, z1.h\n"
+ ".inst 0xc160776d // sdot za.s[x11, 5], { z27.h-z28.h }, z0.h\n"
+ ".inst 0xa1422aa1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
+ ".inst 0xc169776e // sdot za.s[x11, 6], { z27.h-z28.h }, z9.h\n"
+ ".inst 0xc161776f // sdot za.s[x11, 7], { z27.h-z28.h }, z1.h\n"
+ ".inst 0xa0422a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xc1631728 // sdot za.s[x8, 0], { z25.h-z26.h }, z3.h\n"
+ ".inst 0xc1621729 // sdot za.s[x8, 1], { z25.h-z26.h }, z2.h\n"
+ ".inst 0xa0402be0 // ld1h { z0.h-z1.h }, pn10.b/Z, [SP]\n"
+ "trn1 z25.h, z8.h, z21.h\n"
+ ".inst 0xc16e1748 // sdot za.s[x8, 0], { z26.h-z27.h }, z14.h\n"
+ ".inst 0xc1661749 // sdot za.s[x8, 1], { z26.h-z27.h }, z6.h\n"
+ ".inst 0xa1412be5 // ld1h { z5.h, z13.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
+ "trn1 z26.h, z29.h, z30.h\n"
+ ".inst 0xc16b1768 // sdot za.s[x8, 0], { z27.h-z28.h }, z11.h\n"
+ ".inst 0xc16a1769 // sdot za.s[x8, 1], { z27.h-z28.h }, z10.h\n"
+ "add x8, x8, #0x2\n"
+ ".inst 0xa0422be2 // ld1h { z2.h-z3.h }, pn10.b/Z, [SP, #0x4, MUL VL]\n"
+ "trn1 z27.h, z15.h, z20.h\n"
+ ".inst 0xc0066808 // mova { z8.d-z9.d }, za.d[x11, #0]\n"
+ "trn1 z28.h, z31.h, z22.h\n"
+ ".inst 0xc006682a // mova { z10.d-z11.d }, za.d[x11, #1]\n"
+ ".inst 0xc1a7ac08 // sqdmulh { z8.s-z11.s }, { z8.s-z11.s }, z7.s\n"
+ "add x11, x11, #0x2\n"
+ ".inst 0xc1a4aa28 // srshl { z8.s-z11.s }, { z8.s-z11.s }, z4.s\n"
+ ".inst 0xc0040a40 // mova za.d[x8, #0], { z18.d-z19.d }\n"
+ ".inst 0xc1acab08 // add { z8.s-z11.s }, { z8.s-z11.s }, z12.s\n"
+ ".inst 0xc0040a41 // mova za.d[x8, #1], { z18.d-z19.d }\n"
+ ".inst 0xc1b0cf08 // sclamp { z8.s-z11.s }, z24.s, z16.s\n"
+ "st1b { z8.s }, p1, [x14]\n"
+ "add x14, x14, x4\n"
+ "st1b { z10.s }, p1, [x13]\n"
+ "add x13, x13, x10\n"
+ "st1b { z9.s }, p1, [x9]\n"
+ "add x9, x9, x27\n"
+ "st1b { z11.s }, p1, [x28]\n"
+ "add x28, x28, x26\n"
+ "bgt 20b\n"
+ "21:" // Main loop tail
+ "addvl x23, SP, #6\n"
+ ".inst 0xc1617728 // sdot za.s[x11, 0], { z25.h-z26.h }, z1.h\n"
+ "addvl x22, SP, #12\n"
+ ".inst 0xc1607729 // sdot za.s[x11, 1], { z25.h-z26.h }, z0.h\n"
+ ".inst 0xa0402ae0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x23]\n"
+ "addvl x21, SP, #18\n"
+ "addvl x20, SP, #24\n"
+ ".inst 0xc161772a // sdot za.s[x11, 2], { z25.h-z26.h }, z1.h\n"
+ ".inst 0xc160772b // sdot za.s[x11, 3], { z25.h-z26.h }, z0.h\n"
+ ".inst 0xa1402ac6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x22]\n"
+ ".inst 0xc16d7748 // sdot za.s[x11, 0], { z26.h-z27.h }, z13.h\n"
+ ".inst 0xc1657749 // sdot za.s[x11, 1], { z26.h-z27.h }, z5.h\n"
+ ".inst 0xa1412ae1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x23, #0x2, MUL VL]\n"
+ ".inst 0xc16e772c // sdot za.s[x11, 4], { z25.h-z26.h }, z14.h\n"
+ ".inst 0xc166772d // sdot za.s[x11, 5], { z25.h-z26.h }, z6.h\n"
+ ".inst 0xa1402aa6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc169774a // sdot za.s[x11, 2], { z26.h-z27.h }, z9.h\n"
+ ".inst 0xc161774b // sdot za.s[x11, 3], { z26.h-z27.h }, z1.h\n"
+ ".inst 0xa1412ac1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
+ ".inst 0xc1637768 // sdot za.s[x11, 0], { z27.h-z28.h }, z3.h\n"
+ ".inst 0xc1627769 // sdot za.s[x11, 1], { z27.h-z28.h }, z2.h\n"
+ ".inst 0xa0422aea // ld1h { z10.h-z11.h }, pn10.b/Z, [x23, #0x4, MUL VL]\n"
+ ".inst 0xc16e772e // sdot za.s[x11, 6], { z25.h-z26.h }, z14.h\n"
+ ".inst 0xc166772f // sdot za.s[x11, 7], { z25.h-z26.h }, z6.h\n"
+ ".inst 0xa0402a8e // ld1h { z14.h-z15.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc169774c // sdot za.s[x11, 4], { z26.h-z27.h }, z9.h\n"
+ ".inst 0xc161774d // sdot za.s[x11, 5], { z26.h-z27.h }, z1.h\n"
+ ".inst 0xa1412aa5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc16b776a // sdot za.s[x11, 2], { z27.h-z28.h }, z11.h\n"
+ ".inst 0xc16a776b // sdot za.s[x11, 3], { z27.h-z28.h }, z10.h\n"
+ ".inst 0xa0422ac2 // ld1h { z2.h-z3.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
+ ".inst 0xc16d774e // sdot za.s[x11, 6], { z26.h-z27.h }, z13.h\n"
+ ".inst 0xc165774f // sdot za.s[x11, 7], { z26.h-z27.h }, z5.h\n"
+ ".inst 0xa0412a88 // ld1h { z8.h-z9.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc163776c // sdot za.s[x11, 4], { z27.h-z28.h }, z3.h\n"
+ ".inst 0xc162776d // sdot za.s[x11, 5], { z27.h-z28.h }, z2.h\n"
+ ".inst 0xa1422aa2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
+ ".inst 0xc16a776e // sdot za.s[x11, 6], { z27.h-z28.h }, z10.h\n"
+ ".inst 0xc162776f // sdot za.s[x11, 7], { z27.h-z28.h }, z2.h\n"
+ ".inst 0xa0422a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xc16f1728 // sdot za.s[x8, 0], { z25.h-z26.h }, z15.h\n"
+ ".inst 0xc16e1729 // sdot za.s[x8, 1], { z25.h-z26.h }, z14.h\n"
+ ".inst 0xc1691748 // sdot za.s[x8, 0], { z26.h-z27.h }, z9.h\n"
+ ".inst 0xc1681749 // sdot za.s[x8, 1], { z26.h-z27.h }, z8.h\n"
+ ".inst 0xc1611768 // sdot za.s[x8, 0], { z27.h-z28.h }, z1.h\n"
+ ".inst 0xc1601769 // sdot za.s[x8, 1], { z27.h-z28.h }, z0.h\n"
+ "add x8, x8, #0x2\n"
+ ".inst 0xc0066808 // mova { z8.d-z9.d }, za.d[x11, #0]\n"
+ ".inst 0xc006682a // mova { z10.d-z11.d }, za.d[x11, #1]\n"
+ ".inst 0xc1a7ac08 // sqdmulh { z8.s-z11.s }, { z8.s-z11.s }, z7.s\n"
+ "add x11, x11, #0x2\n"
+ ".inst 0xc1a4aa28 // srshl { z8.s-z11.s }, { z8.s-z11.s }, z4.s\n"
+ ".inst 0xc0040a40 // mova za.d[x8, #0], { z18.d-z19.d }\n"
+ ".inst 0xc1acab08 // add { z8.s-z11.s }, { z8.s-z11.s }, z12.s\n"
+ ".inst 0xc0040a41 // mova za.d[x8, #1], { z18.d-z19.d }\n"
+ ".inst 0xc1b0cf08 // sclamp { z8.s-z11.s }, z24.s, z16.s\n"
+ "st1b { z8.s }, p1, [x14]\n"
+ "add x14, x14, x4\n"
+ "st1b { z10.s }, p1, [x13]\n"
+ "add x13, x13, x10\n"
+ "st1b { z9.s }, p1, [x9]\n"
+ "add x9, x9, x27\n"
+ "st1b { z11.s }, p1, [x28]\n"
+ "add x28, x28, x26\n"
+ "22:" // Main loop skip tail
+ "cbz x15, 24f\n"
+ "23:" // Right padding loop
+ ".inst 0xc0066808 // mova { z8.d-z9.d }, za.d[x11, #0]\n"
+ "add x8, x8, #0x2\n"
+ "subs x15, x15, #0x1\n"
+ ".inst 0xc006682a // mova { z10.d-z11.d }, za.d[x11, #1]\n"
+ ".inst 0xc1a7ac08 // sqdmulh { z8.s-z11.s }, { z8.s-z11.s }, z7.s\n"
+ "add x11, x11, #0x2\n"
+ ".inst 0xc1a4aa28 // srshl { z8.s-z11.s }, { z8.s-z11.s }, z4.s\n"
+ ".inst 0xc0040a40 // mova za.d[x8, #0], { z18.d-z19.d }\n"
+ ".inst 0xc1acab08 // add { z8.s-z11.s }, { z8.s-z11.s }, z12.s\n"
+ ".inst 0xc0040a41 // mova za.d[x8, #1], { z18.d-z19.d }\n"
+ ".inst 0xc1b0cf08 // sclamp { z8.s-z11.s }, z24.s, z16.s\n"
+ "st1b { z8.s }, p1, [x14]\n"
+ "add x14, x14, x4\n"
+ "st1b { z10.s }, p1, [x13]\n"
+ "add x13, x13, x10\n"
+ "st1b { z9.s }, p1, [x9]\n"
+ "add x9, x9, x27\n"
+ "st1b { z11.s }, p1, [x28]\n"
+ "add x28, x28, x26\n"
+ "bgt 23b\n"
+ "24:" // End
+ "ldr x20, [%x[args], %[offsetof_Args_weights]]\n"
+ "incw x20, ALL, MUL #16\n"
+ "incw x20, ALL, MUL #9\n"
+ "str x20, [%x[args], %[offsetof_Args_weights]]\n"
+ "ldr x21, [%x[args], %[offsetof_Args_ld_in_vl]]\n"
+ "incw x17\n"
+ "whilelt p1.s, x17, x7\n"
+ "ldr x20, [%x[args], %[offsetof_Args_inptr]]\n"
+ "add x20, x20, x21\n"
+ "str x20, [%x[args], %[offsetof_Args_inptr]]\n"
+ "ldr x25, [%x[args], %[offsetof_Args_outptrs]]\n"
+ "ldr x24, [%x[args], %[offsetof_Args_ld_out_vls]]\n"
+ "ldp x23, x22, [x25, #0x0]\n"
+ "ldp x21, x20, [x24, #0x0]\n"
+ "add x23, x23, x21\n"
+ "add x22, x22, x20\n"
+ "stp x23, x22, [x25, #0x0]\n"
+ "ldp x23, x22, [x25, #0x10]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x23, x23, x21\n"
+ "add x22, x22, x20\n"
+ "stp x23, x22, [x25, #0x10]\n"
+ "b.any 1b\n"
+ "addvl SP, SP, #30\n"
+ ".inst 0xd503467f // SMSTOP\n"
+ :
+ : [args] "r" (&args), [ld_in_col] "r" (ld_in_col), [ld_in_row] "r" (ld_in_row), [offsetof_Args_current_channel] "I" (offsetof(Args, current_channel)), [offsetof_Args_inptr] "I" (offsetof(Args, inptr)), [offsetof_Args_input_cols] "I" (offsetof(Args, input_cols)), [offsetof_Args_ld_in_vl] "I" (offsetof(Args, ld_in_vl)), [offsetof_Args_ld_out_cols] "I" (offsetof(Args, ld_out_cols)), [offsetof_Args_ld_out_vls] "I" (offsetof(Args, ld_out_vls)), [offsetof_Args_n_channels] "I" (offsetof(Args, n_channels)), [offsetof_Args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_Args_output_cols] "I" (offsetof(Args, output_cols)), [offsetof_Args_pad_bottom] "I" (offsetof(Args, pad_bottom)), [offsetof_Args_pad_left] "I" (offsetof(Args, pad_left)), [offsetof_Args_pad_top] "I" (offsetof(Args, pad_top)), [offsetof_Args_weights] "I" (offsetof(Args, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_bias] "I" (offsetof(arm_gemm::Requantize32, bias)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [offsetof_Requantize32_per_channel_muls] "I" (offsetof(arm_gemm::Requantize32, per_channel_muls)), [offsetof_Requantize32_per_channel_right_shifts] "I" (offsetof(arm_gemm::Requantize32, per_channel_right_shifts)), [offsetof_Requantize32_per_layer_mul] "I" (offsetof(arm_gemm::Requantize32, per_layer_mul)), [offsetof_Requantize32_per_layer_right_shift] "I" (offsetof(arm_gemm::Requantize32, per_layer_right_shift)), [qp] "r" (&qp)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME2)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_5x5_s2_4rows_dot_za.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_5x5_s2_4rows_dot_za.hpp
new file mode 100644
index 0000000000..8bffc05e1f
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_5x5_s2_4rows_dot_za.hpp
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_conv/depthwise/depthwise_planar.hpp"
+
+namespace arm_conv {
+namespace depthwise {
+
+void sme2_s8q_planar_5x5_s2_4rows_dot_za_impl(  // Planar depthwise conv: s8 quantized, 5x5 filter, stride 2, 4 output rows per pass, SME2 ZA dot-product kernel.
+  const int8_t *inptr,  // base pointer of the input tensor
+  size_t ld_in_row,  // input stride between successive rows, in elements
+  size_t ld_in_col,  // input stride between successive columns, in elements
+  size_t ld_in_vl,  // input stride between vector-length channel blocks
+  unsigned int pad_top,  // implicit zero-padding rows above the input
+  unsigned int valid_input_rows,  // number of input rows that may be read
+  unsigned int pad_left,  // implicit zero-padding columns left of the input
+  unsigned int valid_input_cols,  // number of input columns that may be read
+  const int8_t *weights,  // packed filter weights (layout produced by the matching interleave routine)
+  int8_t **outptrs,  // one output pointer per output row
+  const size_t *outlds,  // per-row output column strides -- presumably matches ld_out_cols in Args; verify against caller
+  const size_t *outvllds,  // per-row output vector-length strides -- presumably matches ld_out_vls in Args; verify against caller
+  unsigned int output_cols,  // number of output columns to compute
+  unsigned int start_channel,  // first channel handled by this invocation
+  unsigned int valid_channels,  // total number of valid channels
+  const arm_gemm::Requantize32 &qp  // requantization parameters (offsets, multipliers, shifts, clamp bounds)
+);
+
+class sme2_s8q_planar_5x5_s2_4rows_dot_za : public PlanarStrategy<int8_t, int8_t>  // Strategy wrapper exposing the SME2 s8q 5x5/s2 planar kernel to the depthwise framework.
+{
+  using Parent = PlanarStrategy<int8_t, int8_t>;  // shorthand for the base strategy type
+
+  public:
+  using return_type = int8_t;  // output element type: signed 8-bit quantized
+  constexpr static auto output_rows = 4u;  // output rows produced per kernel invocation
+  constexpr static auto kernel_rows = 5u, kernel_cols = 5u;  // 5x5 depthwise filter
+  constexpr static auto stride_rows = 2u, stride_cols = 2u;  // stride-2 in both dimensions
+  constexpr static auto vl_type = arm_gemm::VLType::SME;  // kernel requires SME vector-length support
+
+  sme2_s8q_planar_5x5_s2_4rows_dot_za(const CPUInfo *)  // CPUInfo is unused: the shape constants above fully configure the base
+    : Parent(kernel_rows, kernel_cols, stride_rows, stride_cols, output_rows, vl_type)
+  {
+  }
+
+  typename Parent::KernelType get_kernel(void) const override  // hand the framework the out-of-line SME2 assembly implementation
+  {
+    return sme2_s8q_planar_5x5_s2_4rows_dot_za_impl;
+  }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_5x5_s2_4rows_dot_za/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_5x5_s2_4rows_dot_za/generic.cpp
new file mode 100644
index 0000000000..3da0d14d74
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_5x5_s2_4rows_dot_za/generic.cpp
@@ -0,0 +1,1354 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if defined(ARM_COMPUTE_ENABLE_SME2)
+
+#include <algorithm>
+#include <cstddef>
+#include "arm_gemm.hpp"
+
+using arm_gemm::Requantize32;
+
+namespace arm_conv {
+namespace depthwise {
+
+void sme2_s8q_planar_5x5_s2_4rows_dot_za_impl(
+ const int8_t *inptr,
+ size_t ld_in_row,
+ size_t ld_in_col,
+ size_t ld_in_vl,
+ unsigned int pad_top,
+ unsigned int valid_input_rows,
+ unsigned int pad_left,
+ unsigned int valid_input_cols,
+ const int8_t *weights,
+ int8_t **outptrs,
+ const size_t *outlds,
+ const size_t *outvllds,
+ unsigned int output_cols,
+ unsigned int start_channel,
+ unsigned int valid_channels,
+ const arm_gemm::Requantize32 &qp
+)
+{
+ struct Args
+ {
+ const int8_t *inptr;
+ size_t ld_in_vl;
+ long unsigned int pad_top, pad_bottom, pad_left;
+ const int8_t *weights;
+ long unsigned int input_cols, output_cols;
+ int8_t **outptrs;
+ const size_t *ld_out_cols;
+ const size_t *ld_out_vls;
+ long unsigned int current_channel, n_channels;
+ };
+
+ Args args = { inptr, ld_in_vl, pad_top, 11u - std::min(11u, pad_top + valid_input_rows), pad_left, weights, valid_input_cols, output_cols, outptrs, outlds, outvllds, start_channel, valid_channels };
+
+ __asm__ __volatile__(
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "ldr x3, [%x[args], %[offsetof_Args_pad_bottom]]\n"
+ "ptrue p2.b\n"
+ "mov x20, #0xb\n"
+ "ldr x4, [%x[args], %[offsetof_Args_pad_top]]\n"
+ "ld1rh { z7.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_a_offset]]\n"
+ "sub x20, x20, x3\n"
+ ".inst 0x25207812 // ptrue pn10.b\n"
+ "ldr x5, [%x[args], %[offsetof_Args_n_channels]]\n"
+ "whilelt p1.s, XZR, x5\n"
+ "whilelt p9.s, XZR, x20\n"
+ "ld1rw { z10.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_c_offset]]\n"
+ "whilelt p8.s, XZR, x4\n"
+ "addvl SP, SP, #-15\n"
+ "ldr x6, [%x[args], %[offsetof_Args_current_channel]]\n"
+ "neg z7.h, p2/M, z7.h\n"
+ "eor p8.b, p2/Z, p8.b, p9.b\n"
+ "ld1rw { z6.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_per_layer_mul]]\n"
+ "ld1rw { z4.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_per_layer_right_shift]]\n"
+ "ld1rw { z5.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_minval]]\n"
+ "ld1rw { z21.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_maxval]]\n"
+ "1:" // Channel loop
+ "ldr x20, [%x[qp], %[offsetof_Requantize32_bias]]\n"
+ "mov z12.s, #0x0\n"
+ "cbz x20, 2f\n"
+ "ld1w { z12.s }, p1/Z, [x20, x6, LSL #2]\n"
+ "2:" // Load bias: Done
+ "ldr x22, [%x[args], %[offsetof_Args_weights]]\n"
+ "mov x20, x22\n"
+ "ld1sb { z13.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "ld1rh { z28.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_b_offset]]\n"
+ "sub z13.h, z13.h, z28.h\n"
+ "incw x22\n"
+ "mov z26.h, #0x0\n"
+ "ld1sb { z22.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "sub z22.h, z22.h, z28.h\n"
+ "trn1 z17.h, z13.h, z22.h\n"
+ "ld1sb { z20.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "sub z20.h, z20.h, z28.h\n"
+ "addvl x21, SP, #15\n"
+ "ld1sb { z1.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "sub z1.h, z1.h, z28.h\n"
+ "trn1 z29.h, z20.h, z1.h\n"
+ "ld1sb { z27.s }, p2/Z, [x20]\n"
+ "mov x20, x22\n"
+ "sub z27.h, z27.h, z28.h\n"
+ "incw x22\n"
+ "ld1sb { z14.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "sub z14.h, z14.h, z28.h\n"
+ "addvl x21, x21, #-3\n"
+ "ld1sb { z18.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "sub z18.h, z18.h, z28.h\n"
+ "trn1 z22.h, z27.h, z26.h\n"
+ "ld1sb { z23.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "sub z23.h, z23.h, z28.h\n"
+ "st1h { z17.h }, p2, [x21]\n"
+ "ld1sb { z30.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "sub z30.h, z30.h, z28.h\n"
+ "trn1 z8.h, z14.h, z18.h\n"
+ "ld1sb { z15.s }, p2/Z, [x20]\n"
+ "mov x20, x22\n"
+ "st1h { z29.h }, p2, [x21, #1, MUL VL]\n"
+ "sub z15.h, z15.h, z28.h\n"
+ "ld1sb { z20.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "trn1 z23.h, z23.h, z30.h\n"
+ "sub z20.h, z20.h, z28.h\n"
+ "ld1sb { z24.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "sub z24.h, z24.h, z28.h\n"
+ "st1h { z22.h }, p2, [x21, #2, MUL VL]\n"
+ "ld1sb { z16.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "trn1 z0.h, z15.h, z26.h\n"
+ "incw x22\n"
+ "ld1sb { z13.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "sub z16.h, z16.h, z28.h\n"
+ "sub z13.h, z13.h, z28.h\n"
+ "ld1sb { z11.s }, p2/Z, [x20]\n"
+ "addvl x21, x21, #-3\n"
+ "mov x20, x22\n"
+ "st1h { z8.h }, p2, [x21]\n"
+ "trn1 z27.h, z20.h, z24.h\n"
+ "ld1sb { z22.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "sub z11.h, z11.h, z28.h\n"
+ "ld1sb { z3.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "st1h { z23.h }, p2, [x21, #1, MUL VL]\n"
+ "trn1 z20.h, z16.h, z13.h\n"
+ "ld1sb { z13.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "sub z22.h, z22.h, z28.h\n"
+ "sub z3.h, z3.h, z28.h\n"
+ "ld1sb { z15.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "st1h { z0.h }, p2, [x21, #2, MUL VL]\n"
+ "trn1 z29.h, z11.h, z26.h\n"
+ "ld1sb { z16.s }, p2/Z, [x20]\n"
+ "incw x22\n"
+ "sub z13.h, z13.h, z28.h\n"
+ "sub z15.h, z15.h, z28.h\n"
+ "addvl x21, x21, #-3\n"
+ "mov x20, x22\n"
+ "st1h { z27.h }, p2, [x21]\n"
+ "sub z16.h, z16.h, z28.h\n"
+ "trn1 z19.h, z22.h, z3.h\n"
+ "ld1sb { z17.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "st1h { z20.h }, p2, [x21, #1, MUL VL]\n"
+ "ld1sb { z0.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "trn1 z31.h, z13.h, z15.h\n"
+ "st1h { z29.h }, p2, [x21, #2, MUL VL]\n"
+ "ld1sb { z18.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "trn1 z16.h, z16.h, z26.h\n"
+ "sub z17.h, z17.h, z28.h\n"
+ "ld1sb { z22.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "sub z0.h, z0.h, z28.h\n"
+ "sub z18.h, z18.h, z28.h\n"
+ "ld1sb { z1.s }, p2/Z, [x20]\n"
+ "sub z22.h, z22.h, z28.h\n"
+ "sub z1.h, z1.h, z28.h\n"
+ "ldr x20, [%x[qp], %[offsetof_Requantize32_per_channel_muls]]\n"
+ "addvl x21, x21, #-3\n"
+ "st1h { z19.h }, p2, [x21]\n"
+ "mov z13.d, z12.d\n"
+ "mov z14.d, z12.d\n"
+ "st1h { z31.h }, p2, [x21, #1, MUL VL]\n"
+ "mov z15.d, z12.d\n"
+ "trn1 z8.h, z17.h, z0.h\n"
+ "st1h { z16.h }, p2, [x21, #2, MUL VL]\n"
+ "addvl x21, x21, #-3\n"
+ "trn1 z31.h, z18.h, z22.h\n"
+ "trn1 z29.h, z1.h, z26.h\n"
+ "st1h { z8.h }, p2, [x21]\n"
+ "st1h { z31.h }, p2, [x21, #1, MUL VL]\n"
+ "st1h { z29.h }, p2, [x21, #2, MUL VL]\n"
+ "cbz x20, 3f\n"
+ "ld1w { z6.s }, p1/Z, [x20, x6, LSL #2]\n"
+ "3:" // Load mul: End
+ "ldr x20, [%x[qp], %[offsetof_Requantize32_per_channel_right_shifts]]\n"
+ "cbz x20, 4f\n"
+ "ld1w { z4.s }, p1/Z, [x20, x6, LSL #2]\n"
+ "4:" // Load right_shift: End
+ "ldr x7, [%x[args], %[offsetof_Args_input_cols]]\n"
+ "sub x20, x7, #0x1\n"
+ "orr x23, x20, %x[ld_in_col], LSL #16\n"
+ "ldr x17, [%x[args], %[offsetof_Args_inptr]]\n"
+ "orr x23, x5, x23, LSL #22\n"
+ "mov x22, #0xb\n"
+ "add x21, x4, x3\n"
+ "lsl x20, %x[ld_in_row], #0x0\n"
+ "ldr x16, [%x[args], %[offsetof_Args_output_cols]]\n"
+ "mov x8, #0x0\n"
+ "lsl x23, x23, #0x0\n"
+ "sub x22, x22, x21\n"
+ "madd x20, x20, x4, x17\n"
+ "5:" // Issue prefetches
+ "subs x22, x22, #0x1\n"
+ ".inst 0xf8b74a9c // rprfm pldstrm, x23, [x20]\n"
+ "add x20, x20, %x[ld_in_col]\n"
+ "bgt 5b\n"
+ "ldr x23, [%x[args], %[offsetof_Args_outptrs]]\n"
+ "lsl x20, %x[ld_in_row], #0x0\n"
+ "msub x17, x4, x20, x17\n"
+ ".inst 0xc0040d80 // mova za.d[x8, #0], { z12.d-z15.d }\n"
+ "ldr x20, [%x[args], %[offsetof_Args_ld_out_cols]]\n"
+ ".inst 0xc0040d81 // mova za.d[x8, #1], { z12.d-z15.d }\n"
+ "mov x22, #0x4\n"
+ "ldp x15, x14, [x23], #0x10\n"
+ ".inst 0xc0040d82 // mova za.d[x8, #2], { z12.d-z15.d }\n"
+ "ldp x13, x11, [x20], #0x10\n"
+ ".inst 0xc0040d83 // mova za.d[x8, #3], { z12.d-z15.d }\n"
+ "ldr x21, [%x[args], %[offsetof_Args_pad_left]]\n"
+ ".inst 0xc0040d84 // mova za.d[x8, #4], { z12.d-z15.d }\n"
+ "ldp x10, x9, [x23], #0x10\n"
+ "ldp x28, x27, [x20], #0x10\n"
+ "cbz x21, 7f\n"
+ "cmp x21, x22\n"
+ "csel x20, x21, x22, LT\n"
+ "sub x21, x21, x20\n"
+ "sub x22, x22, x20\n"
+ "cbz x21, 7f\n"
+ ".inst 0xc0060c1c // mova { z28.d-z31.d }, za.d[x8, #0]\n"
+ ".inst 0xc1a6ac1c // sqdmulh { z28.s-z31.s }, { z28.s-z31.s }, z6.s\n"
+ "and x22, x21, #0x1\n"
+ ".inst 0xc1a4aa3c // srshl { z28.s-z31.s }, { z28.s-z31.s }, z4.s\n"
+ "add x21, x21, #0x1\n"
+ "lsr x21, x21, #0x1\n"
+ ".inst 0xc1aaab1c // add { z28.s-z31.s }, { z28.s-z31.s }, z10.s\n"
+ "sub x16, x16, x21\n"
+ ".inst 0xc1b5ccbc // sclamp { z28.s-z31.s }, z5.s, z21.s\n"
+ "6:" // Left padding
+ "subs x21, x21, #0x1\n"
+ "st1b { z28.s }, p1, [x15]\n"
+ "add x15, x15, x13\n"
+ "st1b { z29.s }, p1, [x14]\n"
+ "add x14, x14, x11\n"
+ "st1b { z30.s }, p1, [x10]\n"
+ "add x10, x10, x28\n"
+ "st1b { z31.s }, p1, [x9]\n"
+ "add x9, x9, x27\n"
+ "bgt 6b\n"
+ "7:" // Left padding: End
+ "adds XZR, x4, x3\n"
+ "bne 14f\n"
+ "cbz x22, 12f\n"
+ "cmp x22, #0x1\n"
+ "sub x7, x7, x22\n"
+ "beq 11f\n"
+ "cmp x22, #0x2\n"
+ "beq 10f\n"
+ "cmp x22, #0x3\n"
+ "beq 9f\n"
+ "8:" // Unpadded: 4 priming loads
+ "add x21, x17, %x[ld_in_row]\n"
+ "ld1sb { z27.s }, p1/Z, [x17]\n"
+ "addvl x20, SP, #12\n"
+ "ld1sb { z0.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z27.h, z27.h, z0.h\n"
+ "add z27.h, z27.h, z7.h\n"
+ "ld1sb { z28.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "add x17, x17, %x[ld_in_col]\n"
+ "ld1sb { z11.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z28.h, z28.h, z11.h\n"
+ "add z28.h, z28.h, z7.h\n"
+ "ld1sb { z29.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "ld1sb { z8.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z29.h, z29.h, z8.h\n"
+ "add z29.h, z29.h, z7.h\n"
+ "ld1sb { z30.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "ld1sb { z17.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z30.h, z30.h, z17.h\n"
+ "add z30.h, z30.h, z7.h\n"
+ "ld1sb { z31.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "ld1sb { z26.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z31.h, z31.h, z26.h\n"
+ "add z31.h, z31.h, z7.h\n"
+ ".inst 0xa1402a80 // ld1h { z0.h, z8.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc1701768 // sdot za.s[x8, 0], { z27.h-z30.h }, z0.h\n"
+ "ld1sb { z20.s }, p1/Z, [x21]\n"
+ "mov z0.d, z20.d\n"
+ "add z0.h, z0.h, z7.h\n"
+ ".inst 0xc1781788 // sdot za.s[x8, 0], { z28.h-z31.h }, z8.h\n"
+ "ld1h { z8.h }, p2/Z, [x20, #2, MUL VL]\n"
+ ".inst 0xc17817a8 // sdot za.s[x8, 0], { z29.h-z0.h }, z8.h\n"
+ "9:" // Unpadded: 3 priming loads
+ "add x21, x17, %x[ld_in_row]\n"
+ "ld1sb { z29.s }, p1/Z, [x17]\n"
+ "addvl x20, SP, #9\n"
+ "ld1sb { z17.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z29.h, z29.h, z17.h\n"
+ "add z29.h, z29.h, z7.h\n"
+ "ld1sb { z30.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "add x17, x17, %x[ld_in_col]\n"
+ "ld1sb { z0.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z30.h, z30.h, z0.h\n"
+ "add z30.h, z30.h, z7.h\n"
+ "ld1sb { z31.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "ld1sb { z16.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z31.h, z31.h, z16.h\n"
+ "add z31.h, z31.h, z7.h\n"
+ "ld1sb { z0.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "ld1sb { z16.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z0.h, z0.h, z16.h\n"
+ "add z0.h, z0.h, z7.h\n"
+ "ld1sb { z1.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "ld1sb { z16.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z1.h, z1.h, z16.h\n"
+ "add z1.h, z1.h, z7.h\n"
+ ".inst 0xa0402a82 // ld1h { z2.h-z3.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc17217a8 // sdot za.s[x8, 0], { z29.h-z0.h }, z2.h\n"
+ "ld1sb { z16.s }, p1/Z, [x21]\n"
+ "mov z2.d, z16.d\n"
+ "add z2.h, z2.h, z7.h\n"
+ ".inst 0xc17317c8 // sdot za.s[x8, 0], { z30.h-z1.h }, z3.h\n"
+ "ld1h { z8.h }, p2/Z, [x20, #2, MUL VL]\n"
+ ".inst 0xc17817e8 // sdot za.s[x8, 0], { z31.h-z2.h }, z8.h\n"
+ "10:" // Unpadded: 2 priming loads
+ "add x22, x17, %x[ld_in_row]\n"
+ "ld1sb { z26.s }, p1/Z, [x17]\n"
+ "addvl x21, SP, #6\n"
+ "ld1sb { z16.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "trn1 z26.h, z26.h, z16.h\n"
+ "add z26.h, z26.h, z7.h\n"
+ "ld1sb { z27.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "addvl x20, SP, #12\n"
+ "ld1sb { z16.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "trn1 z27.h, z27.h, z16.h\n"
+ "add z27.h, z27.h, z7.h\n"
+ "ld1sb { z28.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "add x17, x17, %x[ld_in_col]\n"
+ "ld1sb { z29.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "trn1 z28.h, z28.h, z29.h\n"
+ "add z28.h, z28.h, z7.h\n"
+ "ld1sb { z29.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "ld1sb { z19.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "trn1 z29.h, z29.h, z19.h\n"
+ "add z29.h, z29.h, z7.h\n"
+ "ld1sb { z30.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "ld1sb { z23.s }, p1/Z, [x22]\n"
+ "trn1 z30.h, z30.h, z23.h\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "add z30.h, z30.h, z7.h\n"
+ ".inst 0xa0402aa2 // ld1h { z2.h-z3.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc1721748 // sdot za.s[x8, 0], { z26.h-z29.h }, z2.h\n"
+ "ld1sb { z22.s }, p1/Z, [x22]\n"
+ "mov z31.d, z22.d\n"
+ ".inst 0xc1731768 // sdot za.s[x8, 0], { z27.h-z30.h }, z3.h\n"
+ ".inst 0xa1402a83 // ld1h { z3.h, z11.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc1731749 // sdot za.s[x8, 1], { z26.h-z29.h }, z3.h\n"
+ "add z31.h, z31.h, z7.h\n"
+ "ld1h { z3.h }, p2/Z, [x21, #2, MUL VL]\n"
+ ".inst 0xc17b1769 // sdot za.s[x8, 1], { z27.h-z30.h }, z11.h\n"
+ ".inst 0xc1731788 // sdot za.s[x8, 0], { z28.h-z31.h }, z3.h\n"
+ "ld1h { z0.h }, p2/Z, [x20, #2, MUL VL]\n"
+ ".inst 0xc1701789 // sdot za.s[x8, 1], { z28.h-z31.h }, z0.h\n"
+ "11:" // Unpadded: 1 priming loads
+ "add x22, x17, %x[ld_in_row]\n"
+ "ld1sb { z29.s }, p1/Z, [x17]\n"
+ "addvl x21, SP, #3\n"
+ "ld1sb { z22.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "trn1 z29.h, z29.h, z22.h\n"
+ "add z29.h, z29.h, z7.h\n"
+ "ld1sb { z30.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "addvl x20, SP, #9\n"
+ "ld1sb { z25.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "trn1 z30.h, z30.h, z25.h\n"
+ "add z30.h, z30.h, z7.h\n"
+ "ld1sb { z31.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "add x17, x17, %x[ld_in_col]\n"
+ "ld1sb { z16.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "trn1 z31.h, z31.h, z16.h\n"
+ "add z31.h, z31.h, z7.h\n"
+ "ld1sb { z0.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "ld1sb { z16.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "trn1 z0.h, z0.h, z16.h\n"
+ "add z0.h, z0.h, z7.h\n"
+ "ld1sb { z1.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "ld1sb { z2.s }, p1/Z, [x22]\n"
+ "trn1 z1.h, z1.h, z2.h\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "add z1.h, z1.h, z7.h\n"
+ ".inst 0xa0402aa2 // ld1h { z2.h-z3.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc17217a8 // sdot za.s[x8, 0], { z29.h-z0.h }, z2.h\n"
+ "ld1sb { z24.s }, p1/Z, [x22]\n"
+ "mov z2.d, z24.d\n"
+ ".inst 0xc17317c8 // sdot za.s[x8, 0], { z30.h-z1.h }, z3.h\n"
+ ".inst 0xa0402a88 // ld1h { z8.h-z9.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc17817a9 // sdot za.s[x8, 1], { z29.h-z0.h }, z8.h\n"
+ "add z2.h, z2.h, z7.h\n"
+ "ld1h { z3.h }, p2/Z, [x21, #2, MUL VL]\n"
+ ".inst 0xc17917c9 // sdot za.s[x8, 1], { z30.h-z1.h }, z9.h\n"
+ ".inst 0xc17317e8 // sdot za.s[x8, 0], { z31.h-z2.h }, z3.h\n"
+ "ld1h { z3.h }, p2/Z, [x20, #2, MUL VL]\n"
+ ".inst 0xc17317e9 // sdot za.s[x8, 1], { z31.h-z2.h }, z3.h\n"
+ "12:" // Unpadded: 0 priming loads
+ "cmp x7, #0x2\n"
+ ".inst 0xa1402be3 // ld1h { z3.h, z11.h }, pn10.b/Z, [SP]\n"
+ "ld1h { z2.h }, p2/Z, [SP, #2, MUL VL]\n"
+ "blt 22f\n"
+ "add x21, x17, %x[ld_in_row]\n"
+ "ld1sb { z23.s }, p1/Z, [x17]\n"
+ "sub x7, x7, #0x2\n"
+ "ld1sb { z25.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z23.h, z23.h, z25.h\n"
+ "sub x16, x16, #0x1\n"
+ "ld1sb { z24.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "lsr x20, x7, #0x1\n"
+ "add z23.h, z23.h, z7.h\n"
+ "ld1sb { z30.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z24.h, z24.h, z30.h\n"
+ "cmp x20, x16\n"
+ "ld1sb { z25.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "csel x26, x20, x16, LT\n"
+ "add z24.h, z24.h, z7.h\n"
+ "ld1sb { z22.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z25.h, z25.h, z22.h\n"
+ "add z25.h, z25.h, z7.h\n"
+ "ld1sb { z26.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "add x17, x17, %x[ld_in_col]\n"
+ "ld1sb { z22.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z26.h, z26.h, z22.h\n"
+ "add z26.h, z26.h, z7.h\n"
+ "ld1sb { z27.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "and x7, x7, #0x1\n"
+ "ld1sb { z30.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z27.h, z27.h, z30.h\n"
+ "add z27.h, z27.h, z7.h\n"
+ "ld1sb { z28.s }, p1/Z, [x21]\n"
+ "mov z28.d, z28.d\n"
+ "add z28.h, z28.h, z7.h\n"
+ "sub x16, x16, x26\n"
+ "cbz x26, 21f\n"
+ "13:" // Unpadded: Main loop
+ ".inst 0xc17316e8 // sdot za.s[x8, 0], { z23.h-z26.h }, z3.h\n"
+ "addvl x25, SP, #6\n"
+ "addvl x24, SP, #12\n"
+ ".inst 0xc17b1708 // sdot za.s[x8, 0], { z24.h-z27.h }, z11.h\n"
+ ".inst 0xa0402b20 // ld1h { z0.h-z1.h }, pn10.b/Z, [x25]\n"
+ "add x23, x17, %x[ld_in_row]\n"
+ "addvl x22, SP, #3\n"
+ ".inst 0xc17016e9 // sdot za.s[x8, 1], { z23.h-z26.h }, z0.h\n"
+ "addvl x21, SP, #9\n"
+ "subs x26, x26, #0x1\n"
+ ".inst 0xc1711709 // sdot za.s[x8, 1], { z24.h-z27.h }, z1.h\n"
+ ".inst 0xa0402b08 // ld1h { z8.h-z9.h }, pn10.b/Z, [x24]\n"
+ ".inst 0xc17816ea // sdot za.s[x8, 2], { z23.h-z26.h }, z8.h\n"
+ "ld1sb { z23.s }, p1/Z, [x17]\n"
+ "add x17, x17, %x[ld_in_col]\n"
+ "add x20, x17, %x[ld_in_row]\n"
+ ".inst 0xc1721728 // sdot za.s[x8, 0], { z25.h-z28.h }, z2.h\n"
+ "ld1h { z0.h }, p2/Z, [x25, #2, MUL VL]\n"
+ ".inst 0xc179170a // sdot za.s[x8, 2], { z24.h-z27.h }, z9.h\n"
+ "ld1sb { z16.s }, p1/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row]\n"
+ "trn1 z23.h, z23.h, z16.h\n"
+ ".inst 0xc1701729 // sdot za.s[x8, 1], { z25.h-z28.h }, z0.h\n"
+ "ld1h { z9.h }, p2/Z, [x24, #2, MUL VL]\n"
+ "add z23.h, z23.h, z7.h\n"
+ "ld1sb { z24.s }, p1/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row]\n"
+ ".inst 0xc179172a // sdot za.s[x8, 2], { z25.h-z28.h }, z9.h\n"
+ "ld1sb { z18.s }, p1/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row]\n"
+ "trn1 z24.h, z24.h, z18.h\n"
+ "add z24.h, z24.h, z7.h\n"
+ "ld1sb { z25.s }, p1/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row]\n"
+ ".inst 0xc0060c10 // mova { z16.d-z19.d }, za.d[x8, #0]\n"
+ "add x8, x8, #0x1\n"
+ "ld1sb { z8.s }, p1/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row]\n"
+ "trn1 z25.h, z25.h, z8.h\n"
+ "add z25.h, z25.h, z7.h\n"
+ "ld1sb { z26.s }, p1/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row]\n"
+ ".inst 0xc1a6ac10 // sqdmulh { z16.s-z19.s }, { z16.s-z19.s }, z6.s\n"
+ "ld1sb { z28.s }, p1/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row]\n"
+ "trn1 z26.h, z26.h, z28.h\n"
+ "add z26.h, z26.h, z7.h\n"
+ "ld1sb { z27.s }, p1/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row]\n"
+ ".inst 0xc1a4aa30 // srshl { z16.s-z19.s }, { z16.s-z19.s }, z4.s\n"
+ "ld1sb { z28.s }, p1/Z, [x23]\n"
+ "trn1 z27.h, z27.h, z28.h\n"
+ "add x23, x23, %x[ld_in_row]\n"
+ "add z27.h, z27.h, z7.h\n"
+ ".inst 0xa0402ac2 // ld1h { z2.h-z3.h }, pn10.b/Z, [x22]\n"
+ ".inst 0xc17216e8 // sdot za.s[x8, 0], { z23.h-z26.h }, z2.h\n"
+ ".inst 0xc1aaab10 // add { z16.s-z19.s }, { z16.s-z19.s }, z10.s\n"
+ "ld1sb { z20.s }, p1/Z, [x23]\n"
+ "mov z28.d, z20.d\n"
+ ".inst 0xc1731708 // sdot za.s[x8, 0], { z24.h-z27.h }, z3.h\n"
+ ".inst 0xa0402aa0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc17016e9 // sdot za.s[x8, 1], { z23.h-z26.h }, z0.h\n"
+ "add z28.h, z28.h, z7.h\n"
+ "ld1h { z0.h }, p2/Z, [x22, #2, MUL VL]\n"
+ ".inst 0xc1711709 // sdot za.s[x8, 1], { z24.h-z27.h }, z1.h\n"
+ ".inst 0xc1b5ccb0 // sclamp { z16.s-z19.s }, z5.s, z21.s\n"
+ ".inst 0xc1701728 // sdot za.s[x8, 0], { z25.h-z28.h }, z0.h\n"
+ "ld1h { z0.h }, p2/Z, [x21, #2, MUL VL]\n"
+ "st1b { z16.s }, p1, [x15]\n"
+ "add x15, x15, x13\n"
+ "ld1sb { z23.s }, p1/Z, [x17]\n"
+ ".inst 0xc1701729 // sdot za.s[x8, 1], { z25.h-z28.h }, z0.h\n"
+ "st1b { z17.s }, p1, [x14]\n"
+ "add x14, x14, x11\n"
+ "ld1sb { z16.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z23.h, z23.h, z16.h\n"
+ "st1b { z18.s }, p1, [x10]\n"
+ "ld1sb { z24.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "add x10, x10, x28\n"
+ "st1b { z19.s }, p1, [x9]\n"
+ "ld1sb { z16.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z24.h, z24.h, z16.h\n"
+ "add x9, x9, x27\n"
+ "ld1sb { z25.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0xc0040d84 // mova za.d[x8, #4], { z12.d-z15.d }\n"
+ "add z23.h, z23.h, z7.h\n"
+ "ld1sb { z16.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z25.h, z25.h, z16.h\n"
+ "add z24.h, z24.h, z7.h\n"
+ "ld1sb { z26.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "add z25.h, z25.h, z7.h\n"
+ "add x17, x17, %x[ld_in_col]\n"
+ "ld1sb { z16.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z26.h, z26.h, z16.h\n"
+ "add z26.h, z26.h, z7.h\n"
+ "ld1sb { z27.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "ld1sb { z16.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z27.h, z27.h, z16.h\n"
+ "add z27.h, z27.h, z7.h\n"
+ "ld1sb { z16.s }, p1/Z, [x20]\n"
+ "mov z28.d, z16.d\n"
+ "add z28.h, z28.h, z7.h\n"
+ ".inst 0xa1402be3 // ld1h { z3.h, z11.h }, pn10.b/Z, [SP]\n"
+ "ld1h { z2.h }, p2/Z, [SP, #2, MUL VL]\n"
+ "bgt 13b\n"
+ "b 21f\n"
+ "14:" // Padded
+ "cbz x22, 19f\n"
+ "cmp x22, #0x1\n"
+ "sub x7, x7, x22\n"
+ "beq 18f\n"
+ "cmp x22, #0x2\n"
+ "beq 17f\n"
+ "cmp x22, #0x3\n"
+ "beq 16f\n"
+ "15:" // Padded: 4 priming loads
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1sb { z27.s }, p0/Z, [x17]\n"
+ "add z27.h, p0/M, z27.h, z7.h\n"
+ "add x21, x17, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1sb { z17.s }, p0/Z, [x21]\n"
+ "add z17.h, p0/M, z17.h, z7.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1sb { z28.s }, p0/Z, [x21]\n"
+ "add z28.h, p0/M, z28.h, z7.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1sb { z16.s }, p0/Z, [x21]\n"
+ "add z16.h, p0/M, z16.h, z7.h\n"
+ "mov x12, #0x4\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z27.h, z27.h, z17.h\n"
+ "trn1 z28.h, z28.h, z16.h\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1sb { z29.s }, p0/Z, [x21]\n"
+ "add z29.h, p0/M, z29.h, z7.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1sb { z18.s }, p0/Z, [x21]\n"
+ "add z18.h, p0/M, z18.h, z7.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1sb { z30.s }, p0/Z, [x21]\n"
+ "add z30.h, p0/M, z30.h, z7.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1sb { z17.s }, p0/Z, [x21]\n"
+ "mov x12, #0x8\n"
+ "add z17.h, p0/M, z17.h, z7.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1sb { z31.s }, p0/Z, [x21]\n"
+ "add z31.h, p0/M, z31.h, z7.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1sb { z16.s }, p0/Z, [x21]\n"
+ "add z16.h, p0/M, z16.h, z7.h\n"
+ "addvl x20, SP, #12\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z29.h, z29.h, z18.h\n"
+ "trn1 z30.h, z30.h, z17.h\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ ".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
+ "trn1 z31.h, z31.h, z16.h\n"
+ ".inst 0xc1701768 // sdot za.s[x8, 0], { z27.h-z30.h }, z0.h\n"
+ "ld1sb { z20.s }, p0/Z, [x21]\n"
+ "add z20.h, p0/M, z20.h, z7.h\n"
+ "mov z0.d, z20.d\n"
+ "add x17, x17, %x[ld_in_col]\n"
+ ".inst 0xc1711788 // sdot za.s[x8, 0], { z28.h-z31.h }, z1.h\n"
+ "ld1h { z1.h }, p2/Z, [x20, #2, MUL VL]\n"
+ ".inst 0xc17117a8 // sdot za.s[x8, 0], { z29.h-z0.h }, z1.h\n"
+ "16:" // Padded: 3 priming loads
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1sb { z24.s }, p0/Z, [x17]\n"
+ "add z24.h, p0/M, z24.h, z7.h\n"
+ "add x21, x17, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1sb { z17.s }, p0/Z, [x21]\n"
+ "add z17.h, p0/M, z17.h, z7.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1sb { z25.s }, p0/Z, [x21]\n"
+ "add z25.h, p0/M, z25.h, z7.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1sb { z16.s }, p0/Z, [x21]\n"
+ "add z16.h, p0/M, z16.h, z7.h\n"
+ "mov x12, #0x4\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z24.h, z24.h, z17.h\n"
+ "trn1 z25.h, z25.h, z16.h\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1sb { z26.s }, p0/Z, [x21]\n"
+ "add z26.h, p0/M, z26.h, z7.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1sb { z18.s }, p0/Z, [x21]\n"
+ "add z18.h, p0/M, z18.h, z7.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1sb { z27.s }, p0/Z, [x21]\n"
+ "add z27.h, p0/M, z27.h, z7.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1sb { z17.s }, p0/Z, [x21]\n"
+ "mov x12, #0x8\n"
+ "add z17.h, p0/M, z17.h, z7.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1sb { z28.s }, p0/Z, [x21]\n"
+ "add z28.h, p0/M, z28.h, z7.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1sb { z16.s }, p0/Z, [x21]\n"
+ "add z16.h, p0/M, z16.h, z7.h\n"
+ "addvl x20, SP, #9\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z26.h, z26.h, z18.h\n"
+ "trn1 z27.h, z27.h, z17.h\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ ".inst 0xa0402a82 // ld1h { z2.h-z3.h }, pn10.b/Z, [x20]\n"
+ "trn1 z28.h, z28.h, z16.h\n"
+ ".inst 0xc1721708 // sdot za.s[x8, 0], { z24.h-z27.h }, z2.h\n"
+ "ld1sb { z11.s }, p0/Z, [x21]\n"
+ "add z11.h, p0/M, z11.h, z7.h\n"
+ "mov z29.d, z11.d\n"
+ "add x17, x17, %x[ld_in_col]\n"
+ ".inst 0xc1731728 // sdot za.s[x8, 0], { z25.h-z28.h }, z3.h\n"
+ "ld1h { z0.h }, p2/Z, [x20, #2, MUL VL]\n"
+ ".inst 0xc1701748 // sdot za.s[x8, 0], { z26.h-z29.h }, z0.h\n"
+ "17:" // Padded: 2 priming loads
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1sb { z25.s }, p0/Z, [x17]\n"
+ "add z25.h, p0/M, z25.h, z7.h\n"
+ "add x20, x17, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1sb { z17.s }, p0/Z, [x20]\n"
+ "add z17.h, p0/M, z17.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1sb { z26.s }, p0/Z, [x20]\n"
+ "add z26.h, p0/M, z26.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1sb { z16.s }, p0/Z, [x20]\n"
+ "add z16.h, p0/M, z16.h, z7.h\n"
+ "mov x12, #0x4\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z25.h, z25.h, z17.h\n"
+ "trn1 z26.h, z26.h, z16.h\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1sb { z27.s }, p0/Z, [x20]\n"
+ "add z27.h, p0/M, z27.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1sb { z18.s }, p0/Z, [x20]\n"
+ "add z18.h, p0/M, z18.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1sb { z28.s }, p0/Z, [x20]\n"
+ "add z28.h, p0/M, z28.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1sb { z17.s }, p0/Z, [x20]\n"
+ "mov x12, #0x8\n"
+ "add z17.h, p0/M, z17.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1sb { z29.s }, p0/Z, [x20]\n"
+ "add z29.h, p0/M, z29.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1sb { z16.s }, p0/Z, [x20]\n"
+ "add z16.h, p0/M, z16.h, z7.h\n"
+ "addvl x21, SP, #6\n"
+ "trn1 z27.h, z27.h, z18.h\n"
+ "trn1 z28.h, z28.h, z17.h\n"
+ ".inst 0xa1402aa1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x21]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "trn1 z29.h, z29.h, z16.h\n"
+ ".inst 0xc1711728 // sdot za.s[x8, 0], { z25.h-z28.h }, z1.h\n"
+ "ld1sb { z1.s }, p0/Z, [x20]\n"
+ "addvl x20, SP, #12\n"
+ "add z1.h, p0/M, z1.h, z7.h\n"
+ ".inst 0xc1791748 // sdot za.s[x8, 0], { z26.h-z29.h }, z9.h\n"
+ ".inst 0xa0402a82 // ld1h { z2.h-z3.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc1721729 // sdot za.s[x8, 1], { z25.h-z28.h }, z2.h\n"
+ "mov z30.d, z1.d\n"
+ "add x17, x17, %x[ld_in_col]\n"
+ "ld1h { z9.h }, p2/Z, [x21, #2, MUL VL]\n"
+ ".inst 0xc1731749 // sdot za.s[x8, 1], { z26.h-z29.h }, z3.h\n"
+ ".inst 0xc1791768 // sdot za.s[x8, 0], { z27.h-z30.h }, z9.h\n"
+ "ld1h { z0.h }, p2/Z, [x20, #2, MUL VL]\n"
+ ".inst 0xc1701769 // sdot za.s[x8, 1], { z27.h-z30.h }, z0.h\n"
+ "18:" // Padded: 1 priming loads
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1sb { z25.s }, p0/Z, [x17]\n"
+ "add z25.h, p0/M, z25.h, z7.h\n"
+ "add x20, x17, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1sb { z17.s }, p0/Z, [x20]\n"
+ "add z17.h, p0/M, z17.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1sb { z26.s }, p0/Z, [x20]\n"
+ "add z26.h, p0/M, z26.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1sb { z16.s }, p0/Z, [x20]\n"
+ "add z16.h, p0/M, z16.h, z7.h\n"
+ "mov x12, #0x4\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z25.h, z25.h, z17.h\n"
+ "trn1 z26.h, z26.h, z16.h\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1sb { z27.s }, p0/Z, [x20]\n"
+ "add z27.h, p0/M, z27.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1sb { z18.s }, p0/Z, [x20]\n"
+ "add z18.h, p0/M, z18.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1sb { z28.s }, p0/Z, [x20]\n"
+ "add z28.h, p0/M, z28.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1sb { z17.s }, p0/Z, [x20]\n"
+ "mov x12, #0x8\n"
+ "add z17.h, p0/M, z17.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1sb { z29.s }, p0/Z, [x20]\n"
+ "add z29.h, p0/M, z29.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1sb { z16.s }, p0/Z, [x20]\n"
+ "add z16.h, p0/M, z16.h, z7.h\n"
+ "addvl x21, SP, #3\n"
+ "trn1 z27.h, z27.h, z18.h\n"
+ "trn1 z28.h, z28.h, z17.h\n"
+ ".inst 0xa1402aa3 // ld1h { z3.h, z11.h }, pn10.b/Z, [x21]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "trn1 z29.h, z29.h, z16.h\n"
+ ".inst 0xc1731728 // sdot za.s[x8, 0], { z25.h-z28.h }, z3.h\n"
+ "ld1sb { z0.s }, p0/Z, [x20]\n"
+ "addvl x20, SP, #9\n"
+ "add z0.h, p0/M, z0.h, z7.h\n"
+ ".inst 0xc17b1748 // sdot za.s[x8, 0], { z26.h-z29.h }, z11.h\n"
+ ".inst 0xa0402a82 // ld1h { z2.h-z3.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc1721729 // sdot za.s[x8, 1], { z25.h-z28.h }, z2.h\n"
+ "mov z30.d, z0.d\n"
+ "add x17, x17, %x[ld_in_col]\n"
+ "ld1h { z0.h }, p2/Z, [x21, #2, MUL VL]\n"
+ ".inst 0xc1731749 // sdot za.s[x8, 1], { z26.h-z29.h }, z3.h\n"
+ ".inst 0xc1701768 // sdot za.s[x8, 0], { z27.h-z30.h }, z0.h\n"
+ "ld1h { z0.h }, p2/Z, [x20, #2, MUL VL]\n"
+ ".inst 0xc1701769 // sdot za.s[x8, 1], { z27.h-z30.h }, z0.h\n"
+ "19:" // Padded: 0 priming loads
+ "cmp x7, #0x2\n"
+ ".inst 0xa1402be3 // ld1h { z3.h, z11.h }, pn10.b/Z, [SP]\n"
+ "ld1h { z2.h }, p2/Z, [SP, #2, MUL VL]\n"
+ "blt 22f\n"
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1sb { z23.s }, p0/Z, [x17]\n"
+ "add z23.h, p0/M, z23.h, z7.h\n"
+ "add x20, x17, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1sb { z17.s }, p0/Z, [x20]\n"
+ "add z17.h, p0/M, z17.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1sb { z24.s }, p0/Z, [x20]\n"
+ "add z24.h, p0/M, z24.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1sb { z16.s }, p0/Z, [x20]\n"
+ "add z16.h, p0/M, z16.h, z7.h\n"
+ "mov x12, #0x4\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z23.h, z23.h, z17.h\n"
+ "trn1 z24.h, z24.h, z16.h\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1sb { z25.s }, p0/Z, [x20]\n"
+ "add z25.h, p0/M, z25.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1sb { z19.s }, p0/Z, [x20]\n"
+ "add z19.h, p0/M, z19.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1sb { z26.s }, p0/Z, [x20]\n"
+ "add z26.h, p0/M, z26.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1sb { z18.s }, p0/Z, [x20]\n"
+ "mov x12, #0x8\n"
+ "add z18.h, p0/M, z18.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1sb { z27.s }, p0/Z, [x20]\n"
+ "add z27.h, p0/M, z27.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1sb { z17.s }, p0/Z, [x20]\n"
+ "add z17.h, p0/M, z17.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1sb { z16.s }, p0/Z, [x20]\n"
+ "add z16.h, p0/M, z16.h, z7.h\n"
+ "sub x7, x7, #0x2\n"
+ "sub x16, x16, #0x1\n"
+ "trn1 z25.h, z25.h, z19.h\n"
+ "trn1 z26.h, z26.h, z18.h\n"
+ "lsr x20, x7, #0x1\n"
+ "cmp x20, x16\n"
+ "trn1 z27.h, z27.h, z17.h\n"
+ "mov z28.d, z16.d\n"
+ "csel x25, x20, x16, LT\n"
+ "add x17, x17, %x[ld_in_col]\n"
+ "and x7, x7, #0x1\n"
+ "sub x16, x16, x25\n"
+ "cbz x25, 21f\n"
+ "20:" // Padded: Main loop
+ ".inst 0xc17316e8 // sdot za.s[x8, 0], { z23.h-z26.h }, z3.h\n"
+ "addvl x24, SP, #6\n"
+ "addvl x23, SP, #12\n"
+ ".inst 0xc17b1708 // sdot za.s[x8, 0], { z24.h-z27.h }, z11.h\n"
+ ".inst 0xa1402b00 // ld1h { z0.h, z8.h }, pn10.b/Z, [x24]\n"
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ ".inst 0xc17016e9 // sdot za.s[x8, 1], { z23.h-z26.h }, z0.h\n"
+ "add x20, x17, %x[ld_in_row]\n"
+ "addvl x22, SP, #3\n"
+ ".inst 0xc1781709 // sdot za.s[x8, 1], { z24.h-z27.h }, z8.h\n"
+ ".inst 0xa1402ae3 // ld1h { z3.h, z11.h }, pn10.b/Z, [x23]\n"
+ "addvl x21, SP, #9\n"
+ "subs x25, x25, #0x1\n"
+ ".inst 0xc17316ea // sdot za.s[x8, 2], { z23.h-z26.h }, z3.h\n"
+ "ld1sb { z23.s }, p0/Z, [x17]\n"
+ "add z23.h, p0/M, z23.h, z7.h\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1sb { z16.s }, p0/Z, [x20]\n"
+ "add z16.h, p0/M, z16.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ ".inst 0xc17b170a // sdot za.s[x8, 2], { z24.h-z27.h }, z11.h\n"
+ "ld1sb { z24.s }, p0/Z, [x20]\n"
+ "add z24.h, p0/M, z24.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0xc1721728 // sdot za.s[x8, 0], { z25.h-z28.h }, z2.h\n"
+ "ld1h { z0.h }, p2/Z, [x24, #2, MUL VL]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "mov x12, #0x4\n"
+ "ld1sb { z1.s }, p0/Z, [x20]\n"
+ ".inst 0xc1701729 // sdot za.s[x8, 1], { z25.h-z28.h }, z0.h\n"
+ "add z1.h, p0/M, z1.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "ld1h { z3.h }, p2/Z, [x23, #2, MUL VL]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ ".inst 0xc173172a // sdot za.s[x8, 2], { z25.h-z28.h }, z3.h\n"
+ "trn1 z23.h, z23.h, z16.h\n"
+ "ld1sb { z25.s }, p0/Z, [x20]\n"
+ "add z25.h, p0/M, z25.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1sb { z3.s }, p0/Z, [x20]\n"
+ "add z3.h, p0/M, z3.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1sb { z26.s }, p0/Z, [x20]\n"
+ "add z26.h, p0/M, z26.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1sb { z30.s }, p0/Z, [x20]\n"
+ "mov x12, #0x8\n"
+ "add z30.h, p0/M, z30.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1sb { z27.s }, p0/Z, [x20]\n"
+ "add z27.h, p0/M, z27.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1sb { z29.s }, p0/Z, [x20]\n"
+ "add z29.h, p0/M, z29.h, z7.h\n"
+ "trn1 z24.h, z24.h, z1.h\n"
+ "trn1 z25.h, z25.h, z3.h\n"
+ "trn1 z26.h, z26.h, z30.h\n"
+ ".inst 0xa0402ac2 // ld1h { z2.h-z3.h }, pn10.b/Z, [x22]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0xc0060c10 // mova { z16.d-z19.d }, za.d[x8, #0]\n"
+ "add x8, x8, #0x1\n"
+ "trn1 z27.h, z27.h, z29.h\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ ".inst 0xc17216e8 // sdot za.s[x8, 0], { z23.h-z26.h }, z2.h\n"
+ "ld1sb { z20.s }, p0/Z, [x20]\n"
+ "mov x12, #0x0\n"
+ "add z20.h, p0/M, z20.h, z7.h\n"
+ ".inst 0xc1731708 // sdot za.s[x8, 0], { z24.h-z27.h }, z3.h\n"
+ ".inst 0xa0402aa2 // ld1h { z2.h-z3.h }, pn10.b/Z, [x21]\n"
+ "add x17, x17, %x[ld_in_col]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ ".inst 0xc17216e9 // sdot za.s[x8, 1], { z23.h-z26.h }, z2.h\n"
+ "ld1sb { z23.s }, p0/Z, [x17]\n"
+ "add z23.h, p0/M, z23.h, z7.h\n"
+ "add x20, x17, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1sb { z8.s }, p0/Z, [x20]\n"
+ "add z8.h, p0/M, z8.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ ".inst 0xc1731709 // sdot za.s[x8, 1], { z24.h-z27.h }, z3.h\n"
+ "ld1sb { z24.s }, p0/Z, [x20]\n"
+ "mov z28.d, z20.d\n"
+ "ld1h { z1.h }, p2/Z, [x22, #2, MUL VL]\n"
+ "add z24.h, p0/M, z24.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1sb { z22.s }, p0/Z, [x20]\n"
+ ".inst 0xc1711728 // sdot za.s[x8, 0], { z25.h-z28.h }, z1.h\n"
+ "mov x12, #0x4\n"
+ "add z22.h, p0/M, z22.h, z7.h\n"
+ "ld1h { z1.h }, p2/Z, [x21, #2, MUL VL]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ ".inst 0xc1711729 // sdot za.s[x8, 1], { z25.h-z28.h }, z1.h\n"
+ "ld1sb { z25.s }, p0/Z, [x20]\n"
+ "add z25.h, p0/M, z25.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1sb { z28.s }, p0/Z, [x20]\n"
+ "add z28.h, p0/M, z28.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1sb { z26.s }, p0/Z, [x20]\n"
+ "add z26.h, p0/M, z26.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1sb { z20.s }, p0/Z, [x20]\n"
+ "mov x12, #0x8\n"
+ "add z20.h, p0/M, z20.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1sb { z27.s }, p0/Z, [x20]\n"
+ ".inst 0xc1a6ac10 // sqdmulh { z16.s-z19.s }, { z16.s-z19.s }, z6.s\n"
+ "add z27.h, p0/M, z27.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1sb { z31.s }, p0/Z, [x20]\n"
+ ".inst 0xc1a4aa30 // srshl { z16.s-z19.s }, { z16.s-z19.s }, z4.s\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0xc0040d84 // mova za.d[x8, #4], { z12.d-z15.d }\n"
+ "add z31.h, p0/M, z31.h, z7.h\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1sb { z1.s }, p0/Z, [x20]\n"
+ "add z1.h, p0/M, z1.h, z7.h\n"
+ ".inst 0xc1aaab10 // add { z16.s-z19.s }, { z16.s-z19.s }, z10.s\n"
+ ".inst 0xa1402be3 // ld1h { z3.h, z11.h }, pn10.b/Z, [SP]\n"
+ "add x17, x17, %x[ld_in_col]\n"
+ ".inst 0xc1b5ccb0 // sclamp { z16.s-z19.s }, z5.s, z21.s\n"
+ "st1b { z16.s }, p1, [x15]\n"
+ "add x15, x15, x13\n"
+ "ld1h { z2.h }, p2/Z, [SP, #2, MUL VL]\n"
+ "st1b { z17.s }, p1, [x14]\n"
+ "add x14, x14, x11\n"
+ "trn1 z23.h, z23.h, z8.h\n"
+ "trn1 z24.h, z24.h, z22.h\n"
+ "st1b { z18.s }, p1, [x10]\n"
+ "add x10, x10, x28\n"
+ "trn1 z25.h, z25.h, z28.h\n"
+ "trn1 z26.h, z26.h, z20.h\n"
+ "st1b { z19.s }, p1, [x9]\n"
+ "add x9, x9, x27\n"
+ "trn1 z27.h, z27.h, z31.h\n"
+ "mov z28.d, z1.d\n"
+ "bgt 20b\n"
+ "21:" // Main loop tail
+ ".inst 0xc17316e8 // sdot za.s[x8, 0], { z23.h-z26.h }, z3.h\n"
+ "addvl x24, SP, #6\n"
+ "addvl x23, SP, #12\n"
+ ".inst 0xc17b1708 // sdot za.s[x8, 0], { z24.h-z27.h }, z11.h\n"
+ ".inst 0xa0402b08 // ld1h { z8.h-z9.h }, pn10.b/Z, [x24]\n"
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ ".inst 0xc17816e9 // sdot za.s[x8, 1], { z23.h-z26.h }, z8.h\n"
+ "add x22, x17, %x[ld_in_row]\n"
+ "addvl x21, SP, #3\n"
+ ".inst 0xc1791709 // sdot za.s[x8, 1], { z24.h-z27.h }, z9.h\n"
+ ".inst 0xa1402ae3 // ld1h { z3.h, z11.h }, pn10.b/Z, [x23]\n"
+ "addvl x20, SP, #9\n"
+ ".inst 0xc17316ea // sdot za.s[x8, 2], { z23.h-z26.h }, z3.h\n"
+ "ld1sb { z29.s }, p0/Z, [x17]\n"
+ "add z29.h, p0/M, z29.h, z7.h\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1sb { z8.s }, p0/Z, [x22]\n"
+ "add z8.h, p0/M, z8.h, z7.h\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ ".inst 0xc17b170a // sdot za.s[x8, 2], { z24.h-z27.h }, z11.h\n"
+ "ld1sb { z30.s }, p0/Z, [x22]\n"
+ "add z30.h, p0/M, z30.h, z7.h\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ ".inst 0xc1721728 // sdot za.s[x8, 0], { z25.h-z28.h }, z2.h\n"
+ "ld1h { z0.h }, p2/Z, [x24, #2, MUL VL]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "mov x12, #0x4\n"
+ "ld1sb { z20.s }, p0/Z, [x22]\n"
+ ".inst 0xc1701729 // sdot za.s[x8, 1], { z25.h-z28.h }, z0.h\n"
+ "add z20.h, p0/M, z20.h, z7.h\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "ld1h { z2.h }, p2/Z, [x23, #2, MUL VL]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ ".inst 0xc172172a // sdot za.s[x8, 2], { z25.h-z28.h }, z2.h\n"
+ "trn1 z29.h, z29.h, z8.h\n"
+ "ld1sb { z31.s }, p0/Z, [x22]\n"
+ "add z31.h, p0/M, z31.h, z7.h\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1sb { z25.s }, p0/Z, [x22]\n"
+ "add z25.h, p0/M, z25.h, z7.h\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1sb { z0.s }, p0/Z, [x22]\n"
+ "add z0.h, p0/M, z0.h, z7.h\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1sb { z17.s }, p0/Z, [x22]\n"
+ "mov x12, #0x8\n"
+ "add z17.h, p0/M, z17.h, z7.h\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1sb { z1.s }, p0/Z, [x22]\n"
+ "add z1.h, p0/M, z1.h, z7.h\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1sb { z28.s }, p0/Z, [x22]\n"
+ "add z28.h, p0/M, z28.h, z7.h\n"
+ "trn1 z30.h, z30.h, z20.h\n"
+ "trn1 z31.h, z31.h, z25.h\n"
+ "trn1 z0.h, z0.h, z17.h\n"
+ ".inst 0xa1402aa3 // ld1h { z3.h, z11.h }, pn10.b/Z, [x21]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ ".inst 0xc0060c18 // mova { z24.d-z27.d }, za.d[x8, #0]\n"
+ "add x8, x8, #0x1\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "trn1 z1.h, z1.h, z28.h\n"
+ ".inst 0xc17317a8 // sdot za.s[x8, 0], { z29.h-z0.h }, z3.h\n"
+ "ld1sb { z22.s }, p0/Z, [x22]\n"
+ ".inst 0xc1a6ac18 // sqdmulh { z24.s-z27.s }, { z24.s-z27.s }, z6.s\n"
+ "add z22.h, p0/M, z22.h, z7.h\n"
+ ".inst 0xc17b17c8 // sdot za.s[x8, 0], { z30.h-z1.h }, z11.h\n"
+ ".inst 0xa1402a83 // ld1h { z3.h, z11.h }, pn10.b/Z, [x20]\n"
+ "add x17, x17, %x[ld_in_col]\n"
+ ".inst 0xc1a4aa38 // srshl { z24.s-z27.s }, { z24.s-z27.s }, z4.s\n"
+ ".inst 0xc17317a9 // sdot za.s[x8, 1], { z29.h-z0.h }, z3.h\n"
+ "mov z2.d, z22.d\n"
+ "ld1h { z9.h }, p2/Z, [x21, #2, MUL VL]\n"
+ ".inst 0xc17b17c9 // sdot za.s[x8, 1], { z30.h-z1.h }, z11.h\n"
+ ".inst 0xc1aaab18 // add { z24.s-z27.s }, { z24.s-z27.s }, z10.s\n"
+ ".inst 0xc17917e8 // sdot za.s[x8, 0], { z31.h-z2.h }, z9.h\n"
+ "ld1h { z8.h }, p2/Z, [x20, #2, MUL VL]\n"
+ ".inst 0xc1b5ccb8 // sclamp { z24.s-z27.s }, z5.s, z21.s\n"
+ "st1b { z24.s }, p1, [x15]\n"
+ "add x15, x15, x13\n"
+ "st1b { z25.s }, p1, [x14]\n"
+ "add x14, x14, x11\n"
+ ".inst 0xc0040d84 // mova za.d[x8, #4], { z12.d-z15.d }\n"
+ ".inst 0xa1402be3 // ld1h { z3.h, z11.h }, pn10.b/Z, [SP]\n"
+ "st1b { z26.s }, p1, [x10]\n"
+ "add x10, x10, x28\n"
+ ".inst 0xc17817e9 // sdot za.s[x8, 1], { z31.h-z2.h }, z8.h\n"
+ "ld1h { z2.h }, p2/Z, [SP, #2, MUL VL]\n"
+ "st1b { z27.s }, p1, [x9]\n"
+ "add x9, x9, x27\n"
+ "22:" // Main loop skip tail
+ "cbz x7, 23f\n" // Skip remainder inputs
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1sb { z24.s }, p0/Z, [x17]\n"
+ "add z24.h, p0/M, z24.h, z7.h\n"
+ "add x20, x17, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1sb { z17.s }, p0/Z, [x20]\n"
+ "add z17.h, p0/M, z17.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1sb { z25.s }, p0/Z, [x20]\n"
+ "add z25.h, p0/M, z25.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1sb { z16.s }, p0/Z, [x20]\n"
+ "add z16.h, p0/M, z16.h, z7.h\n"
+ "mov x12, #0x4\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z24.h, z24.h, z17.h\n"
+ "trn1 z25.h, z25.h, z16.h\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1sb { z26.s }, p0/Z, [x20]\n"
+ "add z26.h, p0/M, z26.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1sb { z17.s }, p0/Z, [x20]\n"
+ "add z17.h, p0/M, z17.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1sb { z27.s }, p0/Z, [x20]\n"
+ "add z27.h, p0/M, z27.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1sb { z16.s }, p0/Z, [x20]\n"
+ "mov x12, #0x8\n"
+ "add z16.h, p0/M, z16.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1sb { z28.s }, p0/Z, [x20]\n"
+ "add z28.h, p0/M, z28.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1sb { z31.s }, p0/Z, [x20]\n"
+ "add z31.h, p0/M, z31.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "trn1 z26.h, z26.h, z17.h\n"
+ "trn1 z27.h, z27.h, z16.h\n"
+ "ld1sb { z0.s }, p0/Z, [x20]\n"
+ "add z0.h, p0/M, z0.h, z7.h\n"
+ "trn1 z28.h, z28.h, z31.h\n"
+ "addvl x21, SP, #6\n"
+ ".inst 0xc1731708 // sdot za.s[x8, 0], { z24.h-z27.h }, z3.h\n"
+ "mov z29.d, z0.d\n"
+ "addvl x20, SP, #12\n"
+ "sub x16, x16, #0x1\n"
+ ".inst 0xc17b1728 // sdot za.s[x8, 0], { z25.h-z28.h }, z11.h\n"
+ ".inst 0xa0402aa8 // ld1h { z8.h-z9.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc1721748 // sdot za.s[x8, 0], { z26.h-z29.h }, z2.h\n"
+ "ld1h { z2.h }, p2/Z, [x21, #2, MUL VL]\n"
+ ".inst 0xc1781709 // sdot za.s[x8, 1], { z24.h-z27.h }, z8.h\n"
+ ".inst 0xc0060c10 // mova { z16.d-z19.d }, za.d[x8, #0]\n"
+ ".inst 0xc1a6ac10 // sqdmulh { z16.s-z19.s }, { z16.s-z19.s }, z6.s\n"
+ ".inst 0xc1791729 // sdot za.s[x8, 1], { z25.h-z28.h }, z9.h\n"
+ ".inst 0xa1402a81 // ld1h { z1.h, z9.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc1a4aa30 // srshl { z16.s-z19.s }, { z16.s-z19.s }, z4.s\n"
+ ".inst 0xc171170a // sdot za.s[x8, 2], { z24.h-z27.h }, z1.h\n"
+ ".inst 0xc1aaab10 // add { z16.s-z19.s }, { z16.s-z19.s }, z10.s\n"
+ ".inst 0xc179172a // sdot za.s[x8, 2], { z25.h-z28.h }, z9.h\n"
+ ".inst 0xc1b5ccb0 // sclamp { z16.s-z19.s }, z5.s, z21.s\n"
+ "st1b { z16.s }, p1, [x15]\n"
+ "add x15, x15, x13\n"
+ ".inst 0xc1721749 // sdot za.s[x8, 1], { z26.h-z29.h }, z2.h\n"
+ "ld1h { z3.h }, p2/Z, [x20, #2, MUL VL]\n"
+ "st1b { z17.s }, p1, [x14]\n"
+ "add x14, x14, x11\n"
+ ".inst 0xc173174a // sdot za.s[x8, 2], { z26.h-z29.h }, z3.h\n"
+ "add x8, x8, #0x1\n"
+ "st1b { z18.s }, p1, [x10]\n"
+ "add x10, x10, x28\n"
+ "st1b { z19.s }, p1, [x9]\n"
+ "add x9, x9, x27\n"
+ ".inst 0xc0040d84 // mova za.d[x8, #4], { z12.d-z15.d }\n"
+ "23:" // Tail input: End
+ "cbz x16, 25f\n"
+ "24:" // Right padding loop
+ ".inst 0xc0060c1c // mova { z28.d-z31.d }, za.d[x8, #0]\n"
+ ".inst 0xc1a6ac1c // sqdmulh { z28.s-z31.s }, { z28.s-z31.s }, z6.s\n"
+ "add x8, x8, #0x1\n"
+ ".inst 0xc1a4aa3c // srshl { z28.s-z31.s }, { z28.s-z31.s }, z4.s\n"
+ "subs x16, x16, #0x1\n"
+ ".inst 0xc0040d84 // mova za.d[x8, #4], { z12.d-z15.d }\n"
+ ".inst 0xc1aaab1c // add { z28.s-z31.s }, { z28.s-z31.s }, z10.s\n"
+ ".inst 0xc1b5ccbc // sclamp { z28.s-z31.s }, z5.s, z21.s\n"
+ "st1b { z28.s }, p1, [x15]\n"
+ "add x15, x15, x13\n"
+ "st1b { z29.s }, p1, [x14]\n"
+ "add x14, x14, x11\n"
+ "st1b { z30.s }, p1, [x10]\n"
+ "add x10, x10, x28\n"
+ "st1b { z31.s }, p1, [x9]\n"
+ "add x9, x9, x27\n"
+ "bgt 24b\n"
+ "25:" // End
+ "ldr x20, [%x[args], %[offsetof_Args_weights]]\n"
+ "incw x20, ALL, MUL #16\n"
+ "incw x20, ALL, MUL #9\n"
+ "str x20, [%x[args], %[offsetof_Args_weights]]\n"
+ "ldr x21, [%x[args], %[offsetof_Args_ld_in_vl]]\n"
+ "incw x6\n"
+ "whilelt p1.s, x6, x5\n"
+ "ldr x20, [%x[args], %[offsetof_Args_inptr]]\n"
+ "add x20, x20, x21\n"
+ "str x20, [%x[args], %[offsetof_Args_inptr]]\n"
+ "ldr x25, [%x[args], %[offsetof_Args_outptrs]]\n"
+ "ldr x24, [%x[args], %[offsetof_Args_ld_out_vls]]\n"
+ "ldp x23, x22, [x25, #0x0]\n"
+ "ldp x21, x20, [x24, #0x0]\n"
+ "add x23, x23, x21\n"
+ "add x22, x22, x20\n"
+ "stp x23, x22, [x25, #0x0]\n"
+ "ldp x23, x22, [x25, #0x10]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x23, x23, x21\n"
+ "add x22, x22, x20\n"
+ "stp x23, x22, [x25, #0x10]\n"
+ "b.any 1b\n"
+ "addvl SP, SP, #15\n"
+ ".inst 0xd503467f // SMSTOP\n"
+ :
+ : [args] "r" (&args), [ld_in_col] "r" (ld_in_col), [ld_in_row] "r" (ld_in_row), [offsetof_Args_current_channel] "I" (offsetof(Args, current_channel)), [offsetof_Args_inptr] "I" (offsetof(Args, inptr)), [offsetof_Args_input_cols] "I" (offsetof(Args, input_cols)), [offsetof_Args_ld_in_vl] "I" (offsetof(Args, ld_in_vl)), [offsetof_Args_ld_out_cols] "I" (offsetof(Args, ld_out_cols)), [offsetof_Args_ld_out_vls] "I" (offsetof(Args, ld_out_vls)), [offsetof_Args_n_channels] "I" (offsetof(Args, n_channels)), [offsetof_Args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_Args_output_cols] "I" (offsetof(Args, output_cols)), [offsetof_Args_pad_bottom] "I" (offsetof(Args, pad_bottom)), [offsetof_Args_pad_left] "I" (offsetof(Args, pad_left)), [offsetof_Args_pad_top] "I" (offsetof(Args, pad_top)), [offsetof_Args_weights] "I" (offsetof(Args, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_bias] "I" (offsetof(arm_gemm::Requantize32, bias)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [offsetof_Requantize32_per_channel_muls] "I" (offsetof(arm_gemm::Requantize32, per_channel_muls)), [offsetof_Requantize32_per_channel_right_shifts] "I" (offsetof(arm_gemm::Requantize32, per_channel_right_shifts)), [offsetof_Requantize32_per_layer_mul] "I" (offsetof(arm_gemm::Requantize32, per_layer_mul)), [offsetof_Requantize32_per_layer_right_shift] "I" (offsetof(arm_gemm::Requantize32, per_layer_right_shift)), [qp] "r" (&qp)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME2)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_3x3_s1_4rows_dot_za.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_3x3_s1_4rows_dot_za.hpp
new file mode 100644
index 0000000000..2e40c75d6b
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_3x3_s1_4rows_dot_za.hpp
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_conv/depthwise/depthwise_planar.hpp"
+
+namespace arm_conv {
+namespace depthwise {
+
+void sme2_u8q_planar_3x3_s1_4rows_dot_za_impl(
+ const uint8_t *inptr,
+ size_t ld_in_row,
+ size_t ld_in_col,
+ size_t ld_in_vl,
+ unsigned int pad_top,
+ unsigned int valid_input_rows,
+ unsigned int pad_left,
+ unsigned int valid_input_cols,
+ const uint8_t *weights,
+ uint8_t **outptrs,
+ const size_t *outlds,
+ const size_t *outvllds,
+ unsigned int output_cols,
+ unsigned int start_channel,
+ unsigned int valid_channels,
+ const arm_gemm::Requantize32 &qp
+);
+
+class sme2_u8q_planar_3x3_s1_4rows_dot_za : public PlanarStrategy<uint8_t, uint8_t>
+{
+ using Parent = PlanarStrategy<uint8_t, uint8_t>;
+
+ public:
+ using return_type = uint8_t;
+ constexpr static auto output_rows = 4u;
+ constexpr static auto kernel_rows = 3u, kernel_cols = 3u;
+ constexpr static auto stride_rows = 1u, stride_cols = 1u;
+ constexpr static auto vl_type = arm_gemm::VLType::SME;
+
+ sme2_u8q_planar_3x3_s1_4rows_dot_za(const CPUInfo *)
+ : Parent(kernel_rows, kernel_cols, stride_rows, stride_cols, output_rows, vl_type)
+ {
+ }
+
+ typename Parent::KernelType get_kernel(void) const override
+ {
+ return sme2_u8q_planar_3x3_s1_4rows_dot_za_impl;
+ }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_3x3_s1_4rows_dot_za/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_3x3_s1_4rows_dot_za/generic.cpp
new file mode 100644
index 0000000000..60c3a1e632
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_3x3_s1_4rows_dot_za/generic.cpp
@@ -0,0 +1,664 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if defined(ARM_COMPUTE_ENABLE_SME2)
+
+#include <algorithm>
+#include <cstddef>
+#include "arm_gemm.hpp"
+
+using arm_gemm::Requantize32;
+
+namespace arm_conv {
+namespace depthwise {
+
+void sme2_u8q_planar_3x3_s1_4rows_dot_za_impl(
+ const uint8_t *inptr,
+ size_t ld_in_row,
+ size_t ld_in_col,
+ size_t ld_in_vl,
+ unsigned int pad_top,
+ unsigned int valid_input_rows,
+ unsigned int pad_left,
+ unsigned int valid_input_cols,
+ const uint8_t *weights,
+ uint8_t **outptrs,
+ const size_t *outlds,
+ const size_t *outvllds,
+ unsigned int output_cols,
+ unsigned int start_channel,
+ unsigned int valid_channels,
+ const arm_gemm::Requantize32 &qp
+)
+{
+ struct Args
+ {
+ const uint8_t *inptr;
+ size_t ld_in_vl;
+ long unsigned int pad_top, pad_bottom, pad_left;
+ const uint8_t *weights;
+ long unsigned int input_cols, output_cols;
+ uint8_t **outptrs;
+ const size_t *ld_out_cols;
+ const size_t *ld_out_vls;
+ long unsigned int current_channel, n_channels;
+ };
+
+ Args args = { inptr, ld_in_vl, pad_top, 6u - std::min(6u, pad_top + valid_input_rows), pad_left, weights, valid_input_cols, output_cols, outptrs, outlds, outvllds, start_channel, valid_channels };
+
+ __asm__ __volatile__(
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "ldr x6, [%x[args], %[offsetof_Args_pad_bottom]]\n"
+ "ptrue p2.b\n"
+ "mov x20, #0x6\n"
+ "ldr x7, [%x[args], %[offsetof_Args_pad_top]]\n"
+ "ld1rh { z21.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_a_offset]]\n"
+ "sub x20, x20, x6\n"
+ ".inst 0x25207812 // ptrue pn10.b\n"
+ "ldr x17, [%x[args], %[offsetof_Args_n_channels]]\n"
+ "whilelt p1.s, XZR, x17\n"
+ "whilelt p9.s, XZR, x20\n"
+ "ld1rw { z15.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_c_offset]]\n"
+ "whilelt p8.s, XZR, x7\n"
+ "addvl SP, SP, #-12\n"
+ "ldr x16, [%x[args], %[offsetof_Args_current_channel]]\n"
+ "neg z21.h, p2/M, z21.h\n"
+ "eor p8.b, p2/Z, p8.b, p9.b\n"
+ "ld1rw { z14.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_per_layer_mul]]\n"
+ "ld1rw { z12.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_per_layer_right_shift]]\n"
+ "ld1rw { z29.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_minval]]\n"
+ "ld1rw { z28.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_maxval]]\n"
+ "1:" // Channel loop
+ "ldr x20, [%x[qp], %[offsetof_Requantize32_bias]]\n"
+ "mov z30.s, #0x0\n"
+ "cbz x20, 2f\n"
+ "ld1w { z30.s }, p1/Z, [x20, x16, LSL #2]\n"
+ "2:" // Load bias: Done
+ "ldr x22, [%x[args], %[offsetof_Args_weights]]\n"
+ "mov x20, x22\n"
+ "ld1b { z10.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #3\n"
+ "ld1rh { z31.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_b_offset]]\n"
+ "mov z7.h, #0x0\n"
+ "sub z10.h, z10.h, z31.h\n"
+ "incw x22\n"
+ "ld1b { z16.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #3\n"
+ "sub z16.h, z16.h, z31.h\n"
+ "trn1 z20.h, z7.h, z10.h\n"
+ "ld1b { z11.s }, p2/Z, [x20]\n"
+ "sub z11.h, z11.h, z31.h\n"
+ "mov x20, x22\n"
+ "trn1 z19.h, z10.h, z16.h\n"
+ "ld1b { z24.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #3\n"
+ "trn1 z26.h, z16.h, z11.h\n"
+ "trn1 z13.h, z11.h, z7.h\n"
+ "ld1b { z11.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #3\n"
+ "sub z24.h, z24.h, z31.h\n"
+ "sub z11.h, z11.h, z31.h\n"
+ "ld1b { z2.s }, p2/Z, [x20]\n"
+ "sub z2.h, z2.h, z31.h\n"
+ "addvl x21, SP, #12\n"
+ "incw x22\n"
+ "addvl x21, x21, #-4\n"
+ "mov x20, x22\n"
+ "st1h { z20.h }, p2, [x21]\n"
+ "trn1 z22.h, z7.h, z24.h\n"
+ "st1h { z19.h }, p2, [x21, #1, MUL VL]\n"
+ "trn1 z1.h, z24.h, z11.h\n"
+ "ld1b { z16.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #3\n"
+ "st1h { z26.h }, p2, [x21, #2, MUL VL]\n"
+ "trn1 z3.h, z11.h, z2.h\n"
+ "ld1b { z0.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #3\n"
+ "st1h { z13.h }, p2, [x21, #3, MUL VL]\n"
+ "trn1 z25.h, z2.h, z7.h\n"
+ "ld1b { z4.s }, p2/Z, [x20]\n"
+ "ldr x20, [%x[qp], %[offsetof_Requantize32_per_channel_muls]]\n"
+ "sub z16.h, z16.h, z31.h\n"
+ "sub z0.h, z0.h, z31.h\n"
+ "addvl x21, x21, #-4\n"
+ "st1h { z22.h }, p2, [x21]\n"
+ "sub z4.h, z4.h, z31.h\n"
+ "st1h { z1.h }, p2, [x21, #1, MUL VL]\n"
+ "mov z31.d, z30.d\n"
+ "st1h { z3.h }, p2, [x21, #2, MUL VL]\n"
+ "trn1 z24.h, z7.h, z16.h\n"
+ "trn1 z18.h, z16.h, z0.h\n"
+ "st1h { z25.h }, p2, [x21, #3, MUL VL]\n"
+ "addvl x21, x21, #-4\n"
+ "trn1 z0.h, z0.h, z4.h\n"
+ "trn1 z1.h, z4.h, z7.h\n"
+ "st1h { z24.h }, p2, [x21]\n"
+ "st1h { z18.h }, p2, [x21, #1, MUL VL]\n"
+ "st1h { z0.h }, p2, [x21, #2, MUL VL]\n"
+ "st1h { z1.h }, p2, [x21, #3, MUL VL]\n"
+ "cbz x20, 3f\n"
+ "ld1w { z14.s }, p1/Z, [x20, x16, LSL #2]\n"
+ "3:" // Load mul: End
+ "ldr x20, [%x[qp], %[offsetof_Requantize32_per_channel_right_shifts]]\n"
+ "cbz x20, 4f\n"
+ "ld1w { z12.s }, p1/Z, [x20, x16, LSL #2]\n"
+ "4:" // Load right_shift: End
+ "ldr x15, [%x[args], %[offsetof_Args_input_cols]]\n"
+ "sub x20, x15, #0x1\n"
+ "orr x23, x20, %x[ld_in_col], LSL #16\n"
+ "ldr x14, [%x[args], %[offsetof_Args_inptr]]\n"
+ "orr x23, x17, x23, LSL #22\n"
+ "mov x22, #0x6\n"
+ "add x21, x7, x6\n"
+ "lsl x20, %x[ld_in_row], #0x0\n"
+ "ldr x13, [%x[args], %[offsetof_Args_output_cols]]\n"
+ "mov x8, #0x0\n"
+ "lsl x23, x23, #0x0\n"
+ "sub x22, x22, x21\n"
+ "madd x20, x20, x7, x14\n"
+ "5:" // Issue prefetches
+ "subs x22, x22, #0x1\n"
+ ".inst 0xf8b74a9c // rprfm pldstrm, x23, [x20]\n"
+ "add x20, x20, %x[ld_in_col]\n"
+ "bgt 5b\n"
+ "ldr x23, [%x[args], %[offsetof_Args_outptrs]]\n"
+ "lsl x20, %x[ld_in_row], #0x0\n"
+ "msub x14, x7, x20, x14\n"
+ ".inst 0xc0040bc0 // mova za.d[x8, #0], { z30.d-z31.d }\n"
+ "ldr x20, [%x[args], %[offsetof_Args_ld_out_cols]]\n"
+ ".inst 0xc0040bc1 // mova za.d[x8, #1], { z30.d-z31.d }\n"
+ "mov x22, #0x2\n"
+ "ldp x11, x10, [x23], #0x10\n"
+ ".inst 0xc0040bc2 // mova za.d[x8, #2], { z30.d-z31.d }\n"
+ "ldp x9, x28, [x20], #0x10\n"
+ ".inst 0xc0040bc3 // mova za.d[x8, #3], { z30.d-z31.d }\n"
+ "ldr x21, [%x[args], %[offsetof_Args_pad_left]]\n"
+ ".inst 0xc0040bc4 // mova za.d[x8, #4], { z30.d-z31.d }\n"
+ "ldp x27, x26, [x23], #0x10\n"
+ ".inst 0xc0040bc5 // mova za.d[x8, #5], { z30.d-z31.d }\n"
+ "ldp x25, x24, [x20], #0x10\n"
+ "cbz x21, 7f\n"
+ "cmp x21, x22\n"
+ "csel x20, x21, x22, LT\n"
+ "sub x21, x21, x20\n"
+ "sub x22, x22, x20\n"
+ "cbz x21, 7f\n"
+ ".inst 0xc0060818 // mova { z24.d-z25.d }, za.d[x8, #0]\n"
+ "sub x13, x13, x21\n"
+ ".inst 0xc006083a // mova { z26.d-z27.d }, za.d[x8, #1]\n"
+ ".inst 0xc1aeac18 // sqdmulh { z24.s-z27.s }, { z24.s-z27.s }, z14.s\n"
+ ".inst 0xc1acaa38 // srshl { z24.s-z27.s }, { z24.s-z27.s }, z12.s\n"
+ ".inst 0xc1afab18 // add { z24.s-z27.s }, { z24.s-z27.s }, z15.s\n"
+ ".inst 0xc1bccfb8 // sclamp { z24.s-z27.s }, z29.s, z28.s\n"
+ "6:" // Left padding
+ "subs x21, x21, #0x1\n"
+ "st1b { z24.s }, p1, [x11]\n"
+ "add x11, x11, x9\n"
+ "st1b { z26.s }, p1, [x10]\n"
+ "add x10, x10, x28\n"
+ "st1b { z25.s }, p1, [x27]\n"
+ "add x27, x27, x25\n"
+ "st1b { z27.s }, p1, [x26]\n"
+ "add x26, x26, x24\n"
+ "bgt 6b\n"
+ "7:" // Left padding: End
+ "adds XZR, x7, x6\n"
+ "bne 12f\n"
+ "cbz x22, 10f\n"
+ "cmp x22, #0x1\n"
+ "sub x15, x15, x22\n"
+ "beq 9f\n"
+ "8:" // Unpadded: 2 priming loads
+ "add x21, x14, %x[ld_in_row]\n"
+ "ld1b { z20.s }, p1/Z, [x14]\n"
+ "addvl x20, SP, #8\n"
+ "ld1b { z16.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z4.h, z20.h, z16.h\n"
+ "add z4.h, z4.h, z21.h\n"
+ "ld1b { z23.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "add x14, x14, %x[ld_in_col]\n"
+ "ld1b { z22.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z5.h, z23.h, z22.h\n"
+ "add z5.h, z5.h, z21.h\n"
+ "ld1b { z17.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "ld1b { z16.s }, p1/Z, [x21]\n"
+ "trn1 z6.h, z17.h, z16.h\n"
+ "add z6.h, z6.h, z21.h\n"
+ ".inst 0xa1402a83 // ld1h { z3.h, z11.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc16b1488 // sdot za.s[x8, 0], { z4.h-z5.h }, z11.h\n"
+ ".inst 0xc1631489 // sdot za.s[x8, 1], { z4.h-z5.h }, z3.h\n"
+ ".inst 0xa1412a80 // ld1h { z0.h, z8.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc16814a8 // sdot za.s[x8, 0], { z5.h-z6.h }, z8.h\n"
+ ".inst 0xc16014a9 // sdot za.s[x8, 1], { z5.h-z6.h }, z0.h\n"
+ "9:" // Unpadded: 1 priming loads
+ "add x22, x14, %x[ld_in_row]\n"
+ "ld1b { z25.s }, p1/Z, [x14]\n"
+ "addvl x21, SP, #4\n"
+ "ld1b { z6.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "trn1 z3.h, z25.h, z6.h\n"
+ "add z3.h, z3.h, z21.h\n"
+ "ld1b { z18.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "addvl x20, SP, #8\n"
+ "ld1b { z26.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "trn1 z4.h, z18.h, z26.h\n"
+ "add z4.h, z4.h, z21.h\n"
+ "ld1b { z2.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "add x14, x14, %x[ld_in_col]\n"
+ "ld1b { z5.s }, p1/Z, [x22]\n"
+ "trn1 z5.h, z2.h, z5.h\n"
+ "add z5.h, z5.h, z21.h\n"
+ ".inst 0xa0402aa0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc1611468 // sdot za.s[x8, 0], { z3.h-z4.h }, z1.h\n"
+ ".inst 0xc1601469 // sdot za.s[x8, 1], { z3.h-z4.h }, z0.h\n"
+ ".inst 0xa1402a82 // ld1h { z2.h, z10.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xa0412aa8 // ld1h { z8.h-z9.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc16a146a // sdot za.s[x8, 2], { z3.h-z4.h }, z10.h\n"
+ ".inst 0xc162146b // sdot za.s[x8, 3], { z3.h-z4.h }, z2.h\n"
+ ".inst 0xc1691488 // sdot za.s[x8, 0], { z4.h-z5.h }, z9.h\n"
+ ".inst 0xc1681489 // sdot za.s[x8, 1], { z4.h-z5.h }, z8.h\n"
+ ".inst 0xa1412a82 // ld1h { z2.h, z10.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc16a148a // sdot za.s[x8, 2], { z4.h-z5.h }, z10.h\n"
+ ".inst 0xc162148b // sdot za.s[x8, 3], { z4.h-z5.h }, z2.h\n"
+ "10:" // Unpadded: 0 priming loads
+ ".inst 0xa1402be5 // ld1h { z5.h, z13.h }, pn10.b/Z, [SP]\n"
+ ".inst 0xa0412bea // ld1h { z10.h-z11.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
+ "cbz x15, 18f\n"
+ "add x20, x14, %x[ld_in_row]\n"
+ "ld1b { z17.s }, p1/Z, [x14]\n"
+ "sub x15, x15, #0x1\n"
+ "ld1b { z9.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z6.h, z17.h, z9.h\n"
+ "sub x13, x13, #0x1\n"
+ "ld1b { z17.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "cmp x15, x13\n"
+ "add z6.h, z6.h, z21.h\n"
+ "ld1b { z7.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z7.h, z17.h, z7.h\n"
+ "csel x23, x15, x13, LT\n"
+ "ld1b { z17.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "add z7.h, z7.h, z21.h\n"
+ "add x14, x14, %x[ld_in_col]\n"
+ "ld1b { z1.s }, p1/Z, [x20]\n"
+ "trn1 z8.h, z17.h, z1.h\n"
+ "add z8.h, z8.h, z21.h\n"
+ "sub x13, x13, x23\n"
+ "cbz x23, 17f\n"
+ "11:" // Unpadded: Main loop
+ ".inst 0xc16d14c8 // sdot za.s[x8, 0], { z6.h-z7.h }, z13.h\n"
+ "addvl x22, SP, #4\n"
+ "addvl x21, SP, #8\n"
+ "ld1b { z2.s }, p1/Z, [x14]\n"
+ ".inst 0xc16514c9 // sdot za.s[x8, 1], { z6.h-z7.h }, z5.h\n"
+ ".inst 0xa1402ac5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x22]\n"
+ "add x20, x14, %x[ld_in_row]\n"
+ "subs x23, x23, #0x1\n"
+ ".inst 0xc16b14e8 // sdot za.s[x8, 0], { z7.h-z8.h }, z11.h\n"
+ "ld1b { z19.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "add x14, x14, %x[ld_in_col]\n"
+ ".inst 0xc16a14e9 // sdot za.s[x8, 1], { z7.h-z8.h }, z10.h\n"
+ ".inst 0xa1412ac3 // ld1h { z3.h, z11.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
+ ".inst 0xc0060818 // mova { z24.d-z25.d }, za.d[x8, #0]\n"
+ "ld1b { z23.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0xc006083a // mova { z26.d-z27.d }, za.d[x8, #1]\n"
+ ".inst 0xc1aeac18 // sqdmulh { z24.s-z27.s }, { z24.s-z27.s }, z14.s\n"
+ ".inst 0xc16d14ca // sdot za.s[x8, 2], { z6.h-z7.h }, z13.h\n"
+ "ld1b { z18.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0xc16514cb // sdot za.s[x8, 3], { z6.h-z7.h }, z5.h\n"
+ ".inst 0xa1402aa1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc1acaa38 // srshl { z24.s-z27.s }, { z24.s-z27.s }, z12.s\n"
+ ".inst 0xc16914cc // sdot za.s[x8, 4], { z6.h-z7.h }, z9.h\n"
+ "ld1b { z17.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0xc16114cd // sdot za.s[x8, 5], { z6.h-z7.h }, z1.h\n"
+ "ld1b { z16.s }, p1/Z, [x20]\n"
+ ".inst 0xc1afab18 // add { z24.s-z27.s }, { z24.s-z27.s }, z15.s\n"
+ ".inst 0xc16b14ea // sdot za.s[x8, 2], { z7.h-z8.h }, z11.h\n"
+ "trn1 z6.h, z2.h, z19.h\n"
+ ".inst 0xa1402be5 // ld1h { z5.h, z13.h }, pn10.b/Z, [SP]\n"
+ ".inst 0xc16314eb // sdot za.s[x8, 3], { z7.h-z8.h }, z3.h\n"
+ ".inst 0xa1412aa1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc1bccfb8 // sclamp { z24.s-z27.s }, z29.s, z28.s\n"
+ ".inst 0xc16914ec // sdot za.s[x8, 4], { z7.h-z8.h }, z9.h\n"
+ "st1b { z24.s }, p1, [x11]\n"
+ "add x11, x11, x9\n"
+ "add z6.h, z6.h, z21.h\n"
+ ".inst 0xc16114ed // sdot za.s[x8, 5], { z7.h-z8.h }, z1.h\n"
+ "trn1 z7.h, z23.h, z18.h\n"
+ "trn1 z8.h, z17.h, z16.h\n"
+ "add x8, x8, #0x2\n"
+ ".inst 0xa0412bea // ld1h { z10.h-z11.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
+ "st1b { z26.s }, p1, [x10]\n"
+ "add x10, x10, x28\n"
+ ".inst 0xc0040bc4 // mova za.d[x8, #4], { z30.d-z31.d }\n"
+ "st1b { z25.s }, p1, [x27]\n"
+ "add x27, x27, x25\n"
+ ".inst 0xc0040bc5 // mova za.d[x8, #5], { z30.d-z31.d }\n"
+ "add z7.h, z7.h, z21.h\n"
+ "st1b { z27.s }, p1, [x26]\n"
+ "add x26, x26, x24\n"
+ "add z8.h, z8.h, z21.h\n"
+ "bgt 11b\n"
+ "b 17f\n"
+ "12:" // Padded
+ "cbz x22, 15f\n"
+ "cmp x22, #0x1\n"
+ "sub x15, x15, x22\n"
+ "beq 14f\n"
+ "13:" // Padded: 2 priming loads
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z19.s }, p0/Z, [x14]\n"
+ "add z19.h, p0/M, z19.h, z21.h\n"
+ "add x20, x14, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z18.s }, p0/Z, [x20]\n"
+ "add z18.h, p0/M, z18.h, z21.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z17.s }, p0/Z, [x20]\n"
+ "add z17.h, p0/M, z17.h, z21.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z16.s }, p0/Z, [x20]\n"
+ "add z16.h, p0/M, z16.h, z21.h\n"
+ "mov x12, #0x4\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z7.h, z19.h, z18.h\n"
+ "trn1 z8.h, z17.h, z16.h\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z17.s }, p0/Z, [x20]\n"
+ "add z17.h, p0/M, z17.h, z21.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z16.s }, p0/Z, [x20]\n"
+ "addvl x20, SP, #8\n"
+ "add z16.h, p0/M, z16.h, z21.h\n"
+ ".inst 0xa1402a82 // ld1h { z2.h, z10.h }, pn10.b/Z, [x20]\n"
+ "trn1 z9.h, z17.h, z16.h\n"
+ ".inst 0xc16a14e8 // sdot za.s[x8, 0], { z7.h-z8.h }, z10.h\n"
+ "add x14, x14, %x[ld_in_col]\n"
+ ".inst 0xc16214e9 // sdot za.s[x8, 1], { z7.h-z8.h }, z2.h\n"
+ ".inst 0xa1412a85 // ld1h { z5.h, z13.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc16d1508 // sdot za.s[x8, 0], { z8.h-z9.h }, z13.h\n"
+ ".inst 0xc1651509 // sdot za.s[x8, 1], { z8.h-z9.h }, z5.h\n"
+ "14:" // Padded: 1 priming loads
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z19.s }, p0/Z, [x14]\n"
+ "add z19.h, p0/M, z19.h, z21.h\n"
+ "add x20, x14, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z18.s }, p0/Z, [x20]\n"
+ "add z18.h, p0/M, z18.h, z21.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z17.s }, p0/Z, [x20]\n"
+ "add z17.h, p0/M, z17.h, z21.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z16.s }, p0/Z, [x20]\n"
+ "add z16.h, p0/M, z16.h, z21.h\n"
+ "mov x12, #0x4\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z22.h, z19.h, z18.h\n"
+ "trn1 z23.h, z17.h, z16.h\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z17.s }, p0/Z, [x20]\n"
+ "add z17.h, p0/M, z17.h, z21.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z16.s }, p0/Z, [x20]\n"
+ "addvl x21, SP, #4\n"
+ "add z16.h, p0/M, z16.h, z21.h\n"
+ ".inst 0xa0402aa0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x21]\n"
+ "addvl x20, SP, #8\n"
+ "trn1 z24.h, z17.h, z16.h\n"
+ ".inst 0xc16116c8 // sdot za.s[x8, 0], { z22.h-z23.h }, z1.h\n"
+ ".inst 0xc16016c9 // sdot za.s[x8, 1], { z22.h-z23.h }, z0.h\n"
+ ".inst 0xa1402a85 // ld1h { z5.h, z13.h }, pn10.b/Z, [x20]\n"
+ "add x14, x14, %x[ld_in_col]\n"
+ ".inst 0xa0412aa0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc16d16ca // sdot za.s[x8, 2], { z22.h-z23.h }, z13.h\n"
+ ".inst 0xc16516cb // sdot za.s[x8, 3], { z22.h-z23.h }, z5.h\n"
+ ".inst 0xc16116e8 // sdot za.s[x8, 0], { z23.h-z24.h }, z1.h\n"
+ ".inst 0xc16016e9 // sdot za.s[x8, 1], { z23.h-z24.h }, z0.h\n"
+ ".inst 0xa0412a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc16116ea // sdot za.s[x8, 2], { z23.h-z24.h }, z1.h\n"
+ ".inst 0xc16016eb // sdot za.s[x8, 3], { z23.h-z24.h }, z0.h\n"
+ "15:" // Padded: 0 priming loads
+ ".inst 0xa1402be5 // ld1h { z5.h, z13.h }, pn10.b/Z, [SP]\n"
+ ".inst 0xa0412bea // ld1h { z10.h-z11.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
+ "cbz x15, 18f\n"
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z19.s }, p0/Z, [x14]\n"
+ "add z19.h, p0/M, z19.h, z21.h\n"
+ "add x20, x14, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z18.s }, p0/Z, [x20]\n"
+ "add z18.h, p0/M, z18.h, z21.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z17.s }, p0/Z, [x20]\n"
+ "add z17.h, p0/M, z17.h, z21.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z16.s }, p0/Z, [x20]\n"
+ "add z16.h, p0/M, z16.h, z21.h\n"
+ "mov x12, #0x4\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z6.h, z19.h, z18.h\n"
+ "trn1 z7.h, z17.h, z16.h\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z17.s }, p0/Z, [x20]\n"
+ "add z17.h, p0/M, z17.h, z21.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z16.s }, p0/Z, [x20]\n"
+ "add z16.h, p0/M, z16.h, z21.h\n"
+ "sub x15, x15, #0x1\n"
+ "sub x13, x13, #0x1\n"
+ "cmp x15, x13\n"
+ "trn1 z8.h, z17.h, z16.h\n"
+ "csel x23, x15, x13, LT\n"
+ "add x14, x14, %x[ld_in_col]\n"
+ "sub x13, x13, x23\n"
+ "cbz x23, 17f\n"
+ "16:" // Padded: Main loop
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z9.s }, p0/Z, [x14]\n"
+ ".inst 0xc16d14c8 // sdot za.s[x8, 0], { z6.h-z7.h }, z13.h\n"
+ ".inst 0xc16514c9 // sdot za.s[x8, 1], { z6.h-z7.h }, z5.h\n"
+ "add z9.h, p0/M, z9.h, z21.h\n"
+ "add x22, x14, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z19.s }, p0/Z, [x22]\n"
+ ".inst 0xc16b14e8 // sdot za.s[x8, 0], { z7.h-z8.h }, z11.h\n"
+ "add z19.h, p0/M, z19.h, z21.h\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ ".inst 0xc16a14e9 // sdot za.s[x8, 1], { z7.h-z8.h }, z10.h\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z18.s }, p0/Z, [x22]\n"
+ "add z18.h, p0/M, z18.h, z21.h\n"
+ ".inst 0xc0060818 // mova { z24.d-z25.d }, za.d[x8, #0]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z16.s }, p0/Z, [x22]\n"
+ ".inst 0xc006083a // mova { z26.d-z27.d }, za.d[x8, #1]\n"
+ "mov x12, #0x4\n"
+ "addvl x21, SP, #4\n"
+ "add z16.h, p0/M, z16.h, z21.h\n"
+ ".inst 0xc1aeac18 // sqdmulh { z24.s-z27.s }, { z24.s-z27.s }, z14.s\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ ".inst 0xa1402aa3 // ld1h { z3.h, z11.h }, pn10.b/Z, [x21]\n"
+ "addvl x20, SP, #8\n"
+ ".inst 0xc16b14ca // sdot za.s[x8, 2], { z6.h-z7.h }, z11.h\n"
+ "subs x23, x23, #0x1\n"
+ "ld1b { z17.s }, p0/Z, [x22]\n"
+ ".inst 0xc16314cb // sdot za.s[x8, 3], { z6.h-z7.h }, z3.h\n"
+ ".inst 0xc1acaa38 // srshl { z24.s-z27.s }, { z24.s-z27.s }, z12.s\n"
+ ".inst 0xa1402a85 // ld1h { z5.h, z13.h }, pn10.b/Z, [x20]\n"
+ "add z17.h, p0/M, z17.h, z21.h\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ ".inst 0xa0412aaa // ld1h { z10.h-z11.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc16d14cc // sdot za.s[x8, 4], { z6.h-z7.h }, z13.h\n"
+ ".inst 0xc1afab18 // add { z24.s-z27.s }, { z24.s-z27.s }, z15.s\n"
+ "ld1b { z2.s }, p0/Z, [x22]\n"
+ ".inst 0xc16514cd // sdot za.s[x8, 5], { z6.h-z7.h }, z5.h\n"
+ "add z2.h, p0/M, z2.h, z21.h\n"
+ "add x14, x14, %x[ld_in_col]\n"
+ ".inst 0xc16b14ea // sdot za.s[x8, 2], { z7.h-z8.h }, z11.h\n"
+ ".inst 0xa1402be5 // ld1h { z5.h, z13.h }, pn10.b/Z, [SP]\n"
+ ".inst 0xc1bccfb8 // sclamp { z24.s-z27.s }, z29.s, z28.s\n"
+ ".inst 0xc16a14eb // sdot za.s[x8, 3], { z7.h-z8.h }, z10.h\n"
+ ".inst 0xa1412a83 // ld1h { z3.h, z11.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ "st1b { z24.s }, p1, [x11]\n"
+ "add x11, x11, x9\n"
+ ".inst 0xc16b14ec // sdot za.s[x8, 4], { z7.h-z8.h }, z11.h\n"
+ "st1b { z26.s }, p1, [x10]\n"
+ "add x10, x10, x28\n"
+ "trn1 z6.h, z9.h, z19.h\n"
+ ".inst 0xc16314ed // sdot za.s[x8, 5], { z7.h-z8.h }, z3.h\n"
+ "add x8, x8, #0x2\n"
+ ".inst 0xa0412bea // ld1h { z10.h-z11.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
+ "st1b { z25.s }, p1, [x27]\n"
+ "add x27, x27, x25\n"
+ "st1b { z27.s }, p1, [x26]\n"
+ "add x26, x26, x24\n"
+ ".inst 0xc0040bc4 // mova za.d[x8, #4], { z30.d-z31.d }\n"
+ ".inst 0xc0040bc5 // mova za.d[x8, #5], { z30.d-z31.d }\n"
+ "trn1 z7.h, z18.h, z16.h\n"
+ "trn1 z8.h, z17.h, z2.h\n"
+ "bgt 16b\n"
+ "17:" // Main loop tail
+ ".inst 0xc16d14c8 // sdot za.s[x8, 0], { z6.h-z7.h }, z13.h\n"
+ "addvl x21, SP, #4\n"
+ "addvl x20, SP, #8\n"
+ ".inst 0xc16514c9 // sdot za.s[x8, 1], { z6.h-z7.h }, z5.h\n"
+ ".inst 0xa0402aa0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc16b14e8 // sdot za.s[x8, 0], { z7.h-z8.h }, z11.h\n"
+ ".inst 0xc16a14e9 // sdot za.s[x8, 1], { z7.h-z8.h }, z10.h\n"
+ ".inst 0xa0412aa2 // ld1h { z2.h-z3.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc0060818 // mova { z24.d-z25.d }, za.d[x8, #0]\n"
+ ".inst 0xc006083a // mova { z26.d-z27.d }, za.d[x8, #1]\n"
+ ".inst 0xc1aeac18 // sqdmulh { z24.s-z27.s }, { z24.s-z27.s }, z14.s\n"
+ ".inst 0xc16114ca // sdot za.s[x8, 2], { z6.h-z7.h }, z1.h\n"
+ ".inst 0xc1acaa38 // srshl { z24.s-z27.s }, { z24.s-z27.s }, z12.s\n"
+ ".inst 0xc16014cb // sdot za.s[x8, 3], { z6.h-z7.h }, z0.h\n"
+ ".inst 0xa1402a81 // ld1h { z1.h, z9.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc1afab18 // add { z24.s-z27.s }, { z24.s-z27.s }, z15.s\n"
+ ".inst 0xc16914cc // sdot za.s[x8, 4], { z6.h-z7.h }, z9.h\n"
+ ".inst 0xc1bccfb8 // sclamp { z24.s-z27.s }, z29.s, z28.s\n"
+ "st1b { z24.s }, p1, [x11]\n"
+ "add x11, x11, x9\n"
+ ".inst 0xc16114cd // sdot za.s[x8, 5], { z6.h-z7.h }, z1.h\n"
+ "st1b { z26.s }, p1, [x10]\n"
+ "add x10, x10, x28\n"
+ ".inst 0xc16314ea // sdot za.s[x8, 2], { z7.h-z8.h }, z3.h\n"
+ "st1b { z25.s }, p1, [x27]\n"
+ "add x27, x27, x25\n"
+ ".inst 0xc16214eb // sdot za.s[x8, 3], { z7.h-z8.h }, z2.h\n"
+ ".inst 0xa0412a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ "st1b { z27.s }, p1, [x26]\n"
+ "add x26, x26, x24\n"
+ ".inst 0xc16114ec // sdot za.s[x8, 4], { z7.h-z8.h }, z1.h\n"
+ ".inst 0xc16014ed // sdot za.s[x8, 5], { z7.h-z8.h }, z0.h\n"
+ "add x8, x8, #0x2\n"
+ ".inst 0xc0040bc4 // mova za.d[x8, #4], { z30.d-z31.d }\n"
+ ".inst 0xc0040bc5 // mova za.d[x8, #5], { z30.d-z31.d }\n"
+ "18:" // Main loop skip tail
+ "cbz x13, 20f\n"
+ "19:" // Right padding loop
+ ".inst 0xc0060804 // mova { z4.d-z5.d }, za.d[x8, #0]\n"
+ "subs x13, x13, #0x1\n"
+ ".inst 0xc0060826 // mova { z6.d-z7.d }, za.d[x8, #1]\n"
+ ".inst 0xc1aeac04 // sqdmulh { z4.s-z7.s }, { z4.s-z7.s }, z14.s\n"
+ "add x8, x8, #0x2\n"
+ ".inst 0xc1acaa24 // srshl { z4.s-z7.s }, { z4.s-z7.s }, z12.s\n"
+ ".inst 0xc0040bc4 // mova za.d[x8, #4], { z30.d-z31.d }\n"
+ ".inst 0xc1afab04 // add { z4.s-z7.s }, { z4.s-z7.s }, z15.s\n"
+ ".inst 0xc0040bc5 // mova za.d[x8, #5], { z30.d-z31.d }\n"
+ ".inst 0xc1bccfa4 // sclamp { z4.s-z7.s }, z29.s, z28.s\n"
+ "st1b { z4.s }, p1, [x11]\n"
+ "add x11, x11, x9\n"
+ "st1b { z6.s }, p1, [x10]\n"
+ "add x10, x10, x28\n"
+ "st1b { z5.s }, p1, [x27]\n"
+ "add x27, x27, x25\n"
+ "st1b { z7.s }, p1, [x26]\n"
+ "add x26, x26, x24\n"
+ "bgt 19b\n"
+ "20:" // End
+ "ldr x20, [%x[args], %[offsetof_Args_weights]]\n"
+ "incw x20, ALL, MUL #9\n"
+ "str x20, [%x[args], %[offsetof_Args_weights]]\n"
+ "incw x16\n"
+ "ldr x21, [%x[args], %[offsetof_Args_ld_in_vl]]\n"
+ "whilelt p1.s, x16, x17\n"
+ "ldr x20, [%x[args], %[offsetof_Args_inptr]]\n"
+ "add x20, x20, x21\n"
+ "str x20, [%x[args], %[offsetof_Args_inptr]]\n"
+ "ldr x25, [%x[args], %[offsetof_Args_outptrs]]\n"
+ "ldr x24, [%x[args], %[offsetof_Args_ld_out_vls]]\n"
+ "ldp x23, x22, [x25, #0x0]\n"
+ "ldp x21, x20, [x24, #0x0]\n"
+ "add x23, x23, x21\n"
+ "add x22, x22, x20\n"
+ "stp x23, x22, [x25, #0x0]\n"
+ "ldp x23, x22, [x25, #0x10]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x23, x23, x21\n"
+ "add x22, x22, x20\n"
+ "stp x23, x22, [x25, #0x10]\n"
+ "b.any 1b\n"
+ "addvl SP, SP, #12\n"
+ ".inst 0xd503467f // SMSTOP\n"
+ :
+ : [args] "r" (&args), [ld_in_col] "r" (ld_in_col), [ld_in_row] "r" (ld_in_row), [offsetof_Args_current_channel] "I" (offsetof(Args, current_channel)), [offsetof_Args_inptr] "I" (offsetof(Args, inptr)), [offsetof_Args_input_cols] "I" (offsetof(Args, input_cols)), [offsetof_Args_ld_in_vl] "I" (offsetof(Args, ld_in_vl)), [offsetof_Args_ld_out_cols] "I" (offsetof(Args, ld_out_cols)), [offsetof_Args_ld_out_vls] "I" (offsetof(Args, ld_out_vls)), [offsetof_Args_n_channels] "I" (offsetof(Args, n_channels)), [offsetof_Args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_Args_output_cols] "I" (offsetof(Args, output_cols)), [offsetof_Args_pad_bottom] "I" (offsetof(Args, pad_bottom)), [offsetof_Args_pad_left] "I" (offsetof(Args, pad_left)), [offsetof_Args_pad_top] "I" (offsetof(Args, pad_top)), [offsetof_Args_weights] "I" (offsetof(Args, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_bias] "I" (offsetof(arm_gemm::Requantize32, bias)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [offsetof_Requantize32_per_channel_muls] "I" (offsetof(arm_gemm::Requantize32, per_channel_muls)), [offsetof_Requantize32_per_channel_right_shifts] "I" (offsetof(arm_gemm::Requantize32, per_channel_right_shifts)), [offsetof_Requantize32_per_layer_mul] "I" (offsetof(arm_gemm::Requantize32, per_layer_mul)), [offsetof_Requantize32_per_layer_right_shift] "I" (offsetof(arm_gemm::Requantize32, per_layer_right_shift)), [qp] "r" (&qp)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME2)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_3x3_s2_4rows_dot_za.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_3x3_s2_4rows_dot_za.hpp
new file mode 100644
index 0000000000..f852e12de1
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_3x3_s2_4rows_dot_za.hpp
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_conv/depthwise/depthwise_planar.hpp"
+
+namespace arm_conv {
+namespace depthwise {
+
+void sme2_u8q_planar_3x3_s2_4rows_dot_za_impl( // forward declaration; kernel body lives in the sibling generic.cpp
+  const uint8_t *inptr, // base pointer to the (quantized u8) input tensor patch
+  size_t ld_in_row, // stride between successive input rows
+  size_t ld_in_col, // stride between successive input columns
+  size_t ld_in_vl, // stride between channel blocks — presumably one SME vector length apart; confirm against caller
+  unsigned int pad_top, // rows of implicit padding above the valid input
+  unsigned int valid_input_rows, // number of input rows that may actually be read
+  unsigned int pad_left, // columns of implicit padding left of the valid input
+  unsigned int valid_input_cols, // number of input columns that may actually be read
+  const uint8_t *weights, // packed 3x3 depthwise filter weights (quantized u8)
+  uint8_t **outptrs, // one output pointer per output row produced
+  const size_t *outlds, // per-output-row column strides
+  const size_t *outvllds, // per-output-row channel-block (vector-length) strides
+  unsigned int output_cols, // number of output columns to compute
+  unsigned int start_channel, // first channel handled by this call
+  unsigned int valid_channels, // total number of channels to process
+  const arm_gemm::Requantize32 &qp // requantization parameters (offsets, per-layer/per-channel muls and shifts, clamp bounds)
+);
+
+class sme2_u8q_planar_3x3_s2_4rows_dot_za : public PlanarStrategy<uint8_t, uint8_t> // strategy wrapper binding the SME2 u8q planar depthwise kernel into the PlanarStrategy interface
+{
+  using Parent = PlanarStrategy<uint8_t, uint8_t>; // u8 in, u8 out (quantized)
+
+  public:
+  using return_type = uint8_t; // outputs are written back as quantized uint8_t
+  constexpr static auto output_rows = 4u; // output rows produced per kernel invocation
+  constexpr static auto kernel_rows = 3u, kernel_cols = 3u; // 3x3 depthwise filter
+  constexpr static auto stride_rows = 2u, stride_cols = 2u; // stride-2 in both dimensions
+  constexpr static auto vl_type = arm_gemm::VLType::SME; // kernel is vector-length agnostic via SME
+
+  sme2_u8q_planar_3x3_s2_4rows_dot_za(const CPUInfo *) // CPUInfo parameter intentionally unnamed/unused here; capability checks presumably happen at strategy selection — confirm
+  : Parent(kernel_rows, kernel_cols, stride_rows, stride_cols, output_rows, vl_type)
+  {
+  }
+
+  typename Parent::KernelType get_kernel(void) const override // hands the raw assembly entry point to the planar driver
+  {
+    return sme2_u8q_planar_3x3_s2_4rows_dot_za_impl;
+  }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_3x3_s2_4rows_dot_za/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_3x3_s2_4rows_dot_za/generic.cpp
new file mode 100644
index 0000000000..e4ce6c74fb
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_3x3_s2_4rows_dot_za/generic.cpp
@@ -0,0 +1,881 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if defined(ARM_COMPUTE_ENABLE_SME2)
+
+#include <algorithm>
+#include <cstddef>
+#include "arm_gemm.hpp"
+
+using arm_gemm::Requantize32;
+
+namespace arm_conv {
+namespace depthwise {
+
+void sme2_u8q_planar_3x3_s2_4rows_dot_za_impl(
+ const uint8_t *inptr,
+ size_t ld_in_row,
+ size_t ld_in_col,
+ size_t ld_in_vl,
+ unsigned int pad_top,
+ unsigned int valid_input_rows,
+ unsigned int pad_left,
+ unsigned int valid_input_cols,
+ const uint8_t *weights,
+ uint8_t **outptrs,
+ const size_t *outlds,
+ const size_t *outvllds,
+ unsigned int output_cols,
+ unsigned int start_channel,
+ unsigned int valid_channels,
+ const arm_gemm::Requantize32 &qp
+)
+{
+ struct Args
+ {
+ const uint8_t *inptr;
+ size_t ld_in_vl;
+ long unsigned int pad_top, pad_bottom, pad_left;
+ const uint8_t *weights;
+ long unsigned int input_cols, output_cols;
+ uint8_t **outptrs;
+ const size_t *ld_out_cols;
+ const size_t *ld_out_vls;
+ long unsigned int current_channel, n_channels;
+ };
+
+ Args args = { inptr, ld_in_vl, pad_top, 9u - std::min(9u, pad_top + valid_input_rows), pad_left, weights, valid_input_cols, output_cols, outptrs, outlds, outvllds, start_channel, valid_channels };
+
+ __asm__ __volatile__(
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "ldr x6, [%x[args], %[offsetof_Args_pad_bottom]]\n"
+ "ptrue p2.b\n"
+ "mov x20, #0x9\n"
+ "ldr x7, [%x[args], %[offsetof_Args_pad_top]]\n"
+ "ld1rh { z11.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_a_offset]]\n"
+ "sub x20, x20, x6\n"
+ ".inst 0x25207812 // ptrue pn10.b\n"
+ "ldr x17, [%x[args], %[offsetof_Args_n_channels]]\n"
+ "whilelt p1.s, XZR, x17\n"
+ "whilelt p9.s, XZR, x20\n"
+ "ld1rw { z13.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_c_offset]]\n"
+ "whilelt p8.s, XZR, x7\n"
+ "addvl SP, SP, #-6\n"
+ "ldr x16, [%x[args], %[offsetof_Args_current_channel]]\n"
+ "neg z11.h, p2/M, z11.h\n"
+ "eor p8.b, p2/Z, p8.b, p9.b\n"
+ "ld1rw { z6.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_per_layer_mul]]\n"
+ "ld1rw { z9.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_per_layer_right_shift]]\n"
+ "ld1rw { z10.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_minval]]\n"
+ "ld1rw { z7.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_maxval]]\n"
+ "1:" // Channel loop
+ "ldr x20, [%x[qp], %[offsetof_Requantize32_bias]]\n"
+ "mov z28.s, #0x0\n"
+ "cbz x20, 2f\n"
+ "ld1w { z28.s }, p1/Z, [x20, x16, LSL #2]\n"
+ "2:" // Load bias: Done
+ "ldr x22, [%x[args], %[offsetof_Args_weights]]\n"
+ "mov x20, x22\n"
+ "ld1b { z26.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #3\n"
+ "ld1rh { z16.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_b_offset]]\n"
+ "sub z26.h, z26.h, z16.h\n"
+ "incw x22\n"
+ "mov z24.h, #0x0\n"
+ "ld1b { z3.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #3\n"
+ "sub z3.h, z3.h, z16.h\n"
+ "trn1 z31.h, z26.h, z3.h\n"
+ "ld1b { z21.s }, p2/Z, [x20]\n"
+ "sub z21.h, z21.h, z16.h\n"
+ "mov x20, x22\n"
+ "trn1 z14.h, z21.h, z24.h\n"
+ "ld1b { z2.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #3\n"
+ "sub z2.h, z2.h, z16.h\n"
+ "addvl x21, SP, #6\n"
+ "ld1b { z25.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #3\n"
+ "sub z25.h, z25.h, z16.h\n"
+ "incw x22\n"
+ "ld1b { z27.s }, p2/Z, [x20]\n"
+ "sub z27.h, z27.h, z16.h\n"
+ "addvl x21, x21, #-2\n"
+ "mov x20, x22\n"
+ "st1h { z31.h }, p2, [x21]\n"
+ "trn1 z4.h, z2.h, z25.h\n"
+ "ld1b { z26.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #3\n"
+ "ld1b { z23.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #3\n"
+ "st1h { z14.h }, p2, [x21, #1, MUL VL]\n"
+ "trn1 z12.h, z27.h, z24.h\n"
+ "ld1b { z20.s }, p2/Z, [x20]\n"
+ "sub z26.h, z26.h, z16.h\n"
+ "sub z23.h, z23.h, z16.h\n"
+ "ldr x20, [%x[qp], %[offsetof_Requantize32_per_channel_muls]]\n"
+ "sub z20.h, z20.h, z16.h\n"
+ "addvl x21, x21, #-2\n"
+ "st1h { z4.h }, p2, [x21]\n"
+ "mov z29.d, z28.d\n"
+ "st1h { z12.h }, p2, [x21, #1, MUL VL]\n"
+ "addvl x21, x21, #-2\n"
+ "mov z30.d, z28.d\n"
+ "mov z31.d, z28.d\n"
+ "trn1 z25.h, z26.h, z23.h\n"
+ "st1h { z25.h }, p2, [x21]\n"
+ "trn1 z3.h, z20.h, z24.h\n"
+ "st1h { z3.h }, p2, [x21, #1, MUL VL]\n"
+ "cbz x20, 3f\n"
+ "ld1w { z6.s }, p1/Z, [x20, x16, LSL #2]\n"
+ "3:" // Load mul: End
+ "ldr x20, [%x[qp], %[offsetof_Requantize32_per_channel_right_shifts]]\n"
+ "cbz x20, 4f\n"
+ "ld1w { z9.s }, p1/Z, [x20, x16, LSL #2]\n"
+ "4:" // Load right_shift: End
+ "ldr x15, [%x[args], %[offsetof_Args_input_cols]]\n"
+ "sub x20, x15, #0x1\n"
+ "orr x23, x20, %x[ld_in_col], LSL #16\n"
+ "ldr x14, [%x[args], %[offsetof_Args_inptr]]\n"
+ "orr x23, x17, x23, LSL #22\n"
+ "mov x22, #0x9\n"
+ "add x21, x7, x6\n"
+ "lsl x20, %x[ld_in_row], #0x0\n"
+ "ldr x13, [%x[args], %[offsetof_Args_output_cols]]\n"
+ "mov x8, #0x0\n"
+ "lsl x23, x23, #0x0\n"
+ "sub x22, x22, x21\n"
+ "madd x20, x20, x7, x14\n"
+ "5:" // Issue prefetches
+ "subs x22, x22, #0x1\n"
+ ".inst 0xf8b74a9c // rprfm pldstrm, x23, [x20]\n"
+ "add x20, x20, %x[ld_in_col]\n"
+ "bgt 5b\n"
+ "ldr x23, [%x[args], %[offsetof_Args_outptrs]]\n"
+ "lsl x20, %x[ld_in_row], #0x0\n"
+ "msub x14, x7, x20, x14\n"
+ ".inst 0xc0040f80 // mova za.d[x8, #0], { z28.d-z31.d }\n"
+ "ldr x20, [%x[args], %[offsetof_Args_ld_out_cols]]\n"
+ ".inst 0xc0040f81 // mova za.d[x8, #1], { z28.d-z31.d }\n"
+ "mov x22, #0x2\n"
+ "ldp x11, x10, [x23], #0x10\n"
+ ".inst 0xc0040f82 // mova za.d[x8, #2], { z28.d-z31.d }\n"
+ "ldp x9, x28, [x20], #0x10\n"
+ "ldr x21, [%x[args], %[offsetof_Args_pad_left]]\n"
+ "ldp x27, x26, [x23], #0x10\n"
+ "ldp x25, x24, [x20], #0x10\n"
+ "cbz x21, 7f\n"
+ "cmp x21, x22\n"
+ "csel x20, x21, x22, LT\n"
+ "sub x21, x21, x20\n"
+ "sub x22, x22, x20\n"
+ "cbz x21, 7f\n"
+ ".inst 0xc0060c18 // mova { z24.d-z27.d }, za.d[x8, #0]\n"
+ ".inst 0xc1a6ac18 // sqdmulh { z24.s-z27.s }, { z24.s-z27.s }, z6.s\n"
+ "and x22, x21, #0x1\n"
+ ".inst 0xc1a9aa38 // srshl { z24.s-z27.s }, { z24.s-z27.s }, z9.s\n"
+ "add x21, x21, #0x1\n"
+ "lsr x21, x21, #0x1\n"
+ ".inst 0xc1adab18 // add { z24.s-z27.s }, { z24.s-z27.s }, z13.s\n"
+ "sub x13, x13, x21\n"
+ ".inst 0xc1a7cd58 // sclamp { z24.s-z27.s }, z10.s, z7.s\n"
+ "6:" // Left padding
+ "subs x21, x21, #0x1\n"
+ "st1b { z24.s }, p1, [x11]\n"
+ "add x11, x11, x9\n"
+ "st1b { z25.s }, p1, [x10]\n"
+ "add x10, x10, x28\n"
+ "st1b { z26.s }, p1, [x27]\n"
+ "add x27, x27, x25\n"
+ "st1b { z27.s }, p1, [x26]\n"
+ "add x26, x26, x24\n"
+ "bgt 6b\n"
+ "7:" // Left padding: End
+ "adds XZR, x7, x6\n"
+ "bne 12f\n"
+ "cbz x22, 10f\n"
+ "cmp x22, #0x1\n"
+ "sub x15, x15, x22\n"
+ "beq 9f\n"
+ "8:" // Unpadded: 2 priming loads
+ "add x21, x14, %x[ld_in_row]\n"
+ "ld1b { z1.s }, p1/Z, [x14]\n"
+ "addvl x20, SP, #4\n"
+ "ld1b { z21.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z1.h, z1.h, z21.h\n"
+ "add z1.h, z1.h, z11.h\n"
+ "ld1b { z2.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "add x14, x14, %x[ld_in_col]\n"
+ "ld1b { z15.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z2.h, z2.h, z15.h\n"
+ "add z2.h, z2.h, z11.h\n"
+ "ld1b { z3.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "ld1b { z21.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z3.h, z3.h, z21.h\n"
+ "add z3.h, z3.h, z11.h\n"
+ "ld1b { z4.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "ld1b { z19.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z4.h, z4.h, z19.h\n"
+ "add z4.h, z4.h, z11.h\n"
+ "ld1b { z8.s }, p1/Z, [x21]\n"
+ "mov z5.d, z8.d\n"
+ "add z5.h, z5.h, z11.h\n"
+ ".inst 0xa1402a80 // ld1h { z0.h, z8.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc1701428 // sdot za.s[x8, 0], { z1.h-z4.h }, z0.h\n"
+ ".inst 0xc1781448 // sdot za.s[x8, 0], { z2.h-z5.h }, z8.h\n"
+ "9:" // Unpadded: 1 priming loads
+ "add x21, x14, %x[ld_in_row]\n"
+ "ld1b { z1.s }, p1/Z, [x14]\n"
+ "addvl x20, SP, #2\n"
+ "ld1b { z21.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z1.h, z1.h, z21.h\n"
+ "add z1.h, z1.h, z11.h\n"
+ "ld1b { z2.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "add x14, x14, %x[ld_in_col]\n"
+ "ld1b { z12.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z2.h, z2.h, z12.h\n"
+ "add z2.h, z2.h, z11.h\n"
+ "ld1b { z3.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "ld1b { z8.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z3.h, z3.h, z8.h\n"
+ "add z3.h, z3.h, z11.h\n"
+ "ld1b { z4.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "ld1b { z5.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z4.h, z4.h, z5.h\n"
+ "add z4.h, z4.h, z11.h\n"
+ "ld1b { z5.s }, p1/Z, [x21]\n"
+ "mov z5.d, z5.d\n"
+ "add z5.h, z5.h, z11.h\n"
+ ".inst 0xa1402a80 // ld1h { z0.h, z8.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc1701428 // sdot za.s[x8, 0], { z1.h-z4.h }, z0.h\n"
+ ".inst 0xc1781448 // sdot za.s[x8, 0], { z2.h-z5.h }, z8.h\n"
+ "10:" // Unpadded: 0 priming loads
+ "cmp x15, #0x2\n"
+ ".inst 0xa0402bee // ld1h { z14.h-z15.h }, pn10.b/Z, [SP]\n"
+ "blt 18f\n"
+ "add x21, x14, %x[ld_in_row]\n"
+ "ld1b { z21.s }, p1/Z, [x14]\n"
+ "sub x15, x15, #0x2\n"
+ "ld1b { z8.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z21.h, z21.h, z8.h\n"
+ "sub x13, x13, #0x1\n"
+ "ld1b { z22.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "lsr x20, x15, #0x1\n"
+ "add z21.h, z21.h, z11.h\n"
+ "ld1b { z25.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z22.h, z22.h, z25.h\n"
+ "cmp x20, x13\n"
+ "ld1b { z23.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "csel x23, x20, x13, LT\n"
+ "add z22.h, z22.h, z11.h\n"
+ "ld1b { z18.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z23.h, z23.h, z18.h\n"
+ "add z23.h, z23.h, z11.h\n"
+ "ld1b { z24.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "add x14, x14, %x[ld_in_col]\n"
+ "ld1b { z19.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z24.h, z24.h, z19.h\n"
+ "add z24.h, z24.h, z11.h\n"
+ "ld1b { z8.s }, p1/Z, [x21]\n"
+ "mov z25.d, z8.d\n"
+ "add z25.h, z25.h, z11.h\n"
+ "and x15, x15, #0x1\n"
+ "sub x13, x13, x23\n"
+ "cbz x23, 17f\n"
+ "11:" // Unpadded: Main loop
+ ".inst 0xc17e16a8 // sdot za.s[x8, 0], { z21.h-z24.h }, z14.h\n"
+ "addvl x20, SP, #4\n"
+ "add x22, x14, %x[ld_in_row]\n"
+ ".inst 0xc17f16c8 // sdot za.s[x8, 0], { z22.h-z25.h }, z15.h\n"
+ ".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
+ "addvl x21, SP, #2\n"
+ "subs x23, x23, #0x1\n"
+ ".inst 0xc17016a9 // sdot za.s[x8, 1], { z21.h-z24.h }, z0.h\n"
+ "ld1b { z21.s }, p1/Z, [x14]\n"
+ "add x14, x14, %x[ld_in_col]\n"
+ "add x20, x14, %x[ld_in_row]\n"
+ "ld1b { z18.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ ".inst 0xc17116c9 // sdot za.s[x8, 1], { z22.h-z25.h }, z1.h\n"
+ "trn1 z21.h, z21.h, z18.h\n"
+ "ld1b { z22.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "add z21.h, z21.h, z11.h\n"
+ ".inst 0xc0060c00 // mova { z0.d-z3.d }, za.d[x8, #0]\n"
+ "ld1b { z8.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "trn1 z22.h, z22.h, z8.h\n"
+ "add z22.h, z22.h, z11.h\n"
+ "ld1b { z23.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "add x8, x8, #0x1\n"
+ ".inst 0xc0040f82 // mova za.d[x8, #2], { z28.d-z31.d }\n"
+ "ld1b { z27.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "trn1 z23.h, z23.h, z27.h\n"
+ "add z23.h, z23.h, z11.h\n"
+ "ld1b { z24.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ ".inst 0xc1a6ac00 // sqdmulh { z0.s-z3.s }, { z0.s-z3.s }, z6.s\n"
+ "ld1b { z8.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "trn1 z24.h, z24.h, z8.h\n"
+ "add z24.h, z24.h, z11.h\n"
+ "ld1b { z4.s }, p1/Z, [x22]\n"
+ "mov z25.d, z4.d\n"
+ "add z25.h, z25.h, z11.h\n"
+ ".inst 0xa1402aa4 // ld1h { z4.h, z12.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc17416a8 // sdot za.s[x8, 0], { z21.h-z24.h }, z4.h\n"
+ ".inst 0xc1a9aa20 // srshl { z0.s-z3.s }, { z0.s-z3.s }, z9.s\n"
+ "ld1b { z21.s }, p1/Z, [x14]\n"
+ ".inst 0xc17c16c8 // sdot za.s[x8, 0], { z22.h-z25.h }, z12.h\n"
+ ".inst 0xc1adab00 // add { z0.s-z3.s }, { z0.s-z3.s }, z13.s\n"
+ "ld1b { z12.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z21.h, z21.h, z12.h\n"
+ ".inst 0xc1a7cd40 // sclamp { z0.s-z3.s }, z10.s, z7.s\n"
+ "ld1b { z22.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "st1b { z0.s }, p1, [x11]\n"
+ "add x11, x11, x9\n"
+ "ld1b { z20.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z22.h, z22.h, z20.h\n"
+ "st1b { z1.s }, p1, [x10]\n"
+ "ld1b { z23.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "add x10, x10, x28\n"
+ "st1b { z2.s }, p1, [x27]\n"
+ "ld1b { z24.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z23.h, z23.h, z24.h\n"
+ "add x27, x27, x25\n"
+ "ld1b { z24.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "st1b { z3.s }, p1, [x26]\n"
+ "add x26, x26, x24\n"
+ "ld1b { z3.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z24.h, z24.h, z3.h\n"
+ "add z21.h, z21.h, z11.h\n"
+ "ld1b { z3.s }, p1/Z, [x20]\n"
+ "mov z25.d, z3.d\n"
+ "add z22.h, z22.h, z11.h\n"
+ "add x14, x14, %x[ld_in_col]\n"
+ ".inst 0xa0402bee // ld1h { z14.h-z15.h }, pn10.b/Z, [SP]\n"
+ "add z23.h, z23.h, z11.h\n"
+ "add z24.h, z24.h, z11.h\n"
+ "add z25.h, z25.h, z11.h\n"
+ "bgt 11b\n"
+ "b 17f\n"
+ "12:" // Padded
+ "cbz x22, 15f\n"
+ "cmp x22, #0x1\n"
+ "sub x15, x15, x22\n"
+ "beq 14f\n"
+ "13:" // Padded: 2 priming loads
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z22.s }, p0/Z, [x14]\n"
+ "add z22.h, p0/M, z22.h, z11.h\n"
+ "add x20, x14, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z17.s }, p0/Z, [x20]\n"
+ "add z17.h, p0/M, z17.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z23.s }, p0/Z, [x20]\n"
+ "add z23.h, p0/M, z23.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z4.s }, p0/Z, [x20]\n"
+ "add z4.h, p0/M, z4.h, z11.h\n"
+ "mov x12, #0x4\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z22.h, z22.h, z17.h\n"
+ "trn1 z23.h, z23.h, z4.h\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z24.s }, p0/Z, [x20]\n"
+ "add z24.h, p0/M, z24.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z18.s }, p0/Z, [x20]\n"
+ "add z18.h, p0/M, z18.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z25.s }, p0/Z, [x20]\n"
+ "add z25.h, p0/M, z25.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z17.s }, p0/Z, [x20]\n"
+ "mov x12, #0x8\n"
+ "add z17.h, p0/M, z17.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z1.s }, p0/Z, [x20]\n"
+ "add z1.h, p0/M, z1.h, z11.h\n"
+ "addvl x20, SP, #4\n"
+ "trn1 z24.h, z24.h, z18.h\n"
+ "trn1 z25.h, z25.h, z17.h\n"
+ ".inst 0xa1402a84 // ld1h { z4.h, z12.h }, pn10.b/Z, [x20]\n"
+ "mov z26.d, z1.d\n"
+ ".inst 0xc17416c8 // sdot za.s[x8, 0], { z22.h-z25.h }, z4.h\n"
+ "add x14, x14, %x[ld_in_col]\n"
+ ".inst 0xc17c16e8 // sdot za.s[x8, 0], { z23.h-z26.h }, z12.h\n"
+ "14:" // Padded: 1 priming loads
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z22.s }, p0/Z, [x14]\n"
+ "add z22.h, p0/M, z22.h, z11.h\n"
+ "add x20, x14, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z17.s }, p0/Z, [x20]\n"
+ "add z17.h, p0/M, z17.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z23.s }, p0/Z, [x20]\n"
+ "add z23.h, p0/M, z23.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z5.s }, p0/Z, [x20]\n"
+ "add z5.h, p0/M, z5.h, z11.h\n"
+ "mov x12, #0x4\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z22.h, z22.h, z17.h\n"
+ "trn1 z23.h, z23.h, z5.h\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z24.s }, p0/Z, [x20]\n"
+ "add z24.h, p0/M, z24.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z18.s }, p0/Z, [x20]\n"
+ "add z18.h, p0/M, z18.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z25.s }, p0/Z, [x20]\n"
+ "add z25.h, p0/M, z25.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z17.s }, p0/Z, [x20]\n"
+ "mov x12, #0x8\n"
+ "add z17.h, p0/M, z17.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z15.s }, p0/Z, [x20]\n"
+ "add z15.h, p0/M, z15.h, z11.h\n"
+ "addvl x20, SP, #2\n"
+ "trn1 z24.h, z24.h, z18.h\n"
+ "trn1 z25.h, z25.h, z17.h\n"
+ ".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
+ "mov z26.d, z15.d\n"
+ ".inst 0xc17016c8 // sdot za.s[x8, 0], { z22.h-z25.h }, z0.h\n"
+ "add x14, x14, %x[ld_in_col]\n"
+ ".inst 0xc17116e8 // sdot za.s[x8, 0], { z23.h-z26.h }, z1.h\n"
+ "15:" // Padded: 0 priming loads
+ "cmp x15, #0x2\n"
+ ".inst 0xa0402bee // ld1h { z14.h-z15.h }, pn10.b/Z, [SP]\n"
+ "blt 18f\n"
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z21.s }, p0/Z, [x14]\n"
+ "add z21.h, p0/M, z21.h, z11.h\n"
+ "add x20, x14, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z18.s }, p0/Z, [x20]\n"
+ "add z18.h, p0/M, z18.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z22.s }, p0/Z, [x20]\n"
+ "add z22.h, p0/M, z22.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z3.s }, p0/Z, [x20]\n"
+ "add z3.h, p0/M, z3.h, z11.h\n"
+ "mov x12, #0x4\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z21.h, z21.h, z18.h\n"
+ "trn1 z22.h, z22.h, z3.h\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z23.s }, p0/Z, [x20]\n"
+ "add z23.h, p0/M, z23.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z19.s }, p0/Z, [x20]\n"
+ "add z19.h, p0/M, z19.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z24.s }, p0/Z, [x20]\n"
+ "add z24.h, p0/M, z24.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z20.s }, p0/Z, [x20]\n"
+ "mov x12, #0x8\n"
+ "add z20.h, p0/M, z20.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z3.s }, p0/Z, [x20]\n"
+ "add z3.h, p0/M, z3.h, z11.h\n"
+ "sub x15, x15, #0x2\n"
+ "sub x13, x13, #0x1\n"
+ "trn1 z23.h, z23.h, z19.h\n"
+ "trn1 z24.h, z24.h, z20.h\n"
+ "lsr x20, x15, #0x1\n"
+ "cmp x20, x13\n"
+ "mov z25.d, z3.d\n"
+ "csel x22, x20, x13, LT\n"
+ "add x14, x14, %x[ld_in_col]\n"
+ "and x15, x15, #0x1\n"
+ "sub x13, x13, x22\n"
+ "cbz x22, 17f\n"
+ "16:" // Padded: Main loop
+ ".inst 0xc17e16a8 // sdot za.s[x8, 0], { z21.h-z24.h }, z14.h\n"
+ "addvl x20, SP, #4\n"
+ "mov x12, #0x0\n"
+ ".inst 0xc17f16c8 // sdot za.s[x8, 0], { z22.h-z25.h }, z15.h\n"
+ ".inst 0xa1402a84 // ld1h { z4.h, z12.h }, pn10.b/Z, [x20]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "add x21, x14, %x[ld_in_row]\n"
+ ".inst 0xc17416a9 // sdot za.s[x8, 1], { z21.h-z24.h }, z4.h\n"
+ "ld1b { z21.s }, p0/Z, [x14]\n"
+ "add z21.h, p0/M, z21.h, z11.h\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z14.s }, p0/Z, [x21]\n"
+ "add z14.h, p0/M, z14.h, z11.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ ".inst 0xc17c16c9 // sdot za.s[x8, 1], { z22.h-z25.h }, z12.h\n"
+ "ld1b { z22.s }, p0/Z, [x21]\n"
+ "add z22.h, p0/M, z22.h, z11.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z15.s }, p0/Z, [x21]\n"
+ "mov x12, #0x4\n"
+ "add z15.h, p0/M, z15.h, z11.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z23.s }, p0/Z, [x21]\n"
+ "add z23.h, p0/M, z23.h, z11.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z17.s }, p0/Z, [x21]\n"
+ "add z17.h, p0/M, z17.h, z11.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z24.s }, p0/Z, [x21]\n"
+ "add z24.h, p0/M, z24.h, z11.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z4.s }, p0/Z, [x21]\n"
+ "add z4.h, p0/M, z4.h, z11.h\n"
+ "mov x12, #0x8\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z21.h, z21.h, z14.h\n"
+ "trn1 z22.h, z22.h, z15.h\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "addvl x20, SP, #2\n"
+ "ld1b { z2.s }, p0/Z, [x21]\n"
+ "trn1 z23.h, z23.h, z17.h\n"
+ "trn1 z24.h, z24.h, z4.h\n"
+ ".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
+ "mov x12, #0x0\n"
+ ".inst 0xc0060c10 // mova { z16.d-z19.d }, za.d[x8, #0]\n"
+ "add x8, x8, #0x1\n"
+ "add z2.h, p0/M, z2.h, z11.h\n"
+ "add x14, x14, %x[ld_in_col]\n"
+ ".inst 0xc17016a8 // sdot za.s[x8, 0], { z21.h-z24.h }, z0.h\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z21.s }, p0/Z, [x14]\n"
+ "add z21.h, p0/M, z21.h, z11.h\n"
+ "add x20, x14, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "mov z25.d, z2.d\n"
+ "ld1b { z20.s }, p0/Z, [x20]\n"
+ "add z20.h, p0/M, z20.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ ".inst 0xc17116c8 // sdot za.s[x8, 0], { z22.h-z25.h }, z1.h\n"
+ "ld1b { z22.s }, p0/Z, [x20]\n"
+ "add z22.h, p0/M, z22.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z4.s }, p0/Z, [x20]\n"
+ "mov x12, #0x4\n"
+ "add z4.h, p0/M, z4.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0xc0040f82 // mova za.d[x8, #2], { z28.d-z31.d }\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z23.s }, p0/Z, [x20]\n"
+ "add z23.h, p0/M, z23.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z27.s }, p0/Z, [x20]\n"
+ "add z27.h, p0/M, z27.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z24.s }, p0/Z, [x20]\n"
+ ".inst 0xc1a6ac10 // sqdmulh { z16.s-z19.s }, { z16.s-z19.s }, z6.s\n"
+ "add z24.h, p0/M, z24.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z12.s }, p0/Z, [x20]\n"
+ "mov x12, #0x8\n"
+ ".inst 0xc1a9aa30 // srshl { z16.s-z19.s }, { z16.s-z19.s }, z9.s\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "add z12.h, p0/M, z12.h, z11.h\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z8.s }, p0/Z, [x20]\n"
+ "add z8.h, p0/M, z8.h, z11.h\n"
+ ".inst 0xc1adab10 // add { z16.s-z19.s }, { z16.s-z19.s }, z13.s\n"
+ "subs x22, x22, #0x1\n"
+ ".inst 0xa0402bee // ld1h { z14.h-z15.h }, pn10.b/Z, [SP]\n"
+ ".inst 0xc1a7cd50 // sclamp { z16.s-z19.s }, z10.s, z7.s\n"
+ "st1b { z16.s }, p1, [x11]\n"
+ "add x11, x11, x9\n"
+ "trn1 z21.h, z21.h, z20.h\n"
+ "st1b { z17.s }, p1, [x10]\n"
+ "add x10, x10, x28\n"
+ "trn1 z22.h, z22.h, z4.h\n"
+ "trn1 z23.h, z23.h, z27.h\n"
+ "st1b { z18.s }, p1, [x27]\n"
+ "add x27, x27, x25\n"
+ "trn1 z24.h, z24.h, z12.h\n"
+ "mov z25.d, z8.d\n"
+ "st1b { z19.s }, p1, [x26]\n"
+ "add x26, x26, x24\n"
+ "add x14, x14, %x[ld_in_col]\n"
+ "bgt 16b\n"
+ "17:" // Main loop tail
+ ".inst 0xc17e16a8 // sdot za.s[x8, 0], { z21.h-z24.h }, z14.h\n"
+ "addvl x20, SP, #4\n"
+ "mov x12, #0x0\n"
+ ".inst 0xc17f16c8 // sdot za.s[x8, 0], { z22.h-z25.h }, z15.h\n"
+ ".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "add x20, x14, %x[ld_in_row]\n"
+ ".inst 0xc17016a9 // sdot za.s[x8, 1], { z21.h-z24.h }, z0.h\n"
+ "ld1b { z0.s }, p0/Z, [x14]\n"
+ "add z0.h, p0/M, z0.h, z11.h\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z14.s }, p0/Z, [x20]\n"
+ "add z14.h, p0/M, z14.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ ".inst 0xc17116c9 // sdot za.s[x8, 1], { z22.h-z25.h }, z1.h\n"
+ "ld1b { z1.s }, p0/Z, [x20]\n"
+ "add z1.h, p0/M, z1.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z12.s }, p0/Z, [x20]\n"
+ "mov x12, #0x4\n"
+ "add z12.h, p0/M, z12.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z2.s }, p0/Z, [x20]\n"
+ "add z2.h, p0/M, z2.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z21.s }, p0/Z, [x20]\n"
+ "add z21.h, p0/M, z21.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z3.s }, p0/Z, [x20]\n"
+ ".inst 0xc0060c10 // mova { z16.d-z19.d }, za.d[x8, #0]\n"
+ "add z3.h, p0/M, z3.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z25.s }, p0/Z, [x20]\n"
+ "mov x12, #0x8\n"
+ ".inst 0xc1a6ac10 // sqdmulh { z16.s-z19.s }, { z16.s-z19.s }, z6.s\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "add z25.h, p0/M, z25.h, z11.h\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z27.s }, p0/Z, [x20]\n"
+ "addvl x20, SP, #2\n"
+ ".inst 0xc1a9aa30 // srshl { z16.s-z19.s }, { z16.s-z19.s }, z9.s\n"
+ "trn1 z0.h, z0.h, z14.h\n"
+ "add x8, x8, #0x1\n"
+ "add z27.h, p0/M, z27.h, z11.h\n"
+ "trn1 z1.h, z1.h, z12.h\n"
+ "trn1 z2.h, z2.h, z21.h\n"
+ "add x14, x14, %x[ld_in_col]\n"
+ "trn1 z3.h, z3.h, z25.h\n"
+ ".inst 0xa0402a8e // ld1h { z14.h-z15.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc1adab10 // add { z16.s-z19.s }, { z16.s-z19.s }, z13.s\n"
+ "mov z4.d, z27.d\n"
+ ".inst 0xc17e1408 // sdot za.s[x8, 0], { z0.h-z3.h }, z14.h\n"
+ ".inst 0xc1a7cd50 // sclamp { z16.s-z19.s }, z10.s, z7.s\n"
+ "st1b { z16.s }, p1, [x11]\n"
+ "add x11, x11, x9\n"
+ ".inst 0xc0040f82 // mova za.d[x8, #2], { z28.d-z31.d }\n"
+ "st1b { z17.s }, p1, [x10]\n"
+ "add x10, x10, x28\n"
+ ".inst 0xc17f1428 // sdot za.s[x8, 0], { z1.h-z4.h }, z15.h\n"
+ ".inst 0xa0402bee // ld1h { z14.h-z15.h }, pn10.b/Z, [SP]\n"
+ "st1b { z18.s }, p1, [x27]\n"
+ "add x27, x27, x25\n"
+ "st1b { z19.s }, p1, [x26]\n"
+ "add x26, x26, x24\n"
+ "18:" // Main loop skip tail
+ "cbz x15, 19f\n" // Skip remainder inputs
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z21.s }, p0/Z, [x14]\n"
+ "add z21.h, p0/M, z21.h, z11.h\n"
+ "add x20, x14, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z17.s }, p0/Z, [x20]\n"
+ "add z17.h, p0/M, z17.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z22.s }, p0/Z, [x20]\n"
+ "add z22.h, p0/M, z22.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z0.s }, p0/Z, [x20]\n"
+ "add z0.h, p0/M, z0.h, z11.h\n"
+ "mov x12, #0x4\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z21.h, z21.h, z17.h\n"
+ "trn1 z22.h, z22.h, z0.h\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z23.s }, p0/Z, [x20]\n"
+ "add z23.h, p0/M, z23.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z17.s }, p0/Z, [x20]\n"
+ "add z17.h, p0/M, z17.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z24.s }, p0/Z, [x20]\n"
+ "add z24.h, p0/M, z24.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z5.s }, p0/Z, [x20]\n"
+ "mov x12, #0x8\n"
+ "add z5.h, p0/M, z5.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z4.s }, p0/Z, [x20]\n"
+ "add z4.h, p0/M, z4.h, z11.h\n"
+ "trn1 z23.h, z23.h, z17.h\n"
+ "trn1 z24.h, z24.h, z5.h\n"
+ "mov z25.d, z4.d\n"
+ "addvl x20, SP, #4\n"
+ ".inst 0xc17e16a8 // sdot za.s[x8, 0], { z21.h-z24.h }, z14.h\n"
+ "sub x13, x13, #0x1\n"
+ ".inst 0xc17f16c8 // sdot za.s[x8, 0], { z22.h-z25.h }, z15.h\n"
+ ".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc0060c10 // mova { z16.d-z19.d }, za.d[x8, #0]\n"
+ ".inst 0xc1a6ac10 // sqdmulh { z16.s-z19.s }, { z16.s-z19.s }, z6.s\n"
+ ".inst 0xc1a9aa30 // srshl { z16.s-z19.s }, { z16.s-z19.s }, z9.s\n"
+ ".inst 0xc17016a9 // sdot za.s[x8, 1], { z21.h-z24.h }, z0.h\n"
+ ".inst 0xc1adab10 // add { z16.s-z19.s }, { z16.s-z19.s }, z13.s\n"
+ ".inst 0xc17116c9 // sdot za.s[x8, 1], { z22.h-z25.h }, z1.h\n"
+ "add x8, x8, #0x1\n"
+ ".inst 0xc1a7cd50 // sclamp { z16.s-z19.s }, z10.s, z7.s\n"
+ "st1b { z16.s }, p1, [x11]\n"
+ "add x11, x11, x9\n"
+ ".inst 0xc0040f82 // mova za.d[x8, #2], { z28.d-z31.d }\n"
+ "st1b { z17.s }, p1, [x10]\n"
+ "add x10, x10, x28\n"
+ "st1b { z18.s }, p1, [x27]\n"
+ "add x27, x27, x25\n"
+ "st1b { z19.s }, p1, [x26]\n"
+ "add x26, x26, x24\n"
+ "19:" // Tail input: End
+ "cbz x13, 21f\n"
+ "20:" // Right padding loop
+ ".inst 0xc0060c00 // mova { z0.d-z3.d }, za.d[x8, #0]\n"
+ ".inst 0xc1a6ac00 // sqdmulh { z0.s-z3.s }, { z0.s-z3.s }, z6.s\n"
+ "add x8, x8, #0x1\n"
+ ".inst 0xc1a9aa20 // srshl { z0.s-z3.s }, { z0.s-z3.s }, z9.s\n"
+ "subs x13, x13, #0x1\n"
+ ".inst 0xc0040f82 // mova za.d[x8, #2], { z28.d-z31.d }\n"
+ ".inst 0xc1adab00 // add { z0.s-z3.s }, { z0.s-z3.s }, z13.s\n"
+ ".inst 0xc1a7cd40 // sclamp { z0.s-z3.s }, z10.s, z7.s\n"
+ "st1b { z0.s }, p1, [x11]\n"
+ "add x11, x11, x9\n"
+ "st1b { z1.s }, p1, [x10]\n"
+ "add x10, x10, x28\n"
+ "st1b { z2.s }, p1, [x27]\n"
+ "add x27, x27, x25\n"
+ "st1b { z3.s }, p1, [x26]\n"
+ "add x26, x26, x24\n"
+ "bgt 20b\n"
+ "21:" // End
+ "ldr x20, [%x[args], %[offsetof_Args_weights]]\n"
+ "incw x20, ALL, MUL #9\n"
+ "str x20, [%x[args], %[offsetof_Args_weights]]\n"
+ "incw x16\n"
+ "ldr x21, [%x[args], %[offsetof_Args_ld_in_vl]]\n"
+ "whilelt p1.s, x16, x17\n"
+ "ldr x20, [%x[args], %[offsetof_Args_inptr]]\n"
+ "add x20, x20, x21\n"
+ "str x20, [%x[args], %[offsetof_Args_inptr]]\n"
+ "ldr x25, [%x[args], %[offsetof_Args_outptrs]]\n"
+ "ldr x24, [%x[args], %[offsetof_Args_ld_out_vls]]\n"
+ "ldp x23, x22, [x25, #0x0]\n"
+ "ldp x21, x20, [x24, #0x0]\n"
+ "add x23, x23, x21\n"
+ "add x22, x22, x20\n"
+ "stp x23, x22, [x25, #0x0]\n"
+ "ldp x23, x22, [x25, #0x10]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x23, x23, x21\n"
+ "add x22, x22, x20\n"
+ "stp x23, x22, [x25, #0x10]\n"
+ "b.any 1b\n"
+ "addvl SP, SP, #6\n"
+ ".inst 0xd503467f // SMSTOP\n"
+ :
+ : [args] "r" (&args), [ld_in_col] "r" (ld_in_col), [ld_in_row] "r" (ld_in_row), [offsetof_Args_current_channel] "I" (offsetof(Args, current_channel)), [offsetof_Args_inptr] "I" (offsetof(Args, inptr)), [offsetof_Args_input_cols] "I" (offsetof(Args, input_cols)), [offsetof_Args_ld_in_vl] "I" (offsetof(Args, ld_in_vl)), [offsetof_Args_ld_out_cols] "I" (offsetof(Args, ld_out_cols)), [offsetof_Args_ld_out_vls] "I" (offsetof(Args, ld_out_vls)), [offsetof_Args_n_channels] "I" (offsetof(Args, n_channels)), [offsetof_Args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_Args_output_cols] "I" (offsetof(Args, output_cols)), [offsetof_Args_pad_bottom] "I" (offsetof(Args, pad_bottom)), [offsetof_Args_pad_left] "I" (offsetof(Args, pad_left)), [offsetof_Args_pad_top] "I" (offsetof(Args, pad_top)), [offsetof_Args_weights] "I" (offsetof(Args, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_bias] "I" (offsetof(arm_gemm::Requantize32, bias)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [offsetof_Requantize32_per_channel_muls] "I" (offsetof(arm_gemm::Requantize32, per_channel_muls)), [offsetof_Requantize32_per_channel_right_shifts] "I" (offsetof(arm_gemm::Requantize32, per_channel_right_shifts)), [offsetof_Requantize32_per_layer_mul] "I" (offsetof(arm_gemm::Requantize32, per_layer_mul)), [offsetof_Requantize32_per_layer_right_shift] "I" (offsetof(arm_gemm::Requantize32, per_layer_right_shift)), [qp] "r" (&qp)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME2)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_5x5_s1_4rows_dot_za.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_5x5_s1_4rows_dot_za.hpp
new file mode 100644
index 0000000000..d8b87dcd55
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_5x5_s1_4rows_dot_za.hpp
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_conv/depthwise/depthwise_planar.hpp"
+
+namespace arm_conv {
+namespace depthwise {
+
+void sme2_u8q_planar_5x5_s1_4rows_dot_za_impl(
+ const uint8_t *inptr,
+ size_t ld_in_row,
+ size_t ld_in_col,
+ size_t ld_in_vl,
+ unsigned int pad_top,
+ unsigned int valid_input_rows,
+ unsigned int pad_left,
+ unsigned int valid_input_cols,
+ const uint8_t *weights,
+ uint8_t **outptrs,
+ const size_t *outlds,
+ const size_t *outvllds,
+ unsigned int output_cols,
+ unsigned int start_channel,
+ unsigned int valid_channels,
+ const arm_gemm::Requantize32 &qp
+);
+
+class sme2_u8q_planar_5x5_s1_4rows_dot_za : public PlanarStrategy<uint8_t, uint8_t>
+{
+ using Parent = PlanarStrategy<uint8_t, uint8_t>;
+
+ public:
+ using return_type = uint8_t;
+ constexpr static auto output_rows = 4u;
+ constexpr static auto kernel_rows = 5u, kernel_cols = 5u;
+ constexpr static auto stride_rows = 1u, stride_cols = 1u;
+ constexpr static auto vl_type = arm_gemm::VLType::SME;
+
+ sme2_u8q_planar_5x5_s1_4rows_dot_za(const CPUInfo *)
+ : Parent(kernel_rows, kernel_cols, stride_rows, stride_cols, output_rows, vl_type)
+ {
+ }
+
+ typename Parent::KernelType get_kernel(void) const override
+ {
+ return sme2_u8q_planar_5x5_s1_4rows_dot_za_impl;
+ }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_5x5_s1_4rows_dot_za/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_5x5_s1_4rows_dot_za/generic.cpp
new file mode 100644
index 0000000000..d33ef764ef
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_5x5_s1_4rows_dot_za/generic.cpp
@@ -0,0 +1,1204 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if defined(ARM_COMPUTE_ENABLE_SME2)
+
+#include <algorithm>
+#include <cstddef>
+#include "arm_gemm.hpp"
+
+using arm_gemm::Requantize32;
+
+namespace arm_conv {
+namespace depthwise {
+
+void sme2_u8q_planar_5x5_s1_4rows_dot_za_impl(
+ const uint8_t *inptr,
+ size_t ld_in_row,
+ size_t ld_in_col,
+ size_t ld_in_vl,
+ unsigned int pad_top,
+ unsigned int valid_input_rows,
+ unsigned int pad_left,
+ unsigned int valid_input_cols,
+ const uint8_t *weights,
+ uint8_t **outptrs,
+ const size_t *outlds,
+ const size_t *outvllds,
+ unsigned int output_cols,
+ unsigned int start_channel,
+ unsigned int valid_channels,
+ const arm_gemm::Requantize32 &qp
+)
+{
+ struct Args
+ {
+ const uint8_t *inptr;
+ size_t ld_in_vl;
+ long unsigned int pad_top, pad_bottom, pad_left;
+ const uint8_t *weights;
+ long unsigned int input_cols, output_cols;
+ uint8_t **outptrs;
+ const size_t *ld_out_cols;
+ const size_t *ld_out_vls;
+ long unsigned int current_channel, n_channels;
+ };
+
+ Args args = { inptr, ld_in_vl, pad_top, 8u - std::min(8u, pad_top + valid_input_rows), pad_left, weights, valid_input_cols, output_cols, outptrs, outlds, outvllds, start_channel, valid_channels };
+
+ __asm__ __volatile__(
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "ldr x5, [%x[args], %[offsetof_Args_pad_bottom]]\n"
+ "ptrue p2.b\n"
+ "mov x20, #0x8\n"
+ "ldr x6, [%x[args], %[offsetof_Args_pad_top]]\n"
+ "ld1rh { z17.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_a_offset]]\n"
+ "sub x20, x20, x5\n"
+ ".inst 0x25207812 // ptrue pn10.b\n"
+ "ldr x7, [%x[args], %[offsetof_Args_n_channels]]\n"
+ "whilelt p1.s, XZR, x7\n"
+ "whilelt p9.s, XZR, x20\n"
+ "ld1rw { z12.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_c_offset]]\n"
+ "whilelt p8.s, XZR, x6\n"
+ "addvl SP, SP, #-30\n"
+ "ldr x17, [%x[args], %[offsetof_Args_current_channel]]\n"
+ "neg z17.h, p2/M, z17.h\n"
+ "eor p8.b, p2/Z, p8.b, p9.b\n"
+ "ld1rw { z7.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_per_layer_mul]]\n"
+ "ld1rw { z4.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_per_layer_right_shift]]\n"
+ "ld1rw { z24.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_minval]]\n"
+ "ld1rw { z16.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_maxval]]\n"
+ "1:" // Channel loop
+ "ldr x20, [%x[qp], %[offsetof_Requantize32_bias]]\n"
+ "mov z18.s, #0x0\n"
+ "cbz x20, 2f\n"
+ "ld1w { z18.s }, p1/Z, [x20, x17, LSL #2]\n"
+ "2:" // Load bias: Done
+ "ldr x23, [%x[args], %[offsetof_Args_weights]]\n"
+ "mov x20, x23\n"
+ "ld1b { z2.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "ld1rh { z3.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_b_offset]]\n"
+ "mov z15.h, #0x0\n"
+ "sub z2.h, z2.h, z3.h\n"
+ "incw x23\n"
+ "ld1b { z13.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "sub z13.h, z13.h, z3.h\n"
+ "trn1 z11.h, z15.h, z2.h\n"
+ "ld1b { z27.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "sub z27.h, z27.h, z3.h\n"
+ "trn1 z0.h, z2.h, z13.h\n"
+ "ld1b { z19.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "sub z19.h, z19.h, z3.h\n"
+ "trn1 z26.h, z13.h, z27.h\n"
+ "ld1b { z14.s }, p2/Z, [x20]\n"
+ "sub z14.h, z14.h, z3.h\n"
+ "mov x20, x23\n"
+ "trn1 z10.h, z27.h, z19.h\n"
+ "ld1b { z9.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "trn1 z19.h, z19.h, z14.h\n"
+ "trn1 z1.h, z14.h, z15.h\n"
+ "ld1b { z5.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "sub z9.h, z9.h, z3.h\n"
+ "sub z5.h, z5.h, z3.h\n"
+ "ld1b { z29.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "sub z29.h, z29.h, z3.h\n"
+ "addvl x22, SP, #30\n"
+ "ld1b { z2.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "incw x23\n"
+ "sub z2.h, z2.h, z3.h\n"
+ "ld1b { z23.s }, p2/Z, [x20]\n"
+ "addvl x22, x22, #-6\n"
+ "sub z23.h, z23.h, z3.h\n"
+ "mov x20, x23\n"
+ "st1h { z11.h }, p2, [x22]\n"
+ "trn1 z20.h, z15.h, z9.h\n"
+ "incw x23\n"
+ "ldr x21, [%x[qp], %[offsetof_Requantize32_per_channel_muls]]\n"
+ "st1h { z0.h }, p2, [x22, #1, MUL VL]\n"
+ "trn1 z22.h, z9.h, z5.h\n"
+ "ld1b { z25.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "st1h { z26.h }, p2, [x22, #2, MUL VL]\n"
+ "trn1 z9.h, z5.h, z29.h\n"
+ "ld1b { z21.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "st1h { z10.h }, p2, [x22, #3, MUL VL]\n"
+ "trn1 z26.h, z29.h, z2.h\n"
+ "ld1b { z0.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "st1h { z19.h }, p2, [x22, #4, MUL VL]\n"
+ "trn1 z28.h, z2.h, z23.h\n"
+ "ld1b { z19.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "st1h { z1.h }, p2, [x22, #5, MUL VL]\n"
+ "trn1 z2.h, z23.h, z15.h\n"
+ "sub z25.h, z25.h, z3.h\n"
+ "addvl x22, x22, #-6\n"
+ "sub z21.h, z21.h, z3.h\n"
+ "ld1b { z6.s }, p2/Z, [x20]\n"
+ "sub z0.h, z0.h, z3.h\n"
+ "mov x20, x23\n"
+ "sub z19.h, z19.h, z3.h\n"
+ "sub z6.h, z6.h, z3.h\n"
+ "st1h { z20.h }, p2, [x22]\n"
+ "incw x23\n"
+ "st1h { z22.h }, p2, [x22, #1, MUL VL]\n"
+ "trn1 z11.h, z15.h, z25.h\n"
+ "trn1 z10.h, z25.h, z21.h\n"
+ "ld1b { z5.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "st1h { z9.h }, p2, [x22, #2, MUL VL]\n"
+ "trn1 z14.h, z21.h, z0.h\n"
+ "ld1b { z23.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "st1h { z26.h }, p2, [x22, #3, MUL VL]\n"
+ "trn1 z21.h, z0.h, z19.h\n"
+ "ld1b { z27.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "st1h { z28.h }, p2, [x22, #4, MUL VL]\n"
+ "trn1 z19.h, z19.h, z6.h\n"
+ "ld1b { z29.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "st1h { z2.h }, p2, [x22, #5, MUL VL]\n"
+ "trn1 z13.h, z6.h, z15.h\n"
+ "sub z5.h, z5.h, z3.h\n"
+ "sub z23.h, z23.h, z3.h\n"
+ "ld1b { z1.s }, p2/Z, [x20]\n"
+ "addvl x22, x22, #-6\n"
+ "sub z27.h, z27.h, z3.h\n"
+ "sub z29.h, z29.h, z3.h\n"
+ "mov x20, x23\n"
+ "st1h { z11.h }, p2, [x22]\n"
+ "sub z1.h, z1.h, z3.h\n"
+ "st1h { z10.h }, p2, [x22, #1, MUL VL]\n"
+ "trn1 z30.h, z15.h, z5.h\n"
+ "trn1 z26.h, z5.h, z23.h\n"
+ "ld1b { z11.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "st1h { z14.h }, p2, [x22, #2, MUL VL]\n"
+ "trn1 z22.h, z23.h, z27.h\n"
+ "ld1b { z5.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "st1h { z21.h }, p2, [x22, #3, MUL VL]\n"
+ "trn1 z28.h, z27.h, z29.h\n"
+ "ld1b { z8.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "st1h { z19.h }, p2, [x22, #4, MUL VL]\n"
+ "trn1 z27.h, z29.h, z1.h\n"
+ "ld1b { z9.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "st1h { z13.h }, p2, [x22, #5, MUL VL]\n"
+ "trn1 z2.h, z1.h, z15.h\n"
+ "ld1b { z14.s }, p2/Z, [x20]\n"
+ "sub z11.h, z11.h, z3.h\n"
+ "addvl x22, x22, #-6\n"
+ "sub z5.h, z5.h, z3.h\n"
+ "sub z8.h, z8.h, z3.h\n"
+ "st1h { z30.h }, p2, [x22]\n"
+ "sub z9.h, z9.h, z3.h\n"
+ "sub z14.h, z14.h, z3.h\n"
+ "st1h { z26.h }, p2, [x22, #1, MUL VL]\n"
+ "st1h { z22.h }, p2, [x22, #2, MUL VL]\n"
+ "mov z19.d, z18.d\n"
+ "trn1 z22.h, z15.h, z11.h\n"
+ "st1h { z28.h }, p2, [x22, #3, MUL VL]\n"
+ "trn1 z1.h, z11.h, z5.h\n"
+ "trn1 z31.h, z5.h, z8.h\n"
+ "st1h { z27.h }, p2, [x22, #4, MUL VL]\n"
+ "trn1 z8.h, z8.h, z9.h\n"
+ "trn1 z21.h, z9.h, z14.h\n"
+ "st1h { z2.h }, p2, [x22, #5, MUL VL]\n"
+ "addvl x22, x22, #-6\n"
+ "trn1 z15.h, z14.h, z15.h\n"
+ "st1h { z22.h }, p2, [x22]\n"
+ "st1h { z1.h }, p2, [x22, #1, MUL VL]\n"
+ "st1h { z31.h }, p2, [x22, #2, MUL VL]\n"
+ "st1h { z8.h }, p2, [x22, #3, MUL VL]\n"
+ "st1h { z21.h }, p2, [x22, #4, MUL VL]\n"
+ "st1h { z15.h }, p2, [x22, #5, MUL VL]\n"
+ "cbz x21, 3f\n"
+ "ld1w { z7.s }, p1/Z, [x21, x17, LSL #2]\n"
+ "3:" // Load mul: End
+ "ldr x20, [%x[qp], %[offsetof_Requantize32_per_channel_right_shifts]]\n"
+ "cbz x20, 4f\n"
+ "ld1w { z4.s }, p1/Z, [x20, x17, LSL #2]\n"
+ "4:" // Load right_shift: End
+ "ldr x25, [%x[args], %[offsetof_Args_input_cols]]\n"
+ "sub x20, x25, #0x1\n"
+ "orr x23, x20, %x[ld_in_col], LSL #16\n"
+ "ldr x16, [%x[args], %[offsetof_Args_inptr]]\n"
+ "orr x23, x7, x23, LSL #22\n"
+ "mov x22, #0x8\n"
+ "add x21, x6, x5\n"
+ "lsl x20, %x[ld_in_row], #0x0\n"
+ "ldr x15, [%x[args], %[offsetof_Args_output_cols]]\n"
+ "mov x11, #0x0\n"
+ "mov x8, #0x8\n"
+ "lsl x23, x23, #0x0\n"
+ "sub x22, x22, x21\n"
+ "madd x20, x20, x6, x16\n"
+ "5:" // Issue prefetches
+ "subs x22, x22, #0x1\n"
+ ".inst 0xf8b74a9c // rprfm pldstrm, x23, [x20]\n"
+ "add x20, x20, %x[ld_in_col]\n"
+ "bgt 5b\n"
+ "ldr x23, [%x[args], %[offsetof_Args_outptrs]]\n"
+ "lsl x20, %x[ld_in_row], #0x0\n"
+ "msub x16, x6, x20, x16\n"
+ ".inst 0xc0046a40 // mova za.d[x11, #0], { z18.d-z19.d }\n"
+ "ldr x20, [%x[args], %[offsetof_Args_ld_out_cols]]\n"
+ ".inst 0xc0046a41 // mova za.d[x11, #1], { z18.d-z19.d }\n"
+ "mov x22, #0x4\n"
+ "ldp x14, x13, [x23], #0x10\n"
+ ".inst 0xc0046a42 // mova za.d[x11, #2], { z18.d-z19.d }\n"
+ "ldp x4, x10, [x20], #0x10\n"
+ ".inst 0xc0046a43 // mova za.d[x11, #3], { z18.d-z19.d }\n"
+ "ldr x21, [%x[args], %[offsetof_Args_pad_left]]\n"
+ ".inst 0xc0046a44 // mova za.d[x11, #4], { z18.d-z19.d }\n"
+ "ldp x9, x28, [x23], #0x10\n"
+ ".inst 0xc0046a45 // mova za.d[x11, #5], { z18.d-z19.d }\n"
+ "ldp x27, x26, [x20], #0x10\n"
+ ".inst 0xc0046a46 // mova za.d[x11, #6], { z18.d-z19.d }\n"
+ ".inst 0xc0046a47 // mova za.d[x11, #7], { z18.d-z19.d }\n"
+ ".inst 0xc0040a40 // mova za.d[x8, #0], { z18.d-z19.d }\n"
+ ".inst 0xc0040a41 // mova za.d[x8, #1], { z18.d-z19.d }\n"
+ "cbz x21, 7f\n"
+ "cmp x21, x22\n"
+ "csel x20, x21, x22, LT\n"
+ "sub x21, x21, x20\n"
+ "sub x22, x22, x20\n"
+ "cbz x21, 7f\n"
+ ".inst 0xc0066814 // mova { z20.d-z21.d }, za.d[x11, #0]\n"
+ "sub x15, x15, x21\n"
+ ".inst 0xc0066836 // mova { z22.d-z23.d }, za.d[x11, #1]\n"
+ ".inst 0xc1a7ac14 // sqdmulh { z20.s-z23.s }, { z20.s-z23.s }, z7.s\n"
+ ".inst 0xc1a4aa34 // srshl { z20.s-z23.s }, { z20.s-z23.s }, z4.s\n"
+ ".inst 0xc1acab14 // add { z20.s-z23.s }, { z20.s-z23.s }, z12.s\n"
+ ".inst 0xc1b0cf14 // sclamp { z20.s-z23.s }, z24.s, z16.s\n"
+ "6:" // Left padding
+ "subs x21, x21, #0x1\n"
+ "st1b { z20.s }, p1, [x14]\n"
+ "add x14, x14, x4\n"
+ "st1b { z22.s }, p1, [x13]\n"
+ "add x13, x13, x10\n"
+ "st1b { z21.s }, p1, [x9]\n"
+ "add x9, x9, x27\n"
+ "st1b { z23.s }, p1, [x28]\n"
+ "add x28, x28, x26\n"
+ "bgt 6b\n"
+ "7:" // Left padding: End
+ "adds XZR, x6, x5\n"
+ "bne 14f\n"
+ "cbz x22, 12f\n"
+ "cmp x22, #0x1\n"
+ "sub x25, x25, x22\n"
+ "beq 11f\n"
+ "cmp x22, #0x2\n"
+ "beq 10f\n"
+ "cmp x22, #0x3\n"
+ "beq 9f\n"
+ "8:" // Unpadded: 4 priming loads
+ "add x21, x16, %x[ld_in_row]\n"
+ "ld1b { z1.s }, p1/Z, [x16]\n"
+ "addvl x20, SP, #24\n"
+ "ld1b { z28.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z27.h, z1.h, z28.h\n"
+ "add z27.h, z27.h, z17.h\n"
+ "ld1b { z1.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "add x16, x16, %x[ld_in_col]\n"
+ "ld1b { z2.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z28.h, z1.h, z2.h\n"
+ "add z28.h, z28.h, z17.h\n"
+ "ld1b { z13.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "ld1b { z6.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z29.h, z13.h, z6.h\n"
+ "add z29.h, z29.h, z17.h\n"
+ "ld1b { z30.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0xa1402a82 // ld1h { z2.h, z10.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc16a7768 // sdot za.s[x11, 0], { z27.h-z28.h }, z10.h\n"
+ "ld1b { z20.s }, p1/Z, [x21]\n"
+ "trn1 z30.h, z30.h, z20.h\n"
+ ".inst 0xc1627769 // sdot za.s[x11, 1], { z27.h-z28.h }, z2.h\n"
+ ".inst 0xa1412a81 // ld1h { z1.h, z9.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ "add z30.h, z30.h, z17.h\n"
+ ".inst 0xc1697788 // sdot za.s[x11, 0], { z28.h-z29.h }, z9.h\n"
+ ".inst 0xc1617789 // sdot za.s[x11, 1], { z28.h-z29.h }, z1.h\n"
+ ".inst 0xa0422a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xc16b77a8 // sdot za.s[x11, 0], { z29.h-z30.h }, z11.h\n"
+ ".inst 0xc16a77a9 // sdot za.s[x11, 1], { z29.h-z30.h }, z10.h\n"
+ "9:" // Unpadded: 3 priming loads
+ "add x22, x16, %x[ld_in_row]\n"
+ "ld1b { z2.s }, p1/Z, [x16]\n"
+ "addvl x21, SP, #18\n"
+ "ld1b { z28.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "trn1 z20.h, z2.h, z28.h\n"
+ "add z20.h, z20.h, z17.h\n"
+ "ld1b { z31.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "addvl x20, SP, #24\n"
+ "ld1b { z11.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "trn1 z21.h, z31.h, z11.h\n"
+ "add z21.h, z21.h, z17.h\n"
+ "ld1b { z25.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "add x16, x16, %x[ld_in_col]\n"
+ "ld1b { z8.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "trn1 z22.h, z25.h, z8.h\n"
+ "add z22.h, z22.h, z17.h\n"
+ "ld1b { z8.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ ".inst 0xa1402aa6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc16e7688 // sdot za.s[x11, 0], { z20.h-z21.h }, z14.h\n"
+ "ld1b { z3.s }, p1/Z, [x22]\n"
+ "trn1 z23.h, z8.h, z3.h\n"
+ ".inst 0xc1667689 // sdot za.s[x11, 1], { z20.h-z21.h }, z6.h\n"
+ ".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc161768a // sdot za.s[x11, 2], { z20.h-z21.h }, z1.h\n"
+ "add z23.h, z23.h, z17.h\n"
+ ".inst 0xa1412aa1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc160768b // sdot za.s[x11, 3], { z20.h-z21.h }, z0.h\n"
+ ".inst 0xc16976a8 // sdot za.s[x11, 0], { z21.h-z22.h }, z9.h\n"
+ ".inst 0xa0422aae // ld1h { z14.h-z15.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
+ ".inst 0xc16176a9 // sdot za.s[x11, 1], { z21.h-z22.h }, z1.h\n"
+ ".inst 0xa1412a81 // ld1h { z1.h, z9.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc16976aa // sdot za.s[x11, 2], { z21.h-z22.h }, z9.h\n"
+ ".inst 0xc16176ab // sdot za.s[x11, 3], { z21.h-z22.h }, z1.h\n"
+ ".inst 0xc16f76c8 // sdot za.s[x11, 0], { z22.h-z23.h }, z15.h\n"
+ ".inst 0xc16e76c9 // sdot za.s[x11, 1], { z22.h-z23.h }, z14.h\n"
+ ".inst 0xa0422a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xc16b76ca // sdot za.s[x11, 2], { z22.h-z23.h }, z11.h\n"
+ ".inst 0xc16a76cb // sdot za.s[x11, 3], { z22.h-z23.h }, z10.h\n"
+ "10:" // Unpadded: 2 priming loads
+ "add x23, x16, %x[ld_in_row]\n"
+ "ld1b { z2.s }, p1/Z, [x16]\n"
+ "addvl x22, SP, #12\n"
+ "ld1b { z22.s }, p1/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row]\n"
+ "trn1 z0.h, z2.h, z22.h\n"
+ "add z0.h, z0.h, z17.h\n"
+ "ld1b { z14.s }, p1/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row]\n"
+ "addvl x21, SP, #18\n"
+ "ld1b { z6.s }, p1/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row]\n"
+ "trn1 z1.h, z14.h, z6.h\n"
+ "add z1.h, z1.h, z17.h\n"
+ "ld1b { z15.s }, p1/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row]\n"
+ "addvl x20, SP, #24\n"
+ "ld1b { z6.s }, p1/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row]\n"
+ "trn1 z2.h, z15.h, z6.h\n"
+ "add z2.h, z2.h, z17.h\n"
+ "ld1b { z21.s }, p1/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row]\n"
+ "add x16, x16, %x[ld_in_col]\n"
+ ".inst 0xa0402ace // ld1h { z14.h-z15.h }, pn10.b/Z, [x22]\n"
+ ".inst 0xc16f7408 // sdot za.s[x11, 0], { z0.h-z1.h }, z15.h\n"
+ "ld1b { z30.s }, p1/Z, [x23]\n"
+ "trn1 z3.h, z21.h, z30.h\n"
+ ".inst 0xc16e7409 // sdot za.s[x11, 1], { z0.h-z1.h }, z14.h\n"
+ ".inst 0xa1402aa5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc16d740a // sdot za.s[x11, 2], { z0.h-z1.h }, z13.h\n"
+ "add z3.h, z3.h, z17.h\n"
+ ".inst 0xa0412ace // ld1h { z14.h-z15.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
+ ".inst 0xc165740b // sdot za.s[x11, 3], { z0.h-z1.h }, z5.h\n"
+ ".inst 0xa0402a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc16f7428 // sdot za.s[x11, 0], { z1.h-z2.h }, z15.h\n"
+ ".inst 0xc16e7429 // sdot za.s[x11, 1], { z1.h-z2.h }, z14.h\n"
+ ".inst 0xa0412aae // ld1h { z14.h-z15.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xa0422ac8 // ld1h { z8.h-z9.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
+ ".inst 0xc16b740c // sdot za.s[x11, 4], { z0.h-z1.h }, z11.h\n"
+ ".inst 0xc16a740d // sdot za.s[x11, 5], { z0.h-z1.h }, z10.h\n"
+ ".inst 0xc16f742a // sdot za.s[x11, 2], { z1.h-z2.h }, z15.h\n"
+ ".inst 0xc16e742b // sdot za.s[x11, 3], { z1.h-z2.h }, z14.h\n"
+ ".inst 0xa0412a8e // ld1h { z14.h-z15.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc1697448 // sdot za.s[x11, 0], { z2.h-z3.h }, z9.h\n"
+ ".inst 0xc1687449 // sdot za.s[x11, 1], { z2.h-z3.h }, z8.h\n"
+ ".inst 0xa0422aaa // ld1h { z10.h-z11.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
+ ".inst 0xc16f742c // sdot za.s[x11, 4], { z1.h-z2.h }, z15.h\n"
+ ".inst 0xc16e742d // sdot za.s[x11, 5], { z1.h-z2.h }, z14.h\n"
+ ".inst 0xc16b744a // sdot za.s[x11, 2], { z2.h-z3.h }, z11.h\n"
+ ".inst 0xc16a744b // sdot za.s[x11, 3], { z2.h-z3.h }, z10.h\n"
+ ".inst 0xa0422a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xc161744c // sdot za.s[x11, 4], { z2.h-z3.h }, z1.h\n"
+ ".inst 0xc160744d // sdot za.s[x11, 5], { z2.h-z3.h }, z0.h\n"
+ "11:" // Unpadded: 1 priming loads
+ "add x24, x16, %x[ld_in_row]\n"
+ "ld1b { z0.s }, p1/Z, [x16]\n"
+ "addvl x23, SP, #6\n"
+ "ld1b { z3.s }, p1/Z, [x24]\n"
+ "add x24, x24, %x[ld_in_row]\n"
+ "trn1 z28.h, z0.h, z3.h\n"
+ "add z28.h, z28.h, z17.h\n"
+ "ld1b { z6.s }, p1/Z, [x24]\n"
+ "add x24, x24, %x[ld_in_row]\n"
+ "addvl x22, SP, #12\n"
+ "ld1b { z30.s }, p1/Z, [x24]\n"
+ "add x24, x24, %x[ld_in_row]\n"
+ "trn1 z29.h, z6.h, z30.h\n"
+ "add z29.h, z29.h, z17.h\n"
+ "ld1b { z1.s }, p1/Z, [x24]\n"
+ "add x24, x24, %x[ld_in_row]\n"
+ "addvl x21, SP, #18\n"
+ "ld1b { z25.s }, p1/Z, [x24]\n"
+ "add x24, x24, %x[ld_in_row]\n"
+ "trn1 z30.h, z1.h, z25.h\n"
+ "add z30.h, z30.h, z17.h\n"
+ "ld1b { z3.s }, p1/Z, [x24]\n"
+ "add x24, x24, %x[ld_in_row]\n"
+ "addvl x20, SP, #24\n"
+ ".inst 0xa0402ae0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x23]\n"
+ ".inst 0xc1617788 // sdot za.s[x11, 0], { z28.h-z29.h }, z1.h\n"
+ "add x16, x16, %x[ld_in_col]\n"
+ "ld1b { z5.s }, p1/Z, [x24]\n"
+ "trn1 z31.h, z3.h, z5.h\n"
+ ".inst 0xc1607789 // sdot za.s[x11, 1], { z28.h-z29.h }, z0.h\n"
+ ".inst 0xa1402ac6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x22]\n"
+ ".inst 0xc16e778a // sdot za.s[x11, 2], { z28.h-z29.h }, z14.h\n"
+ "add z31.h, z31.h, z17.h\n"
+ ".inst 0xa1412ae2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x23, #0x2, MUL VL]\n"
+ ".inst 0xc166778b // sdot za.s[x11, 3], { z28.h-z29.h }, z6.h\n"
+ ".inst 0xa0402aae // ld1h { z14.h-z15.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc16a77a8 // sdot za.s[x11, 0], { z29.h-z30.h }, z10.h\n"
+ ".inst 0xc16277a9 // sdot za.s[x11, 1], { z29.h-z30.h }, z2.h\n"
+ ".inst 0xa0412ac8 // ld1h { z8.h-z9.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
+ ".inst 0xa1422ae2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x23, #0x4, MUL VL]\n"
+ ".inst 0xc16f778c // sdot za.s[x11, 4], { z28.h-z29.h }, z15.h\n"
+ ".inst 0xc16e778d // sdot za.s[x11, 5], { z28.h-z29.h }, z14.h\n"
+ ".inst 0xa1402a86 // ld1h { z6.h, z14.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc16977aa // sdot za.s[x11, 2], { z29.h-z30.h }, z9.h\n"
+ ".inst 0xc16877ab // sdot za.s[x11, 3], { z29.h-z30.h }, z8.h\n"
+ ".inst 0xa1412aa5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc16a77c8 // sdot za.s[x11, 0], { z30.h-z31.h }, z10.h\n"
+ ".inst 0xc16277c9 // sdot za.s[x11, 1], { z30.h-z31.h }, z2.h\n"
+ ".inst 0xa1422ac2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
+ ".inst 0xc16e778e // sdot za.s[x11, 6], { z28.h-z29.h }, z14.h\n"
+ ".inst 0xc166778f // sdot za.s[x11, 7], { z28.h-z29.h }, z6.h\n"
+ ".inst 0xc16d77ac // sdot za.s[x11, 4], { z29.h-z30.h }, z13.h\n"
+ ".inst 0xc16577ad // sdot za.s[x11, 5], { z29.h-z30.h }, z5.h\n"
+ ".inst 0xa1412a86 // ld1h { z6.h, z14.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc16a77ca // sdot za.s[x11, 2], { z30.h-z31.h }, z10.h\n"
+ ".inst 0xc16277cb // sdot za.s[x11, 3], { z30.h-z31.h }, z2.h\n"
+ ".inst 0xa0422aa8 // ld1h { z8.h-z9.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
+ ".inst 0xc16e77ae // sdot za.s[x11, 6], { z29.h-z30.h }, z14.h\n"
+ ".inst 0xc16677af // sdot za.s[x11, 7], { z29.h-z30.h }, z6.h\n"
+ ".inst 0xc16977cc // sdot za.s[x11, 4], { z30.h-z31.h }, z9.h\n"
+ ".inst 0xc16877cd // sdot za.s[x11, 5], { z30.h-z31.h }, z8.h\n"
+ ".inst 0xa1422a86 // ld1h { z6.h, z14.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xc16e77ce // sdot za.s[x11, 6], { z30.h-z31.h }, z14.h\n"
+ ".inst 0xc16677cf // sdot za.s[x11, 7], { z30.h-z31.h }, z6.h\n"
+ "12:" // Unpadded: 0 priming loads
+ ".inst 0xa0402be0 // ld1h { z0.h-z1.h }, pn10.b/Z, [SP]\n"
+ ".inst 0xa1412be5 // ld1h { z5.h, z13.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
+ ".inst 0xa0422be2 // ld1h { z2.h-z3.h }, pn10.b/Z, [SP, #0x4, MUL VL]\n"
+ "cbz x25, 22f\n"
+ "add x20, x16, %x[ld_in_row]\n"
+ "ld1b { z26.s }, p1/Z, [x16]\n"
+ "sub x25, x25, #0x1\n"
+ "ld1b { z28.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z25.h, z26.h, z28.h\n"
+ "sub x15, x15, #0x1\n"
+ "ld1b { z31.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "cmp x25, x15\n"
+ "add z25.h, z25.h, z17.h\n"
+ "ld1b { z15.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z26.h, z31.h, z15.h\n"
+ "csel x25, x25, x15, LT\n"
+ "ld1b { z22.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "add z26.h, z26.h, z17.h\n"
+ "add x16, x16, %x[ld_in_col]\n"
+ "ld1b { z8.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z27.h, z22.h, z8.h\n"
+ "add z27.h, z27.h, z17.h\n"
+ "ld1b { z21.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "sub x15, x15, x25\n"
+ "ld1b { z20.s }, p1/Z, [x20]\n"
+ "trn1 z28.h, z21.h, z20.h\n"
+ "add z28.h, z28.h, z17.h\n"
+ "cbz x25, 21f\n"
+ "13:" // Unpadded: Main loop
+ "addvl x24, SP, #6\n"
+ ".inst 0xc1617728 // sdot za.s[x11, 0], { z25.h-z26.h }, z1.h\n"
+ "addvl x23, SP, #12\n"
+ "ld1b { z21.s }, p1/Z, [x16]\n"
+ ".inst 0xc1607729 // sdot za.s[x11, 1], { z25.h-z26.h }, z0.h\n"
+ ".inst 0xa0402b0e // ld1h { z14.h-z15.h }, pn10.b/Z, [x24]\n"
+ "addvl x22, SP, #18\n"
+ "addvl x21, SP, #24\n"
+ ".inst 0xc16f772a // sdot za.s[x11, 2], { z25.h-z26.h }, z15.h\n"
+ "add x20, x16, %x[ld_in_row]\n"
+ "ld1b { z0.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0xc16e772b // sdot za.s[x11, 3], { z25.h-z26.h }, z14.h\n"
+ ".inst 0xa1402ae6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x23]\n"
+ "subs x25, x25, #0x1\n"
+ "add x16, x16, %x[ld_in_col]\n"
+ ".inst 0xc16d7748 // sdot za.s[x11, 0], { z26.h-z27.h }, z13.h\n"
+ "ld1b { z20.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0xc1657749 // sdot za.s[x11, 1], { z26.h-z27.h }, z5.h\n"
+ ".inst 0xa1412b05 // ld1h { z5.h, z13.h }, pn10.b/Z, [x24, #0x2, MUL VL]\n"
+ ".inst 0xc16e772c // sdot za.s[x11, 4], { z25.h-z26.h }, z14.h\n"
+ "ld1b { z31.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0xc166772d // sdot za.s[x11, 5], { z25.h-z26.h }, z6.h\n"
+ ".inst 0xa0402ace // ld1h { z14.h-z15.h }, pn10.b/Z, [x22]\n"
+ ".inst 0xc16d774a // sdot za.s[x11, 2], { z26.h-z27.h }, z13.h\n"
+ "ld1b { z29.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0xc165774b // sdot za.s[x11, 3], { z26.h-z27.h }, z5.h\n"
+ ".inst 0xa1412ae5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x23, #0x2, MUL VL]\n"
+ ".inst 0xc1637768 // sdot za.s[x11, 0], { z27.h-z28.h }, z3.h\n"
+ "ld1b { z22.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0xc1627769 // sdot za.s[x11, 1], { z27.h-z28.h }, z2.h\n"
+ ".inst 0xa1422b02 // ld1h { z2.h, z10.h }, pn10.b/Z, [x24, #0x4, MUL VL]\n"
+ ".inst 0xc16f772e // sdot za.s[x11, 6], { z25.h-z26.h }, z15.h\n"
+ "ld1b { z30.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0xc16e772f // sdot za.s[x11, 7], { z25.h-z26.h }, z14.h\n"
+ ".inst 0xa0402aae // ld1h { z14.h-z15.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc16d774c // sdot za.s[x11, 4], { z26.h-z27.h }, z13.h\n"
+ "ld1b { z6.s }, p1/Z, [x20]\n"
+ ".inst 0xc165774d // sdot za.s[x11, 5], { z26.h-z27.h }, z5.h\n"
+ ".inst 0xa1412ac5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
+ ".inst 0xc16a776a // sdot za.s[x11, 2], { z27.h-z28.h }, z10.h\n"
+ ".inst 0xc162776b // sdot za.s[x11, 3], { z27.h-z28.h }, z2.h\n"
+ ".inst 0xa1422ae2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x23, #0x4, MUL VL]\n"
+ ".inst 0xc16d774e // sdot za.s[x11, 6], { z26.h-z27.h }, z13.h\n"
+ ".inst 0xc165774f // sdot za.s[x11, 7], { z26.h-z27.h }, z5.h\n"
+ ".inst 0xa1412aa5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc16a776c // sdot za.s[x11, 4], { z27.h-z28.h }, z10.h\n"
+ ".inst 0xc162776d // sdot za.s[x11, 5], { z27.h-z28.h }, z2.h\n"
+ ".inst 0xa1422ac1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
+ ".inst 0xc169776e // sdot za.s[x11, 6], { z27.h-z28.h }, z9.h\n"
+ ".inst 0xc161776f // sdot za.s[x11, 7], { z27.h-z28.h }, z1.h\n"
+ ".inst 0xa0422aaa // ld1h { z10.h-z11.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
+ ".inst 0xc16f1728 // sdot za.s[x8, 0], { z25.h-z26.h }, z15.h\n"
+ ".inst 0xc16e1729 // sdot za.s[x8, 1], { z25.h-z26.h }, z14.h\n"
+ "trn1 z25.h, z21.h, z0.h\n"
+ ".inst 0xa0402be0 // ld1h { z0.h-z1.h }, pn10.b/Z, [SP]\n"
+ ".inst 0xc16d1748 // sdot za.s[x8, 0], { z26.h-z27.h }, z13.h\n"
+ "add z25.h, z25.h, z17.h\n"
+ ".inst 0xc1651749 // sdot za.s[x8, 1], { z26.h-z27.h }, z5.h\n"
+ "trn1 z26.h, z20.h, z31.h\n"
+ ".inst 0xa1412be5 // ld1h { z5.h, z13.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
+ ".inst 0xc16b1768 // sdot za.s[x8, 0], { z27.h-z28.h }, z11.h\n"
+ "add z26.h, z26.h, z17.h\n"
+ ".inst 0xc16a1769 // sdot za.s[x8, 1], { z27.h-z28.h }, z10.h\n"
+ "trn1 z27.h, z29.h, z22.h\n"
+ "trn1 z28.h, z30.h, z6.h\n"
+ "add x8, x8, #0x2\n"
+ ".inst 0xc0066808 // mova { z8.d-z9.d }, za.d[x11, #0]\n"
+ ".inst 0xa0422be2 // ld1h { z2.h-z3.h }, pn10.b/Z, [SP, #0x4, MUL VL]\n"
+ "add z27.h, z27.h, z17.h\n"
+ ".inst 0xc006682a // mova { z10.d-z11.d }, za.d[x11, #1]\n"
+ ".inst 0xc1a7ac08 // sqdmulh { z8.s-z11.s }, { z8.s-z11.s }, z7.s\n"
+ "add x11, x11, #0x2\n"
+ ".inst 0xc1a4aa28 // srshl { z8.s-z11.s }, { z8.s-z11.s }, z4.s\n"
+ ".inst 0xc0040a40 // mova za.d[x8, #0], { z18.d-z19.d }\n"
+ ".inst 0xc1acab08 // add { z8.s-z11.s }, { z8.s-z11.s }, z12.s\n"
+ ".inst 0xc0040a41 // mova za.d[x8, #1], { z18.d-z19.d }\n"
+ ".inst 0xc1b0cf08 // sclamp { z8.s-z11.s }, z24.s, z16.s\n"
+ "st1b { z8.s }, p1, [x14]\n"
+ "add x14, x14, x4\n"
+ "add z28.h, z28.h, z17.h\n"
+ "st1b { z10.s }, p1, [x13]\n"
+ "add x13, x13, x10\n"
+ "st1b { z9.s }, p1, [x9]\n"
+ "add x9, x9, x27\n"
+ "st1b { z11.s }, p1, [x28]\n"
+ "add x28, x28, x26\n"
+ "bgt 13b\n"
+ "b 21f\n"
+ "14:" // Padded
+ "cbz x22, 19f\n"
+ "cmp x22, #0x1\n"
+ "sub x25, x25, x22\n"
+ "beq 18f\n"
+ "cmp x22, #0x2\n"
+ "beq 17f\n"
+ "cmp x22, #0x3\n"
+ "beq 16f\n"
+ "15:" // Padded: 4 priming loads
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z9.s }, p0/Z, [x16]\n"
+ "add z9.h, p0/M, z9.h, z17.h\n"
+ "add x21, x16, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z22.s }, p0/Z, [x21]\n"
+ "add z22.h, p0/M, z22.h, z17.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z21.s }, p0/Z, [x21]\n"
+ "add z21.h, p0/M, z21.h, z17.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z20.s }, p0/Z, [x21]\n"
+ "add z20.h, p0/M, z20.h, z17.h\n"
+ "mov x12, #0x4\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z31.h, z9.h, z22.h\n"
+ "trn1 z0.h, z21.h, z20.h\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z22.s }, p0/Z, [x21]\n"
+ "add z22.h, p0/M, z22.h, z17.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z20.s }, p0/Z, [x21]\n"
+ "add z20.h, p0/M, z20.h, z17.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z21.s }, p0/Z, [x21]\n"
+ "addvl x20, SP, #24\n"
+ "add z21.h, p0/M, z21.h, z17.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ ".inst 0xa1402a82 // ld1h { z2.h, z10.h }, pn10.b/Z, [x20]\n"
+ "trn1 z1.h, z22.h, z20.h\n"
+ "ld1b { z20.s }, p0/Z, [x21]\n"
+ "add z20.h, p0/M, z20.h, z17.h\n"
+ ".inst 0xc16a77e8 // sdot za.s[x11, 0], { z31.h-z0.h }, z10.h\n"
+ "add x16, x16, %x[ld_in_col]\n"
+ ".inst 0xc16277e9 // sdot za.s[x11, 1], { z31.h-z0.h }, z2.h\n"
+ ".inst 0xa1412a85 // ld1h { z5.h, z13.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ "trn1 z2.h, z21.h, z20.h\n"
+ ".inst 0xc16d7408 // sdot za.s[x11, 0], { z0.h-z1.h }, z13.h\n"
+ ".inst 0xa0422a88 // ld1h { z8.h-z9.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xc1657409 // sdot za.s[x11, 1], { z0.h-z1.h }, z5.h\n"
+ ".inst 0xc1697428 // sdot za.s[x11, 0], { z1.h-z2.h }, z9.h\n"
+ ".inst 0xc1687429 // sdot za.s[x11, 1], { z1.h-z2.h }, z8.h\n"
+ "16:" // Padded: 3 priming loads
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z5.s }, p0/Z, [x16]\n"
+ "add z5.h, p0/M, z5.h, z17.h\n"
+ "add x20, x16, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z22.s }, p0/Z, [x20]\n"
+ "add z22.h, p0/M, z22.h, z17.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z21.s }, p0/Z, [x20]\n"
+ "add z21.h, p0/M, z21.h, z17.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z20.s }, p0/Z, [x20]\n"
+ "add z20.h, p0/M, z20.h, z17.h\n"
+ "mov x12, #0x4\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z28.h, z5.h, z22.h\n"
+ "trn1 z29.h, z21.h, z20.h\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z22.s }, p0/Z, [x20]\n"
+ "add z22.h, p0/M, z22.h, z17.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z20.s }, p0/Z, [x20]\n"
+ "add z20.h, p0/M, z20.h, z17.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z21.s }, p0/Z, [x20]\n"
+ "addvl x21, SP, #18\n"
+ "add z21.h, p0/M, z21.h, z17.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ ".inst 0xa0402aa0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x21]\n"
+ "trn1 z30.h, z22.h, z20.h\n"
+ "ld1b { z20.s }, p0/Z, [x20]\n"
+ "addvl x20, SP, #24\n"
+ "add z20.h, p0/M, z20.h, z17.h\n"
+ ".inst 0xc1617788 // sdot za.s[x11, 0], { z28.h-z29.h }, z1.h\n"
+ ".inst 0xc1607789 // sdot za.s[x11, 1], { z28.h-z29.h }, z0.h\n"
+ ".inst 0xa1402a81 // ld1h { z1.h, z9.h }, pn10.b/Z, [x20]\n"
+ "trn1 z31.h, z21.h, z20.h\n"
+ "add x16, x16, %x[ld_in_col]\n"
+ ".inst 0xa0412aae // ld1h { z14.h-z15.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc169778a // sdot za.s[x11, 2], { z28.h-z29.h }, z9.h\n"
+ ".inst 0xc161778b // sdot za.s[x11, 3], { z28.h-z29.h }, z1.h\n"
+ ".inst 0xa1422aa3 // ld1h { z3.h, z11.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
+ ".inst 0xc16f77a8 // sdot za.s[x11, 0], { z29.h-z30.h }, z15.h\n"
+ ".inst 0xc16e77a9 // sdot za.s[x11, 1], { z29.h-z30.h }, z14.h\n"
+ ".inst 0xa1412a81 // ld1h { z1.h, z9.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc16977aa // sdot za.s[x11, 2], { z29.h-z30.h }, z9.h\n"
+ ".inst 0xc16177ab // sdot za.s[x11, 3], { z29.h-z30.h }, z1.h\n"
+ ".inst 0xc16b77c8 // sdot za.s[x11, 0], { z30.h-z31.h }, z11.h\n"
+ ".inst 0xc16377c9 // sdot za.s[x11, 1], { z30.h-z31.h }, z3.h\n"
+ ".inst 0xa0422a8e // ld1h { z14.h-z15.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xc16f77ca // sdot za.s[x11, 2], { z30.h-z31.h }, z15.h\n"
+ ".inst 0xc16e77cb // sdot za.s[x11, 3], { z30.h-z31.h }, z14.h\n"
+ "17:" // Padded: 2 priming loads
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z29.s }, p0/Z, [x16]\n"
+ "add z29.h, p0/M, z29.h, z17.h\n"
+ "add x20, x16, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z22.s }, p0/Z, [x20]\n"
+ "add z22.h, p0/M, z22.h, z17.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z21.s }, p0/Z, [x20]\n"
+ "add z21.h, p0/M, z21.h, z17.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z20.s }, p0/Z, [x20]\n"
+ "add z20.h, p0/M, z20.h, z17.h\n"
+ "mov x12, #0x4\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z8.h, z29.h, z22.h\n"
+ "trn1 z9.h, z21.h, z20.h\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z22.s }, p0/Z, [x20]\n"
+ "add z22.h, p0/M, z22.h, z17.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z20.s }, p0/Z, [x20]\n"
+ "add z20.h, p0/M, z20.h, z17.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z21.s }, p0/Z, [x20]\n"
+ "addvl x22, SP, #12\n"
+ "add z21.h, p0/M, z21.h, z17.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ ".inst 0xa0402ace // ld1h { z14.h-z15.h }, pn10.b/Z, [x22]\n"
+ "trn1 z10.h, z22.h, z20.h\n"
+ "ld1b { z20.s }, p0/Z, [x20]\n"
+ "addvl x21, SP, #18\n"
+ "add z20.h, p0/M, z20.h, z17.h\n"
+ ".inst 0xc16f7508 // sdot za.s[x11, 0], { z8.h-z9.h }, z15.h\n"
+ ".inst 0xc16e7509 // sdot za.s[x11, 1], { z8.h-z9.h }, z14.h\n"
+ ".inst 0xa1402aa6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x21]\n"
+ "addvl x20, SP, #24\n"
+ "trn1 z11.h, z21.h, z20.h\n"
+ ".inst 0xa1412ac5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
+ ".inst 0xc16e750a // sdot za.s[x11, 2], { z8.h-z9.h }, z14.h\n"
+ "add x16, x16, %x[ld_in_col]\n"
+ ".inst 0xc166750b // sdot za.s[x11, 3], { z8.h-z9.h }, z6.h\n"
+ ".inst 0xa0402a8e // ld1h { z14.h-z15.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc16d7528 // sdot za.s[x11, 0], { z9.h-z10.h }, z13.h\n"
+ ".inst 0xa0422ac0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
+ ".inst 0xc1657529 // sdot za.s[x11, 1], { z9.h-z10.h }, z5.h\n"
+ ".inst 0xa1412aa5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc16f750c // sdot za.s[x11, 4], { z8.h-z9.h }, z15.h\n"
+ ".inst 0xc16e750d // sdot za.s[x11, 5], { z8.h-z9.h }, z14.h\n"
+ ".inst 0xc16d752a // sdot za.s[x11, 2], { z9.h-z10.h }, z13.h\n"
+ ".inst 0xc165752b // sdot za.s[x11, 3], { z9.h-z10.h }, z5.h\n"
+ ".inst 0xa1412a86 // ld1h { z6.h, z14.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc1617548 // sdot za.s[x11, 0], { z10.h-z11.h }, z1.h\n"
+ ".inst 0xc1607549 // sdot za.s[x11, 1], { z10.h-z11.h }, z0.h\n"
+ ".inst 0xa0422aa0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
+ ".inst 0xc16e752c // sdot za.s[x11, 4], { z9.h-z10.h }, z14.h\n"
+ ".inst 0xc166752d // sdot za.s[x11, 5], { z9.h-z10.h }, z6.h\n"
+ ".inst 0xc161754a // sdot za.s[x11, 2], { z10.h-z11.h }, z1.h\n"
+ ".inst 0xc160754b // sdot za.s[x11, 3], { z10.h-z11.h }, z0.h\n"
+ ".inst 0xa0422a8e // ld1h { z14.h-z15.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xc16f754c // sdot za.s[x11, 4], { z10.h-z11.h }, z15.h\n"
+ ".inst 0xc16e754d // sdot za.s[x11, 5], { z10.h-z11.h }, z14.h\n"
+ "18:" // Padded: 1 priming loads
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z1.s }, p0/Z, [x16]\n"
+ "add z1.h, p0/M, z1.h, z17.h\n"
+ "add x20, x16, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z22.s }, p0/Z, [x20]\n"
+ "add z22.h, p0/M, z22.h, z17.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z21.s }, p0/Z, [x20]\n"
+ "add z21.h, p0/M, z21.h, z17.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z20.s }, p0/Z, [x20]\n"
+ "add z20.h, p0/M, z20.h, z17.h\n"
+ "mov x12, #0x4\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z26.h, z1.h, z22.h\n"
+ "trn1 z27.h, z21.h, z20.h\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z22.s }, p0/Z, [x20]\n"
+ "add z22.h, p0/M, z22.h, z17.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z20.s }, p0/Z, [x20]\n"
+ "add z20.h, p0/M, z20.h, z17.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z21.s }, p0/Z, [x20]\n"
+ "addvl x23, SP, #6\n"
+ "add z21.h, p0/M, z21.h, z17.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ ".inst 0xa0402aee // ld1h { z14.h-z15.h }, pn10.b/Z, [x23]\n"
+ "trn1 z28.h, z22.h, z20.h\n"
+ "ld1b { z20.s }, p0/Z, [x20]\n"
+ "addvl x22, SP, #12\n"
+ "add z20.h, p0/M, z20.h, z17.h\n"
+ ".inst 0xc16f7748 // sdot za.s[x11, 0], { z26.h-z27.h }, z15.h\n"
+ ".inst 0xc16e7749 // sdot za.s[x11, 1], { z26.h-z27.h }, z14.h\n"
+ ".inst 0xa0402ac0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x22]\n"
+ "addvl x21, SP, #18\n"
+ "trn1 z29.h, z21.h, z20.h\n"
+ ".inst 0xa0412aea // ld1h { z10.h-z11.h }, pn10.b/Z, [x23, #0x2, MUL VL]\n"
+ ".inst 0xc161774a // sdot za.s[x11, 2], { z26.h-z27.h }, z1.h\n"
+ "addvl x20, SP, #24\n"
+ "add x16, x16, %x[ld_in_col]\n"
+ ".inst 0xc160774b // sdot za.s[x11, 3], { z26.h-z27.h }, z0.h\n"
+ ".inst 0xa1402aa6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc16b7768 // sdot za.s[x11, 0], { z27.h-z28.h }, z11.h\n"
+ ".inst 0xa0422ae8 // ld1h { z8.h-z9.h }, pn10.b/Z, [x23, #0x4, MUL VL]\n"
+ ".inst 0xc16a7769 // sdot za.s[x11, 1], { z27.h-z28.h }, z10.h\n"
+ ".inst 0xa0412aca // ld1h { z10.h-z11.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
+ ".inst 0xc16e774c // sdot za.s[x11, 4], { z26.h-z27.h }, z14.h\n"
+ ".inst 0xc166774d // sdot za.s[x11, 5], { z26.h-z27.h }, z6.h\n"
+ ".inst 0xa1402a85 // ld1h { z5.h, z13.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc16b776a // sdot za.s[x11, 2], { z27.h-z28.h }, z11.h\n"
+ ".inst 0xc16a776b // sdot za.s[x11, 3], { z27.h-z28.h }, z10.h\n"
+ ".inst 0xa1412aa6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc1697788 // sdot za.s[x11, 0], { z28.h-z29.h }, z9.h\n"
+ ".inst 0xc1687789 // sdot za.s[x11, 1], { z28.h-z29.h }, z8.h\n"
+ ".inst 0xa1422ac2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
+ ".inst 0xc16d774e // sdot za.s[x11, 6], { z26.h-z27.h }, z13.h\n"
+ ".inst 0xc165774f // sdot za.s[x11, 7], { z26.h-z27.h }, z5.h\n"
+ ".inst 0xc16e776c // sdot za.s[x11, 4], { z27.h-z28.h }, z14.h\n"
+ ".inst 0xc166776d // sdot za.s[x11, 5], { z27.h-z28.h }, z6.h\n"
+ ".inst 0xa1412a86 // ld1h { z6.h, z14.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc16a778a // sdot za.s[x11, 2], { z28.h-z29.h }, z10.h\n"
+ ".inst 0xc162778b // sdot za.s[x11, 3], { z28.h-z29.h }, z2.h\n"
+ ".inst 0xa0422aa0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
+ ".inst 0xc16e776e // sdot za.s[x11, 6], { z27.h-z28.h }, z14.h\n"
+ ".inst 0xc166776f // sdot za.s[x11, 7], { z27.h-z28.h }, z6.h\n"
+ ".inst 0xc161778c // sdot za.s[x11, 4], { z28.h-z29.h }, z1.h\n"
+ ".inst 0xc160778d // sdot za.s[x11, 5], { z28.h-z29.h }, z0.h\n"
+ ".inst 0xa1422a82 // ld1h { z2.h, z10.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xc16a778e // sdot za.s[x11, 6], { z28.h-z29.h }, z10.h\n"
+ ".inst 0xc162778f // sdot za.s[x11, 7], { z28.h-z29.h }, z2.h\n"
+ "19:" // Padded: 0 priming loads
+ ".inst 0xa0402be0 // ld1h { z0.h-z1.h }, pn10.b/Z, [SP]\n"
+ ".inst 0xa1412be5 // ld1h { z5.h, z13.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
+ ".inst 0xa0422be2 // ld1h { z2.h-z3.h }, pn10.b/Z, [SP, #0x4, MUL VL]\n"
+ "cbz x25, 22f\n"
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z6.s }, p0/Z, [x16]\n"
+ "add z6.h, p0/M, z6.h, z17.h\n"
+ "add x20, x16, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z30.s }, p0/Z, [x20]\n"
+ "add z30.h, p0/M, z30.h, z17.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z27.s }, p0/Z, [x20]\n"
+ "add z27.h, p0/M, z27.h, z17.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z26.s }, p0/Z, [x20]\n"
+ "add z26.h, p0/M, z26.h, z17.h\n"
+ "mov x12, #0x4\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z25.h, z6.h, z30.h\n"
+ "trn1 z26.h, z27.h, z26.h\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z8.s }, p0/Z, [x20]\n"
+ "add z8.h, p0/M, z8.h, z17.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z9.s }, p0/Z, [x20]\n"
+ "add z9.h, p0/M, z9.h, z17.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z21.s }, p0/Z, [x20]\n"
+ "add z21.h, p0/M, z21.h, z17.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z29.s }, p0/Z, [x20]\n"
+ "add z29.h, p0/M, z29.h, z17.h\n"
+ "sub x25, x25, #0x1\n"
+ "sub x15, x15, #0x1\n"
+ "cmp x25, x15\n"
+ "trn1 z27.h, z8.h, z9.h\n"
+ "trn1 z28.h, z21.h, z29.h\n"
+ "csel x25, x25, x15, LT\n"
+ "add x16, x16, %x[ld_in_col]\n"
+ "sub x15, x15, x25\n"
+ "cbz x25, 21f\n"
+ "20:" // Padded: Main loop
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z8.s }, p0/Z, [x16]\n"
+ "add z8.h, p0/M, z8.h, z17.h\n"
+ "add x24, x16, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z21.s }, p0/Z, [x24]\n"
+ ".inst 0xc1617728 // sdot za.s[x11, 0], { z25.h-z26.h }, z1.h\n"
+ "addvl x23, SP, #6\n"
+ ".inst 0xc1607729 // sdot za.s[x11, 1], { z25.h-z26.h }, z0.h\n"
+ ".inst 0xa0402ae0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x23]\n"
+ "addvl x22, SP, #12\n"
+ "add z21.h, p0/M, z21.h, z17.h\n"
+ "add x24, x24, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ ".inst 0xc161772a // sdot za.s[x11, 2], { z25.h-z26.h }, z1.h\n"
+ ".inst 0xc160772b // sdot za.s[x11, 3], { z25.h-z26.h }, z0.h\n"
+ ".inst 0xa1402ac6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x22]\n"
+ "addvl x21, SP, #18\n"
+ "addvl x20, SP, #24\n"
+ "ld1b { z29.s }, p0/Z, [x24]\n"
+ ".inst 0xc16d7748 // sdot za.s[x11, 0], { z26.h-z27.h }, z13.h\n"
+ "add z29.h, p0/M, z29.h, z17.h\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ ".inst 0xc1657749 // sdot za.s[x11, 1], { z26.h-z27.h }, z5.h\n"
+ ".inst 0xa1412ae5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x23, #0x2, MUL VL]\n"
+ "mov x12, #0x4\n"
+ "add x24, x24, %x[ld_in_row]\n"
+ ".inst 0xc16e772c // sdot za.s[x11, 4], { z25.h-z26.h }, z14.h\n"
+ "ld1b { z30.s }, p0/Z, [x24]\n"
+ "add z30.h, p0/M, z30.h, z17.h\n"
+ "add x24, x24, %x[ld_in_row]\n"
+ ".inst 0xc166772d // sdot za.s[x11, 5], { z25.h-z26.h }, z6.h\n"
+ ".inst 0xa1402aa6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x21]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "subs x25, x25, #0x1\n"
+ ".inst 0xc16d774a // sdot za.s[x11, 2], { z26.h-z27.h }, z13.h\n"
+ "ld1b { z15.s }, p0/Z, [x24]\n"
+ "add z15.h, p0/M, z15.h, z17.h\n"
+ "add x24, x24, %x[ld_in_row]\n"
+ ".inst 0xc165774b // sdot za.s[x11, 3], { z26.h-z27.h }, z5.h\n"
+ ".inst 0xa0412aca // ld1h { z10.h-z11.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "add x16, x16, %x[ld_in_col]\n"
+ ".inst 0xc1637768 // sdot za.s[x11, 0], { z27.h-z28.h }, z3.h\n"
+ "ld1b { z20.s }, p0/Z, [x24]\n"
+ "add z20.h, p0/M, z20.h, z17.h\n"
+ "add x24, x24, %x[ld_in_row]\n"
+ ".inst 0xc1627769 // sdot za.s[x11, 1], { z27.h-z28.h }, z2.h\n"
+ ".inst 0xa1422ae1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x23, #0x4, MUL VL]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ ".inst 0xc16e772e // sdot za.s[x11, 6], { z25.h-z26.h }, z14.h\n"
+ "ld1b { z31.s }, p0/Z, [x24]\n"
+ "add z31.h, p0/M, z31.h, z17.h\n"
+ "add x24, x24, %x[ld_in_row]\n"
+ ".inst 0xc166772f // sdot za.s[x11, 7], { z25.h-z26.h }, z6.h\n"
+ ".inst 0xa0402a82 // ld1h { z2.h-z3.h }, pn10.b/Z, [x20]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ ".inst 0xc16b774c // sdot za.s[x11, 4], { z26.h-z27.h }, z11.h\n"
+ "ld1b { z22.s }, p0/Z, [x24]\n"
+ "add z22.h, p0/M, z22.h, z17.h\n"
+ ".inst 0xc16a774d // sdot za.s[x11, 5], { z26.h-z27.h }, z10.h\n"
+ ".inst 0xa1412aa6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc169776a // sdot za.s[x11, 2], { z27.h-z28.h }, z9.h\n"
+ ".inst 0xc161776b // sdot za.s[x11, 3], { z27.h-z28.h }, z1.h\n"
+ ".inst 0xa0422ac0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
+ ".inst 0xc16e774e // sdot za.s[x11, 6], { z26.h-z27.h }, z14.h\n"
+ ".inst 0xc166774f // sdot za.s[x11, 7], { z26.h-z27.h }, z6.h\n"
+ ".inst 0xa1412a86 // ld1h { z6.h, z14.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc161776c // sdot za.s[x11, 4], { z27.h-z28.h }, z1.h\n"
+ ".inst 0xc160776d // sdot za.s[x11, 5], { z27.h-z28.h }, z0.h\n"
+ ".inst 0xa1422aa1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
+ ".inst 0xc169776e // sdot za.s[x11, 6], { z27.h-z28.h }, z9.h\n"
+ ".inst 0xc161776f // sdot za.s[x11, 7], { z27.h-z28.h }, z1.h\n"
+ ".inst 0xa0422a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xc1631728 // sdot za.s[x8, 0], { z25.h-z26.h }, z3.h\n"
+ ".inst 0xc1621729 // sdot za.s[x8, 1], { z25.h-z26.h }, z2.h\n"
+ ".inst 0xa0402be0 // ld1h { z0.h-z1.h }, pn10.b/Z, [SP]\n"
+ "trn1 z25.h, z8.h, z21.h\n"
+ ".inst 0xc16e1748 // sdot za.s[x8, 0], { z26.h-z27.h }, z14.h\n"
+ ".inst 0xc1661749 // sdot za.s[x8, 1], { z26.h-z27.h }, z6.h\n"
+ ".inst 0xa1412be5 // ld1h { z5.h, z13.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
+ "trn1 z26.h, z29.h, z30.h\n"
+ ".inst 0xc16b1768 // sdot za.s[x8, 0], { z27.h-z28.h }, z11.h\n"
+ ".inst 0xc16a1769 // sdot za.s[x8, 1], { z27.h-z28.h }, z10.h\n"
+ "add x8, x8, #0x2\n"
+ ".inst 0xa0422be2 // ld1h { z2.h-z3.h }, pn10.b/Z, [SP, #0x4, MUL VL]\n"
+ "trn1 z27.h, z15.h, z20.h\n"
+ ".inst 0xc0066808 // mova { z8.d-z9.d }, za.d[x11, #0]\n"
+ "trn1 z28.h, z31.h, z22.h\n"
+ ".inst 0xc006682a // mova { z10.d-z11.d }, za.d[x11, #1]\n"
+ ".inst 0xc1a7ac08 // sqdmulh { z8.s-z11.s }, { z8.s-z11.s }, z7.s\n"
+ "add x11, x11, #0x2\n"
+ ".inst 0xc1a4aa28 // srshl { z8.s-z11.s }, { z8.s-z11.s }, z4.s\n"
+ ".inst 0xc0040a40 // mova za.d[x8, #0], { z18.d-z19.d }\n"
+ ".inst 0xc1acab08 // add { z8.s-z11.s }, { z8.s-z11.s }, z12.s\n"
+ ".inst 0xc0040a41 // mova za.d[x8, #1], { z18.d-z19.d }\n"
+ ".inst 0xc1b0cf08 // sclamp { z8.s-z11.s }, z24.s, z16.s\n"
+ "st1b { z8.s }, p1, [x14]\n"
+ "add x14, x14, x4\n"
+ "st1b { z10.s }, p1, [x13]\n"
+ "add x13, x13, x10\n"
+ "st1b { z9.s }, p1, [x9]\n"
+ "add x9, x9, x27\n"
+ "st1b { z11.s }, p1, [x28]\n"
+ "add x28, x28, x26\n"
+ "bgt 20b\n"
+ "21:" // Main loop tail
+ "addvl x23, SP, #6\n"
+ ".inst 0xc1617728 // sdot za.s[x11, 0], { z25.h-z26.h }, z1.h\n"
+ "addvl x22, SP, #12\n"
+ ".inst 0xc1607729 // sdot za.s[x11, 1], { z25.h-z26.h }, z0.h\n"
+ ".inst 0xa0402ae0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x23]\n"
+ "addvl x21, SP, #18\n"
+ "addvl x20, SP, #24\n"
+ ".inst 0xc161772a // sdot za.s[x11, 2], { z25.h-z26.h }, z1.h\n"
+ ".inst 0xc160772b // sdot za.s[x11, 3], { z25.h-z26.h }, z0.h\n"
+ ".inst 0xa1402ac6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x22]\n"
+ ".inst 0xc16d7748 // sdot za.s[x11, 0], { z26.h-z27.h }, z13.h\n"
+ ".inst 0xc1657749 // sdot za.s[x11, 1], { z26.h-z27.h }, z5.h\n"
+ ".inst 0xa1412ae1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x23, #0x2, MUL VL]\n"
+ ".inst 0xc16e772c // sdot za.s[x11, 4], { z25.h-z26.h }, z14.h\n"
+ ".inst 0xc166772d // sdot za.s[x11, 5], { z25.h-z26.h }, z6.h\n"
+ ".inst 0xa1402aa6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc169774a // sdot za.s[x11, 2], { z26.h-z27.h }, z9.h\n"
+ ".inst 0xc161774b // sdot za.s[x11, 3], { z26.h-z27.h }, z1.h\n"
+ ".inst 0xa1412ac1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
+ ".inst 0xc1637768 // sdot za.s[x11, 0], { z27.h-z28.h }, z3.h\n"
+ ".inst 0xc1627769 // sdot za.s[x11, 1], { z27.h-z28.h }, z2.h\n"
+ ".inst 0xa0422aea // ld1h { z10.h-z11.h }, pn10.b/Z, [x23, #0x4, MUL VL]\n"
+ ".inst 0xc16e772e // sdot za.s[x11, 6], { z25.h-z26.h }, z14.h\n"
+ ".inst 0xc166772f // sdot za.s[x11, 7], { z25.h-z26.h }, z6.h\n"
+ ".inst 0xa0402a8e // ld1h { z14.h-z15.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc169774c // sdot za.s[x11, 4], { z26.h-z27.h }, z9.h\n"
+ ".inst 0xc161774d // sdot za.s[x11, 5], { z26.h-z27.h }, z1.h\n"
+ ".inst 0xa1412aa5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc16b776a // sdot za.s[x11, 2], { z27.h-z28.h }, z11.h\n"
+ ".inst 0xc16a776b // sdot za.s[x11, 3], { z27.h-z28.h }, z10.h\n"
+ ".inst 0xa0422ac2 // ld1h { z2.h-z3.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
+ ".inst 0xc16d774e // sdot za.s[x11, 6], { z26.h-z27.h }, z13.h\n"
+ ".inst 0xc165774f // sdot za.s[x11, 7], { z26.h-z27.h }, z5.h\n"
+ ".inst 0xa0412a88 // ld1h { z8.h-z9.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc163776c // sdot za.s[x11, 4], { z27.h-z28.h }, z3.h\n"
+ ".inst 0xc162776d // sdot za.s[x11, 5], { z27.h-z28.h }, z2.h\n"
+ ".inst 0xa1422aa2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
+ ".inst 0xc16a776e // sdot za.s[x11, 6], { z27.h-z28.h }, z10.h\n"
+ ".inst 0xc162776f // sdot za.s[x11, 7], { z27.h-z28.h }, z2.h\n"
+ ".inst 0xa0422a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xc16f1728 // sdot za.s[x8, 0], { z25.h-z26.h }, z15.h\n"
+ ".inst 0xc16e1729 // sdot za.s[x8, 1], { z25.h-z26.h }, z14.h\n"
+ ".inst 0xc1691748 // sdot za.s[x8, 0], { z26.h-z27.h }, z9.h\n"
+ ".inst 0xc1681749 // sdot za.s[x8, 1], { z26.h-z27.h }, z8.h\n"
+ ".inst 0xc1611768 // sdot za.s[x8, 0], { z27.h-z28.h }, z1.h\n"
+ ".inst 0xc1601769 // sdot za.s[x8, 1], { z27.h-z28.h }, z0.h\n"
+ "add x8, x8, #0x2\n"
+ ".inst 0xc0066808 // mova { z8.d-z9.d }, za.d[x11, #0]\n"
+ ".inst 0xc006682a // mova { z10.d-z11.d }, za.d[x11, #1]\n"
+ ".inst 0xc1a7ac08 // sqdmulh { z8.s-z11.s }, { z8.s-z11.s }, z7.s\n"
+ "add x11, x11, #0x2\n"
+ ".inst 0xc1a4aa28 // srshl { z8.s-z11.s }, { z8.s-z11.s }, z4.s\n"
+ ".inst 0xc0040a40 // mova za.d[x8, #0], { z18.d-z19.d }\n"
+ ".inst 0xc1acab08 // add { z8.s-z11.s }, { z8.s-z11.s }, z12.s\n"
+ ".inst 0xc0040a41 // mova za.d[x8, #1], { z18.d-z19.d }\n"
+ ".inst 0xc1b0cf08 // sclamp { z8.s-z11.s }, z24.s, z16.s\n"
+ "st1b { z8.s }, p1, [x14]\n"
+ "add x14, x14, x4\n"
+ "st1b { z10.s }, p1, [x13]\n"
+ "add x13, x13, x10\n"
+ "st1b { z9.s }, p1, [x9]\n"
+ "add x9, x9, x27\n"
+ "st1b { z11.s }, p1, [x28]\n"
+ "add x28, x28, x26\n"
+ "22:" // Main loop skip tail
+ "cbz x15, 24f\n"
+ "23:" // Right padding loop
+ ".inst 0xc0066808 // mova { z8.d-z9.d }, za.d[x11, #0]\n"
+ "add x8, x8, #0x2\n"
+ "subs x15, x15, #0x1\n"
+ ".inst 0xc006682a // mova { z10.d-z11.d }, za.d[x11, #1]\n"
+ ".inst 0xc1a7ac08 // sqdmulh { z8.s-z11.s }, { z8.s-z11.s }, z7.s\n"
+ "add x11, x11, #0x2\n"
+ ".inst 0xc1a4aa28 // srshl { z8.s-z11.s }, { z8.s-z11.s }, z4.s\n"
+ ".inst 0xc0040a40 // mova za.d[x8, #0], { z18.d-z19.d }\n"
+ ".inst 0xc1acab08 // add { z8.s-z11.s }, { z8.s-z11.s }, z12.s\n"
+ ".inst 0xc0040a41 // mova za.d[x8, #1], { z18.d-z19.d }\n"
+ ".inst 0xc1b0cf08 // sclamp { z8.s-z11.s }, z24.s, z16.s\n"
+ "st1b { z8.s }, p1, [x14]\n"
+ "add x14, x14, x4\n"
+ "st1b { z10.s }, p1, [x13]\n"
+ "add x13, x13, x10\n"
+ "st1b { z9.s }, p1, [x9]\n"
+ "add x9, x9, x27\n"
+ "st1b { z11.s }, p1, [x28]\n"
+ "add x28, x28, x26\n"
+ "bgt 23b\n"
+ "24:" // End
+ "ldr x20, [%x[args], %[offsetof_Args_weights]]\n"
+ "incw x20, ALL, MUL #16\n"
+ "incw x20, ALL, MUL #9\n"
+ "str x20, [%x[args], %[offsetof_Args_weights]]\n"
+ "ldr x21, [%x[args], %[offsetof_Args_ld_in_vl]]\n"
+ "incw x17\n"
+ "whilelt p1.s, x17, x7\n"
+ "ldr x20, [%x[args], %[offsetof_Args_inptr]]\n"
+ "add x20, x20, x21\n"
+ "str x20, [%x[args], %[offsetof_Args_inptr]]\n"
+ "ldr x25, [%x[args], %[offsetof_Args_outptrs]]\n"
+ "ldr x24, [%x[args], %[offsetof_Args_ld_out_vls]]\n"
+ "ldp x23, x22, [x25, #0x0]\n"
+ "ldp x21, x20, [x24, #0x0]\n"
+ "add x23, x23, x21\n"
+ "add x22, x22, x20\n"
+ "stp x23, x22, [x25, #0x0]\n"
+ "ldp x23, x22, [x25, #0x10]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x23, x23, x21\n"
+ "add x22, x22, x20\n"
+ "stp x23, x22, [x25, #0x10]\n"
+ "b.any 1b\n"
+ "addvl SP, SP, #30\n"
+ ".inst 0xd503467f // SMSTOP\n"
+ :
+ : [args] "r" (&args), [ld_in_col] "r" (ld_in_col), [ld_in_row] "r" (ld_in_row), [offsetof_Args_current_channel] "I" (offsetof(Args, current_channel)), [offsetof_Args_inptr] "I" (offsetof(Args, inptr)), [offsetof_Args_input_cols] "I" (offsetof(Args, input_cols)), [offsetof_Args_ld_in_vl] "I" (offsetof(Args, ld_in_vl)), [offsetof_Args_ld_out_cols] "I" (offsetof(Args, ld_out_cols)), [offsetof_Args_ld_out_vls] "I" (offsetof(Args, ld_out_vls)), [offsetof_Args_n_channels] "I" (offsetof(Args, n_channels)), [offsetof_Args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_Args_output_cols] "I" (offsetof(Args, output_cols)), [offsetof_Args_pad_bottom] "I" (offsetof(Args, pad_bottom)), [offsetof_Args_pad_left] "I" (offsetof(Args, pad_left)), [offsetof_Args_pad_top] "I" (offsetof(Args, pad_top)), [offsetof_Args_weights] "I" (offsetof(Args, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_bias] "I" (offsetof(arm_gemm::Requantize32, bias)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [offsetof_Requantize32_per_channel_muls] "I" (offsetof(arm_gemm::Requantize32, per_channel_muls)), [offsetof_Requantize32_per_channel_right_shifts] "I" (offsetof(arm_gemm::Requantize32, per_channel_right_shifts)), [offsetof_Requantize32_per_layer_mul] "I" (offsetof(arm_gemm::Requantize32, per_layer_mul)), [offsetof_Requantize32_per_layer_right_shift] "I" (offsetof(arm_gemm::Requantize32, per_layer_right_shift)), [qp] "r" (&qp)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME2)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_5x5_s2_4rows_dot_za.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_5x5_s2_4rows_dot_za.hpp
new file mode 100644
index 0000000000..05aad19c09
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_5x5_s2_4rows_dot_za.hpp
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_conv/depthwise/depthwise_planar.hpp"
+
+namespace arm_conv {
+namespace depthwise {
+
+void sme2_u8q_planar_5x5_s2_4rows_dot_za_impl(  // SME2 ZA-tile u8-quantized planar depthwise kernel, 5x5 window, stride 2, 4 output rows; implemented in hand-written asm in the sibling generic.cpp — do not hand-edit
+ const uint8_t *inptr,  // base pointer to the (quantized u8) input tensor
+ size_t ld_in_row,  // stride between successive input rows — presumably in bytes (u8 elements); TODO confirm against caller
+ size_t ld_in_col,  // stride between successive input columns — presumably in bytes; TODO confirm
+ size_t ld_in_vl,  // stride to advance the input pointer by one vector-length's worth of channels
+ unsigned int pad_top,  // rows of implicit zero padding above the valid input
+ unsigned int valid_input_rows,  // number of non-padded input rows available
+ unsigned int pad_left,  // columns of implicit zero padding left of the valid input
+ unsigned int valid_input_cols,  // number of non-padded input columns available
+ const uint8_t *weights,  // packed quantized filter weights
+ uint8_t **outptrs,  // array of output row pointers (one per output row)
+ const size_t *outlds,  // per-output-row column strides
+ const size_t *outvllds,  // per-output-row vector-length strides
+ unsigned int output_cols,  // number of output columns to produce
+ unsigned int start_channel,  // first channel index handled by this call (for per-channel quant params)
+ unsigned int valid_channels,  // number of channels to process
+ const arm_gemm::Requantize32 &qp  // requantization parameters (offsets, multipliers, shifts, clamp bounds)
+);
+
+class sme2_u8q_planar_5x5_s2_4rows_dot_za : public PlanarStrategy<uint8_t, uint8_t>  // strategy wrapper registering the SME2 u8-quantized 5x5/stride-2 planar kernel with the depthwise framework
+{
+ using Parent = PlanarStrategy<uint8_t, uint8_t>;  // u8 input / u8 output planar strategy base
+
+ public:
+ using return_type = uint8_t;  // requantized output element type
+ constexpr static auto output_rows = 4u;  // output rows produced per kernel invocation
+ constexpr static auto kernel_rows = 5u, kernel_cols = 5u;  // 5x5 filter window
+ constexpr static auto stride_rows = 2u, stride_cols = 2u;  // stride-2 convolution in both dimensions
+ constexpr static auto vl_type = arm_gemm::VLType::SME;  // requires an SME-capable (scalable vector-length) CPU
+
+ sme2_u8q_planar_5x5_s2_4rows_dot_za(const CPUInfo *)  // CPUInfo unused: geometry is fixed by the constants above
+ : Parent(kernel_rows, kernel_cols, stride_rows, stride_cols, output_rows, vl_type)
+ {
+ }
+
+ typename Parent::KernelType get_kernel(void) const override  // expose the asm entry point to the planar driver
+ {
+ return sme2_u8q_planar_5x5_s2_4rows_dot_za_impl;  // defined in sme2_u8q_planar_5x5_s2_4rows_dot_za/generic.cpp
+ }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_5x5_s2_4rows_dot_za/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_5x5_s2_4rows_dot_za/generic.cpp
new file mode 100644
index 0000000000..6c144afa77
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_5x5_s2_4rows_dot_za/generic.cpp
@@ -0,0 +1,1354 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if defined(ARM_COMPUTE_ENABLE_SME2)
+
+#include <algorithm>
+#include <cstddef>
+#include "arm_gemm.hpp"
+
+using arm_gemm::Requantize32;
+
+namespace arm_conv {
+namespace depthwise {
+
+void sme2_u8q_planar_5x5_s2_4rows_dot_za_impl(
+ const uint8_t *inptr,
+ size_t ld_in_row,
+ size_t ld_in_col,
+ size_t ld_in_vl,
+ unsigned int pad_top,
+ unsigned int valid_input_rows,
+ unsigned int pad_left,
+ unsigned int valid_input_cols,
+ const uint8_t *weights,
+ uint8_t **outptrs,
+ const size_t *outlds,
+ const size_t *outvllds,
+ unsigned int output_cols,
+ unsigned int start_channel,
+ unsigned int valid_channels,
+ const arm_gemm::Requantize32 &qp
+)
+{
+ struct Args
+ {
+ const uint8_t *inptr;
+ size_t ld_in_vl;
+ long unsigned int pad_top, pad_bottom, pad_left;
+ const uint8_t *weights;
+ long unsigned int input_cols, output_cols;
+ uint8_t **outptrs;
+ const size_t *ld_out_cols;
+ const size_t *ld_out_vls;
+ long unsigned int current_channel, n_channels;
+ };
+
+ Args args = { inptr, ld_in_vl, pad_top, 11u - std::min(11u, pad_top + valid_input_rows), pad_left, weights, valid_input_cols, output_cols, outptrs, outlds, outvllds, start_channel, valid_channels };
+
+ __asm__ __volatile__(
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "ldr x3, [%x[args], %[offsetof_Args_pad_bottom]]\n"
+ "ptrue p2.b\n"
+ "mov x20, #0xb\n"
+ "ldr x4, [%x[args], %[offsetof_Args_pad_top]]\n"
+ "ld1rh { z7.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_a_offset]]\n"
+ "sub x20, x20, x3\n"
+ ".inst 0x25207812 // ptrue pn10.b\n"
+ "ldr x5, [%x[args], %[offsetof_Args_n_channels]]\n"
+ "whilelt p1.s, XZR, x5\n"
+ "whilelt p9.s, XZR, x20\n"
+ "ld1rw { z10.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_c_offset]]\n"
+ "whilelt p8.s, XZR, x4\n"
+ "addvl SP, SP, #-15\n"
+ "ldr x6, [%x[args], %[offsetof_Args_current_channel]]\n"
+ "neg z7.h, p2/M, z7.h\n"
+ "eor p8.b, p2/Z, p8.b, p9.b\n"
+ "ld1rw { z6.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_per_layer_mul]]\n"
+ "ld1rw { z4.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_per_layer_right_shift]]\n"
+ "ld1rw { z5.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_minval]]\n"
+ "ld1rw { z21.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_maxval]]\n"
+ "1:" // Channel loop
+ "ldr x20, [%x[qp], %[offsetof_Requantize32_bias]]\n"
+ "mov z12.s, #0x0\n"
+ "cbz x20, 2f\n"
+ "ld1w { z12.s }, p1/Z, [x20, x6, LSL #2]\n"
+ "2:" // Load bias: Done
+ "ldr x22, [%x[args], %[offsetof_Args_weights]]\n"
+ "mov x20, x22\n"
+ "ld1b { z13.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "ld1rh { z28.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_b_offset]]\n"
+ "sub z13.h, z13.h, z28.h\n"
+ "incw x22\n"
+ "mov z26.h, #0x0\n"
+ "ld1b { z22.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "sub z22.h, z22.h, z28.h\n"
+ "trn1 z17.h, z13.h, z22.h\n"
+ "ld1b { z20.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "sub z20.h, z20.h, z28.h\n"
+ "addvl x21, SP, #15\n"
+ "ld1b { z1.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "sub z1.h, z1.h, z28.h\n"
+ "trn1 z29.h, z20.h, z1.h\n"
+ "ld1b { z27.s }, p2/Z, [x20]\n"
+ "mov x20, x22\n"
+ "sub z27.h, z27.h, z28.h\n"
+ "incw x22\n"
+ "ld1b { z14.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "sub z14.h, z14.h, z28.h\n"
+ "addvl x21, x21, #-3\n"
+ "ld1b { z18.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "sub z18.h, z18.h, z28.h\n"
+ "trn1 z22.h, z27.h, z26.h\n"
+ "ld1b { z23.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "sub z23.h, z23.h, z28.h\n"
+ "st1h { z17.h }, p2, [x21]\n"
+ "ld1b { z30.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "sub z30.h, z30.h, z28.h\n"
+ "trn1 z8.h, z14.h, z18.h\n"
+ "ld1b { z15.s }, p2/Z, [x20]\n"
+ "mov x20, x22\n"
+ "st1h { z29.h }, p2, [x21, #1, MUL VL]\n"
+ "sub z15.h, z15.h, z28.h\n"
+ "ld1b { z20.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "trn1 z23.h, z23.h, z30.h\n"
+ "sub z20.h, z20.h, z28.h\n"
+ "ld1b { z24.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "sub z24.h, z24.h, z28.h\n"
+ "st1h { z22.h }, p2, [x21, #2, MUL VL]\n"
+ "ld1b { z16.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "trn1 z0.h, z15.h, z26.h\n"
+ "incw x22\n"
+ "ld1b { z13.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "sub z16.h, z16.h, z28.h\n"
+ "sub z13.h, z13.h, z28.h\n"
+ "ld1b { z11.s }, p2/Z, [x20]\n"
+ "addvl x21, x21, #-3\n"
+ "mov x20, x22\n"
+ "st1h { z8.h }, p2, [x21]\n"
+ "trn1 z27.h, z20.h, z24.h\n"
+ "ld1b { z22.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "sub z11.h, z11.h, z28.h\n"
+ "ld1b { z3.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "st1h { z23.h }, p2, [x21, #1, MUL VL]\n"
+ "trn1 z20.h, z16.h, z13.h\n"
+ "ld1b { z13.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "sub z22.h, z22.h, z28.h\n"
+ "sub z3.h, z3.h, z28.h\n"
+ "ld1b { z15.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "st1h { z0.h }, p2, [x21, #2, MUL VL]\n"
+ "trn1 z29.h, z11.h, z26.h\n"
+ "ld1b { z16.s }, p2/Z, [x20]\n"
+ "incw x22\n"
+ "sub z13.h, z13.h, z28.h\n"
+ "sub z15.h, z15.h, z28.h\n"
+ "addvl x21, x21, #-3\n"
+ "mov x20, x22\n"
+ "st1h { z27.h }, p2, [x21]\n"
+ "sub z16.h, z16.h, z28.h\n"
+ "trn1 z19.h, z22.h, z3.h\n"
+ "ld1b { z17.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "st1h { z20.h }, p2, [x21, #1, MUL VL]\n"
+ "ld1b { z0.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "trn1 z31.h, z13.h, z15.h\n"
+ "st1h { z29.h }, p2, [x21, #2, MUL VL]\n"
+ "ld1b { z18.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "trn1 z16.h, z16.h, z26.h\n"
+ "sub z17.h, z17.h, z28.h\n"
+ "ld1b { z22.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "sub z0.h, z0.h, z28.h\n"
+ "sub z18.h, z18.h, z28.h\n"
+ "ld1b { z1.s }, p2/Z, [x20]\n"
+ "sub z22.h, z22.h, z28.h\n"
+ "sub z1.h, z1.h, z28.h\n"
+ "ldr x20, [%x[qp], %[offsetof_Requantize32_per_channel_muls]]\n"
+ "addvl x21, x21, #-3\n"
+ "st1h { z19.h }, p2, [x21]\n"
+ "mov z13.d, z12.d\n"
+ "mov z14.d, z12.d\n"
+ "st1h { z31.h }, p2, [x21, #1, MUL VL]\n"
+ "mov z15.d, z12.d\n"
+ "trn1 z8.h, z17.h, z0.h\n"
+ "st1h { z16.h }, p2, [x21, #2, MUL VL]\n"
+ "addvl x21, x21, #-3\n"
+ "trn1 z31.h, z18.h, z22.h\n"
+ "trn1 z29.h, z1.h, z26.h\n"
+ "st1h { z8.h }, p2, [x21]\n"
+ "st1h { z31.h }, p2, [x21, #1, MUL VL]\n"
+ "st1h { z29.h }, p2, [x21, #2, MUL VL]\n"
+ "cbz x20, 3f\n"
+ "ld1w { z6.s }, p1/Z, [x20, x6, LSL #2]\n"
+ "3:" // Load mul: End
+ "ldr x20, [%x[qp], %[offsetof_Requantize32_per_channel_right_shifts]]\n"
+ "cbz x20, 4f\n"
+ "ld1w { z4.s }, p1/Z, [x20, x6, LSL #2]\n"
+ "4:" // Load right_shift: End
+ "ldr x7, [%x[args], %[offsetof_Args_input_cols]]\n"
+ "sub x20, x7, #0x1\n"
+ "orr x23, x20, %x[ld_in_col], LSL #16\n"
+ "ldr x17, [%x[args], %[offsetof_Args_inptr]]\n"
+ "orr x23, x5, x23, LSL #22\n"
+ "mov x22, #0xb\n"
+ "add x21, x4, x3\n"
+ "lsl x20, %x[ld_in_row], #0x0\n"
+ "ldr x16, [%x[args], %[offsetof_Args_output_cols]]\n"
+ "mov x8, #0x0\n"
+ "lsl x23, x23, #0x0\n"
+ "sub x22, x22, x21\n"
+ "madd x20, x20, x4, x17\n"
+ "5:" // Issue prefetches
+ "subs x22, x22, #0x1\n"
+ ".inst 0xf8b74a9c // rprfm pldstrm, x23, [x20]\n"
+ "add x20, x20, %x[ld_in_col]\n"
+ "bgt 5b\n"
+ "ldr x23, [%x[args], %[offsetof_Args_outptrs]]\n"
+ "lsl x20, %x[ld_in_row], #0x0\n"
+ "msub x17, x4, x20, x17\n"
+ ".inst 0xc0040d80 // mova za.d[x8, #0], { z12.d-z15.d }\n"
+ "ldr x20, [%x[args], %[offsetof_Args_ld_out_cols]]\n"
+ ".inst 0xc0040d81 // mova za.d[x8, #1], { z12.d-z15.d }\n"
+ "mov x22, #0x4\n"
+ "ldp x15, x14, [x23], #0x10\n"
+ ".inst 0xc0040d82 // mova za.d[x8, #2], { z12.d-z15.d }\n"
+ "ldp x13, x11, [x20], #0x10\n"
+ ".inst 0xc0040d83 // mova za.d[x8, #3], { z12.d-z15.d }\n"
+ "ldr x21, [%x[args], %[offsetof_Args_pad_left]]\n"
+ ".inst 0xc0040d84 // mova za.d[x8, #4], { z12.d-z15.d }\n"
+ "ldp x10, x9, [x23], #0x10\n"
+ "ldp x28, x27, [x20], #0x10\n"
+ "cbz x21, 7f\n"
+ "cmp x21, x22\n"
+ "csel x20, x21, x22, LT\n"
+ "sub x21, x21, x20\n"
+ "sub x22, x22, x20\n"
+ "cbz x21, 7f\n"
+ ".inst 0xc0060c1c // mova { z28.d-z31.d }, za.d[x8, #0]\n"
+ ".inst 0xc1a6ac1c // sqdmulh { z28.s-z31.s }, { z28.s-z31.s }, z6.s\n"
+ "and x22, x21, #0x1\n"
+ ".inst 0xc1a4aa3c // srshl { z28.s-z31.s }, { z28.s-z31.s }, z4.s\n"
+ "add x21, x21, #0x1\n"
+ "lsr x21, x21, #0x1\n"
+ ".inst 0xc1aaab1c // add { z28.s-z31.s }, { z28.s-z31.s }, z10.s\n"
+ "sub x16, x16, x21\n"
+ ".inst 0xc1b5ccbc // sclamp { z28.s-z31.s }, z5.s, z21.s\n"
+ "6:" // Left padding
+ "subs x21, x21, #0x1\n"
+ "st1b { z28.s }, p1, [x15]\n"
+ "add x15, x15, x13\n"
+ "st1b { z29.s }, p1, [x14]\n"
+ "add x14, x14, x11\n"
+ "st1b { z30.s }, p1, [x10]\n"
+ "add x10, x10, x28\n"
+ "st1b { z31.s }, p1, [x9]\n"
+ "add x9, x9, x27\n"
+ "bgt 6b\n"
+ "7:" // Left padding: End
+ "adds XZR, x4, x3\n"
+ "bne 14f\n"
+ "cbz x22, 12f\n"
+ "cmp x22, #0x1\n"
+ "sub x7, x7, x22\n"
+ "beq 11f\n"
+ "cmp x22, #0x2\n"
+ "beq 10f\n"
+ "cmp x22, #0x3\n"
+ "beq 9f\n"
+ "8:" // Unpadded: 4 priming loads
+ "add x21, x17, %x[ld_in_row]\n"
+ "ld1b { z27.s }, p1/Z, [x17]\n"
+ "addvl x20, SP, #12\n"
+ "ld1b { z0.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z27.h, z27.h, z0.h\n"
+ "add z27.h, z27.h, z7.h\n"
+ "ld1b { z28.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "add x17, x17, %x[ld_in_col]\n"
+ "ld1b { z11.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z28.h, z28.h, z11.h\n"
+ "add z28.h, z28.h, z7.h\n"
+ "ld1b { z29.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "ld1b { z8.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z29.h, z29.h, z8.h\n"
+ "add z29.h, z29.h, z7.h\n"
+ "ld1b { z30.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "ld1b { z17.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z30.h, z30.h, z17.h\n"
+ "add z30.h, z30.h, z7.h\n"
+ "ld1b { z31.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "ld1b { z26.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z31.h, z31.h, z26.h\n"
+ "add z31.h, z31.h, z7.h\n"
+ ".inst 0xa1402a80 // ld1h { z0.h, z8.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc1701768 // sdot za.s[x8, 0], { z27.h-z30.h }, z0.h\n"
+ "ld1b { z20.s }, p1/Z, [x21]\n"
+ "mov z0.d, z20.d\n"
+ "add z0.h, z0.h, z7.h\n"
+ ".inst 0xc1781788 // sdot za.s[x8, 0], { z28.h-z31.h }, z8.h\n"
+ "ld1h { z8.h }, p2/Z, [x20, #2, MUL VL]\n"
+ ".inst 0xc17817a8 // sdot za.s[x8, 0], { z29.h-z0.h }, z8.h\n"
+ "9:" // Unpadded: 3 priming loads
+ "add x21, x17, %x[ld_in_row]\n"
+ "ld1b { z29.s }, p1/Z, [x17]\n"
+ "addvl x20, SP, #9\n"
+ "ld1b { z17.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z29.h, z29.h, z17.h\n"
+ "add z29.h, z29.h, z7.h\n"
+ "ld1b { z30.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "add x17, x17, %x[ld_in_col]\n"
+ "ld1b { z0.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z30.h, z30.h, z0.h\n"
+ "add z30.h, z30.h, z7.h\n"
+ "ld1b { z31.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "ld1b { z16.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z31.h, z31.h, z16.h\n"
+ "add z31.h, z31.h, z7.h\n"
+ "ld1b { z0.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "ld1b { z16.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z0.h, z0.h, z16.h\n"
+ "add z0.h, z0.h, z7.h\n"
+ "ld1b { z1.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "ld1b { z16.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z1.h, z1.h, z16.h\n"
+ "add z1.h, z1.h, z7.h\n"
+ ".inst 0xa0402a82 // ld1h { z2.h-z3.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc17217a8 // sdot za.s[x8, 0], { z29.h-z0.h }, z2.h\n"
+ "ld1b { z16.s }, p1/Z, [x21]\n"
+ "mov z2.d, z16.d\n"
+ "add z2.h, z2.h, z7.h\n"
+ ".inst 0xc17317c8 // sdot za.s[x8, 0], { z30.h-z1.h }, z3.h\n"
+ "ld1h { z8.h }, p2/Z, [x20, #2, MUL VL]\n"
+ ".inst 0xc17817e8 // sdot za.s[x8, 0], { z31.h-z2.h }, z8.h\n"
+ "10:" // Unpadded: 2 priming loads
+ "add x22, x17, %x[ld_in_row]\n"
+ "ld1b { z26.s }, p1/Z, [x17]\n"
+ "addvl x21, SP, #6\n"
+ "ld1b { z16.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "trn1 z26.h, z26.h, z16.h\n"
+ "add z26.h, z26.h, z7.h\n"
+ "ld1b { z27.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "addvl x20, SP, #12\n"
+ "ld1b { z16.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "trn1 z27.h, z27.h, z16.h\n"
+ "add z27.h, z27.h, z7.h\n"
+ "ld1b { z28.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "add x17, x17, %x[ld_in_col]\n"
+ "ld1b { z29.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "trn1 z28.h, z28.h, z29.h\n"
+ "add z28.h, z28.h, z7.h\n"
+ "ld1b { z29.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "ld1b { z19.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "trn1 z29.h, z29.h, z19.h\n"
+ "add z29.h, z29.h, z7.h\n"
+ "ld1b { z30.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "ld1b { z23.s }, p1/Z, [x22]\n"
+ "trn1 z30.h, z30.h, z23.h\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "add z30.h, z30.h, z7.h\n"
+ ".inst 0xa0402aa2 // ld1h { z2.h-z3.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc1721748 // sdot za.s[x8, 0], { z26.h-z29.h }, z2.h\n"
+ "ld1b { z22.s }, p1/Z, [x22]\n"
+ "mov z31.d, z22.d\n"
+ ".inst 0xc1731768 // sdot za.s[x8, 0], { z27.h-z30.h }, z3.h\n"
+ ".inst 0xa1402a83 // ld1h { z3.h, z11.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc1731749 // sdot za.s[x8, 1], { z26.h-z29.h }, z3.h\n"
+ "add z31.h, z31.h, z7.h\n"
+ "ld1h { z3.h }, p2/Z, [x21, #2, MUL VL]\n"
+ ".inst 0xc17b1769 // sdot za.s[x8, 1], { z27.h-z30.h }, z11.h\n"
+ ".inst 0xc1731788 // sdot za.s[x8, 0], { z28.h-z31.h }, z3.h\n"
+ "ld1h { z0.h }, p2/Z, [x20, #2, MUL VL]\n"
+ ".inst 0xc1701789 // sdot za.s[x8, 1], { z28.h-z31.h }, z0.h\n"
+ "11:" // Unpadded: 1 priming loads
+ "add x22, x17, %x[ld_in_row]\n"
+ "ld1b { z29.s }, p1/Z, [x17]\n"
+ "addvl x21, SP, #3\n"
+ "ld1b { z22.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "trn1 z29.h, z29.h, z22.h\n"
+ "add z29.h, z29.h, z7.h\n"
+ "ld1b { z30.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "addvl x20, SP, #9\n"
+ "ld1b { z25.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "trn1 z30.h, z30.h, z25.h\n"
+ "add z30.h, z30.h, z7.h\n"
+ "ld1b { z31.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "add x17, x17, %x[ld_in_col]\n"
+ "ld1b { z16.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "trn1 z31.h, z31.h, z16.h\n"
+ "add z31.h, z31.h, z7.h\n"
+ "ld1b { z0.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "ld1b { z16.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "trn1 z0.h, z0.h, z16.h\n"
+ "add z0.h, z0.h, z7.h\n"
+ "ld1b { z1.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "ld1b { z2.s }, p1/Z, [x22]\n"
+ "trn1 z1.h, z1.h, z2.h\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "add z1.h, z1.h, z7.h\n"
+ ".inst 0xa0402aa2 // ld1h { z2.h-z3.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc17217a8 // sdot za.s[x8, 0], { z29.h-z0.h }, z2.h\n"
+ "ld1b { z24.s }, p1/Z, [x22]\n"
+ "mov z2.d, z24.d\n"
+ ".inst 0xc17317c8 // sdot za.s[x8, 0], { z30.h-z1.h }, z3.h\n"
+ ".inst 0xa0402a88 // ld1h { z8.h-z9.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc17817a9 // sdot za.s[x8, 1], { z29.h-z0.h }, z8.h\n"
+ "add z2.h, z2.h, z7.h\n"
+ "ld1h { z3.h }, p2/Z, [x21, #2, MUL VL]\n"
+ ".inst 0xc17917c9 // sdot za.s[x8, 1], { z30.h-z1.h }, z9.h\n"
+ ".inst 0xc17317e8 // sdot za.s[x8, 0], { z31.h-z2.h }, z3.h\n"
+ "ld1h { z3.h }, p2/Z, [x20, #2, MUL VL]\n"
+ ".inst 0xc17317e9 // sdot za.s[x8, 1], { z31.h-z2.h }, z3.h\n"
+ "12:" // Unpadded: 0 priming loads
+ "cmp x7, #0x2\n"
+ ".inst 0xa1402be3 // ld1h { z3.h, z11.h }, pn10.b/Z, [SP]\n"
+ "ld1h { z2.h }, p2/Z, [SP, #2, MUL VL]\n"
+ "blt 22f\n"
+ "add x21, x17, %x[ld_in_row]\n"
+ "ld1b { z23.s }, p1/Z, [x17]\n"
+ "sub x7, x7, #0x2\n"
+ "ld1b { z25.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z23.h, z23.h, z25.h\n"
+ "sub x16, x16, #0x1\n"
+ "ld1b { z24.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "lsr x20, x7, #0x1\n"
+ "add z23.h, z23.h, z7.h\n"
+ "ld1b { z30.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z24.h, z24.h, z30.h\n"
+ "cmp x20, x16\n"
+ "ld1b { z25.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "csel x26, x20, x16, LT\n"
+ "add z24.h, z24.h, z7.h\n"
+ "ld1b { z22.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z25.h, z25.h, z22.h\n"
+ "add z25.h, z25.h, z7.h\n"
+ "ld1b { z26.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "add x17, x17, %x[ld_in_col]\n"
+ "ld1b { z22.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z26.h, z26.h, z22.h\n"
+ "add z26.h, z26.h, z7.h\n"
+ "ld1b { z27.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "and x7, x7, #0x1\n"
+ "ld1b { z30.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z27.h, z27.h, z30.h\n"
+ "add z27.h, z27.h, z7.h\n"
+ "ld1b { z28.s }, p1/Z, [x21]\n"
+ "mov z28.d, z28.d\n"
+ "add z28.h, z28.h, z7.h\n"
+ "sub x16, x16, x26\n"
+ "cbz x26, 21f\n"
+ "13:" // Unpadded: Main loop
+ ".inst 0xc17316e8 // sdot za.s[x8, 0], { z23.h-z26.h }, z3.h\n"
+ "addvl x25, SP, #6\n"
+ "addvl x24, SP, #12\n"
+ ".inst 0xc17b1708 // sdot za.s[x8, 0], { z24.h-z27.h }, z11.h\n"
+ ".inst 0xa0402b20 // ld1h { z0.h-z1.h }, pn10.b/Z, [x25]\n"
+ "add x23, x17, %x[ld_in_row]\n"
+ "addvl x22, SP, #3\n"
+ ".inst 0xc17016e9 // sdot za.s[x8, 1], { z23.h-z26.h }, z0.h\n"
+ "addvl x21, SP, #9\n"
+ "subs x26, x26, #0x1\n"
+ ".inst 0xc1711709 // sdot za.s[x8, 1], { z24.h-z27.h }, z1.h\n"
+ ".inst 0xa0402b08 // ld1h { z8.h-z9.h }, pn10.b/Z, [x24]\n"
+ ".inst 0xc17816ea // sdot za.s[x8, 2], { z23.h-z26.h }, z8.h\n"
+ "ld1b { z23.s }, p1/Z, [x17]\n"
+ "add x17, x17, %x[ld_in_col]\n"
+ "add x20, x17, %x[ld_in_row]\n"
+ ".inst 0xc1721728 // sdot za.s[x8, 0], { z25.h-z28.h }, z2.h\n"
+ "ld1h { z0.h }, p2/Z, [x25, #2, MUL VL]\n"
+ ".inst 0xc179170a // sdot za.s[x8, 2], { z24.h-z27.h }, z9.h\n"
+ "ld1b { z16.s }, p1/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row]\n"
+ "trn1 z23.h, z23.h, z16.h\n"
+ ".inst 0xc1701729 // sdot za.s[x8, 1], { z25.h-z28.h }, z0.h\n"
+ "ld1h { z9.h }, p2/Z, [x24, #2, MUL VL]\n"
+ "add z23.h, z23.h, z7.h\n"
+ "ld1b { z24.s }, p1/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row]\n"
+ ".inst 0xc179172a // sdot za.s[x8, 2], { z25.h-z28.h }, z9.h\n"
+ "ld1b { z18.s }, p1/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row]\n"
+ "trn1 z24.h, z24.h, z18.h\n"
+ "add z24.h, z24.h, z7.h\n"
+ "ld1b { z25.s }, p1/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row]\n"
+ ".inst 0xc0060c10 // mova { z16.d-z19.d }, za.d[x8, #0]\n"
+ "add x8, x8, #0x1\n"
+ "ld1b { z8.s }, p1/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row]\n"
+ "trn1 z25.h, z25.h, z8.h\n"
+ "add z25.h, z25.h, z7.h\n"
+ "ld1b { z26.s }, p1/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row]\n"
+ ".inst 0xc1a6ac10 // sqdmulh { z16.s-z19.s }, { z16.s-z19.s }, z6.s\n"
+ "ld1b { z28.s }, p1/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row]\n"
+ "trn1 z26.h, z26.h, z28.h\n"
+ "add z26.h, z26.h, z7.h\n"
+ "ld1b { z27.s }, p1/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row]\n"
+ ".inst 0xc1a4aa30 // srshl { z16.s-z19.s }, { z16.s-z19.s }, z4.s\n"
+ "ld1b { z28.s }, p1/Z, [x23]\n"
+ "trn1 z27.h, z27.h, z28.h\n"
+ "add x23, x23, %x[ld_in_row]\n"
+ "add z27.h, z27.h, z7.h\n"
+ ".inst 0xa0402ac2 // ld1h { z2.h-z3.h }, pn10.b/Z, [x22]\n"
+ ".inst 0xc17216e8 // sdot za.s[x8, 0], { z23.h-z26.h }, z2.h\n"
+ ".inst 0xc1aaab10 // add { z16.s-z19.s }, { z16.s-z19.s }, z10.s\n"
+ "ld1b { z20.s }, p1/Z, [x23]\n"
+ "mov z28.d, z20.d\n"
+ ".inst 0xc1731708 // sdot za.s[x8, 0], { z24.h-z27.h }, z3.h\n"
+ ".inst 0xa0402aa0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc17016e9 // sdot za.s[x8, 1], { z23.h-z26.h }, z0.h\n"
+ "add z28.h, z28.h, z7.h\n"
+ "ld1h { z0.h }, p2/Z, [x22, #2, MUL VL]\n"
+ ".inst 0xc1711709 // sdot za.s[x8, 1], { z24.h-z27.h }, z1.h\n"
+ ".inst 0xc1b5ccb0 // sclamp { z16.s-z19.s }, z5.s, z21.s\n"
+ ".inst 0xc1701728 // sdot za.s[x8, 0], { z25.h-z28.h }, z0.h\n"
+ "ld1h { z0.h }, p2/Z, [x21, #2, MUL VL]\n"
+ "st1b { z16.s }, p1, [x15]\n"
+ "add x15, x15, x13\n"
+ "ld1b { z23.s }, p1/Z, [x17]\n"
+ ".inst 0xc1701729 // sdot za.s[x8, 1], { z25.h-z28.h }, z0.h\n"
+ "st1b { z17.s }, p1, [x14]\n"
+ "add x14, x14, x11\n"
+ "ld1b { z16.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z23.h, z23.h, z16.h\n"
+ "st1b { z18.s }, p1, [x10]\n"
+ "ld1b { z24.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "add x10, x10, x28\n"
+ "st1b { z19.s }, p1, [x9]\n"
+ "ld1b { z16.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z24.h, z24.h, z16.h\n"
+ "add x9, x9, x27\n"
+ "ld1b { z25.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0xc0040d84 // mova za.d[x8, #4], { z12.d-z15.d }\n"
+ "add z23.h, z23.h, z7.h\n"
+ "ld1b { z16.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z25.h, z25.h, z16.h\n"
+ "add z24.h, z24.h, z7.h\n"
+ "ld1b { z26.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "add z25.h, z25.h, z7.h\n"
+ "add x17, x17, %x[ld_in_col]\n"
+ "ld1b { z16.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z26.h, z26.h, z16.h\n"
+ "add z26.h, z26.h, z7.h\n"
+ "ld1b { z27.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "ld1b { z16.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z27.h, z27.h, z16.h\n"
+ "add z27.h, z27.h, z7.h\n"
+ "ld1b { z16.s }, p1/Z, [x20]\n"
+ "mov z28.d, z16.d\n"
+ "add z28.h, z28.h, z7.h\n"
+ ".inst 0xa1402be3 // ld1h { z3.h, z11.h }, pn10.b/Z, [SP]\n"
+ "ld1h { z2.h }, p2/Z, [SP, #2, MUL VL]\n"
+ "bgt 13b\n"
+ "b 21f\n"
+ "14:" // Padded
+ "cbz x22, 19f\n"
+ "cmp x22, #0x1\n"
+ "sub x7, x7, x22\n"
+ "beq 18f\n"
+ "cmp x22, #0x2\n"
+ "beq 17f\n"
+ "cmp x22, #0x3\n"
+ "beq 16f\n"
+ "15:" // Padded: 4 priming loads
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z27.s }, p0/Z, [x17]\n"
+ "add z27.h, p0/M, z27.h, z7.h\n"
+ "add x21, x17, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z17.s }, p0/Z, [x21]\n"
+ "add z17.h, p0/M, z17.h, z7.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z28.s }, p0/Z, [x21]\n"
+ "add z28.h, p0/M, z28.h, z7.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z16.s }, p0/Z, [x21]\n"
+ "add z16.h, p0/M, z16.h, z7.h\n"
+ "mov x12, #0x4\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z27.h, z27.h, z17.h\n"
+ "trn1 z28.h, z28.h, z16.h\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z29.s }, p0/Z, [x21]\n"
+ "add z29.h, p0/M, z29.h, z7.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z18.s }, p0/Z, [x21]\n"
+ "add z18.h, p0/M, z18.h, z7.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z30.s }, p0/Z, [x21]\n"
+ "add z30.h, p0/M, z30.h, z7.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z17.s }, p0/Z, [x21]\n"
+ "mov x12, #0x8\n"
+ "add z17.h, p0/M, z17.h, z7.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z31.s }, p0/Z, [x21]\n"
+ "add z31.h, p0/M, z31.h, z7.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z16.s }, p0/Z, [x21]\n"
+ "add z16.h, p0/M, z16.h, z7.h\n"
+ "addvl x20, SP, #12\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z29.h, z29.h, z18.h\n"
+ "trn1 z30.h, z30.h, z17.h\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ ".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
+ "trn1 z31.h, z31.h, z16.h\n"
+ ".inst 0xc1701768 // sdot za.s[x8, 0], { z27.h-z30.h }, z0.h\n"
+ "ld1b { z20.s }, p0/Z, [x21]\n"
+ "add z20.h, p0/M, z20.h, z7.h\n"
+ "mov z0.d, z20.d\n"
+ "add x17, x17, %x[ld_in_col]\n"
+ ".inst 0xc1711788 // sdot za.s[x8, 0], { z28.h-z31.h }, z1.h\n"
+ "ld1h { z1.h }, p2/Z, [x20, #2, MUL VL]\n"
+ ".inst 0xc17117a8 // sdot za.s[x8, 0], { z29.h-z0.h }, z1.h\n"
+ "16:" // Padded: 3 priming loads
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z24.s }, p0/Z, [x17]\n"
+ "add z24.h, p0/M, z24.h, z7.h\n"
+ "add x21, x17, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z17.s }, p0/Z, [x21]\n"
+ "add z17.h, p0/M, z17.h, z7.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z25.s }, p0/Z, [x21]\n"
+ "add z25.h, p0/M, z25.h, z7.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z16.s }, p0/Z, [x21]\n"
+ "add z16.h, p0/M, z16.h, z7.h\n"
+ "mov x12, #0x4\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z24.h, z24.h, z17.h\n"
+ "trn1 z25.h, z25.h, z16.h\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z26.s }, p0/Z, [x21]\n"
+ "add z26.h, p0/M, z26.h, z7.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z18.s }, p0/Z, [x21]\n"
+ "add z18.h, p0/M, z18.h, z7.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z27.s }, p0/Z, [x21]\n"
+ "add z27.h, p0/M, z27.h, z7.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z17.s }, p0/Z, [x21]\n"
+ "mov x12, #0x8\n"
+ "add z17.h, p0/M, z17.h, z7.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z28.s }, p0/Z, [x21]\n"
+ "add z28.h, p0/M, z28.h, z7.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z16.s }, p0/Z, [x21]\n"
+ "add z16.h, p0/M, z16.h, z7.h\n"
+ "addvl x20, SP, #9\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z26.h, z26.h, z18.h\n"
+ "trn1 z27.h, z27.h, z17.h\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ ".inst 0xa0402a82 // ld1h { z2.h-z3.h }, pn10.b/Z, [x20]\n"
+ "trn1 z28.h, z28.h, z16.h\n"
+ ".inst 0xc1721708 // sdot za.s[x8, 0], { z24.h-z27.h }, z2.h\n"
+ "ld1b { z11.s }, p0/Z, [x21]\n"
+ "add z11.h, p0/M, z11.h, z7.h\n"
+ "mov z29.d, z11.d\n"
+ "add x17, x17, %x[ld_in_col]\n"
+ ".inst 0xc1731728 // sdot za.s[x8, 0], { z25.h-z28.h }, z3.h\n"
+ "ld1h { z0.h }, p2/Z, [x20, #2, MUL VL]\n"
+ ".inst 0xc1701748 // sdot za.s[x8, 0], { z26.h-z29.h }, z0.h\n"
+ "17:" // Padded: 2 priming loads
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z25.s }, p0/Z, [x17]\n"
+ "add z25.h, p0/M, z25.h, z7.h\n"
+ "add x20, x17, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z17.s }, p0/Z, [x20]\n"
+ "add z17.h, p0/M, z17.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z26.s }, p0/Z, [x20]\n"
+ "add z26.h, p0/M, z26.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z16.s }, p0/Z, [x20]\n"
+ "add z16.h, p0/M, z16.h, z7.h\n"
+ "mov x12, #0x4\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z25.h, z25.h, z17.h\n"
+ "trn1 z26.h, z26.h, z16.h\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z27.s }, p0/Z, [x20]\n"
+ "add z27.h, p0/M, z27.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z18.s }, p0/Z, [x20]\n"
+ "add z18.h, p0/M, z18.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z28.s }, p0/Z, [x20]\n"
+ "add z28.h, p0/M, z28.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z17.s }, p0/Z, [x20]\n"
+ "mov x12, #0x8\n"
+ "add z17.h, p0/M, z17.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z29.s }, p0/Z, [x20]\n"
+ "add z29.h, p0/M, z29.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z16.s }, p0/Z, [x20]\n"
+ "add z16.h, p0/M, z16.h, z7.h\n"
+ "addvl x21, SP, #6\n"
+ "trn1 z27.h, z27.h, z18.h\n"
+ "trn1 z28.h, z28.h, z17.h\n"
+ ".inst 0xa1402aa1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x21]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "trn1 z29.h, z29.h, z16.h\n"
+ ".inst 0xc1711728 // sdot za.s[x8, 0], { z25.h-z28.h }, z1.h\n"
+ "ld1b { z1.s }, p0/Z, [x20]\n"
+ "addvl x20, SP, #12\n"
+ "add z1.h, p0/M, z1.h, z7.h\n"
+ ".inst 0xc1791748 // sdot za.s[x8, 0], { z26.h-z29.h }, z9.h\n"
+ ".inst 0xa0402a82 // ld1h { z2.h-z3.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc1721729 // sdot za.s[x8, 1], { z25.h-z28.h }, z2.h\n"
+ "mov z30.d, z1.d\n"
+ "add x17, x17, %x[ld_in_col]\n"
+ "ld1h { z9.h }, p2/Z, [x21, #2, MUL VL]\n"
+ ".inst 0xc1731749 // sdot za.s[x8, 1], { z26.h-z29.h }, z3.h\n"
+ ".inst 0xc1791768 // sdot za.s[x8, 0], { z27.h-z30.h }, z9.h\n"
+ "ld1h { z0.h }, p2/Z, [x20, #2, MUL VL]\n"
+ ".inst 0xc1701769 // sdot za.s[x8, 1], { z27.h-z30.h }, z0.h\n"
+ "18:" // Padded: 1 priming loads
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z25.s }, p0/Z, [x17]\n"
+ "add z25.h, p0/M, z25.h, z7.h\n"
+ "add x20, x17, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z17.s }, p0/Z, [x20]\n"
+ "add z17.h, p0/M, z17.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z26.s }, p0/Z, [x20]\n"
+ "add z26.h, p0/M, z26.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z16.s }, p0/Z, [x20]\n"
+ "add z16.h, p0/M, z16.h, z7.h\n"
+ "mov x12, #0x4\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z25.h, z25.h, z17.h\n"
+ "trn1 z26.h, z26.h, z16.h\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z27.s }, p0/Z, [x20]\n"
+ "add z27.h, p0/M, z27.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z18.s }, p0/Z, [x20]\n"
+ "add z18.h, p0/M, z18.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z28.s }, p0/Z, [x20]\n"
+ "add z28.h, p0/M, z28.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z17.s }, p0/Z, [x20]\n"
+ "mov x12, #0x8\n"
+ "add z17.h, p0/M, z17.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z29.s }, p0/Z, [x20]\n"
+ "add z29.h, p0/M, z29.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z16.s }, p0/Z, [x20]\n"
+ "add z16.h, p0/M, z16.h, z7.h\n"
+ "addvl x21, SP, #3\n"
+ "trn1 z27.h, z27.h, z18.h\n"
+ "trn1 z28.h, z28.h, z17.h\n"
+ ".inst 0xa1402aa3 // ld1h { z3.h, z11.h }, pn10.b/Z, [x21]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "trn1 z29.h, z29.h, z16.h\n"
+ ".inst 0xc1731728 // sdot za.s[x8, 0], { z25.h-z28.h }, z3.h\n"
+ "ld1b { z0.s }, p0/Z, [x20]\n"
+ "addvl x20, SP, #9\n"
+ "add z0.h, p0/M, z0.h, z7.h\n"
+ ".inst 0xc17b1748 // sdot za.s[x8, 0], { z26.h-z29.h }, z11.h\n"
+ ".inst 0xa0402a82 // ld1h { z2.h-z3.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc1721729 // sdot za.s[x8, 1], { z25.h-z28.h }, z2.h\n"
+ "mov z30.d, z0.d\n"
+ "add x17, x17, %x[ld_in_col]\n"
+ "ld1h { z0.h }, p2/Z, [x21, #2, MUL VL]\n"
+ ".inst 0xc1731749 // sdot za.s[x8, 1], { z26.h-z29.h }, z3.h\n"
+ ".inst 0xc1701768 // sdot za.s[x8, 0], { z27.h-z30.h }, z0.h\n"
+ "ld1h { z0.h }, p2/Z, [x20, #2, MUL VL]\n"
+ ".inst 0xc1701769 // sdot za.s[x8, 1], { z27.h-z30.h }, z0.h\n"
+ "19:" // Padded: 0 priming loads
+ "cmp x7, #0x2\n"
+ ".inst 0xa1402be3 // ld1h { z3.h, z11.h }, pn10.b/Z, [SP]\n"
+ "ld1h { z2.h }, p2/Z, [SP, #2, MUL VL]\n"
+ "blt 22f\n"
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z23.s }, p0/Z, [x17]\n"
+ "add z23.h, p0/M, z23.h, z7.h\n"
+ "add x20, x17, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z17.s }, p0/Z, [x20]\n"
+ "add z17.h, p0/M, z17.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z24.s }, p0/Z, [x20]\n"
+ "add z24.h, p0/M, z24.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z16.s }, p0/Z, [x20]\n"
+ "add z16.h, p0/M, z16.h, z7.h\n"
+ "mov x12, #0x4\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z23.h, z23.h, z17.h\n"
+ "trn1 z24.h, z24.h, z16.h\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z25.s }, p0/Z, [x20]\n"
+ "add z25.h, p0/M, z25.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z19.s }, p0/Z, [x20]\n"
+ "add z19.h, p0/M, z19.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z26.s }, p0/Z, [x20]\n"
+ "add z26.h, p0/M, z26.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z18.s }, p0/Z, [x20]\n"
+ "mov x12, #0x8\n"
+ "add z18.h, p0/M, z18.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z27.s }, p0/Z, [x20]\n"
+ "add z27.h, p0/M, z27.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z17.s }, p0/Z, [x20]\n"
+ "add z17.h, p0/M, z17.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z16.s }, p0/Z, [x20]\n"
+ "add z16.h, p0/M, z16.h, z7.h\n"
+ "sub x7, x7, #0x2\n"
+ "sub x16, x16, #0x1\n"
+ "trn1 z25.h, z25.h, z19.h\n"
+ "trn1 z26.h, z26.h, z18.h\n"
+ "lsr x20, x7, #0x1\n"
+ "cmp x20, x16\n"
+ "trn1 z27.h, z27.h, z17.h\n"
+ "mov z28.d, z16.d\n"
+ "csel x25, x20, x16, LT\n"
+ "add x17, x17, %x[ld_in_col]\n"
+ "and x7, x7, #0x1\n"
+ "sub x16, x16, x25\n"
+ "cbz x25, 21f\n"
+ "20:" // Padded: Main loop
+ ".inst 0xc17316e8 // sdot za.s[x8, 0], { z23.h-z26.h }, z3.h\n"
+ "addvl x24, SP, #6\n"
+ "addvl x23, SP, #12\n"
+ ".inst 0xc17b1708 // sdot za.s[x8, 0], { z24.h-z27.h }, z11.h\n"
+ ".inst 0xa1402b00 // ld1h { z0.h, z8.h }, pn10.b/Z, [x24]\n"
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ ".inst 0xc17016e9 // sdot za.s[x8, 1], { z23.h-z26.h }, z0.h\n"
+ "add x20, x17, %x[ld_in_row]\n"
+ "addvl x22, SP, #3\n"
+ ".inst 0xc1781709 // sdot za.s[x8, 1], { z24.h-z27.h }, z8.h\n"
+ ".inst 0xa1402ae3 // ld1h { z3.h, z11.h }, pn10.b/Z, [x23]\n"
+ "addvl x21, SP, #9\n"
+ "subs x25, x25, #0x1\n"
+ ".inst 0xc17316ea // sdot za.s[x8, 2], { z23.h-z26.h }, z3.h\n"
+ "ld1b { z23.s }, p0/Z, [x17]\n"
+ "add z23.h, p0/M, z23.h, z7.h\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z16.s }, p0/Z, [x20]\n"
+ "add z16.h, p0/M, z16.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ ".inst 0xc17b170a // sdot za.s[x8, 2], { z24.h-z27.h }, z11.h\n"
+ "ld1b { z24.s }, p0/Z, [x20]\n"
+ "add z24.h, p0/M, z24.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0xc1721728 // sdot za.s[x8, 0], { z25.h-z28.h }, z2.h\n"
+ "ld1h { z0.h }, p2/Z, [x24, #2, MUL VL]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "mov x12, #0x4\n"
+ "ld1b { z1.s }, p0/Z, [x20]\n"
+ ".inst 0xc1701729 // sdot za.s[x8, 1], { z25.h-z28.h }, z0.h\n"
+ "add z1.h, p0/M, z1.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "ld1h { z3.h }, p2/Z, [x23, #2, MUL VL]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ ".inst 0xc173172a // sdot za.s[x8, 2], { z25.h-z28.h }, z3.h\n"
+ "trn1 z23.h, z23.h, z16.h\n"
+ "ld1b { z25.s }, p0/Z, [x20]\n"
+ "add z25.h, p0/M, z25.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z3.s }, p0/Z, [x20]\n"
+ "add z3.h, p0/M, z3.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z26.s }, p0/Z, [x20]\n"
+ "add z26.h, p0/M, z26.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z30.s }, p0/Z, [x20]\n"
+ "mov x12, #0x8\n"
+ "add z30.h, p0/M, z30.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z27.s }, p0/Z, [x20]\n"
+ "add z27.h, p0/M, z27.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z29.s }, p0/Z, [x20]\n"
+ "add z29.h, p0/M, z29.h, z7.h\n"
+ "trn1 z24.h, z24.h, z1.h\n"
+ "trn1 z25.h, z25.h, z3.h\n"
+ "trn1 z26.h, z26.h, z30.h\n"
+ ".inst 0xa0402ac2 // ld1h { z2.h-z3.h }, pn10.b/Z, [x22]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0xc0060c10 // mova { z16.d-z19.d }, za.d[x8, #0]\n"
+ "add x8, x8, #0x1\n"
+ "trn1 z27.h, z27.h, z29.h\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ ".inst 0xc17216e8 // sdot za.s[x8, 0], { z23.h-z26.h }, z2.h\n"
+ "ld1b { z20.s }, p0/Z, [x20]\n"
+ "mov x12, #0x0\n"
+ "add z20.h, p0/M, z20.h, z7.h\n"
+ ".inst 0xc1731708 // sdot za.s[x8, 0], { z24.h-z27.h }, z3.h\n"
+ ".inst 0xa0402aa2 // ld1h { z2.h-z3.h }, pn10.b/Z, [x21]\n"
+ "add x17, x17, %x[ld_in_col]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ ".inst 0xc17216e9 // sdot za.s[x8, 1], { z23.h-z26.h }, z2.h\n"
+ "ld1b { z23.s }, p0/Z, [x17]\n"
+ "add z23.h, p0/M, z23.h, z7.h\n"
+ "add x20, x17, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z8.s }, p0/Z, [x20]\n"
+ "add z8.h, p0/M, z8.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ ".inst 0xc1731709 // sdot za.s[x8, 1], { z24.h-z27.h }, z3.h\n"
+ "ld1b { z24.s }, p0/Z, [x20]\n"
+ "mov z28.d, z20.d\n"
+ "ld1h { z1.h }, p2/Z, [x22, #2, MUL VL]\n"
+ "add z24.h, p0/M, z24.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z22.s }, p0/Z, [x20]\n"
+ ".inst 0xc1711728 // sdot za.s[x8, 0], { z25.h-z28.h }, z1.h\n"
+ "mov x12, #0x4\n"
+ "add z22.h, p0/M, z22.h, z7.h\n"
+ "ld1h { z1.h }, p2/Z, [x21, #2, MUL VL]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ ".inst 0xc1711729 // sdot za.s[x8, 1], { z25.h-z28.h }, z1.h\n"
+ "ld1b { z25.s }, p0/Z, [x20]\n"
+ "add z25.h, p0/M, z25.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z28.s }, p0/Z, [x20]\n"
+ "add z28.h, p0/M, z28.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z26.s }, p0/Z, [x20]\n"
+ "add z26.h, p0/M, z26.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z20.s }, p0/Z, [x20]\n"
+ "mov x12, #0x8\n"
+ "add z20.h, p0/M, z20.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z27.s }, p0/Z, [x20]\n"
+ ".inst 0xc1a6ac10 // sqdmulh { z16.s-z19.s }, { z16.s-z19.s }, z6.s\n"
+ "add z27.h, p0/M, z27.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z31.s }, p0/Z, [x20]\n"
+ ".inst 0xc1a4aa30 // srshl { z16.s-z19.s }, { z16.s-z19.s }, z4.s\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0xc0040d84 // mova za.d[x8, #4], { z12.d-z15.d }\n"
+ "add z31.h, p0/M, z31.h, z7.h\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z1.s }, p0/Z, [x20]\n"
+ "add z1.h, p0/M, z1.h, z7.h\n"
+ ".inst 0xc1aaab10 // add { z16.s-z19.s }, { z16.s-z19.s }, z10.s\n"
+ ".inst 0xa1402be3 // ld1h { z3.h, z11.h }, pn10.b/Z, [SP]\n"
+ "add x17, x17, %x[ld_in_col]\n"
+ ".inst 0xc1b5ccb0 // sclamp { z16.s-z19.s }, z5.s, z21.s\n"
+ "st1b { z16.s }, p1, [x15]\n"
+ "add x15, x15, x13\n"
+ "ld1h { z2.h }, p2/Z, [SP, #2, MUL VL]\n"
+ "st1b { z17.s }, p1, [x14]\n"
+ "add x14, x14, x11\n"
+ "trn1 z23.h, z23.h, z8.h\n"
+ "trn1 z24.h, z24.h, z22.h\n"
+ "st1b { z18.s }, p1, [x10]\n"
+ "add x10, x10, x28\n"
+ "trn1 z25.h, z25.h, z28.h\n"
+ "trn1 z26.h, z26.h, z20.h\n"
+ "st1b { z19.s }, p1, [x9]\n"
+ "add x9, x9, x27\n"
+ "trn1 z27.h, z27.h, z31.h\n"
+ "mov z28.d, z1.d\n"
+ "bgt 20b\n"
+ "21:" // Main loop tail
+ ".inst 0xc17316e8 // sdot za.s[x8, 0], { z23.h-z26.h }, z3.h\n"
+ "addvl x24, SP, #6\n"
+ "addvl x23, SP, #12\n"
+ ".inst 0xc17b1708 // sdot za.s[x8, 0], { z24.h-z27.h }, z11.h\n"
+ ".inst 0xa0402b08 // ld1h { z8.h-z9.h }, pn10.b/Z, [x24]\n"
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ ".inst 0xc17816e9 // sdot za.s[x8, 1], { z23.h-z26.h }, z8.h\n"
+ "add x22, x17, %x[ld_in_row]\n"
+ "addvl x21, SP, #3\n"
+ ".inst 0xc1791709 // sdot za.s[x8, 1], { z24.h-z27.h }, z9.h\n"
+ ".inst 0xa1402ae3 // ld1h { z3.h, z11.h }, pn10.b/Z, [x23]\n"
+ "addvl x20, SP, #9\n"
+ ".inst 0xc17316ea // sdot za.s[x8, 2], { z23.h-z26.h }, z3.h\n"
+ "ld1b { z29.s }, p0/Z, [x17]\n"
+ "add z29.h, p0/M, z29.h, z7.h\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z8.s }, p0/Z, [x22]\n"
+ "add z8.h, p0/M, z8.h, z7.h\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ ".inst 0xc17b170a // sdot za.s[x8, 2], { z24.h-z27.h }, z11.h\n"
+ "ld1b { z30.s }, p0/Z, [x22]\n"
+ "add z30.h, p0/M, z30.h, z7.h\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ ".inst 0xc1721728 // sdot za.s[x8, 0], { z25.h-z28.h }, z2.h\n"
+ "ld1h { z0.h }, p2/Z, [x24, #2, MUL VL]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "mov x12, #0x4\n"
+ "ld1b { z20.s }, p0/Z, [x22]\n"
+ ".inst 0xc1701729 // sdot za.s[x8, 1], { z25.h-z28.h }, z0.h\n"
+ "add z20.h, p0/M, z20.h, z7.h\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "ld1h { z2.h }, p2/Z, [x23, #2, MUL VL]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ ".inst 0xc172172a // sdot za.s[x8, 2], { z25.h-z28.h }, z2.h\n"
+ "trn1 z29.h, z29.h, z8.h\n"
+ "ld1b { z31.s }, p0/Z, [x22]\n"
+ "add z31.h, p0/M, z31.h, z7.h\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z25.s }, p0/Z, [x22]\n"
+ "add z25.h, p0/M, z25.h, z7.h\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z0.s }, p0/Z, [x22]\n"
+ "add z0.h, p0/M, z0.h, z7.h\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z17.s }, p0/Z, [x22]\n"
+ "mov x12, #0x8\n"
+ "add z17.h, p0/M, z17.h, z7.h\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z1.s }, p0/Z, [x22]\n"
+ "add z1.h, p0/M, z1.h, z7.h\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z28.s }, p0/Z, [x22]\n"
+ "add z28.h, p0/M, z28.h, z7.h\n"
+ "trn1 z30.h, z30.h, z20.h\n"
+ "trn1 z31.h, z31.h, z25.h\n"
+ "trn1 z0.h, z0.h, z17.h\n"
+ ".inst 0xa1402aa3 // ld1h { z3.h, z11.h }, pn10.b/Z, [x21]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ ".inst 0xc0060c18 // mova { z24.d-z27.d }, za.d[x8, #0]\n"
+ "add x8, x8, #0x1\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "trn1 z1.h, z1.h, z28.h\n"
+ ".inst 0xc17317a8 // sdot za.s[x8, 0], { z29.h-z0.h }, z3.h\n"
+ "ld1b { z22.s }, p0/Z, [x22]\n"
+ ".inst 0xc1a6ac18 // sqdmulh { z24.s-z27.s }, { z24.s-z27.s }, z6.s\n"
+ "add z22.h, p0/M, z22.h, z7.h\n"
+ ".inst 0xc17b17c8 // sdot za.s[x8, 0], { z30.h-z1.h }, z11.h\n"
+ ".inst 0xa1402a83 // ld1h { z3.h, z11.h }, pn10.b/Z, [x20]\n"
+ "add x17, x17, %x[ld_in_col]\n"
+ ".inst 0xc1a4aa38 // srshl { z24.s-z27.s }, { z24.s-z27.s }, z4.s\n"
+ ".inst 0xc17317a9 // sdot za.s[x8, 1], { z29.h-z0.h }, z3.h\n"
+ "mov z2.d, z22.d\n"
+ "ld1h { z9.h }, p2/Z, [x21, #2, MUL VL]\n"
+ ".inst 0xc17b17c9 // sdot za.s[x8, 1], { z30.h-z1.h }, z11.h\n"
+ ".inst 0xc1aaab18 // add { z24.s-z27.s }, { z24.s-z27.s }, z10.s\n"
+ ".inst 0xc17917e8 // sdot za.s[x8, 0], { z31.h-z2.h }, z9.h\n"
+ "ld1h { z8.h }, p2/Z, [x20, #2, MUL VL]\n"
+ ".inst 0xc1b5ccb8 // sclamp { z24.s-z27.s }, z5.s, z21.s\n"
+ "st1b { z24.s }, p1, [x15]\n"
+ "add x15, x15, x13\n"
+ "st1b { z25.s }, p1, [x14]\n"
+ "add x14, x14, x11\n"
+ ".inst 0xc0040d84 // mova za.d[x8, #4], { z12.d-z15.d }\n"
+ ".inst 0xa1402be3 // ld1h { z3.h, z11.h }, pn10.b/Z, [SP]\n"
+ "st1b { z26.s }, p1, [x10]\n"
+ "add x10, x10, x28\n"
+ ".inst 0xc17817e9 // sdot za.s[x8, 1], { z31.h-z2.h }, z8.h\n"
+ "ld1h { z2.h }, p2/Z, [SP, #2, MUL VL]\n"
+ "st1b { z27.s }, p1, [x9]\n"
+ "add x9, x9, x27\n"
+ "22:" // Main loop skip tail
+ "cbz x7, 23f\n" // Skip remainder inputs
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z24.s }, p0/Z, [x17]\n"
+ "add z24.h, p0/M, z24.h, z7.h\n"
+ "add x20, x17, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z17.s }, p0/Z, [x20]\n"
+ "add z17.h, p0/M, z17.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z25.s }, p0/Z, [x20]\n"
+ "add z25.h, p0/M, z25.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z16.s }, p0/Z, [x20]\n"
+ "add z16.h, p0/M, z16.h, z7.h\n"
+ "mov x12, #0x4\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z24.h, z24.h, z17.h\n"
+ "trn1 z25.h, z25.h, z16.h\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z26.s }, p0/Z, [x20]\n"
+ "add z26.h, p0/M, z26.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z17.s }, p0/Z, [x20]\n"
+ "add z17.h, p0/M, z17.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z27.s }, p0/Z, [x20]\n"
+ "add z27.h, p0/M, z27.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z16.s }, p0/Z, [x20]\n"
+ "mov x12, #0x8\n"
+ "add z16.h, p0/M, z16.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z28.s }, p0/Z, [x20]\n"
+ "add z28.h, p0/M, z28.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z31.s }, p0/Z, [x20]\n"
+ "add z31.h, p0/M, z31.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "trn1 z26.h, z26.h, z17.h\n"
+ "trn1 z27.h, z27.h, z16.h\n"
+ "ld1b { z0.s }, p0/Z, [x20]\n"
+ "add z0.h, p0/M, z0.h, z7.h\n"
+ "trn1 z28.h, z28.h, z31.h\n"
+ "addvl x21, SP, #6\n"
+ ".inst 0xc1731708 // sdot za.s[x8, 0], { z24.h-z27.h }, z3.h\n"
+ "mov z29.d, z0.d\n"
+ "addvl x20, SP, #12\n"
+ "sub x16, x16, #0x1\n"
+ ".inst 0xc17b1728 // sdot za.s[x8, 0], { z25.h-z28.h }, z11.h\n"
+ ".inst 0xa0402aa8 // ld1h { z8.h-z9.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc1721748 // sdot za.s[x8, 0], { z26.h-z29.h }, z2.h\n"
+ "ld1h { z2.h }, p2/Z, [x21, #2, MUL VL]\n"
+ ".inst 0xc1781709 // sdot za.s[x8, 1], { z24.h-z27.h }, z8.h\n"
+ ".inst 0xc0060c10 // mova { z16.d-z19.d }, za.d[x8, #0]\n"
+ ".inst 0xc1a6ac10 // sqdmulh { z16.s-z19.s }, { z16.s-z19.s }, z6.s\n"
+ ".inst 0xc1791729 // sdot za.s[x8, 1], { z25.h-z28.h }, z9.h\n"
+ ".inst 0xa1402a81 // ld1h { z1.h, z9.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc1a4aa30 // srshl { z16.s-z19.s }, { z16.s-z19.s }, z4.s\n"
+ ".inst 0xc171170a // sdot za.s[x8, 2], { z24.h-z27.h }, z1.h\n"
+ ".inst 0xc1aaab10 // add { z16.s-z19.s }, { z16.s-z19.s }, z10.s\n"
+ ".inst 0xc179172a // sdot za.s[x8, 2], { z25.h-z28.h }, z9.h\n"
+ ".inst 0xc1b5ccb0 // sclamp { z16.s-z19.s }, z5.s, z21.s\n"
+ "st1b { z16.s }, p1, [x15]\n"
+ "add x15, x15, x13\n"
+ ".inst 0xc1721749 // sdot za.s[x8, 1], { z26.h-z29.h }, z2.h\n"
+ "ld1h { z3.h }, p2/Z, [x20, #2, MUL VL]\n"
+ "st1b { z17.s }, p1, [x14]\n"
+ "add x14, x14, x11\n"
+ ".inst 0xc173174a // sdot za.s[x8, 2], { z26.h-z29.h }, z3.h\n"
+ "add x8, x8, #0x1\n"
+ "st1b { z18.s }, p1, [x10]\n"
+ "add x10, x10, x28\n"
+ "st1b { z19.s }, p1, [x9]\n"
+ "add x9, x9, x27\n"
+ ".inst 0xc0040d84 // mova za.d[x8, #4], { z12.d-z15.d }\n"
+ "23:" // Tail input: End
+ "cbz x16, 25f\n"
+ "24:" // Right padding loop
+ ".inst 0xc0060c1c // mova { z28.d-z31.d }, za.d[x8, #0]\n"
+ ".inst 0xc1a6ac1c // sqdmulh { z28.s-z31.s }, { z28.s-z31.s }, z6.s\n"
+ "add x8, x8, #0x1\n"
+ ".inst 0xc1a4aa3c // srshl { z28.s-z31.s }, { z28.s-z31.s }, z4.s\n"
+ "subs x16, x16, #0x1\n"
+ ".inst 0xc0040d84 // mova za.d[x8, #4], { z12.d-z15.d }\n"
+ ".inst 0xc1aaab1c // add { z28.s-z31.s }, { z28.s-z31.s }, z10.s\n"
+ ".inst 0xc1b5ccbc // sclamp { z28.s-z31.s }, z5.s, z21.s\n"
+ "st1b { z28.s }, p1, [x15]\n"
+ "add x15, x15, x13\n"
+ "st1b { z29.s }, p1, [x14]\n"
+ "add x14, x14, x11\n"
+ "st1b { z30.s }, p1, [x10]\n"
+ "add x10, x10, x28\n"
+ "st1b { z31.s }, p1, [x9]\n"
+ "add x9, x9, x27\n"
+ "bgt 24b\n"
+ "25:" // End
+ "ldr x20, [%x[args], %[offsetof_Args_weights]]\n"
+ "incw x20, ALL, MUL #16\n"
+ "incw x20, ALL, MUL #9\n"
+ "str x20, [%x[args], %[offsetof_Args_weights]]\n"
+ "ldr x21, [%x[args], %[offsetof_Args_ld_in_vl]]\n"
+ "incw x6\n"
+ "whilelt p1.s, x6, x5\n"
+ "ldr x20, [%x[args], %[offsetof_Args_inptr]]\n"
+ "add x20, x20, x21\n"
+ "str x20, [%x[args], %[offsetof_Args_inptr]]\n"
+ "ldr x25, [%x[args], %[offsetof_Args_outptrs]]\n"
+ "ldr x24, [%x[args], %[offsetof_Args_ld_out_vls]]\n"
+ "ldp x23, x22, [x25, #0x0]\n"
+ "ldp x21, x20, [x24, #0x0]\n"
+ "add x23, x23, x21\n"
+ "add x22, x22, x20\n"
+ "stp x23, x22, [x25, #0x0]\n"
+ "ldp x23, x22, [x25, #0x10]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x23, x23, x21\n"
+ "add x22, x22, x20\n"
+ "stp x23, x22, [x25, #0x10]\n"
+ "b.any 1b\n"
+ "addvl SP, SP, #15\n"
+ ".inst 0xd503467f // SMSTOP\n"
+ :
+ : [args] "r" (&args), [ld_in_col] "r" (ld_in_col), [ld_in_row] "r" (ld_in_row), [offsetof_Args_current_channel] "I" (offsetof(Args, current_channel)), [offsetof_Args_inptr] "I" (offsetof(Args, inptr)), [offsetof_Args_input_cols] "I" (offsetof(Args, input_cols)), [offsetof_Args_ld_in_vl] "I" (offsetof(Args, ld_in_vl)), [offsetof_Args_ld_out_cols] "I" (offsetof(Args, ld_out_cols)), [offsetof_Args_ld_out_vls] "I" (offsetof(Args, ld_out_vls)), [offsetof_Args_n_channels] "I" (offsetof(Args, n_channels)), [offsetof_Args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_Args_output_cols] "I" (offsetof(Args, output_cols)), [offsetof_Args_pad_bottom] "I" (offsetof(Args, pad_bottom)), [offsetof_Args_pad_left] "I" (offsetof(Args, pad_left)), [offsetof_Args_pad_top] "I" (offsetof(Args, pad_top)), [offsetof_Args_weights] "I" (offsetof(Args, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_bias] "I" (offsetof(arm_gemm::Requantize32, bias)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [offsetof_Requantize32_per_channel_muls] "I" (offsetof(arm_gemm::Requantize32, per_channel_muls)), [offsetof_Requantize32_per_channel_right_shifts] "I" (offsetof(arm_gemm::Requantize32, per_channel_right_shifts)), [offsetof_Requantize32_per_layer_mul] "I" (offsetof(arm_gemm::Requantize32, per_layer_mul)), [offsetof_Requantize32_per_layer_right_shift] "I" (offsetof(arm_gemm::Requantize32, per_layer_right_shift)), [qp] "r" (&qp)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME2)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_3x3_s1_4rows_dot_za.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_3x3_s1_4rows_dot_za.hpp
new file mode 100644
index 0000000000..a4345097b5
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_3x3_s1_4rows_dot_za.hpp
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_conv/depthwise/depthwise_planar.hpp"
+
+namespace arm_conv {
+namespace depthwise {
+
+// Planar depthwise convolution kernel: 3x3 window, stride 1, producing four
+// output rows per pass, for u8 input with s8 weights and requantized u8
+// output. Implemented as SME2 inline assembly in the matching generic.cpp.
+void sme2_u8s8u8q_planar_3x3_s1_4rows_dot_za_impl(
+  const uint8_t *inptr,            // base pointer of the input tensor slice
+  size_t ld_in_row,                // byte stride between consecutive input rows
+  size_t ld_in_col,                // byte stride between consecutive input columns
+  size_t ld_in_vl,                 // byte stride added to inptr per channel-block iteration
+  unsigned int pad_top,            // rows of implicit zero padding above the input
+  unsigned int valid_input_rows,   // input rows actually readable (used to derive bottom padding)
+  unsigned int pad_left,           // columns of implicit zero padding left of the input
+  unsigned int valid_input_cols,   // input columns actually readable
+  const int8_t *weights,           // packed signed 8-bit filter weights
+  uint8_t **outptrs,               // array of per-output-row destination pointers
+  const size_t *outlds,            // per-output-row column strides
+  const size_t *outvllds,          // per-output-row strides applied per channel-block iteration
+  unsigned int output_cols,        // number of output columns to produce
+  unsigned int start_channel,      // index of the first channel handled by this call
+  unsigned int valid_channels,     // number of channels to process
+  const arm_gemm::Requantize32 &qp // requantization parameters (offsets, multipliers, shifts, clamps)
+);
+
+// Strategy class exposing the assembly kernel above to the planar depthwise
+// framework: describes the kernel's compile-time shape (3x3 window, stride 1,
+// 4 output rows per pass) and hands back the implementation function.
+// Template arguments: uint8_t input/output element type, int8_t weight type.
+class sme2_u8s8u8q_planar_3x3_s1_4rows_dot_za : public PlanarStrategy<uint8_t, int8_t>
+{
+  using Parent = PlanarStrategy<uint8_t, int8_t>;
+
+  public:
+  using return_type = uint8_t;
+  constexpr static auto output_rows = 4u;  // output rows computed per kernel invocation
+  constexpr static auto kernel_rows = 3u, kernel_cols = 3u;  // filter window size
+  constexpr static auto stride_rows = 1u, stride_cols = 1u;  // convolution stride
+  constexpr static auto vl_type = arm_gemm::VLType::SME;  // vector-length-agnostic SME kernel
+
+  // Forward the compile-time shape parameters to the generic planar strategy;
+  // the CPUInfo argument is unused here (kernel selection happens elsewhere).
+  sme2_u8s8u8q_planar_3x3_s1_4rows_dot_za(const CPUInfo *)
+  : Parent(kernel_rows, kernel_cols, stride_rows, stride_cols, output_rows, vl_type)
+  {
+  }
+
+  // Return the SME2 assembly routine that implements this strategy.
+  typename Parent::KernelType get_kernel(void) const override
+  {
+    return sme2_u8s8u8q_planar_3x3_s1_4rows_dot_za_impl;
+  }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_3x3_s1_4rows_dot_za/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_3x3_s1_4rows_dot_za/generic.cpp
new file mode 100644
index 0000000000..612beb342a
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_3x3_s1_4rows_dot_za/generic.cpp
@@ -0,0 +1,664 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if defined(ARM_COMPUTE_ENABLE_SME2)
+
+#include <algorithm>
+#include <cstddef>
+#include "arm_gemm.hpp"
+
+using arm_gemm::Requantize32;
+
+namespace arm_conv {
+namespace depthwise {
+
+void sme2_u8s8u8q_planar_3x3_s1_4rows_dot_za_impl(
+ const uint8_t *inptr,
+ size_t ld_in_row,
+ size_t ld_in_col,
+ size_t ld_in_vl,
+ unsigned int pad_top,
+ unsigned int valid_input_rows,
+ unsigned int pad_left,
+ unsigned int valid_input_cols,
+ const int8_t *weights,
+ uint8_t **outptrs,
+ const size_t *outlds,
+ const size_t *outvllds,
+ unsigned int output_cols,
+ unsigned int start_channel,
+ unsigned int valid_channels,
+ const arm_gemm::Requantize32 &qp
+)
+{
+ struct Args
+ {
+ const uint8_t *inptr;
+ size_t ld_in_vl;
+ long unsigned int pad_top, pad_bottom, pad_left;
+ const int8_t *weights;
+ long unsigned int input_cols, output_cols;
+ uint8_t **outptrs;
+ const size_t *ld_out_cols;
+ const size_t *ld_out_vls;
+ long unsigned int current_channel, n_channels;
+ };
+
+ Args args = { inptr, ld_in_vl, pad_top, 6u - std::min(6u, pad_top + valid_input_rows), pad_left, weights, valid_input_cols, output_cols, outptrs, outlds, outvllds, start_channel, valid_channels };
+
+ __asm__ __volatile__(
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "ldr x6, [%x[args], %[offsetof_Args_pad_bottom]]\n"
+ "ptrue p2.b\n"
+ "mov x20, #0x6\n"
+ "ldr x7, [%x[args], %[offsetof_Args_pad_top]]\n"
+ "ld1rh { z21.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_a_offset]]\n"
+ "sub x20, x20, x6\n"
+ ".inst 0x25207812 // ptrue pn10.b\n"
+ "ldr x17, [%x[args], %[offsetof_Args_n_channels]]\n"
+ "whilelt p1.s, XZR, x17\n"
+ "whilelt p9.s, XZR, x20\n"
+ "ld1rw { z15.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_c_offset]]\n"
+ "whilelt p8.s, XZR, x7\n"
+ "addvl SP, SP, #-12\n"
+ "ldr x16, [%x[args], %[offsetof_Args_current_channel]]\n"
+ "neg z21.h, p2/M, z21.h\n"
+ "eor p8.b, p2/Z, p8.b, p9.b\n"
+ "ld1rw { z14.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_per_layer_mul]]\n"
+ "ld1rw { z12.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_per_layer_right_shift]]\n"
+ "ld1rw { z29.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_minval]]\n"
+ "ld1rw { z28.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_maxval]]\n"
+ "1:" // Channel loop
+ "ldr x20, [%x[qp], %[offsetof_Requantize32_bias]]\n"
+ "mov z30.s, #0x0\n"
+ "cbz x20, 2f\n"
+ "ld1w { z30.s }, p1/Z, [x20, x16, LSL #2]\n"
+ "2:" // Load bias: Done
+ "ldr x22, [%x[args], %[offsetof_Args_weights]]\n"
+ "mov x20, x22\n"
+ "ld1sb { z10.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #3\n"
+ "ld1rh { z31.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_b_offset]]\n"
+ "mov z7.h, #0x0\n"
+ "sub z10.h, z10.h, z31.h\n"
+ "incw x22\n"
+ "ld1sb { z16.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #3\n"
+ "sub z16.h, z16.h, z31.h\n"
+ "trn1 z20.h, z7.h, z10.h\n"
+ "ld1sb { z11.s }, p2/Z, [x20]\n"
+ "sub z11.h, z11.h, z31.h\n"
+ "mov x20, x22\n"
+ "trn1 z19.h, z10.h, z16.h\n"
+ "ld1sb { z24.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #3\n"
+ "trn1 z26.h, z16.h, z11.h\n"
+ "trn1 z13.h, z11.h, z7.h\n"
+ "ld1sb { z11.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #3\n"
+ "sub z24.h, z24.h, z31.h\n"
+ "sub z11.h, z11.h, z31.h\n"
+ "ld1sb { z2.s }, p2/Z, [x20]\n"
+ "sub z2.h, z2.h, z31.h\n"
+ "addvl x21, SP, #12\n"
+ "incw x22\n"
+ "addvl x21, x21, #-4\n"
+ "mov x20, x22\n"
+ "st1h { z20.h }, p2, [x21]\n"
+ "trn1 z22.h, z7.h, z24.h\n"
+ "st1h { z19.h }, p2, [x21, #1, MUL VL]\n"
+ "trn1 z1.h, z24.h, z11.h\n"
+ "ld1sb { z16.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #3\n"
+ "st1h { z26.h }, p2, [x21, #2, MUL VL]\n"
+ "trn1 z3.h, z11.h, z2.h\n"
+ "ld1sb { z0.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #3\n"
+ "st1h { z13.h }, p2, [x21, #3, MUL VL]\n"
+ "trn1 z25.h, z2.h, z7.h\n"
+ "ld1sb { z4.s }, p2/Z, [x20]\n"
+ "ldr x20, [%x[qp], %[offsetof_Requantize32_per_channel_muls]]\n"
+ "sub z16.h, z16.h, z31.h\n"
+ "sub z0.h, z0.h, z31.h\n"
+ "addvl x21, x21, #-4\n"
+ "st1h { z22.h }, p2, [x21]\n"
+ "sub z4.h, z4.h, z31.h\n"
+ "st1h { z1.h }, p2, [x21, #1, MUL VL]\n"
+ "mov z31.d, z30.d\n"
+ "st1h { z3.h }, p2, [x21, #2, MUL VL]\n"
+ "trn1 z24.h, z7.h, z16.h\n"
+ "trn1 z18.h, z16.h, z0.h\n"
+ "st1h { z25.h }, p2, [x21, #3, MUL VL]\n"
+ "addvl x21, x21, #-4\n"
+ "trn1 z0.h, z0.h, z4.h\n"
+ "trn1 z1.h, z4.h, z7.h\n"
+ "st1h { z24.h }, p2, [x21]\n"
+ "st1h { z18.h }, p2, [x21, #1, MUL VL]\n"
+ "st1h { z0.h }, p2, [x21, #2, MUL VL]\n"
+ "st1h { z1.h }, p2, [x21, #3, MUL VL]\n"
+ "cbz x20, 3f\n"
+ "ld1w { z14.s }, p1/Z, [x20, x16, LSL #2]\n"
+ "3:" // Load mul: End
+ "ldr x20, [%x[qp], %[offsetof_Requantize32_per_channel_right_shifts]]\n"
+ "cbz x20, 4f\n"
+ "ld1w { z12.s }, p1/Z, [x20, x16, LSL #2]\n"
+ "4:" // Load right_shift: End
+ "ldr x15, [%x[args], %[offsetof_Args_input_cols]]\n"
+ "sub x20, x15, #0x1\n"
+ "orr x23, x20, %x[ld_in_col], LSL #16\n"
+ "ldr x14, [%x[args], %[offsetof_Args_inptr]]\n"
+ "orr x23, x17, x23, LSL #22\n"
+ "mov x22, #0x6\n"
+ "add x21, x7, x6\n"
+ "lsl x20, %x[ld_in_row], #0x0\n"
+ "ldr x13, [%x[args], %[offsetof_Args_output_cols]]\n"
+ "mov x8, #0x0\n"
+ "lsl x23, x23, #0x0\n"
+ "sub x22, x22, x21\n"
+ "madd x20, x20, x7, x14\n"
+ "5:" // Issue prefetches
+ "subs x22, x22, #0x1\n"
+ ".inst 0xf8b74a9c // rprfm pldstrm, x23, [x20]\n"
+ "add x20, x20, %x[ld_in_col]\n"
+ "bgt 5b\n"
+ "ldr x23, [%x[args], %[offsetof_Args_outptrs]]\n"
+ "lsl x20, %x[ld_in_row], #0x0\n"
+ "msub x14, x7, x20, x14\n"
+ ".inst 0xc0040bc0 // mova za.d[x8, #0], { z30.d-z31.d }\n"
+ "ldr x20, [%x[args], %[offsetof_Args_ld_out_cols]]\n"
+ ".inst 0xc0040bc1 // mova za.d[x8, #1], { z30.d-z31.d }\n"
+ "mov x22, #0x2\n"
+ "ldp x11, x10, [x23], #0x10\n"
+ ".inst 0xc0040bc2 // mova za.d[x8, #2], { z30.d-z31.d }\n"
+ "ldp x9, x28, [x20], #0x10\n"
+ ".inst 0xc0040bc3 // mova za.d[x8, #3], { z30.d-z31.d }\n"
+ "ldr x21, [%x[args], %[offsetof_Args_pad_left]]\n"
+ ".inst 0xc0040bc4 // mova za.d[x8, #4], { z30.d-z31.d }\n"
+ "ldp x27, x26, [x23], #0x10\n"
+ ".inst 0xc0040bc5 // mova za.d[x8, #5], { z30.d-z31.d }\n"
+ "ldp x25, x24, [x20], #0x10\n"
+ "cbz x21, 7f\n"
+ "cmp x21, x22\n"
+ "csel x20, x21, x22, LT\n"
+ "sub x21, x21, x20\n"
+ "sub x22, x22, x20\n"
+ "cbz x21, 7f\n"
+ ".inst 0xc0060818 // mova { z24.d-z25.d }, za.d[x8, #0]\n"
+ "sub x13, x13, x21\n"
+ ".inst 0xc006083a // mova { z26.d-z27.d }, za.d[x8, #1]\n"
+ ".inst 0xc1aeac18 // sqdmulh { z24.s-z27.s }, { z24.s-z27.s }, z14.s\n"
+ ".inst 0xc1acaa38 // srshl { z24.s-z27.s }, { z24.s-z27.s }, z12.s\n"
+ ".inst 0xc1afab18 // add { z24.s-z27.s }, { z24.s-z27.s }, z15.s\n"
+ ".inst 0xc1bccfb8 // sclamp { z24.s-z27.s }, z29.s, z28.s\n"
+ "6:" // Left padding
+ "subs x21, x21, #0x1\n"
+ "st1b { z24.s }, p1, [x11]\n"
+ "add x11, x11, x9\n"
+ "st1b { z26.s }, p1, [x10]\n"
+ "add x10, x10, x28\n"
+ "st1b { z25.s }, p1, [x27]\n"
+ "add x27, x27, x25\n"
+ "st1b { z27.s }, p1, [x26]\n"
+ "add x26, x26, x24\n"
+ "bgt 6b\n"
+ "7:" // Left padding: End
+ "adds XZR, x7, x6\n"
+ "bne 12f\n"
+ "cbz x22, 10f\n"
+ "cmp x22, #0x1\n"
+ "sub x15, x15, x22\n"
+ "beq 9f\n"
+ "8:" // Unpadded: 2 priming loads
+ "add x21, x14, %x[ld_in_row]\n"
+ "ld1b { z20.s }, p1/Z, [x14]\n"
+ "addvl x20, SP, #8\n"
+ "ld1b { z16.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z4.h, z20.h, z16.h\n"
+ "add z4.h, z4.h, z21.h\n"
+ "ld1b { z23.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "add x14, x14, %x[ld_in_col]\n"
+ "ld1b { z22.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z5.h, z23.h, z22.h\n"
+ "add z5.h, z5.h, z21.h\n"
+ "ld1b { z17.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "ld1b { z16.s }, p1/Z, [x21]\n"
+ "trn1 z6.h, z17.h, z16.h\n"
+ "add z6.h, z6.h, z21.h\n"
+ ".inst 0xa1402a83 // ld1h { z3.h, z11.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc16b1488 // sdot za.s[x8, 0], { z4.h-z5.h }, z11.h\n"
+ ".inst 0xc1631489 // sdot za.s[x8, 1], { z4.h-z5.h }, z3.h\n"
+ ".inst 0xa1412a80 // ld1h { z0.h, z8.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc16814a8 // sdot za.s[x8, 0], { z5.h-z6.h }, z8.h\n"
+ ".inst 0xc16014a9 // sdot za.s[x8, 1], { z5.h-z6.h }, z0.h\n"
+ "9:" // Unpadded: 1 priming loads
+ "add x22, x14, %x[ld_in_row]\n"
+ "ld1b { z25.s }, p1/Z, [x14]\n"
+ "addvl x21, SP, #4\n"
+ "ld1b { z6.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "trn1 z3.h, z25.h, z6.h\n"
+ "add z3.h, z3.h, z21.h\n"
+ "ld1b { z18.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "addvl x20, SP, #8\n"
+ "ld1b { z26.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "trn1 z4.h, z18.h, z26.h\n"
+ "add z4.h, z4.h, z21.h\n"
+ "ld1b { z2.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "add x14, x14, %x[ld_in_col]\n"
+ "ld1b { z5.s }, p1/Z, [x22]\n"
+ "trn1 z5.h, z2.h, z5.h\n"
+ "add z5.h, z5.h, z21.h\n"
+ ".inst 0xa0402aa0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc1611468 // sdot za.s[x8, 0], { z3.h-z4.h }, z1.h\n"
+ ".inst 0xc1601469 // sdot za.s[x8, 1], { z3.h-z4.h }, z0.h\n"
+ ".inst 0xa1402a82 // ld1h { z2.h, z10.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xa0412aa8 // ld1h { z8.h-z9.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc16a146a // sdot za.s[x8, 2], { z3.h-z4.h }, z10.h\n"
+ ".inst 0xc162146b // sdot za.s[x8, 3], { z3.h-z4.h }, z2.h\n"
+ ".inst 0xc1691488 // sdot za.s[x8, 0], { z4.h-z5.h }, z9.h\n"
+ ".inst 0xc1681489 // sdot za.s[x8, 1], { z4.h-z5.h }, z8.h\n"
+ ".inst 0xa1412a82 // ld1h { z2.h, z10.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc16a148a // sdot za.s[x8, 2], { z4.h-z5.h }, z10.h\n"
+ ".inst 0xc162148b // sdot za.s[x8, 3], { z4.h-z5.h }, z2.h\n"
+ "10:" // Unpadded: 0 priming loads
+ ".inst 0xa1402be5 // ld1h { z5.h, z13.h }, pn10.b/Z, [SP]\n"
+ ".inst 0xa0412bea // ld1h { z10.h-z11.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
+ "cbz x15, 18f\n"
+ "add x20, x14, %x[ld_in_row]\n"
+ "ld1b { z17.s }, p1/Z, [x14]\n"
+ "sub x15, x15, #0x1\n"
+ "ld1b { z9.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z6.h, z17.h, z9.h\n"
+ "sub x13, x13, #0x1\n"
+ "ld1b { z17.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "cmp x15, x13\n"
+ "add z6.h, z6.h, z21.h\n"
+ "ld1b { z7.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z7.h, z17.h, z7.h\n"
+ "csel x23, x15, x13, LT\n"
+ "ld1b { z17.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "add z7.h, z7.h, z21.h\n"
+ "add x14, x14, %x[ld_in_col]\n"
+ "ld1b { z1.s }, p1/Z, [x20]\n"
+ "trn1 z8.h, z17.h, z1.h\n"
+ "add z8.h, z8.h, z21.h\n"
+ "sub x13, x13, x23\n"
+ "cbz x23, 17f\n"
+ "11:" // Unpadded: Main loop
+ ".inst 0xc16d14c8 // sdot za.s[x8, 0], { z6.h-z7.h }, z13.h\n"
+ "addvl x22, SP, #4\n"
+ "addvl x21, SP, #8\n"
+ "ld1b { z2.s }, p1/Z, [x14]\n"
+ ".inst 0xc16514c9 // sdot za.s[x8, 1], { z6.h-z7.h }, z5.h\n"
+ ".inst 0xa1402ac5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x22]\n"
+ "add x20, x14, %x[ld_in_row]\n"
+ "subs x23, x23, #0x1\n"
+ ".inst 0xc16b14e8 // sdot za.s[x8, 0], { z7.h-z8.h }, z11.h\n"
+ "ld1b { z19.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "add x14, x14, %x[ld_in_col]\n"
+ ".inst 0xc16a14e9 // sdot za.s[x8, 1], { z7.h-z8.h }, z10.h\n"
+ ".inst 0xa1412ac3 // ld1h { z3.h, z11.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
+ ".inst 0xc0060818 // mova { z24.d-z25.d }, za.d[x8, #0]\n"
+ "ld1b { z23.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0xc006083a // mova { z26.d-z27.d }, za.d[x8, #1]\n"
+ ".inst 0xc1aeac18 // sqdmulh { z24.s-z27.s }, { z24.s-z27.s }, z14.s\n"
+ ".inst 0xc16d14ca // sdot za.s[x8, 2], { z6.h-z7.h }, z13.h\n"
+ "ld1b { z18.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0xc16514cb // sdot za.s[x8, 3], { z6.h-z7.h }, z5.h\n"
+ ".inst 0xa1402aa1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc1acaa38 // srshl { z24.s-z27.s }, { z24.s-z27.s }, z12.s\n"
+ ".inst 0xc16914cc // sdot za.s[x8, 4], { z6.h-z7.h }, z9.h\n"
+ "ld1b { z17.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0xc16114cd // sdot za.s[x8, 5], { z6.h-z7.h }, z1.h\n"
+ "ld1b { z16.s }, p1/Z, [x20]\n"
+ ".inst 0xc1afab18 // add { z24.s-z27.s }, { z24.s-z27.s }, z15.s\n"
+ ".inst 0xc16b14ea // sdot za.s[x8, 2], { z7.h-z8.h }, z11.h\n"
+ "trn1 z6.h, z2.h, z19.h\n"
+ ".inst 0xa1402be5 // ld1h { z5.h, z13.h }, pn10.b/Z, [SP]\n"
+ ".inst 0xc16314eb // sdot za.s[x8, 3], { z7.h-z8.h }, z3.h\n"
+ ".inst 0xa1412aa1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc1bccfb8 // sclamp { z24.s-z27.s }, z29.s, z28.s\n"
+ ".inst 0xc16914ec // sdot za.s[x8, 4], { z7.h-z8.h }, z9.h\n"
+ "st1b { z24.s }, p1, [x11]\n"
+ "add x11, x11, x9\n"
+ "add z6.h, z6.h, z21.h\n"
+ ".inst 0xc16114ed // sdot za.s[x8, 5], { z7.h-z8.h }, z1.h\n"
+ "trn1 z7.h, z23.h, z18.h\n"
+ "trn1 z8.h, z17.h, z16.h\n"
+ "add x8, x8, #0x2\n"
+ ".inst 0xa0412bea // ld1h { z10.h-z11.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
+ "st1b { z26.s }, p1, [x10]\n"
+ "add x10, x10, x28\n"
+ ".inst 0xc0040bc4 // mova za.d[x8, #4], { z30.d-z31.d }\n"
+ "st1b { z25.s }, p1, [x27]\n"
+ "add x27, x27, x25\n"
+ ".inst 0xc0040bc5 // mova za.d[x8, #5], { z30.d-z31.d }\n"
+ "add z7.h, z7.h, z21.h\n"
+ "st1b { z27.s }, p1, [x26]\n"
+ "add x26, x26, x24\n"
+ "add z8.h, z8.h, z21.h\n"
+ "bgt 11b\n"
+ "b 17f\n"
+ "12:" // Padded
+ "cbz x22, 15f\n"
+ "cmp x22, #0x1\n"
+ "sub x15, x15, x22\n"
+ "beq 14f\n"
+ "13:" // Padded: 2 priming loads
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z19.s }, p0/Z, [x14]\n"
+ "add z19.h, p0/M, z19.h, z21.h\n"
+ "add x20, x14, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z18.s }, p0/Z, [x20]\n"
+ "add z18.h, p0/M, z18.h, z21.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z17.s }, p0/Z, [x20]\n"
+ "add z17.h, p0/M, z17.h, z21.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z16.s }, p0/Z, [x20]\n"
+ "add z16.h, p0/M, z16.h, z21.h\n"
+ "mov x12, #0x4\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z7.h, z19.h, z18.h\n"
+ "trn1 z8.h, z17.h, z16.h\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z17.s }, p0/Z, [x20]\n"
+ "add z17.h, p0/M, z17.h, z21.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z16.s }, p0/Z, [x20]\n"
+ "addvl x20, SP, #8\n"
+ "add z16.h, p0/M, z16.h, z21.h\n"
+ ".inst 0xa1402a82 // ld1h { z2.h, z10.h }, pn10.b/Z, [x20]\n"
+ "trn1 z9.h, z17.h, z16.h\n"
+ ".inst 0xc16a14e8 // sdot za.s[x8, 0], { z7.h-z8.h }, z10.h\n"
+ "add x14, x14, %x[ld_in_col]\n"
+ ".inst 0xc16214e9 // sdot za.s[x8, 1], { z7.h-z8.h }, z2.h\n"
+ ".inst 0xa1412a85 // ld1h { z5.h, z13.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc16d1508 // sdot za.s[x8, 0], { z8.h-z9.h }, z13.h\n"
+ ".inst 0xc1651509 // sdot za.s[x8, 1], { z8.h-z9.h }, z5.h\n"
+ "14:" // Padded: 1 priming loads
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z19.s }, p0/Z, [x14]\n"
+ "add z19.h, p0/M, z19.h, z21.h\n"
+ "add x20, x14, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z18.s }, p0/Z, [x20]\n"
+ "add z18.h, p0/M, z18.h, z21.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z17.s }, p0/Z, [x20]\n"
+ "add z17.h, p0/M, z17.h, z21.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z16.s }, p0/Z, [x20]\n"
+ "add z16.h, p0/M, z16.h, z21.h\n"
+ "mov x12, #0x4\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z22.h, z19.h, z18.h\n"
+ "trn1 z23.h, z17.h, z16.h\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z17.s }, p0/Z, [x20]\n"
+ "add z17.h, p0/M, z17.h, z21.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z16.s }, p0/Z, [x20]\n"
+ "addvl x21, SP, #4\n"
+ "add z16.h, p0/M, z16.h, z21.h\n"
+ ".inst 0xa0402aa0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x21]\n"
+ "addvl x20, SP, #8\n"
+ "trn1 z24.h, z17.h, z16.h\n"
+ ".inst 0xc16116c8 // sdot za.s[x8, 0], { z22.h-z23.h }, z1.h\n"
+ ".inst 0xc16016c9 // sdot za.s[x8, 1], { z22.h-z23.h }, z0.h\n"
+ ".inst 0xa1402a85 // ld1h { z5.h, z13.h }, pn10.b/Z, [x20]\n"
+ "add x14, x14, %x[ld_in_col]\n"
+ ".inst 0xa0412aa0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc16d16ca // sdot za.s[x8, 2], { z22.h-z23.h }, z13.h\n"
+ ".inst 0xc16516cb // sdot za.s[x8, 3], { z22.h-z23.h }, z5.h\n"
+ ".inst 0xc16116e8 // sdot za.s[x8, 0], { z23.h-z24.h }, z1.h\n"
+ ".inst 0xc16016e9 // sdot za.s[x8, 1], { z23.h-z24.h }, z0.h\n"
+ ".inst 0xa0412a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc16116ea // sdot za.s[x8, 2], { z23.h-z24.h }, z1.h\n"
+ ".inst 0xc16016eb // sdot za.s[x8, 3], { z23.h-z24.h }, z0.h\n"
+ "15:" // Padded: 0 priming loads
+ ".inst 0xa1402be5 // ld1h { z5.h, z13.h }, pn10.b/Z, [SP]\n"
+ ".inst 0xa0412bea // ld1h { z10.h-z11.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
+ "cbz x15, 18f\n"
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z19.s }, p0/Z, [x14]\n"
+ "add z19.h, p0/M, z19.h, z21.h\n"
+ "add x20, x14, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z18.s }, p0/Z, [x20]\n"
+ "add z18.h, p0/M, z18.h, z21.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z17.s }, p0/Z, [x20]\n"
+ "add z17.h, p0/M, z17.h, z21.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z16.s }, p0/Z, [x20]\n"
+ "add z16.h, p0/M, z16.h, z21.h\n"
+ "mov x12, #0x4\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z6.h, z19.h, z18.h\n"
+ "trn1 z7.h, z17.h, z16.h\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z17.s }, p0/Z, [x20]\n"
+ "add z17.h, p0/M, z17.h, z21.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z16.s }, p0/Z, [x20]\n"
+ "add z16.h, p0/M, z16.h, z21.h\n"
+ "sub x15, x15, #0x1\n"
+ "sub x13, x13, #0x1\n"
+ "cmp x15, x13\n"
+ "trn1 z8.h, z17.h, z16.h\n"
+ "csel x23, x15, x13, LT\n"
+ "add x14, x14, %x[ld_in_col]\n"
+ "sub x13, x13, x23\n"
+ "cbz x23, 17f\n"
+ "16:" // Padded: Main loop
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z9.s }, p0/Z, [x14]\n"
+ ".inst 0xc16d14c8 // sdot za.s[x8, 0], { z6.h-z7.h }, z13.h\n"
+ ".inst 0xc16514c9 // sdot za.s[x8, 1], { z6.h-z7.h }, z5.h\n"
+ "add z9.h, p0/M, z9.h, z21.h\n"
+ "add x22, x14, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z19.s }, p0/Z, [x22]\n"
+ ".inst 0xc16b14e8 // sdot za.s[x8, 0], { z7.h-z8.h }, z11.h\n"
+ "add z19.h, p0/M, z19.h, z21.h\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ ".inst 0xc16a14e9 // sdot za.s[x8, 1], { z7.h-z8.h }, z10.h\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z18.s }, p0/Z, [x22]\n"
+ "add z18.h, p0/M, z18.h, z21.h\n"
+ ".inst 0xc0060818 // mova { z24.d-z25.d }, za.d[x8, #0]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z16.s }, p0/Z, [x22]\n"
+ ".inst 0xc006083a // mova { z26.d-z27.d }, za.d[x8, #1]\n"
+ "mov x12, #0x4\n"
+ "addvl x21, SP, #4\n"
+ "add z16.h, p0/M, z16.h, z21.h\n"
+ ".inst 0xc1aeac18 // sqdmulh { z24.s-z27.s }, { z24.s-z27.s }, z14.s\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ ".inst 0xa1402aa3 // ld1h { z3.h, z11.h }, pn10.b/Z, [x21]\n"
+ "addvl x20, SP, #8\n"
+ ".inst 0xc16b14ca // sdot za.s[x8, 2], { z6.h-z7.h }, z11.h\n"
+ "subs x23, x23, #0x1\n"
+ "ld1b { z17.s }, p0/Z, [x22]\n"
+ ".inst 0xc16314cb // sdot za.s[x8, 3], { z6.h-z7.h }, z3.h\n"
+ ".inst 0xc1acaa38 // srshl { z24.s-z27.s }, { z24.s-z27.s }, z12.s\n"
+ ".inst 0xa1402a85 // ld1h { z5.h, z13.h }, pn10.b/Z, [x20]\n"
+ "add z17.h, p0/M, z17.h, z21.h\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ ".inst 0xa0412aaa // ld1h { z10.h-z11.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc16d14cc // sdot za.s[x8, 4], { z6.h-z7.h }, z13.h\n"
+ ".inst 0xc1afab18 // add { z24.s-z27.s }, { z24.s-z27.s }, z15.s\n"
+ "ld1b { z2.s }, p0/Z, [x22]\n"
+ ".inst 0xc16514cd // sdot za.s[x8, 5], { z6.h-z7.h }, z5.h\n"
+ "add z2.h, p0/M, z2.h, z21.h\n"
+ "add x14, x14, %x[ld_in_col]\n"
+ ".inst 0xc16b14ea // sdot za.s[x8, 2], { z7.h-z8.h }, z11.h\n"
+ ".inst 0xa1402be5 // ld1h { z5.h, z13.h }, pn10.b/Z, [SP]\n"
+ ".inst 0xc1bccfb8 // sclamp { z24.s-z27.s }, z29.s, z28.s\n"
+ ".inst 0xc16a14eb // sdot za.s[x8, 3], { z7.h-z8.h }, z10.h\n"
+ ".inst 0xa1412a83 // ld1h { z3.h, z11.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ "st1b { z24.s }, p1, [x11]\n"
+ "add x11, x11, x9\n"
+ ".inst 0xc16b14ec // sdot za.s[x8, 4], { z7.h-z8.h }, z11.h\n"
+ "st1b { z26.s }, p1, [x10]\n"
+ "add x10, x10, x28\n"
+ "trn1 z6.h, z9.h, z19.h\n"
+ ".inst 0xc16314ed // sdot za.s[x8, 5], { z7.h-z8.h }, z3.h\n"
+ "add x8, x8, #0x2\n"
+ ".inst 0xa0412bea // ld1h { z10.h-z11.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
+ "st1b { z25.s }, p1, [x27]\n"
+ "add x27, x27, x25\n"
+ "st1b { z27.s }, p1, [x26]\n"
+ "add x26, x26, x24\n"
+ ".inst 0xc0040bc4 // mova za.d[x8, #4], { z30.d-z31.d }\n"
+ ".inst 0xc0040bc5 // mova za.d[x8, #5], { z30.d-z31.d }\n"
+ "trn1 z7.h, z18.h, z16.h\n"
+ "trn1 z8.h, z17.h, z2.h\n"
+ "bgt 16b\n"
+ "17:" // Main loop tail
+ ".inst 0xc16d14c8 // sdot za.s[x8, 0], { z6.h-z7.h }, z13.h\n"
+ "addvl x21, SP, #4\n"
+ "addvl x20, SP, #8\n"
+ ".inst 0xc16514c9 // sdot za.s[x8, 1], { z6.h-z7.h }, z5.h\n"
+ ".inst 0xa0402aa0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc16b14e8 // sdot za.s[x8, 0], { z7.h-z8.h }, z11.h\n"
+ ".inst 0xc16a14e9 // sdot za.s[x8, 1], { z7.h-z8.h }, z10.h\n"
+ ".inst 0xa0412aa2 // ld1h { z2.h-z3.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc0060818 // mova { z24.d-z25.d }, za.d[x8, #0]\n"
+ ".inst 0xc006083a // mova { z26.d-z27.d }, za.d[x8, #1]\n"
+ ".inst 0xc1aeac18 // sqdmulh { z24.s-z27.s }, { z24.s-z27.s }, z14.s\n"
+ ".inst 0xc16114ca // sdot za.s[x8, 2], { z6.h-z7.h }, z1.h\n"
+ ".inst 0xc1acaa38 // srshl { z24.s-z27.s }, { z24.s-z27.s }, z12.s\n"
+ ".inst 0xc16014cb // sdot za.s[x8, 3], { z6.h-z7.h }, z0.h\n"
+ ".inst 0xa1402a81 // ld1h { z1.h, z9.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc1afab18 // add { z24.s-z27.s }, { z24.s-z27.s }, z15.s\n"
+ ".inst 0xc16914cc // sdot za.s[x8, 4], { z6.h-z7.h }, z9.h\n"
+ ".inst 0xc1bccfb8 // sclamp { z24.s-z27.s }, z29.s, z28.s\n"
+ "st1b { z24.s }, p1, [x11]\n"
+ "add x11, x11, x9\n"
+ ".inst 0xc16114cd // sdot za.s[x8, 5], { z6.h-z7.h }, z1.h\n"
+ "st1b { z26.s }, p1, [x10]\n"
+ "add x10, x10, x28\n"
+ ".inst 0xc16314ea // sdot za.s[x8, 2], { z7.h-z8.h }, z3.h\n"
+ "st1b { z25.s }, p1, [x27]\n"
+ "add x27, x27, x25\n"
+ ".inst 0xc16214eb // sdot za.s[x8, 3], { z7.h-z8.h }, z2.h\n"
+ ".inst 0xa0412a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ "st1b { z27.s }, p1, [x26]\n"
+ "add x26, x26, x24\n"
+ ".inst 0xc16114ec // sdot za.s[x8, 4], { z7.h-z8.h }, z1.h\n"
+ ".inst 0xc16014ed // sdot za.s[x8, 5], { z7.h-z8.h }, z0.h\n"
+ "add x8, x8, #0x2\n"
+ ".inst 0xc0040bc4 // mova za.d[x8, #4], { z30.d-z31.d }\n"
+ ".inst 0xc0040bc5 // mova za.d[x8, #5], { z30.d-z31.d }\n"
+ "18:" // Main loop skip tail
+ "cbz x13, 20f\n"
+ "19:" // Right padding loop
+ ".inst 0xc0060804 // mova { z4.d-z5.d }, za.d[x8, #0]\n"
+ "subs x13, x13, #0x1\n"
+ ".inst 0xc0060826 // mova { z6.d-z7.d }, za.d[x8, #1]\n"
+ ".inst 0xc1aeac04 // sqdmulh { z4.s-z7.s }, { z4.s-z7.s }, z14.s\n"
+ "add x8, x8, #0x2\n"
+ ".inst 0xc1acaa24 // srshl { z4.s-z7.s }, { z4.s-z7.s }, z12.s\n"
+ ".inst 0xc0040bc4 // mova za.d[x8, #4], { z30.d-z31.d }\n"
+ ".inst 0xc1afab04 // add { z4.s-z7.s }, { z4.s-z7.s }, z15.s\n"
+ ".inst 0xc0040bc5 // mova za.d[x8, #5], { z30.d-z31.d }\n"
+ ".inst 0xc1bccfa4 // sclamp { z4.s-z7.s }, z29.s, z28.s\n"
+ "st1b { z4.s }, p1, [x11]\n"
+ "add x11, x11, x9\n"
+ "st1b { z6.s }, p1, [x10]\n"
+ "add x10, x10, x28\n"
+ "st1b { z5.s }, p1, [x27]\n"
+ "add x27, x27, x25\n"
+ "st1b { z7.s }, p1, [x26]\n"
+ "add x26, x26, x24\n"
+ "bgt 19b\n"
+ "20:" // End
+ "ldr x20, [%x[args], %[offsetof_Args_weights]]\n"
+ "incw x20, ALL, MUL #9\n"
+ "str x20, [%x[args], %[offsetof_Args_weights]]\n"
+ "incw x16\n"
+ "ldr x21, [%x[args], %[offsetof_Args_ld_in_vl]]\n"
+ "whilelt p1.s, x16, x17\n"
+ "ldr x20, [%x[args], %[offsetof_Args_inptr]]\n"
+ "add x20, x20, x21\n"
+ "str x20, [%x[args], %[offsetof_Args_inptr]]\n"
+ "ldr x25, [%x[args], %[offsetof_Args_outptrs]]\n"
+ "ldr x24, [%x[args], %[offsetof_Args_ld_out_vls]]\n"
+ "ldp x23, x22, [x25, #0x0]\n"
+ "ldp x21, x20, [x24, #0x0]\n"
+ "add x23, x23, x21\n"
+ "add x22, x22, x20\n"
+ "stp x23, x22, [x25, #0x0]\n"
+ "ldp x23, x22, [x25, #0x10]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x23, x23, x21\n"
+ "add x22, x22, x20\n"
+ "stp x23, x22, [x25, #0x10]\n"
+ "b.any 1b\n"
+ "addvl SP, SP, #12\n"
+ ".inst 0xd503467f // SMSTOP\n"
+ :
+ : [args] "r" (&args), [ld_in_col] "r" (ld_in_col), [ld_in_row] "r" (ld_in_row), [offsetof_Args_current_channel] "I" (offsetof(Args, current_channel)), [offsetof_Args_inptr] "I" (offsetof(Args, inptr)), [offsetof_Args_input_cols] "I" (offsetof(Args, input_cols)), [offsetof_Args_ld_in_vl] "I" (offsetof(Args, ld_in_vl)), [offsetof_Args_ld_out_cols] "I" (offsetof(Args, ld_out_cols)), [offsetof_Args_ld_out_vls] "I" (offsetof(Args, ld_out_vls)), [offsetof_Args_n_channels] "I" (offsetof(Args, n_channels)), [offsetof_Args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_Args_output_cols] "I" (offsetof(Args, output_cols)), [offsetof_Args_pad_bottom] "I" (offsetof(Args, pad_bottom)), [offsetof_Args_pad_left] "I" (offsetof(Args, pad_left)), [offsetof_Args_pad_top] "I" (offsetof(Args, pad_top)), [offsetof_Args_weights] "I" (offsetof(Args, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_bias] "I" (offsetof(arm_gemm::Requantize32, bias)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [offsetof_Requantize32_per_channel_muls] "I" (offsetof(arm_gemm::Requantize32, per_channel_muls)), [offsetof_Requantize32_per_channel_right_shifts] "I" (offsetof(arm_gemm::Requantize32, per_channel_right_shifts)), [offsetof_Requantize32_per_layer_mul] "I" (offsetof(arm_gemm::Requantize32, per_layer_mul)), [offsetof_Requantize32_per_layer_right_shift] "I" (offsetof(arm_gemm::Requantize32, per_layer_right_shift)), [qp] "r" (&qp)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME2)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_3x3_s2_4rows_dot_za.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_3x3_s2_4rows_dot_za.hpp
new file mode 100644
index 0000000000..104c11fc9d
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_3x3_s2_4rows_dot_za.hpp
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_conv/depthwise/depthwise_planar.hpp"
+
+namespace arm_conv {
+namespace depthwise {
+
+// Forward declaration of the generated SME2 kernel (body lives in
+// sme2_u8s8u8q_planar_3x3_s2_4rows_dot_za/generic.cpp). Computes a planar
+// depthwise 3x3 stride-2 convolution producing four output rows per pass,
+// on uint8_t activations with int8_t weights, requantizing back to uint8_t
+// via the Requantize32 parameters.
+void sme2_u8s8u8q_planar_3x3_s2_4rows_dot_za_impl(
+  const uint8_t *inptr,            // pointer to the first valid input element
+  size_t ld_in_row,                // input stride between rows (elements)
+  size_t ld_in_col,                // input stride between columns (elements)
+  size_t ld_in_vl,                 // input stride between vector-length channel blocks
+  unsigned int pad_top,            // rows of implicit zero padding above the input
+  unsigned int valid_input_rows,
+  unsigned int pad_left,           // columns of implicit zero padding left of the input
+  unsigned int valid_input_cols,
+  const int8_t *weights,           // packed signed weights for this kernel
+  uint8_t **outptrs,               // one output pointer per output row
+  const size_t *outlds,            // per-output-row column strides
+  const size_t *outvllds,          // per-output-row vector-length-block strides
+  unsigned int output_cols,
+  unsigned int start_channel,      // first channel handled by this call
+  unsigned int valid_channels,
+  const arm_gemm::Requantize32 &qp // quantization params (offsets, muls, shifts, clamp)
+);
+
+// Strategy wrapper that registers the generated SME2 kernel above with the
+// planar depthwise framework. It reports the kernel's fixed geometry
+// (3x3 filter, stride 2, four output rows per pass, SME vector length) to
+// the PlanarStrategy base and hands back the kernel entry point.
+// NOTE: this file is machine-generated glue; keep it in sync with the
+// kernel generator rather than editing by hand.
+class sme2_u8s8u8q_planar_3x3_s2_4rows_dot_za : public PlanarStrategy<uint8_t, int8_t>
+{
+  using Parent = PlanarStrategy<uint8_t, int8_t>;
+
+  public:
+  using return_type = uint8_t;
+  // Fixed kernel geometry consumed by the depthwise framework.
+  constexpr static auto output_rows = 4u;
+  constexpr static auto kernel_rows = 3u, kernel_cols = 3u;
+  constexpr static auto stride_rows = 2u, stride_cols = 2u;
+  constexpr static auto vl_type = arm_gemm::VLType::SME;
+
+  // CPUInfo is unused: the kernel's shape parameters are compile-time fixed.
+  sme2_u8s8u8q_planar_3x3_s2_4rows_dot_za(const CPUInfo *)
+  : Parent(kernel_rows, kernel_cols, stride_rows, stride_cols, output_rows, vl_type)
+  {
+  }
+
+  // Return the generated SME2 assembly kernel implementing this strategy.
+  typename Parent::KernelType get_kernel(void) const override
+  {
+    return sme2_u8s8u8q_planar_3x3_s2_4rows_dot_za_impl;
+  }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_3x3_s2_4rows_dot_za/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_3x3_s2_4rows_dot_za/generic.cpp
new file mode 100644
index 0000000000..8ce04fb8c2
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_3x3_s2_4rows_dot_za/generic.cpp
@@ -0,0 +1,881 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if defined(ARM_COMPUTE_ENABLE_SME2)
+
+#include <algorithm>
+#include <cstddef>
+#include "arm_gemm.hpp"
+
+using arm_gemm::Requantize32;
+
+namespace arm_conv {
+namespace depthwise {
+
+void sme2_u8s8u8q_planar_3x3_s2_4rows_dot_za_impl(
+ const uint8_t *inptr,
+ size_t ld_in_row,
+ size_t ld_in_col,
+ size_t ld_in_vl,
+ unsigned int pad_top,
+ unsigned int valid_input_rows,
+ unsigned int pad_left,
+ unsigned int valid_input_cols,
+ const int8_t *weights,
+ uint8_t **outptrs,
+ const size_t *outlds,
+ const size_t *outvllds,
+ unsigned int output_cols,
+ unsigned int start_channel,
+ unsigned int valid_channels,
+ const arm_gemm::Requantize32 &qp
+)
+{
+ struct Args
+ {
+ const uint8_t *inptr;
+ size_t ld_in_vl;
+ long unsigned int pad_top, pad_bottom, pad_left;
+ const int8_t *weights;
+ long unsigned int input_cols, output_cols;
+ uint8_t **outptrs;
+ const size_t *ld_out_cols;
+ const size_t *ld_out_vls;
+ long unsigned int current_channel, n_channels;
+ };
+
+ Args args = { inptr, ld_in_vl, pad_top, 9u - std::min(9u, pad_top + valid_input_rows), pad_left, weights, valid_input_cols, output_cols, outptrs, outlds, outvllds, start_channel, valid_channels };
+
+ __asm__ __volatile__(
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "ldr x6, [%x[args], %[offsetof_Args_pad_bottom]]\n"
+ "ptrue p2.b\n"
+ "mov x20, #0x9\n"
+ "ldr x7, [%x[args], %[offsetof_Args_pad_top]]\n"
+ "ld1rh { z11.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_a_offset]]\n"
+ "sub x20, x20, x6\n"
+ ".inst 0x25207812 // ptrue pn10.b\n"
+ "ldr x17, [%x[args], %[offsetof_Args_n_channels]]\n"
+ "whilelt p1.s, XZR, x17\n"
+ "whilelt p9.s, XZR, x20\n"
+ "ld1rw { z13.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_c_offset]]\n"
+ "whilelt p8.s, XZR, x7\n"
+ "addvl SP, SP, #-6\n"
+ "ldr x16, [%x[args], %[offsetof_Args_current_channel]]\n"
+ "neg z11.h, p2/M, z11.h\n"
+ "eor p8.b, p2/Z, p8.b, p9.b\n"
+ "ld1rw { z6.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_per_layer_mul]]\n"
+ "ld1rw { z9.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_per_layer_right_shift]]\n"
+ "ld1rw { z10.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_minval]]\n"
+ "ld1rw { z7.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_maxval]]\n"
+ "1:" // Channel loop
+ "ldr x20, [%x[qp], %[offsetof_Requantize32_bias]]\n"
+ "mov z28.s, #0x0\n"
+ "cbz x20, 2f\n"
+ "ld1w { z28.s }, p1/Z, [x20, x16, LSL #2]\n"
+ "2:" // Load bias: Done
+ "ldr x22, [%x[args], %[offsetof_Args_weights]]\n"
+ "mov x20, x22\n"
+ "ld1sb { z26.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #3\n"
+ "ld1rh { z16.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_b_offset]]\n"
+ "sub z26.h, z26.h, z16.h\n"
+ "incw x22\n"
+ "mov z24.h, #0x0\n"
+ "ld1sb { z3.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #3\n"
+ "sub z3.h, z3.h, z16.h\n"
+ "trn1 z31.h, z26.h, z3.h\n"
+ "ld1sb { z21.s }, p2/Z, [x20]\n"
+ "sub z21.h, z21.h, z16.h\n"
+ "mov x20, x22\n"
+ "trn1 z14.h, z21.h, z24.h\n"
+ "ld1sb { z2.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #3\n"
+ "sub z2.h, z2.h, z16.h\n"
+ "addvl x21, SP, #6\n"
+ "ld1sb { z25.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #3\n"
+ "sub z25.h, z25.h, z16.h\n"
+ "incw x22\n"
+ "ld1sb { z27.s }, p2/Z, [x20]\n"
+ "sub z27.h, z27.h, z16.h\n"
+ "addvl x21, x21, #-2\n"
+ "mov x20, x22\n"
+ "st1h { z31.h }, p2, [x21]\n"
+ "trn1 z4.h, z2.h, z25.h\n"
+ "ld1sb { z26.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #3\n"
+ "ld1sb { z23.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #3\n"
+ "st1h { z14.h }, p2, [x21, #1, MUL VL]\n"
+ "trn1 z12.h, z27.h, z24.h\n"
+ "ld1sb { z20.s }, p2/Z, [x20]\n"
+ "sub z26.h, z26.h, z16.h\n"
+ "sub z23.h, z23.h, z16.h\n"
+ "ldr x20, [%x[qp], %[offsetof_Requantize32_per_channel_muls]]\n"
+ "sub z20.h, z20.h, z16.h\n"
+ "addvl x21, x21, #-2\n"
+ "st1h { z4.h }, p2, [x21]\n"
+ "mov z29.d, z28.d\n"
+ "st1h { z12.h }, p2, [x21, #1, MUL VL]\n"
+ "addvl x21, x21, #-2\n"
+ "mov z30.d, z28.d\n"
+ "mov z31.d, z28.d\n"
+ "trn1 z25.h, z26.h, z23.h\n"
+ "st1h { z25.h }, p2, [x21]\n"
+ "trn1 z3.h, z20.h, z24.h\n"
+ "st1h { z3.h }, p2, [x21, #1, MUL VL]\n"
+ "cbz x20, 3f\n"
+ "ld1w { z6.s }, p1/Z, [x20, x16, LSL #2]\n"
+ "3:" // Load mul: End
+ "ldr x20, [%x[qp], %[offsetof_Requantize32_per_channel_right_shifts]]\n"
+ "cbz x20, 4f\n"
+ "ld1w { z9.s }, p1/Z, [x20, x16, LSL #2]\n"
+ "4:" // Load right_shift: End
+ "ldr x15, [%x[args], %[offsetof_Args_input_cols]]\n"
+ "sub x20, x15, #0x1\n"
+ "orr x23, x20, %x[ld_in_col], LSL #16\n"
+ "ldr x14, [%x[args], %[offsetof_Args_inptr]]\n"
+ "orr x23, x17, x23, LSL #22\n"
+ "mov x22, #0x9\n"
+ "add x21, x7, x6\n"
+ "lsl x20, %x[ld_in_row], #0x0\n"
+ "ldr x13, [%x[args], %[offsetof_Args_output_cols]]\n"
+ "mov x8, #0x0\n"
+ "lsl x23, x23, #0x0\n"
+ "sub x22, x22, x21\n"
+ "madd x20, x20, x7, x14\n"
+ "5:" // Issue prefetches
+ "subs x22, x22, #0x1\n"
+ ".inst 0xf8b74a9c // rprfm pldstrm, x23, [x20]\n"
+ "add x20, x20, %x[ld_in_col]\n"
+ "bgt 5b\n"
+ "ldr x23, [%x[args], %[offsetof_Args_outptrs]]\n"
+ "lsl x20, %x[ld_in_row], #0x0\n"
+ "msub x14, x7, x20, x14\n"
+ ".inst 0xc0040f80 // mova za.d[x8, #0], { z28.d-z31.d }\n"
+ "ldr x20, [%x[args], %[offsetof_Args_ld_out_cols]]\n"
+ ".inst 0xc0040f81 // mova za.d[x8, #1], { z28.d-z31.d }\n"
+ "mov x22, #0x2\n"
+ "ldp x11, x10, [x23], #0x10\n"
+ ".inst 0xc0040f82 // mova za.d[x8, #2], { z28.d-z31.d }\n"
+ "ldp x9, x28, [x20], #0x10\n"
+ "ldr x21, [%x[args], %[offsetof_Args_pad_left]]\n"
+ "ldp x27, x26, [x23], #0x10\n"
+ "ldp x25, x24, [x20], #0x10\n"
+ "cbz x21, 7f\n"
+ "cmp x21, x22\n"
+ "csel x20, x21, x22, LT\n"
+ "sub x21, x21, x20\n"
+ "sub x22, x22, x20\n"
+ "cbz x21, 7f\n"
+ ".inst 0xc0060c18 // mova { z24.d-z27.d }, za.d[x8, #0]\n"
+ ".inst 0xc1a6ac18 // sqdmulh { z24.s-z27.s }, { z24.s-z27.s }, z6.s\n"
+ "and x22, x21, #0x1\n"
+ ".inst 0xc1a9aa38 // srshl { z24.s-z27.s }, { z24.s-z27.s }, z9.s\n"
+ "add x21, x21, #0x1\n"
+ "lsr x21, x21, #0x1\n"
+ ".inst 0xc1adab18 // add { z24.s-z27.s }, { z24.s-z27.s }, z13.s\n"
+ "sub x13, x13, x21\n"
+ ".inst 0xc1a7cd58 // sclamp { z24.s-z27.s }, z10.s, z7.s\n"
+ "6:" // Left padding
+ "subs x21, x21, #0x1\n"
+ "st1b { z24.s }, p1, [x11]\n"
+ "add x11, x11, x9\n"
+ "st1b { z25.s }, p1, [x10]\n"
+ "add x10, x10, x28\n"
+ "st1b { z26.s }, p1, [x27]\n"
+ "add x27, x27, x25\n"
+ "st1b { z27.s }, p1, [x26]\n"
+ "add x26, x26, x24\n"
+ "bgt 6b\n"
+ "7:" // Left padding: End
+ "adds XZR, x7, x6\n"
+ "bne 12f\n"
+ "cbz x22, 10f\n"
+ "cmp x22, #0x1\n"
+ "sub x15, x15, x22\n"
+ "beq 9f\n"
+ "8:" // Unpadded: 2 priming loads
+ "add x21, x14, %x[ld_in_row]\n"
+ "ld1b { z1.s }, p1/Z, [x14]\n"
+ "addvl x20, SP, #4\n"
+ "ld1b { z21.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z1.h, z1.h, z21.h\n"
+ "add z1.h, z1.h, z11.h\n"
+ "ld1b { z2.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "add x14, x14, %x[ld_in_col]\n"
+ "ld1b { z15.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z2.h, z2.h, z15.h\n"
+ "add z2.h, z2.h, z11.h\n"
+ "ld1b { z3.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "ld1b { z21.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z3.h, z3.h, z21.h\n"
+ "add z3.h, z3.h, z11.h\n"
+ "ld1b { z4.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "ld1b { z19.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z4.h, z4.h, z19.h\n"
+ "add z4.h, z4.h, z11.h\n"
+ "ld1b { z8.s }, p1/Z, [x21]\n"
+ "mov z5.d, z8.d\n"
+ "add z5.h, z5.h, z11.h\n"
+ ".inst 0xa1402a80 // ld1h { z0.h, z8.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc1701428 // sdot za.s[x8, 0], { z1.h-z4.h }, z0.h\n"
+ ".inst 0xc1781448 // sdot za.s[x8, 0], { z2.h-z5.h }, z8.h\n"
+ "9:" // Unpadded: 1 priming loads
+ "add x21, x14, %x[ld_in_row]\n"
+ "ld1b { z1.s }, p1/Z, [x14]\n"
+ "addvl x20, SP, #2\n"
+ "ld1b { z21.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z1.h, z1.h, z21.h\n"
+ "add z1.h, z1.h, z11.h\n"
+ "ld1b { z2.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "add x14, x14, %x[ld_in_col]\n"
+ "ld1b { z12.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z2.h, z2.h, z12.h\n"
+ "add z2.h, z2.h, z11.h\n"
+ "ld1b { z3.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "ld1b { z8.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z3.h, z3.h, z8.h\n"
+ "add z3.h, z3.h, z11.h\n"
+ "ld1b { z4.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "ld1b { z5.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z4.h, z4.h, z5.h\n"
+ "add z4.h, z4.h, z11.h\n"
+ "ld1b { z5.s }, p1/Z, [x21]\n"
+ "mov z5.d, z5.d\n"
+ "add z5.h, z5.h, z11.h\n"
+ ".inst 0xa1402a80 // ld1h { z0.h, z8.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc1701428 // sdot za.s[x8, 0], { z1.h-z4.h }, z0.h\n"
+ ".inst 0xc1781448 // sdot za.s[x8, 0], { z2.h-z5.h }, z8.h\n"
+ "10:" // Unpadded: 0 priming loads
+ "cmp x15, #0x2\n"
+ ".inst 0xa0402bee // ld1h { z14.h-z15.h }, pn10.b/Z, [SP]\n"
+ "blt 18f\n"
+ "add x21, x14, %x[ld_in_row]\n"
+ "ld1b { z21.s }, p1/Z, [x14]\n"
+ "sub x15, x15, #0x2\n"
+ "ld1b { z8.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z21.h, z21.h, z8.h\n"
+ "sub x13, x13, #0x1\n"
+ "ld1b { z22.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "lsr x20, x15, #0x1\n"
+ "add z21.h, z21.h, z11.h\n"
+ "ld1b { z25.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z22.h, z22.h, z25.h\n"
+ "cmp x20, x13\n"
+ "ld1b { z23.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "csel x23, x20, x13, LT\n"
+ "add z22.h, z22.h, z11.h\n"
+ "ld1b { z18.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z23.h, z23.h, z18.h\n"
+ "add z23.h, z23.h, z11.h\n"
+ "ld1b { z24.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "add x14, x14, %x[ld_in_col]\n"
+ "ld1b { z19.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z24.h, z24.h, z19.h\n"
+ "add z24.h, z24.h, z11.h\n"
+ "ld1b { z8.s }, p1/Z, [x21]\n"
+ "mov z25.d, z8.d\n"
+ "add z25.h, z25.h, z11.h\n"
+ "and x15, x15, #0x1\n"
+ "sub x13, x13, x23\n"
+ "cbz x23, 17f\n"
+ "11:" // Unpadded: Main loop
+ ".inst 0xc17e16a8 // sdot za.s[x8, 0], { z21.h-z24.h }, z14.h\n"
+ "addvl x20, SP, #4\n"
+ "add x22, x14, %x[ld_in_row]\n"
+ ".inst 0xc17f16c8 // sdot za.s[x8, 0], { z22.h-z25.h }, z15.h\n"
+ ".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
+ "addvl x21, SP, #2\n"
+ "subs x23, x23, #0x1\n"
+ ".inst 0xc17016a9 // sdot za.s[x8, 1], { z21.h-z24.h }, z0.h\n"
+ "ld1b { z21.s }, p1/Z, [x14]\n"
+ "add x14, x14, %x[ld_in_col]\n"
+ "add x20, x14, %x[ld_in_row]\n"
+ "ld1b { z18.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ ".inst 0xc17116c9 // sdot za.s[x8, 1], { z22.h-z25.h }, z1.h\n"
+ "trn1 z21.h, z21.h, z18.h\n"
+ "ld1b { z22.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "add z21.h, z21.h, z11.h\n"
+ ".inst 0xc0060c00 // mova { z0.d-z3.d }, za.d[x8, #0]\n"
+ "ld1b { z8.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "trn1 z22.h, z22.h, z8.h\n"
+ "add z22.h, z22.h, z11.h\n"
+ "ld1b { z23.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "add x8, x8, #0x1\n"
+ ".inst 0xc0040f82 // mova za.d[x8, #2], { z28.d-z31.d }\n"
+ "ld1b { z27.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "trn1 z23.h, z23.h, z27.h\n"
+ "add z23.h, z23.h, z11.h\n"
+ "ld1b { z24.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ ".inst 0xc1a6ac00 // sqdmulh { z0.s-z3.s }, { z0.s-z3.s }, z6.s\n"
+ "ld1b { z8.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "trn1 z24.h, z24.h, z8.h\n"
+ "add z24.h, z24.h, z11.h\n"
+ "ld1b { z4.s }, p1/Z, [x22]\n"
+ "mov z25.d, z4.d\n"
+ "add z25.h, z25.h, z11.h\n"
+ ".inst 0xa1402aa4 // ld1h { z4.h, z12.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc17416a8 // sdot za.s[x8, 0], { z21.h-z24.h }, z4.h\n"
+ ".inst 0xc1a9aa20 // srshl { z0.s-z3.s }, { z0.s-z3.s }, z9.s\n"
+ "ld1b { z21.s }, p1/Z, [x14]\n"
+ ".inst 0xc17c16c8 // sdot za.s[x8, 0], { z22.h-z25.h }, z12.h\n"
+ ".inst 0xc1adab00 // add { z0.s-z3.s }, { z0.s-z3.s }, z13.s\n"
+ "ld1b { z12.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z21.h, z21.h, z12.h\n"
+ ".inst 0xc1a7cd40 // sclamp { z0.s-z3.s }, z10.s, z7.s\n"
+ "ld1b { z22.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "st1b { z0.s }, p1, [x11]\n"
+ "add x11, x11, x9\n"
+ "ld1b { z20.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z22.h, z22.h, z20.h\n"
+ "st1b { z1.s }, p1, [x10]\n"
+ "ld1b { z23.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "add x10, x10, x28\n"
+ "st1b { z2.s }, p1, [x27]\n"
+ "ld1b { z24.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z23.h, z23.h, z24.h\n"
+ "add x27, x27, x25\n"
+ "ld1b { z24.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "st1b { z3.s }, p1, [x26]\n"
+ "add x26, x26, x24\n"
+ "ld1b { z3.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z24.h, z24.h, z3.h\n"
+ "add z21.h, z21.h, z11.h\n"
+ "ld1b { z3.s }, p1/Z, [x20]\n"
+ "mov z25.d, z3.d\n"
+ "add z22.h, z22.h, z11.h\n"
+ "add x14, x14, %x[ld_in_col]\n"
+ ".inst 0xa0402bee // ld1h { z14.h-z15.h }, pn10.b/Z, [SP]\n"
+ "add z23.h, z23.h, z11.h\n"
+ "add z24.h, z24.h, z11.h\n"
+ "add z25.h, z25.h, z11.h\n"
+ "bgt 11b\n"
+ "b 17f\n"
+ "12:" // Padded
+ "cbz x22, 15f\n"
+ "cmp x22, #0x1\n"
+ "sub x15, x15, x22\n"
+ "beq 14f\n"
+ "13:" // Padded: 2 priming loads
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z22.s }, p0/Z, [x14]\n"
+ "add z22.h, p0/M, z22.h, z11.h\n"
+ "add x20, x14, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z17.s }, p0/Z, [x20]\n"
+ "add z17.h, p0/M, z17.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z23.s }, p0/Z, [x20]\n"
+ "add z23.h, p0/M, z23.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z4.s }, p0/Z, [x20]\n"
+ "add z4.h, p0/M, z4.h, z11.h\n"
+ "mov x12, #0x4\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z22.h, z22.h, z17.h\n"
+ "trn1 z23.h, z23.h, z4.h\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z24.s }, p0/Z, [x20]\n"
+ "add z24.h, p0/M, z24.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z18.s }, p0/Z, [x20]\n"
+ "add z18.h, p0/M, z18.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z25.s }, p0/Z, [x20]\n"
+ "add z25.h, p0/M, z25.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z17.s }, p0/Z, [x20]\n"
+ "mov x12, #0x8\n"
+ "add z17.h, p0/M, z17.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z1.s }, p0/Z, [x20]\n"
+ "add z1.h, p0/M, z1.h, z11.h\n"
+ "addvl x20, SP, #4\n"
+ "trn1 z24.h, z24.h, z18.h\n"
+ "trn1 z25.h, z25.h, z17.h\n"
+ ".inst 0xa1402a84 // ld1h { z4.h, z12.h }, pn10.b/Z, [x20]\n"
+ "mov z26.d, z1.d\n"
+ ".inst 0xc17416c8 // sdot za.s[x8, 0], { z22.h-z25.h }, z4.h\n"
+ "add x14, x14, %x[ld_in_col]\n"
+ ".inst 0xc17c16e8 // sdot za.s[x8, 0], { z23.h-z26.h }, z12.h\n"
+ "14:" // Padded: 1 priming loads
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z22.s }, p0/Z, [x14]\n"
+ "add z22.h, p0/M, z22.h, z11.h\n"
+ "add x20, x14, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z17.s }, p0/Z, [x20]\n"
+ "add z17.h, p0/M, z17.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z23.s }, p0/Z, [x20]\n"
+ "add z23.h, p0/M, z23.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z5.s }, p0/Z, [x20]\n"
+ "add z5.h, p0/M, z5.h, z11.h\n"
+ "mov x12, #0x4\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z22.h, z22.h, z17.h\n"
+ "trn1 z23.h, z23.h, z5.h\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z24.s }, p0/Z, [x20]\n"
+ "add z24.h, p0/M, z24.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z18.s }, p0/Z, [x20]\n"
+ "add z18.h, p0/M, z18.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z25.s }, p0/Z, [x20]\n"
+ "add z25.h, p0/M, z25.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z17.s }, p0/Z, [x20]\n"
+ "mov x12, #0x8\n"
+ "add z17.h, p0/M, z17.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z15.s }, p0/Z, [x20]\n"
+ "add z15.h, p0/M, z15.h, z11.h\n"
+ "addvl x20, SP, #2\n"
+ "trn1 z24.h, z24.h, z18.h\n"
+ "trn1 z25.h, z25.h, z17.h\n"
+ ".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
+ "mov z26.d, z15.d\n"
+ ".inst 0xc17016c8 // sdot za.s[x8, 0], { z22.h-z25.h }, z0.h\n"
+ "add x14, x14, %x[ld_in_col]\n"
+ ".inst 0xc17116e8 // sdot za.s[x8, 0], { z23.h-z26.h }, z1.h\n"
+ "15:" // Padded: 0 priming loads
+ "cmp x15, #0x2\n"
+ ".inst 0xa0402bee // ld1h { z14.h-z15.h }, pn10.b/Z, [SP]\n"
+ "blt 18f\n"
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z21.s }, p0/Z, [x14]\n"
+ "add z21.h, p0/M, z21.h, z11.h\n"
+ "add x20, x14, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z18.s }, p0/Z, [x20]\n"
+ "add z18.h, p0/M, z18.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z22.s }, p0/Z, [x20]\n"
+ "add z22.h, p0/M, z22.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z3.s }, p0/Z, [x20]\n"
+ "add z3.h, p0/M, z3.h, z11.h\n"
+ "mov x12, #0x4\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z21.h, z21.h, z18.h\n"
+ "trn1 z22.h, z22.h, z3.h\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z23.s }, p0/Z, [x20]\n"
+ "add z23.h, p0/M, z23.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z19.s }, p0/Z, [x20]\n"
+ "add z19.h, p0/M, z19.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z24.s }, p0/Z, [x20]\n"
+ "add z24.h, p0/M, z24.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z20.s }, p0/Z, [x20]\n"
+ "mov x12, #0x8\n"
+ "add z20.h, p0/M, z20.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z3.s }, p0/Z, [x20]\n"
+ "add z3.h, p0/M, z3.h, z11.h\n"
+ "sub x15, x15, #0x2\n"
+ "sub x13, x13, #0x1\n"
+ "trn1 z23.h, z23.h, z19.h\n"
+ "trn1 z24.h, z24.h, z20.h\n"
+ "lsr x20, x15, #0x1\n"
+ "cmp x20, x13\n"
+ "mov z25.d, z3.d\n"
+ "csel x22, x20, x13, LT\n"
+ "add x14, x14, %x[ld_in_col]\n"
+ "and x15, x15, #0x1\n"
+ "sub x13, x13, x22\n"
+ "cbz x22, 17f\n"
+ "16:" // Padded: Main loop
+ ".inst 0xc17e16a8 // sdot za.s[x8, 0], { z21.h-z24.h }, z14.h\n"
+ "addvl x20, SP, #4\n"
+ "mov x12, #0x0\n"
+ ".inst 0xc17f16c8 // sdot za.s[x8, 0], { z22.h-z25.h }, z15.h\n"
+ ".inst 0xa1402a84 // ld1h { z4.h, z12.h }, pn10.b/Z, [x20]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "add x21, x14, %x[ld_in_row]\n"
+ ".inst 0xc17416a9 // sdot za.s[x8, 1], { z21.h-z24.h }, z4.h\n"
+ "ld1b { z21.s }, p0/Z, [x14]\n"
+ "add z21.h, p0/M, z21.h, z11.h\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z14.s }, p0/Z, [x21]\n"
+ "add z14.h, p0/M, z14.h, z11.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ ".inst 0xc17c16c9 // sdot za.s[x8, 1], { z22.h-z25.h }, z12.h\n"
+ "ld1b { z22.s }, p0/Z, [x21]\n"
+ "add z22.h, p0/M, z22.h, z11.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z15.s }, p0/Z, [x21]\n"
+ "mov x12, #0x4\n"
+ "add z15.h, p0/M, z15.h, z11.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z23.s }, p0/Z, [x21]\n"
+ "add z23.h, p0/M, z23.h, z11.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z17.s }, p0/Z, [x21]\n"
+ "add z17.h, p0/M, z17.h, z11.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z24.s }, p0/Z, [x21]\n"
+ "add z24.h, p0/M, z24.h, z11.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z4.s }, p0/Z, [x21]\n"
+ "add z4.h, p0/M, z4.h, z11.h\n"
+ "mov x12, #0x8\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z21.h, z21.h, z14.h\n"
+ "trn1 z22.h, z22.h, z15.h\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "addvl x20, SP, #2\n"
+ "ld1b { z2.s }, p0/Z, [x21]\n"
+ "trn1 z23.h, z23.h, z17.h\n"
+ "trn1 z24.h, z24.h, z4.h\n"
+ ".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
+ "mov x12, #0x0\n"
+ ".inst 0xc0060c10 // mova { z16.d-z19.d }, za.d[x8, #0]\n"
+ "add x8, x8, #0x1\n"
+ "add z2.h, p0/M, z2.h, z11.h\n"
+ "add x14, x14, %x[ld_in_col]\n"
+ ".inst 0xc17016a8 // sdot za.s[x8, 0], { z21.h-z24.h }, z0.h\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z21.s }, p0/Z, [x14]\n"
+ "add z21.h, p0/M, z21.h, z11.h\n"
+ "add x20, x14, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "mov z25.d, z2.d\n"
+ "ld1b { z20.s }, p0/Z, [x20]\n"
+ "add z20.h, p0/M, z20.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ ".inst 0xc17116c8 // sdot za.s[x8, 0], { z22.h-z25.h }, z1.h\n"
+ "ld1b { z22.s }, p0/Z, [x20]\n"
+ "add z22.h, p0/M, z22.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z4.s }, p0/Z, [x20]\n"
+ "mov x12, #0x4\n"
+ "add z4.h, p0/M, z4.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0xc0040f82 // mova za.d[x8, #2], { z28.d-z31.d }\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z23.s }, p0/Z, [x20]\n"
+ "add z23.h, p0/M, z23.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z27.s }, p0/Z, [x20]\n"
+ "add z27.h, p0/M, z27.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z24.s }, p0/Z, [x20]\n"
+ ".inst 0xc1a6ac10 // sqdmulh { z16.s-z19.s }, { z16.s-z19.s }, z6.s\n"
+ "add z24.h, p0/M, z24.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z12.s }, p0/Z, [x20]\n"
+ "mov x12, #0x8\n"
+ ".inst 0xc1a9aa30 // srshl { z16.s-z19.s }, { z16.s-z19.s }, z9.s\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "add z12.h, p0/M, z12.h, z11.h\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z8.s }, p0/Z, [x20]\n"
+ "add z8.h, p0/M, z8.h, z11.h\n"
+ ".inst 0xc1adab10 // add { z16.s-z19.s }, { z16.s-z19.s }, z13.s\n"
+ "subs x22, x22, #0x1\n"
+ ".inst 0xa0402bee // ld1h { z14.h-z15.h }, pn10.b/Z, [SP]\n"
+ ".inst 0xc1a7cd50 // sclamp { z16.s-z19.s }, z10.s, z7.s\n"
+ "st1b { z16.s }, p1, [x11]\n"
+ "add x11, x11, x9\n"
+ "trn1 z21.h, z21.h, z20.h\n"
+ "st1b { z17.s }, p1, [x10]\n"
+ "add x10, x10, x28\n"
+ "trn1 z22.h, z22.h, z4.h\n"
+ "trn1 z23.h, z23.h, z27.h\n"
+ "st1b { z18.s }, p1, [x27]\n"
+ "add x27, x27, x25\n"
+ "trn1 z24.h, z24.h, z12.h\n"
+ "mov z25.d, z8.d\n"
+ "st1b { z19.s }, p1, [x26]\n"
+ "add x26, x26, x24\n"
+ "add x14, x14, %x[ld_in_col]\n"
+ "bgt 16b\n"
+ "17:" // Main loop tail
+ ".inst 0xc17e16a8 // sdot za.s[x8, 0], { z21.h-z24.h }, z14.h\n"
+ "addvl x20, SP, #4\n"
+ "mov x12, #0x0\n"
+ ".inst 0xc17f16c8 // sdot za.s[x8, 0], { z22.h-z25.h }, z15.h\n"
+ ".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "add x20, x14, %x[ld_in_row]\n"
+ ".inst 0xc17016a9 // sdot za.s[x8, 1], { z21.h-z24.h }, z0.h\n"
+ "ld1b { z0.s }, p0/Z, [x14]\n"
+ "add z0.h, p0/M, z0.h, z11.h\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z14.s }, p0/Z, [x20]\n"
+ "add z14.h, p0/M, z14.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ ".inst 0xc17116c9 // sdot za.s[x8, 1], { z22.h-z25.h }, z1.h\n"
+ "ld1b { z1.s }, p0/Z, [x20]\n"
+ "add z1.h, p0/M, z1.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z12.s }, p0/Z, [x20]\n"
+ "mov x12, #0x4\n"
+ "add z12.h, p0/M, z12.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z2.s }, p0/Z, [x20]\n"
+ "add z2.h, p0/M, z2.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z21.s }, p0/Z, [x20]\n"
+ "add z21.h, p0/M, z21.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z3.s }, p0/Z, [x20]\n"
+ ".inst 0xc0060c10 // mova { z16.d-z19.d }, za.d[x8, #0]\n"
+ "add z3.h, p0/M, z3.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z25.s }, p0/Z, [x20]\n"
+ "mov x12, #0x8\n"
+ ".inst 0xc1a6ac10 // sqdmulh { z16.s-z19.s }, { z16.s-z19.s }, z6.s\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "add z25.h, p0/M, z25.h, z11.h\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z27.s }, p0/Z, [x20]\n"
+ "addvl x20, SP, #2\n"
+ ".inst 0xc1a9aa30 // srshl { z16.s-z19.s }, { z16.s-z19.s }, z9.s\n"
+ "trn1 z0.h, z0.h, z14.h\n"
+ "add x8, x8, #0x1\n"
+ "add z27.h, p0/M, z27.h, z11.h\n"
+ "trn1 z1.h, z1.h, z12.h\n"
+ "trn1 z2.h, z2.h, z21.h\n"
+ "add x14, x14, %x[ld_in_col]\n"
+ "trn1 z3.h, z3.h, z25.h\n"
+ ".inst 0xa0402a8e // ld1h { z14.h-z15.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc1adab10 // add { z16.s-z19.s }, { z16.s-z19.s }, z13.s\n"
+ "mov z4.d, z27.d\n"
+ ".inst 0xc17e1408 // sdot za.s[x8, 0], { z0.h-z3.h }, z14.h\n"
+ ".inst 0xc1a7cd50 // sclamp { z16.s-z19.s }, z10.s, z7.s\n"
+ "st1b { z16.s }, p1, [x11]\n"
+ "add x11, x11, x9\n"
+ ".inst 0xc0040f82 // mova za.d[x8, #2], { z28.d-z31.d }\n"
+ "st1b { z17.s }, p1, [x10]\n"
+ "add x10, x10, x28\n"
+ ".inst 0xc17f1428 // sdot za.s[x8, 0], { z1.h-z4.h }, z15.h\n"
+ ".inst 0xa0402bee // ld1h { z14.h-z15.h }, pn10.b/Z, [SP]\n"
+ "st1b { z18.s }, p1, [x27]\n"
+ "add x27, x27, x25\n"
+ "st1b { z19.s }, p1, [x26]\n"
+ "add x26, x26, x24\n"
+ "18:" // Main loop skip tail
+ "cbz x15, 19f\n" // Skip remainder inputs
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z21.s }, p0/Z, [x14]\n"
+ "add z21.h, p0/M, z21.h, z11.h\n"
+ "add x20, x14, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z17.s }, p0/Z, [x20]\n"
+ "add z17.h, p0/M, z17.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z22.s }, p0/Z, [x20]\n"
+ "add z22.h, p0/M, z22.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z0.s }, p0/Z, [x20]\n"
+ "add z0.h, p0/M, z0.h, z11.h\n"
+ "mov x12, #0x4\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z21.h, z21.h, z17.h\n"
+ "trn1 z22.h, z22.h, z0.h\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z23.s }, p0/Z, [x20]\n"
+ "add z23.h, p0/M, z23.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z17.s }, p0/Z, [x20]\n"
+ "add z17.h, p0/M, z17.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z24.s }, p0/Z, [x20]\n"
+ "add z24.h, p0/M, z24.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z5.s }, p0/Z, [x20]\n"
+ "mov x12, #0x8\n"
+ "add z5.h, p0/M, z5.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z4.s }, p0/Z, [x20]\n"
+ "add z4.h, p0/M, z4.h, z11.h\n"
+ "trn1 z23.h, z23.h, z17.h\n"
+ "trn1 z24.h, z24.h, z5.h\n"
+ "mov z25.d, z4.d\n"
+ "addvl x20, SP, #4\n"
+ ".inst 0xc17e16a8 // sdot za.s[x8, 0], { z21.h-z24.h }, z14.h\n"
+ "sub x13, x13, #0x1\n"
+ ".inst 0xc17f16c8 // sdot za.s[x8, 0], { z22.h-z25.h }, z15.h\n"
+ ".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc0060c10 // mova { z16.d-z19.d }, za.d[x8, #0]\n"
+ ".inst 0xc1a6ac10 // sqdmulh { z16.s-z19.s }, { z16.s-z19.s }, z6.s\n"
+ ".inst 0xc1a9aa30 // srshl { z16.s-z19.s }, { z16.s-z19.s }, z9.s\n"
+ ".inst 0xc17016a9 // sdot za.s[x8, 1], { z21.h-z24.h }, z0.h\n"
+ ".inst 0xc1adab10 // add { z16.s-z19.s }, { z16.s-z19.s }, z13.s\n"
+ ".inst 0xc17116c9 // sdot za.s[x8, 1], { z22.h-z25.h }, z1.h\n"
+ "add x8, x8, #0x1\n"
+ ".inst 0xc1a7cd50 // sclamp { z16.s-z19.s }, z10.s, z7.s\n"
+ "st1b { z16.s }, p1, [x11]\n"
+ "add x11, x11, x9\n"
+ ".inst 0xc0040f82 // mova za.d[x8, #2], { z28.d-z31.d }\n"
+ "st1b { z17.s }, p1, [x10]\n"
+ "add x10, x10, x28\n"
+ "st1b { z18.s }, p1, [x27]\n"
+ "add x27, x27, x25\n"
+ "st1b { z19.s }, p1, [x26]\n"
+ "add x26, x26, x24\n"
+ "19:" // Tail input: End
+ "cbz x13, 21f\n"
+ "20:" // Right padding loop
+ ".inst 0xc0060c00 // mova { z0.d-z3.d }, za.d[x8, #0]\n"
+ ".inst 0xc1a6ac00 // sqdmulh { z0.s-z3.s }, { z0.s-z3.s }, z6.s\n"
+ "add x8, x8, #0x1\n"
+ ".inst 0xc1a9aa20 // srshl { z0.s-z3.s }, { z0.s-z3.s }, z9.s\n"
+ "subs x13, x13, #0x1\n"
+ ".inst 0xc0040f82 // mova za.d[x8, #2], { z28.d-z31.d }\n"
+ ".inst 0xc1adab00 // add { z0.s-z3.s }, { z0.s-z3.s }, z13.s\n"
+ ".inst 0xc1a7cd40 // sclamp { z0.s-z3.s }, z10.s, z7.s\n"
+ "st1b { z0.s }, p1, [x11]\n"
+ "add x11, x11, x9\n"
+ "st1b { z1.s }, p1, [x10]\n"
+ "add x10, x10, x28\n"
+ "st1b { z2.s }, p1, [x27]\n"
+ "add x27, x27, x25\n"
+ "st1b { z3.s }, p1, [x26]\n"
+ "add x26, x26, x24\n"
+ "bgt 20b\n"
+ "21:" // End
+ "ldr x20, [%x[args], %[offsetof_Args_weights]]\n"
+ "incw x20, ALL, MUL #9\n"
+ "str x20, [%x[args], %[offsetof_Args_weights]]\n"
+ "incw x16\n"
+ "ldr x21, [%x[args], %[offsetof_Args_ld_in_vl]]\n"
+ "whilelt p1.s, x16, x17\n"
+ "ldr x20, [%x[args], %[offsetof_Args_inptr]]\n"
+ "add x20, x20, x21\n"
+ "str x20, [%x[args], %[offsetof_Args_inptr]]\n"
+ "ldr x25, [%x[args], %[offsetof_Args_outptrs]]\n"
+ "ldr x24, [%x[args], %[offsetof_Args_ld_out_vls]]\n"
+ "ldp x23, x22, [x25, #0x0]\n"
+ "ldp x21, x20, [x24, #0x0]\n"
+ "add x23, x23, x21\n"
+ "add x22, x22, x20\n"
+ "stp x23, x22, [x25, #0x0]\n"
+ "ldp x23, x22, [x25, #0x10]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x23, x23, x21\n"
+ "add x22, x22, x20\n"
+ "stp x23, x22, [x25, #0x10]\n"
+ "b.any 1b\n"
+ "addvl SP, SP, #6\n"
+ ".inst 0xd503467f // SMSTOP\n"
+ :
+ : [args] "r" (&args), [ld_in_col] "r" (ld_in_col), [ld_in_row] "r" (ld_in_row), [offsetof_Args_current_channel] "I" (offsetof(Args, current_channel)), [offsetof_Args_inptr] "I" (offsetof(Args, inptr)), [offsetof_Args_input_cols] "I" (offsetof(Args, input_cols)), [offsetof_Args_ld_in_vl] "I" (offsetof(Args, ld_in_vl)), [offsetof_Args_ld_out_cols] "I" (offsetof(Args, ld_out_cols)), [offsetof_Args_ld_out_vls] "I" (offsetof(Args, ld_out_vls)), [offsetof_Args_n_channels] "I" (offsetof(Args, n_channels)), [offsetof_Args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_Args_output_cols] "I" (offsetof(Args, output_cols)), [offsetof_Args_pad_bottom] "I" (offsetof(Args, pad_bottom)), [offsetof_Args_pad_left] "I" (offsetof(Args, pad_left)), [offsetof_Args_pad_top] "I" (offsetof(Args, pad_top)), [offsetof_Args_weights] "I" (offsetof(Args, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_bias] "I" (offsetof(arm_gemm::Requantize32, bias)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [offsetof_Requantize32_per_channel_muls] "I" (offsetof(arm_gemm::Requantize32, per_channel_muls)), [offsetof_Requantize32_per_channel_right_shifts] "I" (offsetof(arm_gemm::Requantize32, per_channel_right_shifts)), [offsetof_Requantize32_per_layer_mul] "I" (offsetof(arm_gemm::Requantize32, per_layer_mul)), [offsetof_Requantize32_per_layer_right_shift] "I" (offsetof(arm_gemm::Requantize32, per_layer_right_shift)), [qp] "r" (&qp)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME2)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_5x5_s1_4rows_dot_za.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_5x5_s1_4rows_dot_za.hpp
new file mode 100644
index 0000000000..52173b8551
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_5x5_s1_4rows_dot_za.hpp
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_conv/depthwise/depthwise_planar.hpp"
+
+namespace arm_conv {
+namespace depthwise {
+
+void sme2_u8s8u8q_planar_5x5_s1_4rows_dot_za_impl(
+ const uint8_t *inptr,
+ size_t ld_in_row,
+ size_t ld_in_col,
+ size_t ld_in_vl,
+ unsigned int pad_top,
+ unsigned int valid_input_rows,
+ unsigned int pad_left,
+ unsigned int valid_input_cols,
+ const int8_t *weights,
+ uint8_t **outptrs,
+ const size_t *outlds,
+ const size_t *outvllds,
+ unsigned int output_cols,
+ unsigned int start_channel,
+ unsigned int valid_channels,
+ const arm_gemm::Requantize32 &qp
+);
+
+class sme2_u8s8u8q_planar_5x5_s1_4rows_dot_za : public PlanarStrategy<uint8_t, int8_t>
+{
+ using Parent = PlanarStrategy<uint8_t, int8_t>;
+
+ public:
+ using return_type = uint8_t;
+ constexpr static auto output_rows = 4u;
+ constexpr static auto kernel_rows = 5u, kernel_cols = 5u;
+ constexpr static auto stride_rows = 1u, stride_cols = 1u;
+ constexpr static auto vl_type = arm_gemm::VLType::SME;
+
+ sme2_u8s8u8q_planar_5x5_s1_4rows_dot_za(const CPUInfo *)
+ : Parent(kernel_rows, kernel_cols, stride_rows, stride_cols, output_rows, vl_type)
+ {
+ }
+
+ typename Parent::KernelType get_kernel(void) const override
+ {
+ return sme2_u8s8u8q_planar_5x5_s1_4rows_dot_za_impl;
+ }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_5x5_s1_4rows_dot_za/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_5x5_s1_4rows_dot_za/generic.cpp
new file mode 100644
index 0000000000..64023eeaff
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_5x5_s1_4rows_dot_za/generic.cpp
@@ -0,0 +1,1204 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if defined(ARM_COMPUTE_ENABLE_SME2)
+
+#include <algorithm>
+#include <cstddef>
+#include "arm_gemm.hpp"
+
+using arm_gemm::Requantize32;
+
+namespace arm_conv {
+namespace depthwise {
+
+void sme2_u8s8u8q_planar_5x5_s1_4rows_dot_za_impl(
+ const uint8_t *inptr,
+ size_t ld_in_row,
+ size_t ld_in_col,
+ size_t ld_in_vl,
+ unsigned int pad_top,
+ unsigned int valid_input_rows,
+ unsigned int pad_left,
+ unsigned int valid_input_cols,
+ const int8_t *weights,
+ uint8_t **outptrs,
+ const size_t *outlds,
+ const size_t *outvllds,
+ unsigned int output_cols,
+ unsigned int start_channel,
+ unsigned int valid_channels,
+ const arm_gemm::Requantize32 &qp
+)
+{
+ struct Args
+ {
+ const uint8_t *inptr;
+ size_t ld_in_vl;
+ long unsigned int pad_top, pad_bottom, pad_left;
+ const int8_t *weights;
+ long unsigned int input_cols, output_cols;
+ uint8_t **outptrs;
+ const size_t *ld_out_cols;
+ const size_t *ld_out_vls;
+ long unsigned int current_channel, n_channels;
+ };
+
+ Args args = { inptr, ld_in_vl, pad_top, 8u - std::min(8u, pad_top + valid_input_rows), pad_left, weights, valid_input_cols, output_cols, outptrs, outlds, outvllds, start_channel, valid_channels };
+
+ __asm__ __volatile__(
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "ldr x5, [%x[args], %[offsetof_Args_pad_bottom]]\n"
+ "ptrue p2.b\n"
+ "mov x20, #0x8\n"
+ "ldr x6, [%x[args], %[offsetof_Args_pad_top]]\n"
+ "ld1rh { z17.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_a_offset]]\n"
+ "sub x20, x20, x5\n"
+ ".inst 0x25207812 // ptrue pn10.b\n"
+ "ldr x7, [%x[args], %[offsetof_Args_n_channels]]\n"
+ "whilelt p1.s, XZR, x7\n"
+ "whilelt p9.s, XZR, x20\n"
+ "ld1rw { z12.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_c_offset]]\n"
+ "whilelt p8.s, XZR, x6\n"
+ "addvl SP, SP, #-30\n"
+ "ldr x17, [%x[args], %[offsetof_Args_current_channel]]\n"
+ "neg z17.h, p2/M, z17.h\n"
+ "eor p8.b, p2/Z, p8.b, p9.b\n"
+ "ld1rw { z7.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_per_layer_mul]]\n"
+ "ld1rw { z4.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_per_layer_right_shift]]\n"
+ "ld1rw { z24.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_minval]]\n"
+ "ld1rw { z16.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_maxval]]\n"
+ "1:" // Channel loop
+ "ldr x20, [%x[qp], %[offsetof_Requantize32_bias]]\n"
+ "mov z18.s, #0x0\n"
+ "cbz x20, 2f\n"
+ "ld1w { z18.s }, p1/Z, [x20, x17, LSL #2]\n"
+ "2:" // Load bias: Done
+ "ldr x23, [%x[args], %[offsetof_Args_weights]]\n"
+ "mov x20, x23\n"
+ "ld1sb { z2.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "ld1rh { z3.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_b_offset]]\n"
+ "mov z15.h, #0x0\n"
+ "sub z2.h, z2.h, z3.h\n"
+ "incw x23\n"
+ "ld1sb { z13.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "sub z13.h, z13.h, z3.h\n"
+ "trn1 z11.h, z15.h, z2.h\n"
+ "ld1sb { z27.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "sub z27.h, z27.h, z3.h\n"
+ "trn1 z0.h, z2.h, z13.h\n"
+ "ld1sb { z19.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "sub z19.h, z19.h, z3.h\n"
+ "trn1 z26.h, z13.h, z27.h\n"
+ "ld1sb { z14.s }, p2/Z, [x20]\n"
+ "sub z14.h, z14.h, z3.h\n"
+ "mov x20, x23\n"
+ "trn1 z10.h, z27.h, z19.h\n"
+ "ld1sb { z9.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "trn1 z19.h, z19.h, z14.h\n"
+ "trn1 z1.h, z14.h, z15.h\n"
+ "ld1sb { z5.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "sub z9.h, z9.h, z3.h\n"
+ "sub z5.h, z5.h, z3.h\n"
+ "ld1sb { z29.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "sub z29.h, z29.h, z3.h\n"
+ "addvl x22, SP, #30\n"
+ "ld1sb { z2.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "incw x23\n"
+ "sub z2.h, z2.h, z3.h\n"
+ "ld1sb { z23.s }, p2/Z, [x20]\n"
+ "addvl x22, x22, #-6\n"
+ "sub z23.h, z23.h, z3.h\n"
+ "mov x20, x23\n"
+ "st1h { z11.h }, p2, [x22]\n"
+ "trn1 z20.h, z15.h, z9.h\n"
+ "incw x23\n"
+ "ldr x21, [%x[qp], %[offsetof_Requantize32_per_channel_muls]]\n"
+ "st1h { z0.h }, p2, [x22, #1, MUL VL]\n"
+ "trn1 z22.h, z9.h, z5.h\n"
+ "ld1sb { z25.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "st1h { z26.h }, p2, [x22, #2, MUL VL]\n"
+ "trn1 z9.h, z5.h, z29.h\n"
+ "ld1sb { z21.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "st1h { z10.h }, p2, [x22, #3, MUL VL]\n"
+ "trn1 z26.h, z29.h, z2.h\n"
+ "ld1sb { z0.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "st1h { z19.h }, p2, [x22, #4, MUL VL]\n"
+ "trn1 z28.h, z2.h, z23.h\n"
+ "ld1sb { z19.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "st1h { z1.h }, p2, [x22, #5, MUL VL]\n"
+ "trn1 z2.h, z23.h, z15.h\n"
+ "sub z25.h, z25.h, z3.h\n"
+ "addvl x22, x22, #-6\n"
+ "sub z21.h, z21.h, z3.h\n"
+ "ld1sb { z6.s }, p2/Z, [x20]\n"
+ "sub z0.h, z0.h, z3.h\n"
+ "mov x20, x23\n"
+ "sub z19.h, z19.h, z3.h\n"
+ "sub z6.h, z6.h, z3.h\n"
+ "st1h { z20.h }, p2, [x22]\n"
+ "incw x23\n"
+ "st1h { z22.h }, p2, [x22, #1, MUL VL]\n"
+ "trn1 z11.h, z15.h, z25.h\n"
+ "trn1 z10.h, z25.h, z21.h\n"
+ "ld1sb { z5.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "st1h { z9.h }, p2, [x22, #2, MUL VL]\n"
+ "trn1 z14.h, z21.h, z0.h\n"
+ "ld1sb { z23.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "st1h { z26.h }, p2, [x22, #3, MUL VL]\n"
+ "trn1 z21.h, z0.h, z19.h\n"
+ "ld1sb { z27.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "st1h { z28.h }, p2, [x22, #4, MUL VL]\n"
+ "trn1 z19.h, z19.h, z6.h\n"
+ "ld1sb { z29.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "st1h { z2.h }, p2, [x22, #5, MUL VL]\n"
+ "trn1 z13.h, z6.h, z15.h\n"
+ "sub z5.h, z5.h, z3.h\n"
+ "sub z23.h, z23.h, z3.h\n"
+ "ld1sb { z1.s }, p2/Z, [x20]\n"
+ "addvl x22, x22, #-6\n"
+ "sub z27.h, z27.h, z3.h\n"
+ "sub z29.h, z29.h, z3.h\n"
+ "mov x20, x23\n"
+ "st1h { z11.h }, p2, [x22]\n"
+ "sub z1.h, z1.h, z3.h\n"
+ "st1h { z10.h }, p2, [x22, #1, MUL VL]\n"
+ "trn1 z30.h, z15.h, z5.h\n"
+ "trn1 z26.h, z5.h, z23.h\n"
+ "ld1sb { z11.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "st1h { z14.h }, p2, [x22, #2, MUL VL]\n"
+ "trn1 z22.h, z23.h, z27.h\n"
+ "ld1sb { z5.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "st1h { z21.h }, p2, [x22, #3, MUL VL]\n"
+ "trn1 z28.h, z27.h, z29.h\n"
+ "ld1sb { z8.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "st1h { z19.h }, p2, [x22, #4, MUL VL]\n"
+ "trn1 z27.h, z29.h, z1.h\n"
+ "ld1sb { z9.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "st1h { z13.h }, p2, [x22, #5, MUL VL]\n"
+ "trn1 z2.h, z1.h, z15.h\n"
+ "ld1sb { z14.s }, p2/Z, [x20]\n"
+ "sub z11.h, z11.h, z3.h\n"
+ "addvl x22, x22, #-6\n"
+ "sub z5.h, z5.h, z3.h\n"
+ "sub z8.h, z8.h, z3.h\n"
+ "st1h { z30.h }, p2, [x22]\n"
+ "sub z9.h, z9.h, z3.h\n"
+ "sub z14.h, z14.h, z3.h\n"
+ "st1h { z26.h }, p2, [x22, #1, MUL VL]\n"
+ "st1h { z22.h }, p2, [x22, #2, MUL VL]\n"
+ "mov z19.d, z18.d\n"
+ "trn1 z22.h, z15.h, z11.h\n"
+ "st1h { z28.h }, p2, [x22, #3, MUL VL]\n"
+ "trn1 z1.h, z11.h, z5.h\n"
+ "trn1 z31.h, z5.h, z8.h\n"
+ "st1h { z27.h }, p2, [x22, #4, MUL VL]\n"
+ "trn1 z8.h, z8.h, z9.h\n"
+ "trn1 z21.h, z9.h, z14.h\n"
+ "st1h { z2.h }, p2, [x22, #5, MUL VL]\n"
+ "addvl x22, x22, #-6\n"
+ "trn1 z15.h, z14.h, z15.h\n"
+ "st1h { z22.h }, p2, [x22]\n"
+ "st1h { z1.h }, p2, [x22, #1, MUL VL]\n"
+ "st1h { z31.h }, p2, [x22, #2, MUL VL]\n"
+ "st1h { z8.h }, p2, [x22, #3, MUL VL]\n"
+ "st1h { z21.h }, p2, [x22, #4, MUL VL]\n"
+ "st1h { z15.h }, p2, [x22, #5, MUL VL]\n"
+ "cbz x21, 3f\n"
+ "ld1w { z7.s }, p1/Z, [x21, x17, LSL #2]\n"
+ "3:" // Load mul: End
+ "ldr x20, [%x[qp], %[offsetof_Requantize32_per_channel_right_shifts]]\n"
+ "cbz x20, 4f\n"
+ "ld1w { z4.s }, p1/Z, [x20, x17, LSL #2]\n"
+ "4:" // Load right_shift: End
+ "ldr x25, [%x[args], %[offsetof_Args_input_cols]]\n"
+ "sub x20, x25, #0x1\n"
+ "orr x23, x20, %x[ld_in_col], LSL #16\n"
+ "ldr x16, [%x[args], %[offsetof_Args_inptr]]\n"
+ "orr x23, x7, x23, LSL #22\n"
+ "mov x22, #0x8\n"
+ "add x21, x6, x5\n"
+ "lsl x20, %x[ld_in_row], #0x0\n"
+ "ldr x15, [%x[args], %[offsetof_Args_output_cols]]\n"
+ "mov x11, #0x0\n"
+ "mov x8, #0x8\n"
+ "lsl x23, x23, #0x0\n"
+ "sub x22, x22, x21\n"
+ "madd x20, x20, x6, x16\n"
+ "5:" // Issue prefetches
+ "subs x22, x22, #0x1\n"
+ ".inst 0xf8b74a9c // rprfm pldstrm, x23, [x20]\n"
+ "add x20, x20, %x[ld_in_col]\n"
+ "bgt 5b\n"
+ "ldr x23, [%x[args], %[offsetof_Args_outptrs]]\n"
+ "lsl x20, %x[ld_in_row], #0x0\n"
+ "msub x16, x6, x20, x16\n"
+ ".inst 0xc0046a40 // mova za.d[x11, #0], { z18.d-z19.d }\n"
+ "ldr x20, [%x[args], %[offsetof_Args_ld_out_cols]]\n"
+ ".inst 0xc0046a41 // mova za.d[x11, #1], { z18.d-z19.d }\n"
+ "mov x22, #0x4\n"
+ "ldp x14, x13, [x23], #0x10\n"
+ ".inst 0xc0046a42 // mova za.d[x11, #2], { z18.d-z19.d }\n"
+ "ldp x4, x10, [x20], #0x10\n"
+ ".inst 0xc0046a43 // mova za.d[x11, #3], { z18.d-z19.d }\n"
+ "ldr x21, [%x[args], %[offsetof_Args_pad_left]]\n"
+ ".inst 0xc0046a44 // mova za.d[x11, #4], { z18.d-z19.d }\n"
+ "ldp x9, x28, [x23], #0x10\n"
+ ".inst 0xc0046a45 // mova za.d[x11, #5], { z18.d-z19.d }\n"
+ "ldp x27, x26, [x20], #0x10\n"
+ ".inst 0xc0046a46 // mova za.d[x11, #6], { z18.d-z19.d }\n"
+ ".inst 0xc0046a47 // mova za.d[x11, #7], { z18.d-z19.d }\n"
+ ".inst 0xc0040a40 // mova za.d[x8, #0], { z18.d-z19.d }\n"
+ ".inst 0xc0040a41 // mova za.d[x8, #1], { z18.d-z19.d }\n"
+ "cbz x21, 7f\n"
+ "cmp x21, x22\n"
+ "csel x20, x21, x22, LT\n"
+ "sub x21, x21, x20\n"
+ "sub x22, x22, x20\n"
+ "cbz x21, 7f\n"
+ ".inst 0xc0066814 // mova { z20.d-z21.d }, za.d[x11, #0]\n"
+ "sub x15, x15, x21\n"
+ ".inst 0xc0066836 // mova { z22.d-z23.d }, za.d[x11, #1]\n"
+ ".inst 0xc1a7ac14 // sqdmulh { z20.s-z23.s }, { z20.s-z23.s }, z7.s\n"
+ ".inst 0xc1a4aa34 // srshl { z20.s-z23.s }, { z20.s-z23.s }, z4.s\n"
+ ".inst 0xc1acab14 // add { z20.s-z23.s }, { z20.s-z23.s }, z12.s\n"
+ ".inst 0xc1b0cf14 // sclamp { z20.s-z23.s }, z24.s, z16.s\n"
+ "6:" // Left padding
+ "subs x21, x21, #0x1\n"
+ "st1b { z20.s }, p1, [x14]\n"
+ "add x14, x14, x4\n"
+ "st1b { z22.s }, p1, [x13]\n"
+ "add x13, x13, x10\n"
+ "st1b { z21.s }, p1, [x9]\n"
+ "add x9, x9, x27\n"
+ "st1b { z23.s }, p1, [x28]\n"
+ "add x28, x28, x26\n"
+ "bgt 6b\n"
+ "7:" // Left padding: End
+ "adds XZR, x6, x5\n"
+ "bne 14f\n"
+ "cbz x22, 12f\n"
+ "cmp x22, #0x1\n"
+ "sub x25, x25, x22\n"
+ "beq 11f\n"
+ "cmp x22, #0x2\n"
+ "beq 10f\n"
+ "cmp x22, #0x3\n"
+ "beq 9f\n"
+ "8:" // Unpadded: 4 priming loads
+ "add x21, x16, %x[ld_in_row]\n"
+ "ld1b { z1.s }, p1/Z, [x16]\n"
+ "addvl x20, SP, #24\n"
+ "ld1b { z28.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z27.h, z1.h, z28.h\n"
+ "add z27.h, z27.h, z17.h\n"
+ "ld1b { z1.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "add x16, x16, %x[ld_in_col]\n"
+ "ld1b { z2.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z28.h, z1.h, z2.h\n"
+ "add z28.h, z28.h, z17.h\n"
+ "ld1b { z13.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "ld1b { z6.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z29.h, z13.h, z6.h\n"
+ "add z29.h, z29.h, z17.h\n"
+ "ld1b { z30.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0xa1402a82 // ld1h { z2.h, z10.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc16a7768 // sdot za.s[x11, 0], { z27.h-z28.h }, z10.h\n"
+ "ld1b { z20.s }, p1/Z, [x21]\n"
+ "trn1 z30.h, z30.h, z20.h\n"
+ ".inst 0xc1627769 // sdot za.s[x11, 1], { z27.h-z28.h }, z2.h\n"
+ ".inst 0xa1412a81 // ld1h { z1.h, z9.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ "add z30.h, z30.h, z17.h\n"
+ ".inst 0xc1697788 // sdot za.s[x11, 0], { z28.h-z29.h }, z9.h\n"
+ ".inst 0xc1617789 // sdot za.s[x11, 1], { z28.h-z29.h }, z1.h\n"
+ ".inst 0xa0422a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xc16b77a8 // sdot za.s[x11, 0], { z29.h-z30.h }, z11.h\n"
+ ".inst 0xc16a77a9 // sdot za.s[x11, 1], { z29.h-z30.h }, z10.h\n"
+ "9:" // Unpadded: 3 priming loads
+ "add x22, x16, %x[ld_in_row]\n"
+ "ld1b { z2.s }, p1/Z, [x16]\n"
+ "addvl x21, SP, #18\n"
+ "ld1b { z28.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "trn1 z20.h, z2.h, z28.h\n"
+ "add z20.h, z20.h, z17.h\n"
+ "ld1b { z31.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "addvl x20, SP, #24\n"
+ "ld1b { z11.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "trn1 z21.h, z31.h, z11.h\n"
+ "add z21.h, z21.h, z17.h\n"
+ "ld1b { z25.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "add x16, x16, %x[ld_in_col]\n"
+ "ld1b { z8.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "trn1 z22.h, z25.h, z8.h\n"
+ "add z22.h, z22.h, z17.h\n"
+ "ld1b { z8.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ ".inst 0xa1402aa6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc16e7688 // sdot za.s[x11, 0], { z20.h-z21.h }, z14.h\n"
+ "ld1b { z3.s }, p1/Z, [x22]\n"
+ "trn1 z23.h, z8.h, z3.h\n"
+ ".inst 0xc1667689 // sdot za.s[x11, 1], { z20.h-z21.h }, z6.h\n"
+ ".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc161768a // sdot za.s[x11, 2], { z20.h-z21.h }, z1.h\n"
+ "add z23.h, z23.h, z17.h\n"
+ ".inst 0xa1412aa1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc160768b // sdot za.s[x11, 3], { z20.h-z21.h }, z0.h\n"
+ ".inst 0xc16976a8 // sdot za.s[x11, 0], { z21.h-z22.h }, z9.h\n"
+ ".inst 0xa0422aae // ld1h { z14.h-z15.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
+ ".inst 0xc16176a9 // sdot za.s[x11, 1], { z21.h-z22.h }, z1.h\n"
+ ".inst 0xa1412a81 // ld1h { z1.h, z9.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc16976aa // sdot za.s[x11, 2], { z21.h-z22.h }, z9.h\n"
+ ".inst 0xc16176ab // sdot za.s[x11, 3], { z21.h-z22.h }, z1.h\n"
+ ".inst 0xc16f76c8 // sdot za.s[x11, 0], { z22.h-z23.h }, z15.h\n"
+ ".inst 0xc16e76c9 // sdot za.s[x11, 1], { z22.h-z23.h }, z14.h\n"
+ ".inst 0xa0422a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xc16b76ca // sdot za.s[x11, 2], { z22.h-z23.h }, z11.h\n"
+ ".inst 0xc16a76cb // sdot za.s[x11, 3], { z22.h-z23.h }, z10.h\n"
+ "10:" // Unpadded: 2 priming loads
+ "add x23, x16, %x[ld_in_row]\n"
+ "ld1b { z2.s }, p1/Z, [x16]\n"
+ "addvl x22, SP, #12\n"
+ "ld1b { z22.s }, p1/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row]\n"
+ "trn1 z0.h, z2.h, z22.h\n"
+ "add z0.h, z0.h, z17.h\n"
+ "ld1b { z14.s }, p1/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row]\n"
+ "addvl x21, SP, #18\n"
+ "ld1b { z6.s }, p1/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row]\n"
+ "trn1 z1.h, z14.h, z6.h\n"
+ "add z1.h, z1.h, z17.h\n"
+ "ld1b { z15.s }, p1/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row]\n"
+ "addvl x20, SP, #24\n"
+ "ld1b { z6.s }, p1/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row]\n"
+ "trn1 z2.h, z15.h, z6.h\n"
+ "add z2.h, z2.h, z17.h\n"
+ "ld1b { z21.s }, p1/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row]\n"
+ "add x16, x16, %x[ld_in_col]\n"
+ ".inst 0xa0402ace // ld1h { z14.h-z15.h }, pn10.b/Z, [x22]\n"
+ ".inst 0xc16f7408 // sdot za.s[x11, 0], { z0.h-z1.h }, z15.h\n"
+ "ld1b { z30.s }, p1/Z, [x23]\n"
+ "trn1 z3.h, z21.h, z30.h\n"
+ ".inst 0xc16e7409 // sdot za.s[x11, 1], { z0.h-z1.h }, z14.h\n"
+ ".inst 0xa1402aa5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc16d740a // sdot za.s[x11, 2], { z0.h-z1.h }, z13.h\n"
+ "add z3.h, z3.h, z17.h\n"
+ ".inst 0xa0412ace // ld1h { z14.h-z15.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
+ ".inst 0xc165740b // sdot za.s[x11, 3], { z0.h-z1.h }, z5.h\n"
+ ".inst 0xa0402a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc16f7428 // sdot za.s[x11, 0], { z1.h-z2.h }, z15.h\n"
+ ".inst 0xc16e7429 // sdot za.s[x11, 1], { z1.h-z2.h }, z14.h\n"
+ ".inst 0xa0412aae // ld1h { z14.h-z15.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xa0422ac8 // ld1h { z8.h-z9.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
+ ".inst 0xc16b740c // sdot za.s[x11, 4], { z0.h-z1.h }, z11.h\n"
+ ".inst 0xc16a740d // sdot za.s[x11, 5], { z0.h-z1.h }, z10.h\n"
+ ".inst 0xc16f742a // sdot za.s[x11, 2], { z1.h-z2.h }, z15.h\n"
+ ".inst 0xc16e742b // sdot za.s[x11, 3], { z1.h-z2.h }, z14.h\n"
+ ".inst 0xa0412a8e // ld1h { z14.h-z15.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc1697448 // sdot za.s[x11, 0], { z2.h-z3.h }, z9.h\n"
+ ".inst 0xc1687449 // sdot za.s[x11, 1], { z2.h-z3.h }, z8.h\n"
+ ".inst 0xa0422aaa // ld1h { z10.h-z11.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
+ ".inst 0xc16f742c // sdot za.s[x11, 4], { z1.h-z2.h }, z15.h\n"
+ ".inst 0xc16e742d // sdot za.s[x11, 5], { z1.h-z2.h }, z14.h\n"
+ ".inst 0xc16b744a // sdot za.s[x11, 2], { z2.h-z3.h }, z11.h\n"
+ ".inst 0xc16a744b // sdot za.s[x11, 3], { z2.h-z3.h }, z10.h\n"
+ ".inst 0xa0422a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xc161744c // sdot za.s[x11, 4], { z2.h-z3.h }, z1.h\n"
+ ".inst 0xc160744d // sdot za.s[x11, 5], { z2.h-z3.h }, z0.h\n"
+ "11:" // Unpadded: 1 priming loads
+ "add x24, x16, %x[ld_in_row]\n"
+ "ld1b { z0.s }, p1/Z, [x16]\n"
+ "addvl x23, SP, #6\n"
+ "ld1b { z3.s }, p1/Z, [x24]\n"
+ "add x24, x24, %x[ld_in_row]\n"
+ "trn1 z28.h, z0.h, z3.h\n"
+ "add z28.h, z28.h, z17.h\n"
+ "ld1b { z6.s }, p1/Z, [x24]\n"
+ "add x24, x24, %x[ld_in_row]\n"
+ "addvl x22, SP, #12\n"
+ "ld1b { z30.s }, p1/Z, [x24]\n"
+ "add x24, x24, %x[ld_in_row]\n"
+ "trn1 z29.h, z6.h, z30.h\n"
+ "add z29.h, z29.h, z17.h\n"
+ "ld1b { z1.s }, p1/Z, [x24]\n"
+ "add x24, x24, %x[ld_in_row]\n"
+ "addvl x21, SP, #18\n"
+ "ld1b { z25.s }, p1/Z, [x24]\n"
+ "add x24, x24, %x[ld_in_row]\n"
+ "trn1 z30.h, z1.h, z25.h\n"
+ "add z30.h, z30.h, z17.h\n"
+ "ld1b { z3.s }, p1/Z, [x24]\n"
+ "add x24, x24, %x[ld_in_row]\n"
+ "addvl x20, SP, #24\n"
+ ".inst 0xa0402ae0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x23]\n"
+ ".inst 0xc1617788 // sdot za.s[x11, 0], { z28.h-z29.h }, z1.h\n"
+ "add x16, x16, %x[ld_in_col]\n"
+ "ld1b { z5.s }, p1/Z, [x24]\n"
+ "trn1 z31.h, z3.h, z5.h\n"
+ ".inst 0xc1607789 // sdot za.s[x11, 1], { z28.h-z29.h }, z0.h\n"
+ ".inst 0xa1402ac6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x22]\n"
+ ".inst 0xc16e778a // sdot za.s[x11, 2], { z28.h-z29.h }, z14.h\n"
+ "add z31.h, z31.h, z17.h\n"
+ ".inst 0xa1412ae2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x23, #0x2, MUL VL]\n"
+ ".inst 0xc166778b // sdot za.s[x11, 3], { z28.h-z29.h }, z6.h\n"
+ ".inst 0xa0402aae // ld1h { z14.h-z15.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc16a77a8 // sdot za.s[x11, 0], { z29.h-z30.h }, z10.h\n"
+ ".inst 0xc16277a9 // sdot za.s[x11, 1], { z29.h-z30.h }, z2.h\n"
+ ".inst 0xa0412ac8 // ld1h { z8.h-z9.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
+ ".inst 0xa1422ae2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x23, #0x4, MUL VL]\n"
+ ".inst 0xc16f778c // sdot za.s[x11, 4], { z28.h-z29.h }, z15.h\n"
+ ".inst 0xc16e778d // sdot za.s[x11, 5], { z28.h-z29.h }, z14.h\n"
+ ".inst 0xa1402a86 // ld1h { z6.h, z14.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc16977aa // sdot za.s[x11, 2], { z29.h-z30.h }, z9.h\n"
+ ".inst 0xc16877ab // sdot za.s[x11, 3], { z29.h-z30.h }, z8.h\n"
+ ".inst 0xa1412aa5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc16a77c8 // sdot za.s[x11, 0], { z30.h-z31.h }, z10.h\n"
+ ".inst 0xc16277c9 // sdot za.s[x11, 1], { z30.h-z31.h }, z2.h\n"
+ ".inst 0xa1422ac2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
+ ".inst 0xc16e778e // sdot za.s[x11, 6], { z28.h-z29.h }, z14.h\n"
+ ".inst 0xc166778f // sdot za.s[x11, 7], { z28.h-z29.h }, z6.h\n"
+ ".inst 0xc16d77ac // sdot za.s[x11, 4], { z29.h-z30.h }, z13.h\n"
+ ".inst 0xc16577ad // sdot za.s[x11, 5], { z29.h-z30.h }, z5.h\n"
+ ".inst 0xa1412a86 // ld1h { z6.h, z14.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc16a77ca // sdot za.s[x11, 2], { z30.h-z31.h }, z10.h\n"
+ ".inst 0xc16277cb // sdot za.s[x11, 3], { z30.h-z31.h }, z2.h\n"
+ ".inst 0xa0422aa8 // ld1h { z8.h-z9.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
+ ".inst 0xc16e77ae // sdot za.s[x11, 6], { z29.h-z30.h }, z14.h\n"
+ ".inst 0xc16677af // sdot za.s[x11, 7], { z29.h-z30.h }, z6.h\n"
+ ".inst 0xc16977cc // sdot za.s[x11, 4], { z30.h-z31.h }, z9.h\n"
+ ".inst 0xc16877cd // sdot za.s[x11, 5], { z30.h-z31.h }, z8.h\n"
+ ".inst 0xa1422a86 // ld1h { z6.h, z14.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xc16e77ce // sdot za.s[x11, 6], { z30.h-z31.h }, z14.h\n"
+ ".inst 0xc16677cf // sdot za.s[x11, 7], { z30.h-z31.h }, z6.h\n"
+ "12:" // Unpadded: 0 priming loads
+ ".inst 0xa0402be0 // ld1h { z0.h-z1.h }, pn10.b/Z, [SP]\n"
+ ".inst 0xa1412be5 // ld1h { z5.h, z13.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
+ ".inst 0xa0422be2 // ld1h { z2.h-z3.h }, pn10.b/Z, [SP, #0x4, MUL VL]\n"
+ "cbz x25, 22f\n"
+ "add x20, x16, %x[ld_in_row]\n"
+ "ld1b { z26.s }, p1/Z, [x16]\n"
+ "sub x25, x25, #0x1\n"
+ "ld1b { z28.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z25.h, z26.h, z28.h\n"
+ "sub x15, x15, #0x1\n"
+ "ld1b { z31.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "cmp x25, x15\n"
+ "add z25.h, z25.h, z17.h\n"
+ "ld1b { z15.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z26.h, z31.h, z15.h\n"
+ "csel x25, x25, x15, LT\n"
+ "ld1b { z22.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "add z26.h, z26.h, z17.h\n"
+ "add x16, x16, %x[ld_in_col]\n"
+ "ld1b { z8.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z27.h, z22.h, z8.h\n"
+ "add z27.h, z27.h, z17.h\n"
+ "ld1b { z21.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "sub x15, x15, x25\n"
+ "ld1b { z20.s }, p1/Z, [x20]\n"
+ "trn1 z28.h, z21.h, z20.h\n"
+ "add z28.h, z28.h, z17.h\n"
+ "cbz x25, 21f\n"
+ "13:" // Unpadded: Main loop
+ "addvl x24, SP, #6\n"
+ ".inst 0xc1617728 // sdot za.s[x11, 0], { z25.h-z26.h }, z1.h\n"
+ "addvl x23, SP, #12\n"
+ "ld1b { z21.s }, p1/Z, [x16]\n"
+ ".inst 0xc1607729 // sdot za.s[x11, 1], { z25.h-z26.h }, z0.h\n"
+ ".inst 0xa0402b0e // ld1h { z14.h-z15.h }, pn10.b/Z, [x24]\n"
+ "addvl x22, SP, #18\n"
+ "addvl x21, SP, #24\n"
+ ".inst 0xc16f772a // sdot za.s[x11, 2], { z25.h-z26.h }, z15.h\n"
+ "add x20, x16, %x[ld_in_row]\n"
+ "ld1b { z0.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0xc16e772b // sdot za.s[x11, 3], { z25.h-z26.h }, z14.h\n"
+ ".inst 0xa1402ae6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x23]\n"
+ "subs x25, x25, #0x1\n"
+ "add x16, x16, %x[ld_in_col]\n"
+ ".inst 0xc16d7748 // sdot za.s[x11, 0], { z26.h-z27.h }, z13.h\n"
+ "ld1b { z20.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0xc1657749 // sdot za.s[x11, 1], { z26.h-z27.h }, z5.h\n"
+ ".inst 0xa1412b05 // ld1h { z5.h, z13.h }, pn10.b/Z, [x24, #0x2, MUL VL]\n"
+ ".inst 0xc16e772c // sdot za.s[x11, 4], { z25.h-z26.h }, z14.h\n"
+ "ld1b { z31.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0xc166772d // sdot za.s[x11, 5], { z25.h-z26.h }, z6.h\n"
+ ".inst 0xa0402ace // ld1h { z14.h-z15.h }, pn10.b/Z, [x22]\n"
+ ".inst 0xc16d774a // sdot za.s[x11, 2], { z26.h-z27.h }, z13.h\n"
+ "ld1b { z29.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0xc165774b // sdot za.s[x11, 3], { z26.h-z27.h }, z5.h\n"
+ ".inst 0xa1412ae5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x23, #0x2, MUL VL]\n"
+ ".inst 0xc1637768 // sdot za.s[x11, 0], { z27.h-z28.h }, z3.h\n"
+ "ld1b { z22.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0xc1627769 // sdot za.s[x11, 1], { z27.h-z28.h }, z2.h\n"
+ ".inst 0xa1422b02 // ld1h { z2.h, z10.h }, pn10.b/Z, [x24, #0x4, MUL VL]\n"
+ ".inst 0xc16f772e // sdot za.s[x11, 6], { z25.h-z26.h }, z15.h\n"
+ "ld1b { z30.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0xc16e772f // sdot za.s[x11, 7], { z25.h-z26.h }, z14.h\n"
+ ".inst 0xa0402aae // ld1h { z14.h-z15.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc16d774c // sdot za.s[x11, 4], { z26.h-z27.h }, z13.h\n"
+ "ld1b { z6.s }, p1/Z, [x20]\n"
+ ".inst 0xc165774d // sdot za.s[x11, 5], { z26.h-z27.h }, z5.h\n"
+ ".inst 0xa1412ac5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
+ ".inst 0xc16a776a // sdot za.s[x11, 2], { z27.h-z28.h }, z10.h\n"
+ ".inst 0xc162776b // sdot za.s[x11, 3], { z27.h-z28.h }, z2.h\n"
+ ".inst 0xa1422ae2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x23, #0x4, MUL VL]\n"
+ ".inst 0xc16d774e // sdot za.s[x11, 6], { z26.h-z27.h }, z13.h\n"
+ ".inst 0xc165774f // sdot za.s[x11, 7], { z26.h-z27.h }, z5.h\n"
+ ".inst 0xa1412aa5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc16a776c // sdot za.s[x11, 4], { z27.h-z28.h }, z10.h\n"
+ ".inst 0xc162776d // sdot za.s[x11, 5], { z27.h-z28.h }, z2.h\n"
+ ".inst 0xa1422ac1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
+ ".inst 0xc169776e // sdot za.s[x11, 6], { z27.h-z28.h }, z9.h\n"
+ ".inst 0xc161776f // sdot za.s[x11, 7], { z27.h-z28.h }, z1.h\n"
+ ".inst 0xa0422aaa // ld1h { z10.h-z11.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
+ ".inst 0xc16f1728 // sdot za.s[x8, 0], { z25.h-z26.h }, z15.h\n"
+ ".inst 0xc16e1729 // sdot za.s[x8, 1], { z25.h-z26.h }, z14.h\n"
+ "trn1 z25.h, z21.h, z0.h\n"
+ ".inst 0xa0402be0 // ld1h { z0.h-z1.h }, pn10.b/Z, [SP]\n"
+ ".inst 0xc16d1748 // sdot za.s[x8, 0], { z26.h-z27.h }, z13.h\n"
+ "add z25.h, z25.h, z17.h\n"
+ ".inst 0xc1651749 // sdot za.s[x8, 1], { z26.h-z27.h }, z5.h\n"
+ "trn1 z26.h, z20.h, z31.h\n"
+ ".inst 0xa1412be5 // ld1h { z5.h, z13.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
+ ".inst 0xc16b1768 // sdot za.s[x8, 0], { z27.h-z28.h }, z11.h\n"
+ "add z26.h, z26.h, z17.h\n"
+ ".inst 0xc16a1769 // sdot za.s[x8, 1], { z27.h-z28.h }, z10.h\n"
+ "trn1 z27.h, z29.h, z22.h\n"
+ "trn1 z28.h, z30.h, z6.h\n"
+ "add x8, x8, #0x2\n"
+ ".inst 0xc0066808 // mova { z8.d-z9.d }, za.d[x11, #0]\n"
+ ".inst 0xa0422be2 // ld1h { z2.h-z3.h }, pn10.b/Z, [SP, #0x4, MUL VL]\n"
+ "add z27.h, z27.h, z17.h\n"
+ ".inst 0xc006682a // mova { z10.d-z11.d }, za.d[x11, #1]\n"
+ ".inst 0xc1a7ac08 // sqdmulh { z8.s-z11.s }, { z8.s-z11.s }, z7.s\n"
+ "add x11, x11, #0x2\n"
+ ".inst 0xc1a4aa28 // srshl { z8.s-z11.s }, { z8.s-z11.s }, z4.s\n"
+ ".inst 0xc0040a40 // mova za.d[x8, #0], { z18.d-z19.d }\n"
+ ".inst 0xc1acab08 // add { z8.s-z11.s }, { z8.s-z11.s }, z12.s\n"
+ ".inst 0xc0040a41 // mova za.d[x8, #1], { z18.d-z19.d }\n"
+ ".inst 0xc1b0cf08 // sclamp { z8.s-z11.s }, z24.s, z16.s\n"
+ "st1b { z8.s }, p1, [x14]\n"
+ "add x14, x14, x4\n"
+ "add z28.h, z28.h, z17.h\n"
+ "st1b { z10.s }, p1, [x13]\n"
+ "add x13, x13, x10\n"
+ "st1b { z9.s }, p1, [x9]\n"
+ "add x9, x9, x27\n"
+ "st1b { z11.s }, p1, [x28]\n"
+ "add x28, x28, x26\n"
+ "bgt 13b\n"
+ "b 21f\n"
+ "14:" // Padded
+ "cbz x22, 19f\n"
+ "cmp x22, #0x1\n"
+ "sub x25, x25, x22\n"
+ "beq 18f\n"
+ "cmp x22, #0x2\n"
+ "beq 17f\n"
+ "cmp x22, #0x3\n"
+ "beq 16f\n"
+ "15:" // Padded: 4 priming loads
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z9.s }, p0/Z, [x16]\n"
+ "add z9.h, p0/M, z9.h, z17.h\n"
+ "add x21, x16, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z22.s }, p0/Z, [x21]\n"
+ "add z22.h, p0/M, z22.h, z17.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z21.s }, p0/Z, [x21]\n"
+ "add z21.h, p0/M, z21.h, z17.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z20.s }, p0/Z, [x21]\n"
+ "add z20.h, p0/M, z20.h, z17.h\n"
+ "mov x12, #0x4\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z31.h, z9.h, z22.h\n"
+ "trn1 z0.h, z21.h, z20.h\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z22.s }, p0/Z, [x21]\n"
+ "add z22.h, p0/M, z22.h, z17.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z20.s }, p0/Z, [x21]\n"
+ "add z20.h, p0/M, z20.h, z17.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z21.s }, p0/Z, [x21]\n"
+ "addvl x20, SP, #24\n"
+ "add z21.h, p0/M, z21.h, z17.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ ".inst 0xa1402a82 // ld1h { z2.h, z10.h }, pn10.b/Z, [x20]\n"
+ "trn1 z1.h, z22.h, z20.h\n"
+ "ld1b { z20.s }, p0/Z, [x21]\n"
+ "add z20.h, p0/M, z20.h, z17.h\n"
+ ".inst 0xc16a77e8 // sdot za.s[x11, 0], { z31.h-z0.h }, z10.h\n"
+ "add x16, x16, %x[ld_in_col]\n"
+ ".inst 0xc16277e9 // sdot za.s[x11, 1], { z31.h-z0.h }, z2.h\n"
+ ".inst 0xa1412a85 // ld1h { z5.h, z13.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ "trn1 z2.h, z21.h, z20.h\n"
+ ".inst 0xc16d7408 // sdot za.s[x11, 0], { z0.h-z1.h }, z13.h\n"
+ ".inst 0xa0422a88 // ld1h { z8.h-z9.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xc1657409 // sdot za.s[x11, 1], { z0.h-z1.h }, z5.h\n"
+ ".inst 0xc1697428 // sdot za.s[x11, 0], { z1.h-z2.h }, z9.h\n"
+ ".inst 0xc1687429 // sdot za.s[x11, 1], { z1.h-z2.h }, z8.h\n"
+ "16:" // Padded: 3 priming loads
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z5.s }, p0/Z, [x16]\n"
+ "add z5.h, p0/M, z5.h, z17.h\n"
+ "add x20, x16, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z22.s }, p0/Z, [x20]\n"
+ "add z22.h, p0/M, z22.h, z17.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z21.s }, p0/Z, [x20]\n"
+ "add z21.h, p0/M, z21.h, z17.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z20.s }, p0/Z, [x20]\n"
+ "add z20.h, p0/M, z20.h, z17.h\n"
+ "mov x12, #0x4\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z28.h, z5.h, z22.h\n"
+ "trn1 z29.h, z21.h, z20.h\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z22.s }, p0/Z, [x20]\n"
+ "add z22.h, p0/M, z22.h, z17.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z20.s }, p0/Z, [x20]\n"
+ "add z20.h, p0/M, z20.h, z17.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z21.s }, p0/Z, [x20]\n"
+ "addvl x21, SP, #18\n"
+ "add z21.h, p0/M, z21.h, z17.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ ".inst 0xa0402aa0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x21]\n"
+ "trn1 z30.h, z22.h, z20.h\n"
+ "ld1b { z20.s }, p0/Z, [x20]\n"
+ "addvl x20, SP, #24\n"
+ "add z20.h, p0/M, z20.h, z17.h\n"
+ ".inst 0xc1617788 // sdot za.s[x11, 0], { z28.h-z29.h }, z1.h\n"
+ ".inst 0xc1607789 // sdot za.s[x11, 1], { z28.h-z29.h }, z0.h\n"
+ ".inst 0xa1402a81 // ld1h { z1.h, z9.h }, pn10.b/Z, [x20]\n"
+ "trn1 z31.h, z21.h, z20.h\n"
+ "add x16, x16, %x[ld_in_col]\n"
+ ".inst 0xa0412aae // ld1h { z14.h-z15.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc169778a // sdot za.s[x11, 2], { z28.h-z29.h }, z9.h\n"
+ ".inst 0xc161778b // sdot za.s[x11, 3], { z28.h-z29.h }, z1.h\n"
+ ".inst 0xa1422aa3 // ld1h { z3.h, z11.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
+ ".inst 0xc16f77a8 // sdot za.s[x11, 0], { z29.h-z30.h }, z15.h\n"
+ ".inst 0xc16e77a9 // sdot za.s[x11, 1], { z29.h-z30.h }, z14.h\n"
+ ".inst 0xa1412a81 // ld1h { z1.h, z9.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc16977aa // sdot za.s[x11, 2], { z29.h-z30.h }, z9.h\n"
+ ".inst 0xc16177ab // sdot za.s[x11, 3], { z29.h-z30.h }, z1.h\n"
+ ".inst 0xc16b77c8 // sdot za.s[x11, 0], { z30.h-z31.h }, z11.h\n"
+ ".inst 0xc16377c9 // sdot za.s[x11, 1], { z30.h-z31.h }, z3.h\n"
+ ".inst 0xa0422a8e // ld1h { z14.h-z15.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xc16f77ca // sdot za.s[x11, 2], { z30.h-z31.h }, z15.h\n"
+ ".inst 0xc16e77cb // sdot za.s[x11, 3], { z30.h-z31.h }, z14.h\n"
+ "17:" // Padded: 2 priming loads
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z29.s }, p0/Z, [x16]\n"
+ "add z29.h, p0/M, z29.h, z17.h\n"
+ "add x20, x16, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z22.s }, p0/Z, [x20]\n"
+ "add z22.h, p0/M, z22.h, z17.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z21.s }, p0/Z, [x20]\n"
+ "add z21.h, p0/M, z21.h, z17.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z20.s }, p0/Z, [x20]\n"
+ "add z20.h, p0/M, z20.h, z17.h\n"
+ "mov x12, #0x4\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z8.h, z29.h, z22.h\n"
+ "trn1 z9.h, z21.h, z20.h\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z22.s }, p0/Z, [x20]\n"
+ "add z22.h, p0/M, z22.h, z17.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z20.s }, p0/Z, [x20]\n"
+ "add z20.h, p0/M, z20.h, z17.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z21.s }, p0/Z, [x20]\n"
+ "addvl x22, SP, #12\n"
+ "add z21.h, p0/M, z21.h, z17.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ ".inst 0xa0402ace // ld1h { z14.h-z15.h }, pn10.b/Z, [x22]\n"
+ "trn1 z10.h, z22.h, z20.h\n"
+ "ld1b { z20.s }, p0/Z, [x20]\n"
+ "addvl x21, SP, #18\n"
+ "add z20.h, p0/M, z20.h, z17.h\n"
+ ".inst 0xc16f7508 // sdot za.s[x11, 0], { z8.h-z9.h }, z15.h\n"
+ ".inst 0xc16e7509 // sdot za.s[x11, 1], { z8.h-z9.h }, z14.h\n"
+ ".inst 0xa1402aa6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x21]\n"
+ "addvl x20, SP, #24\n"
+ "trn1 z11.h, z21.h, z20.h\n"
+ ".inst 0xa1412ac5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
+ ".inst 0xc16e750a // sdot za.s[x11, 2], { z8.h-z9.h }, z14.h\n"
+ "add x16, x16, %x[ld_in_col]\n"
+ ".inst 0xc166750b // sdot za.s[x11, 3], { z8.h-z9.h }, z6.h\n"
+ ".inst 0xa0402a8e // ld1h { z14.h-z15.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc16d7528 // sdot za.s[x11, 0], { z9.h-z10.h }, z13.h\n"
+ ".inst 0xa0422ac0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
+ ".inst 0xc1657529 // sdot za.s[x11, 1], { z9.h-z10.h }, z5.h\n"
+ ".inst 0xa1412aa5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc16f750c // sdot za.s[x11, 4], { z8.h-z9.h }, z15.h\n"
+ ".inst 0xc16e750d // sdot za.s[x11, 5], { z8.h-z9.h }, z14.h\n"
+ ".inst 0xc16d752a // sdot za.s[x11, 2], { z9.h-z10.h }, z13.h\n"
+ ".inst 0xc165752b // sdot za.s[x11, 3], { z9.h-z10.h }, z5.h\n"
+ ".inst 0xa1412a86 // ld1h { z6.h, z14.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc1617548 // sdot za.s[x11, 0], { z10.h-z11.h }, z1.h\n"
+ ".inst 0xc1607549 // sdot za.s[x11, 1], { z10.h-z11.h }, z0.h\n"
+ ".inst 0xa0422aa0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
+ ".inst 0xc16e752c // sdot za.s[x11, 4], { z9.h-z10.h }, z14.h\n"
+ ".inst 0xc166752d // sdot za.s[x11, 5], { z9.h-z10.h }, z6.h\n"
+ ".inst 0xc161754a // sdot za.s[x11, 2], { z10.h-z11.h }, z1.h\n"
+ ".inst 0xc160754b // sdot za.s[x11, 3], { z10.h-z11.h }, z0.h\n"
+ ".inst 0xa0422a8e // ld1h { z14.h-z15.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xc16f754c // sdot za.s[x11, 4], { z10.h-z11.h }, z15.h\n"
+ ".inst 0xc16e754d // sdot za.s[x11, 5], { z10.h-z11.h }, z14.h\n"
+ "18:" // Padded: 1 priming loads
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z1.s }, p0/Z, [x16]\n"
+ "add z1.h, p0/M, z1.h, z17.h\n"
+ "add x20, x16, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z22.s }, p0/Z, [x20]\n"
+ "add z22.h, p0/M, z22.h, z17.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z21.s }, p0/Z, [x20]\n"
+ "add z21.h, p0/M, z21.h, z17.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z20.s }, p0/Z, [x20]\n"
+ "add z20.h, p0/M, z20.h, z17.h\n"
+ "mov x12, #0x4\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z26.h, z1.h, z22.h\n"
+ "trn1 z27.h, z21.h, z20.h\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z22.s }, p0/Z, [x20]\n"
+ "add z22.h, p0/M, z22.h, z17.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z20.s }, p0/Z, [x20]\n"
+ "add z20.h, p0/M, z20.h, z17.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z21.s }, p0/Z, [x20]\n"
+ "addvl x23, SP, #6\n"
+ "add z21.h, p0/M, z21.h, z17.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ ".inst 0xa0402aee // ld1h { z14.h-z15.h }, pn10.b/Z, [x23]\n"
+ "trn1 z28.h, z22.h, z20.h\n"
+ "ld1b { z20.s }, p0/Z, [x20]\n"
+ "addvl x22, SP, #12\n"
+ "add z20.h, p0/M, z20.h, z17.h\n"
+ ".inst 0xc16f7748 // sdot za.s[x11, 0], { z26.h-z27.h }, z15.h\n"
+ ".inst 0xc16e7749 // sdot za.s[x11, 1], { z26.h-z27.h }, z14.h\n"
+ ".inst 0xa0402ac0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x22]\n"
+ "addvl x21, SP, #18\n"
+ "trn1 z29.h, z21.h, z20.h\n"
+ ".inst 0xa0412aea // ld1h { z10.h-z11.h }, pn10.b/Z, [x23, #0x2, MUL VL]\n"
+ ".inst 0xc161774a // sdot za.s[x11, 2], { z26.h-z27.h }, z1.h\n"
+ "addvl x20, SP, #24\n"
+ "add x16, x16, %x[ld_in_col]\n"
+ ".inst 0xc160774b // sdot za.s[x11, 3], { z26.h-z27.h }, z0.h\n"
+ ".inst 0xa1402aa6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc16b7768 // sdot za.s[x11, 0], { z27.h-z28.h }, z11.h\n"
+ ".inst 0xa0422ae8 // ld1h { z8.h-z9.h }, pn10.b/Z, [x23, #0x4, MUL VL]\n"
+ ".inst 0xc16a7769 // sdot za.s[x11, 1], { z27.h-z28.h }, z10.h\n"
+ ".inst 0xa0412aca // ld1h { z10.h-z11.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
+ ".inst 0xc16e774c // sdot za.s[x11, 4], { z26.h-z27.h }, z14.h\n"
+ ".inst 0xc166774d // sdot za.s[x11, 5], { z26.h-z27.h }, z6.h\n"
+ ".inst 0xa1402a85 // ld1h { z5.h, z13.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc16b776a // sdot za.s[x11, 2], { z27.h-z28.h }, z11.h\n"
+ ".inst 0xc16a776b // sdot za.s[x11, 3], { z27.h-z28.h }, z10.h\n"
+ ".inst 0xa1412aa6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc1697788 // sdot za.s[x11, 0], { z28.h-z29.h }, z9.h\n"
+ ".inst 0xc1687789 // sdot za.s[x11, 1], { z28.h-z29.h }, z8.h\n"
+ ".inst 0xa1422ac2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
+ ".inst 0xc16d774e // sdot za.s[x11, 6], { z26.h-z27.h }, z13.h\n"
+ ".inst 0xc165774f // sdot za.s[x11, 7], { z26.h-z27.h }, z5.h\n"
+ ".inst 0xc16e776c // sdot za.s[x11, 4], { z27.h-z28.h }, z14.h\n"
+ ".inst 0xc166776d // sdot za.s[x11, 5], { z27.h-z28.h }, z6.h\n"
+ ".inst 0xa1412a86 // ld1h { z6.h, z14.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc16a778a // sdot za.s[x11, 2], { z28.h-z29.h }, z10.h\n"
+ ".inst 0xc162778b // sdot za.s[x11, 3], { z28.h-z29.h }, z2.h\n"
+ ".inst 0xa0422aa0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
+ ".inst 0xc16e776e // sdot za.s[x11, 6], { z27.h-z28.h }, z14.h\n"
+ ".inst 0xc166776f // sdot za.s[x11, 7], { z27.h-z28.h }, z6.h\n"
+ ".inst 0xc161778c // sdot za.s[x11, 4], { z28.h-z29.h }, z1.h\n"
+ ".inst 0xc160778d // sdot za.s[x11, 5], { z28.h-z29.h }, z0.h\n"
+ ".inst 0xa1422a82 // ld1h { z2.h, z10.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xc16a778e // sdot za.s[x11, 6], { z28.h-z29.h }, z10.h\n"
+ ".inst 0xc162778f // sdot za.s[x11, 7], { z28.h-z29.h }, z2.h\n"
+ "19:" // Padded: 0 priming loads
+ ".inst 0xa0402be0 // ld1h { z0.h-z1.h }, pn10.b/Z, [SP]\n"
+ ".inst 0xa1412be5 // ld1h { z5.h, z13.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
+ ".inst 0xa0422be2 // ld1h { z2.h-z3.h }, pn10.b/Z, [SP, #0x4, MUL VL]\n"
+ "cbz x25, 22f\n"
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z6.s }, p0/Z, [x16]\n"
+ "add z6.h, p0/M, z6.h, z17.h\n"
+ "add x20, x16, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z30.s }, p0/Z, [x20]\n"
+ "add z30.h, p0/M, z30.h, z17.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z27.s }, p0/Z, [x20]\n"
+ "add z27.h, p0/M, z27.h, z17.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z26.s }, p0/Z, [x20]\n"
+ "add z26.h, p0/M, z26.h, z17.h\n"
+ "mov x12, #0x4\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z25.h, z6.h, z30.h\n"
+ "trn1 z26.h, z27.h, z26.h\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z8.s }, p0/Z, [x20]\n"
+ "add z8.h, p0/M, z8.h, z17.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z9.s }, p0/Z, [x20]\n"
+ "add z9.h, p0/M, z9.h, z17.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z21.s }, p0/Z, [x20]\n"
+ "add z21.h, p0/M, z21.h, z17.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z29.s }, p0/Z, [x20]\n"
+ "add z29.h, p0/M, z29.h, z17.h\n"
+ "sub x25, x25, #0x1\n"
+ "sub x15, x15, #0x1\n"
+ "cmp x25, x15\n"
+ "trn1 z27.h, z8.h, z9.h\n"
+ "trn1 z28.h, z21.h, z29.h\n"
+ "csel x25, x25, x15, LT\n"
+ "add x16, x16, %x[ld_in_col]\n"
+ "sub x15, x15, x25\n"
+ "cbz x25, 21f\n"
+ "20:" // Padded: Main loop
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z8.s }, p0/Z, [x16]\n"
+ "add z8.h, p0/M, z8.h, z17.h\n"
+ "add x24, x16, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z21.s }, p0/Z, [x24]\n"
+ ".inst 0xc1617728 // sdot za.s[x11, 0], { z25.h-z26.h }, z1.h\n"
+ "addvl x23, SP, #6\n"
+ ".inst 0xc1607729 // sdot za.s[x11, 1], { z25.h-z26.h }, z0.h\n"
+ ".inst 0xa0402ae0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x23]\n"
+ "addvl x22, SP, #12\n"
+ "add z21.h, p0/M, z21.h, z17.h\n"
+ "add x24, x24, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ ".inst 0xc161772a // sdot za.s[x11, 2], { z25.h-z26.h }, z1.h\n"
+ ".inst 0xc160772b // sdot za.s[x11, 3], { z25.h-z26.h }, z0.h\n"
+ ".inst 0xa1402ac6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x22]\n"
+ "addvl x21, SP, #18\n"
+ "addvl x20, SP, #24\n"
+ "ld1b { z29.s }, p0/Z, [x24]\n"
+ ".inst 0xc16d7748 // sdot za.s[x11, 0], { z26.h-z27.h }, z13.h\n"
+ "add z29.h, p0/M, z29.h, z17.h\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ ".inst 0xc1657749 // sdot za.s[x11, 1], { z26.h-z27.h }, z5.h\n"
+ ".inst 0xa1412ae5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x23, #0x2, MUL VL]\n"
+ "mov x12, #0x4\n"
+ "add x24, x24, %x[ld_in_row]\n"
+ ".inst 0xc16e772c // sdot za.s[x11, 4], { z25.h-z26.h }, z14.h\n"
+ "ld1b { z30.s }, p0/Z, [x24]\n"
+ "add z30.h, p0/M, z30.h, z17.h\n"
+ "add x24, x24, %x[ld_in_row]\n"
+ ".inst 0xc166772d // sdot za.s[x11, 5], { z25.h-z26.h }, z6.h\n"
+ ".inst 0xa1402aa6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x21]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "subs x25, x25, #0x1\n"
+ ".inst 0xc16d774a // sdot za.s[x11, 2], { z26.h-z27.h }, z13.h\n"
+ "ld1b { z15.s }, p0/Z, [x24]\n"
+ "add z15.h, p0/M, z15.h, z17.h\n"
+ "add x24, x24, %x[ld_in_row]\n"
+ ".inst 0xc165774b // sdot za.s[x11, 3], { z26.h-z27.h }, z5.h\n"
+ ".inst 0xa0412aca // ld1h { z10.h-z11.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "add x16, x16, %x[ld_in_col]\n"
+ ".inst 0xc1637768 // sdot za.s[x11, 0], { z27.h-z28.h }, z3.h\n"
+ "ld1b { z20.s }, p0/Z, [x24]\n"
+ "add z20.h, p0/M, z20.h, z17.h\n"
+ "add x24, x24, %x[ld_in_row]\n"
+ ".inst 0xc1627769 // sdot za.s[x11, 1], { z27.h-z28.h }, z2.h\n"
+ ".inst 0xa1422ae1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x23, #0x4, MUL VL]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ ".inst 0xc16e772e // sdot za.s[x11, 6], { z25.h-z26.h }, z14.h\n"
+ "ld1b { z31.s }, p0/Z, [x24]\n"
+ "add z31.h, p0/M, z31.h, z17.h\n"
+ "add x24, x24, %x[ld_in_row]\n"
+ ".inst 0xc166772f // sdot za.s[x11, 7], { z25.h-z26.h }, z6.h\n"
+ ".inst 0xa0402a82 // ld1h { z2.h-z3.h }, pn10.b/Z, [x20]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ ".inst 0xc16b774c // sdot za.s[x11, 4], { z26.h-z27.h }, z11.h\n"
+ "ld1b { z22.s }, p0/Z, [x24]\n"
+ "add z22.h, p0/M, z22.h, z17.h\n"
+ ".inst 0xc16a774d // sdot za.s[x11, 5], { z26.h-z27.h }, z10.h\n"
+ ".inst 0xa1412aa6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc169776a // sdot za.s[x11, 2], { z27.h-z28.h }, z9.h\n"
+ ".inst 0xc161776b // sdot za.s[x11, 3], { z27.h-z28.h }, z1.h\n"
+ ".inst 0xa0422ac0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
+ ".inst 0xc16e774e // sdot za.s[x11, 6], { z26.h-z27.h }, z14.h\n"
+ ".inst 0xc166774f // sdot za.s[x11, 7], { z26.h-z27.h }, z6.h\n"
+ ".inst 0xa1412a86 // ld1h { z6.h, z14.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc161776c // sdot za.s[x11, 4], { z27.h-z28.h }, z1.h\n"
+ ".inst 0xc160776d // sdot za.s[x11, 5], { z27.h-z28.h }, z0.h\n"
+ ".inst 0xa1422aa1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
+ ".inst 0xc169776e // sdot za.s[x11, 6], { z27.h-z28.h }, z9.h\n"
+ ".inst 0xc161776f // sdot za.s[x11, 7], { z27.h-z28.h }, z1.h\n"
+ ".inst 0xa0422a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xc1631728 // sdot za.s[x8, 0], { z25.h-z26.h }, z3.h\n"
+ ".inst 0xc1621729 // sdot za.s[x8, 1], { z25.h-z26.h }, z2.h\n"
+ ".inst 0xa0402be0 // ld1h { z0.h-z1.h }, pn10.b/Z, [SP]\n"
+ "trn1 z25.h, z8.h, z21.h\n"
+ ".inst 0xc16e1748 // sdot za.s[x8, 0], { z26.h-z27.h }, z14.h\n"
+ ".inst 0xc1661749 // sdot za.s[x8, 1], { z26.h-z27.h }, z6.h\n"
+ ".inst 0xa1412be5 // ld1h { z5.h, z13.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
+ "trn1 z26.h, z29.h, z30.h\n"
+ ".inst 0xc16b1768 // sdot za.s[x8, 0], { z27.h-z28.h }, z11.h\n"
+ ".inst 0xc16a1769 // sdot za.s[x8, 1], { z27.h-z28.h }, z10.h\n"
+ "add x8, x8, #0x2\n"
+ ".inst 0xa0422be2 // ld1h { z2.h-z3.h }, pn10.b/Z, [SP, #0x4, MUL VL]\n"
+ "trn1 z27.h, z15.h, z20.h\n"
+ ".inst 0xc0066808 // mova { z8.d-z9.d }, za.d[x11, #0]\n"
+ "trn1 z28.h, z31.h, z22.h\n"
+ ".inst 0xc006682a // mova { z10.d-z11.d }, za.d[x11, #1]\n"
+ ".inst 0xc1a7ac08 // sqdmulh { z8.s-z11.s }, { z8.s-z11.s }, z7.s\n"
+ "add x11, x11, #0x2\n"
+ ".inst 0xc1a4aa28 // srshl { z8.s-z11.s }, { z8.s-z11.s }, z4.s\n"
+ ".inst 0xc0040a40 // mova za.d[x8, #0], { z18.d-z19.d }\n"
+ ".inst 0xc1acab08 // add { z8.s-z11.s }, { z8.s-z11.s }, z12.s\n"
+ ".inst 0xc0040a41 // mova za.d[x8, #1], { z18.d-z19.d }\n"
+ ".inst 0xc1b0cf08 // sclamp { z8.s-z11.s }, z24.s, z16.s\n"
+ "st1b { z8.s }, p1, [x14]\n"
+ "add x14, x14, x4\n"
+ "st1b { z10.s }, p1, [x13]\n"
+ "add x13, x13, x10\n"
+ "st1b { z9.s }, p1, [x9]\n"
+ "add x9, x9, x27\n"
+ "st1b { z11.s }, p1, [x28]\n"
+ "add x28, x28, x26\n"
+ "bgt 20b\n"
+ "21:" // Main loop tail
+ "addvl x23, SP, #6\n"
+ ".inst 0xc1617728 // sdot za.s[x11, 0], { z25.h-z26.h }, z1.h\n"
+ "addvl x22, SP, #12\n"
+ ".inst 0xc1607729 // sdot za.s[x11, 1], { z25.h-z26.h }, z0.h\n"
+ ".inst 0xa0402ae0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x23]\n"
+ "addvl x21, SP, #18\n"
+ "addvl x20, SP, #24\n"
+ ".inst 0xc161772a // sdot za.s[x11, 2], { z25.h-z26.h }, z1.h\n"
+ ".inst 0xc160772b // sdot za.s[x11, 3], { z25.h-z26.h }, z0.h\n"
+ ".inst 0xa1402ac6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x22]\n"
+ ".inst 0xc16d7748 // sdot za.s[x11, 0], { z26.h-z27.h }, z13.h\n"
+ ".inst 0xc1657749 // sdot za.s[x11, 1], { z26.h-z27.h }, z5.h\n"
+ ".inst 0xa1412ae1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x23, #0x2, MUL VL]\n"
+ ".inst 0xc16e772c // sdot za.s[x11, 4], { z25.h-z26.h }, z14.h\n"
+ ".inst 0xc166772d // sdot za.s[x11, 5], { z25.h-z26.h }, z6.h\n"
+ ".inst 0xa1402aa6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc169774a // sdot za.s[x11, 2], { z26.h-z27.h }, z9.h\n"
+ ".inst 0xc161774b // sdot za.s[x11, 3], { z26.h-z27.h }, z1.h\n"
+ ".inst 0xa1412ac1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
+ ".inst 0xc1637768 // sdot za.s[x11, 0], { z27.h-z28.h }, z3.h\n"
+ ".inst 0xc1627769 // sdot za.s[x11, 1], { z27.h-z28.h }, z2.h\n"
+ ".inst 0xa0422aea // ld1h { z10.h-z11.h }, pn10.b/Z, [x23, #0x4, MUL VL]\n"
+ ".inst 0xc16e772e // sdot za.s[x11, 6], { z25.h-z26.h }, z14.h\n"
+ ".inst 0xc166772f // sdot za.s[x11, 7], { z25.h-z26.h }, z6.h\n"
+ ".inst 0xa0402a8e // ld1h { z14.h-z15.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc169774c // sdot za.s[x11, 4], { z26.h-z27.h }, z9.h\n"
+ ".inst 0xc161774d // sdot za.s[x11, 5], { z26.h-z27.h }, z1.h\n"
+ ".inst 0xa1412aa5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc16b776a // sdot za.s[x11, 2], { z27.h-z28.h }, z11.h\n"
+ ".inst 0xc16a776b // sdot za.s[x11, 3], { z27.h-z28.h }, z10.h\n"
+ ".inst 0xa0422ac2 // ld1h { z2.h-z3.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
+ ".inst 0xc16d774e // sdot za.s[x11, 6], { z26.h-z27.h }, z13.h\n"
+ ".inst 0xc165774f // sdot za.s[x11, 7], { z26.h-z27.h }, z5.h\n"
+ ".inst 0xa0412a88 // ld1h { z8.h-z9.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc163776c // sdot za.s[x11, 4], { z27.h-z28.h }, z3.h\n"
+ ".inst 0xc162776d // sdot za.s[x11, 5], { z27.h-z28.h }, z2.h\n"
+ ".inst 0xa1422aa2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
+ ".inst 0xc16a776e // sdot za.s[x11, 6], { z27.h-z28.h }, z10.h\n"
+ ".inst 0xc162776f // sdot za.s[x11, 7], { z27.h-z28.h }, z2.h\n"
+ ".inst 0xa0422a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xc16f1728 // sdot za.s[x8, 0], { z25.h-z26.h }, z15.h\n"
+ ".inst 0xc16e1729 // sdot za.s[x8, 1], { z25.h-z26.h }, z14.h\n"
+ ".inst 0xc1691748 // sdot za.s[x8, 0], { z26.h-z27.h }, z9.h\n"
+ ".inst 0xc1681749 // sdot za.s[x8, 1], { z26.h-z27.h }, z8.h\n"
+ ".inst 0xc1611768 // sdot za.s[x8, 0], { z27.h-z28.h }, z1.h\n"
+ ".inst 0xc1601769 // sdot za.s[x8, 1], { z27.h-z28.h }, z0.h\n"
+ "add x8, x8, #0x2\n"
+ ".inst 0xc0066808 // mova { z8.d-z9.d }, za.d[x11, #0]\n"
+ ".inst 0xc006682a // mova { z10.d-z11.d }, za.d[x11, #1]\n"
+ ".inst 0xc1a7ac08 // sqdmulh { z8.s-z11.s }, { z8.s-z11.s }, z7.s\n"
+ "add x11, x11, #0x2\n"
+ ".inst 0xc1a4aa28 // srshl { z8.s-z11.s }, { z8.s-z11.s }, z4.s\n"
+ ".inst 0xc0040a40 // mova za.d[x8, #0], { z18.d-z19.d }\n"
+ ".inst 0xc1acab08 // add { z8.s-z11.s }, { z8.s-z11.s }, z12.s\n"
+ ".inst 0xc0040a41 // mova za.d[x8, #1], { z18.d-z19.d }\n"
+ ".inst 0xc1b0cf08 // sclamp { z8.s-z11.s }, z24.s, z16.s\n"
+ "st1b { z8.s }, p1, [x14]\n"
+ "add x14, x14, x4\n"
+ "st1b { z10.s }, p1, [x13]\n"
+ "add x13, x13, x10\n"
+ "st1b { z9.s }, p1, [x9]\n"
+ "add x9, x9, x27\n"
+ "st1b { z11.s }, p1, [x28]\n"
+ "add x28, x28, x26\n"
+ "22:" // Main loop skip tail
+ "cbz x15, 24f\n"
+ "23:" // Right padding loop
+ ".inst 0xc0066808 // mova { z8.d-z9.d }, za.d[x11, #0]\n"
+ "add x8, x8, #0x2\n"
+ "subs x15, x15, #0x1\n"
+ ".inst 0xc006682a // mova { z10.d-z11.d }, za.d[x11, #1]\n"
+ ".inst 0xc1a7ac08 // sqdmulh { z8.s-z11.s }, { z8.s-z11.s }, z7.s\n"
+ "add x11, x11, #0x2\n"
+ ".inst 0xc1a4aa28 // srshl { z8.s-z11.s }, { z8.s-z11.s }, z4.s\n"
+ ".inst 0xc0040a40 // mova za.d[x8, #0], { z18.d-z19.d }\n"
+ ".inst 0xc1acab08 // add { z8.s-z11.s }, { z8.s-z11.s }, z12.s\n"
+ ".inst 0xc0040a41 // mova za.d[x8, #1], { z18.d-z19.d }\n"
+ ".inst 0xc1b0cf08 // sclamp { z8.s-z11.s }, z24.s, z16.s\n"
+ "st1b { z8.s }, p1, [x14]\n"
+ "add x14, x14, x4\n"
+ "st1b { z10.s }, p1, [x13]\n"
+ "add x13, x13, x10\n"
+ "st1b { z9.s }, p1, [x9]\n"
+ "add x9, x9, x27\n"
+ "st1b { z11.s }, p1, [x28]\n"
+ "add x28, x28, x26\n"
+ "bgt 23b\n"
+ "24:" // End
+ "ldr x20, [%x[args], %[offsetof_Args_weights]]\n"
+ "incw x20, ALL, MUL #16\n"
+ "incw x20, ALL, MUL #9\n"
+ "str x20, [%x[args], %[offsetof_Args_weights]]\n"
+ "ldr x21, [%x[args], %[offsetof_Args_ld_in_vl]]\n"
+ "incw x17\n"
+ "whilelt p1.s, x17, x7\n"
+ "ldr x20, [%x[args], %[offsetof_Args_inptr]]\n"
+ "add x20, x20, x21\n"
+ "str x20, [%x[args], %[offsetof_Args_inptr]]\n"
+ "ldr x25, [%x[args], %[offsetof_Args_outptrs]]\n"
+ "ldr x24, [%x[args], %[offsetof_Args_ld_out_vls]]\n"
+ "ldp x23, x22, [x25, #0x0]\n"
+ "ldp x21, x20, [x24, #0x0]\n"
+ "add x23, x23, x21\n"
+ "add x22, x22, x20\n"
+ "stp x23, x22, [x25, #0x0]\n"
+ "ldp x23, x22, [x25, #0x10]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x23, x23, x21\n"
+ "add x22, x22, x20\n"
+ "stp x23, x22, [x25, #0x10]\n"
+ "b.any 1b\n"
+ "addvl SP, SP, #30\n"
+ ".inst 0xd503467f // SMSTOP\n"
+ :
+ : [args] "r" (&args), [ld_in_col] "r" (ld_in_col), [ld_in_row] "r" (ld_in_row), [offsetof_Args_current_channel] "I" (offsetof(Args, current_channel)), [offsetof_Args_inptr] "I" (offsetof(Args, inptr)), [offsetof_Args_input_cols] "I" (offsetof(Args, input_cols)), [offsetof_Args_ld_in_vl] "I" (offsetof(Args, ld_in_vl)), [offsetof_Args_ld_out_cols] "I" (offsetof(Args, ld_out_cols)), [offsetof_Args_ld_out_vls] "I" (offsetof(Args, ld_out_vls)), [offsetof_Args_n_channels] "I" (offsetof(Args, n_channels)), [offsetof_Args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_Args_output_cols] "I" (offsetof(Args, output_cols)), [offsetof_Args_pad_bottom] "I" (offsetof(Args, pad_bottom)), [offsetof_Args_pad_left] "I" (offsetof(Args, pad_left)), [offsetof_Args_pad_top] "I" (offsetof(Args, pad_top)), [offsetof_Args_weights] "I" (offsetof(Args, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_bias] "I" (offsetof(arm_gemm::Requantize32, bias)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [offsetof_Requantize32_per_channel_muls] "I" (offsetof(arm_gemm::Requantize32, per_channel_muls)), [offsetof_Requantize32_per_channel_right_shifts] "I" (offsetof(arm_gemm::Requantize32, per_channel_right_shifts)), [offsetof_Requantize32_per_layer_mul] "I" (offsetof(arm_gemm::Requantize32, per_layer_mul)), [offsetof_Requantize32_per_layer_right_shift] "I" (offsetof(arm_gemm::Requantize32, per_layer_right_shift)), [qp] "r" (&qp)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME2)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_5x5_s2_4rows_dot_za.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_5x5_s2_4rows_dot_za.hpp
new file mode 100644
index 0000000000..ad82070912
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_5x5_s2_4rows_dot_za.hpp
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_conv/depthwise/depthwise_planar.hpp"
+
+namespace arm_conv {
+namespace depthwise {
+
+// Kernel entry point (defined in the matching generic.cpp): planar depthwise
+// convolution with a 5x5 kernel and stride 2, producing 4 output rows per
+// call, implemented with SME2 ZA-tile dot-product instructions.
+// Takes u8 input, s8 weights, and writes u8 output requantized via `qp`.
+void sme2_u8s8u8q_planar_5x5_s2_4rows_dot_za_impl(
+  const uint8_t *inptr,  // base pointer of the input tensor
+  size_t ld_in_row,  // input stride between successive rows
+  size_t ld_in_col,  // input stride between successive columns
+  size_t ld_in_vl,  // input stride between vector-length channel blocks
+  unsigned int pad_top,  // rows of implicit zero padding above the input
+  unsigned int valid_input_rows,  // with pad_top, determines bottom padding
+  unsigned int pad_left,  // columns of implicit zero padding left of the input
+  unsigned int valid_input_cols,
+  const int8_t *weights,  // packed signed 8-bit weights
+  uint8_t **outptrs,  // one output pointer per output row
+  const size_t *outlds,  // per-output-row column strides
+  const size_t *outvllds,  // per-output-row vector-length strides
+  unsigned int output_cols,
+  unsigned int start_channel,  // first channel this call processes
+  unsigned int valid_channels,  // number of channels to process
+  const arm_gemm::Requantize32 &qp  // offsets, multipliers/shifts, clamp bounds
+);
+
+// Strategy wrapper exposing the assembly kernel declared above to the planar
+// depthwise framework.  The static constants record the kernel geometry
+// (5x5 window, 2x2 stride, 4 output rows per invocation) and the SME
+// vector-length class, which the framework uses for implementation selection.
+class sme2_u8s8u8q_planar_5x5_s2_4rows_dot_za : public PlanarStrategy<uint8_t, int8_t>
+{
+  using Parent = PlanarStrategy<uint8_t, int8_t>;
+
+  public:
+  using return_type = uint8_t;
+  constexpr static auto output_rows = 4u;
+  constexpr static auto kernel_rows = 5u, kernel_cols = 5u;
+  constexpr static auto stride_rows = 2u, stride_cols = 2u;
+  constexpr static auto vl_type = arm_gemm::VLType::SME;
+
+  // Forwards the geometry constants to the generic PlanarStrategy base;
+  // the CPUInfo argument is unused by this strategy.
+  sme2_u8s8u8q_planar_5x5_s2_4rows_dot_za(const CPUInfo *)
+    : Parent(kernel_rows, kernel_cols, stride_rows, stride_cols, output_rows, vl_type)
+  {
+  }
+
+  // Returns the hand-written SME2 assembly kernel implementing this strategy.
+  typename Parent::KernelType get_kernel(void) const override
+  {
+    return sme2_u8s8u8q_planar_5x5_s2_4rows_dot_za_impl;
+  }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_5x5_s2_4rows_dot_za/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_5x5_s2_4rows_dot_za/generic.cpp
new file mode 100644
index 0000000000..d8dc69127e
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_5x5_s2_4rows_dot_za/generic.cpp
@@ -0,0 +1,1354 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if defined(ARM_COMPUTE_ENABLE_SME2)
+
+#include <algorithm>
+#include <cstddef>
+#include "arm_gemm.hpp"
+
+using arm_gemm::Requantize32;
+
+namespace arm_conv {
+namespace depthwise {
+
+void sme2_u8s8u8q_planar_5x5_s2_4rows_dot_za_impl(
+ const uint8_t *inptr,
+ size_t ld_in_row,
+ size_t ld_in_col,
+ size_t ld_in_vl,
+ unsigned int pad_top,
+ unsigned int valid_input_rows,
+ unsigned int pad_left,
+ unsigned int valid_input_cols,
+ const int8_t *weights,
+ uint8_t **outptrs,
+ const size_t *outlds,
+ const size_t *outvllds,
+ unsigned int output_cols,
+ unsigned int start_channel,
+ unsigned int valid_channels,
+ const arm_gemm::Requantize32 &qp
+)
+{
+ struct Args
+ {
+ const uint8_t *inptr;
+ size_t ld_in_vl;
+ long unsigned int pad_top, pad_bottom, pad_left;
+ const int8_t *weights;
+ long unsigned int input_cols, output_cols;
+ uint8_t **outptrs;
+ const size_t *ld_out_cols;
+ const size_t *ld_out_vls;
+ long unsigned int current_channel, n_channels;
+ };
+
+ Args args = { inptr, ld_in_vl, pad_top, 11u - std::min(11u, pad_top + valid_input_rows), pad_left, weights, valid_input_cols, output_cols, outptrs, outlds, outvllds, start_channel, valid_channels };
+
+ __asm__ __volatile__(
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "ldr x3, [%x[args], %[offsetof_Args_pad_bottom]]\n"
+ "ptrue p2.b\n"
+ "mov x20, #0xb\n"
+ "ldr x4, [%x[args], %[offsetof_Args_pad_top]]\n"
+ "ld1rh { z7.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_a_offset]]\n"
+ "sub x20, x20, x3\n"
+ ".inst 0x25207812 // ptrue pn10.b\n"
+ "ldr x5, [%x[args], %[offsetof_Args_n_channels]]\n"
+ "whilelt p1.s, XZR, x5\n"
+ "whilelt p9.s, XZR, x20\n"
+ "ld1rw { z10.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_c_offset]]\n"
+ "whilelt p8.s, XZR, x4\n"
+ "addvl SP, SP, #-15\n"
+ "ldr x6, [%x[args], %[offsetof_Args_current_channel]]\n"
+ "neg z7.h, p2/M, z7.h\n"
+ "eor p8.b, p2/Z, p8.b, p9.b\n"
+ "ld1rw { z6.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_per_layer_mul]]\n"
+ "ld1rw { z4.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_per_layer_right_shift]]\n"
+ "ld1rw { z5.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_minval]]\n"
+ "ld1rw { z21.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_maxval]]\n"
+ "1:" // Channel loop
+ "ldr x20, [%x[qp], %[offsetof_Requantize32_bias]]\n"
+ "mov z12.s, #0x0\n"
+ "cbz x20, 2f\n"
+ "ld1w { z12.s }, p1/Z, [x20, x6, LSL #2]\n"
+ "2:" // Load bias: Done
+ "ldr x22, [%x[args], %[offsetof_Args_weights]]\n"
+ "mov x20, x22\n"
+ "ld1sb { z13.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "ld1rh { z28.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_b_offset]]\n"
+ "sub z13.h, z13.h, z28.h\n"
+ "incw x22\n"
+ "mov z26.h, #0x0\n"
+ "ld1sb { z22.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "sub z22.h, z22.h, z28.h\n"
+ "trn1 z17.h, z13.h, z22.h\n"
+ "ld1sb { z20.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "sub z20.h, z20.h, z28.h\n"
+ "addvl x21, SP, #15\n"
+ "ld1sb { z1.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "sub z1.h, z1.h, z28.h\n"
+ "trn1 z29.h, z20.h, z1.h\n"
+ "ld1sb { z27.s }, p2/Z, [x20]\n"
+ "mov x20, x22\n"
+ "sub z27.h, z27.h, z28.h\n"
+ "incw x22\n"
+ "ld1sb { z14.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "sub z14.h, z14.h, z28.h\n"
+ "addvl x21, x21, #-3\n"
+ "ld1sb { z18.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "sub z18.h, z18.h, z28.h\n"
+ "trn1 z22.h, z27.h, z26.h\n"
+ "ld1sb { z23.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "sub z23.h, z23.h, z28.h\n"
+ "st1h { z17.h }, p2, [x21]\n"
+ "ld1sb { z30.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "sub z30.h, z30.h, z28.h\n"
+ "trn1 z8.h, z14.h, z18.h\n"
+ "ld1sb { z15.s }, p2/Z, [x20]\n"
+ "mov x20, x22\n"
+ "st1h { z29.h }, p2, [x21, #1, MUL VL]\n"
+ "sub z15.h, z15.h, z28.h\n"
+ "ld1sb { z20.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "trn1 z23.h, z23.h, z30.h\n"
+ "sub z20.h, z20.h, z28.h\n"
+ "ld1sb { z24.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "sub z24.h, z24.h, z28.h\n"
+ "st1h { z22.h }, p2, [x21, #2, MUL VL]\n"
+ "ld1sb { z16.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "trn1 z0.h, z15.h, z26.h\n"
+ "incw x22\n"
+ "ld1sb { z13.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "sub z16.h, z16.h, z28.h\n"
+ "sub z13.h, z13.h, z28.h\n"
+ "ld1sb { z11.s }, p2/Z, [x20]\n"
+ "addvl x21, x21, #-3\n"
+ "mov x20, x22\n"
+ "st1h { z8.h }, p2, [x21]\n"
+ "trn1 z27.h, z20.h, z24.h\n"
+ "ld1sb { z22.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "sub z11.h, z11.h, z28.h\n"
+ "ld1sb { z3.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "st1h { z23.h }, p2, [x21, #1, MUL VL]\n"
+ "trn1 z20.h, z16.h, z13.h\n"
+ "ld1sb { z13.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "sub z22.h, z22.h, z28.h\n"
+ "sub z3.h, z3.h, z28.h\n"
+ "ld1sb { z15.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "st1h { z0.h }, p2, [x21, #2, MUL VL]\n"
+ "trn1 z29.h, z11.h, z26.h\n"
+ "ld1sb { z16.s }, p2/Z, [x20]\n"
+ "incw x22\n"
+ "sub z13.h, z13.h, z28.h\n"
+ "sub z15.h, z15.h, z28.h\n"
+ "addvl x21, x21, #-3\n"
+ "mov x20, x22\n"
+ "st1h { z27.h }, p2, [x21]\n"
+ "sub z16.h, z16.h, z28.h\n"
+ "trn1 z19.h, z22.h, z3.h\n"
+ "ld1sb { z17.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "st1h { z20.h }, p2, [x21, #1, MUL VL]\n"
+ "ld1sb { z0.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "trn1 z31.h, z13.h, z15.h\n"
+ "st1h { z29.h }, p2, [x21, #2, MUL VL]\n"
+ "ld1sb { z18.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "trn1 z16.h, z16.h, z26.h\n"
+ "sub z17.h, z17.h, z28.h\n"
+ "ld1sb { z22.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "sub z0.h, z0.h, z28.h\n"
+ "sub z18.h, z18.h, z28.h\n"
+ "ld1sb { z1.s }, p2/Z, [x20]\n"
+ "sub z22.h, z22.h, z28.h\n"
+ "sub z1.h, z1.h, z28.h\n"
+ "ldr x20, [%x[qp], %[offsetof_Requantize32_per_channel_muls]]\n"
+ "addvl x21, x21, #-3\n"
+ "st1h { z19.h }, p2, [x21]\n"
+ "mov z13.d, z12.d\n"
+ "mov z14.d, z12.d\n"
+ "st1h { z31.h }, p2, [x21, #1, MUL VL]\n"
+ "mov z15.d, z12.d\n"
+ "trn1 z8.h, z17.h, z0.h\n"
+ "st1h { z16.h }, p2, [x21, #2, MUL VL]\n"
+ "addvl x21, x21, #-3\n"
+ "trn1 z31.h, z18.h, z22.h\n"
+ "trn1 z29.h, z1.h, z26.h\n"
+ "st1h { z8.h }, p2, [x21]\n"
+ "st1h { z31.h }, p2, [x21, #1, MUL VL]\n"
+ "st1h { z29.h }, p2, [x21, #2, MUL VL]\n"
+ "cbz x20, 3f\n"
+ "ld1w { z6.s }, p1/Z, [x20, x6, LSL #2]\n"
+ "3:" // Load mul: End
+ "ldr x20, [%x[qp], %[offsetof_Requantize32_per_channel_right_shifts]]\n"
+ "cbz x20, 4f\n"
+ "ld1w { z4.s }, p1/Z, [x20, x6, LSL #2]\n"
+ "4:" // Load right_shift: End
+ "ldr x7, [%x[args], %[offsetof_Args_input_cols]]\n"
+ "sub x20, x7, #0x1\n"
+ "orr x23, x20, %x[ld_in_col], LSL #16\n"
+ "ldr x17, [%x[args], %[offsetof_Args_inptr]]\n"
+ "orr x23, x5, x23, LSL #22\n"
+ "mov x22, #0xb\n"
+ "add x21, x4, x3\n"
+ "lsl x20, %x[ld_in_row], #0x0\n"
+ "ldr x16, [%x[args], %[offsetof_Args_output_cols]]\n"
+ "mov x8, #0x0\n"
+ "lsl x23, x23, #0x0\n"
+ "sub x22, x22, x21\n"
+ "madd x20, x20, x4, x17\n"
+ "5:" // Issue prefetches
+ "subs x22, x22, #0x1\n"
+ ".inst 0xf8b74a9c // rprfm pldstrm, x23, [x20]\n"
+ "add x20, x20, %x[ld_in_col]\n"
+ "bgt 5b\n"
+ "ldr x23, [%x[args], %[offsetof_Args_outptrs]]\n"
+ "lsl x20, %x[ld_in_row], #0x0\n"
+ "msub x17, x4, x20, x17\n"
+ ".inst 0xc0040d80 // mova za.d[x8, #0], { z12.d-z15.d }\n"
+ "ldr x20, [%x[args], %[offsetof_Args_ld_out_cols]]\n"
+ ".inst 0xc0040d81 // mova za.d[x8, #1], { z12.d-z15.d }\n"
+ "mov x22, #0x4\n"
+ "ldp x15, x14, [x23], #0x10\n"
+ ".inst 0xc0040d82 // mova za.d[x8, #2], { z12.d-z15.d }\n"
+ "ldp x13, x11, [x20], #0x10\n"
+ ".inst 0xc0040d83 // mova za.d[x8, #3], { z12.d-z15.d }\n"
+ "ldr x21, [%x[args], %[offsetof_Args_pad_left]]\n"
+ ".inst 0xc0040d84 // mova za.d[x8, #4], { z12.d-z15.d }\n"
+ "ldp x10, x9, [x23], #0x10\n"
+ "ldp x28, x27, [x20], #0x10\n"
+ "cbz x21, 7f\n"
+ "cmp x21, x22\n"
+ "csel x20, x21, x22, LT\n"
+ "sub x21, x21, x20\n"
+ "sub x22, x22, x20\n"
+ "cbz x21, 7f\n"
+ ".inst 0xc0060c1c // mova { z28.d-z31.d }, za.d[x8, #0]\n"
+ ".inst 0xc1a6ac1c // sqdmulh { z28.s-z31.s }, { z28.s-z31.s }, z6.s\n"
+ "and x22, x21, #0x1\n"
+ ".inst 0xc1a4aa3c // srshl { z28.s-z31.s }, { z28.s-z31.s }, z4.s\n"
+ "add x21, x21, #0x1\n"
+ "lsr x21, x21, #0x1\n"
+ ".inst 0xc1aaab1c // add { z28.s-z31.s }, { z28.s-z31.s }, z10.s\n"
+ "sub x16, x16, x21\n"
+ ".inst 0xc1b5ccbc // sclamp { z28.s-z31.s }, z5.s, z21.s\n"
+ "6:" // Left padding
+ "subs x21, x21, #0x1\n"
+ "st1b { z28.s }, p1, [x15]\n"
+ "add x15, x15, x13\n"
+ "st1b { z29.s }, p1, [x14]\n"
+ "add x14, x14, x11\n"
+ "st1b { z30.s }, p1, [x10]\n"
+ "add x10, x10, x28\n"
+ "st1b { z31.s }, p1, [x9]\n"
+ "add x9, x9, x27\n"
+ "bgt 6b\n"
+ "7:" // Left padding: End
+ "adds XZR, x4, x3\n"
+ "bne 14f\n"
+ "cbz x22, 12f\n"
+ "cmp x22, #0x1\n"
+ "sub x7, x7, x22\n"
+ "beq 11f\n"
+ "cmp x22, #0x2\n"
+ "beq 10f\n"
+ "cmp x22, #0x3\n"
+ "beq 9f\n"
+ "8:" // Unpadded: 4 priming loads
+ "add x21, x17, %x[ld_in_row]\n"
+ "ld1b { z27.s }, p1/Z, [x17]\n"
+ "addvl x20, SP, #12\n"
+ "ld1b { z0.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z27.h, z27.h, z0.h\n"
+ "add z27.h, z27.h, z7.h\n"
+ "ld1b { z28.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "add x17, x17, %x[ld_in_col]\n"
+ "ld1b { z11.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z28.h, z28.h, z11.h\n"
+ "add z28.h, z28.h, z7.h\n"
+ "ld1b { z29.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "ld1b { z8.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z29.h, z29.h, z8.h\n"
+ "add z29.h, z29.h, z7.h\n"
+ "ld1b { z30.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "ld1b { z17.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z30.h, z30.h, z17.h\n"
+ "add z30.h, z30.h, z7.h\n"
+ "ld1b { z31.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "ld1b { z26.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z31.h, z31.h, z26.h\n"
+ "add z31.h, z31.h, z7.h\n"
+ ".inst 0xa1402a80 // ld1h { z0.h, z8.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc1701768 // sdot za.s[x8, 0], { z27.h-z30.h }, z0.h\n"
+ "ld1b { z20.s }, p1/Z, [x21]\n"
+ "mov z0.d, z20.d\n"
+ "add z0.h, z0.h, z7.h\n"
+ ".inst 0xc1781788 // sdot za.s[x8, 0], { z28.h-z31.h }, z8.h\n"
+ "ld1h { z8.h }, p2/Z, [x20, #2, MUL VL]\n"
+ ".inst 0xc17817a8 // sdot za.s[x8, 0], { z29.h-z0.h }, z8.h\n"
+ "9:" // Unpadded: 3 priming loads
+ "add x21, x17, %x[ld_in_row]\n"
+ "ld1b { z29.s }, p1/Z, [x17]\n"
+ "addvl x20, SP, #9\n"
+ "ld1b { z17.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z29.h, z29.h, z17.h\n"
+ "add z29.h, z29.h, z7.h\n"
+ "ld1b { z30.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "add x17, x17, %x[ld_in_col]\n"
+ "ld1b { z0.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z30.h, z30.h, z0.h\n"
+ "add z30.h, z30.h, z7.h\n"
+ "ld1b { z31.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "ld1b { z16.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z31.h, z31.h, z16.h\n"
+ "add z31.h, z31.h, z7.h\n"
+ "ld1b { z0.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "ld1b { z16.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z0.h, z0.h, z16.h\n"
+ "add z0.h, z0.h, z7.h\n"
+ "ld1b { z1.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "ld1b { z16.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z1.h, z1.h, z16.h\n"
+ "add z1.h, z1.h, z7.h\n"
+ ".inst 0xa0402a82 // ld1h { z2.h-z3.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc17217a8 // sdot za.s[x8, 0], { z29.h-z0.h }, z2.h\n"
+ "ld1b { z16.s }, p1/Z, [x21]\n"
+ "mov z2.d, z16.d\n"
+ "add z2.h, z2.h, z7.h\n"
+ ".inst 0xc17317c8 // sdot za.s[x8, 0], { z30.h-z1.h }, z3.h\n"
+ "ld1h { z8.h }, p2/Z, [x20, #2, MUL VL]\n"
+ ".inst 0xc17817e8 // sdot za.s[x8, 0], { z31.h-z2.h }, z8.h\n"
+ "10:" // Unpadded: 2 priming loads
+ "add x22, x17, %x[ld_in_row]\n"
+ "ld1b { z26.s }, p1/Z, [x17]\n"
+ "addvl x21, SP, #6\n"
+ "ld1b { z16.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "trn1 z26.h, z26.h, z16.h\n"
+ "add z26.h, z26.h, z7.h\n"
+ "ld1b { z27.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "addvl x20, SP, #12\n"
+ "ld1b { z16.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "trn1 z27.h, z27.h, z16.h\n"
+ "add z27.h, z27.h, z7.h\n"
+ "ld1b { z28.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "add x17, x17, %x[ld_in_col]\n"
+ "ld1b { z29.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "trn1 z28.h, z28.h, z29.h\n"
+ "add z28.h, z28.h, z7.h\n"
+ "ld1b { z29.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "ld1b { z19.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "trn1 z29.h, z29.h, z19.h\n"
+ "add z29.h, z29.h, z7.h\n"
+ "ld1b { z30.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "ld1b { z23.s }, p1/Z, [x22]\n"
+ "trn1 z30.h, z30.h, z23.h\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "add z30.h, z30.h, z7.h\n"
+ ".inst 0xa0402aa2 // ld1h { z2.h-z3.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc1721748 // sdot za.s[x8, 0], { z26.h-z29.h }, z2.h\n"
+ "ld1b { z22.s }, p1/Z, [x22]\n"
+ "mov z31.d, z22.d\n"
+ ".inst 0xc1731768 // sdot za.s[x8, 0], { z27.h-z30.h }, z3.h\n"
+ ".inst 0xa1402a83 // ld1h { z3.h, z11.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc1731749 // sdot za.s[x8, 1], { z26.h-z29.h }, z3.h\n"
+ "add z31.h, z31.h, z7.h\n"
+ "ld1h { z3.h }, p2/Z, [x21, #2, MUL VL]\n"
+ ".inst 0xc17b1769 // sdot za.s[x8, 1], { z27.h-z30.h }, z11.h\n"
+ ".inst 0xc1731788 // sdot za.s[x8, 0], { z28.h-z31.h }, z3.h\n"
+ "ld1h { z0.h }, p2/Z, [x20, #2, MUL VL]\n"
+ ".inst 0xc1701789 // sdot za.s[x8, 1], { z28.h-z31.h }, z0.h\n"
+ "11:" // Unpadded: 1 priming loads
+ "add x22, x17, %x[ld_in_row]\n"
+ "ld1b { z29.s }, p1/Z, [x17]\n"
+ "addvl x21, SP, #3\n"
+ "ld1b { z22.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "trn1 z29.h, z29.h, z22.h\n"
+ "add z29.h, z29.h, z7.h\n"
+ "ld1b { z30.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "addvl x20, SP, #9\n"
+ "ld1b { z25.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "trn1 z30.h, z30.h, z25.h\n"
+ "add z30.h, z30.h, z7.h\n"
+ "ld1b { z31.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "add x17, x17, %x[ld_in_col]\n"
+ "ld1b { z16.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "trn1 z31.h, z31.h, z16.h\n"
+ "add z31.h, z31.h, z7.h\n"
+ "ld1b { z0.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "ld1b { z16.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "trn1 z0.h, z0.h, z16.h\n"
+ "add z0.h, z0.h, z7.h\n"
+ "ld1b { z1.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "ld1b { z2.s }, p1/Z, [x22]\n"
+ "trn1 z1.h, z1.h, z2.h\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "add z1.h, z1.h, z7.h\n"
+ ".inst 0xa0402aa2 // ld1h { z2.h-z3.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc17217a8 // sdot za.s[x8, 0], { z29.h-z0.h }, z2.h\n"
+ "ld1b { z24.s }, p1/Z, [x22]\n"
+ "mov z2.d, z24.d\n"
+ ".inst 0xc17317c8 // sdot za.s[x8, 0], { z30.h-z1.h }, z3.h\n"
+ ".inst 0xa0402a88 // ld1h { z8.h-z9.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc17817a9 // sdot za.s[x8, 1], { z29.h-z0.h }, z8.h\n"
+ "add z2.h, z2.h, z7.h\n"
+ "ld1h { z3.h }, p2/Z, [x21, #2, MUL VL]\n"
+ ".inst 0xc17917c9 // sdot za.s[x8, 1], { z30.h-z1.h }, z9.h\n"
+ ".inst 0xc17317e8 // sdot za.s[x8, 0], { z31.h-z2.h }, z3.h\n"
+ "ld1h { z3.h }, p2/Z, [x20, #2, MUL VL]\n"
+ ".inst 0xc17317e9 // sdot za.s[x8, 1], { z31.h-z2.h }, z3.h\n"
+ "12:" // Unpadded: 0 priming loads
+ "cmp x7, #0x2\n"
+ ".inst 0xa1402be3 // ld1h { z3.h, z11.h }, pn10.b/Z, [SP]\n"
+ "ld1h { z2.h }, p2/Z, [SP, #2, MUL VL]\n"
+ "blt 22f\n"
+ "add x21, x17, %x[ld_in_row]\n"
+ "ld1b { z23.s }, p1/Z, [x17]\n"
+ "sub x7, x7, #0x2\n"
+ "ld1b { z25.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z23.h, z23.h, z25.h\n"
+ "sub x16, x16, #0x1\n"
+ "ld1b { z24.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "lsr x20, x7, #0x1\n"
+ "add z23.h, z23.h, z7.h\n"
+ "ld1b { z30.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z24.h, z24.h, z30.h\n"
+ "cmp x20, x16\n"
+ "ld1b { z25.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "csel x26, x20, x16, LT\n"
+ "add z24.h, z24.h, z7.h\n"
+ "ld1b { z22.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z25.h, z25.h, z22.h\n"
+ "add z25.h, z25.h, z7.h\n"
+ "ld1b { z26.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "add x17, x17, %x[ld_in_col]\n"
+ "ld1b { z22.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z26.h, z26.h, z22.h\n"
+ "add z26.h, z26.h, z7.h\n"
+ "ld1b { z27.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "and x7, x7, #0x1\n"
+ "ld1b { z30.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z27.h, z27.h, z30.h\n"
+ "add z27.h, z27.h, z7.h\n"
+ "ld1b { z28.s }, p1/Z, [x21]\n"
+ "mov z28.d, z28.d\n"
+ "add z28.h, z28.h, z7.h\n"
+ "sub x16, x16, x26\n"
+ "cbz x26, 21f\n"
+ "13:" // Unpadded: Main loop
+ ".inst 0xc17316e8 // sdot za.s[x8, 0], { z23.h-z26.h }, z3.h\n"
+ "addvl x25, SP, #6\n"
+ "addvl x24, SP, #12\n"
+ ".inst 0xc17b1708 // sdot za.s[x8, 0], { z24.h-z27.h }, z11.h\n"
+ ".inst 0xa0402b20 // ld1h { z0.h-z1.h }, pn10.b/Z, [x25]\n"
+ "add x23, x17, %x[ld_in_row]\n"
+ "addvl x22, SP, #3\n"
+ ".inst 0xc17016e9 // sdot za.s[x8, 1], { z23.h-z26.h }, z0.h\n"
+ "addvl x21, SP, #9\n"
+ "subs x26, x26, #0x1\n"
+ ".inst 0xc1711709 // sdot za.s[x8, 1], { z24.h-z27.h }, z1.h\n"
+ ".inst 0xa0402b08 // ld1h { z8.h-z9.h }, pn10.b/Z, [x24]\n"
+ ".inst 0xc17816ea // sdot za.s[x8, 2], { z23.h-z26.h }, z8.h\n"
+ "ld1b { z23.s }, p1/Z, [x17]\n"
+ "add x17, x17, %x[ld_in_col]\n"
+ "add x20, x17, %x[ld_in_row]\n"
+ ".inst 0xc1721728 // sdot za.s[x8, 0], { z25.h-z28.h }, z2.h\n"
+ "ld1h { z0.h }, p2/Z, [x25, #2, MUL VL]\n"
+ ".inst 0xc179170a // sdot za.s[x8, 2], { z24.h-z27.h }, z9.h\n"
+ "ld1b { z16.s }, p1/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row]\n"
+ "trn1 z23.h, z23.h, z16.h\n"
+ ".inst 0xc1701729 // sdot za.s[x8, 1], { z25.h-z28.h }, z0.h\n"
+ "ld1h { z9.h }, p2/Z, [x24, #2, MUL VL]\n"
+ "add z23.h, z23.h, z7.h\n"
+ "ld1b { z24.s }, p1/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row]\n"
+ ".inst 0xc179172a // sdot za.s[x8, 2], { z25.h-z28.h }, z9.h\n"
+ "ld1b { z18.s }, p1/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row]\n"
+ "trn1 z24.h, z24.h, z18.h\n"
+ "add z24.h, z24.h, z7.h\n"
+ "ld1b { z25.s }, p1/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row]\n"
+ ".inst 0xc0060c10 // mova { z16.d-z19.d }, za.d[x8, #0]\n"
+ "add x8, x8, #0x1\n"
+ "ld1b { z8.s }, p1/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row]\n"
+ "trn1 z25.h, z25.h, z8.h\n"
+ "add z25.h, z25.h, z7.h\n"
+ "ld1b { z26.s }, p1/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row]\n"
+ ".inst 0xc1a6ac10 // sqdmulh { z16.s-z19.s }, { z16.s-z19.s }, z6.s\n"
+ "ld1b { z28.s }, p1/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row]\n"
+ "trn1 z26.h, z26.h, z28.h\n"
+ "add z26.h, z26.h, z7.h\n"
+ "ld1b { z27.s }, p1/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row]\n"
+ ".inst 0xc1a4aa30 // srshl { z16.s-z19.s }, { z16.s-z19.s }, z4.s\n"
+ "ld1b { z28.s }, p1/Z, [x23]\n"
+ "trn1 z27.h, z27.h, z28.h\n"
+ "add x23, x23, %x[ld_in_row]\n"
+ "add z27.h, z27.h, z7.h\n"
+ ".inst 0xa0402ac2 // ld1h { z2.h-z3.h }, pn10.b/Z, [x22]\n"
+ ".inst 0xc17216e8 // sdot za.s[x8, 0], { z23.h-z26.h }, z2.h\n"
+ ".inst 0xc1aaab10 // add { z16.s-z19.s }, { z16.s-z19.s }, z10.s\n"
+ "ld1b { z20.s }, p1/Z, [x23]\n"
+ "mov z28.d, z20.d\n"
+ ".inst 0xc1731708 // sdot za.s[x8, 0], { z24.h-z27.h }, z3.h\n"
+ ".inst 0xa0402aa0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc17016e9 // sdot za.s[x8, 1], { z23.h-z26.h }, z0.h\n"
+ "add z28.h, z28.h, z7.h\n"
+ "ld1h { z0.h }, p2/Z, [x22, #2, MUL VL]\n"
+ ".inst 0xc1711709 // sdot za.s[x8, 1], { z24.h-z27.h }, z1.h\n"
+ ".inst 0xc1b5ccb0 // sclamp { z16.s-z19.s }, z5.s, z21.s\n"
+ ".inst 0xc1701728 // sdot za.s[x8, 0], { z25.h-z28.h }, z0.h\n"
+ "ld1h { z0.h }, p2/Z, [x21, #2, MUL VL]\n"
+ "st1b { z16.s }, p1, [x15]\n"
+ "add x15, x15, x13\n"
+ "ld1b { z23.s }, p1/Z, [x17]\n"
+ ".inst 0xc1701729 // sdot za.s[x8, 1], { z25.h-z28.h }, z0.h\n"
+ "st1b { z17.s }, p1, [x14]\n"
+ "add x14, x14, x11\n"
+ "ld1b { z16.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z23.h, z23.h, z16.h\n"
+ "st1b { z18.s }, p1, [x10]\n"
+ "ld1b { z24.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "add x10, x10, x28\n"
+ "st1b { z19.s }, p1, [x9]\n"
+ "ld1b { z16.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z24.h, z24.h, z16.h\n"
+ "add x9, x9, x27\n"
+ "ld1b { z25.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0xc0040d84 // mova za.d[x8, #4], { z12.d-z15.d }\n"
+ "add z23.h, z23.h, z7.h\n"
+ "ld1b { z16.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z25.h, z25.h, z16.h\n"
+ "add z24.h, z24.h, z7.h\n"
+ "ld1b { z26.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "add z25.h, z25.h, z7.h\n"
+ "add x17, x17, %x[ld_in_col]\n"
+ "ld1b { z16.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z26.h, z26.h, z16.h\n"
+ "add z26.h, z26.h, z7.h\n"
+ "ld1b { z27.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "ld1b { z16.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z27.h, z27.h, z16.h\n"
+ "add z27.h, z27.h, z7.h\n"
+ "ld1b { z16.s }, p1/Z, [x20]\n"
+ "mov z28.d, z16.d\n"
+ "add z28.h, z28.h, z7.h\n"
+ ".inst 0xa1402be3 // ld1h { z3.h, z11.h }, pn10.b/Z, [SP]\n"
+ "ld1h { z2.h }, p2/Z, [SP, #2, MUL VL]\n"
+ "bgt 13b\n"
+ "b 21f\n"
+ "14:" // Padded
+ "cbz x22, 19f\n"
+ "cmp x22, #0x1\n"
+ "sub x7, x7, x22\n"
+ "beq 18f\n"
+ "cmp x22, #0x2\n"
+ "beq 17f\n"
+ "cmp x22, #0x3\n"
+ "beq 16f\n"
+ "15:" // Padded: 4 priming loads
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z27.s }, p0/Z, [x17]\n"
+ "add z27.h, p0/M, z27.h, z7.h\n"
+ "add x21, x17, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z17.s }, p0/Z, [x21]\n"
+ "add z17.h, p0/M, z17.h, z7.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z28.s }, p0/Z, [x21]\n"
+ "add z28.h, p0/M, z28.h, z7.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z16.s }, p0/Z, [x21]\n"
+ "add z16.h, p0/M, z16.h, z7.h\n"
+ "mov x12, #0x4\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z27.h, z27.h, z17.h\n"
+ "trn1 z28.h, z28.h, z16.h\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z29.s }, p0/Z, [x21]\n"
+ "add z29.h, p0/M, z29.h, z7.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z18.s }, p0/Z, [x21]\n"
+ "add z18.h, p0/M, z18.h, z7.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z30.s }, p0/Z, [x21]\n"
+ "add z30.h, p0/M, z30.h, z7.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z17.s }, p0/Z, [x21]\n"
+ "mov x12, #0x8\n"
+ "add z17.h, p0/M, z17.h, z7.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z31.s }, p0/Z, [x21]\n"
+ "add z31.h, p0/M, z31.h, z7.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z16.s }, p0/Z, [x21]\n"
+ "add z16.h, p0/M, z16.h, z7.h\n"
+ "addvl x20, SP, #12\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z29.h, z29.h, z18.h\n"
+ "trn1 z30.h, z30.h, z17.h\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ ".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
+ "trn1 z31.h, z31.h, z16.h\n"
+ ".inst 0xc1701768 // sdot za.s[x8, 0], { z27.h-z30.h }, z0.h\n"
+ "ld1b { z20.s }, p0/Z, [x21]\n"
+ "add z20.h, p0/M, z20.h, z7.h\n"
+ "mov z0.d, z20.d\n"
+ "add x17, x17, %x[ld_in_col]\n"
+ ".inst 0xc1711788 // sdot za.s[x8, 0], { z28.h-z31.h }, z1.h\n"
+ "ld1h { z1.h }, p2/Z, [x20, #2, MUL VL]\n"
+ ".inst 0xc17117a8 // sdot za.s[x8, 0], { z29.h-z0.h }, z1.h\n"
+ "16:" // Padded: 3 priming loads
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z24.s }, p0/Z, [x17]\n"
+ "add z24.h, p0/M, z24.h, z7.h\n"
+ "add x21, x17, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z17.s }, p0/Z, [x21]\n"
+ "add z17.h, p0/M, z17.h, z7.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z25.s }, p0/Z, [x21]\n"
+ "add z25.h, p0/M, z25.h, z7.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z16.s }, p0/Z, [x21]\n"
+ "add z16.h, p0/M, z16.h, z7.h\n"
+ "mov x12, #0x4\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z24.h, z24.h, z17.h\n"
+ "trn1 z25.h, z25.h, z16.h\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z26.s }, p0/Z, [x21]\n"
+ "add z26.h, p0/M, z26.h, z7.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z18.s }, p0/Z, [x21]\n"
+ "add z18.h, p0/M, z18.h, z7.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z27.s }, p0/Z, [x21]\n"
+ "add z27.h, p0/M, z27.h, z7.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z17.s }, p0/Z, [x21]\n"
+ "mov x12, #0x8\n"
+ "add z17.h, p0/M, z17.h, z7.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z28.s }, p0/Z, [x21]\n"
+ "add z28.h, p0/M, z28.h, z7.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z16.s }, p0/Z, [x21]\n"
+ "add z16.h, p0/M, z16.h, z7.h\n"
+ "addvl x20, SP, #9\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z26.h, z26.h, z18.h\n"
+ "trn1 z27.h, z27.h, z17.h\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ ".inst 0xa0402a82 // ld1h { z2.h-z3.h }, pn10.b/Z, [x20]\n"
+ "trn1 z28.h, z28.h, z16.h\n"
+ ".inst 0xc1721708 // sdot za.s[x8, 0], { z24.h-z27.h }, z2.h\n"
+ "ld1b { z11.s }, p0/Z, [x21]\n"
+ "add z11.h, p0/M, z11.h, z7.h\n"
+ "mov z29.d, z11.d\n"
+ "add x17, x17, %x[ld_in_col]\n"
+ ".inst 0xc1731728 // sdot za.s[x8, 0], { z25.h-z28.h }, z3.h\n"
+ "ld1h { z0.h }, p2/Z, [x20, #2, MUL VL]\n"
+ ".inst 0xc1701748 // sdot za.s[x8, 0], { z26.h-z29.h }, z0.h\n"
+ "17:" // Padded: 2 priming loads
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z25.s }, p0/Z, [x17]\n"
+ "add z25.h, p0/M, z25.h, z7.h\n"
+ "add x20, x17, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z17.s }, p0/Z, [x20]\n"
+ "add z17.h, p0/M, z17.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z26.s }, p0/Z, [x20]\n"
+ "add z26.h, p0/M, z26.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z16.s }, p0/Z, [x20]\n"
+ "add z16.h, p0/M, z16.h, z7.h\n"
+ "mov x12, #0x4\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z25.h, z25.h, z17.h\n"
+ "trn1 z26.h, z26.h, z16.h\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z27.s }, p0/Z, [x20]\n"
+ "add z27.h, p0/M, z27.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z18.s }, p0/Z, [x20]\n"
+ "add z18.h, p0/M, z18.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z28.s }, p0/Z, [x20]\n"
+ "add z28.h, p0/M, z28.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z17.s }, p0/Z, [x20]\n"
+ "mov x12, #0x8\n"
+ "add z17.h, p0/M, z17.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z29.s }, p0/Z, [x20]\n"
+ "add z29.h, p0/M, z29.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z16.s }, p0/Z, [x20]\n"
+ "add z16.h, p0/M, z16.h, z7.h\n"
+ "addvl x21, SP, #6\n"
+ "trn1 z27.h, z27.h, z18.h\n"
+ "trn1 z28.h, z28.h, z17.h\n"
+ ".inst 0xa1402aa1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x21]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "trn1 z29.h, z29.h, z16.h\n"
+ ".inst 0xc1711728 // sdot za.s[x8, 0], { z25.h-z28.h }, z1.h\n"
+ "ld1b { z1.s }, p0/Z, [x20]\n"
+ "addvl x20, SP, #12\n"
+ "add z1.h, p0/M, z1.h, z7.h\n"
+ ".inst 0xc1791748 // sdot za.s[x8, 0], { z26.h-z29.h }, z9.h\n"
+ ".inst 0xa0402a82 // ld1h { z2.h-z3.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc1721729 // sdot za.s[x8, 1], { z25.h-z28.h }, z2.h\n"
+ "mov z30.d, z1.d\n"
+ "add x17, x17, %x[ld_in_col]\n"
+ "ld1h { z9.h }, p2/Z, [x21, #2, MUL VL]\n"
+ ".inst 0xc1731749 // sdot za.s[x8, 1], { z26.h-z29.h }, z3.h\n"
+ ".inst 0xc1791768 // sdot za.s[x8, 0], { z27.h-z30.h }, z9.h\n"
+ "ld1h { z0.h }, p2/Z, [x20, #2, MUL VL]\n"
+ ".inst 0xc1701769 // sdot za.s[x8, 1], { z27.h-z30.h }, z0.h\n"
+ "18:" // Padded: 1 priming loads
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z25.s }, p0/Z, [x17]\n"
+ "add z25.h, p0/M, z25.h, z7.h\n"
+ "add x20, x17, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z17.s }, p0/Z, [x20]\n"
+ "add z17.h, p0/M, z17.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z26.s }, p0/Z, [x20]\n"
+ "add z26.h, p0/M, z26.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z16.s }, p0/Z, [x20]\n"
+ "add z16.h, p0/M, z16.h, z7.h\n"
+ "mov x12, #0x4\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z25.h, z25.h, z17.h\n"
+ "trn1 z26.h, z26.h, z16.h\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z27.s }, p0/Z, [x20]\n"
+ "add z27.h, p0/M, z27.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z18.s }, p0/Z, [x20]\n"
+ "add z18.h, p0/M, z18.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z28.s }, p0/Z, [x20]\n"
+ "add z28.h, p0/M, z28.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z17.s }, p0/Z, [x20]\n"
+ "mov x12, #0x8\n"
+ "add z17.h, p0/M, z17.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z29.s }, p0/Z, [x20]\n"
+ "add z29.h, p0/M, z29.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z16.s }, p0/Z, [x20]\n"
+ "add z16.h, p0/M, z16.h, z7.h\n"
+ "addvl x21, SP, #3\n"
+ "trn1 z27.h, z27.h, z18.h\n"
+ "trn1 z28.h, z28.h, z17.h\n"
+ ".inst 0xa1402aa3 // ld1h { z3.h, z11.h }, pn10.b/Z, [x21]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "trn1 z29.h, z29.h, z16.h\n"
+ ".inst 0xc1731728 // sdot za.s[x8, 0], { z25.h-z28.h }, z3.h\n"
+ "ld1b { z0.s }, p0/Z, [x20]\n"
+ "addvl x20, SP, #9\n"
+ "add z0.h, p0/M, z0.h, z7.h\n"
+ ".inst 0xc17b1748 // sdot za.s[x8, 0], { z26.h-z29.h }, z11.h\n"
+ ".inst 0xa0402a82 // ld1h { z2.h-z3.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc1721729 // sdot za.s[x8, 1], { z25.h-z28.h }, z2.h\n"
+ "mov z30.d, z0.d\n"
+ "add x17, x17, %x[ld_in_col]\n"
+ "ld1h { z0.h }, p2/Z, [x21, #2, MUL VL]\n"
+ ".inst 0xc1731749 // sdot za.s[x8, 1], { z26.h-z29.h }, z3.h\n"
+ ".inst 0xc1701768 // sdot za.s[x8, 0], { z27.h-z30.h }, z0.h\n"
+ "ld1h { z0.h }, p2/Z, [x20, #2, MUL VL]\n"
+ ".inst 0xc1701769 // sdot za.s[x8, 1], { z27.h-z30.h }, z0.h\n"
+ "19:" // Padded: 0 priming loads
+ "cmp x7, #0x2\n"
+ ".inst 0xa1402be3 // ld1h { z3.h, z11.h }, pn10.b/Z, [SP]\n"
+ "ld1h { z2.h }, p2/Z, [SP, #2, MUL VL]\n"
+ "blt 22f\n"
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z23.s }, p0/Z, [x17]\n"
+ "add z23.h, p0/M, z23.h, z7.h\n"
+ "add x20, x17, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z17.s }, p0/Z, [x20]\n"
+ "add z17.h, p0/M, z17.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z24.s }, p0/Z, [x20]\n"
+ "add z24.h, p0/M, z24.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z16.s }, p0/Z, [x20]\n"
+ "add z16.h, p0/M, z16.h, z7.h\n"
+ "mov x12, #0x4\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z23.h, z23.h, z17.h\n"
+ "trn1 z24.h, z24.h, z16.h\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z25.s }, p0/Z, [x20]\n"
+ "add z25.h, p0/M, z25.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z19.s }, p0/Z, [x20]\n"
+ "add z19.h, p0/M, z19.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z26.s }, p0/Z, [x20]\n"
+ "add z26.h, p0/M, z26.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z18.s }, p0/Z, [x20]\n"
+ "mov x12, #0x8\n"
+ "add z18.h, p0/M, z18.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z27.s }, p0/Z, [x20]\n"
+ "add z27.h, p0/M, z27.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z17.s }, p0/Z, [x20]\n"
+ "add z17.h, p0/M, z17.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z16.s }, p0/Z, [x20]\n"
+ "add z16.h, p0/M, z16.h, z7.h\n"
+ "sub x7, x7, #0x2\n"
+ "sub x16, x16, #0x1\n"
+ "trn1 z25.h, z25.h, z19.h\n"
+ "trn1 z26.h, z26.h, z18.h\n"
+ "lsr x20, x7, #0x1\n"
+ "cmp x20, x16\n"
+ "trn1 z27.h, z27.h, z17.h\n"
+ "mov z28.d, z16.d\n"
+ "csel x25, x20, x16, LT\n"
+ "add x17, x17, %x[ld_in_col]\n"
+ "and x7, x7, #0x1\n"
+ "sub x16, x16, x25\n"
+ "cbz x25, 21f\n"
+ "20:" // Padded: Main loop
+ ".inst 0xc17316e8 // sdot za.s[x8, 0], { z23.h-z26.h }, z3.h\n"
+ "addvl x24, SP, #6\n"
+ "addvl x23, SP, #12\n"
+ ".inst 0xc17b1708 // sdot za.s[x8, 0], { z24.h-z27.h }, z11.h\n"
+ ".inst 0xa1402b00 // ld1h { z0.h, z8.h }, pn10.b/Z, [x24]\n"
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ ".inst 0xc17016e9 // sdot za.s[x8, 1], { z23.h-z26.h }, z0.h\n"
+ "add x20, x17, %x[ld_in_row]\n"
+ "addvl x22, SP, #3\n"
+ ".inst 0xc1781709 // sdot za.s[x8, 1], { z24.h-z27.h }, z8.h\n"
+ ".inst 0xa1402ae3 // ld1h { z3.h, z11.h }, pn10.b/Z, [x23]\n"
+ "addvl x21, SP, #9\n"
+ "subs x25, x25, #0x1\n"
+ ".inst 0xc17316ea // sdot za.s[x8, 2], { z23.h-z26.h }, z3.h\n"
+ "ld1b { z23.s }, p0/Z, [x17]\n"
+ "add z23.h, p0/M, z23.h, z7.h\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z16.s }, p0/Z, [x20]\n"
+ "add z16.h, p0/M, z16.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ ".inst 0xc17b170a // sdot za.s[x8, 2], { z24.h-z27.h }, z11.h\n"
+ "ld1b { z24.s }, p0/Z, [x20]\n"
+ "add z24.h, p0/M, z24.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0xc1721728 // sdot za.s[x8, 0], { z25.h-z28.h }, z2.h\n"
+ "ld1h { z0.h }, p2/Z, [x24, #2, MUL VL]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "mov x12, #0x4\n"
+ "ld1b { z1.s }, p0/Z, [x20]\n"
+ ".inst 0xc1701729 // sdot za.s[x8, 1], { z25.h-z28.h }, z0.h\n"
+ "add z1.h, p0/M, z1.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "ld1h { z3.h }, p2/Z, [x23, #2, MUL VL]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ ".inst 0xc173172a // sdot za.s[x8, 2], { z25.h-z28.h }, z3.h\n"
+ "trn1 z23.h, z23.h, z16.h\n"
+ "ld1b { z25.s }, p0/Z, [x20]\n"
+ "add z25.h, p0/M, z25.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z3.s }, p0/Z, [x20]\n"
+ "add z3.h, p0/M, z3.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z26.s }, p0/Z, [x20]\n"
+ "add z26.h, p0/M, z26.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z30.s }, p0/Z, [x20]\n"
+ "mov x12, #0x8\n"
+ "add z30.h, p0/M, z30.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z27.s }, p0/Z, [x20]\n"
+ "add z27.h, p0/M, z27.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z29.s }, p0/Z, [x20]\n"
+ "add z29.h, p0/M, z29.h, z7.h\n"
+ "trn1 z24.h, z24.h, z1.h\n"
+ "trn1 z25.h, z25.h, z3.h\n"
+ "trn1 z26.h, z26.h, z30.h\n"
+ ".inst 0xa0402ac2 // ld1h { z2.h-z3.h }, pn10.b/Z, [x22]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0xc0060c10 // mova { z16.d-z19.d }, za.d[x8, #0]\n"
+ "add x8, x8, #0x1\n"
+ "trn1 z27.h, z27.h, z29.h\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ ".inst 0xc17216e8 // sdot za.s[x8, 0], { z23.h-z26.h }, z2.h\n"
+ "ld1b { z20.s }, p0/Z, [x20]\n"
+ "mov x12, #0x0\n"
+ "add z20.h, p0/M, z20.h, z7.h\n"
+ ".inst 0xc1731708 // sdot za.s[x8, 0], { z24.h-z27.h }, z3.h\n"
+ ".inst 0xa0402aa2 // ld1h { z2.h-z3.h }, pn10.b/Z, [x21]\n"
+ "add x17, x17, %x[ld_in_col]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ ".inst 0xc17216e9 // sdot za.s[x8, 1], { z23.h-z26.h }, z2.h\n"
+ "ld1b { z23.s }, p0/Z, [x17]\n"
+ "add z23.h, p0/M, z23.h, z7.h\n"
+ "add x20, x17, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z8.s }, p0/Z, [x20]\n"
+ "add z8.h, p0/M, z8.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ ".inst 0xc1731709 // sdot za.s[x8, 1], { z24.h-z27.h }, z3.h\n"
+ "ld1b { z24.s }, p0/Z, [x20]\n"
+ "mov z28.d, z20.d\n"
+ "ld1h { z1.h }, p2/Z, [x22, #2, MUL VL]\n"
+ "add z24.h, p0/M, z24.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z22.s }, p0/Z, [x20]\n"
+ ".inst 0xc1711728 // sdot za.s[x8, 0], { z25.h-z28.h }, z1.h\n"
+ "mov x12, #0x4\n"
+ "add z22.h, p0/M, z22.h, z7.h\n"
+ "ld1h { z1.h }, p2/Z, [x21, #2, MUL VL]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ ".inst 0xc1711729 // sdot za.s[x8, 1], { z25.h-z28.h }, z1.h\n"
+ "ld1b { z25.s }, p0/Z, [x20]\n"
+ "add z25.h, p0/M, z25.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z28.s }, p0/Z, [x20]\n"
+ "add z28.h, p0/M, z28.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z26.s }, p0/Z, [x20]\n"
+ "add z26.h, p0/M, z26.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z20.s }, p0/Z, [x20]\n"
+ "mov x12, #0x8\n"
+ "add z20.h, p0/M, z20.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z27.s }, p0/Z, [x20]\n"
+ ".inst 0xc1a6ac10 // sqdmulh { z16.s-z19.s }, { z16.s-z19.s }, z6.s\n"
+ "add z27.h, p0/M, z27.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z31.s }, p0/Z, [x20]\n"
+ ".inst 0xc1a4aa30 // srshl { z16.s-z19.s }, { z16.s-z19.s }, z4.s\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0xc0040d84 // mova za.d[x8, #4], { z12.d-z15.d }\n"
+ "add z31.h, p0/M, z31.h, z7.h\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z1.s }, p0/Z, [x20]\n"
+ "add z1.h, p0/M, z1.h, z7.h\n"
+ ".inst 0xc1aaab10 // add { z16.s-z19.s }, { z16.s-z19.s }, z10.s\n"
+ ".inst 0xa1402be3 // ld1h { z3.h, z11.h }, pn10.b/Z, [SP]\n"
+ "add x17, x17, %x[ld_in_col]\n"
+ ".inst 0xc1b5ccb0 // sclamp { z16.s-z19.s }, z5.s, z21.s\n"
+ "st1b { z16.s }, p1, [x15]\n"
+ "add x15, x15, x13\n"
+ "ld1h { z2.h }, p2/Z, [SP, #2, MUL VL]\n"
+ "st1b { z17.s }, p1, [x14]\n"
+ "add x14, x14, x11\n"
+ "trn1 z23.h, z23.h, z8.h\n"
+ "trn1 z24.h, z24.h, z22.h\n"
+ "st1b { z18.s }, p1, [x10]\n"
+ "add x10, x10, x28\n"
+ "trn1 z25.h, z25.h, z28.h\n"
+ "trn1 z26.h, z26.h, z20.h\n"
+ "st1b { z19.s }, p1, [x9]\n"
+ "add x9, x9, x27\n"
+ "trn1 z27.h, z27.h, z31.h\n"
+ "mov z28.d, z1.d\n"
+ "bgt 20b\n"
+ "21:" // Main loop tail
+ ".inst 0xc17316e8 // sdot za.s[x8, 0], { z23.h-z26.h }, z3.h\n"
+ "addvl x24, SP, #6\n"
+ "addvl x23, SP, #12\n"
+ ".inst 0xc17b1708 // sdot za.s[x8, 0], { z24.h-z27.h }, z11.h\n"
+ ".inst 0xa0402b08 // ld1h { z8.h-z9.h }, pn10.b/Z, [x24]\n"
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ ".inst 0xc17816e9 // sdot za.s[x8, 1], { z23.h-z26.h }, z8.h\n"
+ "add x22, x17, %x[ld_in_row]\n"
+ "addvl x21, SP, #3\n"
+ ".inst 0xc1791709 // sdot za.s[x8, 1], { z24.h-z27.h }, z9.h\n"
+ ".inst 0xa1402ae3 // ld1h { z3.h, z11.h }, pn10.b/Z, [x23]\n"
+ "addvl x20, SP, #9\n"
+ ".inst 0xc17316ea // sdot za.s[x8, 2], { z23.h-z26.h }, z3.h\n"
+ "ld1b { z29.s }, p0/Z, [x17]\n"
+ "add z29.h, p0/M, z29.h, z7.h\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z8.s }, p0/Z, [x22]\n"
+ "add z8.h, p0/M, z8.h, z7.h\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ ".inst 0xc17b170a // sdot za.s[x8, 2], { z24.h-z27.h }, z11.h\n"
+ "ld1b { z30.s }, p0/Z, [x22]\n"
+ "add z30.h, p0/M, z30.h, z7.h\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ ".inst 0xc1721728 // sdot za.s[x8, 0], { z25.h-z28.h }, z2.h\n"
+ "ld1h { z0.h }, p2/Z, [x24, #2, MUL VL]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "mov x12, #0x4\n"
+ "ld1b { z20.s }, p0/Z, [x22]\n"
+ ".inst 0xc1701729 // sdot za.s[x8, 1], { z25.h-z28.h }, z0.h\n"
+ "add z20.h, p0/M, z20.h, z7.h\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "ld1h { z2.h }, p2/Z, [x23, #2, MUL VL]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ ".inst 0xc172172a // sdot za.s[x8, 2], { z25.h-z28.h }, z2.h\n"
+ "trn1 z29.h, z29.h, z8.h\n"
+ "ld1b { z31.s }, p0/Z, [x22]\n"
+ "add z31.h, p0/M, z31.h, z7.h\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z25.s }, p0/Z, [x22]\n"
+ "add z25.h, p0/M, z25.h, z7.h\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z0.s }, p0/Z, [x22]\n"
+ "add z0.h, p0/M, z0.h, z7.h\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z17.s }, p0/Z, [x22]\n"
+ "mov x12, #0x8\n"
+ "add z17.h, p0/M, z17.h, z7.h\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z1.s }, p0/Z, [x22]\n"
+ "add z1.h, p0/M, z1.h, z7.h\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z28.s }, p0/Z, [x22]\n"
+ "add z28.h, p0/M, z28.h, z7.h\n"
+ "trn1 z30.h, z30.h, z20.h\n"
+ "trn1 z31.h, z31.h, z25.h\n"
+ "trn1 z0.h, z0.h, z17.h\n"
+ ".inst 0xa1402aa3 // ld1h { z3.h, z11.h }, pn10.b/Z, [x21]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ ".inst 0xc0060c18 // mova { z24.d-z27.d }, za.d[x8, #0]\n"
+ "add x8, x8, #0x1\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "trn1 z1.h, z1.h, z28.h\n"
+ ".inst 0xc17317a8 // sdot za.s[x8, 0], { z29.h-z0.h }, z3.h\n"
+ "ld1b { z22.s }, p0/Z, [x22]\n"
+ ".inst 0xc1a6ac18 // sqdmulh { z24.s-z27.s }, { z24.s-z27.s }, z6.s\n"
+ "add z22.h, p0/M, z22.h, z7.h\n"
+ ".inst 0xc17b17c8 // sdot za.s[x8, 0], { z30.h-z1.h }, z11.h\n"
+ ".inst 0xa1402a83 // ld1h { z3.h, z11.h }, pn10.b/Z, [x20]\n"
+ "add x17, x17, %x[ld_in_col]\n"
+ ".inst 0xc1a4aa38 // srshl { z24.s-z27.s }, { z24.s-z27.s }, z4.s\n"
+ ".inst 0xc17317a9 // sdot za.s[x8, 1], { z29.h-z0.h }, z3.h\n"
+ "mov z2.d, z22.d\n"
+ "ld1h { z9.h }, p2/Z, [x21, #2, MUL VL]\n"
+ ".inst 0xc17b17c9 // sdot za.s[x8, 1], { z30.h-z1.h }, z11.h\n"
+ ".inst 0xc1aaab18 // add { z24.s-z27.s }, { z24.s-z27.s }, z10.s\n"
+ ".inst 0xc17917e8 // sdot za.s[x8, 0], { z31.h-z2.h }, z9.h\n"
+ "ld1h { z8.h }, p2/Z, [x20, #2, MUL VL]\n"
+ ".inst 0xc1b5ccb8 // sclamp { z24.s-z27.s }, z5.s, z21.s\n"
+ "st1b { z24.s }, p1, [x15]\n"
+ "add x15, x15, x13\n"
+ "st1b { z25.s }, p1, [x14]\n"
+ "add x14, x14, x11\n"
+ ".inst 0xc0040d84 // mova za.d[x8, #4], { z12.d-z15.d }\n"
+ ".inst 0xa1402be3 // ld1h { z3.h, z11.h }, pn10.b/Z, [SP]\n"
+ "st1b { z26.s }, p1, [x10]\n"
+ "add x10, x10, x28\n"
+ ".inst 0xc17817e9 // sdot za.s[x8, 1], { z31.h-z2.h }, z8.h\n"
+ "ld1h { z2.h }, p2/Z, [SP, #2, MUL VL]\n"
+ "st1b { z27.s }, p1, [x9]\n"
+ "add x9, x9, x27\n"
+ "22:" // Main loop skip tail
+ "cbz x7, 23f\n" // Skip remainder inputs
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z24.s }, p0/Z, [x17]\n"
+ "add z24.h, p0/M, z24.h, z7.h\n"
+ "add x20, x17, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z17.s }, p0/Z, [x20]\n"
+ "add z17.h, p0/M, z17.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z25.s }, p0/Z, [x20]\n"
+ "add z25.h, p0/M, z25.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z16.s }, p0/Z, [x20]\n"
+ "add z16.h, p0/M, z16.h, z7.h\n"
+ "mov x12, #0x4\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z24.h, z24.h, z17.h\n"
+ "trn1 z25.h, z25.h, z16.h\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z26.s }, p0/Z, [x20]\n"
+ "add z26.h, p0/M, z26.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z17.s }, p0/Z, [x20]\n"
+ "add z17.h, p0/M, z17.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z27.s }, p0/Z, [x20]\n"
+ "add z27.h, p0/M, z27.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z16.s }, p0/Z, [x20]\n"
+ "mov x12, #0x8\n"
+ "add z16.h, p0/M, z16.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z28.s }, p0/Z, [x20]\n"
+ "add z28.h, p0/M, z28.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z31.s }, p0/Z, [x20]\n"
+ "add z31.h, p0/M, z31.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "trn1 z26.h, z26.h, z17.h\n"
+ "trn1 z27.h, z27.h, z16.h\n"
+ "ld1b { z0.s }, p0/Z, [x20]\n"
+ "add z0.h, p0/M, z0.h, z7.h\n"
+ "trn1 z28.h, z28.h, z31.h\n"
+ "addvl x21, SP, #6\n"
+ ".inst 0xc1731708 // sdot za.s[x8, 0], { z24.h-z27.h }, z3.h\n"
+ "mov z29.d, z0.d\n"
+ "addvl x20, SP, #12\n"
+ "sub x16, x16, #0x1\n"
+ ".inst 0xc17b1728 // sdot za.s[x8, 0], { z25.h-z28.h }, z11.h\n"
+ ".inst 0xa0402aa8 // ld1h { z8.h-z9.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc1721748 // sdot za.s[x8, 0], { z26.h-z29.h }, z2.h\n"
+ "ld1h { z2.h }, p2/Z, [x21, #2, MUL VL]\n"
+ ".inst 0xc1781709 // sdot za.s[x8, 1], { z24.h-z27.h }, z8.h\n"
+ ".inst 0xc0060c10 // mova { z16.d-z19.d }, za.d[x8, #0]\n"
+ ".inst 0xc1a6ac10 // sqdmulh { z16.s-z19.s }, { z16.s-z19.s }, z6.s\n"
+ ".inst 0xc1791729 // sdot za.s[x8, 1], { z25.h-z28.h }, z9.h\n"
+ ".inst 0xa1402a81 // ld1h { z1.h, z9.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc1a4aa30 // srshl { z16.s-z19.s }, { z16.s-z19.s }, z4.s\n"
+ ".inst 0xc171170a // sdot za.s[x8, 2], { z24.h-z27.h }, z1.h\n"
+ ".inst 0xc1aaab10 // add { z16.s-z19.s }, { z16.s-z19.s }, z10.s\n"
+ ".inst 0xc179172a // sdot za.s[x8, 2], { z25.h-z28.h }, z9.h\n"
+ ".inst 0xc1b5ccb0 // sclamp { z16.s-z19.s }, z5.s, z21.s\n"
+ "st1b { z16.s }, p1, [x15]\n"
+ "add x15, x15, x13\n"
+ ".inst 0xc1721749 // sdot za.s[x8, 1], { z26.h-z29.h }, z2.h\n"
+ "ld1h { z3.h }, p2/Z, [x20, #2, MUL VL]\n"
+ "st1b { z17.s }, p1, [x14]\n"
+ "add x14, x14, x11\n"
+ ".inst 0xc173174a // sdot za.s[x8, 2], { z26.h-z29.h }, z3.h\n"
+ "add x8, x8, #0x1\n"
+ "st1b { z18.s }, p1, [x10]\n"
+ "add x10, x10, x28\n"
+ "st1b { z19.s }, p1, [x9]\n"
+ "add x9, x9, x27\n"
+ ".inst 0xc0040d84 // mova za.d[x8, #4], { z12.d-z15.d }\n"
+ "23:" // Tail input: End
+ "cbz x16, 25f\n"
+ "24:" // Right padding loop
+ ".inst 0xc0060c1c // mova { z28.d-z31.d }, za.d[x8, #0]\n"
+ ".inst 0xc1a6ac1c // sqdmulh { z28.s-z31.s }, { z28.s-z31.s }, z6.s\n"
+ "add x8, x8, #0x1\n"
+ ".inst 0xc1a4aa3c // srshl { z28.s-z31.s }, { z28.s-z31.s }, z4.s\n"
+ "subs x16, x16, #0x1\n"
+ ".inst 0xc0040d84 // mova za.d[x8, #4], { z12.d-z15.d }\n"
+ ".inst 0xc1aaab1c // add { z28.s-z31.s }, { z28.s-z31.s }, z10.s\n"
+ ".inst 0xc1b5ccbc // sclamp { z28.s-z31.s }, z5.s, z21.s\n"
+ "st1b { z28.s }, p1, [x15]\n"
+ "add x15, x15, x13\n"
+ "st1b { z29.s }, p1, [x14]\n"
+ "add x14, x14, x11\n"
+ "st1b { z30.s }, p1, [x10]\n"
+ "add x10, x10, x28\n"
+ "st1b { z31.s }, p1, [x9]\n"
+ "add x9, x9, x27\n"
+ "bgt 24b\n"
+ "25:" // End
+ "ldr x20, [%x[args], %[offsetof_Args_weights]]\n"
+ "incw x20, ALL, MUL #16\n"
+ "incw x20, ALL, MUL #9\n"
+ "str x20, [%x[args], %[offsetof_Args_weights]]\n"
+ "ldr x21, [%x[args], %[offsetof_Args_ld_in_vl]]\n"
+ "incw x6\n"
+ "whilelt p1.s, x6, x5\n"
+ "ldr x20, [%x[args], %[offsetof_Args_inptr]]\n"
+ "add x20, x20, x21\n"
+ "str x20, [%x[args], %[offsetof_Args_inptr]]\n"
+ "ldr x25, [%x[args], %[offsetof_Args_outptrs]]\n"
+ "ldr x24, [%x[args], %[offsetof_Args_ld_out_vls]]\n"
+ "ldp x23, x22, [x25, #0x0]\n"
+ "ldp x21, x20, [x24, #0x0]\n"
+ "add x23, x23, x21\n"
+ "add x22, x22, x20\n"
+ "stp x23, x22, [x25, #0x0]\n"
+ "ldp x23, x22, [x25, #0x10]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x23, x23, x21\n"
+ "add x22, x22, x20\n"
+ "stp x23, x22, [x25, #0x10]\n"
+ "b.any 1b\n"
+ "addvl SP, SP, #15\n"
+ ".inst 0xd503467f // SMSTOP\n"
+ :
+ : [args] "r" (&args), [ld_in_col] "r" (ld_in_col), [ld_in_row] "r" (ld_in_row), [offsetof_Args_current_channel] "I" (offsetof(Args, current_channel)), [offsetof_Args_inptr] "I" (offsetof(Args, inptr)), [offsetof_Args_input_cols] "I" (offsetof(Args, input_cols)), [offsetof_Args_ld_in_vl] "I" (offsetof(Args, ld_in_vl)), [offsetof_Args_ld_out_cols] "I" (offsetof(Args, ld_out_cols)), [offsetof_Args_ld_out_vls] "I" (offsetof(Args, ld_out_vls)), [offsetof_Args_n_channels] "I" (offsetof(Args, n_channels)), [offsetof_Args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_Args_output_cols] "I" (offsetof(Args, output_cols)), [offsetof_Args_pad_bottom] "I" (offsetof(Args, pad_bottom)), [offsetof_Args_pad_left] "I" (offsetof(Args, pad_left)), [offsetof_Args_pad_top] "I" (offsetof(Args, pad_top)), [offsetof_Args_weights] "I" (offsetof(Args, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_bias] "I" (offsetof(arm_gemm::Requantize32, bias)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [offsetof_Requantize32_per_channel_muls] "I" (offsetof(arm_gemm::Requantize32, per_channel_muls)), [offsetof_Requantize32_per_channel_right_shifts] "I" (offsetof(arm_gemm::Requantize32, per_channel_right_shifts)), [offsetof_Requantize32_per_layer_mul] "I" (offsetof(arm_gemm::Requantize32, per_layer_mul)), [offsetof_Requantize32_per_layer_right_shift] "I" (offsetof(arm_gemm::Requantize32, per_layer_right_shift)), [qp] "r" (&qp)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME2)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
new file mode 100644
index 0000000000..edee21e941
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_indirect_impl(const __fp16 *const *const input_ptrs, __fp16 *const *const outptrs, const void *params, unsigned int n_channels, const __fp16 activation_min, const __fp16 activation_max);
+void sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl(const unsigned int n_tile_rows, const unsigned int n_tile_cols, const __fp16 *inptr, int64_t ld_input_row, int64_t ld_input_col, __fp16 *outptr, int64_t ld_output_row, int64_t ld_output_col, const void *params, unsigned int n_channels, const __fp16 activation_min, const __fp16 activation_max);
+
+class sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy<__fp16, __fp16, __fp16, __fp16>
+{
+ private:
+ using Parent = DepthwiseDepthfirstStrategy<__fp16, __fp16, __fp16, __fp16>;
+ Parent::IndirectKernelType m_indirect_kernel = sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_indirect_impl;  // reads inputs through an array of row/col pointers
+ Parent::DirectKernelType m_direct_kernel = sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl;  // walks a dense input tensor via ld_input_row/ld_input_col strides
+
+ public:
+ using return_type = __fp16;
+ constexpr static auto vl_type = arm_gemm::VLType::SVE;  // vector-length-agnostic SVE kernel
+
+ constexpr static unsigned int kernel_rows = 3;  // 3x3 depthwise filter
+ constexpr static unsigned int kernel_cols = 3;
+
+ constexpr static unsigned int stride_rows = 1;  // unit stride in both spatial dimensions
+ constexpr static unsigned int stride_cols = 1;
+
+ constexpr static unsigned int output_rows = 2;  // each kernel invocation produces a 2x2 output tile
+ constexpr static unsigned int output_cols = 2;
+
+ sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst(const CPUInfo *)  // CPUInfo unused: kernel shape is fixed at compile time
+ : Parent(output_rows, output_cols, kernel_rows, kernel_cols, stride_rows, stride_cols) {}
+
+ arm_gemm::VLType get_vl_type(void) const override { return vl_type; }
+
+ Parent::IndirectKernelType get_indirect_kernel() const override { return m_indirect_kernel; }
+ Parent::DirectKernelType get_direct_kernel() const override { return m_direct_kernel; }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp
new file mode 100644
index 0000000000..d807856ccb
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp
@@ -0,0 +1,316 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl(
+ const unsigned int n_tile_rows,
+ const unsigned int n_tile_cols,
+ const __fp16 *inptr,
+ int64_t ld_input_row,
+ int64_t ld_input_col,
+ __fp16 *outptr,
+ int64_t ld_output_row,
+ int64_t ld_output_col,
+ const void *params,
+ unsigned int n_channels,
+ const __fp16 activation_min,
+ const __fp16 activation_max
+)
+{
+ struct Args  // argument block marshalled for the inline asm; fields are read via offsetof() byte offsets
+ {
+ const uint64_t n_tile_rows, n_tile_cols;
+ const __fp16 *inptr;
+ const uint64_t ld_input_row;
+ const uint64_t ld_input_col;
+ __fp16 *outptr;
+ const uint64_t ld_output_row;
+ const uint64_t ld_output_col;
+ const void *params;
+ const __fp16 min, max;  // activation clamp bounds applied via fmax/fmin below
+
+ uint64_t tile_i = 0, tile_j = 0;  // current tile coordinates; updated by the asm each iteration of the tile loop
+
+ Args(
+ const unsigned int n_tile_rows,
+ const unsigned int n_tile_cols,
+ const __fp16 *inptr,
+ int64_t ld_input_row,
+ int64_t ld_input_col,
+ __fp16 *outptr,
+ int64_t ld_output_row,
+ int64_t ld_output_col,
+ const void *params,
+ const float activation_min,  // NOTE(review): float parameters narrowing into __fp16 members — matches the upstream generator's output; confirm intentional
+ const float activation_max
+ ) : n_tile_rows(n_tile_rows), n_tile_cols(n_tile_cols), inptr(inptr),
+ ld_input_row(ld_input_row), ld_input_col(ld_input_col), outptr(outptr),
+ ld_output_row(ld_output_row), ld_output_col(ld_output_col),
+ params(params), min(activation_min), max(activation_max)
+ {
+ }
+ };
+
+ Args params_struct(
+ n_tile_rows, n_tile_cols,
+ inptr, ld_input_row, ld_input_col,
+ outptr, ld_output_row, ld_output_col,
+ params, activation_min, activation_max
+ );
+
+ __asm__ __volatile__(  // SVE kernel: per tile, 3x3 fp16 MLA accumulated into a 2x2 output tile, then clamped to [min, max]
+ "ptrue p3.b\n" // p3: all-true predicate used for weight/parameter loads
+ "mov x10, #0x0\n" // tile_i = 0
+ "mov x14, #0x0\n" // tile_j = 0
+ "1:" // Tile loop
+ "str x10, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "mov x25, #0x2\n"
+ "mov x24, #0x2\n"
+ "str x14, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "ldr x23, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
+ "ldr x22, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
+ "mul x21, x10, x23\n" // offset = tile_i * ld_input_row
+ "ldr x13, [%x[params_struct], %[offsetof_args_ld_input_col]]\n"
+ "ldr x12, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
+ "mul x20, x10, x22\n" // offset = tile_i * ld_output_row
+ "cnth x11\n" // x11 = number of fp16 lanes per vector
+ "madd x21, x14, x13, x21\n" // offset += tile_j * ld_input_col
+ "ldr x10, [%x[params_struct], %[offsetof_args_params]]\n"
+ "ldr x9, [%x[params_struct], %[offsetof_args_inptr]]\n"
+ "whilelt p2.h, XZR, %x[n_channels]\n" // p2: active lanes for the current channel chunk
+ "madd x20, x14, x12, x20\n" // offset += tile_j * ld_output_col
+ "ldr x28, [%x[params_struct], %[offsetof_args_outptr]]\n"
+ "ld1h { z27.h }, p3/Z, [x10]\n"
+ "add x27, x13, x13\n"
+ "mul x21, x21, x25\n" // offset *= kernel_stride * output_size
+ "add x9, x9, x21, LSL #1\n" // inptr[0] += offset * sizeof(__fp16)
+ "ld1h { z0.h }, p3/Z, [x10, #1, MUL VL]\n"
+ "ld1h { z1.h }, p3/Z, [x10, #2, MUL VL]\n"
+ "mul x20, x20, x24\n" // offset *= output_tile_size
+ "ld1h { z2.h }, p3/Z, [x10, #3, MUL VL]\n"
+ "ld1h { z3.h }, p3/Z, [x10, #4, MUL VL]\n"
+ "add x26, x9, x23, LSL #1\n"
+ "ld1h { z4.h }, p3/Z, [x10, #5, MUL VL]\n"
+ "ld1h { z5.h }, p3/Z, [x10, #6, MUL VL]\n"
+ "add x25, x26, x23, LSL #1\n"
+ "add x24, x27, x13\n"
+ "ld1h { z6.h }, p3/Z, [x10, #7, MUL VL]\n"
+ "addvl x10, x10, #16\n"
+ "add x28, x28, x20, LSL #1\n" // outptrs[0] += offset * sizeof(__fp16)
+ "ld1rh { z26.h }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+ "cmp x11, %x[n_channels]\n"
+ "add x23, x25, x23, LSL #1\n"
+ "ld1rh { z25.h }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+ "ld1h { z7.h }, p3/Z, [x10, #-8, MUL VL]\n"
+ "add x22, x28, x22, LSL #1\n"
+ "mov x21, #0x0\n"
+ "ld1h { z8.h }, p3/Z, [x10, #-7, MUL VL]\n"
+ "ld1h { z9.h }, p2/Z, [x26, x13, LSL #1]\n"
+ "sub x20, XZR, x11\n"
+ "ld1h { z10.h }, p2/Z, [x9]\n"
+ "ld1h { z11.h }, p2/Z, [x9, x24, LSL #1]\n"
+ "addvl x10, x10, #-6\n"
+ "ld1h { z12.h }, p2/Z, [x26, x27, LSL #1]\n"
+ "ld1h { z13.h }, p2/Z, [x25, x13, LSL #1]\n"
+ "bge 3f\n" // fewer channels than one full vector -> straight to the predicated tail
+ "2:" // Tile loop: Channel loop
+ "movprfx z24, z27\n fmla z24.h, p3/M, z4.h, z9.h\n"
+ "movprfx z23, z27\n fmla z23.h, p3/M, z3.h, z9.h\n"
+ "whilelt p1.h, x11, %x[n_channels]\n"
+ "inch x21\n"
+ "movprfx z22, z27\n fmla z22.h, p3/M, z1.h, z9.h\n"
+ "movprfx z21, z27\n fmla z21.h, p3/M, z0.h, z9.h\n"
+ "ld1h { z18.h }, p2/Z, [x23]\n"
+ "inch x11\n"
+ "fmla z24.h, p3/M, z0.h, z10.h\n"
+ "fmla z23.h, p3/M, z2.h, z11.h\n"
+ "ld1h { z17.h }, p2/Z, [x23, x24, LSL #1]\n"
+ "ld1h { z20.h }, p2/Z, [x25, x27, LSL #1]\n"
+ "fmla z22.h, p3/M, z2.h, z12.h\n"
+ "fmla z21.h, p3/M, z1.h, z12.h\n"
+ "mov p0.b, p2.b\n"
+ "ld1h { z27.h }, p3/Z, [x10]\n"
+ "fmla z24.h, p3/M, z5.h, z12.h\n"
+ "fmla z23.h, p3/M, z4.h, z12.h\n"
+ "ld1h { z16.h }, p2/Z, [x9, x13, LSL #1]\n"
+ "inch x20\n"
+ "fmla z22.h, p3/M, z6.h, z18.h\n"
+ "fmla z21.h, p3/M, z3.h, z13.h\n"
+ "ld1h { z18.h }, p2/Z, [x9, x27, LSL #1]\n"
+ "addvl x9, x9, #1\n"
+ "fmla z24.h, p3/M, z7.h, z13.h\n"
+ "fmla z23.h, p3/M, z6.h, z13.h\n"
+ "fmla z22.h, p3/M, z4.h, z13.h\n"
+ "fmla z21.h, p3/M, z8.h, z17.h\n"
+ "ld1h { z17.h }, p2/Z, [x26]\n"
+ "fmla z24.h, p3/M, z1.h, z16.h\n"
+ "fmla z23.h, p3/M, z0.h, z16.h\n"
+ "ld1h { z16.h }, p2/Z, [x26, x24, LSL #1]\n"
+ "addvl x26, x26, #1\n"
+ "fmla z22.h, p3/M, z5.h, z20.h\n"
+ "fmla z21.h, p3/M, z4.h, z20.h\n"
+ "ld1h { z4.h }, p3/Z, [x10, #5, MUL VL]\n"
+ "fmla z24.h, p3/M, z2.h, z18.h\n"
+ "fmla z23.h, p3/M, z1.h, z18.h\n"
+ "ld1h { z19.h }, p2/Z, [x25]\n"
+ "ld1h { z1.h }, p3/Z, [x10, #2, MUL VL]\n"
+ "fmla z22.h, p3/M, z0.h, z17.h\n"
+ "fmla z21.h, p3/M, z2.h, z16.h\n"
+ "ld1h { z0.h }, p3/Z, [x10, #1, MUL VL]\n"
+ "ld1h { z2.h }, p3/Z, [x10, #3, MUL VL]\n"
+ "fmla z24.h, p3/M, z8.h, z20.h\n"
+ "fmla z23.h, p3/M, z7.h, z20.h\n"
+ "ld1h { z18.h }, p2/Z, [x25, x24, LSL #1]\n"
+ "addvl x25, x25, #1\n"
+ "fmla z22.h, p3/M, z3.h, z19.h\n"
+ "fmla z21.h, p3/M, z5.h, z18.h\n"
+ "ld1h { z13.h }, p1/Z, [x25, x13, LSL #1]\n"
+ "fmla z24.h, p3/M, z3.h, z17.h\n"
+ "ld1h { z17.h }, p2/Z, [x23, x13, LSL #1]\n"
+ "fmla z23.h, p3/M, z5.h, z16.h\n"
+ "ld1h { z16.h }, p2/Z, [x23, x27, LSL #1]\n"
+ "fmla z22.h, p3/M, z7.h, z17.h\n"
+ "fmla z21.h, p3/M, z6.h, z17.h\n"
+ "ld1h { z3.h }, p3/Z, [x10, #4, MUL VL]\n"
+ "ld1h { z5.h }, p3/Z, [x10, #6, MUL VL]\n"
+ "fmla z24.h, p3/M, z6.h, z19.h\n"
+ "fmla z23.h, p3/M, z8.h, z18.h\n"
+ "fmax z24.h, p3/M, z24.h, z26.h\n" // clamp accumulators to activation_min
+ "fmax z23.h, p3/M, z23.h, z26.h\n"
+ "fmla z22.h, p3/M, z8.h, z16.h\n"
+ "fmla z21.h, p3/M, z7.h, z16.h\n"
+ "fmax z22.h, p3/M, z22.h, z26.h\n"
+ "fmax z21.h, p3/M, z21.h, z26.h\n"
+ "ld1h { z6.h }, p3/Z, [x10, #7, MUL VL]\n"
+ "addvl x10, x10, #16\n"
+ "whilelt p2.h, x21, %x[n_channels]\n"
+ "ld1h { z9.h }, p1/Z, [x26, x13, LSL #1]\n"
+ "cmp x11, %x[n_channels]\n"
+ "fmin z24.h, p3/M, z24.h, z25.h\n" // clamp accumulators to activation_max
+ "ld1h { z10.h }, p1/Z, [x9]\n"
+ "ld1h { z11.h }, p1/Z, [x9, x24, LSL #1]\n"
+ "fmin z23.h, p3/M, z23.h, z25.h\n"
+ "fmin z22.h, p3/M, z22.h, z25.h\n"
+ "ld1h { z12.h }, p1/Z, [x26, x27, LSL #1]\n"
+ "st1h { z24.h }, p0, [x28]\n"
+ "fmin z21.h, p3/M, z21.h, z25.h\n"
+ "addvl x23, x23, #1\n"
+ "st1h { z23.h }, p0, [x28, x12, LSL #1]\n"
+ "ld1h { z7.h }, p3/Z, [x10, #-8, MUL VL]\n"
+ "st1h { z22.h }, p0, [x22]\n"
+ "addvl x28, x28, #1\n"
+ "ld1h { z8.h }, p3/Z, [x10, #-7, MUL VL]\n"
+ "addvl x10, x10, #-6\n"
+ "st1h { z21.h }, p0, [x22, x12, LSL #1]\n"
+ "addvl x22, x22, #1\n"
+ "blt 2b\n" // loop while full vectors of channels remain
+ "3:" // Tile loop: Channel tail
+ "movprfx z24, z27\n fmla z24.h, p3/M, z4.h, z9.h\n"
+ "movprfx z23, z27\n fmla z23.h, p3/M, z3.h, z9.h\n"
+ "ldr x14, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "ldr x10, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "movprfx z22, z27\n fmla z22.h, p3/M, z1.h, z9.h\n"
+ "movprfx z21, z27\n fmla z21.h, p3/M, z0.h, z9.h\n"
+ "ld1h { z18.h }, p2/Z, [x23]\n"
+ "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+ "fmla z24.h, p3/M, z0.h, z10.h\n"
+ "fmla z23.h, p3/M, z2.h, z11.h\n"
+ "ld1h { z17.h }, p2/Z, [x23, x24, LSL #1]\n"
+ "ld1h { z20.h }, p2/Z, [x25, x27, LSL #1]\n"
+ "fmla z22.h, p3/M, z2.h, z12.h\n"
+ "fmla z21.h, p3/M, z1.h, z12.h\n"
+ "add x14, x14, #0x1\n" // tile_j += 1
+ "cmp x14, x20\n"
+ "fmla z24.h, p3/M, z5.h, z12.h\n"
+ "fmla z23.h, p3/M, z4.h, z12.h\n"
+ "ld1h { z16.h }, p2/Z, [x9, x13, LSL #1]\n"
+ "add x21, x10, #0x1\n"
+ "fmla z22.h, p3/M, z6.h, z18.h\n"
+ "fmla z21.h, p3/M, z3.h, z13.h\n"
+ "ld1h { z18.h }, p2/Z, [x9, x27, LSL #1]\n"
+ "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
+ "fmla z24.h, p3/M, z7.h, z13.h\n"
+ "fmla z23.h, p3/M, z6.h, z13.h\n"
+ "csel x10, x10, x21, LT\n" // if tile_j wrapped, advance tile_i
+ "mov p0.b, p2.b\n"
+ "fmla z22.h, p3/M, z4.h, z13.h\n"
+ "fmla z21.h, p3/M, z8.h, z17.h\n"
+ "ld1h { z17.h }, p2/Z, [x26]\n"
+ "csel x14, x14, XZR, LT\n" // if tile_j wrapped, reset it to zero
+ "fmla z24.h, p3/M, z1.h, z16.h\n"
+ "fmla z23.h, p3/M, z0.h, z16.h\n"
+ "ld1h { z16.h }, p2/Z, [x26, x24, LSL #1]\n"
+ "cmp x10, x20\n"
+ "fmla z22.h, p3/M, z5.h, z20.h\n"
+ "fmla z21.h, p3/M, z4.h, z20.h\n"
+ "fmla z24.h, p3/M, z2.h, z18.h\n"
+ "fmla z23.h, p3/M, z1.h, z18.h\n"
+ "ld1h { z19.h }, p2/Z, [x25]\n"
+ "fmla z22.h, p3/M, z0.h, z17.h\n"
+ "fmla z21.h, p3/M, z2.h, z16.h\n"
+ "fmla z24.h, p3/M, z8.h, z20.h\n"
+ "fmla z23.h, p3/M, z7.h, z20.h\n"
+ "ld1h { z18.h }, p2/Z, [x25, x24, LSL #1]\n"
+ "fmla z22.h, p3/M, z3.h, z19.h\n"
+ "fmla z21.h, p3/M, z5.h, z18.h\n"
+ "fmla z24.h, p3/M, z3.h, z17.h\n"
+ "ld1h { z17.h }, p2/Z, [x23, x13, LSL #1]\n"
+ "fmla z23.h, p3/M, z5.h, z16.h\n"
+ "ld1h { z16.h }, p2/Z, [x23, x27, LSL #1]\n"
+ "fmla z22.h, p3/M, z7.h, z17.h\n"
+ "fmla z21.h, p3/M, z6.h, z17.h\n"
+ "fmla z24.h, p3/M, z6.h, z19.h\n"
+ "fmla z23.h, p3/M, z8.h, z18.h\n"
+ "fmax z24.h, p3/M, z24.h, z26.h\n" // clamp to activation_min
+ "fmax z23.h, p3/M, z23.h, z26.h\n"
+ "fmla z22.h, p3/M, z8.h, z16.h\n"
+ "fmla z21.h, p3/M, z7.h, z16.h\n"
+ "fmax z22.h, p3/M, z22.h, z26.h\n"
+ "fmax z21.h, p3/M, z21.h, z26.h\n"
+ "fmin z24.h, p3/M, z24.h, z25.h\n" // clamp to activation_max
+ "fmin z23.h, p3/M, z23.h, z25.h\n"
+ "st1h { z24.h }, p0, [x28]\n"
+ "fmin z22.h, p3/M, z22.h, z25.h\n"
+ "fmin z21.h, p3/M, z21.h, z25.h\n"
+ "st1h { z23.h }, p0, [x28, x12, LSL #1]\n"
+ "st1h { z22.h }, p0, [x22]\n"
+ "st1h { z21.h }, p0, [x22, x12, LSL #1]\n"
+ "blt 1b\n" // next tile while tile_i < n_tile_rows
+ :
+ : [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "x9", "x10", "x11", "x12", "x13", "x14", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp
new file mode 100644
index 0000000000..90982b6990
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp
@@ -0,0 +1,296 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_indirect_impl(
+ const __fp16 *const *const input_ptrs,
+ __fp16 *const *const outptrs,
+ const void *params,
+ unsigned int n_channels,
+ const __fp16 activation_min,
+ const __fp16 activation_max
+)
+{
+ struct Args
+ {
+ __fp16 *const *outptrs;
+ const void *params;
+ const __fp16 min, max;
+ const __fp16 *inptrs[16];
+
+ Args(
+ const __fp16 *const *const input_ptrs,
+ __fp16 *const *const outptrs,
+ const void *const params,
+ const __fp16 min,
+ const __fp16 max
+ ) : outptrs(outptrs), params(params), min(min), max(max)
+ {
+ inptrs[0] = input_ptrs[5];
+ inptrs[1] = input_ptrs[0];
+ inptrs[2] = input_ptrs[3];
+ inptrs[3] = input_ptrs[6];
+ inptrs[4] = input_ptrs[9];
+ inptrs[5] = input_ptrs[12];
+ inptrs[6] = input_ptrs[15];
+ inptrs[7] = input_ptrs[1];
+ inptrs[8] = input_ptrs[2];
+ inptrs[9] = input_ptrs[10];
+ inptrs[10] = input_ptrs[4];
+ inptrs[11] = input_ptrs[7];
+ inptrs[12] = input_ptrs[8];
+ inptrs[13] = input_ptrs[11];
+ inptrs[14] = input_ptrs[13];
+ inptrs[15] = input_ptrs[14];
+
+ }
+ };
+
+ Args params_struct(input_ptrs, outptrs, params,
+ activation_min, activation_max);
+
+ __asm__ __volatile__(
+ "ptrue p3.b\n"
+ "ldr x20, [%x[params_struct], %[offsetof_args_outptrs]]\n"
+ "ldr x16, [%x[params_struct], %[offsetof_args_params]]\n"
+ "add x15, %x[params_struct], %[offsetof_Args_inptrs]\n"
+ "cnth x14\n"
+ "ldp x13, x12, [x20, #0x0]\n"
+ "ldp x11, x10, [x20, #0x10]\n"
+ "mov x9, #0x0\n"
+ "whilelt p2.h, XZR, %x[n_channels]\n"
+ "ld1h { z20.h }, p3/Z, [x16]\n"
+ "ld1h { z0.h }, p3/Z, [x16, #1, MUL VL]\n"
+ "cmp x14, %x[n_channels]\n"
+ "ld1h { z1.h }, p3/Z, [x16, #2, MUL VL]\n"
+ "ld1h { z2.h }, p3/Z, [x16, #3, MUL VL]\n"
+ "sub x28, XZR, x14\n"
+ "ld1h { z3.h }, p3/Z, [x16, #4, MUL VL]\n"
+ "ld1h { z4.h }, p3/Z, [x16, #5, MUL VL]\n"
+ "ld1h { z5.h }, p3/Z, [x16, #6, MUL VL]\n"
+ "ld1h { z6.h }, p3/Z, [x16, #7, MUL VL]\n"
+ "addvl x16, x16, #16\n"
+ "ldp x24, x23, [x15, #0x0]\n"
+ "ldp x22, x21, [x15, #0x10]\n"
+ "ldr x20, [x15, #0x20]\n"
+ "ld1rh { z26.h }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+ "ld1rh { z25.h }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+ "ld1h { z7.h }, p3/Z, [x16, #-8, MUL VL]\n"
+ "ld1h { z8.h }, p3/Z, [x16, #-7, MUL VL]\n"
+ "ld1h { z9.h }, p2/Z, [x24, x9, LSL #1]\n"
+ "addvl x16, x16, #-6\n"
+ "ld1h { z10.h }, p2/Z, [x23, x9, LSL #1]\n"
+ "ld1h { z11.h }, p2/Z, [x22, x9, LSL #1]\n"
+ "ld1h { z12.h }, p2/Z, [x21, x9, LSL #1]\n"
+ "ld1h { z13.h }, p2/Z, [x20, x9, LSL #1]\n"
+ "bge 2f\n"
+ "1:" // Channel loop
+ "movprfx z24, z20\n fmla z24.h, p3/M, z4.h, z9.h\n"
+ "movprfx z23, z20\n fmla z23.h, p3/M, z3.h, z9.h\n"
+ "ldr x21, [x15, #0x28]\n"
+ "ldr x20, [x15, #0x30]\n"
+ "movprfx z22, z20\n fmla z22.h, p3/M, z1.h, z9.h\n"
+ "movprfx z21, z20\n fmla z21.h, p3/M, z0.h, z9.h\n"
+ "ld1h { z18.h }, p2/Z, [x21, x9, LSL #1]\n"
+ "ldr x22, [x15, #0x38]\n"
+ "fmla z24.h, p3/M, z0.h, z10.h\n"
+ "fmla z23.h, p3/M, z2.h, z11.h\n"
+ "ld1h { z17.h }, p2/Z, [x20, x9, LSL #1]\n"
+ "ldr x21, [x15, #0x48]\n"
+ "fmla z22.h, p3/M, z2.h, z12.h\n"
+ "fmla z21.h, p3/M, z1.h, z12.h\n"
+ "ldr x20, [x15, #0x40]\n"
+ "ld1h { z20.h }, p2/Z, [x21, x9, LSL #1]\n"
+ "fmla z24.h, p3/M, z5.h, z12.h\n"
+ "fmla z23.h, p3/M, z4.h, z12.h\n"
+ "ld1h { z16.h }, p2/Z, [x22, x9, LSL #1]\n"
+ "ldr x22, [x15, #0x50]\n"
+ "fmla z22.h, p3/M, z6.h, z18.h\n"
+ "fmla z21.h, p3/M, z3.h, z13.h\n"
+ "ld1h { z18.h }, p2/Z, [x20, x9, LSL #1]\n"
+ "ldr x21, [x15, #0x58]\n"
+ "fmla z24.h, p3/M, z7.h, z13.h\n"
+ "fmla z23.h, p3/M, z6.h, z13.h\n"
+ "ldr x20, [x15, #0x60]\n"
+ "ldr x27, [x15, #0x68]\n"
+ "fmla z22.h, p3/M, z4.h, z13.h\n"
+ "fmla z21.h, p3/M, z8.h, z17.h\n"
+ "ld1h { z17.h }, p2/Z, [x22, x9, LSL #1]\n"
+ "ldr x26, [x15, #0x70]\n"
+ "fmla z24.h, p3/M, z1.h, z16.h\n"
+ "fmla z23.h, p3/M, z0.h, z16.h\n"
+ "ld1h { z16.h }, p2/Z, [x21, x9, LSL #1]\n"
+ "ldr x25, [x15, #0x78]\n"
+ "fmla z22.h, p3/M, z5.h, z20.h\n"
+ "fmla z21.h, p3/M, z4.h, z20.h\n"
+ "whilelt p1.h, x14, %x[n_channels]\n"
+ "ldp x24, x23, [x15, #0x0]\n"
+ "fmla z24.h, p3/M, z2.h, z18.h\n"
+ "fmla z23.h, p3/M, z1.h, z18.h\n"
+ "ld1h { z19.h }, p2/Z, [x20, x9, LSL #1]\n"
+ "ldp x22, x21, [x15, #0x10]\n"
+ "fmla z22.h, p3/M, z0.h, z17.h\n"
+ "fmla z21.h, p3/M, z2.h, z16.h\n"
+ "ldr x20, [x15, #0x20]\n"
+ "ld1h { z13.h }, p1/Z, [x20, x14, LSL #1]\n"
+ "fmla z24.h, p3/M, z8.h, z20.h\n"
+ "fmla z23.h, p3/M, z7.h, z20.h\n"
+ "ld1h { z18.h }, p2/Z, [x27, x9, LSL #1]\n"
+ "inch x28\n"
+ "fmla z22.h, p3/M, z3.h, z19.h\n"
+ "fmla z21.h, p3/M, z5.h, z18.h\n"
+ "mov p0.b, p2.b\n"
+ "ld1h { z20.h }, p3/Z, [x16]\n"
+ "fmla z24.h, p3/M, z3.h, z17.h\n"
+ "ld1h { z17.h }, p2/Z, [x26, x9, LSL #1]\n"
+ "fmla z23.h, p3/M, z5.h, z16.h\n"
+ "ld1h { z16.h }, p2/Z, [x25, x9, LSL #1]\n"
+ "fmla z22.h, p3/M, z7.h, z17.h\n"
+ "fmla z21.h, p3/M, z6.h, z17.h\n"
+ "inch x9\n"
+ "ld1h { z11.h }, p1/Z, [x22, x14, LSL #1]\n"
+ "fmla z24.h, p3/M, z6.h, z19.h\n"
+ "fmla z23.h, p3/M, z8.h, z18.h\n"
+ "ld1h { z9.h }, p1/Z, [x24, x14, LSL #1]\n"
+ "ld1h { z10.h }, p1/Z, [x23, x14, LSL #1]\n"
+ "fmla z22.h, p3/M, z8.h, z16.h\n"
+ "fmla z21.h, p3/M, z7.h, z16.h\n"
+ "ld1h { z12.h }, p1/Z, [x21, x14, LSL #1]\n"
+ "inch x14\n"
+ "fmax z24.h, p3/M, z24.h, z26.h\n"
+ "fmax z23.h, p3/M, z23.h, z26.h\n"
+ "ld1h { z0.h }, p3/Z, [x16, #1, MUL VL]\n"
+ "ld1h { z1.h }, p3/Z, [x16, #2, MUL VL]\n"
+ "fmax z22.h, p3/M, z22.h, z26.h\n"
+ "fmax z21.h, p3/M, z21.h, z26.h\n"
+ "ld1h { z2.h }, p3/Z, [x16, #3, MUL VL]\n"
+ "ld1h { z3.h }, p3/Z, [x16, #4, MUL VL]\n"
+ "ld1h { z4.h }, p3/Z, [x16, #5, MUL VL]\n"
+ "ld1h { z5.h }, p3/Z, [x16, #6, MUL VL]\n"
+ "whilelt p2.h, x9, %x[n_channels]\n"
+ "cmp x14, %x[n_channels]\n"
+ "ld1h { z6.h }, p3/Z, [x16, #7, MUL VL]\n"
+ "addvl x16, x16, #16\n"
+ "fmin z24.h, p3/M, z24.h, z25.h\n"
+ "st1h { z24.h }, p0, [x13, x28, LSL #1]\n"
+ "fmin z23.h, p3/M, z23.h, z25.h\n"
+ "fmin z22.h, p3/M, z22.h, z25.h\n"
+ "st1h { z23.h }, p0, [x12, x28, LSL #1]\n"
+ "ld1h { z7.h }, p3/Z, [x16, #-8, MUL VL]\n"
+ "fmin z21.h, p3/M, z21.h, z25.h\n"
+ "st1h { z22.h }, p0, [x11, x28, LSL #1]\n"
+ "ld1h { z8.h }, p3/Z, [x16, #-7, MUL VL]\n"
+ "addvl x16, x16, #-6\n"
+ "st1h { z21.h }, p0, [x10, x28, LSL #1]\n"
+ "blt 1b\n"
+ "2:" // Channel tail
+ "movprfx z24, z20\n fmla z24.h, p3/M, z4.h, z9.h\n"
+ "movprfx z23, z20\n fmla z23.h, p3/M, z3.h, z9.h\n"
+ "ldr x21, [x15, #0x28]\n"
+ "ldr x20, [x15, #0x30]\n"
+ "movprfx z22, z20\n fmla z22.h, p3/M, z1.h, z9.h\n"
+ "movprfx z21, z20\n fmla z21.h, p3/M, z0.h, z9.h\n"
+ "ld1h { z18.h }, p2/Z, [x21, x9, LSL #1]\n"
+ "ldr x22, [x15, #0x38]\n"
+ "fmla z24.h, p3/M, z0.h, z10.h\n"
+ "fmla z23.h, p3/M, z2.h, z11.h\n"
+ "ld1h { z17.h }, p2/Z, [x20, x9, LSL #1]\n"
+ "ldr x21, [x15, #0x48]\n"
+ "fmla z22.h, p3/M, z2.h, z12.h\n"
+ "fmla z21.h, p3/M, z1.h, z12.h\n"
+ "ldr x20, [x15, #0x40]\n"
+ "ld1h { z20.h }, p2/Z, [x21, x9, LSL #1]\n"
+ "fmla z24.h, p3/M, z5.h, z12.h\n"
+ "fmla z23.h, p3/M, z4.h, z12.h\n"
+ "ld1h { z16.h }, p2/Z, [x22, x9, LSL #1]\n"
+ "ldr x21, [x15, #0x50]\n"
+ "fmla z22.h, p3/M, z6.h, z18.h\n"
+ "fmla z21.h, p3/M, z3.h, z13.h\n"
+ "ld1h { z18.h }, p2/Z, [x20, x9, LSL #1]\n"
+ "ldr x20, [x15, #0x58]\n"
+ "fmla z24.h, p3/M, z7.h, z13.h\n"
+ "fmla z23.h, p3/M, z6.h, z13.h\n"
+ "ldr x23, [x15, #0x60]\n"
+ "ldr x22, [x15, #0x68]\n"
+ "fmla z22.h, p3/M, z4.h, z13.h\n"
+ "fmla z21.h, p3/M, z8.h, z17.h\n"
+ "ld1h { z17.h }, p2/Z, [x21, x9, LSL #1]\n"
+ "ldr x21, [x15, #0x70]\n"
+ "fmla z24.h, p3/M, z1.h, z16.h\n"
+ "fmla z23.h, p3/M, z0.h, z16.h\n"
+ "ld1h { z16.h }, p2/Z, [x20, x9, LSL #1]\n"
+ "ldr x20, [x15, #0x78]\n"
+ "fmla z22.h, p3/M, z5.h, z20.h\n"
+ "fmla z21.h, p3/M, z4.h, z20.h\n"
+ "inch x28\n"
+ "mov p0.b, p2.b\n"
+ "fmla z24.h, p3/M, z2.h, z18.h\n"
+ "fmla z23.h, p3/M, z1.h, z18.h\n"
+ "ld1h { z19.h }, p2/Z, [x23, x9, LSL #1]\n"
+ "fmla z22.h, p3/M, z0.h, z17.h\n"
+ "fmla z21.h, p3/M, z2.h, z16.h\n"
+ "fmla z24.h, p3/M, z8.h, z20.h\n"
+ "fmla z23.h, p3/M, z7.h, z20.h\n"
+ "ld1h { z18.h }, p2/Z, [x22, x9, LSL #1]\n"
+ "fmla z22.h, p3/M, z3.h, z19.h\n"
+ "fmla z21.h, p3/M, z5.h, z18.h\n"
+ "fmla z24.h, p3/M, z3.h, z17.h\n"
+ "ld1h { z17.h }, p2/Z, [x21, x9, LSL #1]\n"
+ "fmla z23.h, p3/M, z5.h, z16.h\n"
+ "ld1h { z16.h }, p2/Z, [x20, x9, LSL #1]\n"
+ "fmla z22.h, p3/M, z7.h, z17.h\n"
+ "fmla z21.h, p3/M, z6.h, z17.h\n"
+ "fmla z24.h, p3/M, z6.h, z19.h\n"
+ "fmla z23.h, p3/M, z8.h, z18.h\n"
+ "fmax z24.h, p3/M, z24.h, z26.h\n"
+ "fmax z23.h, p3/M, z23.h, z26.h\n"
+ "fmla z22.h, p3/M, z8.h, z16.h\n"
+ "fmla z21.h, p3/M, z7.h, z16.h\n"
+ "fmax z22.h, p3/M, z22.h, z26.h\n"
+ "fmax z21.h, p3/M, z21.h, z26.h\n"
+ "fmin z24.h, p3/M, z24.h, z25.h\n"
+ "fmin z23.h, p3/M, z23.h, z25.h\n"
+ "st1h { z24.h }, p0, [x13, x28, LSL #1]\n"
+ "fmin z22.h, p3/M, z22.h, z25.h\n"
+ "fmin z21.h, p3/M, z21.h, z25.h\n"
+ "st1h { z23.h }, p0, [x12, x28, LSL #1]\n"
+ "st1h { z22.h }, p0, [x11, x28, LSL #1]\n"
+ "st1h { z21.h }, p0, [x10, x28, LSL #1]\n"
+ :
+ : [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp
new file mode 100644
index 0000000000..da2ef72a30
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#include "utils.hpp"
+
+#include <cstdint>
+
+#if defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(const __fp16 *const *const input_ptrs, __fp16 *const *const outptrs, const void *params, unsigned int n_channels, const __fp16 activation_min, const __fp16 activation_max);  // Kernel entry taking per-point input/output pointer arrays.
+void sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl(const unsigned int n_tile_rows, const unsigned int n_tile_cols, const __fp16 *inptr, int64_t ld_input_row, int64_t ld_input_col, __fp16 *outptr, int64_t ld_output_row, int64_t ld_output_col, const void *params, unsigned int n_channels, const __fp16 activation_min, const __fp16 activation_max);  // Kernel entry taking base pointers plus row/column strides for a grid of tiles.
+
+class sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst : public DepthwiseDepthfirstStrategy<__fp16, __fp16, __fp16, __fp16>  // Strategy descriptor binding the two SVE fp16 3x3/s1 kernels above.
+{
+  private:
+  using Parent = DepthwiseDepthfirstStrategy<__fp16, __fp16, __fp16, __fp16>;
+  Parent::IndirectKernelType m_indirect_kernel = sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl;  // Used when input rows are addressed via pointer tables.
+  Parent::DirectKernelType m_direct_kernel = sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl;  // Used when input is a dense strided tensor.
+
+  public:
+  using return_type = __fp16;
+  constexpr static auto vl_type = arm_gemm::VLType::SVE;  // Scalable-vector kernel; channel count handled by predication.
+
+  constexpr static unsigned int kernel_rows = 3;  // 3x3 depthwise filter,
+  constexpr static unsigned int kernel_cols = 3;
+
+  constexpr static unsigned int stride_rows = 1;  // unit stride,
+  constexpr static unsigned int stride_cols = 1;
+
+  constexpr static unsigned int output_rows = 3;  // producing a 3x3 output tile per call.
+  constexpr static unsigned int output_cols = 3;
+
+  sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst(const CPUInfo *)
+  : Parent(output_rows, output_cols, kernel_rows, kernel_cols, stride_rows, stride_cols) {}
+
+  arm_gemm::VLType get_vl_type(void) const override { return vl_type; }
+
+  Parent::IndirectKernelType get_indirect_kernel() const override { return m_indirect_kernel; }
+  Parent::DirectKernelType get_direct_kernel() const override { return m_direct_kernel; }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp
new file mode 100644
index 0000000000..a22ab39d6f
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp
@@ -0,0 +1,460 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl(
+  const unsigned int n_tile_rows,
+  const unsigned int n_tile_cols,
+  const __fp16 *inptr,
+  int64_t ld_input_row,
+  int64_t ld_input_col,
+  __fp16 *outptr,
+  int64_t ld_output_row,
+  int64_t ld_output_col,
+  const void *params,
+  unsigned int n_channels,
+  const __fp16 activation_min,
+  const __fp16 activation_max
+)
+{  // Direct (dense-strided) variant: walks an n_tile_rows x n_tile_cols grid, computing a 3x3 fp16 output tile per iteration with predicated SVE FMLAs.
+  struct Args
+  {
+    const uint64_t n_tile_rows, n_tile_cols;
+    const __fp16 *inptr;
+    const uint64_t ld_input_row;
+    const uint64_t ld_input_col;
+    __fp16 *outptr;
+    const uint64_t ld_output_row;
+    const uint64_t ld_output_col;
+    const void *params;
+    const __fp16 min, max;  // Activation clamp bounds, broadcast into z31/z30 by the asm (fmax/fmin).
+
+    uint64_t tile_i = 0, tile_j = 0;  // Tile coordinates; the asm stores and reloads these across iterations of the tile loop.
+
+    Args(
+      const unsigned int n_tile_rows,
+      const unsigned int n_tile_cols,
+      const __fp16 *inptr,
+      int64_t ld_input_row,
+      int64_t ld_input_col,
+      __fp16 *outptr,
+      int64_t ld_output_row,
+      int64_t ld_output_col,
+      const void *params,
+      const float activation_min,
+      const float activation_max  // NOTE(review): float parameters feeding __fp16 members — lossless for __fp16 arguments; confirm intended.
+    ) : n_tile_rows(n_tile_rows), n_tile_cols(n_tile_cols), inptr(inptr),
+      ld_input_row(ld_input_row), ld_input_col(ld_input_col), outptr(outptr),
+      ld_output_row(ld_output_row), ld_output_col(ld_output_col),
+      params(params), min(activation_min), max(activation_max)
+    {
+    }
+  };
+
+  Args params_struct(
+    n_tile_rows, n_tile_cols,
+    inptr, ld_input_row, ld_input_col,
+    outptr, ld_output_row, ld_output_col,
+    params, activation_min, activation_max
+  );
+
+  __asm__ __volatile__(  // Label 1: tile loop; label 2: per-tile channel loop (vector-length chunks, predicates p2/p1 mask the channel tail); label 3: channel tail.
+    "ptrue p3.b\n"
+    "mov x13, #0x0\n"
+    "mov x8, #0x0\n"
+    "1:" // Tile loop
+    "str x13, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+    "mov x25, #0x3\n"
+    "mov x24, #0x3\n"
+    "str x8, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+    "ldr x23, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
+    "ldr x17, [%x[params_struct], %[offsetof_args_ld_input_col]]\n"
+    "mul x22, x13, x23\n" // offset = tile_i * ld_input_row
+    "ldr x21, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
+    "madd x22, x8, x17, x22\n" // offset += tile_j * ld_input_col
+    "ldr x16, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
+    "cnth x15\n"
+    "mul x20, x13, x21\n" // offset = tile_i * ld_output_row
+    "ldr x14, [%x[params_struct], %[offsetof_args_inptr]]\n"
+    "ldr x13, [%x[params_struct], %[offsetof_args_params]]\n"
+    "add x12, x17, x17\n"
+    "mul x22, x22, x25\n" // offset *= kernel_stride * output_size
+    "add x14, x14, x22, LSL #1\n" // inptr[0] += offset * sizeof(__fp16)
+    "ldr x11, [%x[params_struct], %[offsetof_args_outptr]]\n"
+    "add x10, x14, x23, LSL #1\n"
+    "madd x20, x8, x16, x20\n" // offset += tile_j * ld_output_col
+    "add x9, x10, x23, LSL #1\n"
+    "whilelt p2.h, XZR, %x[n_channels]\n"
+    "ld1h { z14.h }, p3/Z, [x13]\n"
+    "mul x20, x20, x24\n" // offset *= output_tile_size
+    "ld1h { z0.h }, p3/Z, [x13, #1, MUL VL]\n"
+    "ld1h { z1.h }, p3/Z, [x13, #2, MUL VL]\n"
+    "add x28, x9, x23, LSL #1\n"
+    "ld1h { z2.h }, p3/Z, [x13, #3, MUL VL]\n"
+    "ld1h { z3.h }, p3/Z, [x13, #4, MUL VL]\n"
+    "add x27, x12, x17\n"
+    "add x11, x11, x20, LSL #1\n" // outptrs[0] += offset * sizeof(__fp16)
+    "ld1h { z4.h }, p3/Z, [x13, #5, MUL VL]\n"
+    "ld1h { z5.h }, p3/Z, [x13, #6, MUL VL]\n"
+    "add x26, x28, x23, LSL #1\n"
+    "add x25, x27, x17\n"
+    "ld1h { z6.h }, p3/Z, [x13, #7, MUL VL]\n"
+    "addvl x13, x13, #16\n"
+    "add x24, x11, x21, LSL #1\n"
+    "ld1rh { z31.h }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+    "cmp x15, %x[n_channels]\n"
+    "add x23, x24, x21, LSL #1\n"
+    "ld1rh { z30.h }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+    "ld1h { z7.h }, p3/Z, [x13, #-8, MUL VL]\n"
+    "add x22, x16, x16\n"
+    "mov x21, #0x0\n"
+    "ld1h { z8.h }, p3/Z, [x13, #-7, MUL VL]\n"
+    "ld1h { z9.h }, p2/Z, [x9, x12, LSL #1]\n"
+    "sub x20, XZR, x15\n"
+    "ld1h { z10.h }, p2/Z, [x14]\n"
+    "ld1h { z11.h }, p2/Z, [x14, x25, LSL #1]\n"
+    "addvl x13, x13, #-6\n"
+    "ld1h { z12.h }, p2/Z, [x26]\n"
+    "ld1h { z13.h }, p2/Z, [x10, x12, LSL #1]\n"
+    "bge 3f\n"
+    "2:" // Tile loop: Channel loop
+    "movprfx z29, z14\n fmla z29.h, p3/M, z7.h, z9.h\n"
+    "movprfx z28, z14\n fmla z28.h, p3/M, z8.h, z9.h\n"
+    "whilelt p1.h, x15, %x[n_channels]\n"
+    "inch x21\n"
+    "movprfx z27, z14\n fmla z27.h, p3/M, z6.h, z9.h\n"
+    "fmla z29.h, p3/M, z4.h, z13.h\n"
+    "inch x15\n"
+    "mov p0.b, p2.b\n"
+    "movprfx z26, z14\n fmla z26.h, p3/M, z5.h, z9.h\n"
+    "movprfx z25, z14\n fmla z25.h, p3/M, z4.h, z9.h\n"
+    "inch x20\n"
+    "movprfx z24, z14\n fmla z24.h, p3/M, z3.h, z9.h\n"
+    "fmla z28.h, p3/M, z0.h, z10.h\n"
+    "ld1h { z23.h }, p2/Z, [x9, x27, LSL #1]\n"
+    "fmla z27.h, p3/M, z2.h, z11.h\n"
+    "ld1h { z18.h }, p2/Z, [x9, x17, LSL #1]\n"
+    "movprfx z22, z14\n fmla z22.h, p3/M, z2.h, z9.h\n"
+    "fmla z29.h, p3/M, z6.h, z18.h\n"
+    "movprfx z21, z14\n fmla z21.h, p3/M, z0.h, z9.h\n"
+    "fmla z28.h, p3/M, z5.h, z13.h\n"
+    "fmla z27.h, p3/M, z3.h, z13.h\n"
+    "fmla z26.h, p3/M, z2.h, z13.h\n"
+    "fmla z25.h, p3/M, z1.h, z13.h\n"
+    "fmla z24.h, p3/M, z0.h, z13.h\n"
+    "ld1h { z17.h }, p2/Z, [x14, x17, LSL #1]\n"
+    "fmla z22.h, p3/M, z6.h, z12.h\n"
+    "ld1h { z16.h }, p2/Z, [x26, x25, LSL #1]\n"
+    "movprfx z20, z14\n fmla z20.h, p3/M, z1.h, z9.h\n"
+    "fmla z29.h, p3/M, z0.h, z17.h\n"
+    "ld1h { z14.h }, p3/Z, [x13]\n"
+    "fmla z21.h, p3/M, z8.h, z16.h\n"
+    "ld1h { z16.h }, p2/Z, [x14, x27, LSL #1]\n"
+    "fmla z28.h, p3/M, z7.h, z18.h\n"
+    "fmla z20.h, p3/M, z0.h, z18.h\n"
+    "fmla z26.h, p3/M, z4.h, z18.h\n"
+    "fmla z25.h, p3/M, z3.h, z18.h\n"
+    "fmla z22.h, p3/M, z1.h, z18.h\n"
+    "ld1h { z19.h }, p2/Z, [x10]\n"
+    "fmla z29.h, p3/M, z2.h, z16.h\n"
+    "fmla z27.h, p3/M, z1.h, z16.h\n"
+    "ld1h { z18.h }, p2/Z, [x28]\n"
+    "fmla z24.h, p3/M, z4.h, z23.h\n"
+    "fmla z28.h, p3/M, z1.h, z17.h\n"
+    "ld1h { z16.h }, p2/Z, [x10, x25, LSL #1]\n"
+    "fmla z20.h, p3/M, z2.h, z23.h\n"
+    "fmla z21.h, p3/M, z1.h, z23.h\n"
+    "fmla z29.h, p3/M, z8.h, z23.h\n"
+    "fmla z27.h, p3/M, z7.h, z23.h\n"
+    "fmla z25.h, p3/M, z5.h, z23.h\n"
+    "fmla z26.h, p3/M, z0.h, z19.h\n"
+    "ld1h { z17.h }, p2/Z, [x28, x12, LSL #1]\n"
+    "fmla z22.h, p3/M, z3.h, z18.h\n"
+    "fmla z24.h, p3/M, z2.h, z16.h\n"
+    "fmla z20.h, p3/M, z4.h, z17.h\n"
+    "fmla z21.h, p3/M, z3.h, z17.h\n"
+    "fmla z28.h, p3/M, z3.h, z19.h\n"
+    "fmla z27.h, p3/M, z5.h, z16.h\n"
+    "ld1h { z19.h }, p2/Z, [x28, x25, LSL #1]\n"
+    "ld1h { z16.h }, p2/Z, [x26, x17, LSL #1]\n"
+    "fmla z26.h, p3/M, z6.h, z18.h\n"
+    "fmla z25.h, p3/M, z7.h, z17.h\n"
+    "ld1h { z18.h }, p2/Z, [x10, x17, LSL #1]\n"
+    "fmla z22.h, p3/M, z5.h, z17.h\n"
+    "fmla z24.h, p3/M, z6.h, z17.h\n"
+    "fmla z21.h, p3/M, z5.h, z19.h\n"
+    "fmla z20.h, p3/M, z6.h, z16.h\n"
+    "fmla z26.h, p3/M, z8.h, z17.h\n"
+    "fmla z22.h, p3/M, z7.h, z16.h\n"
+    "ld1h { z17.h }, p2/Z, [x26, x27, LSL #1]\n"
+    "fmla z29.h, p3/M, z3.h, z18.h\n"
+    "fmla z25.h, p3/M, z0.h, z18.h\n"
+    "fmla z24.h, p3/M, z8.h, z19.h\n"
+    "ld1h { z16.h }, p2/Z, [x10, x27, LSL #1]\n"
+    "fmla z20.h, p3/M, z8.h, z17.h\n"
+    "addvl x10, x10, #1\n"
+    "fmla z21.h, p3/M, z7.h, z17.h\n"
+    "fmla z28.h, p3/M, z4.h, z18.h\n"
+    "ld1h { z19.h }, p2/Z, [x28, x27, LSL #1]\n"
+    "fmla z26.h, p3/M, z1.h, z18.h\n"
+    "fmla z29.h, p3/M, z5.h, z16.h\n"
+    "ld1h { z17.h }, p2/Z, [x28, x17, LSL #1]\n"
+    "addvl x28, x28, #1\n"
+    "fmla z27.h, p3/M, z4.h, z16.h\n"
+    "fmla z25.h, p3/M, z2.h, z16.h\n"
+    "fmla z24.h, p3/M, z1.h, z16.h\n"
+    "ld1h { z16.h }, p2/Z, [x14, x12, LSL #1]\n"
+    "fmla z22.h, p3/M, z4.h, z17.h\n"
+    "addvl x14, x14, #1\n"
+    "fmla z20.h, p3/M, z3.h, z17.h\n"
+    "fmla z21.h, p3/M, z4.h, z19.h\n"
+    "ld1h { z4.h }, p3/Z, [x13, #5, MUL VL]\n"
+    "ld1h { z10.h }, p1/Z, [x14]\n"
+    "fmla z26.h, p3/M, z7.h, z17.h\n"
+    "fmla z25.h, p3/M, z6.h, z17.h\n"
+    "ld1h { z18.h }, p2/Z, [x9]\n"
+    "fmla z28.h, p3/M, z2.h, z16.h\n"
+    "fmla z29.h, p3/M, z1.h, z16.h\n"
+    "fmax z29.h, p3/M, z29.h, z31.h\n"
+    "ld1h { z1.h }, p3/Z, [x13, #2, MUL VL]\n"
+    "fmla z27.h, p3/M, z0.h, z16.h\n"
+    "ld1h { z17.h }, p2/Z, [x9, x25, LSL #1]\n"
+    "fmla z24.h, p3/M, z7.h, z19.h\n"
+    "addvl x9, x9, #1\n"
+    "fmla z20.h, p3/M, z5.h, z19.h\n"
+    "fmla z22.h, p3/M, z0.h, z18.h\n"
+    "ld1h { z0.h }, p3/Z, [x13, #1, MUL VL]\n"
+    "fmin z29.h, p3/M, z29.h, z30.h\n"
+    "fmla z21.h, p3/M, z2.h, z17.h\n"
+    "fmla z25.h, p3/M, z8.h, z19.h\n"
+    "ld1h { z16.h }, p2/Z, [x26, x12, LSL #1]\n"
+    "fmax z25.h, p3/M, z25.h, z31.h\n"
+    "fmla z28.h, p3/M, z6.h, z18.h\n"
+    "fmla z26.h, p3/M, z3.h, z18.h\n"
+    "fmax z28.h, p3/M, z28.h, z31.h\n"
+    "fmax z26.h, p3/M, z26.h, z31.h\n"
+    "fmla z27.h, p3/M, z8.h, z17.h\n"
+    "fmla z24.h, p3/M, z5.h, z17.h\n"
+    "fmax z27.h, p3/M, z27.h, z31.h\n"
+    "fmax z24.h, p3/M, z24.h, z31.h\n"
+    "fmla z22.h, p3/M, z8.h, z16.h\n"
+    "fmla z20.h, p3/M, z7.h, z16.h\n"
+    "fmax z22.h, p3/M, z22.h, z31.h\n"
+    "fmax z20.h, p3/M, z20.h, z31.h\n"
+    "fmla z21.h, p3/M, z6.h, z16.h\n"
+    "fmax z21.h, p3/M, z21.h, z31.h\n"
+    "addvl x26, x26, #1\n"
+    "ld1h { z2.h }, p3/Z, [x13, #3, MUL VL]\n"
+    "ld1h { z3.h }, p3/Z, [x13, #4, MUL VL]\n"
+    "ld1h { z5.h }, p3/Z, [x13, #6, MUL VL]\n"
+    "whilelt p2.h, x21, %x[n_channels]\n"
+    "cmp x15, %x[n_channels]\n"
+    "ld1h { z6.h }, p3/Z, [x13, #7, MUL VL]\n"
+    "addvl x13, x13, #16\n"
+    "fmin z28.h, p3/M, z28.h, z30.h\n"
+    "ld1h { z9.h }, p1/Z, [x9, x12, LSL #1]\n"
+    "fmin z27.h, p3/M, z27.h, z30.h\n"
+    "fmin z26.h, p3/M, z26.h, z30.h\n"
+    "ld1h { z11.h }, p1/Z, [x14, x25, LSL #1]\n"
+    "ld1h { z12.h }, p1/Z, [x26]\n"
+    "fmin z25.h, p3/M, z25.h, z30.h\n"
+    "fmin z24.h, p3/M, z24.h, z30.h\n"
+    "ld1h { z13.h }, p1/Z, [x10, x12, LSL #1]\n"
+    "st1h { z28.h }, p0, [x11]\n"
+    "fmin z22.h, p3/M, z22.h, z30.h\n"
+    "fmin z20.h, p3/M, z20.h, z30.h\n"
+    "st1h { z29.h }, p0, [x11, x16, LSL #1]\n"
+    "ld1h { z7.h }, p3/Z, [x13, #-8, MUL VL]\n"
+    "fmin z21.h, p3/M, z21.h, z30.h\n"
+    "st1h { z27.h }, p0, [x11, x22, LSL #1]\n"
+    "addvl x11, x11, #1\n"
+    "ld1h { z8.h }, p3/Z, [x13, #-7, MUL VL]\n"
+    "st1h { z26.h }, p0, [x24]\n"
+    "addvl x13, x13, #-6\n"
+    "st1h { z25.h }, p0, [x24, x16, LSL #1]\n"
+    "st1h { z24.h }, p0, [x24, x22, LSL #1]\n"
+    "addvl x24, x24, #1\n"
+    "st1h { z22.h }, p0, [x23]\n"
+    "st1h { z20.h }, p0, [x23, x16, LSL #1]\n"
+    "st1h { z21.h }, p0, [x23, x22, LSL #1]\n"
+    "addvl x23, x23, #1\n"
+    "blt 2b\n"
+    "3:" // Tile loop: Channel tail
+    "movprfx z29, z14\n fmla z29.h, p3/M, z7.h, z9.h\n"
+    "movprfx z28, z14\n fmla z28.h, p3/M, z8.h, z9.h\n"
+    "ldr x8, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+    "ldr x13, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+    "movprfx z27, z14\n fmla z27.h, p3/M, z6.h, z9.h\n"
+    "fmla z29.h, p3/M, z4.h, z13.h\n"
+    "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+    "add x8, x8, #0x1\n"
+    "movprfx z26, z14\n fmla z26.h, p3/M, z5.h, z9.h\n"
+    "movprfx z25, z14\n fmla z25.h, p3/M, z4.h, z9.h\n"
+    "cmp x8, x20\n"
+    "add x21, x13, #0x1\n"
+    "movprfx z24, z14\n fmla z24.h, p3/M, z3.h, z9.h\n"
+    "fmla z28.h, p3/M, z0.h, z10.h\n"
+    "ld1h { z23.h }, p2/Z, [x9, x27, LSL #1]\n"
+    "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
+    "fmla z27.h, p3/M, z2.h, z11.h\n"
+    "ld1h { z18.h }, p2/Z, [x9, x17, LSL #1]\n"
+    "movprfx z22, z14\n fmla z22.h, p3/M, z2.h, z9.h\n"
+    "csel x13, x13, x21, LT\n"
+    "fmla z29.h, p3/M, z6.h, z18.h\n"
+    "movprfx z21, z14\n fmla z21.h, p3/M, z0.h, z9.h\n"
+    "mov p0.b, p2.b\n"
+    "csel x8, x8, XZR, LT\n"
+    "fmla z28.h, p3/M, z5.h, z13.h\n"
+    "fmla z27.h, p3/M, z3.h, z13.h\n"
+    "cmp x13, x20\n"
+    "fmla z26.h, p3/M, z2.h, z13.h\n"
+    "fmla z25.h, p3/M, z1.h, z13.h\n"
+    "fmla z24.h, p3/M, z0.h, z13.h\n"
+    "ld1h { z17.h }, p2/Z, [x14, x17, LSL #1]\n"
+    "fmla z22.h, p3/M, z6.h, z12.h\n"
+    "ld1h { z16.h }, p2/Z, [x26, x25, LSL #1]\n"
+    "movprfx z20, z14\n fmla z20.h, p3/M, z1.h, z9.h\n"
+    "fmla z29.h, p3/M, z0.h, z17.h\n"
+    "fmla z21.h, p3/M, z8.h, z16.h\n"
+    "ld1h { z16.h }, p2/Z, [x14, x27, LSL #1]\n"
+    "fmla z28.h, p3/M, z7.h, z18.h\n"
+    "fmla z20.h, p3/M, z0.h, z18.h\n"
+    "fmla z26.h, p3/M, z4.h, z18.h\n"
+    "fmla z25.h, p3/M, z3.h, z18.h\n"
+    "fmla z22.h, p3/M, z1.h, z18.h\n"
+    "ld1h { z19.h }, p2/Z, [x10]\n"
+    "fmla z29.h, p3/M, z2.h, z16.h\n"
+    "fmla z27.h, p3/M, z1.h, z16.h\n"
+    "ld1h { z18.h }, p2/Z, [x28]\n"
+    "fmla z24.h, p3/M, z4.h, z23.h\n"
+    "fmla z28.h, p3/M, z1.h, z17.h\n"
+    "ld1h { z16.h }, p2/Z, [x10, x25, LSL #1]\n"
+    "fmla z20.h, p3/M, z2.h, z23.h\n"
+    "fmla z21.h, p3/M, z1.h, z23.h\n"
+    "fmla z29.h, p3/M, z8.h, z23.h\n"
+    "fmla z27.h, p3/M, z7.h, z23.h\n"
+    "fmla z25.h, p3/M, z5.h, z23.h\n"
+    "fmla z26.h, p3/M, z0.h, z19.h\n"
+    "ld1h { z17.h }, p2/Z, [x28, x12, LSL #1]\n"
+    "fmla z22.h, p3/M, z3.h, z18.h\n"
+    "fmla z24.h, p3/M, z2.h, z16.h\n"
+    "fmla z20.h, p3/M, z4.h, z17.h\n"
+    "fmla z21.h, p3/M, z3.h, z17.h\n"
+    "fmla z28.h, p3/M, z3.h, z19.h\n"
+    "fmla z27.h, p3/M, z5.h, z16.h\n"
+    "ld1h { z19.h }, p2/Z, [x28, x25, LSL #1]\n"
+    "ld1h { z16.h }, p2/Z, [x26, x17, LSL #1]\n"
+    "fmla z26.h, p3/M, z6.h, z18.h\n"
+    "fmla z25.h, p3/M, z7.h, z17.h\n"
+    "ld1h { z18.h }, p2/Z, [x10, x17, LSL #1]\n"
+    "fmla z22.h, p3/M, z5.h, z17.h\n"
+    "fmla z24.h, p3/M, z6.h, z17.h\n"
+    "fmla z21.h, p3/M, z5.h, z19.h\n"
+    "fmla z20.h, p3/M, z6.h, z16.h\n"
+    "fmla z26.h, p3/M, z8.h, z17.h\n"
+    "fmla z22.h, p3/M, z7.h, z16.h\n"
+    "ld1h { z17.h }, p2/Z, [x26, x27, LSL #1]\n"
+    "fmla z29.h, p3/M, z3.h, z18.h\n"
+    "fmla z25.h, p3/M, z0.h, z18.h\n"
+    "fmla z24.h, p3/M, z8.h, z19.h\n"
+    "ld1h { z16.h }, p2/Z, [x10, x27, LSL #1]\n"
+    "fmla z20.h, p3/M, z8.h, z17.h\n"
+    "fmla z21.h, p3/M, z7.h, z17.h\n"
+    "fmla z28.h, p3/M, z4.h, z18.h\n"
+    "ld1h { z19.h }, p2/Z, [x28, x27, LSL #1]\n"
+    "fmla z26.h, p3/M, z1.h, z18.h\n"
+    "fmla z29.h, p3/M, z5.h, z16.h\n"
+    "ld1h { z17.h }, p2/Z, [x28, x17, LSL #1]\n"
+    "fmla z27.h, p3/M, z4.h, z16.h\n"
+    "fmla z25.h, p3/M, z2.h, z16.h\n"
+    "fmla z24.h, p3/M, z1.h, z16.h\n"
+    "ld1h { z16.h }, p2/Z, [x14, x12, LSL #1]\n"
+    "fmla z22.h, p3/M, z4.h, z17.h\n"
+    "fmla z20.h, p3/M, z3.h, z17.h\n"
+    "fmla z21.h, p3/M, z4.h, z19.h\n"
+    "fmla z26.h, p3/M, z7.h, z17.h\n"
+    "fmla z25.h, p3/M, z6.h, z17.h\n"
+    "ld1h { z18.h }, p2/Z, [x9]\n"
+    "fmla z28.h, p3/M, z2.h, z16.h\n"
+    "fmla z29.h, p3/M, z1.h, z16.h\n"
+    "fmax z29.h, p3/M, z29.h, z31.h\n"
+    "fmin z29.h, p3/M, z29.h, z30.h\n"
+    "fmla z27.h, p3/M, z0.h, z16.h\n"
+    "ld1h { z17.h }, p2/Z, [x9, x25, LSL #1]\n"
+    "fmla z24.h, p3/M, z7.h, z19.h\n"
+    "fmla z20.h, p3/M, z5.h, z19.h\n"
+    "fmla z22.h, p3/M, z0.h, z18.h\n"
+    "fmla z21.h, p3/M, z2.h, z17.h\n"
+    "fmla z25.h, p3/M, z8.h, z19.h\n"
+    "ld1h { z16.h }, p2/Z, [x26, x12, LSL #1]\n"
+    "fmax z25.h, p3/M, z25.h, z31.h\n"
+    "fmla z28.h, p3/M, z6.h, z18.h\n"
+    "fmla z26.h, p3/M, z3.h, z18.h\n"
+    "fmax z28.h, p3/M, z28.h, z31.h\n"
+    "fmax z26.h, p3/M, z26.h, z31.h\n"
+    "fmla z27.h, p3/M, z8.h, z17.h\n"
+    "fmla z24.h, p3/M, z5.h, z17.h\n"
+    "fmax z27.h, p3/M, z27.h, z31.h\n"
+    "fmax z24.h, p3/M, z24.h, z31.h\n"
+    "fmla z22.h, p3/M, z8.h, z16.h\n"
+    "fmla z20.h, p3/M, z7.h, z16.h\n"
+    "fmax z22.h, p3/M, z22.h, z31.h\n"
+    "fmax z20.h, p3/M, z20.h, z31.h\n"
+    "fmla z21.h, p3/M, z6.h, z16.h\n"
+    "fmax z21.h, p3/M, z21.h, z31.h\n"
+    "fmin z28.h, p3/M, z28.h, z30.h\n"
+    "st1h { z28.h }, p0, [x11]\n"
+    "fmin z27.h, p3/M, z27.h, z30.h\n"
+    "fmin z26.h, p3/M, z26.h, z30.h\n"
+    "st1h { z29.h }, p0, [x11, x16, LSL #1]\n"
+    "fmin z25.h, p3/M, z25.h, z30.h\n"
+    "fmin z24.h, p3/M, z24.h, z30.h\n"
+    "st1h { z27.h }, p0, [x11, x22, LSL #1]\n"
+    "fmin z22.h, p3/M, z22.h, z30.h\n"
+    "fmin z20.h, p3/M, z20.h, z30.h\n"
+    "st1h { z26.h }, p0, [x24]\n"
+    "fmin z21.h, p3/M, z21.h, z30.h\n"
+    "st1h { z25.h }, p0, [x24, x16, LSL #1]\n"
+    "st1h { z24.h }, p0, [x24, x22, LSL #1]\n"
+    "st1h { z22.h }, p0, [x23]\n"
+    "st1h { z20.h }, p0, [x23, x16, LSL #1]\n"
+    "st1h { z21.h }, p0, [x23, x22, LSL #1]\n"
+    "blt 1b\n"
+    :
+    : [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct)
+    : "cc", "memory", "p0", "p1", "p2", "p3", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+  );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp
new file mode 100644
index 0000000000..4f8368acd5
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp
@@ -0,0 +1,477 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+
+namespace arm_conv {
+namespace depthwise {
+
+// Indirect-addressing entry point for the SVE fp16 3x3 stride-1 depthwise
+// kernel that produces a 3x3 output tile per call.  Instead of computing
+// addresses from strides, the caller supplies one pointer per input element
+// (input_ptrs, 25 entries = the 5x5 input patch a 3x3 kernel needs for a
+// 3x3 output) and one pointer per output element (outptrs).  n_channels
+// fp16 channels are processed one vector at a time and the results are
+// clamped to [activation_min, activation_max].
+void sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(
+  const __fp16 *const *const input_ptrs,
+  __fp16 *const *const outptrs,
+  const void *params,
+  unsigned int n_channels,
+  const __fp16 activation_min,
+  const __fp16 activation_max
+)
+{
+  // Argument block handed to the assembly below; field offsets are taken
+  // with offsetof() in the asm operand list, so layout changes here must be
+  // mirrored there.
+  struct Args
+  {
+    __fp16 *const *outptrs;
+    const void *params;
+    const __fp16 min, max;
+    const __fp16 *inptrs[25];
+
+    Args(
+      const __fp16 *const *const input_ptrs,
+      __fp16 *const *const outptrs,
+      const void *const params,
+      const __fp16 min,
+      const __fp16 max
+    ) : outptrs(outptrs), params(params), min(min), max(max)
+    {
+      // Re-order the caller's 25 input pointers into the sequence in which
+      // the assembly below consumes them (generator-chosen order).
+      inptrs[0] = input_ptrs[12];
+      inptrs[1] = input_ptrs[0];
+      inptrs[2] = input_ptrs[4];
+      inptrs[3] = input_ptrs[20];
+      inptrs[4] = input_ptrs[7];
+      inptrs[5] = input_ptrs[24];
+      inptrs[6] = input_ptrs[11];
+      inptrs[7] = input_ptrs[1];
+      inptrs[8] = input_ptrs[3];
+      inptrs[9] = input_ptrs[13];
+      inptrs[10] = input_ptrs[5];
+      inptrs[11] = input_ptrs[9];
+      inptrs[12] = input_ptrs[15];
+      inptrs[13] = input_ptrs[17];
+      inptrs[14] = input_ptrs[19];
+      inptrs[15] = input_ptrs[21];
+      inptrs[16] = input_ptrs[6];
+      inptrs[17] = input_ptrs[8];
+      inptrs[18] = input_ptrs[23];
+      inptrs[19] = input_ptrs[16];
+      inptrs[20] = input_ptrs[2];
+      inptrs[21] = input_ptrs[18];
+      inptrs[22] = input_ptrs[10];
+      inptrs[23] = input_ptrs[14];
+      inptrs[24] = input_ptrs[22];
+
+    }
+  };
+
+  Args params_struct(input_ptrs, outptrs, params,
+                     activation_min, activation_max);
+
+  // NOTE(review): machine-generated SVE assembly -- do not hand-edit.
+  // Register roles, as established by the code below: p3 is the all-true
+  // governing predicate; p2 (whilelt) predicates the per-channel loads,
+  // p1/p0 the stores; x8 walks the packed parameter block, from which z14
+  // is loaded and used (via movprfx) to initialise every accumulator --
+  // presumably the bias, TODO confirm against the parameter-packing code --
+  // and z0..z8 hold the nine 3x3 filter taps; z31/z30 hold the broadcast
+  // activation min/max consumed by the fmax/fmin clamps; x16 is the SVE
+  // vector length in halfwords (cnth).  Label 1 is the full-vector channel
+  // loop, label 2 the final (partial-vector) channel tail.
+  __asm__ __volatile__(
+    "ptrue p3.b\n"
+    "ldr x8, [%x[params_struct], %[offsetof_args_params]]\n"
+    "add x17, %x[params_struct], %[offsetof_Args_inptrs]\n"
+    "ld1h { z14.h }, p3/Z, [x8]\n"
+    "cnth x16\n"
+    "mov x15, #0x0\n"
+    "ld1h { z0.h }, p3/Z, [x8, #1, MUL VL]\n"
+    "ld1h { z1.h }, p3/Z, [x8, #2, MUL VL]\n"
+    "whilelt p2.h, XZR, %x[n_channels]\n"
+    "ld1h { z2.h }, p3/Z, [x8, #3, MUL VL]\n"
+    "ld1h { z3.h }, p3/Z, [x8, #4, MUL VL]\n"
+    "cmp x16, %x[n_channels]\n"
+    "ld1h { z4.h }, p3/Z, [x8, #5, MUL VL]\n"
+    "ld1h { z5.h }, p3/Z, [x8, #6, MUL VL]\n"
+    "sub x14, XZR, x16\n"
+    "ld1h { z6.h }, p3/Z, [x8, #7, MUL VL]\n"
+    "addvl x8, x8, #16\n"
+    "ldp x24, x23, [x17, #0x0]\n"
+    "ldp x22, x21, [x17, #0x10]\n"
+    "ldr x20, [x17, #0x20]\n"
+    "ldr x13, [%x[params_struct], %[offsetof_args_outptrs]]\n"
+    "ld1rh { z31.h }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+    "ld1rh { z30.h }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+    "ld1h { z7.h }, p3/Z, [x8, #-8, MUL VL]\n"
+    "ld1h { z8.h }, p3/Z, [x8, #-7, MUL VL]\n"
+    "ld1h { z9.h }, p2/Z, [x24, x15, LSL #1]\n"
+    "addvl x8, x8, #-6\n"
+    "ld1h { z10.h }, p2/Z, [x23, x15, LSL #1]\n"
+    "ld1h { z11.h }, p2/Z, [x22, x15, LSL #1]\n"
+    "ld1h { z12.h }, p2/Z, [x21, x15, LSL #1]\n"
+    "ld1h { z13.h }, p2/Z, [x20, x15, LSL #1]\n"
+    "bge 2f\n"
+    "1:" // Channel loop
+    "movprfx z29, z14\n fmla z29.h, p3/M, z8.h, z9.h\n"
+    "movprfx z28, z14\n fmla z28.h, p3/M, z7.h, z9.h\n"
+    "ldr x23, [x17, #0x30]\n"
+    "ldr x26, [x17, #0x38]\n"
+    "movprfx z27, z14\n fmla z27.h, p3/M, z6.h, z9.h\n"
+    "fmla z29.h, p3/M, z0.h, z10.h\n"
+    "ldr x22, [x17, #0x28]\n"
+    "ldr x21, [x17, #0x48]\n"
+    "fmla z28.h, p3/M, z4.h, z13.h\n"
+    "movprfx z26, z14\n fmla z26.h, p3/M, z5.h, z9.h\n"
+    "ldr x20, [x17, #0x40]\n"
+    "ld1h { z19.h }, p2/Z, [x21, x15, LSL #1]\n"
+    "movprfx z25, z14\n fmla z25.h, p3/M, z4.h, z9.h\n"
+    "movprfx z24, z14\n fmla z24.h, p3/M, z3.h, z9.h\n"
+    "ldr x25, [x17, #0x50]\n"
+    "ldr x24, [x17, #0x58]\n"
+    "fmla z27.h, p3/M, z2.h, z11.h\n"
+    "ld1h { z18.h }, p2/Z, [x23, x15, LSL #1]\n"
+    "movprfx z23, z14\n fmla z23.h, p3/M, z2.h, z9.h\n"
+    "ldr x23, [x17, #0x60]\n"
+    "fmla z29.h, p3/M, z5.h, z13.h\n"
+    "fmla z28.h, p3/M, z6.h, z18.h\n"
+    "ldr x12, [x17, #0x70]\n"
+    "ldr x11, [x17, #0x88]\n"
+    "movprfx z22, z14\n fmla z22.h, p3/M, z0.h, z9.h\n"
+    "fmla z27.h, p3/M, z3.h, z13.h\n"
+    "inch x14\n"
+    "mov p1.b, p2.b\n"
+    "fmla z26.h, p3/M, z2.h, z13.h\n"
+    "fmla z25.h, p3/M, z1.h, z13.h\n"
+    "ldr x10, [x13, #0x0]\n"
+    "whilelt p0.h, x16, %x[n_channels]\n"
+    "fmla z24.h, p3/M, z0.h, z13.h\n"
+    "ld1h { z17.h }, p2/Z, [x26, x15, LSL #1]\n"
+    "fmla z23.h, p3/M, z6.h, z12.h\n"
+    "ld1h { z16.h }, p2/Z, [x22, x15, LSL #1]\n"
+    "movprfx z21, z14\n fmla z21.h, p3/M, z1.h, z9.h\n"
+    "fmla z29.h, p3/M, z7.h, z18.h\n"
+    "ldr x22, [x17, #0x68]\n"
+    "ldr x21, [x17, #0x78]\n"
+    "fmla z28.h, p3/M, z0.h, z17.h\n"
+    "fmla z22.h, p3/M, z8.h, z16.h\n"
+    "ld1h { z16.h }, p2/Z, [x20, x15, LSL #1]\n"
+    "ldr x20, [x17, #0x80]\n"
+    "fmla z26.h, p3/M, z4.h, z18.h\n"
+    "fmla z25.h, p3/M, z3.h, z18.h\n"
+    "ldr x9, [x13, #0x8]\n"
+    "ldr x28, [x13, #0x10]\n"
+    "fmla z21.h, p3/M, z0.h, z18.h\n"
+    "fmla z24.h, p3/M, z4.h, z19.h\n"
+    "ldr x27, [x13, #0x18]\n"
+    "ld1h { z14.h }, p3/Z, [x8]\n"
+    "fmla z23.h, p3/M, z1.h, z18.h\n"
+    "fmla z29.h, p3/M, z1.h, z17.h\n"
+    "ld1h { z20.h }, p2/Z, [x25, x15, LSL #1]\n"
+    "ld1h { z17.h }, p2/Z, [x24, x15, LSL #1]\n"
+    "fmla z28.h, p3/M, z2.h, z16.h\n"
+    "fmla z27.h, p3/M, z1.h, z16.h\n"
+    "ld1h { z16.h }, p2/Z, [x23, x15, LSL #1]\n"
+    "ldr x26, [x17, #0x90]\n"
+    "fmla z25.h, p3/M, z5.h, z19.h\n"
+    "fmla z21.h, p3/M, z2.h, z19.h\n"
+    "ldr x25, [x17, #0xa0]\n"
+    "ldr x24, [x17, #0x98]\n"
+    "fmla z26.h, p3/M, z0.h, z20.h\n"
+    "fmla z24.h, p3/M, z2.h, z17.h\n"
+    "fmla z28.h, p3/M, z8.h, z19.h\n"
+    "fmla z27.h, p3/M, z7.h, z19.h\n"
+    "fmla z22.h, p3/M, z1.h, z19.h\n"
+    "fmla z23.h, p3/M, z3.h, z16.h\n"
+    "ld1h { z18.h }, p2/Z, [x22, x15, LSL #1]\n"
+    "ldr x23, [x17, #0xa8]\n"
+    "fmla z26.h, p3/M, z6.h, z16.h\n"
+    "fmla z25.h, p3/M, z7.h, z18.h\n"
+    "ld1h { z19.h }, p2/Z, [x20, x15, LSL #1]\n"
+    "ldr x22, [x17, #0xc0]\n"
+    "fmla z24.h, p3/M, z6.h, z18.h\n"
+    "fmla z21.h, p3/M, z4.h, z18.h\n"
+    "fmla z29.h, p3/M, z3.h, z20.h\n"
+    "fmla z27.h, p3/M, z5.h, z17.h\n"
+    "ld1h { z17.h }, p2/Z, [x12, x15, LSL #1]\n"
+    "ld1h { z16.h }, p2/Z, [x21, x15, LSL #1]\n"
+    "fmla z23.h, p3/M, z5.h, z18.h\n"
+    "fmla z22.h, p3/M, z3.h, z18.h\n"
+    "ldr x21, [x17, #0xb0]\n"
+    "ldr x20, [x17, #0xb8]\n"
+    "fmla z26.h, p3/M, z8.h, z18.h\n"
+    "fmla z24.h, p3/M, z8.h, z17.h\n"
+    "fmla z21.h, p3/M, z6.h, z16.h\n"
+    "fmla z28.h, p3/M, z3.h, z19.h\n"
+    "fmla z25.h, p3/M, z0.h, z19.h\n"
+    "fmla z22.h, p3/M, z5.h, z17.h\n"
+    "ld1h { z17.h }, p2/Z, [x11, x15, LSL #1]\n"
+    "fmla z23.h, p3/M, z7.h, z16.h\n"
+    "ld1h { z18.h }, p2/Z, [x26, x15, LSL #1]\n"
+    "fmla z29.h, p3/M, z4.h, z19.h\n"
+    "fmla z26.h, p3/M, z1.h, z19.h\n"
+    "fmla z28.h, p3/M, z5.h, z17.h\n"
+    "ld1h { z16.h }, p2/Z, [x24, x15, LSL #1]\n"
+    "fmla z27.h, p3/M, z4.h, z17.h\n"
+    "fmla z25.h, p3/M, z2.h, z17.h\n"
+    "fmla z24.h, p3/M, z1.h, z17.h\n"
+    "fmla z21.h, p3/M, z8.h, z18.h\n"
+    "ld1h { z17.h }, p2/Z, [x25, x15, LSL #1]\n"
+    "ldr x25, [x17, #0x20]\n"
+    "fmla z22.h, p3/M, z7.h, z18.h\n"
+    "ld1h { z18.h }, p2/Z, [x23, x15, LSL #1]\n"
+    "fmla z29.h, p3/M, z2.h, z17.h\n"
+    "fmla z26.h, p3/M, z7.h, z16.h\n"
+    "fmla z25.h, p3/M, z6.h, z16.h\n"
+    "fmla z23.h, p3/M, z4.h, z16.h\n"
+    "fmla z21.h, p3/M, z3.h, z16.h\n"
+    "ld1h { z16.h }, p2/Z, [x21, x15, LSL #1]\n"
+    "fmla z22.h, p3/M, z4.h, z18.h\n"
+    "fmla z28.h, p3/M, z1.h, z17.h\n"
+    "fmax z28.h, p3/M, z28.h, z31.h\n"
+    "fmin z28.h, p3/M, z28.h, z30.h\n"
+    "fmla z27.h, p3/M, z0.h, z17.h\n"
+    "ld1h { z17.h }, p2/Z, [x20, x15, LSL #1]\n"
+    "fmla z29.h, p3/M, z6.h, z16.h\n"
+    "fmax z29.h, p3/M, z29.h, z31.h\n"
+    "fmla z24.h, p3/M, z7.h, z18.h\n"
+    "fmla z21.h, p3/M, z5.h, z18.h\n"
+    "fmin z29.h, p3/M, z29.h, z30.h\n"
+    "st1h { z29.h }, p1, [x10, x14, LSL #1]\n"
+    "fmla z23.h, p3/M, z0.h, z16.h\n"
+    "fmla z22.h, p3/M, z2.h, z17.h\n"
+    "ldr x24, [x13, #0x20]\n"
+    "st1h { z28.h }, p1, [x9, x14, LSL #1]\n"
+    "fmla z25.h, p3/M, z8.h, z18.h\n"
+    "fmla z26.h, p3/M, z3.h, z16.h\n"
+    "ld1h { z16.h }, p2/Z, [x22, x15, LSL #1]\n"
+    "ldp x23, x22, [x17, #0x0]\n"
+    "fmla z27.h, p3/M, z8.h, z17.h\n"
+    "fmla z24.h, p3/M, z5.h, z17.h\n"
+    "ldp x21, x20, [x17, #0x10]\n"
+    "fmax z27.h, p3/M, z27.h, z31.h\n"
+    "fmla z23.h, p3/M, z8.h, z16.h\n"
+    "fmla z21.h, p3/M, z7.h, z16.h\n"
+    "fmax z26.h, p3/M, z26.h, z31.h\n"
+    "fmax z25.h, p3/M, z25.h, z31.h\n"
+    "fmla z22.h, p3/M, z6.h, z16.h\n"
+    "inch x15\n"
+    "ld1h { z9.h }, p0/Z, [x23, x16, LSL #1]\n"
+    "ld1h { z10.h }, p0/Z, [x22, x16, LSL #1]\n"
+    "ld1h { z11.h }, p0/Z, [x21, x16, LSL #1]\n"
+    "ld1h { z12.h }, p0/Z, [x20, x16, LSL #1]\n"
+    "fmin z27.h, p3/M, z27.h, z30.h\n"
+    "fmin z26.h, p3/M, z26.h, z30.h\n"
+    "ld1h { z13.h }, p0/Z, [x25, x16, LSL #1]\n"
+    "inch x16\n"
+    "fmin z25.h, p3/M, z25.h, z30.h\n"
+    "st1h { z27.h }, p1, [x28, x14, LSL #1]\n"
+    "fmax z24.h, p3/M, z24.h, z31.h\n"
+    "fmax z23.h, p3/M, z23.h, z31.h\n"
+    "st1h { z26.h }, p1, [x27, x14, LSL #1]\n"
+    "ldr x23, [x13, #0x28]\n"
+    "fmax z21.h, p3/M, z21.h, z31.h\n"
+    "fmax z22.h, p3/M, z22.h, z31.h\n"
+    "st1h { z25.h }, p1, [x24, x14, LSL #1]\n"
+    "ldr x22, [x13, #0x30]\n"
+    "ldr x21, [x13, #0x38]\n"
+    "ldr x20, [x13, #0x40]\n"
+    "whilelt p2.h, x15, %x[n_channels]\n"
+    "cmp x16, %x[n_channels]\n"
+    "ld1h { z0.h }, p3/Z, [x8, #1, MUL VL]\n"
+    "ld1h { z1.h }, p3/Z, [x8, #2, MUL VL]\n"
+    "fmin z24.h, p3/M, z24.h, z30.h\n"
+    "fmin z23.h, p3/M, z23.h, z30.h\n"
+    "ld1h { z2.h }, p3/Z, [x8, #3, MUL VL]\n"
+    "ld1h { z3.h }, p3/Z, [x8, #4, MUL VL]\n"
+    "fmin z21.h, p3/M, z21.h, z30.h\n"
+    "fmin z22.h, p3/M, z22.h, z30.h\n"
+    "ld1h { z4.h }, p3/Z, [x8, #5, MUL VL]\n"
+    "ld1h { z5.h }, p3/Z, [x8, #6, MUL VL]\n"
+    "st1h { z24.h }, p1, [x23, x14, LSL #1]\n"
+    "ld1h { z6.h }, p3/Z, [x8, #7, MUL VL]\n"
+    "addvl x8, x8, #16\n"
+    "st1h { z23.h }, p1, [x22, x14, LSL #1]\n"
+    "ld1h { z7.h }, p3/Z, [x8, #-8, MUL VL]\n"
+    "st1h { z21.h }, p1, [x21, x14, LSL #1]\n"
+    "ld1h { z8.h }, p3/Z, [x8, #-7, MUL VL]\n"
+    "addvl x8, x8, #-6\n"
+    "st1h { z22.h }, p1, [x20, x14, LSL #1]\n"
+    "blt 1b\n"
+    "2:" // Channel tail
+    "movprfx z29, z14\n fmla z29.h, p3/M, z8.h, z9.h\n"
+    "movprfx z28, z14\n fmla z28.h, p3/M, z7.h, z9.h\n"
+    "ldr x23, [x17, #0x30]\n"
+    "ldr x26, [x17, #0x38]\n"
+    "movprfx z27, z14\n fmla z27.h, p3/M, z6.h, z9.h\n"
+    "fmla z29.h, p3/M, z0.h, z10.h\n"
+    "ldr x22, [x17, #0x28]\n"
+    "ldr x21, [x17, #0x48]\n"
+    "fmla z28.h, p3/M, z4.h, z13.h\n"
+    "movprfx z26, z14\n fmla z26.h, p3/M, z5.h, z9.h\n"
+    "ldr x20, [x17, #0x40]\n"
+    "ld1h { z19.h }, p2/Z, [x21, x15, LSL #1]\n"
+    "movprfx z25, z14\n fmla z25.h, p3/M, z4.h, z9.h\n"
+    "movprfx z24, z14\n fmla z24.h, p3/M, z3.h, z9.h\n"
+    "ldr x25, [x17, #0x50]\n"
+    "ldr x24, [x17, #0x58]\n"
+    "fmla z27.h, p3/M, z2.h, z11.h\n"
+    "ld1h { z18.h }, p2/Z, [x23, x15, LSL #1]\n"
+    "movprfx z23, z14\n fmla z23.h, p3/M, z2.h, z9.h\n"
+    "ldr x23, [x17, #0x60]\n"
+    "fmla z29.h, p3/M, z5.h, z13.h\n"
+    "fmla z28.h, p3/M, z6.h, z18.h\n"
+    "ldr x12, [x17, #0x70]\n"
+    "ldr x11, [x17, #0x88]\n"
+    "movprfx z22, z14\n fmla z22.h, p3/M, z0.h, z9.h\n"
+    "fmla z27.h, p3/M, z3.h, z13.h\n"
+    "inch x14\n"
+    "mov p0.b, p2.b\n"
+    "fmla z26.h, p3/M, z2.h, z13.h\n"
+    "fmla z25.h, p3/M, z1.h, z13.h\n"
+    "ldr x10, [x13, #0x0]\n"
+    "ldr x9, [x13, #0x8]\n"
+    "fmla z24.h, p3/M, z0.h, z13.h\n"
+    "ld1h { z17.h }, p2/Z, [x26, x15, LSL #1]\n"
+    "fmla z23.h, p3/M, z6.h, z12.h\n"
+    "ld1h { z16.h }, p2/Z, [x22, x15, LSL #1]\n"
+    "movprfx z21, z14\n fmla z21.h, p3/M, z1.h, z9.h\n"
+    "fmla z29.h, p3/M, z7.h, z18.h\n"
+    "ldr x22, [x17, #0x68]\n"
+    "ldr x21, [x17, #0x78]\n"
+    "fmla z28.h, p3/M, z0.h, z17.h\n"
+    "fmla z22.h, p3/M, z8.h, z16.h\n"
+    "ld1h { z16.h }, p2/Z, [x20, x15, LSL #1]\n"
+    "ldr x20, [x17, #0x80]\n"
+    "fmla z26.h, p3/M, z4.h, z18.h\n"
+    "fmla z25.h, p3/M, z3.h, z18.h\n"
+    "ldr x28, [x13, #0x10]\n"
+    "ldr x27, [x13, #0x18]\n"
+    "fmla z21.h, p3/M, z0.h, z18.h\n"
+    "fmla z24.h, p3/M, z4.h, z19.h\n"
+    "fmla z23.h, p3/M, z1.h, z18.h\n"
+    "fmla z29.h, p3/M, z1.h, z17.h\n"
+    "ld1h { z20.h }, p2/Z, [x25, x15, LSL #1]\n"
+    "ld1h { z17.h }, p2/Z, [x24, x15, LSL #1]\n"
+    "fmla z28.h, p3/M, z2.h, z16.h\n"
+    "fmla z27.h, p3/M, z1.h, z16.h\n"
+    "ld1h { z16.h }, p2/Z, [x23, x15, LSL #1]\n"
+    "ldr x26, [x17, #0x90]\n"
+    "fmla z25.h, p3/M, z5.h, z19.h\n"
+    "fmla z21.h, p3/M, z2.h, z19.h\n"
+    "ldr x25, [x17, #0xa0]\n"
+    "ldr x24, [x17, #0x98]\n"
+    "fmla z26.h, p3/M, z0.h, z20.h\n"
+    "fmla z24.h, p3/M, z2.h, z17.h\n"
+    "fmla z28.h, p3/M, z8.h, z19.h\n"
+    "fmla z27.h, p3/M, z7.h, z19.h\n"
+    "fmla z22.h, p3/M, z1.h, z19.h\n"
+    "fmla z23.h, p3/M, z3.h, z16.h\n"
+    "ld1h { z18.h }, p2/Z, [x22, x15, LSL #1]\n"
+    "ldr x23, [x17, #0xa8]\n"
+    "fmla z26.h, p3/M, z6.h, z16.h\n"
+    "fmla z25.h, p3/M, z7.h, z18.h\n"
+    "ld1h { z19.h }, p2/Z, [x20, x15, LSL #1]\n"
+    "ldr x22, [x17, #0xc0]\n"
+    "fmla z24.h, p3/M, z6.h, z18.h\n"
+    "fmla z21.h, p3/M, z4.h, z18.h\n"
+    "fmla z29.h, p3/M, z3.h, z20.h\n"
+    "fmla z27.h, p3/M, z5.h, z17.h\n"
+    "ld1h { z17.h }, p2/Z, [x12, x15, LSL #1]\n"
+    "ld1h { z16.h }, p2/Z, [x21, x15, LSL #1]\n"
+    "fmla z23.h, p3/M, z5.h, z18.h\n"
+    "fmla z22.h, p3/M, z3.h, z18.h\n"
+    "ldr x21, [x17, #0xb0]\n"
+    "ldr x20, [x17, #0xb8]\n"
+    "fmla z26.h, p3/M, z8.h, z18.h\n"
+    "fmla z24.h, p3/M, z8.h, z17.h\n"
+    "fmla z21.h, p3/M, z6.h, z16.h\n"
+    "fmla z28.h, p3/M, z3.h, z19.h\n"
+    "fmla z25.h, p3/M, z0.h, z19.h\n"
+    "fmla z22.h, p3/M, z5.h, z17.h\n"
+    "ld1h { z17.h }, p2/Z, [x11, x15, LSL #1]\n"
+    "fmla z23.h, p3/M, z7.h, z16.h\n"
+    "ld1h { z18.h }, p2/Z, [x26, x15, LSL #1]\n"
+    "fmla z29.h, p3/M, z4.h, z19.h\n"
+    "fmla z26.h, p3/M, z1.h, z19.h\n"
+    "fmla z28.h, p3/M, z5.h, z17.h\n"
+    "ld1h { z16.h }, p2/Z, [x24, x15, LSL #1]\n"
+    "fmla z27.h, p3/M, z4.h, z17.h\n"
+    "fmla z25.h, p3/M, z2.h, z17.h\n"
+    "fmla z24.h, p3/M, z1.h, z17.h\n"
+    "fmla z21.h, p3/M, z8.h, z18.h\n"
+    "ld1h { z17.h }, p2/Z, [x25, x15, LSL #1]\n"
+    "fmla z22.h, p3/M, z7.h, z18.h\n"
+    "ld1h { z18.h }, p2/Z, [x23, x15, LSL #1]\n"
+    "fmla z29.h, p3/M, z2.h, z17.h\n"
+    "fmla z26.h, p3/M, z7.h, z16.h\n"
+    "fmla z25.h, p3/M, z6.h, z16.h\n"
+    "fmla z23.h, p3/M, z4.h, z16.h\n"
+    "fmla z21.h, p3/M, z3.h, z16.h\n"
+    "ld1h { z16.h }, p2/Z, [x21, x15, LSL #1]\n"
+    "fmla z22.h, p3/M, z4.h, z18.h\n"
+    "fmla z28.h, p3/M, z1.h, z17.h\n"
+    "fmax z28.h, p3/M, z28.h, z31.h\n"
+    "fmin z28.h, p3/M, z28.h, z30.h\n"
+    "fmla z27.h, p3/M, z0.h, z17.h\n"
+    "ld1h { z17.h }, p2/Z, [x20, x15, LSL #1]\n"
+    "fmla z29.h, p3/M, z6.h, z16.h\n"
+    "fmax z29.h, p3/M, z29.h, z31.h\n"
+    "fmla z24.h, p3/M, z7.h, z18.h\n"
+    "fmla z21.h, p3/M, z5.h, z18.h\n"
+    "fmin z29.h, p3/M, z29.h, z30.h\n"
+    "st1h { z29.h }, p0, [x10, x14, LSL #1]\n"
+    "fmla z23.h, p3/M, z0.h, z16.h\n"
+    "fmla z22.h, p3/M, z2.h, z17.h\n"
+    "ldr x20, [x13, #0x20]\n"
+    "st1h { z28.h }, p0, [x9, x14, LSL #1]\n"
+    "fmla z25.h, p3/M, z8.h, z18.h\n"
+    "fmla z26.h, p3/M, z3.h, z16.h\n"
+    "ld1h { z16.h }, p2/Z, [x22, x15, LSL #1]\n"
+    "fmax z26.h, p3/M, z26.h, z31.h\n"
+    "fmla z27.h, p3/M, z8.h, z17.h\n"
+    "fmla z24.h, p3/M, z5.h, z17.h\n"
+    "fmax z27.h, p3/M, z27.h, z31.h\n"
+    "fmax z25.h, p3/M, z25.h, z31.h\n"
+    "fmla z23.h, p3/M, z8.h, z16.h\n"
+    "fmla z21.h, p3/M, z7.h, z16.h\n"
+    "fmin z27.h, p3/M, z27.h, z30.h\n"
+    "fmin z26.h, p3/M, z26.h, z30.h\n"
+    "fmla z22.h, p3/M, z6.h, z16.h\n"
+    "fmin z25.h, p3/M, z25.h, z30.h\n"
+    "fmax z24.h, p3/M, z24.h, z31.h\n"
+    "st1h { z27.h }, p0, [x28, x14, LSL #1]\n"
+    "fmax z23.h, p3/M, z23.h, z31.h\n"
+    "fmax z21.h, p3/M, z21.h, z31.h\n"
+    "st1h { z26.h }, p0, [x27, x14, LSL #1]\n"
+    "ldr x23, [x13, #0x28]\n"
+    "fmax z22.h, p3/M, z22.h, z31.h\n"
+    "st1h { z25.h }, p0, [x20, x14, LSL #1]\n"
+    "ldr x22, [x13, #0x30]\n"
+    "ldr x21, [x13, #0x38]\n"
+    "ldr x20, [x13, #0x40]\n"
+    "fmin z24.h, p3/M, z24.h, z30.h\n"
+    "fmin z23.h, p3/M, z23.h, z30.h\n"
+    "st1h { z24.h }, p0, [x23, x14, LSL #1]\n"
+    "fmin z21.h, p3/M, z21.h, z30.h\n"
+    "fmin z22.h, p3/M, z22.h, z30.h\n"
+    "st1h { z23.h }, p0, [x22, x14, LSL #1]\n"
+    "st1h { z21.h }, p0, [x21, x14, LSL #1]\n"
+    "st1h { z22.h }, p0, [x20, x14, LSL #1]\n"
+    :
+    : [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
+    : "cc", "memory", "p0", "p1", "p2", "p3", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+  );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp
new file mode 100644
index 0000000000..af5ee740c9
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(const __fp16 *const *const input_ptrs, __fp16 *const *const outptrs, const void *params, unsigned int n_channels, const __fp16 activation_min, const __fp16 activation_max);
+void sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(const unsigned int n_tile_rows, const unsigned int n_tile_cols, const __fp16 *inptr, int64_t ld_input_row, int64_t ld_input_col, __fp16 *outptr, int64_t ld_output_row, int64_t ld_output_col, const void *params, unsigned int n_channels, const __fp16 activation_min, const __fp16 activation_max);
+
+// Strategy descriptor for the SVE fp16 NHWC 3x3 stride-1 depthwise kernel
+// that produces a 4x4 output tile per invocation.  It carries the kernel's
+// compile-time geometry (kernel/stride/output-tile sizes) and binds the two
+// entry points declared above to the generic depth-first driver.
+class sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst : public DepthwiseDepthfirstStrategy<__fp16, __fp16, __fp16, __fp16>
+{
+  private:
+  using Parent = DepthwiseDepthfirstStrategy<__fp16, __fp16, __fp16, __fp16>;
+  // Variant taking per-element input/output pointer arrays.
+  Parent::IndirectKernelType m_indirect_kernel = sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl;
+  // Variant taking base pointers plus row/column strides.
+  Parent::DirectKernelType m_direct_kernel = sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl;
+
+  public:
+  using return_type = __fp16;
+  constexpr static auto vl_type = arm_gemm::VLType::SVE;
+
+  constexpr static unsigned int kernel_rows = 3;
+  constexpr static unsigned int kernel_cols = 3;
+
+  constexpr static unsigned int stride_rows = 1;
+  constexpr static unsigned int stride_cols = 1;
+
+  constexpr static unsigned int output_rows = 4;
+  constexpr static unsigned int output_cols = 4;
+
+  sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst(const CPUInfo *)
+  : Parent(output_rows, output_cols, kernel_rows, kernel_cols, stride_rows, stride_cols) {}
+
+  arm_gemm::VLType get_vl_type(void) const override { return vl_type; }
+
+  Parent::IndirectKernelType get_indirect_kernel() const override { return m_indirect_kernel; }
+  Parent::DirectKernelType get_direct_kernel() const override { return m_direct_kernel; }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp
new file mode 100644
index 0000000000..41eaa4f18c
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp
@@ -0,0 +1,656 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(
+ const unsigned int n_tile_rows,
+ const unsigned int n_tile_cols,
+ const __fp16 *inptr,
+ int64_t ld_input_row,
+ int64_t ld_input_col,
+ __fp16 *outptr,
+ int64_t ld_output_row,
+ int64_t ld_output_col,
+ const void *params,
+ unsigned int n_channels,
+ const __fp16 activation_min,
+ const __fp16 activation_max
+)
+{
+ struct Args
+ {
+ const uint64_t n_tile_rows, n_tile_cols;
+ const __fp16 *inptr;
+ const uint64_t ld_input_row;
+ const uint64_t ld_input_col;
+ __fp16 *outptr;
+ const uint64_t ld_output_row;
+ const uint64_t ld_output_col;
+ const void *params;
+ const __fp16 min, max;
+
+ uint64_t tile_i = 0, tile_j = 0;
+
+ Args(
+ const unsigned int n_tile_rows,
+ const unsigned int n_tile_cols,
+ const __fp16 *inptr,
+ int64_t ld_input_row,
+ int64_t ld_input_col,
+ __fp16 *outptr,
+ int64_t ld_output_row,
+ int64_t ld_output_col,
+ const void *params,
+ const float activation_min,
+ const float activation_max
+ ) : n_tile_rows(n_tile_rows), n_tile_cols(n_tile_cols), inptr(inptr),
+ ld_input_row(ld_input_row), ld_input_col(ld_input_col), outptr(outptr),
+ ld_output_row(ld_output_row), ld_output_col(ld_output_col),
+ params(params), min(activation_min), max(activation_max)
+ {
+ }
+ };
+
+ Args params_struct(
+ n_tile_rows, n_tile_cols,
+ inptr, ld_input_row, ld_input_col,
+ outptr, ld_output_row, ld_output_col,
+ params, activation_min, activation_max
+ );
+
+ __asm__ __volatile__(
+ "ptrue p3.b\n"
+ "mov x16, #0x0\n"
+ "mov x4, #0x0\n"
+ "1:" // Tile loop
+ "str x16, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "mov x25, #0x4\n"
+ "mov x24, #0x4\n"
+ "str x4, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "ldr x23, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
+ "ldr x22, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
+ "mul x21, x16, x23\n" // offset = tile_i * ld_input_row
+ "ldr x5, [%x[params_struct], %[offsetof_args_ld_input_col]]\n"
+ "ldr x6, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
+ "mul x20, x16, x22\n" // offset = tile_i * ld_output_row
+ "add x7, x5, x5\n"
+ "madd x21, x4, x5, x21\n" // offset += tile_j * ld_input_col
+ "ldr x8, [%x[params_struct], %[offsetof_args_inptr]]\n"
+ "ldr x17, [%x[params_struct], %[offsetof_args_params]]\n"
+ "cnth x16\n"
+ "madd x20, x4, x6, x20\n" // offset += tile_j * ld_output_col
+ "ldr x15, [%x[params_struct], %[offsetof_args_outptr]]\n"
+ "add x14, x7, x5\n"
+ "whilelt p2.h, XZR, %x[n_channels]\n"
+ "mul x21, x21, x25\n" // offset *= kernel_stride * output_size
+ "add x8, x8, x21, LSL #1\n" // inptr[0] += offset * sizeof(__fp16)
+ "add x13, x8, x23, LSL #1\n"
+ "ld1h { z19.h }, p3/Z, [x17]\n"
+ "mul x20, x20, x24\n" // offset *= output_tile_size
+ "add x12, x13, x23, LSL #1\n"
+ "add x15, x15, x20, LSL #1\n" // outptrs[0] += offset * sizeof(__fp16)
+ "ld1h { z0.h }, p3/Z, [x17, #1, MUL VL]\n"
+ "ld1h { z1.h }, p3/Z, [x17, #2, MUL VL]\n"
+ "ld1h { z2.h }, p3/Z, [x17, #3, MUL VL]\n"
+ "add x11, x12, x23, LSL #1\n"
+ "add x10, x14, x5\n"
+ "ld1h { z3.h }, p3/Z, [x17, #4, MUL VL]\n"
+ "ld1h { z4.h }, p3/Z, [x17, #5, MUL VL]\n"
+ "add x9, x15, x22, LSL #1\n"
+ "add x28, x11, x23, LSL #1\n"
+ "ld1h { z5.h }, p3/Z, [x17, #6, MUL VL]\n"
+ "ld1h { z6.h }, p3/Z, [x17, #7, MUL VL]\n"
+ "addvl x17, x17, #16\n"
+ "add x27, x10, x5\n"
+ "add x26, x9, x22, LSL #1\n"
+ "add x25, x6, x6\n"
+ "ld1rh { z15.h }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+ "ld1rh { z16.h }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+ "cmp x16, %x[n_channels]\n"
+ "add x24, x28, x23, LSL #1\n"
+ "ld1h { z7.h }, p3/Z, [x17, #-8, MUL VL]\n"
+ "ld1h { z8.h }, p3/Z, [x17, #-7, MUL VL]\n"
+ "add x23, x26, x22, LSL #1\n"
+ "add x22, x25, x6\n"
+ "ld1h { z9.h }, p2/Z, [x12, x7, LSL #1]\n"
+ "ld1h { z10.h }, p2/Z, [x8]\n"
+ "mov x21, #0x0\n"
+ "sub x20, XZR, x16\n"
+ "ld1h { z11.h }, p2/Z, [x8, x27, LSL #1]\n"
+ "ld1h { z12.h }, p2/Z, [x12, x14, LSL #1]\n"
+ "addvl x17, x17, #-6\n"
+ "bge 3f\n"
+ "2:" // Tile loop: Channel loop
+ "movprfx z14, z19\n fmla z14.h, p3/M, z4.h, z9.h\n"
+ "movprfx z31, z19\n fmla z31.h, p3/M, z8.h, z9.h\n"
+ "whilelt p1.h, x16, %x[n_channels]\n"
+ "inch x21\n"
+ "movprfx z21, z19\n fmla z21.h, p3/M, z3.h, z9.h\n"
+ "movprfx z22, z19\n fmla z22.h, p3/M, z1.h, z9.h\n"
+ "inch x16\n"
+ "mov p0.b, p2.b\n"
+ "movprfx z20, z19\n fmla z20.h, p3/M, z0.h, z9.h\n"
+ "fmla z14.h, p3/M, z5.h, z12.h\n"
+ "inch x20\n"
+ "movprfx z13, z19\n fmla z13.h, p3/M, z7.h, z9.h\n"
+ "movprfx z17, z19\n fmla z17.h, p3/M, z6.h, z9.h\n"
+ "movprfx z27, z19\n fmla z27.h, p3/M, z5.h, z9.h\n"
+ "movprfx z18, z19\n fmla z18.h, p3/M, z2.h, z9.h\n"
+ "ld1h { z9.h }, p2/Z, [x11, x7, LSL #1]\n"
+ "fmla z31.h, p3/M, z0.h, z10.h\n"
+ "movprfx z30, z19\n fmla z30.h, p3/M, z2.h, z11.h\n"
+ "ld1h { z29.h }, p2/Z, [x24]\n"
+ "ld1h { z11.h }, p2/Z, [x24, x27, LSL #1]\n"
+ "fmla z21.h, p3/M, z4.h, z12.h\n"
+ "fmla z22.h, p3/M, z2.h, z12.h\n"
+ "fmla z20.h, p3/M, z1.h, z12.h\n"
+ "movprfx z23, z19\n fmla z23.h, p3/M, z6.h, z29.h\n"
+ "ld1h { z10.h }, p2/Z, [x11, x14, LSL #1]\n"
+ "fmla z14.h, p3/M, z7.h, z9.h\n"
+ "fmla z13.h, p3/M, z8.h, z12.h\n"
+ "fmla z17.h, p3/M, z7.h, z12.h\n"
+ "fmla z30.h, p3/M, z6.h, z12.h\n"
+ "movprfx z26, z19\n fmla z26.h, p3/M, z3.h, z12.h\n"
+ "movprfx z28, z19\n fmla z28.h, p3/M, z0.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x8, x5, LSL #1]\n"
+ "movprfx z24, z19\n fmla z24.h, p3/M, z8.h, z11.h\n"
+ "fmla z21.h, p3/M, z6.h, z9.h\n"
+ "ld1h { z11.h }, p2/Z, [x8, x10, LSL #1]\n"
+ "fmla z22.h, p3/M, z4.h, z9.h\n"
+ "fmla z20.h, p3/M, z3.h, z9.h\n"
+ "movprfx z25, z19\n fmla z25.h, p3/M, z1.h, z9.h\n"
+ "movprfx z29, z19\n fmla z29.h, p3/M, z0.h, z9.h\n"
+ "ld1h { z19.h }, p3/Z, [x17]\n"
+ "fmla z27.h, p3/M, z8.h, z9.h\n"
+ "fmla z18.h, p3/M, z5.h, z9.h\n"
+ "fmla z23.h, p3/M, z2.h, z9.h\n"
+ "fmla z14.h, p3/M, z8.h, z10.h\n"
+ "ld1h { z9.h }, p2/Z, [x13]\n"
+ "fmla z31.h, p3/M, z1.h, z12.h\n"
+ "fmla z13.h, p3/M, z0.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x13, x27, LSL #1]\n"
+ "fmla z17.h, p3/M, z2.h, z11.h\n"
+ "fmla z30.h, p3/M, z1.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x28]\n"
+ "fmla z21.h, p3/M, z7.h, z10.h\n"
+ "fmla z26.h, p3/M, z6.h, z10.h\n"
+ "fmla z22.h, p3/M, z5.h, z10.h\n"
+ "fmla z20.h, p3/M, z4.h, z10.h\n"
+ "fmla z28.h, p3/M, z3.h, z10.h\n"
+ "fmla z25.h, p3/M, z2.h, z10.h\n"
+ "fmla z29.h, p3/M, z1.h, z10.h\n"
+ "fmla z24.h, p3/M, z0.h, z10.h\n"
+ "ld1h { z10.h }, p2/Z, [x13, x7, LSL #1]\n"
+ "fmla z27.h, p3/M, z0.h, z9.h\n"
+ "fmla z18.h, p3/M, z6.h, z11.h\n"
+ "fmla z23.h, p3/M, z3.h, z11.h\n"
+ "fmla z14.h, p3/M, z1.h, z10.h\n"
+ "ld1h { z11.h }, p2/Z, [x28, x27, LSL #1]\n"
+ "fmla z31.h, p3/M, z3.h, z9.h\n"
+ "fmla z30.h, p3/M, z5.h, z12.h\n"
+ "fmla z26.h, p3/M, z2.h, z12.h\n"
+ "fmla z13.h, p3/M, z4.h, z10.h\n"
+ "ld1h { z9.h }, p2/Z, [x13, x14, LSL #1]\n"
+ "fmla z17.h, p3/M, z3.h, z10.h\n"
+ "fmla z21.h, p3/M, z0.h, z10.h\n"
+ "fmla z28.h, p3/M, z8.h, z11.h\n"
+ "fmla z24.h, p3/M, z5.h, z11.h\n"
+ "ld1h { z12.h }, p2/Z, [x24, x5, LSL #1]\n"
+ "fmla z27.h, p3/M, z2.h, z10.h\n"
+ "fmla z14.h, p3/M, z2.h, z9.h\n"
+ "fmla z31.h, p3/M, z5.h, z10.h\n"
+ "fmla z13.h, p3/M, z5.h, z9.h\n"
+ "ld1h { z11.h }, p2/Z, [x12, x5, LSL #1]\n"
+ "fmla z17.h, p3/M, z4.h, z9.h\n"
+ "fmla z30.h, p3/M, z3.h, z9.h\n"
+ "fmla z21.h, p3/M, z1.h, z9.h\n"
+ "fmla z26.h, p3/M, z0.h, z9.h\n"
+ "ld1h { z10.h }, p2/Z, [x12, x10, LSL #1]\n"
+ "fmla z23.h, p3/M, z7.h, z12.h\n"
+ "fmla z25.h, p3/M, z6.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x24, x10, LSL #1]\n"
+ "fmla z27.h, p3/M, z4.h, z11.h\n"
+ "fmla z14.h, p3/M, z3.h, z11.h\n"
+ "fmla z18.h, p3/M, z1.h, z11.h\n"
+ "fmla z22.h, p3/M, z0.h, z11.h\n"
+ "fmla z31.h, p3/M, z7.h, z11.h\n"
+ "fmla z13.h, p3/M, z6.h, z11.h\n"
+ "ld1h { z9.h }, p2/Z, [x8, x7, LSL #1]\n"
+ "fmla z29.h, p3/M, z8.h, z12.h\n"
+ "fmla z24.h, p3/M, z7.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x11, x5, LSL #1]\n"
+ "fmla z17.h, p3/M, z8.h, z10.h\n"
+ "fmla z30.h, p3/M, z7.h, z10.h\n"
+ "fmla z21.h, p3/M, z5.h, z10.h\n"
+ "fmla z26.h, p3/M, z4.h, z10.h\n"
+ "fmla z20.h, p3/M, z2.h, z10.h\n"
+ "fmla z28.h, p3/M, z1.h, z10.h\n"
+ "ld1h { z11.h }, p2/Z, [x8, x14, LSL #1]\n"
+ "addvl x8, x8, #1\n"
+ "fmla z27.h, p3/M, z7.h, z12.h\n"
+ "fmla z14.h, p3/M, z6.h, z12.h\n"
+ "fmla z18.h, p3/M, z4.h, z12.h\n"
+ "fmla z22.h, p3/M, z3.h, z12.h\n"
+ "fmla z23.h, p3/M, z1.h, z12.h\n"
+ "fmla z25.h, p3/M, z0.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x11, x10, LSL #1]\n"
+ "fmla z31.h, p3/M, z2.h, z9.h\n"
+ "fmla z13.h, p3/M, z1.h, z9.h\n"
+ "fmla z17.h, p3/M, z0.h, z9.h\n"
+ "ld1h { z9.h }, p2/Z, [x12]\n"
+ "fmla z29.h, p3/M, z2.h, z12.h\n"
+ "fmla z30.h, p3/M, z0.h, z11.h\n"
+ "fmla z27.h, p3/M, z3.h, z9.h\n"
+ "fmla z18.h, p3/M, z0.h, z9.h\n"
+ "fmla z21.h, p3/M, z8.h, z12.h\n"
+ "fmla z26.h, p3/M, z7.h, z12.h\n"
+ "fmla z20.h, p3/M, z5.h, z12.h\n"
+ "fmla z28.h, p3/M, z4.h, z12.h\n"
+ "fmla z24.h, p3/M, z1.h, z12.h\n"
+ "ld1h { z10.h }, p2/Z, [x28, x7, LSL #1]\n"
+ "fmla z13.h, p3/M, z2.h, z11.h\n"
+ "fmla z17.h, p3/M, z1.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x12, x27, LSL #1]\n"
+ "addvl x12, x12, #1\n"
+ "fmla z31.h, p3/M, z6.h, z9.h\n"
+ "ld1h { z12.h }, p2/Z, [x11]\n"
+ "fmla z25.h, p3/M, z4.h, z10.h\n"
+ "ld1h { z9.h }, p1/Z, [x12, x7, LSL #1]\n"
+ "fmla z29.h, p3/M, z3.h, z10.h\n"
+ "fmla z30.h, p3/M, z8.h, z11.h\n"
+ "fmla z26.h, p3/M, z5.h, z11.h\n"
+ "fmla z28.h, p3/M, z2.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x11, x27, LSL #1]\n"
+ "addvl x11, x11, #1\n"
+ "fmla z27.h, p3/M, z6.h, z12.h\n"
+ "fmla z18.h, p3/M, z3.h, z12.h\n"
+ "fmla z23.h, p3/M, z0.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x24, x7, LSL #1]\n"
+ "fmla z24.h, p3/M, z2.h, z11.h\n"
+ "fmla z25.h, p3/M, z7.h, z12.h\n"
+ "fmla z29.h, p3/M, z6.h, z12.h\n"
+ "fmla z18.h, p3/M, z8.h, z10.h\n"
+ "fmla z22.h, p3/M, z7.h, z10.h\n"
+ "fmla z20.h, p3/M, z6.h, z10.h\n"
+ "fmla z23.h, p3/M, z5.h, z10.h\n"
+ "ld1h { z10.h }, p2/Z, [x28, x14, LSL #1]\n"
+ "fmla z28.h, p3/M, z5.h, z11.h\n"
+ "fmla z25.h, p3/M, z5.h, z10.h\n"
+ "fmla z29.h, p3/M, z4.h, z10.h\n"
+ "fmla z24.h, p3/M, z3.h, z10.h\n"
+ "fmla z26.h, p3/M, z8.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x24, x14, LSL #1]\n"
+ "fmla z23.h, p3/M, z8.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x13, x5, LSL #1]\n"
+ "fmla z22.h, p3/M, z8.h, z10.h\n"
+ "fmla z20.h, p3/M, z7.h, z10.h\n"
+ "addvl x24, x24, #1\n"
+ "fmla z28.h, p3/M, z6.h, z10.h\n"
+ "fmla z25.h, p3/M, z8.h, z11.h\n"
+ "ld1h { z10.h }, p2/Z, [x13, x10, LSL #1]\n"
+ "addvl x13, x13, #1\n"
+ "fmla z29.h, p3/M, z7.h, z11.h\n"
+ "fmla z24.h, p3/M, z6.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x28, x5, LSL #1]\n"
+ "fmla z31.h, p3/M, z4.h, z12.h\n"
+ "fmla z13.h, p3/M, z3.h, z12.h\n"
+ "fmax z31.h, p3/M, z31.h, z15.h\n"
+ "fmax z13.h, p3/M, z13.h, z15.h\n"
+ "fmla z27.h, p3/M, z1.h, z12.h\n"
+ "fmla z14.h, p3/M, z0.h, z12.h\n"
+ "ld1h { z0.h }, p2/Z, [x28, x10, LSL #1]\n"
+ "fmax z27.h, p3/M, z27.h, z15.h\n"
+ "fmla z17.h, p3/M, z5.h, z10.h\n"
+ "fmla z30.h, p3/M, z4.h, z10.h\n"
+ "fmax z17.h, p3/M, z17.h, z15.h\n"
+ "fmax z30.h, p3/M, z30.h, z15.h\n"
+ "fmla z21.h, p3/M, z2.h, z10.h\n"
+ "fmla z26.h, p3/M, z1.h, z10.h\n"
+ "fmax z14.h, p3/M, z14.h, z15.h\n"
+ "fmax z21.h, p3/M, z21.h, z15.h\n"
+ "fmla z18.h, p3/M, z7.h, z11.h\n"
+ "fmla z22.h, p3/M, z6.h, z11.h\n"
+ "fmax z26.h, p3/M, z26.h, z15.h\n"
+ "fmax z18.h, p3/M, z18.h, z15.h\n"
+ "fmla z23.h, p3/M, z4.h, z11.h\n"
+ "fmla z25.h, p3/M, z3.h, z11.h\n"
+ "fmax z22.h, p3/M, z22.h, z15.h\n"
+ "fmax z23.h, p3/M, z23.h, z15.h\n"
+ "fmla z20.h, p3/M, z8.h, z0.h\n"
+ "fmla z28.h, p3/M, z7.h, z0.h\n"
+ "fmax z20.h, p3/M, z20.h, z15.h\n"
+ "fmax z28.h, p3/M, z28.h, z15.h\n"
+ "fmla z29.h, p3/M, z5.h, z0.h\n"
+ "fmla z24.h, p3/M, z4.h, z0.h\n"
+ "fmax z25.h, p3/M, z25.h, z15.h\n"
+ "fmax z29.h, p3/M, z29.h, z15.h\n"
+ "fmax z24.h, p3/M, z24.h, z15.h\n"
+ "ld1h { z0.h }, p3/Z, [x17, #1, MUL VL]\n"
+ "ld1h { z1.h }, p3/Z, [x17, #2, MUL VL]\n"
+ "whilelt p2.h, x21, %x[n_channels]\n"
+ "ld1h { z2.h }, p3/Z, [x17, #3, MUL VL]\n"
+ "ld1h { z3.h }, p3/Z, [x17, #4, MUL VL]\n"
+ "cmp x16, %x[n_channels]\n"
+ "fmin z31.h, p3/M, z31.h, z16.h\n"
+ "ld1h { z4.h }, p3/Z, [x17, #5, MUL VL]\n"
+ "ld1h { z5.h }, p3/Z, [x17, #6, MUL VL]\n"
+ "fmin z13.h, p3/M, z13.h, z16.h\n"
+ "fmin z17.h, p3/M, z17.h, z16.h\n"
+ "ld1h { z6.h }, p3/Z, [x17, #7, MUL VL]\n"
+ "addvl x17, x17, #16\n"
+ "fmin z30.h, p3/M, z30.h, z16.h\n"
+ "ld1h { z10.h }, p1/Z, [x8]\n"
+ "fmin z27.h, p3/M, z27.h, z16.h\n"
+ "fmin z14.h, p3/M, z14.h, z16.h\n"
+ "ld1h { z11.h }, p1/Z, [x8, x27, LSL #1]\n"
+ "ld1h { z12.h }, p1/Z, [x12, x14, LSL #1]\n"
+ "fmin z21.h, p3/M, z21.h, z16.h\n"
+ "fmin z26.h, p3/M, z26.h, z16.h\n"
+ "st1h { z31.h }, p0, [x15]\n"
+ "ld1h { z7.h }, p3/Z, [x17, #-8, MUL VL]\n"
+ "fmin z18.h, p3/M, z18.h, z16.h\n"
+ "fmin z22.h, p3/M, z22.h, z16.h\n"
+ "st1h { z13.h }, p0, [x15, x6, LSL #1]\n"
+ "ld1h { z8.h }, p3/Z, [x17, #-7, MUL VL]\n"
+ "fmin z20.h, p3/M, z20.h, z16.h\n"
+ "fmin z28.h, p3/M, z28.h, z16.h\n"
+ "st1h { z17.h }, p0, [x15, x25, LSL #1]\n"
+ "fmin z23.h, p3/M, z23.h, z16.h\n"
+ "fmin z25.h, p3/M, z25.h, z16.h\n"
+ "st1h { z30.h }, p0, [x15, x22, LSL #1]\n"
+ "fmin z29.h, p3/M, z29.h, z16.h\n"
+ "fmin z24.h, p3/M, z24.h, z16.h\n"
+ "st1h { z27.h }, p0, [x9]\n"
+ "addvl x28, x28, #1\n"
+ "st1h { z14.h }, p0, [x9, x6, LSL #1]\n"
+ "addvl x15, x15, #1\n"
+ "st1h { z21.h }, p0, [x9, x25, LSL #1]\n"
+ "addvl x17, x17, #-6\n"
+ "st1h { z26.h }, p0, [x9, x22, LSL #1]\n"
+ "addvl x9, x9, #1\n"
+ "st1h { z18.h }, p0, [x26]\n"
+ "st1h { z22.h }, p0, [x26, x6, LSL #1]\n"
+ "st1h { z20.h }, p0, [x26, x25, LSL #1]\n"
+ "st1h { z28.h }, p0, [x26, x22, LSL #1]\n"
+ "addvl x26, x26, #1\n"
+ "st1h { z23.h }, p0, [x23]\n"
+ "st1h { z25.h }, p0, [x23, x6, LSL #1]\n"
+ "st1h { z29.h }, p0, [x23, x25, LSL #1]\n"
+ "st1h { z24.h }, p0, [x23, x22, LSL #1]\n"
+ "addvl x23, x23, #1\n"
+ "blt 2b\n"
+ "3:" // Tile loop: Channel tail
+ "movprfx z14, z19\n fmla z14.h, p3/M, z4.h, z9.h\n"
+ "movprfx z31, z19\n fmla z31.h, p3/M, z8.h, z9.h\n"
+ "ldr x4, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "ldr x16, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "movprfx z30, z19\n fmla z30.h, p3/M, z3.h, z9.h\n"
+ "movprfx z13, z19\n fmla z13.h, p3/M, z1.h, z9.h\n"
+ "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+ "add x4, x4, #0x1\n"
+ "movprfx z20, z19\n fmla z20.h, p3/M, z0.h, z9.h\n"
+ "fmla z14.h, p3/M, z5.h, z12.h\n"
+ "cmp x4, x20\n"
+ "add x21, x16, #0x1\n"
+ "movprfx z18, z19\n fmla z18.h, p3/M, z7.h, z9.h\n"
+ "movprfx z28, z19\n fmla z28.h, p3/M, z6.h, z9.h\n"
+ "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
+ "csel x16, x16, x21, LT\n"
+ "movprfx z17, z19\n fmla z17.h, p3/M, z5.h, z9.h\n"
+ "movprfx z26, z19\n fmla z26.h, p3/M, z2.h, z9.h\n"
+ "ld1h { z9.h }, p2/Z, [x11, x7, LSL #1]\n"
+ "mov p0.b, p2.b\n"
+ "fmla z31.h, p3/M, z0.h, z10.h\n"
+ "movprfx z27, z19\n fmla z27.h, p3/M, z2.h, z11.h\n"
+ "ld1h { z29.h }, p2/Z, [x24]\n"
+ "ld1h { z21.h }, p2/Z, [x24, x27, LSL #1]\n"
+ "fmla z30.h, p3/M, z4.h, z12.h\n"
+ "fmla z13.h, p3/M, z2.h, z12.h\n"
+ "csel x4, x4, XZR, LT\n"
+ "cmp x16, x20\n"
+ "fmla z20.h, p3/M, z1.h, z12.h\n"
+ "movprfx z10, z19\n fmla z10.h, p3/M, z6.h, z29.h\n"
+ "ld1h { z29.h }, p2/Z, [x11, x14, LSL #1]\n"
+ "fmla z14.h, p3/M, z7.h, z9.h\n"
+ "fmla z18.h, p3/M, z8.h, z12.h\n"
+ "fmla z28.h, p3/M, z7.h, z12.h\n"
+ "fmla z27.h, p3/M, z6.h, z12.h\n"
+ "movprfx z11, z19\n fmla z11.h, p3/M, z3.h, z12.h\n"
+ "movprfx z25, z19\n fmla z25.h, p3/M, z0.h, z12.h\n"
+ "ld1h { z22.h }, p2/Z, [x8, x5, LSL #1]\n"
+ "movprfx z24, z19\n fmla z24.h, p3/M, z8.h, z21.h\n"
+ "fmla z30.h, p3/M, z6.h, z9.h\n"
+ "ld1h { z21.h }, p2/Z, [x8, x10, LSL #1]\n"
+ "fmla z13.h, p3/M, z4.h, z9.h\n"
+ "fmla z20.h, p3/M, z3.h, z9.h\n"
+ "movprfx z12, z19\n fmla z12.h, p3/M, z1.h, z9.h\n"
+ "movprfx z23, z19\n fmla z23.h, p3/M, z0.h, z9.h\n"
+ "fmla z17.h, p3/M, z8.h, z9.h\n"
+ "fmla z26.h, p3/M, z5.h, z9.h\n"
+ "fmla z10.h, p3/M, z2.h, z9.h\n"
+ "fmla z14.h, p3/M, z8.h, z29.h\n"
+ "ld1h { z9.h }, p2/Z, [x13]\n"
+ "fmla z31.h, p3/M, z1.h, z22.h\n"
+ "fmla z18.h, p3/M, z0.h, z22.h\n"
+ "ld1h { z22.h }, p2/Z, [x13, x27, LSL #1]\n"
+ "fmla z28.h, p3/M, z2.h, z21.h\n"
+ "fmla z27.h, p3/M, z1.h, z21.h\n"
+ "ld1h { z19.h }, p2/Z, [x28]\n"
+ "fmla z30.h, p3/M, z7.h, z29.h\n"
+ "fmla z11.h, p3/M, z6.h, z29.h\n"
+ "fmla z13.h, p3/M, z5.h, z29.h\n"
+ "fmla z20.h, p3/M, z4.h, z29.h\n"
+ "fmla z25.h, p3/M, z3.h, z29.h\n"
+ "fmla z12.h, p3/M, z2.h, z29.h\n"
+ "fmla z23.h, p3/M, z1.h, z29.h\n"
+ "fmla z24.h, p3/M, z0.h, z29.h\n"
+ "ld1h { z21.h }, p2/Z, [x13, x7, LSL #1]\n"
+ "fmla z17.h, p3/M, z0.h, z9.h\n"
+ "fmla z26.h, p3/M, z6.h, z19.h\n"
+ "fmla z10.h, p3/M, z3.h, z19.h\n"
+ "fmla z14.h, p3/M, z1.h, z21.h\n"
+ "ld1h { z19.h }, p2/Z, [x28, x27, LSL #1]\n"
+ "fmla z31.h, p3/M, z3.h, z9.h\n"
+ "fmla z27.h, p3/M, z5.h, z22.h\n"
+ "fmla z11.h, p3/M, z2.h, z22.h\n"
+ "fmla z18.h, p3/M, z4.h, z21.h\n"
+ "ld1h { z29.h }, p2/Z, [x13, x14, LSL #1]\n"
+ "fmla z28.h, p3/M, z3.h, z21.h\n"
+ "fmla z30.h, p3/M, z0.h, z21.h\n"
+ "fmla z25.h, p3/M, z8.h, z19.h\n"
+ "fmla z24.h, p3/M, z5.h, z19.h\n"
+ "ld1h { z19.h }, p2/Z, [x24, x5, LSL #1]\n"
+ "fmla z17.h, p3/M, z2.h, z21.h\n"
+ "fmla z14.h, p3/M, z2.h, z29.h\n"
+ "fmla z31.h, p3/M, z5.h, z21.h\n"
+ "fmla z18.h, p3/M, z5.h, z29.h\n"
+ "ld1h { z22.h }, p2/Z, [x12, x5, LSL #1]\n"
+ "fmla z28.h, p3/M, z4.h, z29.h\n"
+ "fmla z27.h, p3/M, z3.h, z29.h\n"
+ "fmla z30.h, p3/M, z1.h, z29.h\n"
+ "fmla z11.h, p3/M, z0.h, z29.h\n"
+ "ld1h { z21.h }, p2/Z, [x12, x10, LSL #1]\n"
+ "fmla z10.h, p3/M, z7.h, z19.h\n"
+ "fmla z12.h, p3/M, z6.h, z19.h\n"
+ "ld1h { z19.h }, p2/Z, [x24, x10, LSL #1]\n"
+ "fmla z17.h, p3/M, z4.h, z22.h\n"
+ "fmla z14.h, p3/M, z3.h, z22.h\n"
+ "fmla z26.h, p3/M, z1.h, z22.h\n"
+ "fmla z13.h, p3/M, z0.h, z22.h\n"
+ "fmla z31.h, p3/M, z7.h, z22.h\n"
+ "fmla z18.h, p3/M, z6.h, z22.h\n"
+ "ld1h { z29.h }, p2/Z, [x8, x7, LSL #1]\n"
+ "fmla z23.h, p3/M, z8.h, z19.h\n"
+ "fmla z24.h, p3/M, z7.h, z19.h\n"
+ "ld1h { z19.h }, p2/Z, [x11, x5, LSL #1]\n"
+ "fmla z28.h, p3/M, z8.h, z21.h\n"
+ "fmla z27.h, p3/M, z7.h, z21.h\n"
+ "fmla z30.h, p3/M, z5.h, z21.h\n"
+ "fmla z11.h, p3/M, z4.h, z21.h\n"
+ "fmla z20.h, p3/M, z2.h, z21.h\n"
+ "fmla z25.h, p3/M, z1.h, z21.h\n"
+ "ld1h { z22.h }, p2/Z, [x8, x14, LSL #1]\n"
+ "fmla z17.h, p3/M, z7.h, z19.h\n"
+ "fmla z14.h, p3/M, z6.h, z19.h\n"
+ "fmla z26.h, p3/M, z4.h, z19.h\n"
+ "fmla z13.h, p3/M, z3.h, z19.h\n"
+ "fmla z10.h, p3/M, z1.h, z19.h\n"
+ "fmla z12.h, p3/M, z0.h, z19.h\n"
+ "ld1h { z21.h }, p2/Z, [x11, x10, LSL #1]\n"
+ "fmla z31.h, p3/M, z2.h, z29.h\n"
+ "fmla z18.h, p3/M, z1.h, z29.h\n"
+ "fmla z28.h, p3/M, z0.h, z29.h\n"
+ "ld1h { z29.h }, p2/Z, [x12]\n"
+ "fmla z23.h, p3/M, z2.h, z21.h\n"
+ "fmla z27.h, p3/M, z0.h, z22.h\n"
+ "fmla z17.h, p3/M, z3.h, z29.h\n"
+ "fmla z26.h, p3/M, z0.h, z29.h\n"
+ "fmla z30.h, p3/M, z8.h, z21.h\n"
+ "fmla z11.h, p3/M, z7.h, z21.h\n"
+ "fmla z20.h, p3/M, z5.h, z21.h\n"
+ "fmla z25.h, p3/M, z4.h, z21.h\n"
+ "fmla z24.h, p3/M, z1.h, z21.h\n"
+ "ld1h { z19.h }, p2/Z, [x28, x7, LSL #1]\n"
+ "fmla z18.h, p3/M, z2.h, z22.h\n"
+ "fmla z28.h, p3/M, z1.h, z22.h\n"
+ "ld1h { z21.h }, p2/Z, [x12, x27, LSL #1]\n"
+ "fmla z31.h, p3/M, z6.h, z29.h\n"
+ "ld1h { z29.h }, p2/Z, [x11]\n"
+ "fmla z12.h, p3/M, z4.h, z19.h\n"
+ "fmla z23.h, p3/M, z3.h, z19.h\n"
+ "fmla z27.h, p3/M, z8.h, z21.h\n"
+ "fmla z11.h, p3/M, z5.h, z21.h\n"
+ "fmla z25.h, p3/M, z2.h, z21.h\n"
+ "ld1h { z9.h }, p2/Z, [x11, x27, LSL #1]\n"
+ "fmla z17.h, p3/M, z6.h, z29.h\n"
+ "fmla z26.h, p3/M, z3.h, z29.h\n"
+ "fmla z10.h, p3/M, z0.h, z29.h\n"
+ "ld1h { z22.h }, p2/Z, [x24, x7, LSL #1]\n"
+ "fmla z24.h, p3/M, z2.h, z9.h\n"
+ "fmla z12.h, p3/M, z7.h, z22.h\n"
+ "fmla z23.h, p3/M, z6.h, z22.h\n"
+ "fmla z26.h, p3/M, z8.h, z19.h\n"
+ "fmla z13.h, p3/M, z7.h, z19.h\n"
+ "fmla z20.h, p3/M, z6.h, z19.h\n"
+ "fmla z10.h, p3/M, z5.h, z19.h\n"
+ "ld1h { z21.h }, p2/Z, [x28, x14, LSL #1]\n"
+ "fmla z25.h, p3/M, z5.h, z9.h\n"
+ "fmla z12.h, p3/M, z5.h, z21.h\n"
+ "fmla z23.h, p3/M, z4.h, z21.h\n"
+ "fmla z24.h, p3/M, z3.h, z21.h\n"
+ "fmla z11.h, p3/M, z8.h, z9.h\n"
+ "ld1h { z19.h }, p2/Z, [x24, x14, LSL #1]\n"
+ "fmla z10.h, p3/M, z8.h, z22.h\n"
+ "ld1h { z22.h }, p2/Z, [x13, x5, LSL #1]\n"
+ "fmla z13.h, p3/M, z8.h, z21.h\n"
+ "fmla z20.h, p3/M, z7.h, z21.h\n"
+ "fmla z25.h, p3/M, z6.h, z21.h\n"
+ "fmla z12.h, p3/M, z8.h, z19.h\n"
+ "ld1h { z29.h }, p2/Z, [x13, x10, LSL #1]\n"
+ "fmla z23.h, p3/M, z7.h, z19.h\n"
+ "fmla z24.h, p3/M, z6.h, z19.h\n"
+ "ld1h { z21.h }, p2/Z, [x28, x5, LSL #1]\n"
+ "fmla z31.h, p3/M, z4.h, z22.h\n"
+ "fmla z18.h, p3/M, z3.h, z22.h\n"
+ "fmax z31.h, p3/M, z31.h, z15.h\n"
+ "fmax z18.h, p3/M, z18.h, z15.h\n"
+ "fmla z17.h, p3/M, z1.h, z22.h\n"
+ "fmla z14.h, p3/M, z0.h, z22.h\n"
+ "ld1h { z9.h }, p2/Z, [x28, x10, LSL #1]\n"
+ "fmax z17.h, p3/M, z17.h, z15.h\n"
+ "fmla z28.h, p3/M, z5.h, z29.h\n"
+ "fmla z27.h, p3/M, z4.h, z29.h\n"
+ "fmax z28.h, p3/M, z28.h, z15.h\n"
+ "fmax z27.h, p3/M, z27.h, z15.h\n"
+ "fmla z30.h, p3/M, z2.h, z29.h\n"
+ "fmla z11.h, p3/M, z1.h, z29.h\n"
+ "fmax z14.h, p3/M, z14.h, z15.h\n"
+ "fmax z30.h, p3/M, z30.h, z15.h\n"
+ "fmla z26.h, p3/M, z7.h, z21.h\n"
+ "fmla z13.h, p3/M, z6.h, z21.h\n"
+ "fmax z11.h, p3/M, z11.h, z15.h\n"
+ "fmax z26.h, p3/M, z26.h, z15.h\n"
+ "fmla z10.h, p3/M, z4.h, z21.h\n"
+ "fmla z12.h, p3/M, z3.h, z21.h\n"
+ "fmax z13.h, p3/M, z13.h, z15.h\n"
+ "fmax z10.h, p3/M, z10.h, z15.h\n"
+ "fmla z20.h, p3/M, z8.h, z9.h\n"
+ "fmla z25.h, p3/M, z7.h, z9.h\n"
+ "fmax z20.h, p3/M, z20.h, z15.h\n"
+ "fmax z25.h, p3/M, z25.h, z15.h\n"
+ "fmla z23.h, p3/M, z5.h, z9.h\n"
+ "fmla z24.h, p3/M, z4.h, z9.h\n"
+ "fmax z12.h, p3/M, z12.h, z15.h\n"
+ "fmax z23.h, p3/M, z23.h, z15.h\n"
+ "fmax z24.h, p3/M, z24.h, z15.h\n"
+ "fmin z31.h, p3/M, z31.h, z16.h\n"
+ "st1h { z31.h }, p0, [x15]\n"
+ "fmin z18.h, p3/M, z18.h, z16.h\n"
+ "fmin z28.h, p3/M, z28.h, z16.h\n"
+ "st1h { z18.h }, p0, [x15, x6, LSL #1]\n"
+ "fmin z27.h, p3/M, z27.h, z16.h\n"
+ "fmin z17.h, p3/M, z17.h, z16.h\n"
+ "st1h { z28.h }, p0, [x15, x25, LSL #1]\n"
+ "fmin z14.h, p3/M, z14.h, z16.h\n"
+ "fmin z30.h, p3/M, z30.h, z16.h\n"
+ "st1h { z27.h }, p0, [x15, x22, LSL #1]\n"
+ "fmin z11.h, p3/M, z11.h, z16.h\n"
+ "fmin z26.h, p3/M, z26.h, z16.h\n"
+ "st1h { z17.h }, p0, [x9]\n"
+ "fmin z13.h, p3/M, z13.h, z16.h\n"
+ "fmin z20.h, p3/M, z20.h, z16.h\n"
+ "st1h { z14.h }, p0, [x9, x6, LSL #1]\n"
+ "fmin z25.h, p3/M, z25.h, z16.h\n"
+ "fmin z10.h, p3/M, z10.h, z16.h\n"
+ "st1h { z30.h }, p0, [x9, x25, LSL #1]\n"
+ "fmin z12.h, p3/M, z12.h, z16.h\n"
+ "fmin z23.h, p3/M, z23.h, z16.h\n"
+ "st1h { z11.h }, p0, [x9, x22, LSL #1]\n"
+ "fmin z24.h, p3/M, z24.h, z16.h\n"
+ "st1h { z26.h }, p0, [x26]\n"
+ "st1h { z13.h }, p0, [x26, x6, LSL #1]\n"
+ "st1h { z20.h }, p0, [x26, x25, LSL #1]\n"
+ "st1h { z25.h }, p0, [x26, x22, LSL #1]\n"
+ "st1h { z10.h }, p0, [x23]\n"
+ "st1h { z12.h }, p0, [x23, x6, LSL #1]\n"
+ "st1h { z23.h }, p0, [x23, x25, LSL #1]\n"
+ "st1h { z24.h }, p0, [x23, x22, LSL #1]\n"
+ "blt 1b\n"
+ :
+ : [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp
new file mode 100644
index 0000000000..c0be293cd7
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp
@@ -0,0 +1,714 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
+ const __fp16 *const *const input_ptrs,
+ __fp16 *const *const outptrs,
+ const void *params,
+ unsigned int n_channels,
+ const __fp16 activation_min,
+ const __fp16 activation_max
+)
+{
+ struct Args
+ {
+ __fp16 *const *outptrs;
+ const void *params;
+ const __fp16 min, max;
+ const __fp16 *inptrs[36];
+
+ Args(
+ const __fp16 *const *const input_ptrs,
+ __fp16 *const *const outptrs,
+ const void *const params,
+ const __fp16 min,
+ const __fp16 max
+ ) : outptrs(outptrs), params(params), min(min), max(max)
+ {
+ inptrs[0] = input_ptrs[14];
+ inptrs[1] = input_ptrs[0];
+ inptrs[2] = input_ptrs[5];
+ inptrs[3] = input_ptrs[15];
+ inptrs[4] = input_ptrs[30];
+ inptrs[5] = input_ptrs[35];
+ inptrs[6] = input_ptrs[20];
+ inptrs[7] = input_ptrs[1];
+ inptrs[8] = input_ptrs[4];
+ inptrs[9] = input_ptrs[21];
+ inptrs[10] = input_ptrs[6];
+ inptrs[11] = input_ptrs[11];
+ inptrs[12] = input_ptrs[24];
+ inptrs[13] = input_ptrs[8];
+ inptrs[14] = input_ptrs[29];
+ inptrs[15] = input_ptrs[9];
+ inptrs[16] = input_ptrs[31];
+ inptrs[17] = input_ptrs[13];
+ inptrs[18] = input_ptrs[34];
+ inptrs[19] = input_ptrs[16];
+ inptrs[20] = input_ptrs[2];
+ inptrs[21] = input_ptrs[19];
+ inptrs[22] = input_ptrs[3];
+ inptrs[23] = input_ptrs[12];
+ inptrs[24] = input_ptrs[22];
+ inptrs[25] = input_ptrs[17];
+ inptrs[26] = input_ptrs[18];
+ inptrs[27] = input_ptrs[26];
+ inptrs[28] = input_ptrs[23];
+ inptrs[29] = input_ptrs[32];
+ inptrs[30] = input_ptrs[27];
+ inptrs[31] = input_ptrs[33];
+ inptrs[32] = input_ptrs[7];
+ inptrs[33] = input_ptrs[10];
+ inptrs[34] = input_ptrs[25];
+ inptrs[35] = input_ptrs[28];
+
+ }
+ };
+
+ Args params_struct(input_ptrs, outptrs, params,
+ activation_min, activation_max);
+
+ __asm__ __volatile__(
+ "ptrue p3.b\n"
+ "ldr x7, [%x[params_struct], %[offsetof_args_params]]\n"
+ "add x8, %x[params_struct], %[offsetof_Args_inptrs]\n"
+ "ld1h { z17.h }, p3/Z, [x7]\n"
+ "cnth x17\n"
+ "mov x16, #0x0\n"
+ "ld1h { z0.h }, p3/Z, [x7, #1, MUL VL]\n"
+ "ld1h { z1.h }, p3/Z, [x7, #2, MUL VL]\n"
+ "whilelt p2.h, XZR, %x[n_channels]\n"
+ "ld1h { z2.h }, p3/Z, [x7, #3, MUL VL]\n"
+ "ld1h { z3.h }, p3/Z, [x7, #4, MUL VL]\n"
+ "cmp x17, %x[n_channels]\n"
+ "ld1h { z4.h }, p3/Z, [x7, #5, MUL VL]\n"
+ "ld1h { z5.h }, p3/Z, [x7, #6, MUL VL]\n"
+ "sub x15, XZR, x17\n"
+ "ld1h { z6.h }, p3/Z, [x7, #7, MUL VL]\n"
+ "addvl x7, x7, #16\n"
+ "ldp x23, x22, [x8, #0x0]\n"
+ "ldp x21, x20, [x8, #0x10]\n"
+ "ldr x14, [%x[params_struct], %[offsetof_args_outptrs]]\n"
+ "ld1rh { z16.h }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+ "ld1rh { z19.h }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+ "ld1h { z7.h }, p3/Z, [x7, #-8, MUL VL]\n"
+ "ld1h { z8.h }, p3/Z, [x7, #-7, MUL VL]\n"
+ "addvl x7, x7, #-6\n"
+ "ld1h { z9.h }, p2/Z, [x23, x16, LSL #1]\n"
+ "ld1h { z10.h }, p2/Z, [x22, x16, LSL #1]\n"
+ "ld1h { z11.h }, p2/Z, [x21, x16, LSL #1]\n"
+ "ld1h { z12.h }, p2/Z, [x20, x16, LSL #1]\n"
+ "bge 2f\n"
+ "1:" // Channel loop
+ "movprfx z20, z17\n fmla z20.h, p3/M, z4.h, z9.h\n"
+ "movprfx z26, z17\n fmla z26.h, p3/M, z8.h, z9.h\n"
+ "ldr x27, [x8, #0x20]\n"
+ "ldr x24, [x8, #0x30]\n"
+ "movprfx z24, z17\n fmla z24.h, p3/M, z3.h, z9.h\n"
+ "movprfx z30, z17\n fmla z30.h, p3/M, z1.h, z9.h\n"
+ "ldr x23, [x8, #0x28]\n"
+ "ldr x22, [x8, #0x38]\n"
+ "movprfx z31, z17\n fmla z31.h, p3/M, z0.h, z9.h\n"
+ "movprfx z22, z17\n fmla z22.h, p3/M, z7.h, z9.h\n"
+ "ldr x26, [x8, #0x40]\n"
+ "ldr x21, [x8, #0x48]\n"
+ "movprfx z27, z17\n fmla z27.h, p3/M, z6.h, z9.h\n"
+ "fmla z20.h, p3/M, z5.h, z12.h\n"
+ "ldr x25, [x8, #0x50]\n"
+ "ldr x20, [x8, #0x58]\n"
+ "movprfx z14, z17\n fmla z14.h, p3/M, z5.h, z9.h\n"
+ "movprfx z23, z17\n fmla z23.h, p3/M, z2.h, z9.h\n"
+ "ld1h { z25.h }, p2/Z, [x24, x16, LSL #1]\n"
+ "ldr x13, [x8, #0x70]\n"
+ "fmla z26.h, p3/M, z0.h, z10.h\n"
+ "movprfx z9, z17\n fmla z9.h, p3/M, z2.h, z11.h\n"
+ "ld1h { z28.h }, p2/Z, [x27, x16, LSL #1]\n"
+ "ld1h { z21.h }, p2/Z, [x23, x16, LSL #1]\n"
+ "fmla z24.h, p3/M, z4.h, z12.h\n"
+ "fmla z30.h, p3/M, z2.h, z12.h\n"
+ "ldr x24, [x8, #0x60]\n"
+ "ldr x23, [x8, #0x68]\n"
+ "fmla z31.h, p3/M, z1.h, z12.h\n"
+ "fmla z22.h, p3/M, z8.h, z12.h\n"
+ "inch x15\n"
+ "mov p1.b, p2.b\n"
+ "fmla z27.h, p3/M, z7.h, z12.h\n"
+ "movprfx z15, z17\n fmla z15.h, p3/M, z6.h, z28.h\n"
+ "ld1h { z10.h }, p2/Z, [x21, x16, LSL #1]\n"
+ "ldr x28, [x8, #0x88]\n"
+ "fmla z20.h, p3/M, z7.h, z25.h\n"
+ "fmla z9.h, p3/M, z6.h, z12.h\n"
+ "ldr x12, [x14, #0x0]\n"
+ "ldr x11, [x14, #0x8]\n"
+ "movprfx z11, z17\n fmla z11.h, p3/M, z3.h, z12.h\n"
+ "movprfx z13, z17\n fmla z13.h, p3/M, z0.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x22, x16, LSL #1]\n"
+ "ldr x22, [x8, #0x78]\n"
+ "movprfx z28, z17\n fmla z28.h, p3/M, z8.h, z21.h\n"
+ "fmla z24.h, p3/M, z6.h, z25.h\n"
+ "ld1h { z29.h }, p2/Z, [x26, x16, LSL #1]\n"
+ "ldr x21, [x8, #0x80]\n"
+ "fmla z30.h, p3/M, z4.h, z25.h\n"
+ "fmla z31.h, p3/M, z3.h, z25.h\n"
+ "ldr x10, [x14, #0x10]\n"
+ "ldr x9, [x14, #0x18]\n"
+ "movprfx z18, z17\n fmla z18.h, p3/M, z1.h, z25.h\n"
+ "movprfx z21, z17\n fmla z21.h, p3/M, z0.h, z25.h\n"
+ "whilelt p0.h, x17, %x[n_channels]\n"
+ "ld1h { z17.h }, p3/Z, [x7]\n"
+ "fmla z14.h, p3/M, z8.h, z25.h\n"
+ "fmla z23.h, p3/M, z5.h, z25.h\n"
+ "fmla z15.h, p3/M, z2.h, z25.h\n"
+ "fmla z26.h, p3/M, z1.h, z12.h\n"
+ "ld1h { z25.h }, p2/Z, [x25, x16, LSL #1]\n"
+ "ldr x27, [x8, #0x90]\n"
+ "fmla z22.h, p3/M, z0.h, z12.h\n"
+ "fmla z27.h, p3/M, z2.h, z29.h\n"
+ "ld1h { z12.h }, p2/Z, [x20, x16, LSL #1]\n"
+ "ldr x20, [x8, #0x98]\n"
+ "fmla z20.h, p3/M, z8.h, z10.h\n"
+ "fmla z9.h, p3/M, z1.h, z29.h\n"
+ "ld1h { z29.h }, p2/Z, [x24, x16, LSL #1]\n"
+ "ldr x26, [x8, #0xa0]\n"
+ "fmla z24.h, p3/M, z7.h, z10.h\n"
+ "fmla z11.h, p3/M, z6.h, z10.h\n"
+ "fmla z30.h, p3/M, z5.h, z10.h\n"
+ "fmla z31.h, p3/M, z4.h, z10.h\n"
+ "fmla z13.h, p3/M, z3.h, z10.h\n"
+ "fmla z18.h, p3/M, z2.h, z10.h\n"
+ "fmla z21.h, p3/M, z1.h, z10.h\n"
+ "fmla z28.h, p3/M, z0.h, z10.h\n"
+ "ld1h { z10.h }, p2/Z, [x23, x16, LSL #1]\n"
+ "ldr x25, [x8, #0xa8]\n"
+ "fmla z26.h, p3/M, z3.h, z25.h\n"
+ "fmla z14.h, p3/M, z0.h, z25.h\n"
+ "fmla z23.h, p3/M, z6.h, z29.h\n"
+ "fmla z15.h, p3/M, z3.h, z29.h\n"
+ "ld1h { z25.h }, p2/Z, [x13, x16, LSL #1]\n"
+ "ldr x24, [x8, #0xb0]\n"
+ "fmla z22.h, p3/M, z4.h, z10.h\n"
+ "fmla z27.h, p3/M, z3.h, z10.h\n"
+ "fmla z20.h, p3/M, z1.h, z10.h\n"
+ "fmla z9.h, p3/M, z5.h, z12.h\n"
+ "fmla z11.h, p3/M, z2.h, z12.h\n"
+ "fmla z24.h, p3/M, z0.h, z10.h\n"
+ "ld1h { z12.h }, p2/Z, [x22, x16, LSL #1]\n"
+ "ldr x23, [x8, #0xb8]\n"
+ "fmla z13.h, p3/M, z8.h, z25.h\n"
+ "fmla z28.h, p3/M, z5.h, z25.h\n"
+ "ld1h { z25.h }, p2/Z, [x21, x16, LSL #1]\n"
+ "ldr x22, [x8, #0xc0]\n"
+ "fmla z26.h, p3/M, z5.h, z10.h\n"
+ "fmla z14.h, p3/M, z2.h, z10.h\n"
+ "ld1h { z29.h }, p2/Z, [x28, x16, LSL #1]\n"
+ "ldr x21, [x8, #0xc8]\n"
+ "fmla z22.h, p3/M, z5.h, z12.h\n"
+ "fmla z27.h, p3/M, z4.h, z12.h\n"
+ "fmla z20.h, p3/M, z2.h, z12.h\n"
+ "fmla z9.h, p3/M, z3.h, z12.h\n"
+ "fmla z24.h, p3/M, z1.h, z12.h\n"
+ "fmla z11.h, p3/M, z0.h, z12.h\n"
+ "ld1h { z10.h }, p2/Z, [x20, x16, LSL #1]\n"
+ "ldr x28, [x8, #0xd8]\n"
+ "fmla z15.h, p3/M, z7.h, z25.h\n"
+ "fmla z18.h, p3/M, z6.h, z25.h\n"
+ "ld1h { z25.h }, p2/Z, [x27, x16, LSL #1]\n"
+ "ldr x20, [x8, #0xd0]\n"
+ "fmla z26.h, p3/M, z7.h, z29.h\n"
+ "fmla z22.h, p3/M, z6.h, z29.h\n"
+ "fmla z14.h, p3/M, z4.h, z29.h\n"
+ "fmla z20.h, p3/M, z3.h, z29.h\n"
+ "fmla z23.h, p3/M, z1.h, z29.h\n"
+ "fmla z30.h, p3/M, z0.h, z29.h\n"
+ "ld1h { z29.h }, p2/Z, [x26, x16, LSL #1]\n"
+ "ldr x27, [x8, #0xe0]\n"
+ "fmla z27.h, p3/M, z8.h, z10.h\n"
+ "fmla z21.h, p3/M, z8.h, z25.h\n"
+ "fmla z28.h, p3/M, z7.h, z25.h\n"
+ "ld1h { z25.h }, p2/Z, [x25, x16, LSL #1]\n"
+ "fmla z13.h, p3/M, z1.h, z10.h\n"
+ "ldr x26, [x8, #0xe8]\n"
+ "fmla z9.h, p3/M, z7.h, z10.h\n"
+ "fmla z24.h, p3/M, z5.h, z10.h\n"
+ "fmla z11.h, p3/M, z4.h, z10.h\n"
+ "fmla z31.h, p3/M, z2.h, z10.h\n"
+ "ld1h { z10.h }, p2/Z, [x24, x16, LSL #1]\n"
+ "ldr x25, [x8, #0xf0]\n"
+ "fmla z26.h, p3/M, z2.h, z29.h\n"
+ "fmla z22.h, p3/M, z1.h, z29.h\n"
+ "fmla z27.h, p3/M, z0.h, z29.h\n"
+ "fmla z14.h, p3/M, z7.h, z25.h\n"
+ "ld1h { z29.h }, p2/Z, [x23, x16, LSL #1]\n"
+ "ldr x24, [x8, #0xf8]\n"
+ "fmla z20.h, p3/M, z6.h, z25.h\n"
+ "fmla z23.h, p3/M, z4.h, z25.h\n"
+ "fmla z30.h, p3/M, z3.h, z25.h\n"
+ "fmla z15.h, p3/M, z1.h, z25.h\n"
+ "fmla z18.h, p3/M, z0.h, z25.h\n"
+ "ld1h { z25.h }, p2/Z, [x22, x16, LSL #1]\n"
+ "fmla z13.h, p3/M, z4.h, z25.h\n"
+ "ldr x23, [x8, #0x100]\n"
+ "fmla z21.h, p3/M, z2.h, z25.h\n"
+ "fmla z22.h, p3/M, z2.h, z10.h\n"
+ "fmla z27.h, p3/M, z1.h, z10.h\n"
+ "fmla z9.h, p3/M, z0.h, z10.h\n"
+ "ld1h { z12.h }, p2/Z, [x21, x16, LSL #1]\n"
+ "ldr x22, [x8, #0x108]\n"
+ "fmla z26.h, p3/M, z6.h, z29.h\n"
+ "fmla z14.h, p3/M, z3.h, z29.h\n"
+ "fmla z23.h, p3/M, z0.h, z29.h\n"
+ "fmla z24.h, p3/M, z8.h, z25.h\n"
+ "ld1h { z10.h }, p2/Z, [x20, x16, LSL #1]\n"
+ "ldr x21, [x8, #0x110]\n"
+ "fmla z11.h, p3/M, z7.h, z25.h\n"
+ "fmla z31.h, p3/M, z5.h, z25.h\n"
+ "fmla z28.h, p3/M, z1.h, z25.h\n"
+ "ld1h { z25.h }, p2/Z, [x28, x16, LSL #1]\n"
+ "fmla z13.h, p3/M, z2.h, z12.h\n"
+ "ldr x20, [x8, #0x118]\n"
+ "fmla z15.h, p3/M, z0.h, z10.h\n"
+ "fmla z18.h, p3/M, z4.h, z25.h\n"
+ "fmla z21.h, p3/M, z3.h, z25.h\n"
+ "fmla z9.h, p3/M, z8.h, z12.h\n"
+ "fmla z11.h, p3/M, z5.h, z12.h\n"
+ "fmla z14.h, p3/M, z6.h, z10.h\n"
+ "ld1h { z12.h }, p2/Z, [x27, x16, LSL #1]\n"
+ "fmla z23.h, p3/M, z3.h, z10.h\n"
+ "ld1h { z29.h }, p2/Z, [x26, x16, LSL #1]\n"
+ "fmla z30.h, p3/M, z7.h, z25.h\n"
+ "fmla z31.h, p3/M, z6.h, z25.h\n"
+ "fmla z15.h, p3/M, z5.h, z25.h\n"
+ "fmla z13.h, p3/M, z5.h, z12.h\n"
+ "fmla z28.h, p3/M, z2.h, z12.h\n"
+ "fmla z18.h, p3/M, z7.h, z29.h\n"
+ "fmla z21.h, p3/M, z6.h, z29.h\n"
+ "fmla z23.h, p3/M, z8.h, z25.h\n"
+ "ld1h { z25.h }, p2/Z, [x25, x16, LSL #1]\n"
+ "fmla z15.h, p3/M, z8.h, z29.h\n"
+ "ld1h { z29.h }, p2/Z, [x23, x16, LSL #1]\n"
+ "fmla z30.h, p3/M, z8.h, z25.h\n"
+ "fmla z31.h, p3/M, z7.h, z25.h\n"
+ "fmla z13.h, p3/M, z6.h, z25.h\n"
+ "fmla z18.h, p3/M, z5.h, z25.h\n"
+ "fmla z21.h, p3/M, z4.h, z25.h\n"
+ "fmla z28.h, p3/M, z3.h, z25.h\n"
+ "ld1h { z25.h }, p2/Z, [x22, x16, LSL #1]\n"
+ "ldp x27, x26, [x8, #0x0]\n"
+ "fmla z11.h, p3/M, z8.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x24, x16, LSL #1]\n"
+ "fmla z26.h, p3/M, z4.h, z29.h\n"
+ "fmax z26.h, p3/M, z26.h, z16.h\n"
+ "fmla z22.h, p3/M, z3.h, z29.h\n"
+ "fmla z27.h, p3/M, z5.h, z25.h\n"
+ "fmax z22.h, p3/M, z22.h, z16.h\n"
+ "fmax z27.h, p3/M, z27.h, z16.h\n"
+ "fmla z9.h, p3/M, z4.h, z25.h\n"
+ "fmla z18.h, p3/M, z8.h, z12.h\n"
+ "fmax z9.h, p3/M, z9.h, z16.h\n"
+ "fmin z26.h, p3/M, z26.h, z19.h\n"
+ "fmla z21.h, p3/M, z7.h, z12.h\n"
+ "fmla z28.h, p3/M, z6.h, z12.h\n"
+ "ld1h { z10.h }, p2/Z, [x21, x16, LSL #1]\n"
+ "fmin z22.h, p3/M, z22.h, z19.h\n"
+ "fmla z14.h, p3/M, z1.h, z29.h\n"
+ "fmla z20.h, p3/M, z0.h, z29.h\n"
+ "ld1h { z12.h }, p2/Z, [x20, x16, LSL #1]\n"
+ "fmin z27.h, p3/M, z27.h, z19.h\n"
+ "fmla z24.h, p3/M, z2.h, z25.h\n"
+ "fmla z11.h, p3/M, z1.h, z25.h\n"
+ "fmin z9.h, p3/M, z9.h, z19.h\n"
+ "fmax z14.h, p3/M, z14.h, z16.h\n"
+ "fmla z23.h, p3/M, z7.h, z10.h\n"
+ "fmla z30.h, p3/M, z6.h, z10.h\n"
+ "fmax z20.h, p3/M, z20.h, z16.h\n"
+ "fmax z24.h, p3/M, z24.h, z16.h\n"
+ "fmla z31.h, p3/M, z8.h, z12.h\n"
+ "fmla z13.h, p3/M, z7.h, z12.h\n"
+ "fmax z11.h, p3/M, z11.h, z16.h\n"
+ "st1h { z26.h }, p1, [x12, x15, LSL #1]\n"
+ "st1h { z22.h }, p1, [x11, x15, LSL #1]\n"
+ "ldr x23, [x14, #0x20]\n"
+ "ldr x22, [x14, #0x28]\n"
+ "fmla z15.h, p3/M, z4.h, z10.h\n"
+ "st1h { z27.h }, p1, [x10, x15, LSL #1]\n"
+ "ldr x21, [x14, #0x30]\n"
+ "fmla z18.h, p3/M, z3.h, z10.h\n"
+ "fmla z21.h, p3/M, z5.h, z12.h\n"
+ "st1h { z9.h }, p1, [x9, x15, LSL #1]\n"
+ "ldr x20, [x14, #0x38]\n"
+ "fmla z28.h, p3/M, z4.h, z12.h\n"
+ "ldp x25, x24, [x8, #0x10]\n"
+ "fmin z14.h, p3/M, z14.h, z19.h\n"
+ "fmin z20.h, p3/M, z20.h, z19.h\n"
+ "st1h { z14.h }, p1, [x23, x15, LSL #1]\n"
+ "ldr x23, [x14, #0x40]\n"
+ "fmin z24.h, p3/M, z24.h, z19.h\n"
+ "fmin z11.h, p3/M, z11.h, z19.h\n"
+ "st1h { z20.h }, p1, [x22, x15, LSL #1]\n"
+ "ldr x22, [x14, #0x48]\n"
+ "fmax z23.h, p3/M, z23.h, z16.h\n"
+ "fmax z30.h, p3/M, z30.h, z16.h\n"
+ "st1h { z24.h }, p1, [x21, x15, LSL #1]\n"
+ "ldr x21, [x14, #0x50]\n"
+ "fmax z31.h, p3/M, z31.h, z16.h\n"
+ "fmax z13.h, p3/M, z13.h, z16.h\n"
+ "st1h { z11.h }, p1, [x20, x15, LSL #1]\n"
+ "ldr x20, [x14, #0x58]\n"
+ "inch x16\n"
+ "ld1h { z9.h }, p0/Z, [x27, x17, LSL #1]\n"
+ "ld1h { z10.h }, p0/Z, [x26, x17, LSL #1]\n"
+ "fmin z23.h, p3/M, z23.h, z19.h\n"
+ "ld1h { z11.h }, p0/Z, [x25, x17, LSL #1]\n"
+ "ld1h { z12.h }, p0/Z, [x24, x17, LSL #1]\n"
+ "inch x17\n"
+ "fmin z30.h, p3/M, z30.h, z19.h\n"
+ "fmin z31.h, p3/M, z31.h, z19.h\n"
+ "fmin z13.h, p3/M, z13.h, z19.h\n"
+ "st1h { z23.h }, p1, [x23, x15, LSL #1]\n"
+ "ldr x23, [x14, #0x60]\n"
+ "fmax z15.h, p3/M, z15.h, z16.h\n"
+ "fmax z18.h, p3/M, z18.h, z16.h\n"
+ "st1h { z30.h }, p1, [x22, x15, LSL #1]\n"
+ "ldr x22, [x14, #0x68]\n"
+ "fmax z21.h, p3/M, z21.h, z16.h\n"
+ "fmax z28.h, p3/M, z28.h, z16.h\n"
+ "st1h { z31.h }, p1, [x21, x15, LSL #1]\n"
+ "ldr x21, [x14, #0x70]\n"
+ "st1h { z13.h }, p1, [x20, x15, LSL #1]\n"
+ "ldr x20, [x14, #0x78]\n"
+ "ld1h { z0.h }, p3/Z, [x7, #1, MUL VL]\n"
+ "whilelt p2.h, x16, %x[n_channels]\n"
+ "ld1h { z1.h }, p3/Z, [x7, #2, MUL VL]\n"
+ "ld1h { z2.h }, p3/Z, [x7, #3, MUL VL]\n"
+ "cmp x17, %x[n_channels]\n"
+ "fmin z15.h, p3/M, z15.h, z19.h\n"
+ "ld1h { z3.h }, p3/Z, [x7, #4, MUL VL]\n"
+ "ld1h { z4.h }, p3/Z, [x7, #5, MUL VL]\n"
+ "fmin z18.h, p3/M, z18.h, z19.h\n"
+ "fmin z21.h, p3/M, z21.h, z19.h\n"
+ "ld1h { z5.h }, p3/Z, [x7, #6, MUL VL]\n"
+ "ld1h { z6.h }, p3/Z, [x7, #7, MUL VL]\n"
+ "addvl x7, x7, #16\n"
+ "fmin z28.h, p3/M, z28.h, z19.h\n"
+ "st1h { z15.h }, p1, [x23, x15, LSL #1]\n"
+ "ld1h { z7.h }, p3/Z, [x7, #-8, MUL VL]\n"
+ "ld1h { z8.h }, p3/Z, [x7, #-7, MUL VL]\n"
+ "addvl x7, x7, #-6\n"
+ "st1h { z18.h }, p1, [x22, x15, LSL #1]\n"
+ "st1h { z21.h }, p1, [x21, x15, LSL #1]\n"
+ "st1h { z28.h }, p1, [x20, x15, LSL #1]\n"
+ "blt 1b\n"
+ "2:" // Channel tail
+ "movprfx z14, z17\n fmla z14.h, p3/M, z4.h, z9.h\n"
+ "movprfx z18, z17\n fmla z18.h, p3/M, z8.h, z9.h\n"
+ "ldr x27, [x8, #0x20]\n"
+ "ldr x24, [x8, #0x30]\n"
+ "movprfx z15, z17\n fmla z15.h, p3/M, z3.h, z9.h\n"
+ "movprfx z30, z17\n fmla z30.h, p3/M, z1.h, z9.h\n"
+ "ldr x23, [x8, #0x28]\n"
+ "ldr x22, [x8, #0x38]\n"
+ "movprfx z20, z17\n fmla z20.h, p3/M, z0.h, z9.h\n"
+ "movprfx z13, z17\n fmla z13.h, p3/M, z7.h, z9.h\n"
+ "ldr x26, [x8, #0x40]\n"
+ "ldr x21, [x8, #0x48]\n"
+ "movprfx z22, z17\n fmla z22.h, p3/M, z6.h, z9.h\n"
+ "fmla z14.h, p3/M, z5.h, z12.h\n"
+ "ldr x25, [x8, #0x50]\n"
+ "ldr x20, [x8, #0x58]\n"
+ "movprfx z27, z17\n fmla z27.h, p3/M, z5.h, z9.h\n"
+ "movprfx z31, z17\n fmla z31.h, p3/M, z2.h, z9.h\n"
+ "ld1h { z23.h }, p2/Z, [x24, x16, LSL #1]\n"
+ "ldr x13, [x8, #0x70]\n"
+ "fmla z18.h, p3/M, z0.h, z10.h\n"
+ "movprfx z9, z17\n fmla z9.h, p3/M, z2.h, z11.h\n"
+ "ld1h { z21.h }, p2/Z, [x27, x16, LSL #1]\n"
+ "ld1h { z25.h }, p2/Z, [x23, x16, LSL #1]\n"
+ "fmla z15.h, p3/M, z4.h, z12.h\n"
+ "fmla z30.h, p3/M, z2.h, z12.h\n"
+ "ldr x24, [x8, #0x60]\n"
+ "ldr x23, [x8, #0x68]\n"
+ "fmla z20.h, p3/M, z1.h, z12.h\n"
+ "fmla z13.h, p3/M, z8.h, z12.h\n"
+ "inch x15\n"
+ "mov p0.b, p2.b\n"
+ "fmla z22.h, p3/M, z7.h, z12.h\n"
+ "movprfx z28, z17\n fmla z28.h, p3/M, z6.h, z21.h\n"
+ "ld1h { z29.h }, p2/Z, [x21, x16, LSL #1]\n"
+ "ldr x28, [x8, #0x88]\n"
+ "fmla z14.h, p3/M, z7.h, z23.h\n"
+ "fmla z9.h, p3/M, z6.h, z12.h\n"
+ "ldr x12, [x14, #0x0]\n"
+ "ldr x11, [x14, #0x8]\n"
+ "movprfx z11, z17\n fmla z11.h, p3/M, z3.h, z12.h\n"
+ "movprfx z10, z17\n fmla z10.h, p3/M, z0.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x22, x16, LSL #1]\n"
+ "ldr x22, [x8, #0x78]\n"
+ "movprfx z26, z17\n fmla z26.h, p3/M, z8.h, z25.h\n"
+ "fmla z15.h, p3/M, z6.h, z23.h\n"
+ "ld1h { z21.h }, p2/Z, [x26, x16, LSL #1]\n"
+ "ldr x21, [x8, #0x80]\n"
+ "fmla z30.h, p3/M, z4.h, z23.h\n"
+ "fmla z20.h, p3/M, z3.h, z23.h\n"
+ "ldr x10, [x14, #0x10]\n"
+ "ldr x9, [x14, #0x18]\n"
+ "movprfx z25, z17\n fmla z25.h, p3/M, z1.h, z23.h\n"
+ "movprfx z24, z17\n fmla z24.h, p3/M, z0.h, z23.h\n"
+ "fmla z27.h, p3/M, z8.h, z23.h\n"
+ "fmla z31.h, p3/M, z5.h, z23.h\n"
+ "fmla z28.h, p3/M, z2.h, z23.h\n"
+ "fmla z18.h, p3/M, z1.h, z12.h\n"
+ "ld1h { z23.h }, p2/Z, [x25, x16, LSL #1]\n"
+ "ldr x27, [x8, #0x90]\n"
+ "fmla z13.h, p3/M, z0.h, z12.h\n"
+ "fmla z22.h, p3/M, z2.h, z21.h\n"
+ "ld1h { z12.h }, p2/Z, [x20, x16, LSL #1]\n"
+ "ldr x20, [x8, #0x98]\n"
+ "fmla z14.h, p3/M, z8.h, z29.h\n"
+ "fmla z9.h, p3/M, z1.h, z21.h\n"
+ "ld1h { z21.h }, p2/Z, [x24, x16, LSL #1]\n"
+ "ldr x26, [x8, #0xa0]\n"
+ "fmla z15.h, p3/M, z7.h, z29.h\n"
+ "fmla z11.h, p3/M, z6.h, z29.h\n"
+ "fmla z30.h, p3/M, z5.h, z29.h\n"
+ "fmla z20.h, p3/M, z4.h, z29.h\n"
+ "fmla z10.h, p3/M, z3.h, z29.h\n"
+ "fmla z25.h, p3/M, z2.h, z29.h\n"
+ "fmla z24.h, p3/M, z1.h, z29.h\n"
+ "fmla z26.h, p3/M, z0.h, z29.h\n"
+ "ld1h { z29.h }, p2/Z, [x23, x16, LSL #1]\n"
+ "ldr x25, [x8, #0xa8]\n"
+ "fmla z18.h, p3/M, z3.h, z23.h\n"
+ "fmla z27.h, p3/M, z0.h, z23.h\n"
+ "fmla z31.h, p3/M, z6.h, z21.h\n"
+ "fmla z28.h, p3/M, z3.h, z21.h\n"
+ "ld1h { z21.h }, p2/Z, [x13, x16, LSL #1]\n"
+ "ldr x24, [x8, #0xb0]\n"
+ "fmla z13.h, p3/M, z4.h, z29.h\n"
+ "fmla z22.h, p3/M, z3.h, z29.h\n"
+ "fmla z14.h, p3/M, z1.h, z29.h\n"
+ "fmla z9.h, p3/M, z5.h, z12.h\n"
+ "fmla z11.h, p3/M, z2.h, z12.h\n"
+ "fmla z15.h, p3/M, z0.h, z29.h\n"
+ "ld1h { z17.h }, p2/Z, [x22, x16, LSL #1]\n"
+ "ldr x23, [x8, #0xb8]\n"
+ "fmla z10.h, p3/M, z8.h, z21.h\n"
+ "fmla z26.h, p3/M, z5.h, z21.h\n"
+ "ld1h { z23.h }, p2/Z, [x21, x16, LSL #1]\n"
+ "ldr x22, [x8, #0xc0]\n"
+ "fmla z18.h, p3/M, z5.h, z29.h\n"
+ "fmla z27.h, p3/M, z2.h, z29.h\n"
+ "ld1h { z21.h }, p2/Z, [x28, x16, LSL #1]\n"
+ "ldr x21, [x8, #0xc8]\n"
+ "fmla z13.h, p3/M, z5.h, z17.h\n"
+ "fmla z22.h, p3/M, z4.h, z17.h\n"
+ "fmla z14.h, p3/M, z2.h, z17.h\n"
+ "fmla z9.h, p3/M, z3.h, z17.h\n"
+ "fmla z15.h, p3/M, z1.h, z17.h\n"
+ "fmla z11.h, p3/M, z0.h, z17.h\n"
+ "ld1h { z29.h }, p2/Z, [x20, x16, LSL #1]\n"
+ "ldr x28, [x8, #0xd8]\n"
+ "fmla z28.h, p3/M, z7.h, z23.h\n"
+ "fmla z25.h, p3/M, z6.h, z23.h\n"
+ "ld1h { z23.h }, p2/Z, [x27, x16, LSL #1]\n"
+ "ldr x20, [x8, #0xd0]\n"
+ "fmla z18.h, p3/M, z7.h, z21.h\n"
+ "fmla z13.h, p3/M, z6.h, z21.h\n"
+ "fmla z27.h, p3/M, z4.h, z21.h\n"
+ "fmla z14.h, p3/M, z3.h, z21.h\n"
+ "fmla z31.h, p3/M, z1.h, z21.h\n"
+ "fmla z30.h, p3/M, z0.h, z21.h\n"
+ "ld1h { z21.h }, p2/Z, [x26, x16, LSL #1]\n"
+ "ldr x27, [x8, #0xe0]\n"
+ "fmla z22.h, p3/M, z8.h, z29.h\n"
+ "fmla z24.h, p3/M, z8.h, z23.h\n"
+ "fmla z26.h, p3/M, z7.h, z23.h\n"
+ "ld1h { z23.h }, p2/Z, [x25, x16, LSL #1]\n"
+ "fmla z10.h, p3/M, z1.h, z29.h\n"
+ "ldr x26, [x8, #0xe8]\n"
+ "fmla z9.h, p3/M, z7.h, z29.h\n"
+ "fmla z15.h, p3/M, z5.h, z29.h\n"
+ "fmla z11.h, p3/M, z4.h, z29.h\n"
+ "fmla z20.h, p3/M, z2.h, z29.h\n"
+ "ld1h { z29.h }, p2/Z, [x24, x16, LSL #1]\n"
+ "ldr x25, [x8, #0xf0]\n"
+ "fmla z18.h, p3/M, z2.h, z21.h\n"
+ "fmla z13.h, p3/M, z1.h, z21.h\n"
+ "fmla z22.h, p3/M, z0.h, z21.h\n"
+ "fmla z27.h, p3/M, z7.h, z23.h\n"
+ "ld1h { z21.h }, p2/Z, [x23, x16, LSL #1]\n"
+ "ldr x24, [x8, #0xf8]\n"
+ "fmla z14.h, p3/M, z6.h, z23.h\n"
+ "fmla z31.h, p3/M, z4.h, z23.h\n"
+ "fmla z30.h, p3/M, z3.h, z23.h\n"
+ "fmla z28.h, p3/M, z1.h, z23.h\n"
+ "fmla z25.h, p3/M, z0.h, z23.h\n"
+ "ld1h { z17.h }, p2/Z, [x22, x16, LSL #1]\n"
+ "fmla z10.h, p3/M, z4.h, z17.h\n"
+ "ldr x23, [x8, #0x100]\n"
+ "fmla z24.h, p3/M, z2.h, z17.h\n"
+ "fmla z13.h, p3/M, z2.h, z29.h\n"
+ "fmla z22.h, p3/M, z1.h, z29.h\n"
+ "fmla z9.h, p3/M, z0.h, z29.h\n"
+ "ld1h { z23.h }, p2/Z, [x21, x16, LSL #1]\n"
+ "ldr x22, [x8, #0x108]\n"
+ "fmla z18.h, p3/M, z6.h, z21.h\n"
+ "fmla z27.h, p3/M, z3.h, z21.h\n"
+ "fmla z31.h, p3/M, z0.h, z21.h\n"
+ "fmla z15.h, p3/M, z8.h, z17.h\n"
+ "ld1h { z29.h }, p2/Z, [x20, x16, LSL #1]\n"
+ "ldr x21, [x8, #0x110]\n"
+ "fmla z11.h, p3/M, z7.h, z17.h\n"
+ "fmla z20.h, p3/M, z5.h, z17.h\n"
+ "fmla z26.h, p3/M, z1.h, z17.h\n"
+ "ld1h { z21.h }, p2/Z, [x28, x16, LSL #1]\n"
+ "fmla z10.h, p3/M, z2.h, z23.h\n"
+ "ldr x20, [x8, #0x118]\n"
+ "fmla z28.h, p3/M, z0.h, z29.h\n"
+ "fmla z25.h, p3/M, z4.h, z21.h\n"
+ "fmla z24.h, p3/M, z3.h, z21.h\n"
+ "fmla z9.h, p3/M, z8.h, z23.h\n"
+ "fmla z11.h, p3/M, z5.h, z23.h\n"
+ "fmla z27.h, p3/M, z6.h, z29.h\n"
+ "ld1h { z23.h }, p2/Z, [x27, x16, LSL #1]\n"
+ "fmla z31.h, p3/M, z3.h, z29.h\n"
+ "ld1h { z17.h }, p2/Z, [x26, x16, LSL #1]\n"
+ "fmla z30.h, p3/M, z7.h, z21.h\n"
+ "fmla z20.h, p3/M, z6.h, z21.h\n"
+ "fmla z28.h, p3/M, z5.h, z21.h\n"
+ "fmla z10.h, p3/M, z5.h, z23.h\n"
+ "fmla z26.h, p3/M, z2.h, z23.h\n"
+ "fmla z25.h, p3/M, z7.h, z17.h\n"
+ "fmla z24.h, p3/M, z6.h, z17.h\n"
+ "fmla z31.h, p3/M, z8.h, z21.h\n"
+ "ld1h { z21.h }, p2/Z, [x25, x16, LSL #1]\n"
+ "fmla z28.h, p3/M, z8.h, z17.h\n"
+ "ld1h { z12.h }, p2/Z, [x23, x16, LSL #1]\n"
+ "fmla z30.h, p3/M, z8.h, z21.h\n"
+ "fmla z20.h, p3/M, z7.h, z21.h\n"
+ "fmla z10.h, p3/M, z6.h, z21.h\n"
+ "fmla z25.h, p3/M, z5.h, z21.h\n"
+ "fmla z24.h, p3/M, z4.h, z21.h\n"
+ "fmla z26.h, p3/M, z3.h, z21.h\n"
+ "ld1h { z21.h }, p2/Z, [x22, x16, LSL #1]\n"
+ "fmla z11.h, p3/M, z8.h, z23.h\n"
+ "ld1h { z29.h }, p2/Z, [x24, x16, LSL #1]\n"
+ "fmla z18.h, p3/M, z4.h, z12.h\n"
+ "fmax z18.h, p3/M, z18.h, z16.h\n"
+ "fmla z13.h, p3/M, z3.h, z12.h\n"
+ "fmla z22.h, p3/M, z5.h, z21.h\n"
+ "fmax z13.h, p3/M, z13.h, z16.h\n"
+ "fmax z22.h, p3/M, z22.h, z16.h\n"
+ "fmla z9.h, p3/M, z4.h, z21.h\n"
+ "fmla z25.h, p3/M, z8.h, z29.h\n"
+ "fmax z9.h, p3/M, z9.h, z16.h\n"
+ "fmin z18.h, p3/M, z18.h, z19.h\n"
+ "fmla z24.h, p3/M, z7.h, z29.h\n"
+ "fmla z26.h, p3/M, z6.h, z29.h\n"
+ "ld1h { z23.h }, p2/Z, [x21, x16, LSL #1]\n"
+ "fmin z13.h, p3/M, z13.h, z19.h\n"
+ "fmla z27.h, p3/M, z1.h, z12.h\n"
+ "fmla z14.h, p3/M, z0.h, z12.h\n"
+ "ld1h { z29.h }, p2/Z, [x20, x16, LSL #1]\n"
+ "fmin z22.h, p3/M, z22.h, z19.h\n"
+ "fmla z15.h, p3/M, z2.h, z21.h\n"
+ "fmla z11.h, p3/M, z1.h, z21.h\n"
+ "fmin z9.h, p3/M, z9.h, z19.h\n"
+ "fmax z27.h, p3/M, z27.h, z16.h\n"
+ "fmla z31.h, p3/M, z7.h, z23.h\n"
+ "fmla z30.h, p3/M, z6.h, z23.h\n"
+ "fmax z14.h, p3/M, z14.h, z16.h\n"
+ "fmax z15.h, p3/M, z15.h, z16.h\n"
+ "fmla z20.h, p3/M, z8.h, z29.h\n"
+ "fmla z10.h, p3/M, z7.h, z29.h\n"
+ "fmax z11.h, p3/M, z11.h, z16.h\n"
+ "st1h { z18.h }, p0, [x12, x15, LSL #1]\n"
+ "st1h { z13.h }, p0, [x11, x15, LSL #1]\n"
+ "ldr x23, [x14, #0x20]\n"
+ "ldr x22, [x14, #0x28]\n"
+ "fmla z28.h, p3/M, z4.h, z23.h\n"
+ "st1h { z22.h }, p0, [x10, x15, LSL #1]\n"
+ "ldr x21, [x14, #0x30]\n"
+ "fmla z25.h, p3/M, z3.h, z23.h\n"
+ "fmla z24.h, p3/M, z5.h, z29.h\n"
+ "st1h { z9.h }, p0, [x9, x15, LSL #1]\n"
+ "ldr x20, [x14, #0x38]\n"
+ "fmla z26.h, p3/M, z4.h, z29.h\n"
+ "fmin z27.h, p3/M, z27.h, z19.h\n"
+ "fmin z14.h, p3/M, z14.h, z19.h\n"
+ "fmin z15.h, p3/M, z15.h, z19.h\n"
+ "st1h { z27.h }, p0, [x23, x15, LSL #1]\n"
+ "ldr x23, [x14, #0x40]\n"
+ "fmin z11.h, p3/M, z11.h, z19.h\n"
+ "fmax z31.h, p3/M, z31.h, z16.h\n"
+ "st1h { z14.h }, p0, [x22, x15, LSL #1]\n"
+ "ldr x22, [x14, #0x48]\n"
+ "fmax z30.h, p3/M, z30.h, z16.h\n"
+ "fmax z20.h, p3/M, z20.h, z16.h\n"
+ "st1h { z15.h }, p0, [x21, x15, LSL #1]\n"
+ "ldr x21, [x14, #0x50]\n"
+ "fmax z10.h, p3/M, z10.h, z16.h\n"
+ "st1h { z11.h }, p0, [x20, x15, LSL #1]\n"
+ "ldr x20, [x14, #0x58]\n"
+ "fmin z31.h, p3/M, z31.h, z19.h\n"
+ "fmin z30.h, p3/M, z30.h, z19.h\n"
+ "fmin z20.h, p3/M, z20.h, z19.h\n"
+ "st1h { z31.h }, p0, [x23, x15, LSL #1]\n"
+ "ldr x23, [x14, #0x60]\n"
+ "fmin z10.h, p3/M, z10.h, z19.h\n"
+ "fmax z28.h, p3/M, z28.h, z16.h\n"
+ "st1h { z30.h }, p0, [x22, x15, LSL #1]\n"
+ "ldr x22, [x14, #0x68]\n"
+ "fmax z25.h, p3/M, z25.h, z16.h\n"
+ "fmax z24.h, p3/M, z24.h, z16.h\n"
+ "st1h { z20.h }, p0, [x21, x15, LSL #1]\n"
+ "ldr x21, [x14, #0x70]\n"
+ "fmax z26.h, p3/M, z26.h, z16.h\n"
+ "st1h { z10.h }, p0, [x20, x15, LSL #1]\n"
+ "ldr x20, [x14, #0x78]\n"
+ "fmin z28.h, p3/M, z28.h, z19.h\n"
+ "fmin z25.h, p3/M, z25.h, z19.h\n"
+ "fmin z24.h, p3/M, z24.h, z19.h\n"
+ "st1h { z28.h }, p0, [x23, x15, LSL #1]\n"
+ "fmin z26.h, p3/M, z26.h, z19.h\n"
+ "st1h { z25.h }, p0, [x22, x15, LSL #1]\n"
+ "st1h { z24.h }, p0, [x21, x15, LSL #1]\n"
+ "st1h { z26.h }, p0, [x20, x15, LSL #1]\n"
+ :
+ : [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
new file mode 100644
index 0000000000..d8a25666bd
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl(const __fp16 *const *const input_ptrs, __fp16 *const *const outptrs, const void *params, unsigned int n_channels, const __fp16 activation_min, const __fp16 activation_max);
+void sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl(const unsigned int n_tile_rows, const unsigned int n_tile_cols, const __fp16 *inptr, int64_t ld_input_row, int64_t ld_input_col, __fp16 *outptr, int64_t ld_output_row, int64_t ld_output_col, const void *params, unsigned int n_channels, const __fp16 activation_min, const __fp16 activation_max);
+
+class sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy<__fp16, __fp16, __fp16, __fp16>
+{
+ private:
+ using Parent = DepthwiseDepthfirstStrategy<__fp16, __fp16, __fp16, __fp16>;
+ Parent::IndirectKernelType m_indirect_kernel = sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl;
+ Parent::DirectKernelType m_direct_kernel = sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl;
+
+ public:
+ using return_type = __fp16;
+ constexpr static auto vl_type = arm_gemm::VLType::SVE;
+
+ constexpr static unsigned int kernel_rows = 3;
+ constexpr static unsigned int kernel_cols = 3;
+
+ constexpr static unsigned int stride_rows = 2;
+ constexpr static unsigned int stride_cols = 2;
+
+ constexpr static unsigned int output_rows = 2;
+ constexpr static unsigned int output_cols = 2;
+
+ sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst(const CPUInfo *)
+ : Parent(output_rows, output_cols, kernel_rows, kernel_cols, stride_rows, stride_cols) {}
+
+ arm_gemm::VLType get_vl_type(void) const override { return vl_type; }
+
+ Parent::IndirectKernelType get_indirect_kernel() const override { return m_indirect_kernel; }
+ Parent::DirectKernelType get_direct_kernel() const override { return m_direct_kernel; }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp
new file mode 100644
index 0000000000..58decdba1c
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp
@@ -0,0 +1,337 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+
+namespace arm_conv {
+namespace depthwise {
+
+// Direct (tiled) entry point for the SVE fp16 NHWC 3x3 stride-2 2x2-output
+// depthwise kernel.  The C++ below only marshals the arguments into a flat
+// `Args` struct; the tile loop, channel loop and all arithmetic live in the
+// inline assembly, which addresses the struct fields through compile-time
+// offsetof() constants (the "I" operands at the bottom).
+void sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl(
+  const unsigned int n_tile_rows,
+  const unsigned int n_tile_cols,
+  const __fp16 *inptr,
+  int64_t ld_input_row,
+  int64_t ld_input_col,
+  __fp16 *outptr,
+  int64_t ld_output_row,
+  int64_t ld_output_col,
+  const void *params,
+  unsigned int n_channels,
+  const __fp16 activation_min,
+  const __fp16 activation_max
+)
+{
+  // POD argument block: every field is read (and tile_i/tile_j also written)
+  // by the asm via offsetof(), so the member order must stay in sync with the
+  // operand list of the asm statement below.
+  struct Args
+  {
+    const uint64_t n_tile_rows, n_tile_cols;
+    const __fp16 *inptr;
+    const uint64_t ld_input_row;
+    const uint64_t ld_input_col;
+    __fp16 *outptr;
+    const uint64_t ld_output_row;
+    const uint64_t ld_output_col;
+    const void *params;
+    const __fp16 min, max;
+
+    // Loop state: the asm stores the current tile coordinates here at the top
+    // of each tile iteration and reloads/advances them in the channel tail.
+    uint64_t tile_i = 0, tile_j = 0;
+
+    Args(
+      const unsigned int n_tile_rows,
+      const unsigned int n_tile_cols,
+      const __fp16 *inptr,
+      int64_t ld_input_row,
+      int64_t ld_input_col,
+      __fp16 *outptr,
+      int64_t ld_output_row,
+      int64_t ld_output_col,
+      const void *params,
+      // NOTE(review): these two parameters are `float` while the stored
+      // members (and the enclosing function's parameters) are __fp16.  The
+      // __fp16 -> float -> __fp16 round trip is value-preserving, so this is
+      // harmless, but it is inconsistent with the rest of the signature.
+      const float activation_min,
+      const float activation_max
+    ) : n_tile_rows(n_tile_rows), n_tile_cols(n_tile_cols), inptr(inptr),
+        ld_input_row(ld_input_row), ld_input_col(ld_input_col), outptr(outptr),
+        ld_output_row(ld_output_row), ld_output_col(ld_output_col),
+        params(params), min(activation_min), max(activation_max)
+    {
+    }
+  };
+
+  Args params_struct(
+    n_tile_rows, n_tile_cols,
+    inptr, ld_input_row, ld_input_col,
+    outptr, ld_output_row, ld_output_col,
+    params, activation_min, activation_max
+  );
+
+  // Register roles (as used below): z30 seeds each output accumulator via
+  // movprfx; z0-z8 hold the nine 3x3 kernel values, reloaded from the params
+  // stream (x11) every channel step; z29/z28 are the broadcast activation
+  // min/max applied with fmax/fmin before each store.  p2/p1 are the whilelt
+  // channel predicates, p0 the store predicate.  Label 1 is the tile loop,
+  // label 2 the per-tile channel loop, label 3 the channel tail that also
+  // advances (tile_i, tile_j) and branches back to label 1.
+  __asm__ __volatile__(
+    "ptrue p3.b\n"
+    "mov x11, #0x0\n"
+    "mov x16, #0x0\n"
+    "1:"  // Tile loop
+    "str x11, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+    "mov x25, #0x4\n"
+    "mov x24, #0x2\n"
+    "str x16, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+    "ldr x23, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
+    "ldr x15, [%x[params_struct], %[offsetof_args_ld_input_col]]\n"
+    "mul x22, x11, x23\n"  // offset = tile_i * ld_input_row
+    "ldr x21, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
+    "madd x22, x16, x15, x22\n"  // offset += tile_j * ld_input_col
+    "ldr x14, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
+    "cnth x13\n"
+    "mul x20, x11, x21\n"  // offset = tile_i * ld_output_row
+    "ldr x12, [%x[params_struct], %[offsetof_args_inptr]]\n"
+    "ldr x11, [%x[params_struct], %[offsetof_args_params]]\n"
+    "add x10, x15, x15\n"
+    "mul x22, x22, x25\n"  // offset *= kernel_stride * output_size
+    "add x12, x12, x22, LSL #1\n"  // inptr[0] += offset * sizeof(__fp16)
+    "ldr x9, [%x[params_struct], %[offsetof_args_outptr]]\n"
+    "add x28, x12, x23, LSL #1\n"
+    "madd x20, x16, x14, x20\n"  // offset += tile_j * ld_output_col
+    "whilelt p2.h, XZR, %x[n_channels]\n"
+    "ld1h { z30.h }, p3/Z, [x11]\n"
+    "ld1h { z0.h }, p3/Z, [x11, #1, MUL VL]\n"
+    "mul x20, x20, x24\n"  // offset *= output_tile_size
+    "ld1h { z1.h }, p3/Z, [x11, #2, MUL VL]\n"
+    "ld1h { z2.h }, p3/Z, [x11, #3, MUL VL]\n"
+    "add x27, x28, x23, LSL #1\n"
+    "ld1h { z3.h }, p3/Z, [x11, #4, MUL VL]\n"
+    "ld1h { z4.h }, p3/Z, [x11, #5, MUL VL]\n"
+    "add x26, x10, x15\n"
+    "add x25, x27, x23, LSL #1\n"
+    "ld1h { z5.h }, p3/Z, [x11, #6, MUL VL]\n"
+    "ld1h { z6.h }, p3/Z, [x11, #7, MUL VL]\n"
+    "addvl x11, x11, #16\n"
+    "add x24, x26, x15\n"
+    "add x9, x9, x20, LSL #1\n"  // outptrs[0] += offset * sizeof(__fp16)
+    "cmp x13, %x[n_channels]\n"
+    "ld1rh { z29.h }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+    "ld1rh { z28.h }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+    "add x23, x25, x23, LSL #1\n"
+    "add x22, x9, x21, LSL #1\n"
+    "ld1h { z7.h }, p3/Z, [x11, #-8, MUL VL]\n"
+    "ld1h { z8.h }, p3/Z, [x11, #-7, MUL VL]\n"
+    "mov x21, #0x0\n"
+    "sub x20, XZR, x13\n"
+    "ld1h { z9.h }, p2/Z, [x27, x10, LSL #1]\n"
+    "ld1h { z10.h }, p2/Z, [x12]\n"
+    "ld1h { z11.h }, p2/Z, [x12, x15, LSL #1]\n"
+    "ld1h { z12.h }, p2/Z, [x12, x26, LSL #1]\n"
+    "addvl x11, x11, #-6\n"
+    "ld1h { z13.h }, p2/Z, [x12, x24, LSL #1]\n"
+    "ld1h { z14.h }, p2/Z, [x28]\n"
+    "ld1h { z15.h }, p2/Z, [x28, x15, LSL #1]\n"
+    "ld1h { z16.h }, p2/Z, [x12, x10, LSL #1]\n"
+    "bge 3f\n"
+    "2:"  // Tile loop: Channel loop
+    "movprfx z27, z30\n fmla z27.h, p3/M, z8.h, z9.h\n"
+    "movprfx z26, z30\n fmla z26.h, p3/M, z6.h, z9.h\n"
+    "whilelt p1.h, x13, %x[n_channels]\n"
+    "inch x21\n"
+    "fmla z27.h, p3/M, z0.h, z10.h\n"
+    "fmla z26.h, p3/M, z1.h, z12.h\n"
+    "ld1h { z20.h }, p2/Z, [x28, x24, LSL #1]\n"
+    "inch x13\n"
+    "fmla z27.h, p3/M, z1.h, z11.h\n"
+    "fmla z26.h, p3/M, z2.h, z13.h\n"
+    "ld1h { z17.h }, p2/Z, [x28, x26, LSL #1]\n"
+    "ld1h { z19.h }, p2/Z, [x28, x10, LSL #1]\n"
+    "fmla z27.h, p3/M, z3.h, z14.h\n"
+    "fmla z26.h, p3/M, z0.h, z16.h\n"
+    "ld1h { z18.h }, p2/Z, [x25]\n"
+    "mov p0.b, p2.b\n"
+    "fmla z27.h, p3/M, z4.h, z15.h\n"
+    "fmla z26.h, p3/M, z4.h, z17.h\n"
+    "ld1h { z25.h }, p2/Z, [x27]\n"
+    "ld1h { z17.h }, p2/Z, [x25, x15, LSL #1]\n"
+    "fmla z27.h, p3/M, z2.h, z16.h\n"
+    "fmla z26.h, p3/M, z5.h, z20.h\n"
+    "ld1h { z24.h }, p2/Z, [x27, x26, LSL #1]\n"
+    "ld1h { z23.h }, p2/Z, [x27, x15, LSL #1]\n"
+    "movprfx z22, z30\n fmla z22.h, p3/M, z2.h, z9.h\n"
+    "movprfx z21, z30\n fmla z21.h, p3/M, z0.h, z9.h\n"
+    "addvl x12, x12, #1\n"
+    "addvl x28, x28, #1\n"
+    "fmla z27.h, p3/M, z5.h, z19.h\n"
+    "fmla z26.h, p3/M, z3.h, z19.h\n"
+    "ld1h { z16.h }, p2/Z, [x25, x26, LSL #1]\n"
+    "ld1h { z30.h }, p3/Z, [x11]\n"
+    "fmla z22.h, p3/M, z3.h, z18.h\n"
+    "fmla z21.h, p3/M, z4.h, z16.h\n"
+    "ld1h { z16.h }, p2/Z, [x25, x24, LSL #1]\n"
+    "ld1h { z20.h }, p2/Z, [x23, x15, LSL #1]\n"
+    "fmla z22.h, p3/M, z0.h, z25.h\n"
+    "fmla z21.h, p3/M, z1.h, z24.h\n"
+    "ld1h { z0.h }, p3/Z, [x11, #1, MUL VL]\n"
+    "inch x20\n"
+    "fmla z22.h, p3/M, z4.h, z17.h\n"
+    "fmla z21.h, p3/M, z5.h, z16.h\n"
+    "ld1h { z19.h }, p2/Z, [x27, x24, LSL #1]\n"
+    "ld1h { z18.h }, p2/Z, [x23, x26, LSL #1]\n"
+    "fmla z27.h, p3/M, z6.h, z25.h\n"
+    "fmla z22.h, p3/M, z1.h, z23.h\n"
+    "ld1h { z17.h }, p2/Z, [x23]\n"
+    "addvl x27, x27, #1\n"
+    "fmla z21.h, p3/M, z2.h, z19.h\n"
+    "fmla z27.h, p3/M, z7.h, z23.h\n"
+    "ld1h { z16.h }, p2/Z, [x25, x10, LSL #1]\n"
+    "fmax z27.h, p3/M, z27.h, z29.h\n"
+    "fmla z22.h, p3/M, z6.h, z17.h\n"
+    "fmla z21.h, p3/M, z3.h, z16.h\n"
+    "ld1h { z17.h }, p2/Z, [x23, x10, LSL #1]\n"
+    "ld1h { z1.h }, p3/Z, [x11, #2, MUL VL]\n"
+    "fmla z22.h, p3/M, z7.h, z20.h\n"
+    "fmla z21.h, p3/M, z7.h, z18.h\n"
+    "ld1h { z2.h }, p3/Z, [x11, #3, MUL VL]\n"
+    "ld1h { z3.h }, p3/Z, [x11, #4, MUL VL]\n"
+    "fmla z26.h, p3/M, z7.h, z24.h\n"
+    "fmla z22.h, p3/M, z5.h, z16.h\n"
+    "ld1h { z4.h }, p3/Z, [x11, #5, MUL VL]\n"
+    "ld1h { z5.h }, p3/Z, [x11, #6, MUL VL]\n"
+    "fmla z21.h, p3/M, z6.h, z17.h\n"
+    "fmla z26.h, p3/M, z8.h, z19.h\n"
+    "ld1h { z16.h }, p2/Z, [x23, x24, LSL #1]\n"
+    "fmax z26.h, p3/M, z26.h, z29.h\n"
+    "fmla z22.h, p3/M, z8.h, z17.h\n"
+    "fmla z21.h, p3/M, z8.h, z16.h\n"
+    "fmax z22.h, p3/M, z22.h, z29.h\n"
+    "fmax z21.h, p3/M, z21.h, z29.h\n"
+    "ld1h { z6.h }, p3/Z, [x11, #7, MUL VL]\n"
+    "addvl x11, x11, #16\n"
+    "whilelt p2.h, x21, %x[n_channels]\n"
+    "ld1h { z9.h }, p1/Z, [x27, x10, LSL #1]\n"
+    "cmp x13, %x[n_channels]\n"
+    "fmin z27.h, p3/M, z27.h, z28.h\n"
+    "ld1h { z10.h }, p1/Z, [x12]\n"
+    "ld1h { z11.h }, p1/Z, [x12, x15, LSL #1]\n"
+    "fmin z26.h, p3/M, z26.h, z28.h\n"
+    "fmin z22.h, p3/M, z22.h, z28.h\n"
+    "ld1h { z12.h }, p1/Z, [x12, x26, LSL #1]\n"
+    "ld1h { z13.h }, p1/Z, [x12, x24, LSL #1]\n"
+    "fmin z21.h, p3/M, z21.h, z28.h\n"
+    "addvl x25, x25, #1\n"
+    "ld1h { z14.h }, p1/Z, [x28]\n"
+    "ld1h { z15.h }, p1/Z, [x28, x15, LSL #1]\n"
+    "addvl x23, x23, #1\n"
+    "ld1h { z16.h }, p1/Z, [x12, x10, LSL #1]\n"
+    "st1h { z27.h }, p0, [x9]\n"
+    "ld1h { z7.h }, p3/Z, [x11, #-8, MUL VL]\n"
+    "st1h { z26.h }, p0, [x9, x14, LSL #1]\n"
+    "addvl x9, x9, #1\n"
+    "ld1h { z8.h }, p3/Z, [x11, #-7, MUL VL]\n"
+    "addvl x11, x11, #-6\n"
+    "st1h { z22.h }, p0, [x22]\n"
+    "st1h { z21.h }, p0, [x22, x14, LSL #1]\n"
+    "addvl x22, x22, #1\n"
+    "blt 2b\n"
+    "3:"  // Tile loop: Channel tail
+    "movprfx z27, z30\n fmla z27.h, p3/M, z8.h, z9.h\n"
+    "movprfx z26, z30\n fmla z26.h, p3/M, z6.h, z9.h\n"
+    "ldr x16, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+    "ldr x11, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+    "fmla z27.h, p3/M, z0.h, z10.h\n"
+    "fmla z26.h, p3/M, z1.h, z12.h\n"
+    "ld1h { z20.h }, p2/Z, [x28, x24, LSL #1]\n"
+    "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+    "fmla z27.h, p3/M, z1.h, z11.h\n"
+    "fmla z26.h, p3/M, z2.h, z13.h\n"
+    "ld1h { z17.h }, p2/Z, [x28, x26, LSL #1]\n"
+    "ld1h { z19.h }, p2/Z, [x28, x10, LSL #1]\n"
+    "fmla z27.h, p3/M, z3.h, z14.h\n"
+    "fmla z26.h, p3/M, z0.h, z16.h\n"
+    "ld1h { z18.h }, p2/Z, [x25]\n"
+    "add x16, x16, #0x1\n"
+    "fmla z27.h, p3/M, z4.h, z15.h\n"
+    "fmla z26.h, p3/M, z4.h, z17.h\n"
+    "ld1h { z25.h }, p2/Z, [x27]\n"
+    "ld1h { z17.h }, p2/Z, [x25, x15, LSL #1]\n"
+    "fmla z27.h, p3/M, z2.h, z16.h\n"
+    "fmla z26.h, p3/M, z5.h, z20.h\n"
+    "ld1h { z24.h }, p2/Z, [x27, x26, LSL #1]\n"
+    "ld1h { z23.h }, p2/Z, [x27, x15, LSL #1]\n"
+    "movprfx z22, z30\n fmla z22.h, p3/M, z2.h, z9.h\n"
+    "movprfx z21, z30\n fmla z21.h, p3/M, z0.h, z9.h\n"
+    "cmp x16, x20\n"
+    "add x21, x11, #0x1\n"
+    "fmla z27.h, p3/M, z5.h, z19.h\n"
+    "fmla z26.h, p3/M, z3.h, z19.h\n"
+    "ld1h { z16.h }, p2/Z, [x25, x26, LSL #1]\n"
+    "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
+    "fmla z22.h, p3/M, z3.h, z18.h\n"
+    "fmla z21.h, p3/M, z4.h, z16.h\n"
+    "ld1h { z16.h }, p2/Z, [x25, x24, LSL #1]\n"
+    "ld1h { z20.h }, p2/Z, [x23, x15, LSL #1]\n"
+    "fmla z22.h, p3/M, z0.h, z25.h\n"
+    "fmla z21.h, p3/M, z1.h, z24.h\n"
+    "csel x11, x11, x21, LT\n"
+    "mov p0.b, p2.b\n"
+    "fmla z22.h, p3/M, z4.h, z17.h\n"
+    "fmla z21.h, p3/M, z5.h, z16.h\n"
+    "ld1h { z19.h }, p2/Z, [x27, x24, LSL #1]\n"
+    "ld1h { z18.h }, p2/Z, [x23, x26, LSL #1]\n"
+    "fmla z27.h, p3/M, z6.h, z25.h\n"
+    "fmla z22.h, p3/M, z1.h, z23.h\n"
+    "ld1h { z17.h }, p2/Z, [x23]\n"
+    "csel x16, x16, XZR, LT\n"
+    "fmla z21.h, p3/M, z2.h, z19.h\n"
+    "fmla z27.h, p3/M, z7.h, z23.h\n"
+    "ld1h { z16.h }, p2/Z, [x25, x10, LSL #1]\n"
+    "fmax z27.h, p3/M, z27.h, z29.h\n"
+    "fmla z22.h, p3/M, z6.h, z17.h\n"
+    "fmla z21.h, p3/M, z3.h, z16.h\n"
+    "ld1h { z17.h }, p2/Z, [x23, x10, LSL #1]\n"
+    "cmp x11, x20\n"
+    "fmla z22.h, p3/M, z7.h, z20.h\n"
+    "fmla z21.h, p3/M, z7.h, z18.h\n"
+    "fmin z27.h, p3/M, z27.h, z28.h\n"
+    "st1h { z27.h }, p0, [x9]\n"
+    "fmla z26.h, p3/M, z7.h, z24.h\n"
+    "fmla z22.h, p3/M, z5.h, z16.h\n"
+    "fmla z21.h, p3/M, z6.h, z17.h\n"
+    "fmla z26.h, p3/M, z8.h, z19.h\n"
+    "ld1h { z16.h }, p2/Z, [x23, x24, LSL #1]\n"
+    "fmax z26.h, p3/M, z26.h, z29.h\n"
+    "fmla z22.h, p3/M, z8.h, z17.h\n"
+    "fmla z21.h, p3/M, z8.h, z16.h\n"
+    "fmax z22.h, p3/M, z22.h, z29.h\n"
+    "fmax z21.h, p3/M, z21.h, z29.h\n"
+    "fmin z26.h, p3/M, z26.h, z28.h\n"
+    "fmin z22.h, p3/M, z22.h, z28.h\n"
+    "st1h { z26.h }, p0, [x9, x14, LSL #1]\n"
+    "fmin z21.h, p3/M, z21.h, z28.h\n"
+    "st1h { z22.h }, p0, [x22]\n"
+    "st1h { z21.h }, p0, [x22, x14, LSL #1]\n"
+    "blt 1b\n"
+    :
+    : [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct)
+    : "cc", "memory", "p0", "p1", "p2", "p3", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30"
+  );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp
new file mode 100644
index 0000000000..d5fbb6baee
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp
@@ -0,0 +1,337 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+
+namespace arm_conv {
+namespace depthwise {
+
+// Indirect-addressing variant of the fp16 NHWC depthwise kernel (3x3
+// kernel, stride 2, 2x2 output block, SVE — per the function name): the
+// caller supplies 25 input pointers (one per element of the 5x5 input
+// patch such an output block reads) and 4 output pointers, so no strides
+// are required.  Results are clamped to [activation_min, activation_max].
+void sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl(
+  const __fp16 *const *const input_ptrs,
+  __fp16 *const *const outptrs,
+  const void *params,
+  unsigned int n_channels,
+  const __fp16 activation_min,
+  const __fp16 activation_max
+)
+{
+  // Argument block read by the inline assembly through the
+  // %[offsetof_...] operands below.
+  struct Args
+  {
+    __fp16 *const *outptrs;
+    const void *params;
+    const __fp16 min, max;
+    // Permuted copy of input_ptrs: rearranged into the order in which
+    // the assembly consumes them (it reads inptrs[] sequentially at
+    // offsets 0x0..0xc0 from x15).
+    const __fp16 *inptrs[25];
+
+    Args(
+      const __fp16 *const *const input_ptrs,
+      __fp16 *const *const outptrs,
+      const void *const params,
+      const __fp16 min,
+      const __fp16 max
+    ) : outptrs(outptrs), params(params), min(min), max(max)
+    {
+      inptrs[0] = input_ptrs[12];
+      inptrs[1] = input_ptrs[0];
+      inptrs[2] = input_ptrs[1];
+      inptrs[3] = input_ptrs[3];
+      inptrs[4] = input_ptrs[4];
+      inptrs[5] = input_ptrs[5];
+      inptrs[6] = input_ptrs[6];
+      inptrs[7] = input_ptrs[2];
+      inptrs[8] = input_ptrs[8];
+      inptrs[9] = input_ptrs[9];
+      inptrs[10] = input_ptrs[7];
+      inptrs[11] = input_ptrs[15];
+      inptrs[12] = input_ptrs[10];
+      inptrs[13] = input_ptrs[16];
+      inptrs[14] = input_ptrs[11];
+      inptrs[15] = input_ptrs[18];
+      inptrs[16] = input_ptrs[13];
+      inptrs[17] = input_ptrs[19];
+      inptrs[18] = input_ptrs[20];
+      inptrs[19] = input_ptrs[14];
+      inptrs[20] = input_ptrs[21];
+      inptrs[21] = input_ptrs[17];
+      inptrs[22] = input_ptrs[23];
+      inptrs[23] = input_ptrs[22];
+      inptrs[24] = input_ptrs[24];
+
+    }
+  };
+
+  Args params_struct(input_ptrs, outptrs, params,
+                     activation_min, activation_max);
+
+  // Vector-length-agnostic SVE loop over channels.  z20 seeds each
+  // accumulator (the first value in the params blob — presumably the
+  // bias), z0-z8 hold the nine weights; p2/p1 predicate the current and
+  // next partial channel vector, and z26/z25 broadcast the activation
+  // min/max used for the final fmax/fmin clamp.
+  __asm__ __volatile__(
+    "ptrue p3.b\n"
+    "ldr x20, [%x[params_struct], %[offsetof_args_outptrs]]\n"
+    "ldr x16, [%x[params_struct], %[offsetof_args_params]]\n"
+    "add x15, %x[params_struct], %[offsetof_Args_inptrs]\n"
+    "cnth x14\n"
+    "ldp x13, x12, [x20, #0x0]\n"
+    "ldp x11, x10, [x20, #0x10]\n"
+    "mov x9, #0x0\n"
+    "whilelt p2.h, XZR, %x[n_channels]\n"
+    "ld1h { z20.h }, p3/Z, [x16]\n"
+    "ld1h { z0.h }, p3/Z, [x16, #1, MUL VL]\n"
+    "cmp x14, %x[n_channels]\n"
+    "ld1h { z1.h }, p3/Z, [x16, #2, MUL VL]\n"
+    "ld1h { z2.h }, p3/Z, [x16, #3, MUL VL]\n"
+    "sub x28, XZR, x14\n"
+    "ld1h { z3.h }, p3/Z, [x16, #4, MUL VL]\n"
+    "ld1h { z4.h }, p3/Z, [x16, #5, MUL VL]\n"
+    "ld1h { z5.h }, p3/Z, [x16, #6, MUL VL]\n"
+    "ld1h { z6.h }, p3/Z, [x16, #7, MUL VL]\n"
+    "addvl x16, x16, #16\n"
+    "ldp x27, x26, [x15, #0x0]\n"
+    "ldp x25, x24, [x15, #0x10]\n"
+    "ldp x23, x22, [x15, #0x20]\n"
+    "ldp x21, x20, [x15, #0x30]\n"
+    "ld1rh { z26.h }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+    "ld1rh { z25.h }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+    "ld1h { z7.h }, p3/Z, [x16, #-8, MUL VL]\n"
+    "ld1h { z8.h }, p3/Z, [x16, #-7, MUL VL]\n"
+    "addvl x16, x16, #-6\n"
+    "ld1h { z9.h }, p2/Z, [x27, x9, LSL #1]\n"
+    "ld1h { z10.h }, p2/Z, [x26, x9, LSL #1]\n"
+    "ld1h { z11.h }, p2/Z, [x25, x9, LSL #1]\n"
+    "ld1h { z12.h }, p2/Z, [x24, x9, LSL #1]\n"
+    "ld1h { z13.h }, p2/Z, [x23, x9, LSL #1]\n"
+    "ld1h { z14.h }, p2/Z, [x22, x9, LSL #1]\n"
+    "ld1h { z15.h }, p2/Z, [x21, x9, LSL #1]\n"
+    "ld1h { z16.h }, p2/Z, [x20, x9, LSL #1]\n"
+    "bge 2f\n"
+    "1:" // Channel loop
+    "movprfx z24, z20\n fmla z24.h, p3/M, z8.h, z9.h\n"
+    "movprfx z23, z20\n fmla z23.h, p3/M, z6.h, z9.h\n"
+    "ldr x21, [x15, #0x40]\n"
+    "ldr x20, [x15, #0x48]\n"
+    "fmla z24.h, p3/M, z0.h, z10.h\n"
+    "fmla z23.h, p3/M, z1.h, z12.h\n"
+    "ld1h { z18.h }, p2/Z, [x20, x9, LSL #1]\n"
+    "ldr x20, [x15, #0x50]\n"
+    "fmla z24.h, p3/M, z1.h, z11.h\n"
+    "fmla z23.h, p3/M, z2.h, z13.h\n"
+    "ld1h { z17.h }, p2/Z, [x21, x9, LSL #1]\n"
+    "ld1h { z19.h }, p2/Z, [x20, x9, LSL #1]\n"
+    "fmla z24.h, p3/M, z3.h, z14.h\n"
+    "fmla z23.h, p3/M, z0.h, z16.h\n"
+    "ldr x20, [x15, #0x58]\n"
+    "ldr x22, [x15, #0x78]\n"
+    "fmla z24.h, p3/M, z4.h, z15.h\n"
+    "fmla z23.h, p3/M, z4.h, z17.h\n"
+    "ld1h { z17.h }, p2/Z, [x20, x9, LSL #1]\n"
+    "ldr x21, [x15, #0x60]\n"
+    "fmla z24.h, p3/M, z2.h, z16.h\n"
+    "fmla z23.h, p3/M, z5.h, z18.h\n"
+    "ldr x20, [x15, #0x80]\n"
+    "ld1h { z18.h }, p2/Z, [x21, x9, LSL #1]\n"
+    "movprfx z22, z20\n fmla z22.h, p3/M, z2.h, z9.h\n"
+    "movprfx z21, z20\n fmla z21.h, p3/M, z0.h, z9.h\n"
+    "ld1h { z20.h }, p2/Z, [x20, x9, LSL #1]\n"
+    "ldr x21, [x15, #0x68]\n"
+    "fmla z24.h, p3/M, z5.h, z19.h\n"
+    "fmla z23.h, p3/M, z3.h, z19.h\n"
+    "ld1h { z16.h }, p2/Z, [x22, x9, LSL #1]\n"
+    "ldr x20, [x15, #0x88]\n"
+    "fmla z22.h, p3/M, z3.h, z17.h\n"
+    "fmla z21.h, p3/M, z4.h, z16.h\n"
+    "ld1h { z17.h }, p2/Z, [x21, x9, LSL #1]\n"
+    "ld1h { z16.h }, p2/Z, [x20, x9, LSL #1]\n"
+    "fmla z22.h, p3/M, z0.h, z18.h\n"
+    "fmla z21.h, p3/M, z1.h, z20.h\n"
+    "ldr x21, [x15, #0x70]\n"
+    "ldr x20, [x15, #0x98]\n"
+    "fmla z22.h, p3/M, z4.h, z17.h\n"
+    "fmla z21.h, p3/M, z5.h, z16.h\n"
+    "ld1h { z16.h }, p2/Z, [x21, x9, LSL #1]\n"
+    "ld1h { z19.h }, p2/Z, [x20, x9, LSL #1]\n"
+    "fmla z24.h, p3/M, z6.h, z18.h\n"
+    "ldr x21, [x15, #0x90]\n"
+    "ldr x20, [x15, #0xa8]\n"
+    "fmla z22.h, p3/M, z1.h, z16.h\n"
+    "fmla z21.h, p3/M, z2.h, z19.h\n"
+    "fmla z24.h, p3/M, z7.h, z16.h\n"
+    "ld1h { z16.h }, p2/Z, [x21, x9, LSL #1]\n"
+    "ld1h { z18.h }, p2/Z, [x20, x9, LSL #1]\n"
+    "ldr x21, [x15, #0xa0]\n"
+    "ldr x20, [x15, #0xb0]\n"
+    "fmla z22.h, p3/M, z6.h, z16.h\n"
+    "fmla z21.h, p3/M, z3.h, z18.h\n"
+    "ld1h { z17.h }, p2/Z, [x21, x9, LSL #1]\n"
+    "ld1h { z16.h }, p2/Z, [x20, x9, LSL #1]\n"
+    "fmla z22.h, p3/M, z7.h, z17.h\n"
+    "fmla z21.h, p3/M, z7.h, z16.h\n"
+    "ldr x20, [x15, #0xb8]\n"
+    "fmla z23.h, p3/M, z7.h, z20.h\n"
+    "ld1h { z17.h }, p2/Z, [x20, x9, LSL #1]\n"
+    "fmla z22.h, p3/M, z5.h, z18.h\n"
+    "ldr x20, [x15, #0xc0]\n"
+    "fmla z21.h, p3/M, z6.h, z17.h\n"
+    "fmla z23.h, p3/M, z8.h, z19.h\n"
+    "ld1h { z16.h }, p2/Z, [x20, x9, LSL #1]\n"
+    "fmla z22.h, p3/M, z8.h, z17.h\n"
+    "fmla z21.h, p3/M, z8.h, z16.h\n"
+    "whilelt p1.h, x14, %x[n_channels]\n"
+    "ldp x27, x26, [x15, #0x0]\n"
+    "ldp x25, x24, [x15, #0x10]\n"
+    "ldp x23, x22, [x15, #0x20]\n"
+    "inch x9\n"
+    "fmax z24.h, p3/M, z24.h, z26.h\n"
+    "ldp x21, x20, [x15, #0x30]\n"
+    "ld1h { z9.h }, p1/Z, [x27, x14, LSL #1]\n"
+    "fmax z23.h, p3/M, z23.h, z26.h\n"
+    "fmax z22.h, p3/M, z22.h, z26.h\n"
+    "ld1h { z10.h }, p1/Z, [x26, x14, LSL #1]\n"
+    "ld1h { z11.h }, p1/Z, [x25, x14, LSL #1]\n"
+    "fmax z21.h, p3/M, z21.h, z26.h\n"
+    "inch x28\n"
+    "ld1h { z12.h }, p1/Z, [x24, x14, LSL #1]\n"
+    "ld1h { z13.h }, p1/Z, [x23, x14, LSL #1]\n"
+    "mov p0.b, p2.b\n"
+    "whilelt p2.h, x9, %x[n_channels]\n"
+    "ld1h { z14.h }, p1/Z, [x22, x14, LSL #1]\n"
+    "ld1h { z15.h }, p1/Z, [x21, x14, LSL #1]\n"
+    "fmin z24.h, p3/M, z24.h, z25.h\n"
+    "fmin z23.h, p3/M, z23.h, z25.h\n"
+    "ld1h { z16.h }, p1/Z, [x20, x14, LSL #1]\n"
+    "inch x14\n"
+    "ld1h { z20.h }, p3/Z, [x16]\n"
+    "cmp x14, %x[n_channels]\n"
+    "ld1h { z0.h }, p3/Z, [x16, #1, MUL VL]\n"
+    "ld1h { z1.h }, p3/Z, [x16, #2, MUL VL]\n"
+    "fmin z22.h, p3/M, z22.h, z25.h\n"
+    "fmin z21.h, p3/M, z21.h, z25.h\n"
+    "ld1h { z2.h }, p3/Z, [x16, #3, MUL VL]\n"
+    "ld1h { z3.h }, p3/Z, [x16, #4, MUL VL]\n"
+    "st1h { z24.h }, p0, [x13, x28, LSL #1]\n"
+    "ld1h { z4.h }, p3/Z, [x16, #5, MUL VL]\n"
+    "ld1h { z5.h }, p3/Z, [x16, #6, MUL VL]\n"
+    "st1h { z23.h }, p0, [x12, x28, LSL #1]\n"
+    "ld1h { z6.h }, p3/Z, [x16, #7, MUL VL]\n"
+    "addvl x16, x16, #16\n"
+    "st1h { z22.h }, p0, [x11, x28, LSL #1]\n"
+    "ld1h { z7.h }, p3/Z, [x16, #-8, MUL VL]\n"
+    "st1h { z21.h }, p0, [x10, x28, LSL #1]\n"
+    "ld1h { z8.h }, p3/Z, [x16, #-7, MUL VL]\n"
+    "addvl x16, x16, #-6\n"
+    "blt 1b\n"
+    "2:" // Channel tail
+    "movprfx z24, z20\n fmla z24.h, p3/M, z8.h, z9.h\n"
+    "movprfx z23, z20\n fmla z23.h, p3/M, z6.h, z9.h\n"
+    "ldr x21, [x15, #0x40]\n"
+    "ldr x20, [x15, #0x48]\n"
+    "fmla z24.h, p3/M, z0.h, z10.h\n"
+    "fmla z23.h, p3/M, z1.h, z12.h\n"
+    "ld1h { z18.h }, p2/Z, [x20, x9, LSL #1]\n"
+    "ldr x20, [x15, #0x50]\n"
+    "fmla z24.h, p3/M, z1.h, z11.h\n"
+    "fmla z23.h, p3/M, z2.h, z13.h\n"
+    "ld1h { z17.h }, p2/Z, [x21, x9, LSL #1]\n"
+    "ld1h { z19.h }, p2/Z, [x20, x9, LSL #1]\n"
+    "fmla z24.h, p3/M, z3.h, z14.h\n"
+    "fmla z23.h, p3/M, z0.h, z16.h\n"
+    "ldr x20, [x15, #0x58]\n"
+    "ldr x22, [x15, #0x78]\n"
+    "fmla z24.h, p3/M, z4.h, z15.h\n"
+    "fmla z23.h, p3/M, z4.h, z17.h\n"
+    "ld1h { z17.h }, p2/Z, [x20, x9, LSL #1]\n"
+    "ldr x21, [x15, #0x60]\n"
+    "fmla z24.h, p3/M, z2.h, z16.h\n"
+    "fmla z23.h, p3/M, z5.h, z18.h\n"
+    "ldr x20, [x15, #0x80]\n"
+    "ld1h { z18.h }, p2/Z, [x21, x9, LSL #1]\n"
+    "movprfx z22, z20\n fmla z22.h, p3/M, z2.h, z9.h\n"
+    "movprfx z21, z20\n fmla z21.h, p3/M, z0.h, z9.h\n"
+    "ld1h { z20.h }, p2/Z, [x20, x9, LSL #1]\n"
+    "ldr x21, [x15, #0x68]\n"
+    "fmla z24.h, p3/M, z5.h, z19.h\n"
+    "fmla z23.h, p3/M, z3.h, z19.h\n"
+    "ld1h { z16.h }, p2/Z, [x22, x9, LSL #1]\n"
+    "ldr x20, [x15, #0x88]\n"
+    "fmla z22.h, p3/M, z3.h, z17.h\n"
+    "fmla z21.h, p3/M, z4.h, z16.h\n"
+    "ld1h { z17.h }, p2/Z, [x21, x9, LSL #1]\n"
+    "ld1h { z16.h }, p2/Z, [x20, x9, LSL #1]\n"
+    "fmla z22.h, p3/M, z0.h, z18.h\n"
+    "fmla z21.h, p3/M, z1.h, z20.h\n"
+    "ldr x21, [x15, #0x70]\n"
+    "ldr x20, [x15, #0x98]\n"
+    "fmla z22.h, p3/M, z4.h, z17.h\n"
+    "fmla z21.h, p3/M, z5.h, z16.h\n"
+    "ld1h { z16.h }, p2/Z, [x21, x9, LSL #1]\n"
+    "ld1h { z19.h }, p2/Z, [x20, x9, LSL #1]\n"
+    "fmla z24.h, p3/M, z6.h, z18.h\n"
+    "ldr x21, [x15, #0x90]\n"
+    "ldr x20, [x15, #0xa8]\n"
+    "fmla z22.h, p3/M, z1.h, z16.h\n"
+    "fmla z21.h, p3/M, z2.h, z19.h\n"
+    "fmla z24.h, p3/M, z7.h, z16.h\n"
+    "ld1h { z16.h }, p2/Z, [x21, x9, LSL #1]\n"
+    "ld1h { z18.h }, p2/Z, [x20, x9, LSL #1]\n"
+    "ldr x21, [x15, #0xa0]\n"
+    "ldr x20, [x15, #0xb0]\n"
+    "fmla z22.h, p3/M, z6.h, z16.h\n"
+    "fmla z21.h, p3/M, z3.h, z18.h\n"
+    "ld1h { z17.h }, p2/Z, [x21, x9, LSL #1]\n"
+    "ld1h { z16.h }, p2/Z, [x20, x9, LSL #1]\n"
+    "fmla z22.h, p3/M, z7.h, z17.h\n"
+    "fmla z21.h, p3/M, z7.h, z16.h\n"
+    "ldr x20, [x15, #0xb8]\n"
+    "fmla z23.h, p3/M, z7.h, z20.h\n"
+    "ld1h { z17.h }, p2/Z, [x20, x9, LSL #1]\n"
+    "fmla z22.h, p3/M, z5.h, z18.h\n"
+    "ldr x20, [x15, #0xc0]\n"
+    "fmla z21.h, p3/M, z6.h, z17.h\n"
+    "fmla z23.h, p3/M, z8.h, z19.h\n"
+    "ld1h { z16.h }, p2/Z, [x20, x9, LSL #1]\n"
+    "fmla z22.h, p3/M, z8.h, z17.h\n"
+    "fmla z21.h, p3/M, z8.h, z16.h\n"
+    "inch x28\n"
+    "mov p0.b, p2.b\n"
+    "fmax z24.h, p3/M, z24.h, z26.h\n"
+    "fmax z23.h, p3/M, z23.h, z26.h\n"
+    "fmax z22.h, p3/M, z22.h, z26.h\n"
+    "fmax z21.h, p3/M, z21.h, z26.h\n"
+    "fmin z24.h, p3/M, z24.h, z25.h\n"
+    "fmin z23.h, p3/M, z23.h, z25.h\n"
+    "st1h { z24.h }, p0, [x13, x28, LSL #1]\n"
+    "fmin z22.h, p3/M, z22.h, z25.h\n"
+    "fmin z21.h, p3/M, z21.h, z25.h\n"
+    "st1h { z23.h }, p0, [x12, x28, LSL #1]\n"
+    "st1h { z22.h }, p0, [x11, x28, LSL #1]\n"
+    "st1h { z21.h }, p0, [x10, x28, LSL #1]\n"
+    :
+    : [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
+    : "cc", "memory", "p0", "p1", "p2", "p3", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26"
+  );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
new file mode 100644
index 0000000000..abdfac5a3f
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_indirect_impl(const __fp16 *const *const input_ptrs, __fp16 *const *const outptrs, const void *params, unsigned int n_channels, const __fp16 activation_min, const __fp16 activation_max);
+void sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl(const unsigned int n_tile_rows, const unsigned int n_tile_cols, const __fp16 *inptr, int64_t ld_input_row, int64_t ld_input_col, __fp16 *outptr, int64_t ld_output_row, int64_t ld_output_col, const void *params, unsigned int n_channels, const __fp16 activation_min, const __fp16 activation_max);
+
+// Strategy descriptor for the fp16 NHWC 5x5 stride-1 depthwise kernel
+// (SVE, 2x2 output block).  It exposes the kernel geometry as constants
+// and hands the direct/indirect implementation functions declared above
+// to the depthwise framework via the Parent interface.
+class sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy<__fp16, __fp16, __fp16, __fp16>
+{
+  private:
+  using Parent = DepthwiseDepthfirstStrategy<__fp16, __fp16, __fp16, __fp16>;
+  // Function pointers to the generated kernel bodies.
+  Parent::IndirectKernelType m_indirect_kernel = sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_indirect_impl;
+  Parent::DirectKernelType m_direct_kernel = sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl;
+
+  public:
+  using return_type = __fp16;
+  constexpr static auto vl_type = arm_gemm::VLType::SVE;
+
+  // Kernel geometry: 5x5 window, unit stride, 2x2 outputs per call.
+  constexpr static unsigned int kernel_rows = 5;
+  constexpr static unsigned int kernel_cols = 5;
+
+  constexpr static unsigned int stride_rows = 1;
+  constexpr static unsigned int stride_cols = 1;
+
+  constexpr static unsigned int output_rows = 2;
+  constexpr static unsigned int output_cols = 2;
+
+  // CPUInfo is unused here; the parameter keeps the constructor signature
+  // uniform across strategy classes.
+  sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst(const CPUInfo *)
+  : Parent(output_rows, output_cols, kernel_rows, kernel_cols, stride_rows, stride_cols) {}
+
+  arm_gemm::VLType get_vl_type(void) const override { return vl_type; }
+
+  Parent::IndirectKernelType get_indirect_kernel() const override { return m_indirect_kernel; }
+  Parent::DirectKernelType get_direct_kernel() const override { return m_direct_kernel; }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_direct.cpp
new file mode 100644
index 0000000000..fdbee67926
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_direct.cpp
@@ -0,0 +1,523 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl(
+ const unsigned int n_tile_rows,
+ const unsigned int n_tile_cols,
+ const __fp16 *inptr,
+ int64_t ld_input_row,
+ int64_t ld_input_col,
+ __fp16 *outptr,
+ int64_t ld_output_row,
+ int64_t ld_output_col,
+ const void *params,
+ unsigned int n_channels,
+ const __fp16 activation_min,
+ const __fp16 activation_max
+)
+{
+  // Argument block read by the inline assembly of the enclosing kernel
+  // through its %[offsetof_...] operands.
+  struct Args
+  {
+    const uint64_t n_tile_rows, n_tile_cols;  // output tile grid dimensions
+    const __fp16 *inptr;                      // base input pointer
+    const uint64_t ld_input_row;              // strides in elements (the asm
+    const uint64_t ld_input_col;              // scales them by LSL #1)
+    __fp16 *outptr;                           // base output pointer
+    const uint64_t ld_output_row;
+    const uint64_t ld_output_col;
+    const void *params;                       // opaque packed-parameter blob
+    const __fp16 min, max;                    // activation clamp bounds
+
+    // Tile cursor: stored/reloaded by the assembly across the tile loop.
+    uint64_t tile_i = 0, tile_j = 0;
+
+    Args(
+      const unsigned int n_tile_rows,
+      const unsigned int n_tile_cols,
+      const __fp16 *inptr,
+      int64_t ld_input_row,
+      int64_t ld_input_col,
+      __fp16 *outptr,
+      int64_t ld_output_row,
+      int64_t ld_output_col,
+      const void *params,
+      // Take the clamp bounds as __fp16 — the type of both the members
+      // and the enclosing function's activation arguments — instead of
+      // float.  This avoids a redundant fp16 -> float -> fp16 round-trip
+      // and matches the Args blocks of the sibling fp16 kernels.
+      const __fp16 activation_min,
+      const __fp16 activation_max
+    ) : n_tile_rows(n_tile_rows), n_tile_cols(n_tile_cols), inptr(inptr),
+      ld_input_row(ld_input_row), ld_input_col(ld_input_col), outptr(outptr),
+      ld_output_row(ld_output_row), ld_output_col(ld_output_col),
+      params(params), min(activation_min), max(activation_max)
+    {
+    }
+  };
+
+ Args params_struct(
+ n_tile_rows, n_tile_cols,
+ inptr, ld_input_row, ld_input_col,
+ outptr, ld_output_row, ld_output_col,
+ params, activation_min, activation_max
+ );
+
+ __asm__ __volatile__(
+ "ptrue p3.b\n"
+ "mov x12, #0x0\n"
+ "mov x8, #0x0\n"
+ "1:" // Tile loop
+ "str x12, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "mov x25, #0x2\n"
+ "mov x24, #0x2\n"
+ "str x8, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "ldr x23, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
+ "ldr x17, [%x[params_struct], %[offsetof_args_ld_input_col]]\n"
+ "mul x22, x12, x23\n" // offset = tile_i * ld_input_row
+ "ldr x21, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
+ "madd x22, x8, x17, x22\n" // offset += tile_j * ld_input_col
+ "ldr x16, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
+ "add x15, x17, x17\n"
+ "mul x20, x12, x21\n" // offset = tile_i * ld_output_row
+ "ldr x14, [%x[params_struct], %[offsetof_args_inptr]]\n"
+ "ldr x13, [%x[params_struct], %[offsetof_args_outptr]]\n"
+ "cnth x12\n"
+ "mul x22, x22, x25\n" // offset *= kernel_stride * output_size
+ "add x14, x14, x22, LSL #1\n" // inptr[0] += offset * sizeof(__fp16)
+ "add x11, x14, x23, LSL #1\n"
+ "ldr x10, [%x[params_struct], %[offsetof_args_params]]\n"
+ "madd x20, x8, x16, x20\n" // offset += tile_j * ld_output_col
+ "add x9, x11, x23, LSL #1\n"
+ "add x28, x15, x17\n"
+ "ld1rh { z15.h }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+ "mul x20, x20, x24\n" // offset *= output_tile_size
+ "whilelt p2.h, XZR, %x[n_channels]\n"
+ "add x27, x9, x23, LSL #1\n"
+ "ld1rh { z28.h }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+ "add x26, x28, x17\n"
+ "add x25, x27, x23, LSL #1\n"
+ "ld1h { z29.h }, p3/Z, [x10]\n"
+ "ld1h { z0.h }, p3/Z, [x10, #1, MUL VL]\n"
+ "add x24, x26, x17\n"
+ "add x13, x13, x20, LSL #1\n" // outptrs[0] += offset * sizeof(__fp16)
+ "ld1h { z1.h }, p3/Z, [x10, #2, MUL VL]\n"
+ "ld1h { z2.h }, p3/Z, [x10, #3, MUL VL]\n"
+ "cmp x12, %x[n_channels]\n"
+ "add x23, x25, x23, LSL #1\n"
+ "ld1h { z3.h }, p3/Z, [x10, #4, MUL VL]\n"
+ "ld1h { z4.h }, p3/Z, [x10, #5, MUL VL]\n"
+ "add x22, x13, x21, LSL #1\n"
+ "mov x21, #0x0\n"
+ "ld1h { z5.h }, p2/Z, [x14]\n"
+ "ld1h { z6.h }, p2/Z, [x14, x17, LSL #1]\n"
+ "sub x20, XZR, x12\n"
+ "ld1h { z7.h }, p2/Z, [x11]\n"
+ "ld1h { z8.h }, p2/Z, [x11, x17, LSL #1]\n"
+ "addvl x10, x10, #6\n"
+ "ld1h { z9.h }, p2/Z, [x14, x15, LSL #1]\n"
+ "ld1h { z13.h }, p2/Z, [x11, x15, LSL #1]\n"
+ "ld1h { z11.h }, p2/Z, [x14, x28, LSL #1]\n"
+ "ld1h { z12.h }, p2/Z, [x14, x26, LSL #1]\n"
+ "ld1h { z10.h }, p2/Z, [x11, x24, LSL #1]\n"
+ "ld1h { z14.h }, p2/Z, [x9]\n"
+ "bge 3f\n"
+ "2:" // Tile loop: Channel loop
+ "movprfx z27, z29\n fmla z27.h, p3/M, z0.h, z5.h\n"
+ "movprfx z31, z29\n fmla z31.h, p3/M, z0.h, z6.h\n"
+ "ld1h { z24.h }, p2/Z, [x11, x28, LSL #1]\n"
+ "whilelt p1.h, x12, %x[n_channels]\n"
+ "movprfx z26, z29\n fmla z26.h, p3/M, z0.h, z7.h\n"
+ "movprfx z30, z29\n fmla z30.h, p3/M, z0.h, z8.h\n"
+ "ld1h { z18.h }, p3/Z, [x10]\n"
+ "inch x21\n"
+ "fmla z27.h, p3/M, z1.h, z6.h\n"
+ "fmla z31.h, p3/M, z1.h, z9.h\n"
+ "ld1h { z23.h }, p2/Z, [x11, x26, LSL #1]\n"
+ "inch x12\n"
+ "fmla z26.h, p3/M, z1.h, z8.h\n"
+ "fmla z30.h, p3/M, z1.h, z13.h\n"
+ "ld1h { z22.h }, p3/Z, [x10, #1, MUL VL]\n"
+ "mov p0.b, p2.b\n"
+ "fmla z27.h, p3/M, z2.h, z9.h\n"
+ "fmla z31.h, p3/M, z2.h, z11.h\n"
+ "ld1h { z16.h }, p2/Z, [x14, x24, LSL #1]\n"
+ "addvl x14, x14, #1\n"
+ "fmla z26.h, p3/M, z2.h, z13.h\n"
+ "fmla z30.h, p3/M, z2.h, z24.h\n"
+ "ld1h { z20.h }, p3/Z, [x10, #2, MUL VL]\n"
+ "addvl x11, x11, #1\n"
+ "fmla z27.h, p3/M, z3.h, z11.h\n"
+ "fmla z31.h, p3/M, z3.h, z12.h\n"
+ "ld1h { z0.h }, p2/Z, [x9, x17, LSL #1]\n"
+ "inch x20\n"
+ "fmla z26.h, p3/M, z3.h, z24.h\n"
+ "fmla z30.h, p3/M, z3.h, z23.h\n"
+ "ld1h { z17.h }, p3/Z, [x10, #3, MUL VL]\n"
+ "fmla z27.h, p3/M, z4.h, z12.h\n"
+ "fmla z31.h, p3/M, z4.h, z16.h\n"
+ "ld1h { z19.h }, p2/Z, [x9, x15, LSL #1]\n"
+ "ld1h { z5.h }, p2/Z, [x9, x28, LSL #1]\n"
+ "fmla z26.h, p3/M, z4.h, z23.h\n"
+ "fmla z30.h, p3/M, z4.h, z10.h\n"
+ "ld1h { z21.h }, p3/Z, [x10, #4, MUL VL]\n"
+ "fmla z27.h, p3/M, z18.h, z7.h\n"
+ "fmla z31.h, p3/M, z18.h, z8.h\n"
+ "ld1h { z7.h }, p1/Z, [x11]\n"
+ "fmla z26.h, p3/M, z18.h, z14.h\n"
+ "fmla z30.h, p3/M, z18.h, z0.h\n"
+ "ld1h { z18.h }, p3/Z, [x10, #5, MUL VL]\n"
+ "fmla z27.h, p3/M, z22.h, z8.h\n"
+ "fmla z31.h, p3/M, z22.h, z13.h\n"
+ "ld1h { z3.h }, p2/Z, [x9, x24, LSL #1]\n"
+ "fmla z26.h, p3/M, z22.h, z0.h\n"
+ "fmla z30.h, p3/M, z22.h, z19.h\n"
+ "ld1h { z8.h }, p3/Z, [x10, #6, MUL VL]\n"
+ "fmla z27.h, p3/M, z20.h, z13.h\n"
+ "fmla z31.h, p3/M, z20.h, z24.h\n"
+ "ld1h { z2.h }, p2/Z, [x9, x26, LSL #1]\n"
+ "addvl x9, x9, #1\n"
+ "fmla z26.h, p3/M, z20.h, z19.h\n"
+ "fmla z30.h, p3/M, z20.h, z5.h\n"
+ "ld1h { z16.h }, p3/Z, [x10, #7, MUL VL]\n"
+ "addvl x10, x10, #16\n"
+ "fmla z27.h, p3/M, z17.h, z24.h\n"
+ "fmla z31.h, p3/M, z17.h, z23.h\n"
+ "ld1h { z25.h }, p2/Z, [x27]\n"
+ "ld1h { z29.h }, p3/Z, [x10, #4, MUL VL]\n"
+ "fmla z26.h, p3/M, z17.h, z5.h\n"
+ "fmla z30.h, p3/M, z17.h, z2.h\n"
+ "ld1h { z17.h }, p3/Z, [x10, #-8, MUL VL]\n"
+ "fmla z27.h, p3/M, z21.h, z23.h\n"
+ "fmla z31.h, p3/M, z21.h, z10.h\n"
+ "ld1h { z24.h }, p2/Z, [x27, x17, LSL #1]\n"
+ "ld1h { z22.h }, p2/Z, [x27, x15, LSL #1]\n"
+ "fmla z26.h, p3/M, z21.h, z2.h\n"
+ "fmla z30.h, p3/M, z21.h, z3.h\n"
+ "ld1h { z21.h }, p3/Z, [x10, #-7, MUL VL]\n"
+ "fmla z27.h, p3/M, z18.h, z14.h\n"
+ "fmla z31.h, p3/M, z18.h, z0.h\n"
+ "ld1h { z1.h }, p2/Z, [x27, x24, LSL #1]\n"
+ "fmla z26.h, p3/M, z18.h, z25.h\n"
+ "fmla z30.h, p3/M, z18.h, z24.h\n"
+ "ld1h { z23.h }, p3/Z, [x10, #-6, MUL VL]\n"
+ "fmla z27.h, p3/M, z8.h, z0.h\n"
+ "fmla z31.h, p3/M, z8.h, z19.h\n"
+ "ld1h { z0.h }, p2/Z, [x27, x28, LSL #1]\n"
+ "fmla z26.h, p3/M, z8.h, z24.h\n"
+ "fmla z30.h, p3/M, z8.h, z22.h\n"
+ "ld1h { z20.h }, p3/Z, [x10, #-5, MUL VL]\n"
+ "fmla z27.h, p3/M, z16.h, z19.h\n"
+ "fmla z31.h, p3/M, z16.h, z5.h\n"
+ "ld1h { z19.h }, p2/Z, [x27, x26, LSL #1]\n"
+ "addvl x27, x27, #1\n"
+ "fmla z26.h, p3/M, z16.h, z22.h\n"
+ "fmla z30.h, p3/M, z16.h, z0.h\n"
+ "ld1h { z18.h }, p3/Z, [x10, #-4, MUL VL]\n"
+ "fmla z27.h, p3/M, z17.h, z5.h\n"
+ "fmla z31.h, p3/M, z17.h, z2.h\n"
+ "ld1h { z16.h }, p2/Z, [x25]\n"
+ "fmla z26.h, p3/M, z17.h, z0.h\n"
+ "fmla z30.h, p3/M, z17.h, z19.h\n"
+ "ld1h { z17.h }, p3/Z, [x10, #-3, MUL VL]\n"
+ "fmla z27.h, p3/M, z21.h, z2.h\n"
+ "fmla z31.h, p3/M, z21.h, z3.h\n"
+ "ld1h { z4.h }, p2/Z, [x25, x17, LSL #1]\n"
+ "ld1h { z8.h }, p2/Z, [x25, x26, LSL #1]\n"
+ "fmla z26.h, p3/M, z21.h, z19.h\n"
+ "fmla z30.h, p3/M, z21.h, z1.h\n"
+ "ld1h { z13.h }, p3/Z, [x10, #-2, MUL VL]\n"
+ "fmla z27.h, p3/M, z23.h, z25.h\n"
+ "fmla z31.h, p3/M, z23.h, z24.h\n"
+ "ld1h { z25.h }, p2/Z, [x25, x15, LSL #1]\n"
+ "fmla z26.h, p3/M, z23.h, z16.h\n"
+ "fmla z30.h, p3/M, z23.h, z4.h\n"
+ "ld1h { z5.h }, p3/Z, [x10, #-1, MUL VL]\n"
+ "fmla z27.h, p3/M, z20.h, z24.h\n"
+ "fmla z31.h, p3/M, z20.h, z22.h\n"
+ "ld1h { z24.h }, p2/Z, [x25, x28, LSL #1]\n"
+ "fmla z26.h, p3/M, z20.h, z4.h\n"
+ "fmla z30.h, p3/M, z20.h, z25.h\n"
+ "ld1h { z23.h }, p3/Z, [x10]\n"
+ "fmla z27.h, p3/M, z18.h, z22.h\n"
+ "fmla z31.h, p3/M, z18.h, z0.h\n"
+ "ld1h { z22.h }, p2/Z, [x25, x24, LSL #1]\n"
+ "addvl x25, x25, #1\n"
+ "fmla z26.h, p3/M, z18.h, z25.h\n"
+ "fmla z30.h, p3/M, z18.h, z24.h\n"
+ "ld1h { z21.h }, p3/Z, [x10, #1, MUL VL]\n"
+ "fmla z27.h, p3/M, z17.h, z0.h\n"
+ "fmla z31.h, p3/M, z17.h, z19.h\n"
+ "ld1h { z18.h }, p2/Z, [x23]\n"
+ "fmla z26.h, p3/M, z17.h, z24.h\n"
+ "fmla z30.h, p3/M, z17.h, z8.h\n"
+ "ld1h { z20.h }, p3/Z, [x10, #2, MUL VL]\n"
+ "fmla z27.h, p3/M, z13.h, z19.h\n"
+ "fmla z31.h, p3/M, z13.h, z1.h\n"
+ "ld1h { z17.h }, p2/Z, [x23, x17, LSL #1]\n"
+ "ld1h { z14.h }, p1/Z, [x9]\n"
+ "fmla z26.h, p3/M, z13.h, z8.h\n"
+ "fmla z30.h, p3/M, z13.h, z22.h\n"
+ "ld1h { z19.h }, p3/Z, [x10, #3, MUL VL]\n"
+ "fmla z27.h, p3/M, z5.h, z16.h\n"
+ "fmla z31.h, p3/M, z5.h, z4.h\n"
+ "ld1h { z16.h }, p2/Z, [x23, x15, LSL #1]\n"
+ "fmla z26.h, p3/M, z5.h, z18.h\n"
+ "fmla z30.h, p3/M, z5.h, z17.h\n"
+ "ld1h { z18.h }, p2/Z, [x23, x28, LSL #1]\n"
+ "ld1h { z0.h }, p3/Z, [x10, #5, MUL VL]\n"
+ "fmla z27.h, p3/M, z23.h, z4.h\n"
+ "fmla z31.h, p3/M, z23.h, z25.h\n"
+ "ld1h { z13.h }, p1/Z, [x11, x15, LSL #1]\n"
+ "fmla z26.h, p3/M, z23.h, z17.h\n"
+ "fmla z30.h, p3/M, z23.h, z16.h\n"
+ "ld1h { z17.h }, p2/Z, [x23, x26, LSL #1]\n"
+ "ld1h { z1.h }, p3/Z, [x10, #6, MUL VL]\n"
+ "fmla z27.h, p3/M, z21.h, z25.h\n"
+ "fmla z31.h, p3/M, z21.h, z24.h\n"
+ "ld1h { z5.h }, p1/Z, [x14]\n"
+ "fmla z26.h, p3/M, z21.h, z16.h\n"
+ "fmla z30.h, p3/M, z21.h, z18.h\n"
+ "ld1h { z16.h }, p2/Z, [x23, x24, LSL #1]\n"
+ "ld1h { z2.h }, p3/Z, [x10, #7, MUL VL]\n"
+ "fmla z27.h, p3/M, z20.h, z24.h\n"
+ "fmla z31.h, p3/M, z20.h, z8.h\n"
+ "addvl x10, x10, #16\n"
+ "whilelt p2.h, x21, %x[n_channels]\n"
+ "fmla z26.h, p3/M, z20.h, z18.h\n"
+ "fmla z30.h, p3/M, z20.h, z17.h\n"
+ "cmp x12, %x[n_channels]\n"
+ "addvl x23, x23, #1\n"
+ "fmla z27.h, p3/M, z19.h, z8.h\n"
+ "fmla z31.h, p3/M, z19.h, z22.h\n"
+ "fmax z27.h, p3/M, z27.h, z15.h\n"
+ "fmax z31.h, p3/M, z31.h, z15.h\n"
+ "fmla z26.h, p3/M, z19.h, z17.h\n"
+ "fmla z30.h, p3/M, z19.h, z16.h\n"
+ "fmax z26.h, p3/M, z26.h, z15.h\n"
+ "fmax z30.h, p3/M, z30.h, z15.h\n"
+ "fmin z27.h, p3/M, z27.h, z28.h\n"
+ "fmin z31.h, p3/M, z31.h, z28.h\n"
+ "ld1h { z6.h }, p1/Z, [x14, x17, LSL #1]\n"
+ "ld1h { z8.h }, p1/Z, [x11, x17, LSL #1]\n"
+ "fmin z26.h, p3/M, z26.h, z28.h\n"
+ "fmin z30.h, p3/M, z30.h, z28.h\n"
+ "ld1h { z9.h }, p1/Z, [x14, x15, LSL #1]\n"
+ "ld1h { z11.h }, p1/Z, [x14, x28, LSL #1]\n"
+ "ld1h { z12.h }, p1/Z, [x14, x26, LSL #1]\n"
+ "ld1h { z10.h }, p1/Z, [x11, x24, LSL #1]\n"
+ "st1h { z27.h }, p0, [x13]\n"
+ "st1h { z31.h }, p0, [x13, x16, LSL #1]\n"
+ "addvl x13, x13, #1\n"
+ "ld1h { z3.h }, p3/Z, [x10, #-8, MUL VL]\n"
+ "ld1h { z4.h }, p3/Z, [x10, #-7, MUL VL]\n"
+ "st1h { z26.h }, p0, [x22]\n"
+ "addvl x10, x10, #-6\n"
+ "st1h { z30.h }, p0, [x22, x16, LSL #1]\n"
+ "addvl x22, x22, #1\n"
+ "blt 2b\n"
+ "3:" // Tile loop: Channel tail
+ "movprfx z30, z29\n fmla z30.h, p3/M, z0.h, z5.h\n"
+ "movprfx z31, z29\n fmla z31.h, p3/M, z0.h, z6.h\n"
+ "ld1h { z22.h }, p2/Z, [x11, x28, LSL #1]\n"
+ "ldr x8, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "movprfx z5, z29\n fmla z5.h, p3/M, z0.h, z7.h\n"
+ "fmla z29.h, p3/M, z0.h, z8.h\n"
+ "ld1h { z20.h }, p3/Z, [x10]\n"
+ "ldr x12, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "fmla z30.h, p3/M, z1.h, z6.h\n"
+ "fmla z31.h, p3/M, z1.h, z9.h\n"
+ "ld1h { z6.h }, p2/Z, [x11, x26, LSL #1]\n"
+ "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+ "fmla z5.h, p3/M, z1.h, z8.h\n"
+ "fmla z29.h, p3/M, z1.h, z13.h\n"
+ "ld1h { z19.h }, p3/Z, [x10, #1, MUL VL]\n"
+ "add x8, x8, #0x1\n"
+ "fmla z30.h, p3/M, z2.h, z9.h\n"
+ "fmla z31.h, p3/M, z2.h, z11.h\n"
+ "ld1h { z16.h }, p2/Z, [x14, x24, LSL #1]\n"
+ "cmp x8, x20\n"
+ "fmla z5.h, p3/M, z2.h, z13.h\n"
+ "fmla z29.h, p3/M, z2.h, z22.h\n"
+ "ld1h { z18.h }, p3/Z, [x10, #2, MUL VL]\n"
+ "add x21, x12, #0x1\n"
+ "fmla z30.h, p3/M, z3.h, z11.h\n"
+ "fmla z31.h, p3/M, z3.h, z12.h\n"
+ "ld1h { z1.h }, p2/Z, [x9, x17, LSL #1]\n"
+ "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
+ "fmla z5.h, p3/M, z3.h, z22.h\n"
+ "fmla z29.h, p3/M, z3.h, z6.h\n"
+ "ld1h { z17.h }, p3/Z, [x10, #3, MUL VL]\n"
+ "csel x12, x12, x21, LT\n"
+ "fmla z30.h, p3/M, z4.h, z12.h\n"
+ "fmla z31.h, p3/M, z4.h, z16.h\n"
+ "ld1h { z0.h }, p2/Z, [x9, x15, LSL #1]\n"
+ "ld1h { z27.h }, p2/Z, [x9, x28, LSL #1]\n"
+ "fmla z5.h, p3/M, z4.h, z6.h\n"
+ "fmla z29.h, p3/M, z4.h, z10.h\n"
+ "ld1h { z16.h }, p3/Z, [x10, #4, MUL VL]\n"
+ "mov p0.b, p2.b\n"
+ "fmla z30.h, p3/M, z20.h, z7.h\n"
+ "fmla z31.h, p3/M, z20.h, z8.h\n"
+ "csel x8, x8, XZR, LT\n"
+ "cmp x12, x20\n"
+ "fmla z5.h, p3/M, z20.h, z14.h\n"
+ "fmla z29.h, p3/M, z20.h, z1.h\n"
+ "ld1h { z21.h }, p3/Z, [x10, #5, MUL VL]\n"
+ "fmla z30.h, p3/M, z19.h, z8.h\n"
+ "fmla z31.h, p3/M, z19.h, z13.h\n"
+ "ld1h { z26.h }, p2/Z, [x9, x24, LSL #1]\n"
+ "fmla z5.h, p3/M, z19.h, z1.h\n"
+ "fmla z29.h, p3/M, z19.h, z0.h\n"
+ "ld1h { z25.h }, p3/Z, [x10, #6, MUL VL]\n"
+ "fmla z30.h, p3/M, z18.h, z13.h\n"
+ "fmla z31.h, p3/M, z18.h, z22.h\n"
+ "ld1h { z24.h }, p2/Z, [x9, x26, LSL #1]\n"
+ "fmla z5.h, p3/M, z18.h, z0.h\n"
+ "fmla z29.h, p3/M, z18.h, z27.h\n"
+ "ld1h { z23.h }, p3/Z, [x10, #7, MUL VL]\n"
+ "addvl x10, x10, #16\n"
+ "fmla z30.h, p3/M, z17.h, z22.h\n"
+ "fmla z31.h, p3/M, z17.h, z6.h\n"
+ "ld1h { z22.h }, p2/Z, [x27]\n"
+ "fmla z5.h, p3/M, z17.h, z27.h\n"
+ "fmla z29.h, p3/M, z17.h, z24.h\n"
+ "ld1h { z20.h }, p3/Z, [x10, #-8, MUL VL]\n"
+ "fmla z30.h, p3/M, z16.h, z6.h\n"
+ "fmla z31.h, p3/M, z16.h, z10.h\n"
+ "ld1h { z19.h }, p2/Z, [x27, x17, LSL #1]\n"
+ "ld1h { z18.h }, p2/Z, [x27, x15, LSL #1]\n"
+ "fmla z5.h, p3/M, z16.h, z24.h\n"
+ "fmla z29.h, p3/M, z16.h, z26.h\n"
+ "ld1h { z16.h }, p3/Z, [x10, #-7, MUL VL]\n"
+ "fmla z30.h, p3/M, z21.h, z14.h\n"
+ "fmla z31.h, p3/M, z21.h, z1.h\n"
+ "ld1h { z17.h }, p2/Z, [x27, x24, LSL #1]\n"
+ "fmla z5.h, p3/M, z21.h, z22.h\n"
+ "fmla z29.h, p3/M, z21.h, z19.h\n"
+ "ld1h { z21.h }, p3/Z, [x10, #-6, MUL VL]\n"
+ "fmla z30.h, p3/M, z25.h, z1.h\n"
+ "fmla z31.h, p3/M, z25.h, z0.h\n"
+ "ld1h { z7.h }, p2/Z, [x27, x28, LSL #1]\n"
+ "fmla z5.h, p3/M, z25.h, z19.h\n"
+ "fmla z29.h, p3/M, z25.h, z18.h\n"
+ "ld1h { z10.h }, p3/Z, [x10, #-5, MUL VL]\n"
+ "fmla z30.h, p3/M, z23.h, z0.h\n"
+ "fmla z31.h, p3/M, z23.h, z27.h\n"
+ "ld1h { z11.h }, p2/Z, [x27, x26, LSL #1]\n"
+ "fmla z5.h, p3/M, z23.h, z18.h\n"
+ "fmla z29.h, p3/M, z23.h, z7.h\n"
+ "ld1h { z6.h }, p3/Z, [x10, #-4, MUL VL]\n"
+ "fmla z30.h, p3/M, z20.h, z27.h\n"
+ "fmla z31.h, p3/M, z20.h, z24.h\n"
+ "ld1h { z0.h }, p2/Z, [x25]\n"
+ "fmla z5.h, p3/M, z20.h, z7.h\n"
+ "fmla z29.h, p3/M, z20.h, z11.h\n"
+ "ld1h { z9.h }, p3/Z, [x10, #-3, MUL VL]\n"
+ "fmla z30.h, p3/M, z16.h, z24.h\n"
+ "fmla z31.h, p3/M, z16.h, z26.h\n"
+ "ld1h { z3.h }, p2/Z, [x25, x17, LSL #1]\n"
+ "ld1h { z27.h }, p2/Z, [x25, x26, LSL #1]\n"
+ "fmla z5.h, p3/M, z16.h, z11.h\n"
+ "fmla z29.h, p3/M, z16.h, z17.h\n"
+ "ld1h { z16.h }, p3/Z, [x10, #-2, MUL VL]\n"
+ "fmla z30.h, p3/M, z21.h, z22.h\n"
+ "fmla z31.h, p3/M, z21.h, z19.h\n"
+ "ld1h { z26.h }, p2/Z, [x25, x15, LSL #1]\n"
+ "fmla z5.h, p3/M, z21.h, z0.h\n"
+ "fmla z29.h, p3/M, z21.h, z3.h\n"
+ "ld1h { z25.h }, p3/Z, [x10, #-1, MUL VL]\n"
+ "fmla z30.h, p3/M, z10.h, z19.h\n"
+ "fmla z31.h, p3/M, z10.h, z18.h\n"
+ "ld1h { z24.h }, p2/Z, [x25, x28, LSL #1]\n"
+ "fmla z5.h, p3/M, z10.h, z3.h\n"
+ "fmla z29.h, p3/M, z10.h, z26.h\n"
+ "ld1h { z23.h }, p3/Z, [x10]\n"
+ "fmla z30.h, p3/M, z6.h, z18.h\n"
+ "fmla z31.h, p3/M, z6.h, z7.h\n"
+ "ld1h { z22.h }, p2/Z, [x25, x24, LSL #1]\n"
+ "fmla z5.h, p3/M, z6.h, z26.h\n"
+ "fmla z29.h, p3/M, z6.h, z24.h\n"
+ "ld1h { z21.h }, p3/Z, [x10, #1, MUL VL]\n"
+ "fmla z30.h, p3/M, z9.h, z7.h\n"
+ "fmla z31.h, p3/M, z9.h, z11.h\n"
+ "ld1h { z18.h }, p2/Z, [x23]\n"
+ "fmla z5.h, p3/M, z9.h, z24.h\n"
+ "fmla z29.h, p3/M, z9.h, z27.h\n"
+ "ld1h { z20.h }, p3/Z, [x10, #2, MUL VL]\n"
+ "fmla z30.h, p3/M, z16.h, z11.h\n"
+ "fmla z31.h, p3/M, z16.h, z17.h\n"
+ "ld1h { z17.h }, p2/Z, [x23, x17, LSL #1]\n"
+ "fmla z5.h, p3/M, z16.h, z27.h\n"
+ "fmla z29.h, p3/M, z16.h, z22.h\n"
+ "ld1h { z19.h }, p3/Z, [x10, #3, MUL VL]\n"
+ "fmla z30.h, p3/M, z25.h, z0.h\n"
+ "fmla z31.h, p3/M, z25.h, z3.h\n"
+ "ld1h { z16.h }, p2/Z, [x23, x15, LSL #1]\n"
+ "fmla z5.h, p3/M, z25.h, z18.h\n"
+ "fmla z29.h, p3/M, z25.h, z17.h\n"
+ "ld1h { z18.h }, p2/Z, [x23, x28, LSL #1]\n"
+ "fmla z30.h, p3/M, z23.h, z3.h\n"
+ "fmla z31.h, p3/M, z23.h, z26.h\n"
+ "fmla z5.h, p3/M, z23.h, z17.h\n"
+ "fmla z29.h, p3/M, z23.h, z16.h\n"
+ "ld1h { z17.h }, p2/Z, [x23, x26, LSL #1]\n"
+ "fmla z30.h, p3/M, z21.h, z26.h\n"
+ "fmla z31.h, p3/M, z21.h, z24.h\n"
+ "fmla z5.h, p3/M, z21.h, z16.h\n"
+ "fmla z29.h, p3/M, z21.h, z18.h\n"
+ "ld1h { z16.h }, p2/Z, [x23, x24, LSL #1]\n"
+ "fmla z30.h, p3/M, z20.h, z24.h\n"
+ "fmla z31.h, p3/M, z20.h, z27.h\n"
+ "fmla z5.h, p3/M, z20.h, z18.h\n"
+ "fmla z29.h, p3/M, z20.h, z17.h\n"
+ "fmla z30.h, p3/M, z19.h, z27.h\n"
+ "fmla z31.h, p3/M, z19.h, z22.h\n"
+ "fmax z30.h, p3/M, z30.h, z15.h\n"
+ "fmax z31.h, p3/M, z31.h, z15.h\n"
+ "fmla z5.h, p3/M, z19.h, z17.h\n"
+ "fmla z29.h, p3/M, z19.h, z16.h\n"
+ "fmax z5.h, p3/M, z5.h, z15.h\n"
+ "fmax z29.h, p3/M, z29.h, z15.h\n"
+ "fmin z30.h, p3/M, z30.h, z28.h\n"
+ "fmin z31.h, p3/M, z31.h, z28.h\n"
+ "st1h { z30.h }, p0, [x13]\n"
+ "fmin z5.h, p3/M, z5.h, z28.h\n"
+ "fmin z29.h, p3/M, z29.h, z28.h\n"
+ "st1h { z31.h }, p0, [x13, x16, LSL #1]\n"
+ "st1h { z5.h }, p0, [x22]\n"
+ "st1h { z29.h }, p0, [x22, x16, LSL #1]\n"
+ "blt 1b\n"
+ :
+ : [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_indirect.cpp
new file mode 100644
index 0000000000..1ec0cb2cbf
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_indirect.cpp
@@ -0,0 +1,551 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_indirect_impl(  // fp16 NHWC 5x5/stride-1 depthwise kernel, 2x2 output tile, indirect (pointer-array) addressing; generated SVE asm
+ const __fp16 *const *const input_ptrs,  // 36 pointers = the 6x6 input patch feeding one 2x2 output tile
+ __fp16 *const *const outptrs,  // 4 pointers = the 2x2 output tile
+ const void *params,  // packed weights + bias, consumed sequentially by the asm
+ unsigned int n_channels,
+ const __fp16 activation_min,
+ const __fp16 activation_max
+)
+{
+ struct Args  // plain struct handed to the asm block by address; field offsets are taken via offsetof below
+ {
+ __fp16 *const *outptrs;
+ const void *params;
+ const __fp16 min, max;  // activation clamp bounds
+ const __fp16 *inptrs[36];  // input-patch pointers, permuted into the kernel's load order
+
+ Args(
+ const __fp16 *const *const input_ptrs,
+ __fp16 *const *const outptrs,
+ const void *const params,
+ const __fp16 min,
+ const __fp16 max
+ ) : outptrs(outptrs), params(params), min(min), max(max)
+ {
+ inptrs[0] = input_ptrs[0];  // the first 14 entries are permuted to match the asm's load schedule; the rest stay in order
+ inptrs[1] = input_ptrs[1];
+ inptrs[2] = input_ptrs[6];
+ inptrs[3] = input_ptrs[7];
+ inptrs[4] = input_ptrs[2];
+ inptrs[5] = input_ptrs[8];
+ inptrs[6] = input_ptrs[3];
+ inptrs[7] = input_ptrs[4];
+ inptrs[8] = input_ptrs[11];
+ inptrs[9] = input_ptrs[12];
+ inptrs[10] = input_ptrs[9];
+ inptrs[11] = input_ptrs[10];
+ inptrs[12] = input_ptrs[5];
+ inptrs[13] = input_ptrs[13];
+ inptrs[14] = input_ptrs[14];
+ inptrs[15] = input_ptrs[15];
+ inptrs[16] = input_ptrs[16];
+ inptrs[17] = input_ptrs[17];
+ inptrs[18] = input_ptrs[18];
+ inptrs[19] = input_ptrs[19];
+ inptrs[20] = input_ptrs[20];
+ inptrs[21] = input_ptrs[21];
+ inptrs[22] = input_ptrs[22];
+ inptrs[23] = input_ptrs[23];
+ inptrs[24] = input_ptrs[24];
+ inptrs[25] = input_ptrs[25];
+ inptrs[26] = input_ptrs[26];
+ inptrs[27] = input_ptrs[27];
+ inptrs[28] = input_ptrs[28];
+ inptrs[29] = input_ptrs[29];
+ inptrs[30] = input_ptrs[30];
+ inptrs[31] = input_ptrs[31];
+ inptrs[32] = input_ptrs[32];
+ inptrs[33] = input_ptrs[33];
+ inptrs[34] = input_ptrs[34];
+ inptrs[35] = input_ptrs[35];
+
+ }
+ };
+
+ Args params_struct(input_ptrs, outptrs, params,
+ activation_min, activation_max);
+
+ __asm__ __volatile__(  // channel-strided loop: full vectors in the body at label 1, predicated remainder at label 2
+ "ldr x20, [%x[params_struct], %[offsetof_args_outptrs]]\n"
+ "add x16, %x[params_struct], %[offsetof_Args_inptrs]\n"  // x16 = base of the inptrs[] table
+ "ldp x15, x14, [x20, #0x0]\n"  // x15/x14/x12/x11 = the four output-tile pointers
+ "mov x13, #0x0\n"  // x13 = current channel offset (elements)
+ "ldp x12, x11, [x20, #0x10]\n"
+ "whilelt p3.h, XZR, %x[n_channels]\n"  // p3 guards the (possibly partial) current vector of channels
+ "ldp x21, x20, [x16, #0x0]\n"
+ "cnth x10\n"  // x10 = fp16 elements per SVE vector
+ "ptrue p2.b\n"
+ "ldr x9, [%x[params_struct], %[offsetof_args_params]]\n"
+ "ld1h { z5.h }, p3/Z, [x21, x13, LSL #1]\n"
+ "cmp x10, %x[n_channels]\n"
+ "ld1h { z6.h }, p3/Z, [x20, x13, LSL #1]\n"
+ "ldp x27, x26, [x16, #0x10]\n"
+ "sub x28, XZR, x10\n"  // x28 = output element offset, starts at -VL and is bumped each iteration
+ "ldp x25, x24, [x16, #0x20]\n"
+ "ldp x23, x22, [x16, #0x30]\n"
+ "ldp x21, x20, [x16, #0x40]\n"
+ "ld1rh { z15.h }, p2/Z, [%x[params_struct], %[offsetof_args_min]]\n"  // broadcast activation clamp bounds to all lanes
+ "ld1rh { z28.h }, p2/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+ "ld1h { z29.h }, p2/Z, [x9]\n"  // z29 = bias; z0-z4 = first five weights
+ "ld1h { z0.h }, p2/Z, [x9, #1, MUL VL]\n"
+ "ld1h { z1.h }, p2/Z, [x9, #2, MUL VL]\n"
+ "ld1h { z2.h }, p2/Z, [x9, #3, MUL VL]\n"
+ "ld1h { z3.h }, p2/Z, [x9, #4, MUL VL]\n"
+ "ld1h { z4.h }, p2/Z, [x9, #5, MUL VL]\n"
+ "ld1h { z7.h }, p3/Z, [x27, x13, LSL #1]\n"
+ "addvl x9, x9, #6\n"
+ "ld1h { z8.h }, p3/Z, [x26, x13, LSL #1]\n"
+ "ld1h { z9.h }, p3/Z, [x25, x13, LSL #1]\n"
+ "ld1h { z13.h }, p3/Z, [x24, x13, LSL #1]\n"
+ "ld1h { z11.h }, p3/Z, [x23, x13, LSL #1]\n"
+ "ld1h { z12.h }, p3/Z, [x22, x13, LSL #1]\n"
+ "ld1h { z10.h }, p3/Z, [x21, x13, LSL #1]\n"
+ "ld1h { z14.h }, p3/Z, [x20, x13, LSL #1]\n"
+ "bge 2f\n"  // fewer than one full vector of channels: skip straight to the predicated tail
+ "1:" // Channel loop
+ "movprfx z30, z29\n fmla z30.h, p2/M, z0.h, z5.h\n"  // z30/z27/z31/z26 accumulate the four output-tile pixels, seeded from the bias
+ "movprfx z27, z29\n fmla z27.h, p2/M, z0.h, z6.h\n"
+ "ldr x20, [x16, #0x50]\n"
+ "ld1h { z5.h }, p3/Z, [x20, x13, LSL #1]\n"
+ "movprfx z31, z29\n fmla z31.h, p2/M, z0.h, z7.h\n"
+ "movprfx z26, z29\n fmla z26.h, p2/M, z0.h, z8.h\n"
+ "ldr x20, [x16, #0x58]\n"
+ "ldr x21, [x16, #0x60]\n"
+ "fmla z30.h, p2/M, z1.h, z6.h\n"
+ "fmla z27.h, p2/M, z1.h, z9.h\n"
+ "ld1h { z22.h }, p3/Z, [x20, x13, LSL #1]\n"
+ "ldr x20, [x16, #0x68]\n"
+ "fmla z31.h, p2/M, z1.h, z8.h\n"
+ "fmla z26.h, p2/M, z1.h, z13.h\n"
+ "ld1h { z21.h }, p2/Z, [x9]\n"
+ "ldr x23, [x16, #0x70]\n"
+ "fmla z30.h, p2/M, z2.h, z9.h\n"
+ "fmla z27.h, p2/M, z2.h, z11.h\n"
+ "ld1h { z20.h }, p3/Z, [x21, x13, LSL #1]\n"
+ "ld1h { z18.h }, p2/Z, [x9, #1, MUL VL]\n"
+ "fmla z31.h, p2/M, z2.h, z13.h\n"
+ "fmla z26.h, p2/M, z2.h, z5.h\n"
+ "ldr x22, [x16, #0x78]\n"
+ "ld1h { z17.h }, p2/Z, [x9, #2, MUL VL]\n"
+ "fmla z30.h, p2/M, z3.h, z11.h\n"
+ "fmla z27.h, p2/M, z3.h, z12.h\n"
+ "ld1h { z11.h }, p3/Z, [x20, x13, LSL #1]\n"
+ "ldr x21, [x16, #0x80]\n"
+ "fmla z31.h, p2/M, z3.h, z5.h\n"
+ "fmla z26.h, p2/M, z3.h, z22.h\n"
+ "ld1h { z16.h }, p2/Z, [x9, #3, MUL VL]\n"
+ "ldr x20, [x16, #0x88]\n"
+ "fmla z30.h, p2/M, z4.h, z12.h\n"
+ "fmla z27.h, p2/M, z4.h, z20.h\n"
+ "ld1h { z0.h }, p3/Z, [x23, x13, LSL #1]\n"
+ "ld1h { z29.h }, p3/Z, [x22, x13, LSL #1]\n"
+ "fmla z31.h, p2/M, z4.h, z22.h\n"
+ "fmla z26.h, p2/M, z4.h, z10.h\n"
+ "ld1h { z19.h }, p2/Z, [x9, #4, MUL VL]\n"
+ "ldr x23, [x16, #0x90]\n"
+ "fmla z30.h, p2/M, z21.h, z7.h\n"
+ "fmla z27.h, p2/M, z21.h, z8.h\n"
+ "ldr x26, [x16, #0x98]\n"
+ "ldr x22, [x16, #0xa0]\n"
+ "fmla z31.h, p2/M, z21.h, z14.h\n"
+ "fmla z26.h, p2/M, z21.h, z11.h\n"
+ "ld1h { z25.h }, p2/Z, [x9, #5, MUL VL]\n"
+ "ldr x25, [x16, #0xa8]\n"
+ "fmla z30.h, p2/M, z18.h, z8.h\n"
+ "fmla z27.h, p2/M, z18.h, z13.h\n"
+ "ld1h { z24.h }, p3/Z, [x20, x13, LSL #1]\n"
+ "ldr x24, [x16, #0xb0]\n"
+ "fmla z31.h, p2/M, z18.h, z11.h\n"
+ "fmla z26.h, p2/M, z18.h, z0.h\n"
+ "ld1h { z18.h }, p2/Z, [x9, #6, MUL VL]\n"
+ "ldr x20, [x16, #0xb8]\n"
+ "fmla z30.h, p2/M, z17.h, z13.h\n"
+ "fmla z27.h, p2/M, z17.h, z5.h\n"
+ "ld1h { z3.h }, p3/Z, [x21, x13, LSL #1]\n"
+ "ldr x21, [x16, #0xc0]\n"
+ "fmla z31.h, p2/M, z17.h, z0.h\n"
+ "fmla z26.h, p2/M, z17.h, z29.h\n"
+ "ld1h { z17.h }, p2/Z, [x9, #7, MUL VL]\n"
+ "addvl x9, x9, #16\n"  // advance params; following weight loads use negative MUL VL offsets
+ "fmla z30.h, p2/M, z16.h, z5.h\n"
+ "fmla z27.h, p2/M, z16.h, z22.h\n"
+ "ld1h { z6.h }, p3/Z, [x23, x13, LSL #1]\n"
+ "ldr x27, [x16, #0xc8]\n"
+ "fmla z31.h, p2/M, z16.h, z29.h\n"
+ "fmla z26.h, p2/M, z16.h, z3.h\n"
+ "ld1h { z16.h }, p2/Z, [x9, #-8, MUL VL]\n"
+ "ldr x23, [x16, #0xd0]\n"
+ "fmla z30.h, p2/M, z19.h, z22.h\n"
+ "fmla z27.h, p2/M, z19.h, z10.h\n"
+ "ld1h { z23.h }, p3/Z, [x26, x13, LSL #1]\n"
+ "ld1h { z22.h }, p3/Z, [x22, x13, LSL #1]\n"
+ "fmla z31.h, p2/M, z19.h, z3.h\n"
+ "fmla z26.h, p2/M, z19.h, z24.h\n"
+ "ld1h { z21.h }, p2/Z, [x9, #-7, MUL VL]\n"
+ "ldr x22, [x16, #0xd8]\n"
+ "fmla z30.h, p2/M, z25.h, z14.h\n"
+ "fmla z27.h, p2/M, z25.h, z11.h\n"
+ "ld1h { z1.h }, p3/Z, [x20, x13, LSL #1]\n"
+ "ldr x20, [x16, #0xe0]\n"
+ "fmla z31.h, p2/M, z25.h, z6.h\n"
+ "fmla z26.h, p2/M, z25.h, z23.h\n"
+ "ld1h { z20.h }, p2/Z, [x9, #-6, MUL VL]\n"
+ "ldr x26, [x16, #0xf8]\n"
+ "fmla z30.h, p2/M, z18.h, z11.h\n"
+ "fmla z27.h, p2/M, z18.h, z0.h\n"
+ "ld1h { z7.h }, p3/Z, [x25, x13, LSL #1]\n"
+ "ldr x25, [x16, #0xe8]\n"
+ "fmla z31.h, p2/M, z18.h, z23.h\n"
+ "fmla z26.h, p2/M, z18.h, z22.h\n"
+ "ld1h { z18.h }, p2/Z, [x9, #-5, MUL VL]\n"
+ "whilelt p1.h, x10, %x[n_channels]\n"  // p1 guards the loads issued for the NEXT iteration
+ "fmla z30.h, p2/M, z17.h, z0.h\n"
+ "fmla z27.h, p2/M, z17.h, z29.h\n"
+ "ld1h { z19.h }, p3/Z, [x24, x13, LSL #1]\n"
+ "ldr x24, [x16, #0xf0]\n"
+ "fmla z31.h, p2/M, z17.h, z22.h\n"
+ "fmla z26.h, p2/M, z17.h, z7.h\n"
+ "ld1h { z17.h }, p2/Z, [x9, #-4, MUL VL]\n"
+ "inch x28\n"
+ "fmla z30.h, p2/M, z16.h, z29.h\n"
+ "fmla z27.h, p2/M, z16.h, z3.h\n"
+ "ld1h { z0.h }, p3/Z, [x21, x13, LSL #1]\n"
+ "ldr x21, [x16, #0x100]\n"
+ "fmla z31.h, p2/M, z16.h, z7.h\n"
+ "fmla z26.h, p2/M, z16.h, z19.h\n"
+ "ld1h { z16.h }, p2/Z, [x9, #-3, MUL VL]\n"
+ "mov p0.b, p3.b\n"  // p0 = predicate used for this iteration's stores
+ "fmla z30.h, p2/M, z21.h, z3.h\n"
+ "fmla z27.h, p2/M, z21.h, z24.h\n"
+ "ld1h { z11.h }, p3/Z, [x27, x13, LSL #1]\n"
+ "ld1h { z13.h }, p3/Z, [x20, x13, LSL #1]\n"
+ "fmla z31.h, p2/M, z21.h, z19.h\n"
+ "fmla z26.h, p2/M, z21.h, z1.h\n"
+ "ld1h { z10.h }, p2/Z, [x9, #-2, MUL VL]\n"
+ "ldr x20, [x16, #0x108]\n"
+ "fmla z30.h, p2/M, z20.h, z6.h\n"
+ "fmla z27.h, p2/M, z20.h, z23.h\n"
+ "ld1h { z25.h }, p3/Z, [x23, x13, LSL #1]\n"
+ "ldr x23, [x16, #0x110]\n"
+ "fmla z31.h, p2/M, z20.h, z0.h\n"
+ "fmla z26.h, p2/M, z20.h, z11.h\n"
+ "ld1h { z8.h }, p2/Z, [x9, #-1, MUL VL]\n"
+ "ld1h { z29.h }, p2/Z, [x9, #4, MUL VL]\n"  // reload bias for the next iteration
+ "fmla z30.h, p2/M, z18.h, z23.h\n"
+ "fmla z27.h, p2/M, z18.h, z22.h\n"
+ "ld1h { z24.h }, p3/Z, [x22, x13, LSL #1]\n"
+ "ldr x22, [x16, #0x118]\n"
+ "fmla z31.h, p2/M, z18.h, z11.h\n"
+ "fmla z26.h, p2/M, z18.h, z25.h\n"
+ "ld1h { z23.h }, p2/Z, [x9]\n"
+ "fmla z30.h, p2/M, z17.h, z22.h\n"
+ "fmla z27.h, p2/M, z17.h, z7.h\n"
+ "ld1h { z22.h }, p3/Z, [x25, x13, LSL #1]\n"
+ "fmla z31.h, p2/M, z17.h, z25.h\n"
+ "fmla z26.h, p2/M, z17.h, z24.h\n"
+ "ld1h { z21.h }, p2/Z, [x9, #1, MUL VL]\n"
+ "fmla z30.h, p2/M, z16.h, z7.h\n"
+ "fmla z27.h, p2/M, z16.h, z19.h\n"
+ "ld1h { z18.h }, p3/Z, [x24, x13, LSL #1]\n"
+ "fmla z31.h, p2/M, z16.h, z24.h\n"
+ "fmla z26.h, p2/M, z16.h, z13.h\n"
+ "ld1h { z20.h }, p2/Z, [x9, #2, MUL VL]\n"
+ "fmla z30.h, p2/M, z10.h, z19.h\n"
+ "fmla z27.h, p2/M, z10.h, z1.h\n"
+ "ld1h { z17.h }, p3/Z, [x26, x13, LSL #1]\n"
+ "fmla z31.h, p2/M, z10.h, z13.h\n"
+ "fmla z26.h, p2/M, z10.h, z22.h\n"
+ "ld1h { z19.h }, p2/Z, [x9, #3, MUL VL]\n"
+ "fmla z30.h, p2/M, z8.h, z0.h\n"
+ "fmla z27.h, p2/M, z8.h, z11.h\n"
+ "ld1h { z16.h }, p3/Z, [x21, x13, LSL #1]\n"
+ "fmla z31.h, p2/M, z8.h, z18.h\n"
+ "fmla z26.h, p2/M, z8.h, z17.h\n"
+ "ld1h { z18.h }, p3/Z, [x20, x13, LSL #1]\n"
+ "ldp x21, x20, [x16, #0x0]\n"  // start refetching next iteration's input pointers
+ "fmla z30.h, p2/M, z23.h, z11.h\n"
+ "fmla z27.h, p2/M, z23.h, z25.h\n"
+ "ld1h { z0.h }, p2/Z, [x9, #5, MUL VL]\n"
+ "fmla z31.h, p2/M, z23.h, z17.h\n"
+ "fmla z26.h, p2/M, z23.h, z16.h\n"
+ "ld1h { z17.h }, p3/Z, [x23, x13, LSL #1]\n"
+ "ld1h { z1.h }, p2/Z, [x9, #6, MUL VL]\n"
+ "fmla z30.h, p2/M, z21.h, z25.h\n"
+ "fmla z27.h, p2/M, z21.h, z24.h\n"
+ "ld1h { z5.h }, p1/Z, [x21, x10, LSL #1]\n"
+ "fmla z31.h, p2/M, z21.h, z16.h\n"
+ "fmla z26.h, p2/M, z21.h, z18.h\n"
+ "ld1h { z16.h }, p3/Z, [x22, x13, LSL #1]\n"
+ "ldp x27, x26, [x16, #0x10]\n"
+ "fmla z30.h, p2/M, z20.h, z24.h\n"
+ "fmla z27.h, p2/M, z20.h, z13.h\n"
+ "ld1h { z6.h }, p1/Z, [x20, x10, LSL #1]\n"
+ "ldp x25, x24, [x16, #0x20]\n"
+ "fmla z31.h, p2/M, z20.h, z18.h\n"
+ "fmla z26.h, p2/M, z20.h, z17.h\n"
+ "ldp x23, x22, [x16, #0x30]\n"
+ "ldp x21, x20, [x16, #0x40]\n"
+ "fmla z30.h, p2/M, z19.h, z13.h\n"
+ "fmla z27.h, p2/M, z19.h, z22.h\n"
+ "inch x13\n"  // advance channel offset by one vector length
+ "ld1h { z7.h }, p1/Z, [x27, x10, LSL #1]\n"
+ "fmla z31.h, p2/M, z19.h, z17.h\n"
+ "fmla z26.h, p2/M, z19.h, z16.h\n"
+ "ld1h { z8.h }, p1/Z, [x26, x10, LSL #1]\n"
+ "ld1h { z9.h }, p1/Z, [x25, x10, LSL #1]\n"
+ "ld1h { z13.h }, p1/Z, [x24, x10, LSL #1]\n"
+ "ld1h { z11.h }, p1/Z, [x23, x10, LSL #1]\n"
+ "fmax z30.h, p2/M, z30.h, z15.h\n"  // apply activation_min
+ "fmax z27.h, p2/M, z27.h, z15.h\n"
+ "ld1h { z12.h }, p1/Z, [x22, x10, LSL #1]\n"
+ "ld1h { z10.h }, p1/Z, [x21, x10, LSL #1]\n"
+ "fmax z31.h, p2/M, z31.h, z15.h\n"
+ "fmax z26.h, p2/M, z26.h, z15.h\n"
+ "ld1h { z14.h }, p1/Z, [x20, x10, LSL #1]\n"
+ "inch x10\n"
+ "ld1h { z2.h }, p2/Z, [x9, #7, MUL VL]\n"
+ "addvl x9, x9, #16\n"
+ "whilelt p3.h, x13, %x[n_channels]\n"
+ "cmp x10, %x[n_channels]\n"
+ "ld1h { z3.h }, p2/Z, [x9, #-8, MUL VL]\n"
+ "ld1h { z4.h }, p2/Z, [x9, #-7, MUL VL]\n"
+ "fmin z30.h, p2/M, z30.h, z28.h\n"  // apply activation_max
+ "fmin z27.h, p2/M, z27.h, z28.h\n"
+ "st1h { z30.h }, p0, [x15, x28, LSL #1]\n"  // store the four 2x2 output-tile vectors
+ "fmin z31.h, p2/M, z31.h, z28.h\n"
+ "fmin z26.h, p2/M, z26.h, z28.h\n"
+ "st1h { z27.h }, p0, [x14, x28, LSL #1]\n"
+ "st1h { z31.h }, p0, [x12, x28, LSL #1]\n"
+ "addvl x9, x9, #-6\n"  // rewind params so the next iteration re-reads bias/first weights at the same offsets
+ "st1h { z26.h }, p0, [x11, x28, LSL #1]\n"
+ "blt 1b\n"
+ "2:" // Channel tail
+ "movprfx z30, z29\n fmla z30.h, p2/M, z0.h, z5.h\n"  // tail: same 5x5 accumulation, but no next-iteration prefetching
+ "movprfx z31, z29\n fmla z31.h, p2/M, z0.h, z6.h\n"
+ "ldr x20, [x16, #0x50]\n"
+ "ld1h { z22.h }, p3/Z, [x20, x13, LSL #1]\n"
+ "movprfx z5, z29\n fmla z5.h, p2/M, z0.h, z7.h\n"
+ "fmla z29.h, p2/M, z0.h, z8.h\n"  // z29 (bias) is reused as the fourth accumulator in the tail
+ "ldr x20, [x16, #0x58]\n"
+ "ldr x21, [x16, #0x60]\n"
+ "fmla z30.h, p2/M, z1.h, z6.h\n"
+ "fmla z31.h, p2/M, z1.h, z9.h\n"
+ "ld1h { z6.h }, p3/Z, [x20, x13, LSL #1]\n"
+ "ldr x20, [x16, #0x68]\n"
+ "fmla z5.h, p2/M, z1.h, z8.h\n"
+ "fmla z29.h, p2/M, z1.h, z13.h\n"
+ "ld1h { z20.h }, p2/Z, [x9]\n"
+ "ldr x23, [x16, #0x70]\n"
+ "fmla z30.h, p2/M, z2.h, z9.h\n"
+ "fmla z31.h, p2/M, z2.h, z11.h\n"
+ "ld1h { z16.h }, p3/Z, [x21, x13, LSL #1]\n"
+ "ld1h { z19.h }, p2/Z, [x9, #1, MUL VL]\n"
+ "fmla z5.h, p2/M, z2.h, z13.h\n"
+ "fmla z29.h, p2/M, z2.h, z22.h\n"
+ "ldr x21, [x16, #0x78]\n"
+ "ld1h { z18.h }, p2/Z, [x9, #2, MUL VL]\n"
+ "fmla z30.h, p2/M, z3.h, z11.h\n"
+ "fmla z31.h, p2/M, z3.h, z12.h\n"
+ "ld1h { z1.h }, p3/Z, [x20, x13, LSL #1]\n"
+ "ldr x22, [x16, #0x80]\n"
+ "fmla z5.h, p2/M, z3.h, z22.h\n"
+ "fmla z29.h, p2/M, z3.h, z6.h\n"
+ "ld1h { z17.h }, p2/Z, [x9, #3, MUL VL]\n"
+ "ldr x20, [x16, #0x88]\n"
+ "fmla z30.h, p2/M, z4.h, z12.h\n"
+ "fmla z31.h, p2/M, z4.h, z16.h\n"
+ "ld1h { z0.h }, p3/Z, [x23, x13, LSL #1]\n"
+ "ld1h { z27.h }, p3/Z, [x21, x13, LSL #1]\n"
+ "fmla z5.h, p2/M, z4.h, z6.h\n"
+ "fmla z29.h, p2/M, z4.h, z10.h\n"
+ "ld1h { z16.h }, p2/Z, [x9, #4, MUL VL]\n"
+ "ldr x21, [x16, #0x90]\n"
+ "fmla z30.h, p2/M, z20.h, z7.h\n"
+ "fmla z31.h, p2/M, z20.h, z8.h\n"
+ "ldr x27, [x16, #0x98]\n"
+ "ldr x26, [x16, #0xa0]\n"
+ "fmla z5.h, p2/M, z20.h, z14.h\n"
+ "fmla z29.h, p2/M, z20.h, z1.h\n"
+ "ld1h { z21.h }, p2/Z, [x9, #5, MUL VL]\n"
+ "ldr x25, [x16, #0xa8]\n"
+ "fmla z30.h, p2/M, z19.h, z8.h\n"
+ "fmla z31.h, p2/M, z19.h, z13.h\n"
+ "ld1h { z26.h }, p3/Z, [x20, x13, LSL #1]\n"
+ "ldr x24, [x16, #0xb0]\n"
+ "fmla z5.h, p2/M, z19.h, z1.h\n"
+ "fmla z29.h, p2/M, z19.h, z0.h\n"
+ "ld1h { z25.h }, p2/Z, [x9, #6, MUL VL]\n"
+ "ldr x20, [x16, #0xb8]\n"
+ "fmla z30.h, p2/M, z18.h, z13.h\n"
+ "fmla z31.h, p2/M, z18.h, z22.h\n"
+ "ld1h { z24.h }, p3/Z, [x22, x13, LSL #1]\n"
+ "ldr x23, [x16, #0xc0]\n"
+ "fmla z5.h, p2/M, z18.h, z0.h\n"
+ "fmla z29.h, p2/M, z18.h, z27.h\n"
+ "ld1h { z23.h }, p2/Z, [x9, #7, MUL VL]\n"
+ "addvl x9, x9, #16\n"
+ "fmla z30.h, p2/M, z17.h, z22.h\n"
+ "fmla z31.h, p2/M, z17.h, z6.h\n"
+ "ld1h { z22.h }, p3/Z, [x21, x13, LSL #1]\n"
+ "ldr x22, [x16, #0xc8]\n"
+ "fmla z5.h, p2/M, z17.h, z27.h\n"
+ "fmla z29.h, p2/M, z17.h, z24.h\n"
+ "ld1h { z20.h }, p2/Z, [x9, #-8, MUL VL]\n"
+ "ldr x21, [x16, #0xd0]\n"
+ "fmla z30.h, p2/M, z16.h, z6.h\n"
+ "fmla z31.h, p2/M, z16.h, z10.h\n"
+ "ld1h { z19.h }, p3/Z, [x27, x13, LSL #1]\n"
+ "ld1h { z18.h }, p3/Z, [x26, x13, LSL #1]\n"
+ "fmla z5.h, p2/M, z16.h, z24.h\n"
+ "fmla z29.h, p2/M, z16.h, z26.h\n"
+ "ld1h { z16.h }, p2/Z, [x9, #-7, MUL VL]\n"
+ "ldr x27, [x16, #0xd8]\n"
+ "fmla z30.h, p2/M, z21.h, z14.h\n"
+ "fmla z31.h, p2/M, z21.h, z1.h\n"
+ "ld1h { z17.h }, p3/Z, [x20, x13, LSL #1]\n"
+ "ldr x20, [x16, #0xe0]\n"
+ "fmla z5.h, p2/M, z21.h, z22.h\n"
+ "fmla z29.h, p2/M, z21.h, z19.h\n"
+ "ld1h { z21.h }, p2/Z, [x9, #-6, MUL VL]\n"
+ "ldr x26, [x16, #0xf8]\n"
+ "fmla z30.h, p2/M, z25.h, z1.h\n"
+ "fmla z31.h, p2/M, z25.h, z0.h\n"
+ "ld1h { z9.h }, p3/Z, [x25, x13, LSL #1]\n"
+ "ldr x25, [x16, #0xe8]\n"
+ "fmla z5.h, p2/M, z25.h, z19.h\n"
+ "fmla z29.h, p2/M, z25.h, z18.h\n"
+ "ld1h { z4.h }, p2/Z, [x9, #-5, MUL VL]\n"
+ "inch x28\n"
+ "fmla z30.h, p2/M, z23.h, z0.h\n"
+ "fmla z31.h, p2/M, z23.h, z27.h\n"
+ "ld1h { z8.h }, p3/Z, [x24, x13, LSL #1]\n"
+ "ldr x24, [x16, #0xf0]\n"
+ "fmla z5.h, p2/M, z23.h, z18.h\n"
+ "fmla z29.h, p2/M, z23.h, z9.h\n"
+ "ld1h { z6.h }, p2/Z, [x9, #-4, MUL VL]\n"
+ "mov p0.b, p3.b\n"  // store predicate = remaining-channels predicate
+ "fmla z30.h, p2/M, z20.h, z27.h\n"
+ "fmla z31.h, p2/M, z20.h, z24.h\n"
+ "ld1h { z10.h }, p3/Z, [x23, x13, LSL #1]\n"
+ "ldr x23, [x16, #0x100]\n"
+ "fmla z5.h, p2/M, z20.h, z9.h\n"
+ "fmla z29.h, p2/M, z20.h, z8.h\n"
+ "ld1h { z11.h }, p2/Z, [x9, #-3, MUL VL]\n"
+ "fmla z30.h, p2/M, z16.h, z24.h\n"
+ "fmla z31.h, p2/M, z16.h, z26.h\n"
+ "ld1h { z0.h }, p3/Z, [x22, x13, LSL #1]\n"
+ "ld1h { z27.h }, p3/Z, [x20, x13, LSL #1]\n"
+ "fmla z5.h, p2/M, z16.h, z8.h\n"
+ "fmla z29.h, p2/M, z16.h, z17.h\n"
+ "ld1h { z16.h }, p2/Z, [x9, #-2, MUL VL]\n"
+ "ldr x22, [x16, #0x108]\n"
+ "fmla z30.h, p2/M, z21.h, z22.h\n"
+ "fmla z31.h, p2/M, z21.h, z19.h\n"
+ "ld1h { z26.h }, p3/Z, [x21, x13, LSL #1]\n"
+ "ldr x21, [x16, #0x110]\n"
+ "fmla z5.h, p2/M, z21.h, z10.h\n"
+ "fmla z29.h, p2/M, z21.h, z0.h\n"
+ "ld1h { z25.h }, p2/Z, [x9, #-1, MUL VL]\n"
+ "fmla z30.h, p2/M, z4.h, z19.h\n"
+ "fmla z31.h, p2/M, z4.h, z18.h\n"
+ "ld1h { z24.h }, p3/Z, [x27, x13, LSL #1]\n"
+ "ldr x20, [x16, #0x118]\n"
+ "fmla z5.h, p2/M, z4.h, z0.h\n"
+ "fmla z29.h, p2/M, z4.h, z26.h\n"
+ "ld1h { z23.h }, p2/Z, [x9]\n"
+ "fmla z30.h, p2/M, z6.h, z18.h\n"
+ "fmla z31.h, p2/M, z6.h, z9.h\n"
+ "ld1h { z22.h }, p3/Z, [x25, x13, LSL #1]\n"
+ "fmla z5.h, p2/M, z6.h, z26.h\n"
+ "fmla z29.h, p2/M, z6.h, z24.h\n"
+ "ld1h { z21.h }, p2/Z, [x9, #1, MUL VL]\n"
+ "fmla z30.h, p2/M, z11.h, z9.h\n"
+ "fmla z31.h, p2/M, z11.h, z8.h\n"
+ "ld1h { z18.h }, p3/Z, [x24, x13, LSL #1]\n"
+ "fmla z5.h, p2/M, z11.h, z24.h\n"
+ "fmla z29.h, p2/M, z11.h, z27.h\n"
+ "ld1h { z20.h }, p2/Z, [x9, #2, MUL VL]\n"
+ "fmla z30.h, p2/M, z16.h, z8.h\n"
+ "fmla z31.h, p2/M, z16.h, z17.h\n"
+ "ld1h { z17.h }, p3/Z, [x26, x13, LSL #1]\n"
+ "fmla z5.h, p2/M, z16.h, z27.h\n"
+ "fmla z29.h, p2/M, z16.h, z22.h\n"
+ "ld1h { z19.h }, p2/Z, [x9, #3, MUL VL]\n"
+ "fmla z30.h, p2/M, z25.h, z10.h\n"
+ "fmla z31.h, p2/M, z25.h, z0.h\n"
+ "ld1h { z16.h }, p3/Z, [x23, x13, LSL #1]\n"
+ "fmla z5.h, p2/M, z25.h, z18.h\n"
+ "fmla z29.h, p2/M, z25.h, z17.h\n"
+ "ld1h { z18.h }, p3/Z, [x22, x13, LSL #1]\n"
+ "fmla z30.h, p2/M, z23.h, z0.h\n"
+ "fmla z31.h, p2/M, z23.h, z26.h\n"
+ "fmla z5.h, p2/M, z23.h, z17.h\n"
+ "fmla z29.h, p2/M, z23.h, z16.h\n"
+ "ld1h { z17.h }, p3/Z, [x21, x13, LSL #1]\n"
+ "fmla z30.h, p2/M, z21.h, z26.h\n"
+ "fmla z31.h, p2/M, z21.h, z24.h\n"
+ "fmla z5.h, p2/M, z21.h, z16.h\n"
+ "fmla z29.h, p2/M, z21.h, z18.h\n"
+ "ld1h { z16.h }, p3/Z, [x20, x13, LSL #1]\n"
+ "fmla z30.h, p2/M, z20.h, z24.h\n"
+ "fmla z31.h, p2/M, z20.h, z27.h\n"
+ "fmla z5.h, p2/M, z20.h, z18.h\n"
+ "fmla z29.h, p2/M, z20.h, z17.h\n"
+ "fmla z30.h, p2/M, z19.h, z27.h\n"
+ "fmla z31.h, p2/M, z19.h, z22.h\n"
+ "fmax z30.h, p2/M, z30.h, z15.h\n"  // clamp to activation_min
+ "fmax z31.h, p2/M, z31.h, z15.h\n"
+ "fmla z5.h, p2/M, z19.h, z17.h\n"
+ "fmla z29.h, p2/M, z19.h, z16.h\n"
+ "fmax z5.h, p2/M, z5.h, z15.h\n"
+ "fmax z29.h, p2/M, z29.h, z15.h\n"
+ "fmin z30.h, p2/M, z30.h, z28.h\n"  // clamp to activation_max
+ "fmin z31.h, p2/M, z31.h, z28.h\n"
+ "st1h { z30.h }, p0, [x15, x28, LSL #1]\n"  // predicated stores of the final (partial) vector
+ "fmin z5.h, p2/M, z5.h, z28.h\n"
+ "fmin z29.h, p2/M, z29.h, z28.h\n"
+ "st1h { z31.h }, p0, [x14, x28, LSL #1]\n"
+ "st1h { z5.h }, p0, [x12, x28, LSL #1]\n"
+ "st1h { z29.h }, p0, [x11, x28, LSL #1]\n"
+ :
+ : [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
new file mode 100644
index 0000000000..16b96fdb8e
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_indirect_impl(const float *const *const input_ptrs, float *const *const outptrs, const void *params, unsigned int n_channels, const float activation_min, const float activation_max);
+void sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl(const unsigned int n_tile_rows, const unsigned int n_tile_cols, const float *inptr, int64_t ld_input_row, int64_t ld_input_col, float *outptr, int64_t ld_output_row, int64_t ld_output_col, const void *params, unsigned int n_channels, const float activation_min, const float activation_max);
+
+class sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy<float, float, float, float>  // strategy descriptor binding the fp32 3x3/s1/2x2 SVE kernels to the depthfirst driver
+{
+  private:
+  using Parent = DepthwiseDepthfirstStrategy<float, float, float, float>;
+  Parent::IndirectKernelType m_indirect_kernel = sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_indirect_impl;  // pointer-array (edge/padded tile) variant
+  Parent::DirectKernelType m_direct_kernel = sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl;  // strided (interior tile) variant
+
+  public:
+  using return_type = float;
+  constexpr static auto vl_type = arm_gemm::VLType::SVE;  // kernel requires SVE; used for implementation selection
+
+  constexpr static unsigned int kernel_rows = 3;  // 3x3 depthwise filter
+  constexpr static unsigned int kernel_cols = 3;
+
+  constexpr static unsigned int stride_rows = 1;  // unit stride in both dimensions
+  constexpr static unsigned int stride_cols = 1;
+
+  constexpr static unsigned int output_rows = 2;  // each kernel invocation produces a 2x2 output tile
+  constexpr static unsigned int output_cols = 2;
+
+  sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst(const CPUInfo *)  // CPUInfo accepted for interface uniformity but unused here
+  : Parent(output_rows, output_cols, kernel_rows, kernel_cols, stride_rows, stride_cols) {}
+
+  arm_gemm::VLType get_vl_type(void) const override { return vl_type; }
+
+  Parent::IndirectKernelType get_indirect_kernel() const override { return m_indirect_kernel; }
+  Parent::DirectKernelType get_direct_kernel() const override { return m_direct_kernel; }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp
new file mode 100644
index 0000000000..1bdef85274
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp
@@ -0,0 +1,316 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+// Direct-addressed variant of the fp32 NHWC 3x3/stride-1 SVE depthwise
+// kernel, producing a 2x2 output tile per tile-loop iteration.  The outer
+// assembly loop walks the n_tile_rows x n_tile_cols grid of output tiles
+// using the row/column strides; the inner loop handles one SVE vector of
+// channels at a time (predicated tail via whilelt) and clamps results to
+// [activation_min, activation_max] before storing.
+// NOTE(review): this file appears to be machine-generated — prefer fixing
+// the generator over hand-editing the assembly.
+void sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl(
+  const unsigned int n_tile_rows,
+  const unsigned int n_tile_cols,
+  const float *inptr,
+  int64_t ld_input_row,
+  int64_t ld_input_col,
+  float *outptr,
+  int64_t ld_output_row,
+  int64_t ld_output_col,
+  const void *params,
+  unsigned int n_channels,
+  const float activation_min,
+  const float activation_max
+)
+{
+  // Argument block read by the inline assembly through offsetof(); the field
+  // layout must stay in sync with the offsetof_args_* operands below.
+  // tile_i/tile_j are kept in memory so the assembly can spill and reload
+  // the tile counters across iterations of the tile loop.
+  struct Args
+  {
+    const uint64_t n_tile_rows, n_tile_cols;
+    const float *inptr;
+    const uint64_t ld_input_row;
+    const uint64_t ld_input_col;
+    float *outptr;
+    const uint64_t ld_output_row;
+    const uint64_t ld_output_col;
+    const void *params;
+    const float min, max;
+
+    uint64_t tile_i = 0, tile_j = 0;
+
+    Args(
+      const unsigned int n_tile_rows,
+      const unsigned int n_tile_cols,
+      const float *inptr,
+      int64_t ld_input_row,
+      int64_t ld_input_col,
+      float *outptr,
+      int64_t ld_output_row,
+      int64_t ld_output_col,
+      const void *params,
+      const float activation_min,
+      const float activation_max
+    ) : n_tile_rows(n_tile_rows), n_tile_cols(n_tile_cols), inptr(inptr),
+        ld_input_row(ld_input_row), ld_input_col(ld_input_col), outptr(outptr),
+        ld_output_row(ld_output_row), ld_output_col(ld_output_col),
+        params(params), min(activation_min), max(activation_max)
+    {
+    }
+  };
+
+  Args params_struct(
+    n_tile_rows, n_tile_cols,
+    inptr, ld_input_row, ld_input_col,
+    outptr, ld_output_row, ld_output_col,
+    params, activation_min, activation_max
+  );
+
+  // Register roles (from the code below): x9/x26/x25/x23 walk four input
+  // rows; x28/x22 walk the two output rows; z0-z8 hold the 3x3 weights,
+  // z27 the bias, z26/z25 the min/max clamps; p2/p1 are the channel
+  // predicates and p3 is all-true.
+  __asm__ __volatile__(
+    "ptrue p3.b\n"
+    "mov x10, #0x0\n"
+    "mov x14, #0x0\n"
+    "1:" // Tile loop
+    "str x10, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+    "mov x25, #0x2\n"
+    "mov x24, #0x2\n"
+    "str x14, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+    "ldr x23, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
+    "ldr x22, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
+    "mul x21, x10, x23\n" // offset = tile_i * ld_input_row
+    "ldr x13, [%x[params_struct], %[offsetof_args_ld_input_col]]\n"
+    "ldr x12, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
+    "mul x20, x10, x22\n" // offset = tile_i * ld_output_row
+    "cntw x11\n"
+    "madd x21, x14, x13, x21\n" // offset += tile_j * ld_input_col
+    "ldr x10, [%x[params_struct], %[offsetof_args_params]]\n"
+    "ldr x9, [%x[params_struct], %[offsetof_args_inptr]]\n"
+    "whilelt p2.s, XZR, %x[n_channels]\n"
+    "madd x20, x14, x12, x20\n" // offset += tile_j * ld_output_col
+    "ldr x28, [%x[params_struct], %[offsetof_args_outptr]]\n"
+    "ld1w { z27.s }, p3/Z, [x10]\n"
+    "add x27, x13, x13\n"
+    "mul x21, x21, x25\n" // offset *= kernel_stride * output_size
+    "add x9, x9, x21, LSL #2\n" // inptr[0] += offset * sizeof(float)
+    "ld1w { z0.s }, p3/Z, [x10, #1, MUL VL]\n"
+    "ld1w { z1.s }, p3/Z, [x10, #2, MUL VL]\n"
+    "mul x20, x20, x24\n" // offset *= output_tile_size
+    "ld1w { z2.s }, p3/Z, [x10, #3, MUL VL]\n"
+    "ld1w { z3.s }, p3/Z, [x10, #4, MUL VL]\n"
+    "add x26, x9, x23, LSL #2\n"
+    "ld1w { z4.s }, p3/Z, [x10, #5, MUL VL]\n"
+    "ld1w { z5.s }, p3/Z, [x10, #6, MUL VL]\n"
+    "add x25, x26, x23, LSL #2\n"
+    "add x24, x27, x13\n"
+    "ld1w { z6.s }, p3/Z, [x10, #7, MUL VL]\n"
+    "addvl x10, x10, #16\n"
+    "add x28, x28, x20, LSL #2\n" // outptrs[0] += offset * sizeof(float)
+    "ld1rw { z26.s }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+    "cmp x11, %x[n_channels]\n"
+    "add x23, x25, x23, LSL #2\n"
+    "ld1rw { z25.s }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+    "ld1w { z7.s }, p3/Z, [x10, #-8, MUL VL]\n"
+    "add x22, x28, x22, LSL #2\n"
+    "mov x21, #0x0\n"
+    "ld1w { z8.s }, p3/Z, [x10, #-7, MUL VL]\n"
+    "ld1w { z9.s }, p2/Z, [x26, x13, LSL #2]\n"
+    "sub x20, XZR, x11\n"
+    "ld1w { z10.s }, p2/Z, [x9]\n"
+    "ld1w { z11.s }, p2/Z, [x9, x24, LSL #2]\n"
+    "addvl x10, x10, #-6\n"
+    "ld1w { z12.s }, p2/Z, [x26, x27, LSL #2]\n"
+    "ld1w { z13.s }, p2/Z, [x25, x13, LSL #2]\n"
+    "bge 3f\n"
+    "2:" // Tile loop: Channel loop
+    "movprfx z24, z27\n fmla z24.s, p3/M, z4.s, z9.s\n"
+    "movprfx z23, z27\n fmla z23.s, p3/M, z3.s, z9.s\n"
+    "whilelt p1.s, x11, %x[n_channels]\n"
+    "incw x21\n"
+    "movprfx z22, z27\n fmla z22.s, p3/M, z1.s, z9.s\n"
+    "movprfx z21, z27\n fmla z21.s, p3/M, z0.s, z9.s\n"
+    "ld1w { z18.s }, p2/Z, [x23]\n"
+    "incw x11\n"
+    "fmla z24.s, p3/M, z0.s, z10.s\n"
+    "fmla z23.s, p3/M, z2.s, z11.s\n"
+    "ld1w { z17.s }, p2/Z, [x23, x24, LSL #2]\n"
+    "ld1w { z20.s }, p2/Z, [x25, x27, LSL #2]\n"
+    "fmla z22.s, p3/M, z2.s, z12.s\n"
+    "fmla z21.s, p3/M, z1.s, z12.s\n"
+    "mov p0.b, p2.b\n"
+    "ld1w { z27.s }, p3/Z, [x10]\n"
+    "fmla z24.s, p3/M, z5.s, z12.s\n"
+    "fmla z23.s, p3/M, z4.s, z12.s\n"
+    "ld1w { z16.s }, p2/Z, [x9, x13, LSL #2]\n"
+    "incw x20\n"
+    "fmla z22.s, p3/M, z6.s, z18.s\n"
+    "fmla z21.s, p3/M, z3.s, z13.s\n"
+    "ld1w { z18.s }, p2/Z, [x9, x27, LSL #2]\n"
+    "addvl x9, x9, #1\n"
+    "fmla z24.s, p3/M, z7.s, z13.s\n"
+    "fmla z23.s, p3/M, z6.s, z13.s\n"
+    "fmla z22.s, p3/M, z4.s, z13.s\n"
+    "fmla z21.s, p3/M, z8.s, z17.s\n"
+    "ld1w { z17.s }, p2/Z, [x26]\n"
+    "fmla z24.s, p3/M, z1.s, z16.s\n"
+    "fmla z23.s, p3/M, z0.s, z16.s\n"
+    "ld1w { z16.s }, p2/Z, [x26, x24, LSL #2]\n"
+    "addvl x26, x26, #1\n"
+    "fmla z22.s, p3/M, z5.s, z20.s\n"
+    "fmla z21.s, p3/M, z4.s, z20.s\n"
+    "ld1w { z4.s }, p3/Z, [x10, #5, MUL VL]\n"
+    "fmla z24.s, p3/M, z2.s, z18.s\n"
+    "fmla z23.s, p3/M, z1.s, z18.s\n"
+    "ld1w { z19.s }, p2/Z, [x25]\n"
+    "ld1w { z1.s }, p3/Z, [x10, #2, MUL VL]\n"
+    "fmla z22.s, p3/M, z0.s, z17.s\n"
+    "fmla z21.s, p3/M, z2.s, z16.s\n"
+    "ld1w { z0.s }, p3/Z, [x10, #1, MUL VL]\n"
+    "ld1w { z2.s }, p3/Z, [x10, #3, MUL VL]\n"
+    "fmla z24.s, p3/M, z8.s, z20.s\n"
+    "fmla z23.s, p3/M, z7.s, z20.s\n"
+    "ld1w { z18.s }, p2/Z, [x25, x24, LSL #2]\n"
+    "addvl x25, x25, #1\n"
+    "fmla z22.s, p3/M, z3.s, z19.s\n"
+    "fmla z21.s, p3/M, z5.s, z18.s\n"
+    "ld1w { z13.s }, p1/Z, [x25, x13, LSL #2]\n"
+    "fmla z24.s, p3/M, z3.s, z17.s\n"
+    "ld1w { z17.s }, p2/Z, [x23, x13, LSL #2]\n"
+    "fmla z23.s, p3/M, z5.s, z16.s\n"
+    "ld1w { z16.s }, p2/Z, [x23, x27, LSL #2]\n"
+    "fmla z22.s, p3/M, z7.s, z17.s\n"
+    "fmla z21.s, p3/M, z6.s, z17.s\n"
+    "ld1w { z3.s }, p3/Z, [x10, #4, MUL VL]\n"
+    "ld1w { z5.s }, p3/Z, [x10, #6, MUL VL]\n"
+    "fmla z24.s, p3/M, z6.s, z19.s\n"
+    "fmla z23.s, p3/M, z8.s, z18.s\n"
+    "fmax z24.s, p3/M, z24.s, z26.s\n"
+    "fmax z23.s, p3/M, z23.s, z26.s\n"
+    "fmla z22.s, p3/M, z8.s, z16.s\n"
+    "fmla z21.s, p3/M, z7.s, z16.s\n"
+    "fmax z22.s, p3/M, z22.s, z26.s\n"
+    "fmax z21.s, p3/M, z21.s, z26.s\n"
+    "ld1w { z6.s }, p3/Z, [x10, #7, MUL VL]\n"
+    "addvl x10, x10, #16\n"
+    "whilelt p2.s, x21, %x[n_channels]\n"
+    "ld1w { z9.s }, p1/Z, [x26, x13, LSL #2]\n"
+    "cmp x11, %x[n_channels]\n"
+    "fmin z24.s, p3/M, z24.s, z25.s\n"
+    "ld1w { z10.s }, p1/Z, [x9]\n"
+    "ld1w { z11.s }, p1/Z, [x9, x24, LSL #2]\n"
+    "fmin z23.s, p3/M, z23.s, z25.s\n"
+    "fmin z22.s, p3/M, z22.s, z25.s\n"
+    "ld1w { z12.s }, p1/Z, [x26, x27, LSL #2]\n"
+    "st1w { z24.s }, p0, [x28]\n"
+    "fmin z21.s, p3/M, z21.s, z25.s\n"
+    "addvl x23, x23, #1\n"
+    "st1w { z23.s }, p0, [x28, x12, LSL #2]\n"
+    "ld1w { z7.s }, p3/Z, [x10, #-8, MUL VL]\n"
+    "st1w { z22.s }, p0, [x22]\n"
+    "addvl x28, x28, #1\n"
+    "ld1w { z8.s }, p3/Z, [x10, #-7, MUL VL]\n"
+    "addvl x10, x10, #-6\n"
+    "st1w { z21.s }, p0, [x22, x12, LSL #2]\n"
+    "addvl x22, x22, #1\n"
+    "blt 2b\n"
+    "3:" // Tile loop: Channel tail
+    "movprfx z24, z27\n fmla z24.s, p3/M, z4.s, z9.s\n"
+    "movprfx z23, z27\n fmla z23.s, p3/M, z3.s, z9.s\n"
+    "ldr x14, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+    "ldr x10, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+    "movprfx z22, z27\n fmla z22.s, p3/M, z1.s, z9.s\n"
+    "movprfx z21, z27\n fmla z21.s, p3/M, z0.s, z9.s\n"
+    "ld1w { z18.s }, p2/Z, [x23]\n"
+    "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+    "fmla z24.s, p3/M, z0.s, z10.s\n"
+    "fmla z23.s, p3/M, z2.s, z11.s\n"
+    "ld1w { z17.s }, p2/Z, [x23, x24, LSL #2]\n"
+    "ld1w { z20.s }, p2/Z, [x25, x27, LSL #2]\n"
+    "fmla z22.s, p3/M, z2.s, z12.s\n"
+    "fmla z21.s, p3/M, z1.s, z12.s\n"
+    "add x14, x14, #0x1\n"
+    "cmp x14, x20\n"
+    "fmla z24.s, p3/M, z5.s, z12.s\n"
+    "fmla z23.s, p3/M, z4.s, z12.s\n"
+    "ld1w { z16.s }, p2/Z, [x9, x13, LSL #2]\n"
+    "add x21, x10, #0x1\n"
+    "fmla z22.s, p3/M, z6.s, z18.s\n"
+    "fmla z21.s, p3/M, z3.s, z13.s\n"
+    "ld1w { z18.s }, p2/Z, [x9, x27, LSL #2]\n"
+    "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
+    "fmla z24.s, p3/M, z7.s, z13.s\n"
+    "fmla z23.s, p3/M, z6.s, z13.s\n"
+    "csel x10, x10, x21, LT\n"
+    "mov p0.b, p2.b\n"
+    "fmla z22.s, p3/M, z4.s, z13.s\n"
+    "fmla z21.s, p3/M, z8.s, z17.s\n"
+    "ld1w { z17.s }, p2/Z, [x26]\n"
+    "csel x14, x14, XZR, LT\n"
+    "fmla z24.s, p3/M, z1.s, z16.s\n"
+    "fmla z23.s, p3/M, z0.s, z16.s\n"
+    "ld1w { z16.s }, p2/Z, [x26, x24, LSL #2]\n"
+    "cmp x10, x20\n"
+    "fmla z22.s, p3/M, z5.s, z20.s\n"
+    "fmla z21.s, p3/M, z4.s, z20.s\n"
+    "fmla z24.s, p3/M, z2.s, z18.s\n"
+    "fmla z23.s, p3/M, z1.s, z18.s\n"
+    "ld1w { z19.s }, p2/Z, [x25]\n"
+    "fmla z22.s, p3/M, z0.s, z17.s\n"
+    "fmla z21.s, p3/M, z2.s, z16.s\n"
+    "fmla z24.s, p3/M, z8.s, z20.s\n"
+    "fmla z23.s, p3/M, z7.s, z20.s\n"
+    "ld1w { z18.s }, p2/Z, [x25, x24, LSL #2]\n"
+    "fmla z22.s, p3/M, z3.s, z19.s\n"
+    "fmla z21.s, p3/M, z5.s, z18.s\n"
+    "fmla z24.s, p3/M, z3.s, z17.s\n"
+    "ld1w { z17.s }, p2/Z, [x23, x13, LSL #2]\n"
+    "fmla z23.s, p3/M, z5.s, z16.s\n"
+    "ld1w { z16.s }, p2/Z, [x23, x27, LSL #2]\n"
+    "fmla z22.s, p3/M, z7.s, z17.s\n"
+    "fmla z21.s, p3/M, z6.s, z17.s\n"
+    "fmla z24.s, p3/M, z6.s, z19.s\n"
+    "fmla z23.s, p3/M, z8.s, z18.s\n"
+    "fmax z24.s, p3/M, z24.s, z26.s\n"
+    "fmax z23.s, p3/M, z23.s, z26.s\n"
+    "fmla z22.s, p3/M, z8.s, z16.s\n"
+    "fmla z21.s, p3/M, z7.s, z16.s\n"
+    "fmax z22.s, p3/M, z22.s, z26.s\n"
+    "fmax z21.s, p3/M, z21.s, z26.s\n"
+    "fmin z24.s, p3/M, z24.s, z25.s\n"
+    "fmin z23.s, p3/M, z23.s, z25.s\n"
+    "st1w { z24.s }, p0, [x28]\n"
+    "fmin z22.s, p3/M, z22.s, z25.s\n"
+    "fmin z21.s, p3/M, z21.s, z25.s\n"
+    "st1w { z23.s }, p0, [x28, x12, LSL #2]\n"
+    "st1w { z22.s }, p0, [x22]\n"
+    "st1w { z21.s }, p0, [x22, x12, LSL #2]\n"
+    "blt 1b\n"
+    :
+    : [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct)
+    : "cc", "memory", "p0", "p1", "p2", "p3", "x9", "x10", "x11", "x12", "x13", "x14", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27"
+  );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp
new file mode 100644
index 0000000000..873b4736ff
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp
@@ -0,0 +1,296 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+// Indirect (pointer-array) variant of the fp32 NHWC 3x3/stride-1 SVE
+// depthwise kernel with a 2x2 output tile.  Instead of strides it receives
+// 16 per-point input pointers (the 4x4 receptive field of the tile) and
+// 4 output pointers; the assembly loops over channels a vector at a time
+// with a predicated tail, clamping to [activation_min, activation_max].
+// NOTE(review): this file appears to be machine-generated — prefer fixing
+// the generator over hand-editing the assembly.
+void sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_indirect_impl(
+  const float *const *const input_ptrs,
+  float *const *const outptrs,
+  const void *params,
+  unsigned int n_channels,
+  const float activation_min,
+  const float activation_max
+)
+{
+  // Argument block read by the inline assembly through offsetof(); layout
+  // must stay in sync with the offsetof_* operands below.
+  struct Args
+  {
+    float *const *outptrs;
+    const void *params;
+    const float min, max;
+    const float *inptrs[16];
+
+    Args(
+      const float *const *const input_ptrs,
+      float *const *const outptrs,
+      const void *const params,
+      const float min,
+      const float max
+    ) : outptrs(outptrs), params(params), min(min), max(max)
+    {
+      // Permute the caller's 16 input-point pointers into the order in
+      // which the assembly consumes them (centre points first).
+      inptrs[0] = input_ptrs[5];
+      inptrs[1] = input_ptrs[0];
+      inptrs[2] = input_ptrs[3];
+      inptrs[3] = input_ptrs[6];
+      inptrs[4] = input_ptrs[9];
+      inptrs[5] = input_ptrs[12];
+      inptrs[6] = input_ptrs[15];
+      inptrs[7] = input_ptrs[1];
+      inptrs[8] = input_ptrs[2];
+      inptrs[9] = input_ptrs[10];
+      inptrs[10] = input_ptrs[4];
+      inptrs[11] = input_ptrs[7];
+      inptrs[12] = input_ptrs[8];
+      inptrs[13] = input_ptrs[11];
+      inptrs[14] = input_ptrs[13];
+      inptrs[15] = input_ptrs[14];
+
+    }
+  };
+
+  Args params_struct(input_ptrs, outptrs, params,
+                     activation_min, activation_max);
+
+  // Register roles (from the code below): x13/x12/x11/x10 hold the four
+  // output pointers; x15 points at the permuted inptrs table; z0-z8 hold
+  // the 3x3 weights, z20 the bias, z26/z25 the min/max clamps; p2/p1 are
+  // channel predicates and p3 is all-true.
+  __asm__ __volatile__(
+    "ptrue p3.b\n"
+    "ldr x20, [%x[params_struct], %[offsetof_args_outptrs]]\n"
+    "ldr x16, [%x[params_struct], %[offsetof_args_params]]\n"
+    "add x15, %x[params_struct], %[offsetof_Args_inptrs]\n"
+    "cntw x14\n"
+    "ldp x13, x12, [x20, #0x0]\n"
+    "ldp x11, x10, [x20, #0x10]\n"
+    "mov x9, #0x0\n"
+    "whilelt p2.s, XZR, %x[n_channels]\n"
+    "ld1w { z20.s }, p3/Z, [x16]\n"
+    "ld1w { z0.s }, p3/Z, [x16, #1, MUL VL]\n"
+    "cmp x14, %x[n_channels]\n"
+    "ld1w { z1.s }, p3/Z, [x16, #2, MUL VL]\n"
+    "ld1w { z2.s }, p3/Z, [x16, #3, MUL VL]\n"
+    "sub x28, XZR, x14\n"
+    "ld1w { z3.s }, p3/Z, [x16, #4, MUL VL]\n"
+    "ld1w { z4.s }, p3/Z, [x16, #5, MUL VL]\n"
+    "ld1w { z5.s }, p3/Z, [x16, #6, MUL VL]\n"
+    "ld1w { z6.s }, p3/Z, [x16, #7, MUL VL]\n"
+    "addvl x16, x16, #16\n"
+    "ldp x24, x23, [x15, #0x0]\n"
+    "ldp x22, x21, [x15, #0x10]\n"
+    "ldr x20, [x15, #0x20]\n"
+    "ld1rw { z26.s }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+    "ld1rw { z25.s }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+    "ld1w { z7.s }, p3/Z, [x16, #-8, MUL VL]\n"
+    "ld1w { z8.s }, p3/Z, [x16, #-7, MUL VL]\n"
+    "ld1w { z9.s }, p2/Z, [x24, x9, LSL #2]\n"
+    "addvl x16, x16, #-6\n"
+    "ld1w { z10.s }, p2/Z, [x23, x9, LSL #2]\n"
+    "ld1w { z11.s }, p2/Z, [x22, x9, LSL #2]\n"
+    "ld1w { z12.s }, p2/Z, [x21, x9, LSL #2]\n"
+    "ld1w { z13.s }, p2/Z, [x20, x9, LSL #2]\n"
+    "bge 2f\n"
+    "1:" // Channel loop
+    "movprfx z24, z20\n fmla z24.s, p3/M, z4.s, z9.s\n"
+    "movprfx z23, z20\n fmla z23.s, p3/M, z3.s, z9.s\n"
+    "ldr x21, [x15, #0x28]\n"
+    "ldr x20, [x15, #0x30]\n"
+    "movprfx z22, z20\n fmla z22.s, p3/M, z1.s, z9.s\n"
+    "movprfx z21, z20\n fmla z21.s, p3/M, z0.s, z9.s\n"
+    "ld1w { z18.s }, p2/Z, [x21, x9, LSL #2]\n"
+    "ldr x22, [x15, #0x38]\n"
+    "fmla z24.s, p3/M, z0.s, z10.s\n"
+    "fmla z23.s, p3/M, z2.s, z11.s\n"
+    "ld1w { z17.s }, p2/Z, [x20, x9, LSL #2]\n"
+    "ldr x21, [x15, #0x48]\n"
+    "fmla z22.s, p3/M, z2.s, z12.s\n"
+    "fmla z21.s, p3/M, z1.s, z12.s\n"
+    "ldr x20, [x15, #0x40]\n"
+    "ld1w { z20.s }, p2/Z, [x21, x9, LSL #2]\n"
+    "fmla z24.s, p3/M, z5.s, z12.s\n"
+    "fmla z23.s, p3/M, z4.s, z12.s\n"
+    "ld1w { z16.s }, p2/Z, [x22, x9, LSL #2]\n"
+    "ldr x22, [x15, #0x50]\n"
+    "fmla z22.s, p3/M, z6.s, z18.s\n"
+    "fmla z21.s, p3/M, z3.s, z13.s\n"
+    "ld1w { z18.s }, p2/Z, [x20, x9, LSL #2]\n"
+    "ldr x21, [x15, #0x58]\n"
+    "fmla z24.s, p3/M, z7.s, z13.s\n"
+    "fmla z23.s, p3/M, z6.s, z13.s\n"
+    "ldr x20, [x15, #0x60]\n"
+    "ldr x27, [x15, #0x68]\n"
+    "fmla z22.s, p3/M, z4.s, z13.s\n"
+    "fmla z21.s, p3/M, z8.s, z17.s\n"
+    "ld1w { z17.s }, p2/Z, [x22, x9, LSL #2]\n"
+    "ldr x26, [x15, #0x70]\n"
+    "fmla z24.s, p3/M, z1.s, z16.s\n"
+    "fmla z23.s, p3/M, z0.s, z16.s\n"
+    "ld1w { z16.s }, p2/Z, [x21, x9, LSL #2]\n"
+    "ldr x25, [x15, #0x78]\n"
+    "fmla z22.s, p3/M, z5.s, z20.s\n"
+    "fmla z21.s, p3/M, z4.s, z20.s\n"
+    "whilelt p1.s, x14, %x[n_channels]\n"
+    "ldp x24, x23, [x15, #0x0]\n"
+    "fmla z24.s, p3/M, z2.s, z18.s\n"
+    "fmla z23.s, p3/M, z1.s, z18.s\n"
+    "ld1w { z19.s }, p2/Z, [x20, x9, LSL #2]\n"
+    "ldp x22, x21, [x15, #0x10]\n"
+    "fmla z22.s, p3/M, z0.s, z17.s\n"
+    "fmla z21.s, p3/M, z2.s, z16.s\n"
+    "ldr x20, [x15, #0x20]\n"
+    "ld1w { z13.s }, p1/Z, [x20, x14, LSL #2]\n"
+    "fmla z24.s, p3/M, z8.s, z20.s\n"
+    "fmla z23.s, p3/M, z7.s, z20.s\n"
+    "ld1w { z18.s }, p2/Z, [x27, x9, LSL #2]\n"
+    "incw x28\n"
+    "fmla z22.s, p3/M, z3.s, z19.s\n"
+    "fmla z21.s, p3/M, z5.s, z18.s\n"
+    "mov p0.b, p2.b\n"
+    "ld1w { z20.s }, p3/Z, [x16]\n"
+    "fmla z24.s, p3/M, z3.s, z17.s\n"
+    "ld1w { z17.s }, p2/Z, [x26, x9, LSL #2]\n"
+    "fmla z23.s, p3/M, z5.s, z16.s\n"
+    "ld1w { z16.s }, p2/Z, [x25, x9, LSL #2]\n"
+    "fmla z22.s, p3/M, z7.s, z17.s\n"
+    "fmla z21.s, p3/M, z6.s, z17.s\n"
+    "incw x9\n"
+    "ld1w { z11.s }, p1/Z, [x22, x14, LSL #2]\n"
+    "fmla z24.s, p3/M, z6.s, z19.s\n"
+    "fmla z23.s, p3/M, z8.s, z18.s\n"
+    "ld1w { z9.s }, p1/Z, [x24, x14, LSL #2]\n"
+    "ld1w { z10.s }, p1/Z, [x23, x14, LSL #2]\n"
+    "fmla z22.s, p3/M, z8.s, z16.s\n"
+    "fmla z21.s, p3/M, z7.s, z16.s\n"
+    "ld1w { z12.s }, p1/Z, [x21, x14, LSL #2]\n"
+    "incw x14\n"
+    "fmax z24.s, p3/M, z24.s, z26.s\n"
+    "fmax z23.s, p3/M, z23.s, z26.s\n"
+    "ld1w { z0.s }, p3/Z, [x16, #1, MUL VL]\n"
+    "ld1w { z1.s }, p3/Z, [x16, #2, MUL VL]\n"
+    "fmax z22.s, p3/M, z22.s, z26.s\n"
+    "fmax z21.s, p3/M, z21.s, z26.s\n"
+    "ld1w { z2.s }, p3/Z, [x16, #3, MUL VL]\n"
+    "ld1w { z3.s }, p3/Z, [x16, #4, MUL VL]\n"
+    "ld1w { z4.s }, p3/Z, [x16, #5, MUL VL]\n"
+    "ld1w { z5.s }, p3/Z, [x16, #6, MUL VL]\n"
+    "whilelt p2.s, x9, %x[n_channels]\n"
+    "cmp x14, %x[n_channels]\n"
+    "ld1w { z6.s }, p3/Z, [x16, #7, MUL VL]\n"
+    "addvl x16, x16, #16\n"
+    "fmin z24.s, p3/M, z24.s, z25.s\n"
+    "st1w { z24.s }, p0, [x13, x28, LSL #2]\n"
+    "fmin z23.s, p3/M, z23.s, z25.s\n"
+    "fmin z22.s, p3/M, z22.s, z25.s\n"
+    "st1w { z23.s }, p0, [x12, x28, LSL #2]\n"
+    "ld1w { z7.s }, p3/Z, [x16, #-8, MUL VL]\n"
+    "fmin z21.s, p3/M, z21.s, z25.s\n"
+    "st1w { z22.s }, p0, [x11, x28, LSL #2]\n"
+    "ld1w { z8.s }, p3/Z, [x16, #-7, MUL VL]\n"
+    "addvl x16, x16, #-6\n"
+    "st1w { z21.s }, p0, [x10, x28, LSL #2]\n"
+    "blt 1b\n"
+    "2:" // Channel tail
+    "movprfx z24, z20\n fmla z24.s, p3/M, z4.s, z9.s\n"
+    "movprfx z23, z20\n fmla z23.s, p3/M, z3.s, z9.s\n"
+    "ldr x21, [x15, #0x28]\n"
+    "ldr x20, [x15, #0x30]\n"
+    "movprfx z22, z20\n fmla z22.s, p3/M, z1.s, z9.s\n"
+    "movprfx z21, z20\n fmla z21.s, p3/M, z0.s, z9.s\n"
+    "ld1w { z18.s }, p2/Z, [x21, x9, LSL #2]\n"
+    "ldr x22, [x15, #0x38]\n"
+    "fmla z24.s, p3/M, z0.s, z10.s\n"
+    "fmla z23.s, p3/M, z2.s, z11.s\n"
+    "ld1w { z17.s }, p2/Z, [x20, x9, LSL #2]\n"
+    "ldr x21, [x15, #0x48]\n"
+    "fmla z22.s, p3/M, z2.s, z12.s\n"
+    "fmla z21.s, p3/M, z1.s, z12.s\n"
+    "ldr x20, [x15, #0x40]\n"
+    "ld1w { z20.s }, p2/Z, [x21, x9, LSL #2]\n"
+    "fmla z24.s, p3/M, z5.s, z12.s\n"
+    "fmla z23.s, p3/M, z4.s, z12.s\n"
+    "ld1w { z16.s }, p2/Z, [x22, x9, LSL #2]\n"
+    "ldr x21, [x15, #0x50]\n"
+    "fmla z22.s, p3/M, z6.s, z18.s\n"
+    "fmla z21.s, p3/M, z3.s, z13.s\n"
+    "ld1w { z18.s }, p2/Z, [x20, x9, LSL #2]\n"
+    "ldr x20, [x15, #0x58]\n"
+    "fmla z24.s, p3/M, z7.s, z13.s\n"
+    "fmla z23.s, p3/M, z6.s, z13.s\n"
+    "ldr x23, [x15, #0x60]\n"
+    "ldr x22, [x15, #0x68]\n"
+    "fmla z22.s, p3/M, z4.s, z13.s\n"
+    "fmla z21.s, p3/M, z8.s, z17.s\n"
+    "ld1w { z17.s }, p2/Z, [x21, x9, LSL #2]\n"
+    "ldr x21, [x15, #0x70]\n"
+    "fmla z24.s, p3/M, z1.s, z16.s\n"
+    "fmla z23.s, p3/M, z0.s, z16.s\n"
+    "ld1w { z16.s }, p2/Z, [x20, x9, LSL #2]\n"
+    "ldr x20, [x15, #0x78]\n"
+    "fmla z22.s, p3/M, z5.s, z20.s\n"
+    "fmla z21.s, p3/M, z4.s, z20.s\n"
+    "incw x28\n"
+    "mov p0.b, p2.b\n"
+    "fmla z24.s, p3/M, z2.s, z18.s\n"
+    "fmla z23.s, p3/M, z1.s, z18.s\n"
+    "ld1w { z19.s }, p2/Z, [x23, x9, LSL #2]\n"
+    "fmla z22.s, p3/M, z0.s, z17.s\n"
+    "fmla z21.s, p3/M, z2.s, z16.s\n"
+    "fmla z24.s, p3/M, z8.s, z20.s\n"
+    "fmla z23.s, p3/M, z7.s, z20.s\n"
+    "ld1w { z18.s }, p2/Z, [x22, x9, LSL #2]\n"
+    "fmla z22.s, p3/M, z3.s, z19.s\n"
+    "fmla z21.s, p3/M, z5.s, z18.s\n"
+    "fmla z24.s, p3/M, z3.s, z17.s\n"
+    "ld1w { z17.s }, p2/Z, [x21, x9, LSL #2]\n"
+    "fmla z23.s, p3/M, z5.s, z16.s\n"
+    "ld1w { z16.s }, p2/Z, [x20, x9, LSL #2]\n"
+    "fmla z22.s, p3/M, z7.s, z17.s\n"
+    "fmla z21.s, p3/M, z6.s, z17.s\n"
+    "fmla z24.s, p3/M, z6.s, z19.s\n"
+    "fmla z23.s, p3/M, z8.s, z18.s\n"
+    "fmax z24.s, p3/M, z24.s, z26.s\n"
+    "fmax z23.s, p3/M, z23.s, z26.s\n"
+    "fmla z22.s, p3/M, z8.s, z16.s\n"
+    "fmla z21.s, p3/M, z7.s, z16.s\n"
+    "fmax z22.s, p3/M, z22.s, z26.s\n"
+    "fmax z21.s, p3/M, z21.s, z26.s\n"
+    "fmin z24.s, p3/M, z24.s, z25.s\n"
+    "fmin z23.s, p3/M, z23.s, z25.s\n"
+    "st1w { z24.s }, p0, [x13, x28, LSL #2]\n"
+    "fmin z22.s, p3/M, z22.s, z25.s\n"
+    "fmin z21.s, p3/M, z21.s, z25.s\n"
+    "st1w { z23.s }, p0, [x12, x28, LSL #2]\n"
+    "st1w { z22.s }, p0, [x11, x28, LSL #2]\n"
+    "st1w { z21.s }, p0, [x10, x28, LSL #2]\n"
+    :
+    : [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
+    : "cc", "memory", "p0", "p1", "p2", "p3", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26"
+  );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp
new file mode 100644
index 0000000000..e4f432c9ed
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+// Kernel entry points; defined in the sibling
+// sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp and
+// generic_direct.cpp translation units.
+void sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(const float *const *const input_ptrs, float *const *const outptrs, const void *params, unsigned int n_channels, const float activation_min, const float activation_max);
+void sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl(const unsigned int n_tile_rows, const unsigned int n_tile_cols, const float *inptr, int64_t ld_input_row, int64_t ld_input_col, float *outptr, int64_t ld_output_row, int64_t ld_output_col, const void *params, unsigned int n_channels, const float activation_min, const float activation_max);
+
+// Strategy descriptor that binds the fp32 NHWC 3x3/stride-1 SVE kernels
+// (3x3 output tile) into the depthwise-depthfirst driver.  It carries the
+// compile-time tile geometry and hands back the two kernel entry points.
+class sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst : public DepthwiseDepthfirstStrategy<float, float, float, float>
+{
+  private:
+  using Parent = DepthwiseDepthfirstStrategy<float, float, float, float>;
+  // Kernel function pointers for the two addressing modes.
+  Parent::IndirectKernelType m_indirect_kernel = sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl;
+  Parent::DirectKernelType m_direct_kernel = sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl;
+
+  public:
+  using return_type = float;
+  constexpr static auto vl_type = arm_gemm::VLType::SVE;
+
+  // 3x3 filter applied with unit stride, producing a 3x3 output tile.
+  constexpr static unsigned int kernel_rows = 3;
+  constexpr static unsigned int kernel_cols = 3;
+
+  constexpr static unsigned int stride_rows = 1;
+  constexpr static unsigned int stride_cols = 1;
+
+  constexpr static unsigned int output_rows = 3;
+  constexpr static unsigned int output_cols = 3;
+
+  sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst(const CPUInfo *)
+  : Parent(output_rows, output_cols, kernel_rows, kernel_cols, stride_rows, stride_cols) {}
+
+  arm_gemm::VLType get_vl_type(void) const override { return vl_type; }
+
+  Parent::IndirectKernelType get_indirect_kernel() const override { return m_indirect_kernel; }
+  Parent::DirectKernelType get_direct_kernel() const override { return m_direct_kernel; }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp
new file mode 100644
index 0000000000..015d0e63c2
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp
@@ -0,0 +1,460 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl(
+ const unsigned int n_tile_rows,
+ const unsigned int n_tile_cols,
+ const float *inptr,
+ int64_t ld_input_row,
+ int64_t ld_input_col,
+ float *outptr,
+ int64_t ld_output_row,
+ int64_t ld_output_col,
+ const void *params,
+ unsigned int n_channels,
+ const float activation_min,
+ const float activation_max
+)
+{
+ struct Args
+ {
+ const uint64_t n_tile_rows, n_tile_cols;
+ const float *inptr;
+ const uint64_t ld_input_row;
+ const uint64_t ld_input_col;
+ float *outptr;
+ const uint64_t ld_output_row;
+ const uint64_t ld_output_col;
+ const void *params;
+ const float min, max;
+
+ uint64_t tile_i = 0, tile_j = 0;
+
+ Args(
+ const unsigned int n_tile_rows,
+ const unsigned int n_tile_cols,
+ const float *inptr,
+ int64_t ld_input_row,
+ int64_t ld_input_col,
+ float *outptr,
+ int64_t ld_output_row,
+ int64_t ld_output_col,
+ const void *params,
+ const float activation_min,
+ const float activation_max
+ ) : n_tile_rows(n_tile_rows), n_tile_cols(n_tile_cols), inptr(inptr),
+ ld_input_row(ld_input_row), ld_input_col(ld_input_col), outptr(outptr),
+ ld_output_row(ld_output_row), ld_output_col(ld_output_col),
+ params(params), min(activation_min), max(activation_max)
+ {
+ }
+ };
+
+ Args params_struct(
+ n_tile_rows, n_tile_cols,
+ inptr, ld_input_row, ld_input_col,
+ outptr, ld_output_row, ld_output_col,
+ params, activation_min, activation_max
+ );
+
+ __asm__ __volatile__(
+ "ptrue p3.b\n"
+ "mov x13, #0x0\n"
+ "mov x8, #0x0\n"
+ "1:" // Tile loop
+ "str x13, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "mov x25, #0x3\n"
+ "mov x24, #0x3\n"
+ "str x8, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "ldr x23, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
+ "ldr x17, [%x[params_struct], %[offsetof_args_ld_input_col]]\n"
+ "mul x22, x13, x23\n" // offset = tile_i * ld_input_row
+ "ldr x21, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
+ "madd x22, x8, x17, x22\n" // offset += tile_j * ld_input_col
+ "ldr x16, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
+ "cntw x15\n"
+ "mul x20, x13, x21\n" // offset = tile_i * ld_output_row
+ "ldr x14, [%x[params_struct], %[offsetof_args_inptr]]\n"
+ "ldr x13, [%x[params_struct], %[offsetof_args_params]]\n"
+ "add x12, x17, x17\n"
+ "mul x22, x22, x25\n" // offset *= kernel_stride * output_size
+ "add x14, x14, x22, LSL #2\n" // inptr[0] += offset * sizeof(float)
+ "ldr x11, [%x[params_struct], %[offsetof_args_outptr]]\n"
+ "add x10, x14, x23, LSL #2\n"
+ "madd x20, x8, x16, x20\n" // offset += tile_j * ld_output_col
+ "add x9, x10, x23, LSL #2\n"
+ "whilelt p2.s, XZR, %x[n_channels]\n"
+ "ld1w { z14.s }, p3/Z, [x13]\n"
+ "mul x20, x20, x24\n" // offset *= output_tile_size
+ "ld1w { z0.s }, p3/Z, [x13, #1, MUL VL]\n"
+ "ld1w { z1.s }, p3/Z, [x13, #2, MUL VL]\n"
+ "add x28, x9, x23, LSL #2\n"
+ "ld1w { z2.s }, p3/Z, [x13, #3, MUL VL]\n"
+ "ld1w { z3.s }, p3/Z, [x13, #4, MUL VL]\n"
+ "add x27, x12, x17\n"
+ "add x11, x11, x20, LSL #2\n" // outptrs[0] += offset * sizeof(float)
+ "ld1w { z4.s }, p3/Z, [x13, #5, MUL VL]\n"
+ "ld1w { z5.s }, p3/Z, [x13, #6, MUL VL]\n"
+ "add x26, x28, x23, LSL #2\n"
+ "add x25, x27, x17\n"
+ "ld1w { z6.s }, p3/Z, [x13, #7, MUL VL]\n"
+ "addvl x13, x13, #16\n"
+ "add x24, x11, x21, LSL #2\n"
+ "ld1rw { z31.s }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+ "cmp x15, %x[n_channels]\n"
+ "add x23, x24, x21, LSL #2\n"
+ "ld1rw { z30.s }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+ "ld1w { z7.s }, p3/Z, [x13, #-8, MUL VL]\n"
+ "add x22, x16, x16\n"
+ "mov x21, #0x0\n"
+ "ld1w { z8.s }, p3/Z, [x13, #-7, MUL VL]\n"
+ "ld1w { z9.s }, p2/Z, [x9, x12, LSL #2]\n"
+ "sub x20, XZR, x15\n"
+ "ld1w { z10.s }, p2/Z, [x14]\n"
+ "ld1w { z11.s }, p2/Z, [x14, x25, LSL #2]\n"
+ "addvl x13, x13, #-6\n"
+ "ld1w { z12.s }, p2/Z, [x26]\n"
+ "ld1w { z13.s }, p2/Z, [x10, x12, LSL #2]\n"
+ "bge 3f\n"
+ "2:" // Tile loop: Channel loop
+ "movprfx z29, z14\n fmla z29.s, p3/M, z7.s, z9.s\n"
+ "movprfx z28, z14\n fmla z28.s, p3/M, z8.s, z9.s\n"
+ "whilelt p1.s, x15, %x[n_channels]\n"
+ "incw x21\n"
+ "movprfx z27, z14\n fmla z27.s, p3/M, z6.s, z9.s\n"
+ "fmla z29.s, p3/M, z4.s, z13.s\n"
+ "incw x15\n"
+ "mov p0.b, p2.b\n"
+ "movprfx z26, z14\n fmla z26.s, p3/M, z5.s, z9.s\n"
+ "movprfx z25, z14\n fmla z25.s, p3/M, z4.s, z9.s\n"
+ "incw x20\n"
+ "movprfx z24, z14\n fmla z24.s, p3/M, z3.s, z9.s\n"
+ "fmla z28.s, p3/M, z0.s, z10.s\n"
+ "ld1w { z23.s }, p2/Z, [x9, x27, LSL #2]\n"
+ "fmla z27.s, p3/M, z2.s, z11.s\n"
+ "ld1w { z18.s }, p2/Z, [x9, x17, LSL #2]\n"
+ "movprfx z22, z14\n fmla z22.s, p3/M, z2.s, z9.s\n"
+ "fmla z29.s, p3/M, z6.s, z18.s\n"
+ "movprfx z21, z14\n fmla z21.s, p3/M, z0.s, z9.s\n"
+ "fmla z28.s, p3/M, z5.s, z13.s\n"
+ "fmla z27.s, p3/M, z3.s, z13.s\n"
+ "fmla z26.s, p3/M, z2.s, z13.s\n"
+ "fmla z25.s, p3/M, z1.s, z13.s\n"
+ "fmla z24.s, p3/M, z0.s, z13.s\n"
+ "ld1w { z17.s }, p2/Z, [x14, x17, LSL #2]\n"
+ "fmla z22.s, p3/M, z6.s, z12.s\n"
+ "ld1w { z16.s }, p2/Z, [x26, x25, LSL #2]\n"
+ "movprfx z20, z14\n fmla z20.s, p3/M, z1.s, z9.s\n"
+ "fmla z29.s, p3/M, z0.s, z17.s\n"
+ "ld1w { z14.s }, p3/Z, [x13]\n"
+ "fmla z21.s, p3/M, z8.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x14, x27, LSL #2]\n"
+ "fmla z28.s, p3/M, z7.s, z18.s\n"
+ "fmla z20.s, p3/M, z0.s, z18.s\n"
+ "fmla z26.s, p3/M, z4.s, z18.s\n"
+ "fmla z25.s, p3/M, z3.s, z18.s\n"
+ "fmla z22.s, p3/M, z1.s, z18.s\n"
+ "ld1w { z19.s }, p2/Z, [x10]\n"
+ "fmla z29.s, p3/M, z2.s, z16.s\n"
+ "fmla z27.s, p3/M, z1.s, z16.s\n"
+ "ld1w { z18.s }, p2/Z, [x28]\n"
+ "fmla z24.s, p3/M, z4.s, z23.s\n"
+ "fmla z28.s, p3/M, z1.s, z17.s\n"
+ "ld1w { z16.s }, p2/Z, [x10, x25, LSL #2]\n"
+ "fmla z20.s, p3/M, z2.s, z23.s\n"
+ "fmla z21.s, p3/M, z1.s, z23.s\n"
+ "fmla z29.s, p3/M, z8.s, z23.s\n"
+ "fmla z27.s, p3/M, z7.s, z23.s\n"
+ "fmla z25.s, p3/M, z5.s, z23.s\n"
+ "fmla z26.s, p3/M, z0.s, z19.s\n"
+ "ld1w { z17.s }, p2/Z, [x28, x12, LSL #2]\n"
+ "fmla z22.s, p3/M, z3.s, z18.s\n"
+ "fmla z24.s, p3/M, z2.s, z16.s\n"
+ "fmla z20.s, p3/M, z4.s, z17.s\n"
+ "fmla z21.s, p3/M, z3.s, z17.s\n"
+ "fmla z28.s, p3/M, z3.s, z19.s\n"
+ "fmla z27.s, p3/M, z5.s, z16.s\n"
+ "ld1w { z19.s }, p2/Z, [x28, x25, LSL #2]\n"
+ "ld1w { z16.s }, p2/Z, [x26, x17, LSL #2]\n"
+ "fmla z26.s, p3/M, z6.s, z18.s\n"
+ "fmla z25.s, p3/M, z7.s, z17.s\n"
+ "ld1w { z18.s }, p2/Z, [x10, x17, LSL #2]\n"
+ "fmla z22.s, p3/M, z5.s, z17.s\n"
+ "fmla z24.s, p3/M, z6.s, z17.s\n"
+ "fmla z21.s, p3/M, z5.s, z19.s\n"
+ "fmla z20.s, p3/M, z6.s, z16.s\n"
+ "fmla z26.s, p3/M, z8.s, z17.s\n"
+ "fmla z22.s, p3/M, z7.s, z16.s\n"
+ "ld1w { z17.s }, p2/Z, [x26, x27, LSL #2]\n"
+ "fmla z29.s, p3/M, z3.s, z18.s\n"
+ "fmla z25.s, p3/M, z0.s, z18.s\n"
+ "fmla z24.s, p3/M, z8.s, z19.s\n"
+ "ld1w { z16.s }, p2/Z, [x10, x27, LSL #2]\n"
+ "fmla z20.s, p3/M, z8.s, z17.s\n"
+ "addvl x10, x10, #1\n"
+ "fmla z21.s, p3/M, z7.s, z17.s\n"
+ "fmla z28.s, p3/M, z4.s, z18.s\n"
+ "ld1w { z19.s }, p2/Z, [x28, x27, LSL #2]\n"
+ "fmla z26.s, p3/M, z1.s, z18.s\n"
+ "fmla z29.s, p3/M, z5.s, z16.s\n"
+ "ld1w { z17.s }, p2/Z, [x28, x17, LSL #2]\n"
+ "addvl x28, x28, #1\n"
+ "fmla z27.s, p3/M, z4.s, z16.s\n"
+ "fmla z25.s, p3/M, z2.s, z16.s\n"
+ "fmla z24.s, p3/M, z1.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x14, x12, LSL #2]\n"
+ "fmla z22.s, p3/M, z4.s, z17.s\n"
+ "addvl x14, x14, #1\n"
+ "fmla z20.s, p3/M, z3.s, z17.s\n"
+ "fmla z21.s, p3/M, z4.s, z19.s\n"
+ "ld1w { z4.s }, p3/Z, [x13, #5, MUL VL]\n"
+ "ld1w { z10.s }, p1/Z, [x14]\n"
+ "fmla z26.s, p3/M, z7.s, z17.s\n"
+ "fmla z25.s, p3/M, z6.s, z17.s\n"
+ "ld1w { z18.s }, p2/Z, [x9]\n"
+ "fmla z28.s, p3/M, z2.s, z16.s\n"
+ "fmla z29.s, p3/M, z1.s, z16.s\n"
+ "fmax z29.s, p3/M, z29.s, z31.s\n"
+ "ld1w { z1.s }, p3/Z, [x13, #2, MUL VL]\n"
+ "fmla z27.s, p3/M, z0.s, z16.s\n"
+ "ld1w { z17.s }, p2/Z, [x9, x25, LSL #2]\n"
+ "fmla z24.s, p3/M, z7.s, z19.s\n"
+ "addvl x9, x9, #1\n"
+ "fmla z20.s, p3/M, z5.s, z19.s\n"
+ "fmla z22.s, p3/M, z0.s, z18.s\n"
+ "ld1w { z0.s }, p3/Z, [x13, #1, MUL VL]\n"
+ "fmin z29.s, p3/M, z29.s, z30.s\n"
+ "fmla z21.s, p3/M, z2.s, z17.s\n"
+ "fmla z25.s, p3/M, z8.s, z19.s\n"
+ "ld1w { z16.s }, p2/Z, [x26, x12, LSL #2]\n"
+ "fmax z25.s, p3/M, z25.s, z31.s\n"
+ "fmla z28.s, p3/M, z6.s, z18.s\n"
+ "fmla z26.s, p3/M, z3.s, z18.s\n"
+ "fmax z28.s, p3/M, z28.s, z31.s\n"
+ "fmax z26.s, p3/M, z26.s, z31.s\n"
+ "fmla z27.s, p3/M, z8.s, z17.s\n"
+ "fmla z24.s, p3/M, z5.s, z17.s\n"
+ "fmax z27.s, p3/M, z27.s, z31.s\n"
+ "fmax z24.s, p3/M, z24.s, z31.s\n"
+ "fmla z22.s, p3/M, z8.s, z16.s\n"
+ "fmla z20.s, p3/M, z7.s, z16.s\n"
+ "fmax z22.s, p3/M, z22.s, z31.s\n"
+ "fmax z20.s, p3/M, z20.s, z31.s\n"
+ "fmla z21.s, p3/M, z6.s, z16.s\n"
+ "fmax z21.s, p3/M, z21.s, z31.s\n"
+ "addvl x26, x26, #1\n"
+ "ld1w { z2.s }, p3/Z, [x13, #3, MUL VL]\n"
+ "ld1w { z3.s }, p3/Z, [x13, #4, MUL VL]\n"
+ "ld1w { z5.s }, p3/Z, [x13, #6, MUL VL]\n"
+ "whilelt p2.s, x21, %x[n_channels]\n"
+ "cmp x15, %x[n_channels]\n"
+ "ld1w { z6.s }, p3/Z, [x13, #7, MUL VL]\n"
+ "addvl x13, x13, #16\n"
+ "fmin z28.s, p3/M, z28.s, z30.s\n"
+ "ld1w { z9.s }, p1/Z, [x9, x12, LSL #2]\n"
+ "fmin z27.s, p3/M, z27.s, z30.s\n"
+ "fmin z26.s, p3/M, z26.s, z30.s\n"
+ "ld1w { z11.s }, p1/Z, [x14, x25, LSL #2]\n"
+ "ld1w { z12.s }, p1/Z, [x26]\n"
+ "fmin z25.s, p3/M, z25.s, z30.s\n"
+ "fmin z24.s, p3/M, z24.s, z30.s\n"
+ "ld1w { z13.s }, p1/Z, [x10, x12, LSL #2]\n"
+ "st1w { z28.s }, p0, [x11]\n"
+ "fmin z22.s, p3/M, z22.s, z30.s\n"
+ "fmin z20.s, p3/M, z20.s, z30.s\n"
+ "st1w { z29.s }, p0, [x11, x16, LSL #2]\n"
+ "ld1w { z7.s }, p3/Z, [x13, #-8, MUL VL]\n"
+ "fmin z21.s, p3/M, z21.s, z30.s\n"
+ "st1w { z27.s }, p0, [x11, x22, LSL #2]\n"
+ "addvl x11, x11, #1\n"
+ "ld1w { z8.s }, p3/Z, [x13, #-7, MUL VL]\n"
+ "st1w { z26.s }, p0, [x24]\n"
+ "addvl x13, x13, #-6\n"
+ "st1w { z25.s }, p0, [x24, x16, LSL #2]\n"
+ "st1w { z24.s }, p0, [x24, x22, LSL #2]\n"
+ "addvl x24, x24, #1\n"
+ "st1w { z22.s }, p0, [x23]\n"
+ "st1w { z20.s }, p0, [x23, x16, LSL #2]\n"
+ "st1w { z21.s }, p0, [x23, x22, LSL #2]\n"
+ "addvl x23, x23, #1\n"
+ "blt 2b\n"
+ "3:" // Tile loop: Channel tail
+ "movprfx z29, z14\n fmla z29.s, p3/M, z7.s, z9.s\n"
+ "movprfx z28, z14\n fmla z28.s, p3/M, z8.s, z9.s\n"
+ "ldr x8, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "ldr x13, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "movprfx z27, z14\n fmla z27.s, p3/M, z6.s, z9.s\n"
+ "fmla z29.s, p3/M, z4.s, z13.s\n"
+ "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+ "add x8, x8, #0x1\n"
+ "movprfx z26, z14\n fmla z26.s, p3/M, z5.s, z9.s\n"
+ "movprfx z25, z14\n fmla z25.s, p3/M, z4.s, z9.s\n"
+ "cmp x8, x20\n"
+ "add x21, x13, #0x1\n"
+ "movprfx z24, z14\n fmla z24.s, p3/M, z3.s, z9.s\n"
+ "fmla z28.s, p3/M, z0.s, z10.s\n"
+ "ld1w { z23.s }, p2/Z, [x9, x27, LSL #2]\n"
+ "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
+ "fmla z27.s, p3/M, z2.s, z11.s\n"
+ "ld1w { z18.s }, p2/Z, [x9, x17, LSL #2]\n"
+ "movprfx z22, z14\n fmla z22.s, p3/M, z2.s, z9.s\n"
+ "csel x13, x13, x21, LT\n"
+ "fmla z29.s, p3/M, z6.s, z18.s\n"
+ "movprfx z21, z14\n fmla z21.s, p3/M, z0.s, z9.s\n"
+ "mov p0.b, p2.b\n"
+ "csel x8, x8, XZR, LT\n"
+ "fmla z28.s, p3/M, z5.s, z13.s\n"
+ "fmla z27.s, p3/M, z3.s, z13.s\n"
+ "cmp x13, x20\n"
+ "fmla z26.s, p3/M, z2.s, z13.s\n"
+ "fmla z25.s, p3/M, z1.s, z13.s\n"
+ "fmla z24.s, p3/M, z0.s, z13.s\n"
+ "ld1w { z17.s }, p2/Z, [x14, x17, LSL #2]\n"
+ "fmla z22.s, p3/M, z6.s, z12.s\n"
+ "ld1w { z16.s }, p2/Z, [x26, x25, LSL #2]\n"
+ "movprfx z20, z14\n fmla z20.s, p3/M, z1.s, z9.s\n"
+ "fmla z29.s, p3/M, z0.s, z17.s\n"
+ "fmla z21.s, p3/M, z8.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x14, x27, LSL #2]\n"
+ "fmla z28.s, p3/M, z7.s, z18.s\n"
+ "fmla z20.s, p3/M, z0.s, z18.s\n"
+ "fmla z26.s, p3/M, z4.s, z18.s\n"
+ "fmla z25.s, p3/M, z3.s, z18.s\n"
+ "fmla z22.s, p3/M, z1.s, z18.s\n"
+ "ld1w { z19.s }, p2/Z, [x10]\n"
+ "fmla z29.s, p3/M, z2.s, z16.s\n"
+ "fmla z27.s, p3/M, z1.s, z16.s\n"
+ "ld1w { z18.s }, p2/Z, [x28]\n"
+ "fmla z24.s, p3/M, z4.s, z23.s\n"
+ "fmla z28.s, p3/M, z1.s, z17.s\n"
+ "ld1w { z16.s }, p2/Z, [x10, x25, LSL #2]\n"
+ "fmla z20.s, p3/M, z2.s, z23.s\n"
+ "fmla z21.s, p3/M, z1.s, z23.s\n"
+ "fmla z29.s, p3/M, z8.s, z23.s\n"
+ "fmla z27.s, p3/M, z7.s, z23.s\n"
+ "fmla z25.s, p3/M, z5.s, z23.s\n"
+ "fmla z26.s, p3/M, z0.s, z19.s\n"
+ "ld1w { z17.s }, p2/Z, [x28, x12, LSL #2]\n"
+ "fmla z22.s, p3/M, z3.s, z18.s\n"
+ "fmla z24.s, p3/M, z2.s, z16.s\n"
+ "fmla z20.s, p3/M, z4.s, z17.s\n"
+ "fmla z21.s, p3/M, z3.s, z17.s\n"
+ "fmla z28.s, p3/M, z3.s, z19.s\n"
+ "fmla z27.s, p3/M, z5.s, z16.s\n"
+ "ld1w { z19.s }, p2/Z, [x28, x25, LSL #2]\n"
+ "ld1w { z16.s }, p2/Z, [x26, x17, LSL #2]\n"
+ "fmla z26.s, p3/M, z6.s, z18.s\n"
+ "fmla z25.s, p3/M, z7.s, z17.s\n"
+ "ld1w { z18.s }, p2/Z, [x10, x17, LSL #2]\n"
+ "fmla z22.s, p3/M, z5.s, z17.s\n"
+ "fmla z24.s, p3/M, z6.s, z17.s\n"
+ "fmla z21.s, p3/M, z5.s, z19.s\n"
+ "fmla z20.s, p3/M, z6.s, z16.s\n"
+ "fmla z26.s, p3/M, z8.s, z17.s\n"
+ "fmla z22.s, p3/M, z7.s, z16.s\n"
+ "ld1w { z17.s }, p2/Z, [x26, x27, LSL #2]\n"
+ "fmla z29.s, p3/M, z3.s, z18.s\n"
+ "fmla z25.s, p3/M, z0.s, z18.s\n"
+ "fmla z24.s, p3/M, z8.s, z19.s\n"
+ "ld1w { z16.s }, p2/Z, [x10, x27, LSL #2]\n"
+ "fmla z20.s, p3/M, z8.s, z17.s\n"
+ "fmla z21.s, p3/M, z7.s, z17.s\n"
+ "fmla z28.s, p3/M, z4.s, z18.s\n"
+ "ld1w { z19.s }, p2/Z, [x28, x27, LSL #2]\n"
+ "fmla z26.s, p3/M, z1.s, z18.s\n"
+ "fmla z29.s, p3/M, z5.s, z16.s\n"
+ "ld1w { z17.s }, p2/Z, [x28, x17, LSL #2]\n"
+ "fmla z27.s, p3/M, z4.s, z16.s\n"
+ "fmla z25.s, p3/M, z2.s, z16.s\n"
+ "fmla z24.s, p3/M, z1.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x14, x12, LSL #2]\n"
+ "fmla z22.s, p3/M, z4.s, z17.s\n"
+ "fmla z20.s, p3/M, z3.s, z17.s\n"
+ "fmla z21.s, p3/M, z4.s, z19.s\n"
+ "fmla z26.s, p3/M, z7.s, z17.s\n"
+ "fmla z25.s, p3/M, z6.s, z17.s\n"
+ "ld1w { z18.s }, p2/Z, [x9]\n"
+ "fmla z28.s, p3/M, z2.s, z16.s\n"
+ "fmla z29.s, p3/M, z1.s, z16.s\n"
+ "fmax z29.s, p3/M, z29.s, z31.s\n"
+ "fmin z29.s, p3/M, z29.s, z30.s\n"
+ "fmla z27.s, p3/M, z0.s, z16.s\n"
+ "ld1w { z17.s }, p2/Z, [x9, x25, LSL #2]\n"
+ "fmla z24.s, p3/M, z7.s, z19.s\n"
+ "fmla z20.s, p3/M, z5.s, z19.s\n"
+ "fmla z22.s, p3/M, z0.s, z18.s\n"
+ "fmla z21.s, p3/M, z2.s, z17.s\n"
+ "fmla z25.s, p3/M, z8.s, z19.s\n"
+ "ld1w { z16.s }, p2/Z, [x26, x12, LSL #2]\n"
+ "fmax z25.s, p3/M, z25.s, z31.s\n"
+ "fmla z28.s, p3/M, z6.s, z18.s\n"
+ "fmla z26.s, p3/M, z3.s, z18.s\n"
+ "fmax z28.s, p3/M, z28.s, z31.s\n"
+ "fmax z26.s, p3/M, z26.s, z31.s\n"
+ "fmla z27.s, p3/M, z8.s, z17.s\n"
+ "fmla z24.s, p3/M, z5.s, z17.s\n"
+ "fmax z27.s, p3/M, z27.s, z31.s\n"
+ "fmax z24.s, p3/M, z24.s, z31.s\n"
+ "fmla z22.s, p3/M, z8.s, z16.s\n"
+ "fmla z20.s, p3/M, z7.s, z16.s\n"
+ "fmax z22.s, p3/M, z22.s, z31.s\n"
+ "fmax z20.s, p3/M, z20.s, z31.s\n"
+ "fmla z21.s, p3/M, z6.s, z16.s\n"
+ "fmax z21.s, p3/M, z21.s, z31.s\n"
+ "fmin z28.s, p3/M, z28.s, z30.s\n"
+ "st1w { z28.s }, p0, [x11]\n"
+ "fmin z27.s, p3/M, z27.s, z30.s\n"
+ "fmin z26.s, p3/M, z26.s, z30.s\n"
+ "st1w { z29.s }, p0, [x11, x16, LSL #2]\n"
+ "fmin z25.s, p3/M, z25.s, z30.s\n"
+ "fmin z24.s, p3/M, z24.s, z30.s\n"
+ "st1w { z27.s }, p0, [x11, x22, LSL #2]\n"
+ "fmin z22.s, p3/M, z22.s, z30.s\n"
+ "fmin z20.s, p3/M, z20.s, z30.s\n"
+ "st1w { z26.s }, p0, [x24]\n"
+ "fmin z21.s, p3/M, z21.s, z30.s\n"
+ "st1w { z25.s }, p0, [x24, x16, LSL #2]\n"
+ "st1w { z24.s }, p0, [x24, x22, LSL #2]\n"
+ "st1w { z22.s }, p0, [x23]\n"
+ "st1w { z20.s }, p0, [x23, x16, LSL #2]\n"
+ "st1w { z21.s }, p0, [x23, x22, LSL #2]\n"
+ "blt 1b\n"
+ :
+ : [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp
new file mode 100644
index 0000000000..4809b0c45c
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp
@@ -0,0 +1,477 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(
+ const float *const *const input_ptrs,
+ float *const *const outptrs,
+ const void *params,
+ unsigned int n_channels,
+ const float activation_min,
+ const float activation_max
+)
+{
+ struct Args
+ {
+ float *const *outptrs;
+ const void *params;
+ const float min, max;
+ const float *inptrs[25];
+
+ Args(
+ const float *const *const input_ptrs,
+ float *const *const outptrs,
+ const void *const params,
+ const float min,
+ const float max
+ ) : outptrs(outptrs), params(params), min(min), max(max)
+ {
+ inptrs[0] = input_ptrs[12];
+ inptrs[1] = input_ptrs[0];
+ inptrs[2] = input_ptrs[4];
+ inptrs[3] = input_ptrs[20];
+ inptrs[4] = input_ptrs[7];
+ inptrs[5] = input_ptrs[24];
+ inptrs[6] = input_ptrs[11];
+ inptrs[7] = input_ptrs[1];
+ inptrs[8] = input_ptrs[3];
+ inptrs[9] = input_ptrs[13];
+ inptrs[10] = input_ptrs[5];
+ inptrs[11] = input_ptrs[9];
+ inptrs[12] = input_ptrs[15];
+ inptrs[13] = input_ptrs[17];
+ inptrs[14] = input_ptrs[19];
+ inptrs[15] = input_ptrs[21];
+ inptrs[16] = input_ptrs[6];
+ inptrs[17] = input_ptrs[8];
+ inptrs[18] = input_ptrs[23];
+ inptrs[19] = input_ptrs[16];
+ inptrs[20] = input_ptrs[2];
+ inptrs[21] = input_ptrs[18];
+ inptrs[22] = input_ptrs[10];
+ inptrs[23] = input_ptrs[14];
+ inptrs[24] = input_ptrs[22];
+
+ }
+ };
+
+ Args params_struct(input_ptrs, outptrs, params,
+ activation_min, activation_max);
+
+ __asm__ __volatile__(
+ "ptrue p3.b\n"
+ "ldr x8, [%x[params_struct], %[offsetof_args_params]]\n"
+ "add x17, %x[params_struct], %[offsetof_Args_inptrs]\n"
+ "ld1w { z14.s }, p3/Z, [x8]\n"
+ "cntw x16\n"
+ "mov x15, #0x0\n"
+ "ld1w { z0.s }, p3/Z, [x8, #1, MUL VL]\n"
+ "ld1w { z1.s }, p3/Z, [x8, #2, MUL VL]\n"
+ "whilelt p2.s, XZR, %x[n_channels]\n"
+ "ld1w { z2.s }, p3/Z, [x8, #3, MUL VL]\n"
+ "ld1w { z3.s }, p3/Z, [x8, #4, MUL VL]\n"
+ "cmp x16, %x[n_channels]\n"
+ "ld1w { z4.s }, p3/Z, [x8, #5, MUL VL]\n"
+ "ld1w { z5.s }, p3/Z, [x8, #6, MUL VL]\n"
+ "sub x14, XZR, x16\n"
+ "ld1w { z6.s }, p3/Z, [x8, #7, MUL VL]\n"
+ "addvl x8, x8, #16\n"
+ "ldp x24, x23, [x17, #0x0]\n"
+ "ldp x22, x21, [x17, #0x10]\n"
+ "ldr x20, [x17, #0x20]\n"
+ "ldr x13, [%x[params_struct], %[offsetof_args_outptrs]]\n"
+ "ld1rw { z31.s }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+ "ld1rw { z30.s }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+ "ld1w { z7.s }, p3/Z, [x8, #-8, MUL VL]\n"
+ "ld1w { z8.s }, p3/Z, [x8, #-7, MUL VL]\n"
+ "ld1w { z9.s }, p2/Z, [x24, x15, LSL #2]\n"
+ "addvl x8, x8, #-6\n"
+ "ld1w { z10.s }, p2/Z, [x23, x15, LSL #2]\n"
+ "ld1w { z11.s }, p2/Z, [x22, x15, LSL #2]\n"
+ "ld1w { z12.s }, p2/Z, [x21, x15, LSL #2]\n"
+ "ld1w { z13.s }, p2/Z, [x20, x15, LSL #2]\n"
+ "bge 2f\n"
+ "1:" // Channel loop
+ "movprfx z29, z14\n fmla z29.s, p3/M, z8.s, z9.s\n"
+ "movprfx z28, z14\n fmla z28.s, p3/M, z7.s, z9.s\n"
+ "ldr x23, [x17, #0x30]\n"
+ "ldr x26, [x17, #0x38]\n"
+ "movprfx z27, z14\n fmla z27.s, p3/M, z6.s, z9.s\n"
+ "fmla z29.s, p3/M, z0.s, z10.s\n"
+ "ldr x22, [x17, #0x28]\n"
+ "ldr x21, [x17, #0x48]\n"
+ "fmla z28.s, p3/M, z4.s, z13.s\n"
+ "movprfx z26, z14\n fmla z26.s, p3/M, z5.s, z9.s\n"
+ "ldr x20, [x17, #0x40]\n"
+ "ld1w { z19.s }, p2/Z, [x21, x15, LSL #2]\n"
+ "movprfx z25, z14\n fmla z25.s, p3/M, z4.s, z9.s\n"
+ "movprfx z24, z14\n fmla z24.s, p3/M, z3.s, z9.s\n"
+ "ldr x25, [x17, #0x50]\n"
+ "ldr x24, [x17, #0x58]\n"
+ "fmla z27.s, p3/M, z2.s, z11.s\n"
+ "ld1w { z18.s }, p2/Z, [x23, x15, LSL #2]\n"
+ "movprfx z23, z14\n fmla z23.s, p3/M, z2.s, z9.s\n"
+ "ldr x23, [x17, #0x60]\n"
+ "fmla z29.s, p3/M, z5.s, z13.s\n"
+ "fmla z28.s, p3/M, z6.s, z18.s\n"
+ "ldr x12, [x17, #0x70]\n"
+ "ldr x11, [x17, #0x88]\n"
+ "movprfx z22, z14\n fmla z22.s, p3/M, z0.s, z9.s\n"
+ "fmla z27.s, p3/M, z3.s, z13.s\n"
+ "incw x14\n"
+ "mov p1.b, p2.b\n"
+ "fmla z26.s, p3/M, z2.s, z13.s\n"
+ "fmla z25.s, p3/M, z1.s, z13.s\n"
+ "ldr x10, [x13, #0x0]\n"
+ "whilelt p0.s, x16, %x[n_channels]\n"
+ "fmla z24.s, p3/M, z0.s, z13.s\n"
+ "ld1w { z17.s }, p2/Z, [x26, x15, LSL #2]\n"
+ "fmla z23.s, p3/M, z6.s, z12.s\n"
+ "ld1w { z16.s }, p2/Z, [x22, x15, LSL #2]\n"
+ "movprfx z21, z14\n fmla z21.s, p3/M, z1.s, z9.s\n"
+ "fmla z29.s, p3/M, z7.s, z18.s\n"
+ "ldr x22, [x17, #0x68]\n"
+ "ldr x21, [x17, #0x78]\n"
+ "fmla z28.s, p3/M, z0.s, z17.s\n"
+ "fmla z22.s, p3/M, z8.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x20, x15, LSL #2]\n"
+ "ldr x20, [x17, #0x80]\n"
+ "fmla z26.s, p3/M, z4.s, z18.s\n"
+ "fmla z25.s, p3/M, z3.s, z18.s\n"
+ "ldr x9, [x13, #0x8]\n"
+ "ldr x28, [x13, #0x10]\n"
+ "fmla z21.s, p3/M, z0.s, z18.s\n"
+ "fmla z24.s, p3/M, z4.s, z19.s\n"
+ "ldr x27, [x13, #0x18]\n"
+ "ld1w { z14.s }, p3/Z, [x8]\n"
+ "fmla z23.s, p3/M, z1.s, z18.s\n"
+ "fmla z29.s, p3/M, z1.s, z17.s\n"
+ "ld1w { z20.s }, p2/Z, [x25, x15, LSL #2]\n"
+ "ld1w { z17.s }, p2/Z, [x24, x15, LSL #2]\n"
+ "fmla z28.s, p3/M, z2.s, z16.s\n"
+ "fmla z27.s, p3/M, z1.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x23, x15, LSL #2]\n"
+ "ldr x26, [x17, #0x90]\n"
+ "fmla z25.s, p3/M, z5.s, z19.s\n"
+ "fmla z21.s, p3/M, z2.s, z19.s\n"
+ "ldr x25, [x17, #0xa0]\n"
+ "ldr x24, [x17, #0x98]\n"
+ "fmla z26.s, p3/M, z0.s, z20.s\n"
+ "fmla z24.s, p3/M, z2.s, z17.s\n"
+ "fmla z28.s, p3/M, z8.s, z19.s\n"
+ "fmla z27.s, p3/M, z7.s, z19.s\n"
+ "fmla z22.s, p3/M, z1.s, z19.s\n"
+ "fmla z23.s, p3/M, z3.s, z16.s\n"
+ "ld1w { z18.s }, p2/Z, [x22, x15, LSL #2]\n"
+ "ldr x23, [x17, #0xa8]\n"
+ "fmla z26.s, p3/M, z6.s, z16.s\n"
+ "fmla z25.s, p3/M, z7.s, z18.s\n"
+ "ld1w { z19.s }, p2/Z, [x20, x15, LSL #2]\n"
+ "ldr x22, [x17, #0xc0]\n"
+ "fmla z24.s, p3/M, z6.s, z18.s\n"
+ "fmla z21.s, p3/M, z4.s, z18.s\n"
+ "fmla z29.s, p3/M, z3.s, z20.s\n"
+ "fmla z27.s, p3/M, z5.s, z17.s\n"
+ "ld1w { z17.s }, p2/Z, [x12, x15, LSL #2]\n"
+ "ld1w { z16.s }, p2/Z, [x21, x15, LSL #2]\n"
+ "fmla z23.s, p3/M, z5.s, z18.s\n"
+ "fmla z22.s, p3/M, z3.s, z18.s\n"
+ "ldr x21, [x17, #0xb0]\n"
+ "ldr x20, [x17, #0xb8]\n"
+ "fmla z26.s, p3/M, z8.s, z18.s\n"
+ "fmla z24.s, p3/M, z8.s, z17.s\n"
+ "fmla z21.s, p3/M, z6.s, z16.s\n"
+ "fmla z28.s, p3/M, z3.s, z19.s\n"
+ "fmla z25.s, p3/M, z0.s, z19.s\n"
+ "fmla z22.s, p3/M, z5.s, z17.s\n"
+ "ld1w { z17.s }, p2/Z, [x11, x15, LSL #2]\n"
+ "fmla z23.s, p3/M, z7.s, z16.s\n"
+ "ld1w { z18.s }, p2/Z, [x26, x15, LSL #2]\n"
+ "fmla z29.s, p3/M, z4.s, z19.s\n"
+ "fmla z26.s, p3/M, z1.s, z19.s\n"
+ "fmla z28.s, p3/M, z5.s, z17.s\n"
+ "ld1w { z16.s }, p2/Z, [x24, x15, LSL #2]\n"
+ "fmla z27.s, p3/M, z4.s, z17.s\n"
+ "fmla z25.s, p3/M, z2.s, z17.s\n"
+ "fmla z24.s, p3/M, z1.s, z17.s\n"
+ "fmla z21.s, p3/M, z8.s, z18.s\n"
+ "ld1w { z17.s }, p2/Z, [x25, x15, LSL #2]\n"
+ "ldr x25, [x17, #0x20]\n"
+ "fmla z22.s, p3/M, z7.s, z18.s\n"
+ "ld1w { z18.s }, p2/Z, [x23, x15, LSL #2]\n"
+ "fmla z29.s, p3/M, z2.s, z17.s\n"
+ "fmla z26.s, p3/M, z7.s, z16.s\n"
+ "fmla z25.s, p3/M, z6.s, z16.s\n"
+ "fmla z23.s, p3/M, z4.s, z16.s\n"
+ "fmla z21.s, p3/M, z3.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x21, x15, LSL #2]\n"
+ "fmla z22.s, p3/M, z4.s, z18.s\n"
+ "fmla z28.s, p3/M, z1.s, z17.s\n"
+ "fmax z28.s, p3/M, z28.s, z31.s\n"
+ "fmin z28.s, p3/M, z28.s, z30.s\n"
+ "fmla z27.s, p3/M, z0.s, z17.s\n"
+ "ld1w { z17.s }, p2/Z, [x20, x15, LSL #2]\n"
+ "fmla z29.s, p3/M, z6.s, z16.s\n"
+ "fmax z29.s, p3/M, z29.s, z31.s\n"
+ "fmla z24.s, p3/M, z7.s, z18.s\n"
+ "fmla z21.s, p3/M, z5.s, z18.s\n"
+ "fmin z29.s, p3/M, z29.s, z30.s\n"
+ "st1w { z29.s }, p1, [x10, x14, LSL #2]\n"
+ "fmla z23.s, p3/M, z0.s, z16.s\n"
+ "fmla z22.s, p3/M, z2.s, z17.s\n"
+ "ldr x24, [x13, #0x20]\n"
+ "st1w { z28.s }, p1, [x9, x14, LSL #2]\n"
+ "fmla z25.s, p3/M, z8.s, z18.s\n"
+ "fmla z26.s, p3/M, z3.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x22, x15, LSL #2]\n"
+ "ldp x23, x22, [x17, #0x0]\n"
+ "fmla z27.s, p3/M, z8.s, z17.s\n"
+ "fmla z24.s, p3/M, z5.s, z17.s\n"
+ "ldp x21, x20, [x17, #0x10]\n"
+ "fmax z27.s, p3/M, z27.s, z31.s\n"
+ "fmla z23.s, p3/M, z8.s, z16.s\n"
+ "fmla z21.s, p3/M, z7.s, z16.s\n"
+ "fmax z26.s, p3/M, z26.s, z31.s\n"
+ "fmax z25.s, p3/M, z25.s, z31.s\n"
+ "fmla z22.s, p3/M, z6.s, z16.s\n"
+ "incw x15\n"
+ "ld1w { z9.s }, p0/Z, [x23, x16, LSL #2]\n"
+ "ld1w { z10.s }, p0/Z, [x22, x16, LSL #2]\n"
+ "ld1w { z11.s }, p0/Z, [x21, x16, LSL #2]\n"
+ "ld1w { z12.s }, p0/Z, [x20, x16, LSL #2]\n"
+ "fmin z27.s, p3/M, z27.s, z30.s\n"
+ "fmin z26.s, p3/M, z26.s, z30.s\n"
+ "ld1w { z13.s }, p0/Z, [x25, x16, LSL #2]\n"
+ "incw x16\n"
+ "fmin z25.s, p3/M, z25.s, z30.s\n"
+ "st1w { z27.s }, p1, [x28, x14, LSL #2]\n"
+ "fmax z24.s, p3/M, z24.s, z31.s\n"
+ "fmax z23.s, p3/M, z23.s, z31.s\n"
+ "st1w { z26.s }, p1, [x27, x14, LSL #2]\n"
+ "ldr x23, [x13, #0x28]\n"
+ "fmax z21.s, p3/M, z21.s, z31.s\n"
+ "fmax z22.s, p3/M, z22.s, z31.s\n"
+ "st1w { z25.s }, p1, [x24, x14, LSL #2]\n"
+ "ldr x22, [x13, #0x30]\n"
+ "ldr x21, [x13, #0x38]\n"
+ "ldr x20, [x13, #0x40]\n"
+ "whilelt p2.s, x15, %x[n_channels]\n"
+ "cmp x16, %x[n_channels]\n"
+ "ld1w { z0.s }, p3/Z, [x8, #1, MUL VL]\n"
+ "ld1w { z1.s }, p3/Z, [x8, #2, MUL VL]\n"
+ "fmin z24.s, p3/M, z24.s, z30.s\n"
+ "fmin z23.s, p3/M, z23.s, z30.s\n"
+ "ld1w { z2.s }, p3/Z, [x8, #3, MUL VL]\n"
+ "ld1w { z3.s }, p3/Z, [x8, #4, MUL VL]\n"
+ "fmin z21.s, p3/M, z21.s, z30.s\n"
+ "fmin z22.s, p3/M, z22.s, z30.s\n"
+ "ld1w { z4.s }, p3/Z, [x8, #5, MUL VL]\n"
+ "ld1w { z5.s }, p3/Z, [x8, #6, MUL VL]\n"
+ "st1w { z24.s }, p1, [x23, x14, LSL #2]\n"
+ "ld1w { z6.s }, p3/Z, [x8, #7, MUL VL]\n"
+ "addvl x8, x8, #16\n"
+ "st1w { z23.s }, p1, [x22, x14, LSL #2]\n"
+ "ld1w { z7.s }, p3/Z, [x8, #-8, MUL VL]\n"
+ "st1w { z21.s }, p1, [x21, x14, LSL #2]\n"
+ "ld1w { z8.s }, p3/Z, [x8, #-7, MUL VL]\n"
+ "addvl x8, x8, #-6\n"
+ "st1w { z22.s }, p1, [x20, x14, LSL #2]\n"
+ "blt 1b\n"
+ "2:" // Channel tail
+ "movprfx z29, z14\n fmla z29.s, p3/M, z8.s, z9.s\n"
+ "movprfx z28, z14\n fmla z28.s, p3/M, z7.s, z9.s\n"
+ "ldr x23, [x17, #0x30]\n"
+ "ldr x26, [x17, #0x38]\n"
+ "movprfx z27, z14\n fmla z27.s, p3/M, z6.s, z9.s\n"
+ "fmla z29.s, p3/M, z0.s, z10.s\n"
+ "ldr x22, [x17, #0x28]\n"
+ "ldr x21, [x17, #0x48]\n"
+ "fmla z28.s, p3/M, z4.s, z13.s\n"
+ "movprfx z26, z14\n fmla z26.s, p3/M, z5.s, z9.s\n"
+ "ldr x20, [x17, #0x40]\n"
+ "ld1w { z19.s }, p2/Z, [x21, x15, LSL #2]\n"
+ "movprfx z25, z14\n fmla z25.s, p3/M, z4.s, z9.s\n"
+ "movprfx z24, z14\n fmla z24.s, p3/M, z3.s, z9.s\n"
+ "ldr x25, [x17, #0x50]\n"
+ "ldr x24, [x17, #0x58]\n"
+ "fmla z27.s, p3/M, z2.s, z11.s\n"
+ "ld1w { z18.s }, p2/Z, [x23, x15, LSL #2]\n"
+ "movprfx z23, z14\n fmla z23.s, p3/M, z2.s, z9.s\n"
+ "ldr x23, [x17, #0x60]\n"
+ "fmla z29.s, p3/M, z5.s, z13.s\n"
+ "fmla z28.s, p3/M, z6.s, z18.s\n"
+ "ldr x12, [x17, #0x70]\n"
+ "ldr x11, [x17, #0x88]\n"
+ "movprfx z22, z14\n fmla z22.s, p3/M, z0.s, z9.s\n"
+ "fmla z27.s, p3/M, z3.s, z13.s\n"
+ "incw x14\n"
+ "mov p0.b, p2.b\n"
+ "fmla z26.s, p3/M, z2.s, z13.s\n"
+ "fmla z25.s, p3/M, z1.s, z13.s\n"
+ "ldr x10, [x13, #0x0]\n"
+ "ldr x9, [x13, #0x8]\n"
+ "fmla z24.s, p3/M, z0.s, z13.s\n"
+ "ld1w { z17.s }, p2/Z, [x26, x15, LSL #2]\n"
+ "fmla z23.s, p3/M, z6.s, z12.s\n"
+ "ld1w { z16.s }, p2/Z, [x22, x15, LSL #2]\n"
+ "movprfx z21, z14\n fmla z21.s, p3/M, z1.s, z9.s\n"
+ "fmla z29.s, p3/M, z7.s, z18.s\n"
+ "ldr x22, [x17, #0x68]\n"
+ "ldr x21, [x17, #0x78]\n"
+ "fmla z28.s, p3/M, z0.s, z17.s\n"
+ "fmla z22.s, p3/M, z8.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x20, x15, LSL #2]\n"
+ "ldr x20, [x17, #0x80]\n"
+ "fmla z26.s, p3/M, z4.s, z18.s\n"
+ "fmla z25.s, p3/M, z3.s, z18.s\n"
+ "ldr x28, [x13, #0x10]\n"
+ "ldr x27, [x13, #0x18]\n"
+ "fmla z21.s, p3/M, z0.s, z18.s\n"
+ "fmla z24.s, p3/M, z4.s, z19.s\n"
+ "fmla z23.s, p3/M, z1.s, z18.s\n"
+ "fmla z29.s, p3/M, z1.s, z17.s\n"
+ "ld1w { z20.s }, p2/Z, [x25, x15, LSL #2]\n"
+ "ld1w { z17.s }, p2/Z, [x24, x15, LSL #2]\n"
+ "fmla z28.s, p3/M, z2.s, z16.s\n"
+ "fmla z27.s, p3/M, z1.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x23, x15, LSL #2]\n"
+ "ldr x26, [x17, #0x90]\n"
+ "fmla z25.s, p3/M, z5.s, z19.s\n"
+ "fmla z21.s, p3/M, z2.s, z19.s\n"
+ "ldr x25, [x17, #0xa0]\n"
+ "ldr x24, [x17, #0x98]\n"
+ "fmla z26.s, p3/M, z0.s, z20.s\n"
+ "fmla z24.s, p3/M, z2.s, z17.s\n"
+ "fmla z28.s, p3/M, z8.s, z19.s\n"
+ "fmla z27.s, p3/M, z7.s, z19.s\n"
+ "fmla z22.s, p3/M, z1.s, z19.s\n"
+ "fmla z23.s, p3/M, z3.s, z16.s\n"
+ "ld1w { z18.s }, p2/Z, [x22, x15, LSL #2]\n"
+ "ldr x23, [x17, #0xa8]\n"
+ "fmla z26.s, p3/M, z6.s, z16.s\n"
+ "fmla z25.s, p3/M, z7.s, z18.s\n"
+ "ld1w { z19.s }, p2/Z, [x20, x15, LSL #2]\n"
+ "ldr x22, [x17, #0xc0]\n"
+ "fmla z24.s, p3/M, z6.s, z18.s\n"
+ "fmla z21.s, p3/M, z4.s, z18.s\n"
+ "fmla z29.s, p3/M, z3.s, z20.s\n"
+ "fmla z27.s, p3/M, z5.s, z17.s\n"
+ "ld1w { z17.s }, p2/Z, [x12, x15, LSL #2]\n"
+ "ld1w { z16.s }, p2/Z, [x21, x15, LSL #2]\n"
+ "fmla z23.s, p3/M, z5.s, z18.s\n"
+ "fmla z22.s, p3/M, z3.s, z18.s\n"
+ "ldr x21, [x17, #0xb0]\n"
+ "ldr x20, [x17, #0xb8]\n"
+ "fmla z26.s, p3/M, z8.s, z18.s\n"
+ "fmla z24.s, p3/M, z8.s, z17.s\n"
+ "fmla z21.s, p3/M, z6.s, z16.s\n"
+ "fmla z28.s, p3/M, z3.s, z19.s\n"
+ "fmla z25.s, p3/M, z0.s, z19.s\n"
+ "fmla z22.s, p3/M, z5.s, z17.s\n"
+ "ld1w { z17.s }, p2/Z, [x11, x15, LSL #2]\n"
+ "fmla z23.s, p3/M, z7.s, z16.s\n"
+ "ld1w { z18.s }, p2/Z, [x26, x15, LSL #2]\n"
+ "fmla z29.s, p3/M, z4.s, z19.s\n"
+ "fmla z26.s, p3/M, z1.s, z19.s\n"
+ "fmla z28.s, p3/M, z5.s, z17.s\n"
+ "ld1w { z16.s }, p2/Z, [x24, x15, LSL #2]\n"
+ "fmla z27.s, p3/M, z4.s, z17.s\n"
+ "fmla z25.s, p3/M, z2.s, z17.s\n"
+ "fmla z24.s, p3/M, z1.s, z17.s\n"
+ "fmla z21.s, p3/M, z8.s, z18.s\n"
+ "ld1w { z17.s }, p2/Z, [x25, x15, LSL #2]\n"
+ "fmla z22.s, p3/M, z7.s, z18.s\n"
+ "ld1w { z18.s }, p2/Z, [x23, x15, LSL #2]\n"
+ "fmla z29.s, p3/M, z2.s, z17.s\n"
+ "fmla z26.s, p3/M, z7.s, z16.s\n"
+ "fmla z25.s, p3/M, z6.s, z16.s\n"
+ "fmla z23.s, p3/M, z4.s, z16.s\n"
+ "fmla z21.s, p3/M, z3.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x21, x15, LSL #2]\n"
+ "fmla z22.s, p3/M, z4.s, z18.s\n"
+ "fmla z28.s, p3/M, z1.s, z17.s\n"
+ "fmax z28.s, p3/M, z28.s, z31.s\n"
+ "fmin z28.s, p3/M, z28.s, z30.s\n"
+ "fmla z27.s, p3/M, z0.s, z17.s\n"
+ "ld1w { z17.s }, p2/Z, [x20, x15, LSL #2]\n"
+ "fmla z29.s, p3/M, z6.s, z16.s\n"
+ "fmax z29.s, p3/M, z29.s, z31.s\n"
+ "fmla z24.s, p3/M, z7.s, z18.s\n"
+ "fmla z21.s, p3/M, z5.s, z18.s\n"
+ "fmin z29.s, p3/M, z29.s, z30.s\n"
+ "st1w { z29.s }, p0, [x10, x14, LSL #2]\n"
+ "fmla z23.s, p3/M, z0.s, z16.s\n"
+ "fmla z22.s, p3/M, z2.s, z17.s\n"
+ "ldr x20, [x13, #0x20]\n"
+ "st1w { z28.s }, p0, [x9, x14, LSL #2]\n"
+ "fmla z25.s, p3/M, z8.s, z18.s\n"
+ "fmla z26.s, p3/M, z3.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x22, x15, LSL #2]\n"
+ "fmax z26.s, p3/M, z26.s, z31.s\n"
+ "fmla z27.s, p3/M, z8.s, z17.s\n"
+ "fmla z24.s, p3/M, z5.s, z17.s\n"
+ "fmax z27.s, p3/M, z27.s, z31.s\n"
+ "fmax z25.s, p3/M, z25.s, z31.s\n"
+ "fmla z23.s, p3/M, z8.s, z16.s\n"
+ "fmla z21.s, p3/M, z7.s, z16.s\n"
+ "fmin z27.s, p3/M, z27.s, z30.s\n"
+ "fmin z26.s, p3/M, z26.s, z30.s\n"
+ "fmla z22.s, p3/M, z6.s, z16.s\n"
+ "fmin z25.s, p3/M, z25.s, z30.s\n"
+ "fmax z24.s, p3/M, z24.s, z31.s\n"
+ "st1w { z27.s }, p0, [x28, x14, LSL #2]\n"
+ "fmax z23.s, p3/M, z23.s, z31.s\n"
+ "fmax z21.s, p3/M, z21.s, z31.s\n"
+ "st1w { z26.s }, p0, [x27, x14, LSL #2]\n"
+ "ldr x23, [x13, #0x28]\n"
+ "fmax z22.s, p3/M, z22.s, z31.s\n"
+ "st1w { z25.s }, p0, [x20, x14, LSL #2]\n"
+ "ldr x22, [x13, #0x30]\n"
+ "ldr x21, [x13, #0x38]\n"
+ "ldr x20, [x13, #0x40]\n"
+ "fmin z24.s, p3/M, z24.s, z30.s\n"
+ "fmin z23.s, p3/M, z23.s, z30.s\n"
+ "st1w { z24.s }, p0, [x23, x14, LSL #2]\n"
+ "fmin z21.s, p3/M, z21.s, z30.s\n"
+ "fmin z22.s, p3/M, z22.s, z30.s\n"
+ "st1w { z23.s }, p0, [x22, x14, LSL #2]\n"
+ "st1w { z21.s }, p0, [x21, x14, LSL #2]\n"
+ "st1w { z22.s }, p0, [x20, x14, LSL #2]\n"
+ :
+ : [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp
new file mode 100644
index 0000000000..38b377509e
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(const float *const *const input_ptrs, float *const *const outptrs, const void *params, unsigned int n_channels, const float activation_min, const float activation_max);
+void sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(const unsigned int n_tile_rows, const unsigned int n_tile_cols, const float *inptr, int64_t ld_input_row, int64_t ld_input_col, float *outptr, int64_t ld_output_row, int64_t ld_output_col, const void *params, unsigned int n_channels, const float activation_min, const float activation_max);
+
+class sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst : public DepthwiseDepthfirstStrategy<float, float, float, float>
+{
+ private:
+ using Parent = DepthwiseDepthfirstStrategy<float, float, float, float>;
+ Parent::IndirectKernelType m_indirect_kernel = sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl;
+ Parent::DirectKernelType m_direct_kernel = sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl;
+
+ public:
+ using return_type = float;
+ constexpr static auto vl_type = arm_gemm::VLType::SVE;
+
+ constexpr static unsigned int kernel_rows = 3;
+ constexpr static unsigned int kernel_cols = 3;
+
+ constexpr static unsigned int stride_rows = 1;
+ constexpr static unsigned int stride_cols = 1;
+
+ constexpr static unsigned int output_rows = 4;
+ constexpr static unsigned int output_cols = 4;
+
+ sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst(const CPUInfo *)
+ : Parent(output_rows, output_cols, kernel_rows, kernel_cols, stride_rows, stride_cols) {}
+
+ arm_gemm::VLType get_vl_type(void) const override { return vl_type; }
+
+ Parent::IndirectKernelType get_indirect_kernel() const override { return m_indirect_kernel; }
+ Parent::DirectKernelType get_direct_kernel() const override { return m_direct_kernel; }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp
new file mode 100644
index 0000000000..35445595f8
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp
@@ -0,0 +1,656 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(
+ const unsigned int n_tile_rows,
+ const unsigned int n_tile_cols,
+ const float *inptr,
+ int64_t ld_input_row,
+ int64_t ld_input_col,
+ float *outptr,
+ int64_t ld_output_row,
+ int64_t ld_output_col,
+ const void *params,
+ unsigned int n_channels,
+ const float activation_min,
+ const float activation_max
+)
+{
+ struct Args
+ {
+ const uint64_t n_tile_rows, n_tile_cols;
+ const float *inptr;
+ const uint64_t ld_input_row;
+ const uint64_t ld_input_col;
+ float *outptr;
+ const uint64_t ld_output_row;
+ const uint64_t ld_output_col;
+ const void *params;
+ const float min, max;
+
+ uint64_t tile_i = 0, tile_j = 0;
+
+ Args(
+ const unsigned int n_tile_rows,
+ const unsigned int n_tile_cols,
+ const float *inptr,
+ int64_t ld_input_row,
+ int64_t ld_input_col,
+ float *outptr,
+ int64_t ld_output_row,
+ int64_t ld_output_col,
+ const void *params,
+ const float activation_min,
+ const float activation_max
+ ) : n_tile_rows(n_tile_rows), n_tile_cols(n_tile_cols), inptr(inptr),
+ ld_input_row(ld_input_row), ld_input_col(ld_input_col), outptr(outptr),
+ ld_output_row(ld_output_row), ld_output_col(ld_output_col),
+ params(params), min(activation_min), max(activation_max)
+ {
+ }
+ };
+
+ Args params_struct(
+ n_tile_rows, n_tile_cols,
+ inptr, ld_input_row, ld_input_col,
+ outptr, ld_output_row, ld_output_col,
+ params, activation_min, activation_max
+ );
+
+ __asm__ __volatile__(
+ "ptrue p3.b\n"
+ "mov x16, #0x0\n"
+ "mov x4, #0x0\n"
+ "1:" // Tile loop
+ "str x16, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "mov x25, #0x4\n"
+ "mov x24, #0x4\n"
+ "str x4, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "ldr x23, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
+ "ldr x22, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
+ "mul x21, x16, x23\n" // offset = tile_i * ld_input_row
+ "ldr x5, [%x[params_struct], %[offsetof_args_ld_input_col]]\n"
+ "ldr x6, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
+ "mul x20, x16, x22\n" // offset = tile_i * ld_output_row
+ "add x7, x5, x5\n"
+ "madd x21, x4, x5, x21\n" // offset += tile_j * ld_input_col
+ "ldr x8, [%x[params_struct], %[offsetof_args_inptr]]\n"
+ "ldr x17, [%x[params_struct], %[offsetof_args_params]]\n"
+ "cntw x16\n"
+ "madd x20, x4, x6, x20\n" // offset += tile_j * ld_output_col
+ "ldr x15, [%x[params_struct], %[offsetof_args_outptr]]\n"
+ "add x14, x7, x5\n"
+ "whilelt p2.s, XZR, %x[n_channels]\n"
+ "mul x21, x21, x25\n" // offset *= kernel_stride * output_size
+ "add x8, x8, x21, LSL #2\n" // inptr[0] += offset * sizeof(float)
+ "add x13, x8, x23, LSL #2\n"
+ "ld1w { z19.s }, p3/Z, [x17]\n"
+ "mul x20, x20, x24\n" // offset *= output_tile_size
+ "add x12, x13, x23, LSL #2\n"
+ "add x15, x15, x20, LSL #2\n" // outptrs[0] += offset * sizeof(float)
+ "ld1w { z0.s }, p3/Z, [x17, #1, MUL VL]\n"
+ "ld1w { z1.s }, p3/Z, [x17, #2, MUL VL]\n"
+ "ld1w { z2.s }, p3/Z, [x17, #3, MUL VL]\n"
+ "add x11, x12, x23, LSL #2\n"
+ "add x10, x14, x5\n"
+ "ld1w { z3.s }, p3/Z, [x17, #4, MUL VL]\n"
+ "ld1w { z4.s }, p3/Z, [x17, #5, MUL VL]\n"
+ "add x9, x15, x22, LSL #2\n"
+ "add x28, x11, x23, LSL #2\n"
+ "ld1w { z5.s }, p3/Z, [x17, #6, MUL VL]\n"
+ "ld1w { z6.s }, p3/Z, [x17, #7, MUL VL]\n"
+ "addvl x17, x17, #16\n"
+ "add x27, x10, x5\n"
+ "add x26, x9, x22, LSL #2\n"
+ "add x25, x6, x6\n"
+ "ld1rw { z15.s }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+ "ld1rw { z16.s }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+ "cmp x16, %x[n_channels]\n"
+ "add x24, x28, x23, LSL #2\n"
+ "ld1w { z7.s }, p3/Z, [x17, #-8, MUL VL]\n"
+ "ld1w { z8.s }, p3/Z, [x17, #-7, MUL VL]\n"
+ "add x23, x26, x22, LSL #2\n"
+ "add x22, x25, x6\n"
+ "ld1w { z9.s }, p2/Z, [x12, x7, LSL #2]\n"
+ "ld1w { z10.s }, p2/Z, [x8]\n"
+ "mov x21, #0x0\n"
+ "sub x20, XZR, x16\n"
+ "ld1w { z11.s }, p2/Z, [x8, x27, LSL #2]\n"
+ "ld1w { z12.s }, p2/Z, [x12, x14, LSL #2]\n"
+ "addvl x17, x17, #-6\n"
+ "bge 3f\n"
+ "2:" // Tile loop: Channel loop
+ "movprfx z14, z19\n fmla z14.s, p3/M, z4.s, z9.s\n"
+ "movprfx z31, z19\n fmla z31.s, p3/M, z8.s, z9.s\n"
+ "whilelt p1.s, x16, %x[n_channels]\n"
+ "incw x21\n"
+ "movprfx z21, z19\n fmla z21.s, p3/M, z3.s, z9.s\n"
+ "movprfx z22, z19\n fmla z22.s, p3/M, z1.s, z9.s\n"
+ "incw x16\n"
+ "mov p0.b, p2.b\n"
+ "movprfx z20, z19\n fmla z20.s, p3/M, z0.s, z9.s\n"
+ "fmla z14.s, p3/M, z5.s, z12.s\n"
+ "incw x20\n"
+ "movprfx z13, z19\n fmla z13.s, p3/M, z7.s, z9.s\n"
+ "movprfx z17, z19\n fmla z17.s, p3/M, z6.s, z9.s\n"
+ "movprfx z27, z19\n fmla z27.s, p3/M, z5.s, z9.s\n"
+ "movprfx z18, z19\n fmla z18.s, p3/M, z2.s, z9.s\n"
+ "ld1w { z9.s }, p2/Z, [x11, x7, LSL #2]\n"
+ "fmla z31.s, p3/M, z0.s, z10.s\n"
+ "movprfx z30, z19\n fmla z30.s, p3/M, z2.s, z11.s\n"
+ "ld1w { z29.s }, p2/Z, [x24]\n"
+ "ld1w { z11.s }, p2/Z, [x24, x27, LSL #2]\n"
+ "fmla z21.s, p3/M, z4.s, z12.s\n"
+ "fmla z22.s, p3/M, z2.s, z12.s\n"
+ "fmla z20.s, p3/M, z1.s, z12.s\n"
+ "movprfx z23, z19\n fmla z23.s, p3/M, z6.s, z29.s\n"
+ "ld1w { z10.s }, p2/Z, [x11, x14, LSL #2]\n"
+ "fmla z14.s, p3/M, z7.s, z9.s\n"
+ "fmla z13.s, p3/M, z8.s, z12.s\n"
+ "fmla z17.s, p3/M, z7.s, z12.s\n"
+ "fmla z30.s, p3/M, z6.s, z12.s\n"
+ "movprfx z26, z19\n fmla z26.s, p3/M, z3.s, z12.s\n"
+ "movprfx z28, z19\n fmla z28.s, p3/M, z0.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x8, x5, LSL #2]\n"
+ "movprfx z24, z19\n fmla z24.s, p3/M, z8.s, z11.s\n"
+ "fmla z21.s, p3/M, z6.s, z9.s\n"
+ "ld1w { z11.s }, p2/Z, [x8, x10, LSL #2]\n"
+ "fmla z22.s, p3/M, z4.s, z9.s\n"
+ "fmla z20.s, p3/M, z3.s, z9.s\n"
+ "movprfx z25, z19\n fmla z25.s, p3/M, z1.s, z9.s\n"
+ "movprfx z29, z19\n fmla z29.s, p3/M, z0.s, z9.s\n"
+ "ld1w { z19.s }, p3/Z, [x17]\n"
+ "fmla z27.s, p3/M, z8.s, z9.s\n"
+ "fmla z18.s, p3/M, z5.s, z9.s\n"
+ "fmla z23.s, p3/M, z2.s, z9.s\n"
+ "fmla z14.s, p3/M, z8.s, z10.s\n"
+ "ld1w { z9.s }, p2/Z, [x13]\n"
+ "fmla z31.s, p3/M, z1.s, z12.s\n"
+ "fmla z13.s, p3/M, z0.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x13, x27, LSL #2]\n"
+ "fmla z17.s, p3/M, z2.s, z11.s\n"
+ "fmla z30.s, p3/M, z1.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x28]\n"
+ "fmla z21.s, p3/M, z7.s, z10.s\n"
+ "fmla z26.s, p3/M, z6.s, z10.s\n"
+ "fmla z22.s, p3/M, z5.s, z10.s\n"
+ "fmla z20.s, p3/M, z4.s, z10.s\n"
+ "fmla z28.s, p3/M, z3.s, z10.s\n"
+ "fmla z25.s, p3/M, z2.s, z10.s\n"
+ "fmla z29.s, p3/M, z1.s, z10.s\n"
+ "fmla z24.s, p3/M, z0.s, z10.s\n"
+ "ld1w { z10.s }, p2/Z, [x13, x7, LSL #2]\n"
+ "fmla z27.s, p3/M, z0.s, z9.s\n"
+ "fmla z18.s, p3/M, z6.s, z11.s\n"
+ "fmla z23.s, p3/M, z3.s, z11.s\n"
+ "fmla z14.s, p3/M, z1.s, z10.s\n"
+ "ld1w { z11.s }, p2/Z, [x28, x27, LSL #2]\n"
+ "fmla z31.s, p3/M, z3.s, z9.s\n"
+ "fmla z30.s, p3/M, z5.s, z12.s\n"
+ "fmla z26.s, p3/M, z2.s, z12.s\n"
+ "fmla z13.s, p3/M, z4.s, z10.s\n"
+ "ld1w { z9.s }, p2/Z, [x13, x14, LSL #2]\n"
+ "fmla z17.s, p3/M, z3.s, z10.s\n"
+ "fmla z21.s, p3/M, z0.s, z10.s\n"
+ "fmla z28.s, p3/M, z8.s, z11.s\n"
+ "fmla z24.s, p3/M, z5.s, z11.s\n"
+ "ld1w { z12.s }, p2/Z, [x24, x5, LSL #2]\n"
+ "fmla z27.s, p3/M, z2.s, z10.s\n"
+ "fmla z14.s, p3/M, z2.s, z9.s\n"
+ "fmla z31.s, p3/M, z5.s, z10.s\n"
+ "fmla z13.s, p3/M, z5.s, z9.s\n"
+ "ld1w { z11.s }, p2/Z, [x12, x5, LSL #2]\n"
+ "fmla z17.s, p3/M, z4.s, z9.s\n"
+ "fmla z30.s, p3/M, z3.s, z9.s\n"
+ "fmla z21.s, p3/M, z1.s, z9.s\n"
+ "fmla z26.s, p3/M, z0.s, z9.s\n"
+ "ld1w { z10.s }, p2/Z, [x12, x10, LSL #2]\n"
+ "fmla z23.s, p3/M, z7.s, z12.s\n"
+ "fmla z25.s, p3/M, z6.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x24, x10, LSL #2]\n"
+ "fmla z27.s, p3/M, z4.s, z11.s\n"
+ "fmla z14.s, p3/M, z3.s, z11.s\n"
+ "fmla z18.s, p3/M, z1.s, z11.s\n"
+ "fmla z22.s, p3/M, z0.s, z11.s\n"
+ "fmla z31.s, p3/M, z7.s, z11.s\n"
+ "fmla z13.s, p3/M, z6.s, z11.s\n"
+ "ld1w { z9.s }, p2/Z, [x8, x7, LSL #2]\n"
+ "fmla z29.s, p3/M, z8.s, z12.s\n"
+ "fmla z24.s, p3/M, z7.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x11, x5, LSL #2]\n"
+ "fmla z17.s, p3/M, z8.s, z10.s\n"
+ "fmla z30.s, p3/M, z7.s, z10.s\n"
+ "fmla z21.s, p3/M, z5.s, z10.s\n"
+ "fmla z26.s, p3/M, z4.s, z10.s\n"
+ "fmla z20.s, p3/M, z2.s, z10.s\n"
+ "fmla z28.s, p3/M, z1.s, z10.s\n"
+ "ld1w { z11.s }, p2/Z, [x8, x14, LSL #2]\n"
+ "addvl x8, x8, #1\n"
+ "fmla z27.s, p3/M, z7.s, z12.s\n"
+ "fmla z14.s, p3/M, z6.s, z12.s\n"
+ "fmla z18.s, p3/M, z4.s, z12.s\n"
+ "fmla z22.s, p3/M, z3.s, z12.s\n"
+ "fmla z23.s, p3/M, z1.s, z12.s\n"
+ "fmla z25.s, p3/M, z0.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x11, x10, LSL #2]\n"
+ "fmla z31.s, p3/M, z2.s, z9.s\n"
+ "fmla z13.s, p3/M, z1.s, z9.s\n"
+ "fmla z17.s, p3/M, z0.s, z9.s\n"
+ "ld1w { z9.s }, p2/Z, [x12]\n"
+ "fmla z29.s, p3/M, z2.s, z12.s\n"
+ "fmla z30.s, p3/M, z0.s, z11.s\n"
+ "fmla z27.s, p3/M, z3.s, z9.s\n"
+ "fmla z18.s, p3/M, z0.s, z9.s\n"
+ "fmla z21.s, p3/M, z8.s, z12.s\n"
+ "fmla z26.s, p3/M, z7.s, z12.s\n"
+ "fmla z20.s, p3/M, z5.s, z12.s\n"
+ "fmla z28.s, p3/M, z4.s, z12.s\n"
+ "fmla z24.s, p3/M, z1.s, z12.s\n"
+ "ld1w { z10.s }, p2/Z, [x28, x7, LSL #2]\n"
+ "fmla z13.s, p3/M, z2.s, z11.s\n"
+ "fmla z17.s, p3/M, z1.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x12, x27, LSL #2]\n"
+ "addvl x12, x12, #1\n"
+ "fmla z31.s, p3/M, z6.s, z9.s\n"
+ "ld1w { z12.s }, p2/Z, [x11]\n"
+ "fmla z25.s, p3/M, z4.s, z10.s\n"
+ "ld1w { z9.s }, p1/Z, [x12, x7, LSL #2]\n"
+ "fmla z29.s, p3/M, z3.s, z10.s\n"
+ "fmla z30.s, p3/M, z8.s, z11.s\n"
+ "fmla z26.s, p3/M, z5.s, z11.s\n"
+ "fmla z28.s, p3/M, z2.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x11, x27, LSL #2]\n"
+ "addvl x11, x11, #1\n"
+ "fmla z27.s, p3/M, z6.s, z12.s\n"
+ "fmla z18.s, p3/M, z3.s, z12.s\n"
+ "fmla z23.s, p3/M, z0.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x24, x7, LSL #2]\n"
+ "fmla z24.s, p3/M, z2.s, z11.s\n"
+ "fmla z25.s, p3/M, z7.s, z12.s\n"
+ "fmla z29.s, p3/M, z6.s, z12.s\n"
+ "fmla z18.s, p3/M, z8.s, z10.s\n"
+ "fmla z22.s, p3/M, z7.s, z10.s\n"
+ "fmla z20.s, p3/M, z6.s, z10.s\n"
+ "fmla z23.s, p3/M, z5.s, z10.s\n"
+ "ld1w { z10.s }, p2/Z, [x28, x14, LSL #2]\n"
+ "fmla z28.s, p3/M, z5.s, z11.s\n"
+ "fmla z25.s, p3/M, z5.s, z10.s\n"
+ "fmla z29.s, p3/M, z4.s, z10.s\n"
+ "fmla z24.s, p3/M, z3.s, z10.s\n"
+ "fmla z26.s, p3/M, z8.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x24, x14, LSL #2]\n"
+ "fmla z23.s, p3/M, z8.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x13, x5, LSL #2]\n"
+ "fmla z22.s, p3/M, z8.s, z10.s\n"
+ "fmla z20.s, p3/M, z7.s, z10.s\n"
+ "addvl x24, x24, #1\n"
+ "fmla z28.s, p3/M, z6.s, z10.s\n"
+ "fmla z25.s, p3/M, z8.s, z11.s\n"
+ "ld1w { z10.s }, p2/Z, [x13, x10, LSL #2]\n"
+ "addvl x13, x13, #1\n"
+ "fmla z29.s, p3/M, z7.s, z11.s\n"
+ "fmla z24.s, p3/M, z6.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x28, x5, LSL #2]\n"
+ "fmla z31.s, p3/M, z4.s, z12.s\n"
+ "fmla z13.s, p3/M, z3.s, z12.s\n"
+ "fmax z31.s, p3/M, z31.s, z15.s\n"
+ "fmax z13.s, p3/M, z13.s, z15.s\n"
+ "fmla z27.s, p3/M, z1.s, z12.s\n"
+ "fmla z14.s, p3/M, z0.s, z12.s\n"
+ "ld1w { z0.s }, p2/Z, [x28, x10, LSL #2]\n"
+ "fmax z27.s, p3/M, z27.s, z15.s\n"
+ "fmla z17.s, p3/M, z5.s, z10.s\n"
+ "fmla z30.s, p3/M, z4.s, z10.s\n"
+ "fmax z17.s, p3/M, z17.s, z15.s\n"
+ "fmax z30.s, p3/M, z30.s, z15.s\n"
+ "fmla z21.s, p3/M, z2.s, z10.s\n"
+ "fmla z26.s, p3/M, z1.s, z10.s\n"
+ "fmax z14.s, p3/M, z14.s, z15.s\n"
+ "fmax z21.s, p3/M, z21.s, z15.s\n"
+ "fmla z18.s, p3/M, z7.s, z11.s\n"
+ "fmla z22.s, p3/M, z6.s, z11.s\n"
+ "fmax z26.s, p3/M, z26.s, z15.s\n"
+ "fmax z18.s, p3/M, z18.s, z15.s\n"
+ "fmla z23.s, p3/M, z4.s, z11.s\n"
+ "fmla z25.s, p3/M, z3.s, z11.s\n"
+ "fmax z22.s, p3/M, z22.s, z15.s\n"
+ "fmax z23.s, p3/M, z23.s, z15.s\n"
+ "fmla z20.s, p3/M, z8.s, z0.s\n"
+ "fmla z28.s, p3/M, z7.s, z0.s\n"
+ "fmax z20.s, p3/M, z20.s, z15.s\n"
+ "fmax z28.s, p3/M, z28.s, z15.s\n"
+ "fmla z29.s, p3/M, z5.s, z0.s\n"
+ "fmla z24.s, p3/M, z4.s, z0.s\n"
+ "fmax z25.s, p3/M, z25.s, z15.s\n"
+ "fmax z29.s, p3/M, z29.s, z15.s\n"
+ "fmax z24.s, p3/M, z24.s, z15.s\n"
+ "ld1w { z0.s }, p3/Z, [x17, #1, MUL VL]\n"
+ "ld1w { z1.s }, p3/Z, [x17, #2, MUL VL]\n"
+ "whilelt p2.s, x21, %x[n_channels]\n"
+ "ld1w { z2.s }, p3/Z, [x17, #3, MUL VL]\n"
+ "ld1w { z3.s }, p3/Z, [x17, #4, MUL VL]\n"
+ "cmp x16, %x[n_channels]\n"
+ "fmin z31.s, p3/M, z31.s, z16.s\n"
+ "ld1w { z4.s }, p3/Z, [x17, #5, MUL VL]\n"
+ "ld1w { z5.s }, p3/Z, [x17, #6, MUL VL]\n"
+ "fmin z13.s, p3/M, z13.s, z16.s\n"
+ "fmin z17.s, p3/M, z17.s, z16.s\n"
+ "ld1w { z6.s }, p3/Z, [x17, #7, MUL VL]\n"
+ "addvl x17, x17, #16\n"
+ "fmin z30.s, p3/M, z30.s, z16.s\n"
+ "ld1w { z10.s }, p1/Z, [x8]\n"
+ "fmin z27.s, p3/M, z27.s, z16.s\n"
+ "fmin z14.s, p3/M, z14.s, z16.s\n"
+ "ld1w { z11.s }, p1/Z, [x8, x27, LSL #2]\n"
+ "ld1w { z12.s }, p1/Z, [x12, x14, LSL #2]\n"
+ "fmin z21.s, p3/M, z21.s, z16.s\n"
+ "fmin z26.s, p3/M, z26.s, z16.s\n"
+ "st1w { z31.s }, p0, [x15]\n"
+ "ld1w { z7.s }, p3/Z, [x17, #-8, MUL VL]\n"
+ "fmin z18.s, p3/M, z18.s, z16.s\n"
+ "fmin z22.s, p3/M, z22.s, z16.s\n"
+ "st1w { z13.s }, p0, [x15, x6, LSL #2]\n"
+ "ld1w { z8.s }, p3/Z, [x17, #-7, MUL VL]\n"
+ "fmin z20.s, p3/M, z20.s, z16.s\n"
+ "fmin z28.s, p3/M, z28.s, z16.s\n"
+ "st1w { z17.s }, p0, [x15, x25, LSL #2]\n"
+ "fmin z23.s, p3/M, z23.s, z16.s\n"
+ "fmin z25.s, p3/M, z25.s, z16.s\n"
+ "st1w { z30.s }, p0, [x15, x22, LSL #2]\n"
+ "fmin z29.s, p3/M, z29.s, z16.s\n"
+ "fmin z24.s, p3/M, z24.s, z16.s\n"
+ "st1w { z27.s }, p0, [x9]\n"
+ "addvl x28, x28, #1\n"
+ "st1w { z14.s }, p0, [x9, x6, LSL #2]\n"
+ "addvl x15, x15, #1\n"
+ "st1w { z21.s }, p0, [x9, x25, LSL #2]\n"
+ "addvl x17, x17, #-6\n"
+ "st1w { z26.s }, p0, [x9, x22, LSL #2]\n"
+ "addvl x9, x9, #1\n"
+ "st1w { z18.s }, p0, [x26]\n"
+ "st1w { z22.s }, p0, [x26, x6, LSL #2]\n"
+ "st1w { z20.s }, p0, [x26, x25, LSL #2]\n"
+ "st1w { z28.s }, p0, [x26, x22, LSL #2]\n"
+ "addvl x26, x26, #1\n"
+ "st1w { z23.s }, p0, [x23]\n"
+ "st1w { z25.s }, p0, [x23, x6, LSL #2]\n"
+ "st1w { z29.s }, p0, [x23, x25, LSL #2]\n"
+ "st1w { z24.s }, p0, [x23, x22, LSL #2]\n"
+ "addvl x23, x23, #1\n"
+ "blt 2b\n"
+ "3:" // Tile loop: Channel tail
+ "movprfx z14, z19\n fmla z14.s, p3/M, z4.s, z9.s\n"
+ "movprfx z31, z19\n fmla z31.s, p3/M, z8.s, z9.s\n"
+ "ldr x4, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "ldr x16, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "movprfx z30, z19\n fmla z30.s, p3/M, z3.s, z9.s\n"
+ "movprfx z13, z19\n fmla z13.s, p3/M, z1.s, z9.s\n"
+ "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+ "add x4, x4, #0x1\n"
+ "movprfx z20, z19\n fmla z20.s, p3/M, z0.s, z9.s\n"
+ "fmla z14.s, p3/M, z5.s, z12.s\n"
+ "cmp x4, x20\n"
+ "add x21, x16, #0x1\n"
+ "movprfx z18, z19\n fmla z18.s, p3/M, z7.s, z9.s\n"
+ "movprfx z28, z19\n fmla z28.s, p3/M, z6.s, z9.s\n"
+ "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
+ "csel x16, x16, x21, LT\n"
+ "movprfx z17, z19\n fmla z17.s, p3/M, z5.s, z9.s\n"
+ "movprfx z26, z19\n fmla z26.s, p3/M, z2.s, z9.s\n"
+ "ld1w { z9.s }, p2/Z, [x11, x7, LSL #2]\n"
+ "mov p0.b, p2.b\n"
+ "fmla z31.s, p3/M, z0.s, z10.s\n"
+ "movprfx z27, z19\n fmla z27.s, p3/M, z2.s, z11.s\n"
+ "ld1w { z29.s }, p2/Z, [x24]\n"
+ "ld1w { z21.s }, p2/Z, [x24, x27, LSL #2]\n"
+ "fmla z30.s, p3/M, z4.s, z12.s\n"
+ "fmla z13.s, p3/M, z2.s, z12.s\n"
+ "csel x4, x4, XZR, LT\n"
+ "cmp x16, x20\n"
+ "fmla z20.s, p3/M, z1.s, z12.s\n"
+ "movprfx z10, z19\n fmla z10.s, p3/M, z6.s, z29.s\n"
+ "ld1w { z29.s }, p2/Z, [x11, x14, LSL #2]\n"
+ "fmla z14.s, p3/M, z7.s, z9.s\n"
+ "fmla z18.s, p3/M, z8.s, z12.s\n"
+ "fmla z28.s, p3/M, z7.s, z12.s\n"
+ "fmla z27.s, p3/M, z6.s, z12.s\n"
+ "movprfx z11, z19\n fmla z11.s, p3/M, z3.s, z12.s\n"
+ "movprfx z25, z19\n fmla z25.s, p3/M, z0.s, z12.s\n"
+ "ld1w { z22.s }, p2/Z, [x8, x5, LSL #2]\n"
+ "movprfx z24, z19\n fmla z24.s, p3/M, z8.s, z21.s\n"
+ "fmla z30.s, p3/M, z6.s, z9.s\n"
+ "ld1w { z21.s }, p2/Z, [x8, x10, LSL #2]\n"
+ "fmla z13.s, p3/M, z4.s, z9.s\n"
+ "fmla z20.s, p3/M, z3.s, z9.s\n"
+ "movprfx z12, z19\n fmla z12.s, p3/M, z1.s, z9.s\n"
+ "movprfx z23, z19\n fmla z23.s, p3/M, z0.s, z9.s\n"
+ "fmla z17.s, p3/M, z8.s, z9.s\n"
+ "fmla z26.s, p3/M, z5.s, z9.s\n"
+ "fmla z10.s, p3/M, z2.s, z9.s\n"
+ "fmla z14.s, p3/M, z8.s, z29.s\n"
+ "ld1w { z9.s }, p2/Z, [x13]\n"
+ "fmla z31.s, p3/M, z1.s, z22.s\n"
+ "fmla z18.s, p3/M, z0.s, z22.s\n"
+ "ld1w { z22.s }, p2/Z, [x13, x27, LSL #2]\n"
+ "fmla z28.s, p3/M, z2.s, z21.s\n"
+ "fmla z27.s, p3/M, z1.s, z21.s\n"
+ "ld1w { z19.s }, p2/Z, [x28]\n"
+ "fmla z30.s, p3/M, z7.s, z29.s\n"
+ "fmla z11.s, p3/M, z6.s, z29.s\n"
+ "fmla z13.s, p3/M, z5.s, z29.s\n"
+ "fmla z20.s, p3/M, z4.s, z29.s\n"
+ "fmla z25.s, p3/M, z3.s, z29.s\n"
+ "fmla z12.s, p3/M, z2.s, z29.s\n"
+ "fmla z23.s, p3/M, z1.s, z29.s\n"
+ "fmla z24.s, p3/M, z0.s, z29.s\n"
+ "ld1w { z21.s }, p2/Z, [x13, x7, LSL #2]\n"
+ "fmla z17.s, p3/M, z0.s, z9.s\n"
+ "fmla z26.s, p3/M, z6.s, z19.s\n"
+ "fmla z10.s, p3/M, z3.s, z19.s\n"
+ "fmla z14.s, p3/M, z1.s, z21.s\n"
+ "ld1w { z19.s }, p2/Z, [x28, x27, LSL #2]\n"
+ "fmla z31.s, p3/M, z3.s, z9.s\n"
+ "fmla z27.s, p3/M, z5.s, z22.s\n"
+ "fmla z11.s, p3/M, z2.s, z22.s\n"
+ "fmla z18.s, p3/M, z4.s, z21.s\n"
+ "ld1w { z29.s }, p2/Z, [x13, x14, LSL #2]\n"
+ "fmla z28.s, p3/M, z3.s, z21.s\n"
+ "fmla z30.s, p3/M, z0.s, z21.s\n"
+ "fmla z25.s, p3/M, z8.s, z19.s\n"
+ "fmla z24.s, p3/M, z5.s, z19.s\n"
+ "ld1w { z19.s }, p2/Z, [x24, x5, LSL #2]\n"
+ "fmla z17.s, p3/M, z2.s, z21.s\n"
+ "fmla z14.s, p3/M, z2.s, z29.s\n"
+ "fmla z31.s, p3/M, z5.s, z21.s\n"
+ "fmla z18.s, p3/M, z5.s, z29.s\n"
+ "ld1w { z22.s }, p2/Z, [x12, x5, LSL #2]\n"
+ "fmla z28.s, p3/M, z4.s, z29.s\n"
+ "fmla z27.s, p3/M, z3.s, z29.s\n"
+ "fmla z30.s, p3/M, z1.s, z29.s\n"
+ "fmla z11.s, p3/M, z0.s, z29.s\n"
+ "ld1w { z21.s }, p2/Z, [x12, x10, LSL #2]\n"
+ "fmla z10.s, p3/M, z7.s, z19.s\n"
+ "fmla z12.s, p3/M, z6.s, z19.s\n"
+ "ld1w { z19.s }, p2/Z, [x24, x10, LSL #2]\n"
+ "fmla z17.s, p3/M, z4.s, z22.s\n"
+ "fmla z14.s, p3/M, z3.s, z22.s\n"
+ "fmla z26.s, p3/M, z1.s, z22.s\n"
+ "fmla z13.s, p3/M, z0.s, z22.s\n"
+ "fmla z31.s, p3/M, z7.s, z22.s\n"
+ "fmla z18.s, p3/M, z6.s, z22.s\n"
+ "ld1w { z29.s }, p2/Z, [x8, x7, LSL #2]\n"
+ "fmla z23.s, p3/M, z8.s, z19.s\n"
+ "fmla z24.s, p3/M, z7.s, z19.s\n"
+ "ld1w { z19.s }, p2/Z, [x11, x5, LSL #2]\n"
+ "fmla z28.s, p3/M, z8.s, z21.s\n"
+ "fmla z27.s, p3/M, z7.s, z21.s\n"
+ "fmla z30.s, p3/M, z5.s, z21.s\n"
+ "fmla z11.s, p3/M, z4.s, z21.s\n"
+ "fmla z20.s, p3/M, z2.s, z21.s\n"
+ "fmla z25.s, p3/M, z1.s, z21.s\n"
+ "ld1w { z22.s }, p2/Z, [x8, x14, LSL #2]\n"
+ "fmla z17.s, p3/M, z7.s, z19.s\n"
+ "fmla z14.s, p3/M, z6.s, z19.s\n"
+ "fmla z26.s, p3/M, z4.s, z19.s\n"
+ "fmla z13.s, p3/M, z3.s, z19.s\n"
+ "fmla z10.s, p3/M, z1.s, z19.s\n"
+ "fmla z12.s, p3/M, z0.s, z19.s\n"
+ "ld1w { z21.s }, p2/Z, [x11, x10, LSL #2]\n"
+ "fmla z31.s, p3/M, z2.s, z29.s\n"
+ "fmla z18.s, p3/M, z1.s, z29.s\n"
+ "fmla z28.s, p3/M, z0.s, z29.s\n"
+ "ld1w { z29.s }, p2/Z, [x12]\n"
+ "fmla z23.s, p3/M, z2.s, z21.s\n"
+ "fmla z27.s, p3/M, z0.s, z22.s\n"
+ "fmla z17.s, p3/M, z3.s, z29.s\n"
+ "fmla z26.s, p3/M, z0.s, z29.s\n"
+ "fmla z30.s, p3/M, z8.s, z21.s\n"
+ "fmla z11.s, p3/M, z7.s, z21.s\n"
+ "fmla z20.s, p3/M, z5.s, z21.s\n"
+ "fmla z25.s, p3/M, z4.s, z21.s\n"
+ "fmla z24.s, p3/M, z1.s, z21.s\n"
+ "ld1w { z19.s }, p2/Z, [x28, x7, LSL #2]\n"
+ "fmla z18.s, p3/M, z2.s, z22.s\n"
+ "fmla z28.s, p3/M, z1.s, z22.s\n"
+ "ld1w { z21.s }, p2/Z, [x12, x27, LSL #2]\n"
+ "fmla z31.s, p3/M, z6.s, z29.s\n"
+ "ld1w { z29.s }, p2/Z, [x11]\n"
+ "fmla z12.s, p3/M, z4.s, z19.s\n"
+ "fmla z23.s, p3/M, z3.s, z19.s\n"
+ "fmla z27.s, p3/M, z8.s, z21.s\n"
+ "fmla z11.s, p3/M, z5.s, z21.s\n"
+ "fmla z25.s, p3/M, z2.s, z21.s\n"
+ "ld1w { z9.s }, p2/Z, [x11, x27, LSL #2]\n"
+ "fmla z17.s, p3/M, z6.s, z29.s\n"
+ "fmla z26.s, p3/M, z3.s, z29.s\n"
+ "fmla z10.s, p3/M, z0.s, z29.s\n"
+ "ld1w { z22.s }, p2/Z, [x24, x7, LSL #2]\n"
+ "fmla z24.s, p3/M, z2.s, z9.s\n"
+ "fmla z12.s, p3/M, z7.s, z22.s\n"
+ "fmla z23.s, p3/M, z6.s, z22.s\n"
+ "fmla z26.s, p3/M, z8.s, z19.s\n"
+ "fmla z13.s, p3/M, z7.s, z19.s\n"
+ "fmla z20.s, p3/M, z6.s, z19.s\n"
+ "fmla z10.s, p3/M, z5.s, z19.s\n"
+ "ld1w { z21.s }, p2/Z, [x28, x14, LSL #2]\n"
+ "fmla z25.s, p3/M, z5.s, z9.s\n"
+ "fmla z12.s, p3/M, z5.s, z21.s\n"
+ "fmla z23.s, p3/M, z4.s, z21.s\n"
+ "fmla z24.s, p3/M, z3.s, z21.s\n"
+ "fmla z11.s, p3/M, z8.s, z9.s\n"
+ "ld1w { z19.s }, p2/Z, [x24, x14, LSL #2]\n"
+ "fmla z10.s, p3/M, z8.s, z22.s\n"
+ "ld1w { z22.s }, p2/Z, [x13, x5, LSL #2]\n"
+ "fmla z13.s, p3/M, z8.s, z21.s\n"
+ "fmla z20.s, p3/M, z7.s, z21.s\n"
+ "fmla z25.s, p3/M, z6.s, z21.s\n"
+ "fmla z12.s, p3/M, z8.s, z19.s\n"
+ "ld1w { z29.s }, p2/Z, [x13, x10, LSL #2]\n"
+ "fmla z23.s, p3/M, z7.s, z19.s\n"
+ "fmla z24.s, p3/M, z6.s, z19.s\n"
+ "ld1w { z21.s }, p2/Z, [x28, x5, LSL #2]\n"
+ "fmla z31.s, p3/M, z4.s, z22.s\n"
+ "fmla z18.s, p3/M, z3.s, z22.s\n"
+ "fmax z31.s, p3/M, z31.s, z15.s\n"
+ "fmax z18.s, p3/M, z18.s, z15.s\n"
+ "fmla z17.s, p3/M, z1.s, z22.s\n"
+ "fmla z14.s, p3/M, z0.s, z22.s\n"
+ "ld1w { z9.s }, p2/Z, [x28, x10, LSL #2]\n"
+ "fmax z17.s, p3/M, z17.s, z15.s\n"
+ "fmla z28.s, p3/M, z5.s, z29.s\n"
+ "fmla z27.s, p3/M, z4.s, z29.s\n"
+ "fmax z28.s, p3/M, z28.s, z15.s\n"
+ "fmax z27.s, p3/M, z27.s, z15.s\n"
+ "fmla z30.s, p3/M, z2.s, z29.s\n"
+ "fmla z11.s, p3/M, z1.s, z29.s\n"
+ "fmax z14.s, p3/M, z14.s, z15.s\n"
+ "fmax z30.s, p3/M, z30.s, z15.s\n"
+ "fmla z26.s, p3/M, z7.s, z21.s\n"
+ "fmla z13.s, p3/M, z6.s, z21.s\n"
+ "fmax z11.s, p3/M, z11.s, z15.s\n"
+ "fmax z26.s, p3/M, z26.s, z15.s\n"
+ "fmla z10.s, p3/M, z4.s, z21.s\n"
+ "fmla z12.s, p3/M, z3.s, z21.s\n"
+ "fmax z13.s, p3/M, z13.s, z15.s\n"
+ "fmax z10.s, p3/M, z10.s, z15.s\n"
+ "fmla z20.s, p3/M, z8.s, z9.s\n"
+ "fmla z25.s, p3/M, z7.s, z9.s\n"
+ "fmax z20.s, p3/M, z20.s, z15.s\n"
+ "fmax z25.s, p3/M, z25.s, z15.s\n"
+ "fmla z23.s, p3/M, z5.s, z9.s\n"
+ "fmla z24.s, p3/M, z4.s, z9.s\n"
+ "fmax z12.s, p3/M, z12.s, z15.s\n"
+ "fmax z23.s, p3/M, z23.s, z15.s\n"
+ "fmax z24.s, p3/M, z24.s, z15.s\n"
+ "fmin z31.s, p3/M, z31.s, z16.s\n"
+ "st1w { z31.s }, p0, [x15]\n"
+ "fmin z18.s, p3/M, z18.s, z16.s\n"
+ "fmin z28.s, p3/M, z28.s, z16.s\n"
+ "st1w { z18.s }, p0, [x15, x6, LSL #2]\n"
+ "fmin z27.s, p3/M, z27.s, z16.s\n"
+ "fmin z17.s, p3/M, z17.s, z16.s\n"
+ "st1w { z28.s }, p0, [x15, x25, LSL #2]\n"
+ "fmin z14.s, p3/M, z14.s, z16.s\n"
+ "fmin z30.s, p3/M, z30.s, z16.s\n"
+ "st1w { z27.s }, p0, [x15, x22, LSL #2]\n"
+ "fmin z11.s, p3/M, z11.s, z16.s\n"
+ "fmin z26.s, p3/M, z26.s, z16.s\n"
+ "st1w { z17.s }, p0, [x9]\n"
+ "fmin z13.s, p3/M, z13.s, z16.s\n"
+ "fmin z20.s, p3/M, z20.s, z16.s\n"
+ "st1w { z14.s }, p0, [x9, x6, LSL #2]\n"
+ "fmin z25.s, p3/M, z25.s, z16.s\n"
+ "fmin z10.s, p3/M, z10.s, z16.s\n"
+ "st1w { z30.s }, p0, [x9, x25, LSL #2]\n"
+ "fmin z12.s, p3/M, z12.s, z16.s\n"
+ "fmin z23.s, p3/M, z23.s, z16.s\n"
+ "st1w { z11.s }, p0, [x9, x22, LSL #2]\n"
+ "fmin z24.s, p3/M, z24.s, z16.s\n"
+ "st1w { z26.s }, p0, [x26]\n"
+ "st1w { z13.s }, p0, [x26, x6, LSL #2]\n"
+ "st1w { z20.s }, p0, [x26, x25, LSL #2]\n"
+ "st1w { z25.s }, p0, [x26, x22, LSL #2]\n"
+ "st1w { z10.s }, p0, [x23]\n"
+ "st1w { z12.s }, p0, [x23, x6, LSL #2]\n"
+ "st1w { z23.s }, p0, [x23, x25, LSL #2]\n"
+ "st1w { z24.s }, p0, [x23, x22, LSL #2]\n"
+ "blt 1b\n"
+ :
+ : [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp
new file mode 100644
index 0000000000..3db248924f
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp
@@ -0,0 +1,714 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
+ const float *const *const input_ptrs,
+ float *const *const outptrs,
+ const void *params,
+ unsigned int n_channels,
+ const float activation_min,
+ const float activation_max
+)
+{
+ struct Args
+ {
+ float *const *outptrs;
+ const void *params;
+ const float min, max;
+ const float *inptrs[36];
+
+ Args(
+ const float *const *const input_ptrs,
+ float *const *const outptrs,
+ const void *const params,
+ const float min,
+ const float max
+ ) : outptrs(outptrs), params(params), min(min), max(max)
+ {
+ inptrs[0] = input_ptrs[14];
+ inptrs[1] = input_ptrs[0];
+ inptrs[2] = input_ptrs[5];
+ inptrs[3] = input_ptrs[15];
+ inptrs[4] = input_ptrs[30];
+ inptrs[5] = input_ptrs[35];
+ inptrs[6] = input_ptrs[20];
+ inptrs[7] = input_ptrs[1];
+ inptrs[8] = input_ptrs[4];
+ inptrs[9] = input_ptrs[21];
+ inptrs[10] = input_ptrs[6];
+ inptrs[11] = input_ptrs[11];
+ inptrs[12] = input_ptrs[24];
+ inptrs[13] = input_ptrs[8];
+ inptrs[14] = input_ptrs[29];
+ inptrs[15] = input_ptrs[9];
+ inptrs[16] = input_ptrs[31];
+ inptrs[17] = input_ptrs[13];
+ inptrs[18] = input_ptrs[34];
+ inptrs[19] = input_ptrs[16];
+ inptrs[20] = input_ptrs[2];
+ inptrs[21] = input_ptrs[19];
+ inptrs[22] = input_ptrs[3];
+ inptrs[23] = input_ptrs[12];
+ inptrs[24] = input_ptrs[22];
+ inptrs[25] = input_ptrs[17];
+ inptrs[26] = input_ptrs[18];
+ inptrs[27] = input_ptrs[26];
+ inptrs[28] = input_ptrs[23];
+ inptrs[29] = input_ptrs[32];
+ inptrs[30] = input_ptrs[27];
+ inptrs[31] = input_ptrs[33];
+ inptrs[32] = input_ptrs[7];
+ inptrs[33] = input_ptrs[10];
+ inptrs[34] = input_ptrs[25];
+ inptrs[35] = input_ptrs[28];
+
+ }
+ };
+
+ Args params_struct(input_ptrs, outptrs, params,
+ activation_min, activation_max);
+
+ __asm__ __volatile__(
+ "ptrue p3.b\n"
+ "ldr x7, [%x[params_struct], %[offsetof_args_params]]\n"
+ "add x8, %x[params_struct], %[offsetof_Args_inptrs]\n"
+ "ld1w { z17.s }, p3/Z, [x7]\n"
+ "cntw x17\n"
+ "mov x16, #0x0\n"
+ "ld1w { z0.s }, p3/Z, [x7, #1, MUL VL]\n"
+ "ld1w { z1.s }, p3/Z, [x7, #2, MUL VL]\n"
+ "whilelt p2.s, XZR, %x[n_channels]\n"
+ "ld1w { z2.s }, p3/Z, [x7, #3, MUL VL]\n"
+ "ld1w { z3.s }, p3/Z, [x7, #4, MUL VL]\n"
+ "cmp x17, %x[n_channels]\n"
+ "ld1w { z4.s }, p3/Z, [x7, #5, MUL VL]\n"
+ "ld1w { z5.s }, p3/Z, [x7, #6, MUL VL]\n"
+ "sub x15, XZR, x17\n"
+ "ld1w { z6.s }, p3/Z, [x7, #7, MUL VL]\n"
+ "addvl x7, x7, #16\n"
+ "ldp x23, x22, [x8, #0x0]\n"
+ "ldp x21, x20, [x8, #0x10]\n"
+ "ldr x14, [%x[params_struct], %[offsetof_args_outptrs]]\n"
+ "ld1rw { z16.s }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+ "ld1rw { z19.s }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+ "ld1w { z7.s }, p3/Z, [x7, #-8, MUL VL]\n"
+ "ld1w { z8.s }, p3/Z, [x7, #-7, MUL VL]\n"
+ "addvl x7, x7, #-6\n"
+ "ld1w { z9.s }, p2/Z, [x23, x16, LSL #2]\n"
+ "ld1w { z10.s }, p2/Z, [x22, x16, LSL #2]\n"
+ "ld1w { z11.s }, p2/Z, [x21, x16, LSL #2]\n"
+ "ld1w { z12.s }, p2/Z, [x20, x16, LSL #2]\n"
+ "bge 2f\n"
+ "1:" // Channel loop
+ "movprfx z20, z17\n fmla z20.s, p3/M, z4.s, z9.s\n"
+ "movprfx z26, z17\n fmla z26.s, p3/M, z8.s, z9.s\n"
+ "ldr x27, [x8, #0x20]\n"
+ "ldr x24, [x8, #0x30]\n"
+ "movprfx z24, z17\n fmla z24.s, p3/M, z3.s, z9.s\n"
+ "movprfx z30, z17\n fmla z30.s, p3/M, z1.s, z9.s\n"
+ "ldr x23, [x8, #0x28]\n"
+ "ldr x22, [x8, #0x38]\n"
+ "movprfx z31, z17\n fmla z31.s, p3/M, z0.s, z9.s\n"
+ "movprfx z22, z17\n fmla z22.s, p3/M, z7.s, z9.s\n"
+ "ldr x26, [x8, #0x40]\n"
+ "ldr x21, [x8, #0x48]\n"
+ "movprfx z27, z17\n fmla z27.s, p3/M, z6.s, z9.s\n"
+ "fmla z20.s, p3/M, z5.s, z12.s\n"
+ "ldr x25, [x8, #0x50]\n"
+ "ldr x20, [x8, #0x58]\n"
+ "movprfx z14, z17\n fmla z14.s, p3/M, z5.s, z9.s\n"
+ "movprfx z23, z17\n fmla z23.s, p3/M, z2.s, z9.s\n"
+ "ld1w { z25.s }, p2/Z, [x24, x16, LSL #2]\n"
+ "ldr x13, [x8, #0x70]\n"
+ "fmla z26.s, p3/M, z0.s, z10.s\n"
+ "movprfx z9, z17\n fmla z9.s, p3/M, z2.s, z11.s\n"
+ "ld1w { z28.s }, p2/Z, [x27, x16, LSL #2]\n"
+ "ld1w { z21.s }, p2/Z, [x23, x16, LSL #2]\n"
+ "fmla z24.s, p3/M, z4.s, z12.s\n"
+ "fmla z30.s, p3/M, z2.s, z12.s\n"
+ "ldr x24, [x8, #0x60]\n"
+ "ldr x23, [x8, #0x68]\n"
+ "fmla z31.s, p3/M, z1.s, z12.s\n"
+ "fmla z22.s, p3/M, z8.s, z12.s\n"
+ "incw x15\n"
+ "mov p1.b, p2.b\n"
+ "fmla z27.s, p3/M, z7.s, z12.s\n"
+ "movprfx z15, z17\n fmla z15.s, p3/M, z6.s, z28.s\n"
+ "ld1w { z10.s }, p2/Z, [x21, x16, LSL #2]\n"
+ "ldr x28, [x8, #0x88]\n"
+ "fmla z20.s, p3/M, z7.s, z25.s\n"
+ "fmla z9.s, p3/M, z6.s, z12.s\n"
+ "ldr x12, [x14, #0x0]\n"
+ "ldr x11, [x14, #0x8]\n"
+ "movprfx z11, z17\n fmla z11.s, p3/M, z3.s, z12.s\n"
+ "movprfx z13, z17\n fmla z13.s, p3/M, z0.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x22, x16, LSL #2]\n"
+ "ldr x22, [x8, #0x78]\n"
+ "movprfx z28, z17\n fmla z28.s, p3/M, z8.s, z21.s\n"
+ "fmla z24.s, p3/M, z6.s, z25.s\n"
+ "ld1w { z29.s }, p2/Z, [x26, x16, LSL #2]\n"
+ "ldr x21, [x8, #0x80]\n"
+ "fmla z30.s, p3/M, z4.s, z25.s\n"
+ "fmla z31.s, p3/M, z3.s, z25.s\n"
+ "ldr x10, [x14, #0x10]\n"
+ "ldr x9, [x14, #0x18]\n"
+ "movprfx z18, z17\n fmla z18.s, p3/M, z1.s, z25.s\n"
+ "movprfx z21, z17\n fmla z21.s, p3/M, z0.s, z25.s\n"
+ "whilelt p0.s, x17, %x[n_channels]\n"
+ "ld1w { z17.s }, p3/Z, [x7]\n"
+ "fmla z14.s, p3/M, z8.s, z25.s\n"
+ "fmla z23.s, p3/M, z5.s, z25.s\n"
+ "fmla z15.s, p3/M, z2.s, z25.s\n"
+ "fmla z26.s, p3/M, z1.s, z12.s\n"
+ "ld1w { z25.s }, p2/Z, [x25, x16, LSL #2]\n"
+ "ldr x27, [x8, #0x90]\n"
+ "fmla z22.s, p3/M, z0.s, z12.s\n"
+ "fmla z27.s, p3/M, z2.s, z29.s\n"
+ "ld1w { z12.s }, p2/Z, [x20, x16, LSL #2]\n"
+ "ldr x20, [x8, #0x98]\n"
+ "fmla z20.s, p3/M, z8.s, z10.s\n"
+ "fmla z9.s, p3/M, z1.s, z29.s\n"
+ "ld1w { z29.s }, p2/Z, [x24, x16, LSL #2]\n"
+ "ldr x26, [x8, #0xa0]\n"
+ "fmla z24.s, p3/M, z7.s, z10.s\n"
+ "fmla z11.s, p3/M, z6.s, z10.s\n"
+ "fmla z30.s, p3/M, z5.s, z10.s\n"
+ "fmla z31.s, p3/M, z4.s, z10.s\n"
+ "fmla z13.s, p3/M, z3.s, z10.s\n"
+ "fmla z18.s, p3/M, z2.s, z10.s\n"
+ "fmla z21.s, p3/M, z1.s, z10.s\n"
+ "fmla z28.s, p3/M, z0.s, z10.s\n"
+ "ld1w { z10.s }, p2/Z, [x23, x16, LSL #2]\n"
+ "ldr x25, [x8, #0xa8]\n"
+ "fmla z26.s, p3/M, z3.s, z25.s\n"
+ "fmla z14.s, p3/M, z0.s, z25.s\n"
+ "fmla z23.s, p3/M, z6.s, z29.s\n"
+ "fmla z15.s, p3/M, z3.s, z29.s\n"
+ "ld1w { z25.s }, p2/Z, [x13, x16, LSL #2]\n"
+ "ldr x24, [x8, #0xb0]\n"
+ "fmla z22.s, p3/M, z4.s, z10.s\n"
+ "fmla z27.s, p3/M, z3.s, z10.s\n"
+ "fmla z20.s, p3/M, z1.s, z10.s\n"
+ "fmla z9.s, p3/M, z5.s, z12.s\n"
+ "fmla z11.s, p3/M, z2.s, z12.s\n"
+ "fmla z24.s, p3/M, z0.s, z10.s\n"
+ "ld1w { z12.s }, p2/Z, [x22, x16, LSL #2]\n"
+ "ldr x23, [x8, #0xb8]\n"
+ "fmla z13.s, p3/M, z8.s, z25.s\n"
+ "fmla z28.s, p3/M, z5.s, z25.s\n"
+ "ld1w { z25.s }, p2/Z, [x21, x16, LSL #2]\n"
+ "ldr x22, [x8, #0xc0]\n"
+ "fmla z26.s, p3/M, z5.s, z10.s\n"
+ "fmla z14.s, p3/M, z2.s, z10.s\n"
+ "ld1w { z29.s }, p2/Z, [x28, x16, LSL #2]\n"
+ "ldr x21, [x8, #0xc8]\n"
+ "fmla z22.s, p3/M, z5.s, z12.s\n"
+ "fmla z27.s, p3/M, z4.s, z12.s\n"
+ "fmla z20.s, p3/M, z2.s, z12.s\n"
+ "fmla z9.s, p3/M, z3.s, z12.s\n"
+ "fmla z24.s, p3/M, z1.s, z12.s\n"
+ "fmla z11.s, p3/M, z0.s, z12.s\n"
+ "ld1w { z10.s }, p2/Z, [x20, x16, LSL #2]\n"
+ "ldr x28, [x8, #0xd8]\n"
+ "fmla z15.s, p3/M, z7.s, z25.s\n"
+ "fmla z18.s, p3/M, z6.s, z25.s\n"
+ "ld1w { z25.s }, p2/Z, [x27, x16, LSL #2]\n"
+ "ldr x20, [x8, #0xd0]\n"
+ "fmla z26.s, p3/M, z7.s, z29.s\n"
+ "fmla z22.s, p3/M, z6.s, z29.s\n"
+ "fmla z14.s, p3/M, z4.s, z29.s\n"
+ "fmla z20.s, p3/M, z3.s, z29.s\n"
+ "fmla z23.s, p3/M, z1.s, z29.s\n"
+ "fmla z30.s, p3/M, z0.s, z29.s\n"
+ "ld1w { z29.s }, p2/Z, [x26, x16, LSL #2]\n"
+ "ldr x27, [x8, #0xe0]\n"
+ "fmla z27.s, p3/M, z8.s, z10.s\n"
+ "fmla z21.s, p3/M, z8.s, z25.s\n"
+ "fmla z28.s, p3/M, z7.s, z25.s\n"
+ "ld1w { z25.s }, p2/Z, [x25, x16, LSL #2]\n"
+ "fmla z13.s, p3/M, z1.s, z10.s\n"
+ "ldr x26, [x8, #0xe8]\n"
+ "fmla z9.s, p3/M, z7.s, z10.s\n"
+ "fmla z24.s, p3/M, z5.s, z10.s\n"
+ "fmla z11.s, p3/M, z4.s, z10.s\n"
+ "fmla z31.s, p3/M, z2.s, z10.s\n"
+ "ld1w { z10.s }, p2/Z, [x24, x16, LSL #2]\n"
+ "ldr x25, [x8, #0xf0]\n"
+ "fmla z26.s, p3/M, z2.s, z29.s\n"
+ "fmla z22.s, p3/M, z1.s, z29.s\n"
+ "fmla z27.s, p3/M, z0.s, z29.s\n"
+ "fmla z14.s, p3/M, z7.s, z25.s\n"
+ "ld1w { z29.s }, p2/Z, [x23, x16, LSL #2]\n"
+ "ldr x24, [x8, #0xf8]\n"
+ "fmla z20.s, p3/M, z6.s, z25.s\n"
+ "fmla z23.s, p3/M, z4.s, z25.s\n"
+ "fmla z30.s, p3/M, z3.s, z25.s\n"
+ "fmla z15.s, p3/M, z1.s, z25.s\n"
+ "fmla z18.s, p3/M, z0.s, z25.s\n"
+ "ld1w { z25.s }, p2/Z, [x22, x16, LSL #2]\n"
+ "fmla z13.s, p3/M, z4.s, z25.s\n"
+ "ldr x23, [x8, #0x100]\n"
+ "fmla z21.s, p3/M, z2.s, z25.s\n"
+ "fmla z22.s, p3/M, z2.s, z10.s\n"
+ "fmla z27.s, p3/M, z1.s, z10.s\n"
+ "fmla z9.s, p3/M, z0.s, z10.s\n"
+ "ld1w { z12.s }, p2/Z, [x21, x16, LSL #2]\n"
+ "ldr x22, [x8, #0x108]\n"
+ "fmla z26.s, p3/M, z6.s, z29.s\n"
+ "fmla z14.s, p3/M, z3.s, z29.s\n"
+ "fmla z23.s, p3/M, z0.s, z29.s\n"
+ "fmla z24.s, p3/M, z8.s, z25.s\n"
+ "ld1w { z10.s }, p2/Z, [x20, x16, LSL #2]\n"
+ "ldr x21, [x8, #0x110]\n"
+ "fmla z11.s, p3/M, z7.s, z25.s\n"
+ "fmla z31.s, p3/M, z5.s, z25.s\n"
+ "fmla z28.s, p3/M, z1.s, z25.s\n"
+ "ld1w { z25.s }, p2/Z, [x28, x16, LSL #2]\n"
+ "fmla z13.s, p3/M, z2.s, z12.s\n"
+ "ldr x20, [x8, #0x118]\n"
+ "fmla z15.s, p3/M, z0.s, z10.s\n"
+ "fmla z18.s, p3/M, z4.s, z25.s\n"
+ "fmla z21.s, p3/M, z3.s, z25.s\n"
+ "fmla z9.s, p3/M, z8.s, z12.s\n"
+ "fmla z11.s, p3/M, z5.s, z12.s\n"
+ "fmla z14.s, p3/M, z6.s, z10.s\n"
+ "ld1w { z12.s }, p2/Z, [x27, x16, LSL #2]\n"
+ "fmla z23.s, p3/M, z3.s, z10.s\n"
+ "ld1w { z29.s }, p2/Z, [x26, x16, LSL #2]\n"
+ "fmla z30.s, p3/M, z7.s, z25.s\n"
+ "fmla z31.s, p3/M, z6.s, z25.s\n"
+ "fmla z15.s, p3/M, z5.s, z25.s\n"
+ "fmla z13.s, p3/M, z5.s, z12.s\n"
+ "fmla z28.s, p3/M, z2.s, z12.s\n"
+ "fmla z18.s, p3/M, z7.s, z29.s\n"
+ "fmla z21.s, p3/M, z6.s, z29.s\n"
+ "fmla z23.s, p3/M, z8.s, z25.s\n"
+ "ld1w { z25.s }, p2/Z, [x25, x16, LSL #2]\n"
+ "fmla z15.s, p3/M, z8.s, z29.s\n"
+ "ld1w { z29.s }, p2/Z, [x23, x16, LSL #2]\n"
+ "fmla z30.s, p3/M, z8.s, z25.s\n"
+ "fmla z31.s, p3/M, z7.s, z25.s\n"
+ "fmla z13.s, p3/M, z6.s, z25.s\n"
+ "fmla z18.s, p3/M, z5.s, z25.s\n"
+ "fmla z21.s, p3/M, z4.s, z25.s\n"
+ "fmla z28.s, p3/M, z3.s, z25.s\n"
+ "ld1w { z25.s }, p2/Z, [x22, x16, LSL #2]\n"
+ "ldp x27, x26, [x8, #0x0]\n"
+ "fmla z11.s, p3/M, z8.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x24, x16, LSL #2]\n"
+ "fmla z26.s, p3/M, z4.s, z29.s\n"
+ "fmax z26.s, p3/M, z26.s, z16.s\n"
+ "fmla z22.s, p3/M, z3.s, z29.s\n"
+ "fmla z27.s, p3/M, z5.s, z25.s\n"
+ "fmax z22.s, p3/M, z22.s, z16.s\n"
+ "fmax z27.s, p3/M, z27.s, z16.s\n"
+ "fmla z9.s, p3/M, z4.s, z25.s\n"
+ "fmla z18.s, p3/M, z8.s, z12.s\n"
+ "fmax z9.s, p3/M, z9.s, z16.s\n"
+ "fmin z26.s, p3/M, z26.s, z19.s\n"
+ "fmla z21.s, p3/M, z7.s, z12.s\n"
+ "fmla z28.s, p3/M, z6.s, z12.s\n"
+ "ld1w { z10.s }, p2/Z, [x21, x16, LSL #2]\n"
+ "fmin z22.s, p3/M, z22.s, z19.s\n"
+ "fmla z14.s, p3/M, z1.s, z29.s\n"
+ "fmla z20.s, p3/M, z0.s, z29.s\n"
+ "ld1w { z12.s }, p2/Z, [x20, x16, LSL #2]\n"
+ "fmin z27.s, p3/M, z27.s, z19.s\n"
+ "fmla z24.s, p3/M, z2.s, z25.s\n"
+ "fmla z11.s, p3/M, z1.s, z25.s\n"
+ "fmin z9.s, p3/M, z9.s, z19.s\n"
+ "fmax z14.s, p3/M, z14.s, z16.s\n"
+ "fmla z23.s, p3/M, z7.s, z10.s\n"
+ "fmla z30.s, p3/M, z6.s, z10.s\n"
+ "fmax z20.s, p3/M, z20.s, z16.s\n"
+ "fmax z24.s, p3/M, z24.s, z16.s\n"
+ "fmla z31.s, p3/M, z8.s, z12.s\n"
+ "fmla z13.s, p3/M, z7.s, z12.s\n"
+ "fmax z11.s, p3/M, z11.s, z16.s\n"
+ "st1w { z26.s }, p1, [x12, x15, LSL #2]\n"
+ "st1w { z22.s }, p1, [x11, x15, LSL #2]\n"
+ "ldr x23, [x14, #0x20]\n"
+ "ldr x22, [x14, #0x28]\n"
+ "fmla z15.s, p3/M, z4.s, z10.s\n"
+ "st1w { z27.s }, p1, [x10, x15, LSL #2]\n"
+ "ldr x21, [x14, #0x30]\n"
+ "fmla z18.s, p3/M, z3.s, z10.s\n"
+ "fmla z21.s, p3/M, z5.s, z12.s\n"
+ "st1w { z9.s }, p1, [x9, x15, LSL #2]\n"
+ "ldr x20, [x14, #0x38]\n"
+ "fmla z28.s, p3/M, z4.s, z12.s\n"
+ "ldp x25, x24, [x8, #0x10]\n"
+ "fmin z14.s, p3/M, z14.s, z19.s\n"
+ "fmin z20.s, p3/M, z20.s, z19.s\n"
+ "st1w { z14.s }, p1, [x23, x15, LSL #2]\n"
+ "ldr x23, [x14, #0x40]\n"
+ "fmin z24.s, p3/M, z24.s, z19.s\n"
+ "fmin z11.s, p3/M, z11.s, z19.s\n"
+ "st1w { z20.s }, p1, [x22, x15, LSL #2]\n"
+ "ldr x22, [x14, #0x48]\n"
+ "fmax z23.s, p3/M, z23.s, z16.s\n"
+ "fmax z30.s, p3/M, z30.s, z16.s\n"
+ "st1w { z24.s }, p1, [x21, x15, LSL #2]\n"
+ "ldr x21, [x14, #0x50]\n"
+ "fmax z31.s, p3/M, z31.s, z16.s\n"
+ "fmax z13.s, p3/M, z13.s, z16.s\n"
+ "st1w { z11.s }, p1, [x20, x15, LSL #2]\n"
+ "ldr x20, [x14, #0x58]\n"
+ "incw x16\n"
+ "ld1w { z9.s }, p0/Z, [x27, x17, LSL #2]\n"
+ "ld1w { z10.s }, p0/Z, [x26, x17, LSL #2]\n"
+ "fmin z23.s, p3/M, z23.s, z19.s\n"
+ "ld1w { z11.s }, p0/Z, [x25, x17, LSL #2]\n"
+ "ld1w { z12.s }, p0/Z, [x24, x17, LSL #2]\n"
+ "incw x17\n"
+ "fmin z30.s, p3/M, z30.s, z19.s\n"
+ "fmin z31.s, p3/M, z31.s, z19.s\n"
+ "fmin z13.s, p3/M, z13.s, z19.s\n"
+ "st1w { z23.s }, p1, [x23, x15, LSL #2]\n"
+ "ldr x23, [x14, #0x60]\n"
+ "fmax z15.s, p3/M, z15.s, z16.s\n"
+ "fmax z18.s, p3/M, z18.s, z16.s\n"
+ "st1w { z30.s }, p1, [x22, x15, LSL #2]\n"
+ "ldr x22, [x14, #0x68]\n"
+ "fmax z21.s, p3/M, z21.s, z16.s\n"
+ "fmax z28.s, p3/M, z28.s, z16.s\n"
+ "st1w { z31.s }, p1, [x21, x15, LSL #2]\n"
+ "ldr x21, [x14, #0x70]\n"
+ "st1w { z13.s }, p1, [x20, x15, LSL #2]\n"
+ "ldr x20, [x14, #0x78]\n"
+ "ld1w { z0.s }, p3/Z, [x7, #1, MUL VL]\n"
+ "whilelt p2.s, x16, %x[n_channels]\n"
+ "ld1w { z1.s }, p3/Z, [x7, #2, MUL VL]\n"
+ "ld1w { z2.s }, p3/Z, [x7, #3, MUL VL]\n"
+ "cmp x17, %x[n_channels]\n"
+ "fmin z15.s, p3/M, z15.s, z19.s\n"
+ "ld1w { z3.s }, p3/Z, [x7, #4, MUL VL]\n"
+ "ld1w { z4.s }, p3/Z, [x7, #5, MUL VL]\n"
+ "fmin z18.s, p3/M, z18.s, z19.s\n"
+ "fmin z21.s, p3/M, z21.s, z19.s\n"
+ "ld1w { z5.s }, p3/Z, [x7, #6, MUL VL]\n"
+ "ld1w { z6.s }, p3/Z, [x7, #7, MUL VL]\n"
+ "addvl x7, x7, #16\n"
+ "fmin z28.s, p3/M, z28.s, z19.s\n"
+ "st1w { z15.s }, p1, [x23, x15, LSL #2]\n"
+ "ld1w { z7.s }, p3/Z, [x7, #-8, MUL VL]\n"
+ "ld1w { z8.s }, p3/Z, [x7, #-7, MUL VL]\n"
+ "addvl x7, x7, #-6\n"
+ "st1w { z18.s }, p1, [x22, x15, LSL #2]\n"
+ "st1w { z21.s }, p1, [x21, x15, LSL #2]\n"
+ "st1w { z28.s }, p1, [x20, x15, LSL #2]\n"
+ "blt 1b\n"
+ "2:" // Channel tail
+ "movprfx z14, z17\n fmla z14.s, p3/M, z4.s, z9.s\n"
+ "movprfx z18, z17\n fmla z18.s, p3/M, z8.s, z9.s\n"
+ "ldr x27, [x8, #0x20]\n"
+ "ldr x24, [x8, #0x30]\n"
+ "movprfx z15, z17\n fmla z15.s, p3/M, z3.s, z9.s\n"
+ "movprfx z30, z17\n fmla z30.s, p3/M, z1.s, z9.s\n"
+ "ldr x23, [x8, #0x28]\n"
+ "ldr x22, [x8, #0x38]\n"
+ "movprfx z20, z17\n fmla z20.s, p3/M, z0.s, z9.s\n"
+ "movprfx z13, z17\n fmla z13.s, p3/M, z7.s, z9.s\n"
+ "ldr x26, [x8, #0x40]\n"
+ "ldr x21, [x8, #0x48]\n"
+ "movprfx z22, z17\n fmla z22.s, p3/M, z6.s, z9.s\n"
+ "fmla z14.s, p3/M, z5.s, z12.s\n"
+ "ldr x25, [x8, #0x50]\n"
+ "ldr x20, [x8, #0x58]\n"
+ "movprfx z27, z17\n fmla z27.s, p3/M, z5.s, z9.s\n"
+ "movprfx z31, z17\n fmla z31.s, p3/M, z2.s, z9.s\n"
+ "ld1w { z23.s }, p2/Z, [x24, x16, LSL #2]\n"
+ "ldr x13, [x8, #0x70]\n"
+ "fmla z18.s, p3/M, z0.s, z10.s\n"
+ "movprfx z9, z17\n fmla z9.s, p3/M, z2.s, z11.s\n"
+ "ld1w { z21.s }, p2/Z, [x27, x16, LSL #2]\n"
+ "ld1w { z25.s }, p2/Z, [x23, x16, LSL #2]\n"
+ "fmla z15.s, p3/M, z4.s, z12.s\n"
+ "fmla z30.s, p3/M, z2.s, z12.s\n"
+ "ldr x24, [x8, #0x60]\n"
+ "ldr x23, [x8, #0x68]\n"
+ "fmla z20.s, p3/M, z1.s, z12.s\n"
+ "fmla z13.s, p3/M, z8.s, z12.s\n"
+ "incw x15\n"
+ "mov p0.b, p2.b\n"
+ "fmla z22.s, p3/M, z7.s, z12.s\n"
+ "movprfx z28, z17\n fmla z28.s, p3/M, z6.s, z21.s\n"
+ "ld1w { z29.s }, p2/Z, [x21, x16, LSL #2]\n"
+ "ldr x28, [x8, #0x88]\n"
+ "fmla z14.s, p3/M, z7.s, z23.s\n"
+ "fmla z9.s, p3/M, z6.s, z12.s\n"
+ "ldr x12, [x14, #0x0]\n"
+ "ldr x11, [x14, #0x8]\n"
+ "movprfx z11, z17\n fmla z11.s, p3/M, z3.s, z12.s\n"
+ "movprfx z10, z17\n fmla z10.s, p3/M, z0.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x22, x16, LSL #2]\n"
+ "ldr x22, [x8, #0x78]\n"
+ "movprfx z26, z17\n fmla z26.s, p3/M, z8.s, z25.s\n"
+ "fmla z15.s, p3/M, z6.s, z23.s\n"
+ "ld1w { z21.s }, p2/Z, [x26, x16, LSL #2]\n"
+ "ldr x21, [x8, #0x80]\n"
+ "fmla z30.s, p3/M, z4.s, z23.s\n"
+ "fmla z20.s, p3/M, z3.s, z23.s\n"
+ "ldr x10, [x14, #0x10]\n"
+ "ldr x9, [x14, #0x18]\n"
+ "movprfx z25, z17\n fmla z25.s, p3/M, z1.s, z23.s\n"
+ "movprfx z24, z17\n fmla z24.s, p3/M, z0.s, z23.s\n"
+ "fmla z27.s, p3/M, z8.s, z23.s\n"
+ "fmla z31.s, p3/M, z5.s, z23.s\n"
+ "fmla z28.s, p3/M, z2.s, z23.s\n"
+ "fmla z18.s, p3/M, z1.s, z12.s\n"
+ "ld1w { z23.s }, p2/Z, [x25, x16, LSL #2]\n"
+ "ldr x27, [x8, #0x90]\n"
+ "fmla z13.s, p3/M, z0.s, z12.s\n"
+ "fmla z22.s, p3/M, z2.s, z21.s\n"
+ "ld1w { z12.s }, p2/Z, [x20, x16, LSL #2]\n"
+ "ldr x20, [x8, #0x98]\n"
+ "fmla z14.s, p3/M, z8.s, z29.s\n"
+ "fmla z9.s, p3/M, z1.s, z21.s\n"
+ "ld1w { z21.s }, p2/Z, [x24, x16, LSL #2]\n"
+ "ldr x26, [x8, #0xa0]\n"
+ "fmla z15.s, p3/M, z7.s, z29.s\n"
+ "fmla z11.s, p3/M, z6.s, z29.s\n"
+ "fmla z30.s, p3/M, z5.s, z29.s\n"
+ "fmla z20.s, p3/M, z4.s, z29.s\n"
+ "fmla z10.s, p3/M, z3.s, z29.s\n"
+ "fmla z25.s, p3/M, z2.s, z29.s\n"
+ "fmla z24.s, p3/M, z1.s, z29.s\n"
+ "fmla z26.s, p3/M, z0.s, z29.s\n"
+ "ld1w { z29.s }, p2/Z, [x23, x16, LSL #2]\n"
+ "ldr x25, [x8, #0xa8]\n"
+ "fmla z18.s, p3/M, z3.s, z23.s\n"
+ "fmla z27.s, p3/M, z0.s, z23.s\n"
+ "fmla z31.s, p3/M, z6.s, z21.s\n"
+ "fmla z28.s, p3/M, z3.s, z21.s\n"
+ "ld1w { z21.s }, p2/Z, [x13, x16, LSL #2]\n"
+ "ldr x24, [x8, #0xb0]\n"
+ "fmla z13.s, p3/M, z4.s, z29.s\n"
+ "fmla z22.s, p3/M, z3.s, z29.s\n"
+ "fmla z14.s, p3/M, z1.s, z29.s\n"
+ "fmla z9.s, p3/M, z5.s, z12.s\n"
+ "fmla z11.s, p3/M, z2.s, z12.s\n"
+ "fmla z15.s, p3/M, z0.s, z29.s\n"
+ "ld1w { z17.s }, p2/Z, [x22, x16, LSL #2]\n"
+ "ldr x23, [x8, #0xb8]\n"
+ "fmla z10.s, p3/M, z8.s, z21.s\n"
+ "fmla z26.s, p3/M, z5.s, z21.s\n"
+ "ld1w { z23.s }, p2/Z, [x21, x16, LSL #2]\n"
+ "ldr x22, [x8, #0xc0]\n"
+ "fmla z18.s, p3/M, z5.s, z29.s\n"
+ "fmla z27.s, p3/M, z2.s, z29.s\n"
+ "ld1w { z21.s }, p2/Z, [x28, x16, LSL #2]\n"
+ "ldr x21, [x8, #0xc8]\n"
+ "fmla z13.s, p3/M, z5.s, z17.s\n"
+ "fmla z22.s, p3/M, z4.s, z17.s\n"
+ "fmla z14.s, p3/M, z2.s, z17.s\n"
+ "fmla z9.s, p3/M, z3.s, z17.s\n"
+ "fmla z15.s, p3/M, z1.s, z17.s\n"
+ "fmla z11.s, p3/M, z0.s, z17.s\n"
+ "ld1w { z29.s }, p2/Z, [x20, x16, LSL #2]\n"
+ "ldr x28, [x8, #0xd8]\n"
+ "fmla z28.s, p3/M, z7.s, z23.s\n"
+ "fmla z25.s, p3/M, z6.s, z23.s\n"
+ "ld1w { z23.s }, p2/Z, [x27, x16, LSL #2]\n"
+ "ldr x20, [x8, #0xd0]\n"
+ "fmla z18.s, p3/M, z7.s, z21.s\n"
+ "fmla z13.s, p3/M, z6.s, z21.s\n"
+ "fmla z27.s, p3/M, z4.s, z21.s\n"
+ "fmla z14.s, p3/M, z3.s, z21.s\n"
+ "fmla z31.s, p3/M, z1.s, z21.s\n"
+ "fmla z30.s, p3/M, z0.s, z21.s\n"
+ "ld1w { z21.s }, p2/Z, [x26, x16, LSL #2]\n"
+ "ldr x27, [x8, #0xe0]\n"
+ "fmla z22.s, p3/M, z8.s, z29.s\n"
+ "fmla z24.s, p3/M, z8.s, z23.s\n"
+ "fmla z26.s, p3/M, z7.s, z23.s\n"
+ "ld1w { z23.s }, p2/Z, [x25, x16, LSL #2]\n"
+ "fmla z10.s, p3/M, z1.s, z29.s\n"
+ "ldr x26, [x8, #0xe8]\n"
+ "fmla z9.s, p3/M, z7.s, z29.s\n"
+ "fmla z15.s, p3/M, z5.s, z29.s\n"
+ "fmla z11.s, p3/M, z4.s, z29.s\n"
+ "fmla z20.s, p3/M, z2.s, z29.s\n"
+ "ld1w { z29.s }, p2/Z, [x24, x16, LSL #2]\n"
+ "ldr x25, [x8, #0xf0]\n"
+ "fmla z18.s, p3/M, z2.s, z21.s\n"
+ "fmla z13.s, p3/M, z1.s, z21.s\n"
+ "fmla z22.s, p3/M, z0.s, z21.s\n"
+ "fmla z27.s, p3/M, z7.s, z23.s\n"
+ "ld1w { z21.s }, p2/Z, [x23, x16, LSL #2]\n"
+ "ldr x24, [x8, #0xf8]\n"
+ "fmla z14.s, p3/M, z6.s, z23.s\n"
+ "fmla z31.s, p3/M, z4.s, z23.s\n"
+ "fmla z30.s, p3/M, z3.s, z23.s\n"
+ "fmla z28.s, p3/M, z1.s, z23.s\n"
+ "fmla z25.s, p3/M, z0.s, z23.s\n"
+ "ld1w { z17.s }, p2/Z, [x22, x16, LSL #2]\n"
+ "fmla z10.s, p3/M, z4.s, z17.s\n"
+ "ldr x23, [x8, #0x100]\n"
+ "fmla z24.s, p3/M, z2.s, z17.s\n"
+ "fmla z13.s, p3/M, z2.s, z29.s\n"
+ "fmla z22.s, p3/M, z1.s, z29.s\n"
+ "fmla z9.s, p3/M, z0.s, z29.s\n"
+ "ld1w { z23.s }, p2/Z, [x21, x16, LSL #2]\n"
+ "ldr x22, [x8, #0x108]\n"
+ "fmla z18.s, p3/M, z6.s, z21.s\n"
+ "fmla z27.s, p3/M, z3.s, z21.s\n"
+ "fmla z31.s, p3/M, z0.s, z21.s\n"
+ "fmla z15.s, p3/M, z8.s, z17.s\n"
+ "ld1w { z29.s }, p2/Z, [x20, x16, LSL #2]\n"
+ "ldr x21, [x8, #0x110]\n"
+ "fmla z11.s, p3/M, z7.s, z17.s\n"
+ "fmla z20.s, p3/M, z5.s, z17.s\n"
+ "fmla z26.s, p3/M, z1.s, z17.s\n"
+ "ld1w { z21.s }, p2/Z, [x28, x16, LSL #2]\n"
+ "fmla z10.s, p3/M, z2.s, z23.s\n"
+ "ldr x20, [x8, #0x118]\n"
+ "fmla z28.s, p3/M, z0.s, z29.s\n"
+ "fmla z25.s, p3/M, z4.s, z21.s\n"
+ "fmla z24.s, p3/M, z3.s, z21.s\n"
+ "fmla z9.s, p3/M, z8.s, z23.s\n"
+ "fmla z11.s, p3/M, z5.s, z23.s\n"
+ "fmla z27.s, p3/M, z6.s, z29.s\n"
+ "ld1w { z23.s }, p2/Z, [x27, x16, LSL #2]\n"
+ "fmla z31.s, p3/M, z3.s, z29.s\n"
+ "ld1w { z17.s }, p2/Z, [x26, x16, LSL #2]\n"
+ "fmla z30.s, p3/M, z7.s, z21.s\n"
+ "fmla z20.s, p3/M, z6.s, z21.s\n"
+ "fmla z28.s, p3/M, z5.s, z21.s\n"
+ "fmla z10.s, p3/M, z5.s, z23.s\n"
+ "fmla z26.s, p3/M, z2.s, z23.s\n"
+ "fmla z25.s, p3/M, z7.s, z17.s\n"
+ "fmla z24.s, p3/M, z6.s, z17.s\n"
+ "fmla z31.s, p3/M, z8.s, z21.s\n"
+ "ld1w { z21.s }, p2/Z, [x25, x16, LSL #2]\n"
+ "fmla z28.s, p3/M, z8.s, z17.s\n"
+ "ld1w { z12.s }, p2/Z, [x23, x16, LSL #2]\n"
+ "fmla z30.s, p3/M, z8.s, z21.s\n"
+ "fmla z20.s, p3/M, z7.s, z21.s\n"
+ "fmla z10.s, p3/M, z6.s, z21.s\n"
+ "fmla z25.s, p3/M, z5.s, z21.s\n"
+ "fmla z24.s, p3/M, z4.s, z21.s\n"
+ "fmla z26.s, p3/M, z3.s, z21.s\n"
+ "ld1w { z21.s }, p2/Z, [x22, x16, LSL #2]\n"
+ "fmla z11.s, p3/M, z8.s, z23.s\n"
+ "ld1w { z29.s }, p2/Z, [x24, x16, LSL #2]\n"
+ "fmla z18.s, p3/M, z4.s, z12.s\n"
+ "fmax z18.s, p3/M, z18.s, z16.s\n"
+ "fmla z13.s, p3/M, z3.s, z12.s\n"
+ "fmla z22.s, p3/M, z5.s, z21.s\n"
+ "fmax z13.s, p3/M, z13.s, z16.s\n"
+ "fmax z22.s, p3/M, z22.s, z16.s\n"
+ "fmla z9.s, p3/M, z4.s, z21.s\n"
+ "fmla z25.s, p3/M, z8.s, z29.s\n"
+ "fmax z9.s, p3/M, z9.s, z16.s\n"
+ "fmin z18.s, p3/M, z18.s, z19.s\n"
+ "fmla z24.s, p3/M, z7.s, z29.s\n"
+ "fmla z26.s, p3/M, z6.s, z29.s\n"
+ "ld1w { z23.s }, p2/Z, [x21, x16, LSL #2]\n"
+ "fmin z13.s, p3/M, z13.s, z19.s\n"
+ "fmla z27.s, p3/M, z1.s, z12.s\n"
+ "fmla z14.s, p3/M, z0.s, z12.s\n"
+ "ld1w { z29.s }, p2/Z, [x20, x16, LSL #2]\n"
+ "fmin z22.s, p3/M, z22.s, z19.s\n"
+ "fmla z15.s, p3/M, z2.s, z21.s\n"
+ "fmla z11.s, p3/M, z1.s, z21.s\n"
+ "fmin z9.s, p3/M, z9.s, z19.s\n"
+ "fmax z27.s, p3/M, z27.s, z16.s\n"
+ "fmla z31.s, p3/M, z7.s, z23.s\n"
+ "fmla z30.s, p3/M, z6.s, z23.s\n"
+ "fmax z14.s, p3/M, z14.s, z16.s\n"
+ "fmax z15.s, p3/M, z15.s, z16.s\n"
+ "fmla z20.s, p3/M, z8.s, z29.s\n"
+ "fmla z10.s, p3/M, z7.s, z29.s\n"
+ "fmax z11.s, p3/M, z11.s, z16.s\n"
+ "st1w { z18.s }, p0, [x12, x15, LSL #2]\n"
+ "st1w { z13.s }, p0, [x11, x15, LSL #2]\n"
+ "ldr x23, [x14, #0x20]\n"
+ "ldr x22, [x14, #0x28]\n"
+ "fmla z28.s, p3/M, z4.s, z23.s\n"
+ "st1w { z22.s }, p0, [x10, x15, LSL #2]\n"
+ "ldr x21, [x14, #0x30]\n"
+ "fmla z25.s, p3/M, z3.s, z23.s\n"
+ "fmla z24.s, p3/M, z5.s, z29.s\n"
+ "st1w { z9.s }, p0, [x9, x15, LSL #2]\n"
+ "ldr x20, [x14, #0x38]\n"
+ "fmla z26.s, p3/M, z4.s, z29.s\n"
+ "fmin z27.s, p3/M, z27.s, z19.s\n"
+ "fmin z14.s, p3/M, z14.s, z19.s\n"
+ "fmin z15.s, p3/M, z15.s, z19.s\n"
+ "st1w { z27.s }, p0, [x23, x15, LSL #2]\n"
+ "ldr x23, [x14, #0x40]\n"
+ "fmin z11.s, p3/M, z11.s, z19.s\n"
+ "fmax z31.s, p3/M, z31.s, z16.s\n"
+ "st1w { z14.s }, p0, [x22, x15, LSL #2]\n"
+ "ldr x22, [x14, #0x48]\n"
+ "fmax z30.s, p3/M, z30.s, z16.s\n"
+ "fmax z20.s, p3/M, z20.s, z16.s\n"
+ "st1w { z15.s }, p0, [x21, x15, LSL #2]\n"
+ "ldr x21, [x14, #0x50]\n"
+ "fmax z10.s, p3/M, z10.s, z16.s\n"
+ "st1w { z11.s }, p0, [x20, x15, LSL #2]\n"
+ "ldr x20, [x14, #0x58]\n"
+ "fmin z31.s, p3/M, z31.s, z19.s\n"
+ "fmin z30.s, p3/M, z30.s, z19.s\n"
+ "fmin z20.s, p3/M, z20.s, z19.s\n"
+ "st1w { z31.s }, p0, [x23, x15, LSL #2]\n"
+ "ldr x23, [x14, #0x60]\n"
+ "fmin z10.s, p3/M, z10.s, z19.s\n"
+ "fmax z28.s, p3/M, z28.s, z16.s\n"
+ "st1w { z30.s }, p0, [x22, x15, LSL #2]\n"
+ "ldr x22, [x14, #0x68]\n"
+ "fmax z25.s, p3/M, z25.s, z16.s\n"
+ "fmax z24.s, p3/M, z24.s, z16.s\n"
+ "st1w { z20.s }, p0, [x21, x15, LSL #2]\n"
+ "ldr x21, [x14, #0x70]\n"
+ "fmax z26.s, p3/M, z26.s, z16.s\n"
+ "st1w { z10.s }, p0, [x20, x15, LSL #2]\n"
+ "ldr x20, [x14, #0x78]\n"
+ "fmin z28.s, p3/M, z28.s, z19.s\n"
+ "fmin z25.s, p3/M, z25.s, z19.s\n"
+ "fmin z24.s, p3/M, z24.s, z19.s\n"
+ "st1w { z28.s }, p0, [x23, x15, LSL #2]\n"
+ "fmin z26.s, p3/M, z26.s, z19.s\n"
+ "st1w { z25.s }, p0, [x22, x15, LSL #2]\n"
+ "st1w { z24.s }, p0, [x21, x15, LSL #2]\n"
+ "st1w { z26.s }, p0, [x20, x15, LSL #2]\n"
+ :
+ : [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
new file mode 100644
index 0000000000..75d62007ab
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl(const float *const *const input_ptrs, float *const *const outptrs, const void *params, unsigned int n_channels, const float activation_min, const float activation_max);
+void sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl(const unsigned int n_tile_rows, const unsigned int n_tile_cols, const float *inptr, int64_t ld_input_row, int64_t ld_input_col, float *outptr, int64_t ld_output_row, int64_t ld_output_col, const void *params, unsigned int n_channels, const float activation_min, const float activation_max);
+
+class sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy<float, float, float, float>
+{
+ private:
+ using Parent = DepthwiseDepthfirstStrategy<float, float, float, float>;
+ Parent::IndirectKernelType m_indirect_kernel = sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl;
+ Parent::DirectKernelType m_direct_kernel = sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl;
+
+ public:
+ using return_type = float;
+ constexpr static auto vl_type = arm_gemm::VLType::SVE;
+
+ constexpr static unsigned int kernel_rows = 3;
+ constexpr static unsigned int kernel_cols = 3;
+
+ constexpr static unsigned int stride_rows = 2;
+ constexpr static unsigned int stride_cols = 2;
+
+ constexpr static unsigned int output_rows = 2;
+ constexpr static unsigned int output_cols = 2;
+
+ sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst(const CPUInfo *)
+ : Parent(output_rows, output_cols, kernel_rows, kernel_cols, stride_rows, stride_cols) {}
+
+ arm_gemm::VLType get_vl_type(void) const override { return vl_type; }
+
+ Parent::IndirectKernelType get_indirect_kernel() const override { return m_indirect_kernel; }
+ Parent::DirectKernelType get_direct_kernel() const override { return m_direct_kernel; }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp
new file mode 100644
index 0000000000..e6090fda94
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp
@@ -0,0 +1,337 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+// Direct-addressing SVE kernel: walks an n_tile_rows x n_tile_cols grid of
+// 2x2 output tiles, reading the input via base pointer + row/column strides
+// (so it assumes no padding), applying the 3x3 MLA per channel vector and
+// clamping results to [activation_min, activation_max].
+void sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl(
+ const unsigned int n_tile_rows,
+ const unsigned int n_tile_cols,
+ const float *inptr,
+ int64_t ld_input_row,
+ int64_t ld_input_col,
+ float *outptr,
+ int64_t ld_output_row,
+ int64_t ld_output_col,
+ const void *params,
+ unsigned int n_channels,
+ const float activation_min,
+ const float activation_max
+)
+{
+ // Plain argument block handed to the assembly through one pointer; field
+ // offsets are taken with offsetof() in the operand list below, so this
+ // layout must stay in sync with the [offsetof_args_*] asm operands.
+ struct Args
+ {
+ const uint64_t n_tile_rows, n_tile_cols;
+ const float *inptr;
+ const uint64_t ld_input_row;
+ const uint64_t ld_input_col;
+ float *outptr;
+ const uint64_t ld_output_row;
+ const uint64_t ld_output_col;
+ const void *params;
+ const float min, max;
+
+ // Current tile indices; spilled/reloaded by the assembly tile loop.
+ uint64_t tile_i = 0, tile_j = 0;
+
+ Args(
+ const unsigned int n_tile_rows,
+ const unsigned int n_tile_cols,
+ const float *inptr,
+ int64_t ld_input_row,
+ int64_t ld_input_col,
+ float *outptr,
+ int64_t ld_output_row,
+ int64_t ld_output_col,
+ const void *params,
+ const float activation_min,
+ const float activation_max
+ ) : n_tile_rows(n_tile_rows), n_tile_cols(n_tile_cols), inptr(inptr),
+ ld_input_row(ld_input_row), ld_input_col(ld_input_col), outptr(outptr),
+ ld_output_row(ld_output_row), ld_output_col(ld_output_col),
+ params(params), min(activation_min), max(activation_max)
+ {
+ }
+ };
+
+ Args params_struct(
+ n_tile_rows, n_tile_cols,
+ inptr, ld_input_row, ld_input_col,
+ outptr, ld_output_row, ld_output_col,
+ params, activation_min, activation_max
+ );
+
+ // Hand-scheduled SVE assembly: label 1 is the outer tile loop, label 2 the
+ // whilelt-predicated channel loop (one vector of channels per iteration),
+ // label 3 the channel tail which also advances/wraps the tile indices.
+ __asm__ __volatile__(
+ "ptrue p3.b\n"
+ "mov x11, #0x0\n"
+ "mov x16, #0x0\n"
+ "1:" // Tile loop
+ "str x11, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "mov x25, #0x4\n"
+ "mov x24, #0x2\n"
+ "str x16, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "ldr x23, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
+ "ldr x15, [%x[params_struct], %[offsetof_args_ld_input_col]]\n"
+ "mul x22, x11, x23\n" // offset = tile_i * ld_input_row
+ "ldr x21, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
+ "madd x22, x16, x15, x22\n" // offset += tile_j * ld_input_col
+ "ldr x14, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
+ "cntw x13\n"
+ "mul x20, x11, x21\n" // offset = tile_i * ld_output_row
+ "ldr x12, [%x[params_struct], %[offsetof_args_inptr]]\n"
+ "ldr x11, [%x[params_struct], %[offsetof_args_params]]\n"
+ "add x10, x15, x15\n"
+ "mul x22, x22, x25\n" // offset *= kernel_stride * output_size
+ "add x12, x12, x22, LSL #2\n" // inptr[0] += offset * sizeof(float)
+ "ldr x9, [%x[params_struct], %[offsetof_args_outptr]]\n"
+ "add x28, x12, x23, LSL #2\n"
+ "madd x20, x16, x14, x20\n" // offset += tile_j * ld_output_col
+ "whilelt p2.s, XZR, %x[n_channels]\n"
+ "ld1w { z30.s }, p3/Z, [x11]\n"
+ "ld1w { z0.s }, p3/Z, [x11, #1, MUL VL]\n"
+ "mul x20, x20, x24\n" // offset *= output_tile_size
+ "ld1w { z1.s }, p3/Z, [x11, #2, MUL VL]\n"
+ "ld1w { z2.s }, p3/Z, [x11, #3, MUL VL]\n"
+ "add x27, x28, x23, LSL #2\n"
+ "ld1w { z3.s }, p3/Z, [x11, #4, MUL VL]\n"
+ "ld1w { z4.s }, p3/Z, [x11, #5, MUL VL]\n"
+ "add x26, x10, x15\n"
+ "add x25, x27, x23, LSL #2\n"
+ "ld1w { z5.s }, p3/Z, [x11, #6, MUL VL]\n"
+ "ld1w { z6.s }, p3/Z, [x11, #7, MUL VL]\n"
+ "addvl x11, x11, #16\n"
+ "add x24, x26, x15\n"
+ "add x9, x9, x20, LSL #2\n" // outptrs[0] += offset * sizeof(float)
+ "cmp x13, %x[n_channels]\n"
+ "ld1rw { z29.s }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+ "ld1rw { z28.s }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+ "add x23, x25, x23, LSL #2\n"
+ "add x22, x9, x21, LSL #2\n"
+ "ld1w { z7.s }, p3/Z, [x11, #-8, MUL VL]\n"
+ "ld1w { z8.s }, p3/Z, [x11, #-7, MUL VL]\n"
+ "mov x21, #0x0\n"
+ "sub x20, XZR, x13\n"
+ "ld1w { z9.s }, p2/Z, [x27, x10, LSL #2]\n"
+ "ld1w { z10.s }, p2/Z, [x12]\n"
+ "ld1w { z11.s }, p2/Z, [x12, x15, LSL #2]\n"
+ "ld1w { z12.s }, p2/Z, [x12, x26, LSL #2]\n"
+ "addvl x11, x11, #-6\n"
+ "ld1w { z13.s }, p2/Z, [x12, x24, LSL #2]\n"
+ "ld1w { z14.s }, p2/Z, [x28]\n"
+ "ld1w { z15.s }, p2/Z, [x28, x15, LSL #2]\n"
+ "ld1w { z16.s }, p2/Z, [x12, x10, LSL #2]\n"
+ "bge 3f\n"
+ "2:" // Tile loop: Channel loop
+ "movprfx z27, z30\n fmla z27.s, p3/M, z8.s, z9.s\n"
+ "movprfx z26, z30\n fmla z26.s, p3/M, z6.s, z9.s\n"
+ "whilelt p1.s, x13, %x[n_channels]\n"
+ "incw x21\n"
+ "fmla z27.s, p3/M, z0.s, z10.s\n"
+ "fmla z26.s, p3/M, z1.s, z12.s\n"
+ "ld1w { z20.s }, p2/Z, [x28, x24, LSL #2]\n"
+ "incw x13\n"
+ "fmla z27.s, p3/M, z1.s, z11.s\n"
+ "fmla z26.s, p3/M, z2.s, z13.s\n"
+ "ld1w { z17.s }, p2/Z, [x28, x26, LSL #2]\n"
+ "ld1w { z19.s }, p2/Z, [x28, x10, LSL #2]\n"
+ "fmla z27.s, p3/M, z3.s, z14.s\n"
+ "fmla z26.s, p3/M, z0.s, z16.s\n"
+ "ld1w { z18.s }, p2/Z, [x25]\n"
+ "mov p0.b, p2.b\n"
+ "fmla z27.s, p3/M, z4.s, z15.s\n"
+ "fmla z26.s, p3/M, z4.s, z17.s\n"
+ "ld1w { z25.s }, p2/Z, [x27]\n"
+ "ld1w { z17.s }, p2/Z, [x25, x15, LSL #2]\n"
+ "fmla z27.s, p3/M, z2.s, z16.s\n"
+ "fmla z26.s, p3/M, z5.s, z20.s\n"
+ "ld1w { z24.s }, p2/Z, [x27, x26, LSL #2]\n"
+ "ld1w { z23.s }, p2/Z, [x27, x15, LSL #2]\n"
+ "movprfx z22, z30\n fmla z22.s, p3/M, z2.s, z9.s\n"
+ "movprfx z21, z30\n fmla z21.s, p3/M, z0.s, z9.s\n"
+ "addvl x12, x12, #1\n"
+ "addvl x28, x28, #1\n"
+ "fmla z27.s, p3/M, z5.s, z19.s\n"
+ "fmla z26.s, p3/M, z3.s, z19.s\n"
+ "ld1w { z16.s }, p2/Z, [x25, x26, LSL #2]\n"
+ "ld1w { z30.s }, p3/Z, [x11]\n"
+ "fmla z22.s, p3/M, z3.s, z18.s\n"
+ "fmla z21.s, p3/M, z4.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x25, x24, LSL #2]\n"
+ "ld1w { z20.s }, p2/Z, [x23, x15, LSL #2]\n"
+ "fmla z22.s, p3/M, z0.s, z25.s\n"
+ "fmla z21.s, p3/M, z1.s, z24.s\n"
+ "ld1w { z0.s }, p3/Z, [x11, #1, MUL VL]\n"
+ "incw x20\n"
+ "fmla z22.s, p3/M, z4.s, z17.s\n"
+ "fmla z21.s, p3/M, z5.s, z16.s\n"
+ "ld1w { z19.s }, p2/Z, [x27, x24, LSL #2]\n"
+ "ld1w { z18.s }, p2/Z, [x23, x26, LSL #2]\n"
+ "fmla z27.s, p3/M, z6.s, z25.s\n"
+ "fmla z22.s, p3/M, z1.s, z23.s\n"
+ "ld1w { z17.s }, p2/Z, [x23]\n"
+ "addvl x27, x27, #1\n"
+ "fmla z21.s, p3/M, z2.s, z19.s\n"
+ "fmla z27.s, p3/M, z7.s, z23.s\n"
+ "ld1w { z16.s }, p2/Z, [x25, x10, LSL #2]\n"
+ "fmax z27.s, p3/M, z27.s, z29.s\n"
+ "fmla z22.s, p3/M, z6.s, z17.s\n"
+ "fmla z21.s, p3/M, z3.s, z16.s\n"
+ "ld1w { z17.s }, p2/Z, [x23, x10, LSL #2]\n"
+ "ld1w { z1.s }, p3/Z, [x11, #2, MUL VL]\n"
+ "fmla z22.s, p3/M, z7.s, z20.s\n"
+ "fmla z21.s, p3/M, z7.s, z18.s\n"
+ "ld1w { z2.s }, p3/Z, [x11, #3, MUL VL]\n"
+ "ld1w { z3.s }, p3/Z, [x11, #4, MUL VL]\n"
+ "fmla z26.s, p3/M, z7.s, z24.s\n"
+ "fmla z22.s, p3/M, z5.s, z16.s\n"
+ "ld1w { z4.s }, p3/Z, [x11, #5, MUL VL]\n"
+ "ld1w { z5.s }, p3/Z, [x11, #6, MUL VL]\n"
+ "fmla z21.s, p3/M, z6.s, z17.s\n"
+ "fmla z26.s, p3/M, z8.s, z19.s\n"
+ "ld1w { z16.s }, p2/Z, [x23, x24, LSL #2]\n"
+ "fmax z26.s, p3/M, z26.s, z29.s\n"
+ "fmla z22.s, p3/M, z8.s, z17.s\n"
+ "fmla z21.s, p3/M, z8.s, z16.s\n"
+ "fmax z22.s, p3/M, z22.s, z29.s\n"
+ "fmax z21.s, p3/M, z21.s, z29.s\n"
+ "ld1w { z6.s }, p3/Z, [x11, #7, MUL VL]\n"
+ "addvl x11, x11, #16\n"
+ "whilelt p2.s, x21, %x[n_channels]\n"
+ "ld1w { z9.s }, p1/Z, [x27, x10, LSL #2]\n"
+ "cmp x13, %x[n_channels]\n"
+ "fmin z27.s, p3/M, z27.s, z28.s\n"
+ "ld1w { z10.s }, p1/Z, [x12]\n"
+ "ld1w { z11.s }, p1/Z, [x12, x15, LSL #2]\n"
+ "fmin z26.s, p3/M, z26.s, z28.s\n"
+ "fmin z22.s, p3/M, z22.s, z28.s\n"
+ "ld1w { z12.s }, p1/Z, [x12, x26, LSL #2]\n"
+ "ld1w { z13.s }, p1/Z, [x12, x24, LSL #2]\n"
+ "fmin z21.s, p3/M, z21.s, z28.s\n"
+ "addvl x25, x25, #1\n"
+ "ld1w { z14.s }, p1/Z, [x28]\n"
+ "ld1w { z15.s }, p1/Z, [x28, x15, LSL #2]\n"
+ "addvl x23, x23, #1\n"
+ "ld1w { z16.s }, p1/Z, [x12, x10, LSL #2]\n"
+ "st1w { z27.s }, p0, [x9]\n"
+ "ld1w { z7.s }, p3/Z, [x11, #-8, MUL VL]\n"
+ "st1w { z26.s }, p0, [x9, x14, LSL #2]\n"
+ "addvl x9, x9, #1\n"
+ "ld1w { z8.s }, p3/Z, [x11, #-7, MUL VL]\n"
+ "addvl x11, x11, #-6\n"
+ "st1w { z22.s }, p0, [x22]\n"
+ "st1w { z21.s }, p0, [x22, x14, LSL #2]\n"
+ "addvl x22, x22, #1\n"
+ "blt 2b\n"
+ "3:" // Tile loop: Channel tail
+ "movprfx z27, z30\n fmla z27.s, p3/M, z8.s, z9.s\n"
+ "movprfx z26, z30\n fmla z26.s, p3/M, z6.s, z9.s\n"
+ "ldr x16, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "ldr x11, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "fmla z27.s, p3/M, z0.s, z10.s\n"
+ "fmla z26.s, p3/M, z1.s, z12.s\n"
+ "ld1w { z20.s }, p2/Z, [x28, x24, LSL #2]\n"
+ "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+ "fmla z27.s, p3/M, z1.s, z11.s\n"
+ "fmla z26.s, p3/M, z2.s, z13.s\n"
+ "ld1w { z17.s }, p2/Z, [x28, x26, LSL #2]\n"
+ "ld1w { z19.s }, p2/Z, [x28, x10, LSL #2]\n"
+ "fmla z27.s, p3/M, z3.s, z14.s\n"
+ "fmla z26.s, p3/M, z0.s, z16.s\n"
+ "ld1w { z18.s }, p2/Z, [x25]\n"
+ "add x16, x16, #0x1\n"
+ "fmla z27.s, p3/M, z4.s, z15.s\n"
+ "fmla z26.s, p3/M, z4.s, z17.s\n"
+ "ld1w { z25.s }, p2/Z, [x27]\n"
+ "ld1w { z17.s }, p2/Z, [x25, x15, LSL #2]\n"
+ "fmla z27.s, p3/M, z2.s, z16.s\n"
+ "fmla z26.s, p3/M, z5.s, z20.s\n"
+ "ld1w { z24.s }, p2/Z, [x27, x26, LSL #2]\n"
+ "ld1w { z23.s }, p2/Z, [x27, x15, LSL #2]\n"
+ "movprfx z22, z30\n fmla z22.s, p3/M, z2.s, z9.s\n"
+ "movprfx z21, z30\n fmla z21.s, p3/M, z0.s, z9.s\n"
+ "cmp x16, x20\n"
+ "add x21, x11, #0x1\n"
+ "fmla z27.s, p3/M, z5.s, z19.s\n"
+ "fmla z26.s, p3/M, z3.s, z19.s\n"
+ "ld1w { z16.s }, p2/Z, [x25, x26, LSL #2]\n"
+ "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
+ "fmla z22.s, p3/M, z3.s, z18.s\n"
+ "fmla z21.s, p3/M, z4.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x25, x24, LSL #2]\n"
+ "ld1w { z20.s }, p2/Z, [x23, x15, LSL #2]\n"
+ "fmla z22.s, p3/M, z0.s, z25.s\n"
+ "fmla z21.s, p3/M, z1.s, z24.s\n"
+ "csel x11, x11, x21, LT\n"
+ "mov p0.b, p2.b\n"
+ "fmla z22.s, p3/M, z4.s, z17.s\n"
+ "fmla z21.s, p3/M, z5.s, z16.s\n"
+ "ld1w { z19.s }, p2/Z, [x27, x24, LSL #2]\n"
+ "ld1w { z18.s }, p2/Z, [x23, x26, LSL #2]\n"
+ "fmla z27.s, p3/M, z6.s, z25.s\n"
+ "fmla z22.s, p3/M, z1.s, z23.s\n"
+ "ld1w { z17.s }, p2/Z, [x23]\n"
+ "csel x16, x16, XZR, LT\n"
+ "fmla z21.s, p3/M, z2.s, z19.s\n"
+ "fmla z27.s, p3/M, z7.s, z23.s\n"
+ "ld1w { z16.s }, p2/Z, [x25, x10, LSL #2]\n"
+ "fmax z27.s, p3/M, z27.s, z29.s\n"
+ "fmla z22.s, p3/M, z6.s, z17.s\n"
+ "fmla z21.s, p3/M, z3.s, z16.s\n"
+ "ld1w { z17.s }, p2/Z, [x23, x10, LSL #2]\n"
+ "cmp x11, x20\n"
+ "fmla z22.s, p3/M, z7.s, z20.s\n"
+ "fmla z21.s, p3/M, z7.s, z18.s\n"
+ "fmin z27.s, p3/M, z27.s, z28.s\n"
+ "st1w { z27.s }, p0, [x9]\n"
+ "fmla z26.s, p3/M, z7.s, z24.s\n"
+ "fmla z22.s, p3/M, z5.s, z16.s\n"
+ "fmla z21.s, p3/M, z6.s, z17.s\n"
+ "fmla z26.s, p3/M, z8.s, z19.s\n"
+ "ld1w { z16.s }, p2/Z, [x23, x24, LSL #2]\n"
+ "fmax z26.s, p3/M, z26.s, z29.s\n"
+ "fmla z22.s, p3/M, z8.s, z17.s\n"
+ "fmla z21.s, p3/M, z8.s, z16.s\n"
+ "fmax z22.s, p3/M, z22.s, z29.s\n"
+ "fmax z21.s, p3/M, z21.s, z29.s\n"
+ "fmin z26.s, p3/M, z26.s, z28.s\n"
+ "fmin z22.s, p3/M, z22.s, z28.s\n"
+ "st1w { z26.s }, p0, [x9, x14, LSL #2]\n"
+ "fmin z21.s, p3/M, z21.s, z28.s\n"
+ "st1w { z22.s }, p0, [x22]\n"
+ "st1w { z21.s }, p0, [x22, x14, LSL #2]\n"
+ "blt 1b\n"
+ :
+ : [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp
new file mode 100644
index 0000000000..98427701fa
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp
@@ -0,0 +1,337 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+// Indirect-addressing SVE kernel: the caller supplies one pointer per input
+// point of the 5x5 receptive field (25 pointers, so padding can be redirected
+// to a dummy row), and four output pointers for the 2x2 output tile.  The
+// channel loop is whilelt-predicated; results are clamped to [min, max].
+void sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl(
+ const float *const *const input_ptrs,
+ float *const *const outptrs,
+ const void *params,
+ unsigned int n_channels,
+ const float activation_min,
+ const float activation_max
+)
+{
+ // Argument block read by the assembly via offsetof(); the constructor
+ // permutes the caller's 25 input pointers into the order the assembly
+ // consumes them in, so this mapping must match the asm's load schedule.
+ struct Args
+ {
+ float *const *outptrs;
+ const void *params;
+ const float min, max;
+ const float *inptrs[25];
+
+ Args(
+ const float *const *const input_ptrs,
+ float *const *const outptrs,
+ const void *const params,
+ const float min,
+ const float max
+ ) : outptrs(outptrs), params(params), min(min), max(max)
+ {
+ inptrs[0] = input_ptrs[12];
+ inptrs[1] = input_ptrs[0];
+ inptrs[2] = input_ptrs[1];
+ inptrs[3] = input_ptrs[3];
+ inptrs[4] = input_ptrs[4];
+ inptrs[5] = input_ptrs[5];
+ inptrs[6] = input_ptrs[6];
+ inptrs[7] = input_ptrs[2];
+ inptrs[8] = input_ptrs[8];
+ inptrs[9] = input_ptrs[9];
+ inptrs[10] = input_ptrs[7];
+ inptrs[11] = input_ptrs[15];
+ inptrs[12] = input_ptrs[10];
+ inptrs[13] = input_ptrs[16];
+ inptrs[14] = input_ptrs[11];
+ inptrs[15] = input_ptrs[18];
+ inptrs[16] = input_ptrs[13];
+ inptrs[17] = input_ptrs[19];
+ inptrs[18] = input_ptrs[20];
+ inptrs[19] = input_ptrs[14];
+ inptrs[20] = input_ptrs[21];
+ inptrs[21] = input_ptrs[17];
+ inptrs[22] = input_ptrs[23];
+ inptrs[23] = input_ptrs[22];
+ inptrs[24] = input_ptrs[24];
+
+ }
+ };
+
+ Args params_struct(input_ptrs, outptrs, params,
+ activation_min, activation_max);
+
+ // Hand-scheduled SVE assembly: label 1 is the whilelt-predicated channel
+ // loop (one vector of channels per iteration), label 2 the channel tail.
+ __asm__ __volatile__(
+ "ptrue p3.b\n"
+ "ldr x20, [%x[params_struct], %[offsetof_args_outptrs]]\n"
+ "ldr x16, [%x[params_struct], %[offsetof_args_params]]\n"
+ "add x15, %x[params_struct], %[offsetof_Args_inptrs]\n"
+ "cntw x14\n"
+ "ldp x13, x12, [x20, #0x0]\n"
+ "ldp x11, x10, [x20, #0x10]\n"
+ "mov x9, #0x0\n"
+ "whilelt p2.s, XZR, %x[n_channels]\n"
+ "ld1w { z20.s }, p3/Z, [x16]\n"
+ "ld1w { z0.s }, p3/Z, [x16, #1, MUL VL]\n"
+ "cmp x14, %x[n_channels]\n"
+ "ld1w { z1.s }, p3/Z, [x16, #2, MUL VL]\n"
+ "ld1w { z2.s }, p3/Z, [x16, #3, MUL VL]\n"
+ "sub x28, XZR, x14\n"
+ "ld1w { z3.s }, p3/Z, [x16, #4, MUL VL]\n"
+ "ld1w { z4.s }, p3/Z, [x16, #5, MUL VL]\n"
+ "ld1w { z5.s }, p3/Z, [x16, #6, MUL VL]\n"
+ "ld1w { z6.s }, p3/Z, [x16, #7, MUL VL]\n"
+ "addvl x16, x16, #16\n"
+ "ldp x27, x26, [x15, #0x0]\n"
+ "ldp x25, x24, [x15, #0x10]\n"
+ "ldp x23, x22, [x15, #0x20]\n"
+ "ldp x21, x20, [x15, #0x30]\n"
+ "ld1rw { z26.s }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+ "ld1rw { z25.s }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+ "ld1w { z7.s }, p3/Z, [x16, #-8, MUL VL]\n"
+ "ld1w { z8.s }, p3/Z, [x16, #-7, MUL VL]\n"
+ "addvl x16, x16, #-6\n"
+ "ld1w { z9.s }, p2/Z, [x27, x9, LSL #2]\n"
+ "ld1w { z10.s }, p2/Z, [x26, x9, LSL #2]\n"
+ "ld1w { z11.s }, p2/Z, [x25, x9, LSL #2]\n"
+ "ld1w { z12.s }, p2/Z, [x24, x9, LSL #2]\n"
+ "ld1w { z13.s }, p2/Z, [x23, x9, LSL #2]\n"
+ "ld1w { z14.s }, p2/Z, [x22, x9, LSL #2]\n"
+ "ld1w { z15.s }, p2/Z, [x21, x9, LSL #2]\n"
+ "ld1w { z16.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "bge 2f\n"
+ "1:" // Channel loop
+ "movprfx z24, z20\n fmla z24.s, p3/M, z8.s, z9.s\n"
+ "movprfx z23, z20\n fmla z23.s, p3/M, z6.s, z9.s\n"
+ "ldr x21, [x15, #0x40]\n"
+ "ldr x20, [x15, #0x48]\n"
+ "fmla z24.s, p3/M, z0.s, z10.s\n"
+ "fmla z23.s, p3/M, z1.s, z12.s\n"
+ "ld1w { z18.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "ldr x20, [x15, #0x50]\n"
+ "fmla z24.s, p3/M, z1.s, z11.s\n"
+ "fmla z23.s, p3/M, z2.s, z13.s\n"
+ "ld1w { z17.s }, p2/Z, [x21, x9, LSL #2]\n"
+ "ld1w { z19.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "fmla z24.s, p3/M, z3.s, z14.s\n"
+ "fmla z23.s, p3/M, z0.s, z16.s\n"
+ "ldr x20, [x15, #0x58]\n"
+ "ldr x22, [x15, #0x78]\n"
+ "fmla z24.s, p3/M, z4.s, z15.s\n"
+ "fmla z23.s, p3/M, z4.s, z17.s\n"
+ "ld1w { z17.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "ldr x21, [x15, #0x60]\n"
+ "fmla z24.s, p3/M, z2.s, z16.s\n"
+ "fmla z23.s, p3/M, z5.s, z18.s\n"
+ "ldr x20, [x15, #0x80]\n"
+ "ld1w { z18.s }, p2/Z, [x21, x9, LSL #2]\n"
+ "movprfx z22, z20\n fmla z22.s, p3/M, z2.s, z9.s\n"
+ "movprfx z21, z20\n fmla z21.s, p3/M, z0.s, z9.s\n"
+ "ld1w { z20.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "ldr x21, [x15, #0x68]\n"
+ "fmla z24.s, p3/M, z5.s, z19.s\n"
+ "fmla z23.s, p3/M, z3.s, z19.s\n"
+ "ld1w { z16.s }, p2/Z, [x22, x9, LSL #2]\n"
+ "ldr x20, [x15, #0x88]\n"
+ "fmla z22.s, p3/M, z3.s, z17.s\n"
+ "fmla z21.s, p3/M, z4.s, z16.s\n"
+ "ld1w { z17.s }, p2/Z, [x21, x9, LSL #2]\n"
+ "ld1w { z16.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "fmla z22.s, p3/M, z0.s, z18.s\n"
+ "fmla z21.s, p3/M, z1.s, z20.s\n"
+ "ldr x21, [x15, #0x70]\n"
+ "ldr x20, [x15, #0x98]\n"
+ "fmla z22.s, p3/M, z4.s, z17.s\n"
+ "fmla z21.s, p3/M, z5.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x21, x9, LSL #2]\n"
+ "ld1w { z19.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "fmla z24.s, p3/M, z6.s, z18.s\n"
+ "ldr x21, [x15, #0x90]\n"
+ "ldr x20, [x15, #0xa8]\n"
+ "fmla z22.s, p3/M, z1.s, z16.s\n"
+ "fmla z21.s, p3/M, z2.s, z19.s\n"
+ "fmla z24.s, p3/M, z7.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x21, x9, LSL #2]\n"
+ "ld1w { z18.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "ldr x21, [x15, #0xa0]\n"
+ "ldr x20, [x15, #0xb0]\n"
+ "fmla z22.s, p3/M, z6.s, z16.s\n"
+ "fmla z21.s, p3/M, z3.s, z18.s\n"
+ "ld1w { z17.s }, p2/Z, [x21, x9, LSL #2]\n"
+ "ld1w { z16.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "fmla z22.s, p3/M, z7.s, z17.s\n"
+ "fmla z21.s, p3/M, z7.s, z16.s\n"
+ "ldr x20, [x15, #0xb8]\n"
+ "fmla z23.s, p3/M, z7.s, z20.s\n"
+ "ld1w { z17.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "fmla z22.s, p3/M, z5.s, z18.s\n"
+ "ldr x20, [x15, #0xc0]\n"
+ "fmla z21.s, p3/M, z6.s, z17.s\n"
+ "fmla z23.s, p3/M, z8.s, z19.s\n"
+ "ld1w { z16.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "fmla z22.s, p3/M, z8.s, z17.s\n"
+ "fmla z21.s, p3/M, z8.s, z16.s\n"
+ "whilelt p1.s, x14, %x[n_channels]\n"
+ "ldp x27, x26, [x15, #0x0]\n"
+ "ldp x25, x24, [x15, #0x10]\n"
+ "ldp x23, x22, [x15, #0x20]\n"
+ "incw x9\n"
+ "fmax z24.s, p3/M, z24.s, z26.s\n"
+ "ldp x21, x20, [x15, #0x30]\n"
+ "ld1w { z9.s }, p1/Z, [x27, x14, LSL #2]\n"
+ "fmax z23.s, p3/M, z23.s, z26.s\n"
+ "fmax z22.s, p3/M, z22.s, z26.s\n"
+ "ld1w { z10.s }, p1/Z, [x26, x14, LSL #2]\n"
+ "ld1w { z11.s }, p1/Z, [x25, x14, LSL #2]\n"
+ "fmax z21.s, p3/M, z21.s, z26.s\n"
+ "incw x28\n"
+ "ld1w { z12.s }, p1/Z, [x24, x14, LSL #2]\n"
+ "ld1w { z13.s }, p1/Z, [x23, x14, LSL #2]\n"
+ "mov p0.b, p2.b\n"
+ "whilelt p2.s, x9, %x[n_channels]\n"
+ "ld1w { z14.s }, p1/Z, [x22, x14, LSL #2]\n"
+ "ld1w { z15.s }, p1/Z, [x21, x14, LSL #2]\n"
+ "fmin z24.s, p3/M, z24.s, z25.s\n"
+ "fmin z23.s, p3/M, z23.s, z25.s\n"
+ "ld1w { z16.s }, p1/Z, [x20, x14, LSL #2]\n"
+ "incw x14\n"
+ "ld1w { z20.s }, p3/Z, [x16]\n"
+ "cmp x14, %x[n_channels]\n"
+ "ld1w { z0.s }, p3/Z, [x16, #1, MUL VL]\n"
+ "ld1w { z1.s }, p3/Z, [x16, #2, MUL VL]\n"
+ "fmin z22.s, p3/M, z22.s, z25.s\n"
+ "fmin z21.s, p3/M, z21.s, z25.s\n"
+ "ld1w { z2.s }, p3/Z, [x16, #3, MUL VL]\n"
+ "ld1w { z3.s }, p3/Z, [x16, #4, MUL VL]\n"
+ "st1w { z24.s }, p0, [x13, x28, LSL #2]\n"
+ "ld1w { z4.s }, p3/Z, [x16, #5, MUL VL]\n"
+ "ld1w { z5.s }, p3/Z, [x16, #6, MUL VL]\n"
+ "st1w { z23.s }, p0, [x12, x28, LSL #2]\n"
+ "ld1w { z6.s }, p3/Z, [x16, #7, MUL VL]\n"
+ "addvl x16, x16, #16\n"
+ "st1w { z22.s }, p0, [x11, x28, LSL #2]\n"
+ "ld1w { z7.s }, p3/Z, [x16, #-8, MUL VL]\n"
+ "st1w { z21.s }, p0, [x10, x28, LSL #2]\n"
+ "ld1w { z8.s }, p3/Z, [x16, #-7, MUL VL]\n"
+ "addvl x16, x16, #-6\n"
+ "blt 1b\n"
+ "2:" // Channel tail
+ "movprfx z24, z20\n fmla z24.s, p3/M, z8.s, z9.s\n"
+ "movprfx z23, z20\n fmla z23.s, p3/M, z6.s, z9.s\n"
+ "ldr x21, [x15, #0x40]\n"
+ "ldr x20, [x15, #0x48]\n"
+ "fmla z24.s, p3/M, z0.s, z10.s\n"
+ "fmla z23.s, p3/M, z1.s, z12.s\n"
+ "ld1w { z18.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "ldr x20, [x15, #0x50]\n"
+ "fmla z24.s, p3/M, z1.s, z11.s\n"
+ "fmla z23.s, p3/M, z2.s, z13.s\n"
+ "ld1w { z17.s }, p2/Z, [x21, x9, LSL #2]\n"
+ "ld1w { z19.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "fmla z24.s, p3/M, z3.s, z14.s\n"
+ "fmla z23.s, p3/M, z0.s, z16.s\n"
+ "ldr x20, [x15, #0x58]\n"
+ "ldr x22, [x15, #0x78]\n"
+ "fmla z24.s, p3/M, z4.s, z15.s\n"
+ "fmla z23.s, p3/M, z4.s, z17.s\n"
+ "ld1w { z17.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "ldr x21, [x15, #0x60]\n"
+ "fmla z24.s, p3/M, z2.s, z16.s\n"
+ "fmla z23.s, p3/M, z5.s, z18.s\n"
+ "ldr x20, [x15, #0x80]\n"
+ "ld1w { z18.s }, p2/Z, [x21, x9, LSL #2]\n"
+ "movprfx z22, z20\n fmla z22.s, p3/M, z2.s, z9.s\n"
+ "movprfx z21, z20\n fmla z21.s, p3/M, z0.s, z9.s\n"
+ "ld1w { z20.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "ldr x21, [x15, #0x68]\n"
+ "fmla z24.s, p3/M, z5.s, z19.s\n"
+ "fmla z23.s, p3/M, z3.s, z19.s\n"
+ "ld1w { z16.s }, p2/Z, [x22, x9, LSL #2]\n"
+ "ldr x20, [x15, #0x88]\n"
+ "fmla z22.s, p3/M, z3.s, z17.s\n"
+ "fmla z21.s, p3/M, z4.s, z16.s\n"
+ "ld1w { z17.s }, p2/Z, [x21, x9, LSL #2]\n"
+ "ld1w { z16.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "fmla z22.s, p3/M, z0.s, z18.s\n"
+ "fmla z21.s, p3/M, z1.s, z20.s\n"
+ "ldr x21, [x15, #0x70]\n"
+ "ldr x20, [x15, #0x98]\n"
+ "fmla z22.s, p3/M, z4.s, z17.s\n"
+ "fmla z21.s, p3/M, z5.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x21, x9, LSL #2]\n"
+ "ld1w { z19.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "fmla z24.s, p3/M, z6.s, z18.s\n"
+ "ldr x21, [x15, #0x90]\n"
+ "ldr x20, [x15, #0xa8]\n"
+ "fmla z22.s, p3/M, z1.s, z16.s\n"
+ "fmla z21.s, p3/M, z2.s, z19.s\n"
+ "fmla z24.s, p3/M, z7.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x21, x9, LSL #2]\n"
+ "ld1w { z18.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "ldr x21, [x15, #0xa0]\n"
+ "ldr x20, [x15, #0xb0]\n"
+ "fmla z22.s, p3/M, z6.s, z16.s\n"
+ "fmla z21.s, p3/M, z3.s, z18.s\n"
+ "ld1w { z17.s }, p2/Z, [x21, x9, LSL #2]\n"
+ "ld1w { z16.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "fmla z22.s, p3/M, z7.s, z17.s\n"
+ "fmla z21.s, p3/M, z7.s, z16.s\n"
+ "ldr x20, [x15, #0xb8]\n"
+ "fmla z23.s, p3/M, z7.s, z20.s\n"
+ "ld1w { z17.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "fmla z22.s, p3/M, z5.s, z18.s\n"
+ "ldr x20, [x15, #0xc0]\n"
+ "fmla z21.s, p3/M, z6.s, z17.s\n"
+ "fmla z23.s, p3/M, z8.s, z19.s\n"
+ "ld1w { z16.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "fmla z22.s, p3/M, z8.s, z17.s\n"
+ "fmla z21.s, p3/M, z8.s, z16.s\n"
+ "incw x28\n"
+ "mov p0.b, p2.b\n"
+ "fmax z24.s, p3/M, z24.s, z26.s\n"
+ "fmax z23.s, p3/M, z23.s, z26.s\n"
+ "fmax z22.s, p3/M, z22.s, z26.s\n"
+ "fmax z21.s, p3/M, z21.s, z26.s\n"
+ "fmin z24.s, p3/M, z24.s, z25.s\n"
+ "fmin z23.s, p3/M, z23.s, z25.s\n"
+ "st1w { z24.s }, p0, [x13, x28, LSL #2]\n"
+ "fmin z22.s, p3/M, z22.s, z25.s\n"
+ "fmin z21.s, p3/M, z21.s, z25.s\n"
+ "st1w { z23.s }, p0, [x12, x28, LSL #2]\n"
+ "st1w { z22.s }, p0, [x11, x28, LSL #2]\n"
+ "st1w { z21.s }, p0, [x10, x28, LSL #2]\n"
+ :
+ : [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
new file mode 100644
index 0000000000..ae89a64c6b
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst_indirect_impl(const float *const *const input_ptrs, float *const *const outptrs, const void *params, unsigned int n_channels, const float activation_min, const float activation_max);
+void sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl(const unsigned int n_tile_rows, const unsigned int n_tile_cols, const float *inptr, int64_t ld_input_row, int64_t ld_input_col, float *outptr, int64_t ld_output_row, int64_t ld_output_col, const void *params, unsigned int n_channels, const float activation_min, const float activation_max);
+
+// Depthfirst strategy descriptor for the SVE FP32 NHWC 5x5 stride-1 depthwise
+// convolution kernel that produces a 2x2 output tile per call.  It publishes
+// the two hand-written kernel entry points (indirect- and direct-addressing
+// variants) plus the compile-time tile geometry consumed by the driver.
+class sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy<float, float, float, float>
+{
+ private:
+ using Parent = DepthwiseDepthfirstStrategy<float, float, float, float>;
+ // Variant addressing the input through an array of per-point pointers,
+ // which lets the caller substitute padding rows/columns.
+ Parent::IndirectKernelType m_indirect_kernel = sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst_indirect_impl;
+ // Variant addressing the input directly via row/column strides (no padding).
+ Parent::DirectKernelType m_direct_kernel = sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl;
+
+ public:
+ using return_type = float;
+ constexpr static auto vl_type = arm_gemm::VLType::SVE;
+
+ // 5x5 convolution window.
+ constexpr static unsigned int kernel_rows = 5;
+ constexpr static unsigned int kernel_cols = 5;
+
+ // Stride of 1 in both spatial dimensions.
+ constexpr static unsigned int stride_rows = 1;
+ constexpr static unsigned int stride_cols = 1;
+
+ // Each kernel invocation computes a 2x2 spatial tile of outputs.
+ constexpr static unsigned int output_rows = 2;
+ constexpr static unsigned int output_cols = 2;
+
+ sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst(const CPUInfo *)
+ : Parent(output_rows, output_cols, kernel_rows, kernel_cols, stride_rows, stride_cols) {}
+
+ arm_gemm::VLType get_vl_type(void) const override { return vl_type; }
+
+ Parent::IndirectKernelType get_indirect_kernel() const override { return m_indirect_kernel; }
+ Parent::DirectKernelType get_direct_kernel() const override { return m_direct_kernel; }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_direct.cpp
new file mode 100644
index 0000000000..075181a488
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_direct.cpp
@@ -0,0 +1,523 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+// Direct ("tiled") variant of the SVE FP32 NHWC 5x5 stride-1 depthwise
+// kernel producing a 2x2 output tile per tile iteration.  Walks an
+// n_tile_rows x n_tile_cols grid over the input, applying the 5x5
+// multiply-accumulate stencil one SVE vector of channels at a time and
+// clamping results to [activation_min, activation_max].  The ld_* values
+// are element strides (scaled by sizeof(float) inside the asm); `params`
+// points at the packed bias + weight vectors that the kernel consumes
+// sequentially.  NOTE: machine-generated inline assembly -- regenerate
+// rather than hand-edit the asm body.
+void sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl(
+ const unsigned int n_tile_rows,
+ const unsigned int n_tile_cols,
+ const float *inptr,
+ int64_t ld_input_row,
+ int64_t ld_input_col,
+ float *outptr,
+ int64_t ld_output_row,
+ int64_t ld_output_col,
+ const void *params,
+ unsigned int n_channels,
+ const float activation_min,
+ const float activation_max
+)
+{
+ // Marshals the kernel arguments into a single struct so the asm can
+ // address them with compile-time offsetof() offsets; tile_i/tile_j are
+ // the tile-grid counters kept live across asm iterations.
+ struct Args
+ {
+ const uint64_t n_tile_rows, n_tile_cols;
+ const float *inptr;
+ const uint64_t ld_input_row;
+ const uint64_t ld_input_col;
+ float *outptr;
+ const uint64_t ld_output_row;
+ const uint64_t ld_output_col;
+ const void *params;
+ const float min, max;
+
+ uint64_t tile_i = 0, tile_j = 0;
+
+ Args(
+ const unsigned int n_tile_rows,
+ const unsigned int n_tile_cols,
+ const float *inptr,
+ int64_t ld_input_row,
+ int64_t ld_input_col,
+ float *outptr,
+ int64_t ld_output_row,
+ int64_t ld_output_col,
+ const void *params,
+ const float activation_min,
+ const float activation_max
+ ) : n_tile_rows(n_tile_rows), n_tile_cols(n_tile_cols), inptr(inptr),
+ ld_input_row(ld_input_row), ld_input_col(ld_input_col), outptr(outptr),
+ ld_output_row(ld_output_row), ld_output_col(ld_output_col),
+ params(params), min(activation_min), max(activation_max)
+ {
+ }
+ };
+
+ Args params_struct(
+ n_tile_rows, n_tile_cols,
+ inptr, ld_input_row, ld_input_col,
+ outptr, ld_output_row, ld_output_col,
+ params, activation_min, activation_max
+ );
+
+ // z15/z28 hold the activation min/max broadcasts, z29 the bias, and
+ // z0-z4 (reloaded in groups) the weights; p2/p3 are the active-channel
+ // and all-true predicates respectively.
+ __asm__ __volatile__(
+ "ptrue p3.b\n"
+ "mov x12, #0x0\n"
+ "mov x8, #0x0\n"
+ // Label 1: outer loop over the (tile_i, tile_j) output-tile grid;
+ // computes input/output base pointers for the current tile, then
+ // preloads the first input rows and weight vectors.
+ "1:" // Tile loop
+ "str x12, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "mov x25, #0x2\n"
+ "mov x24, #0x2\n"
+ "str x8, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "ldr x23, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
+ "ldr x17, [%x[params_struct], %[offsetof_args_ld_input_col]]\n"
+ "mul x22, x12, x23\n" // offset = tile_i * ld_input_row
+ "ldr x21, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
+ "madd x22, x8, x17, x22\n" // offset += tile_j * ld_input_col
+ "ldr x16, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
+ "add x15, x17, x17\n"
+ "mul x20, x12, x21\n" // offset = tile_i * ld_output_row
+ "ldr x14, [%x[params_struct], %[offsetof_args_inptr]]\n"
+ "ldr x13, [%x[params_struct], %[offsetof_args_outptr]]\n"
+ "cntw x12\n"
+ "mul x22, x22, x25\n" // offset *= kernel_stride * output_size
+ "add x14, x14, x22, LSL #2\n" // inptr[0] += offset * sizeof(float)
+ "add x11, x14, x23, LSL #2\n"
+ "ldr x10, [%x[params_struct], %[offsetof_args_params]]\n"
+ "madd x20, x8, x16, x20\n" // offset += tile_j * ld_output_col
+ "add x9, x11, x23, LSL #2\n"
+ "add x28, x15, x17\n"
+ "ld1rw { z15.s }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+ "mul x20, x20, x24\n" // offset *= output_tile_size
+ "whilelt p2.s, XZR, %x[n_channels]\n"
+ "add x27, x9, x23, LSL #2\n"
+ "ld1rw { z28.s }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+ "add x26, x28, x17\n"
+ "add x25, x27, x23, LSL #2\n"
+ "ld1w { z29.s }, p3/Z, [x10]\n"
+ "ld1w { z0.s }, p3/Z, [x10, #1, MUL VL]\n"
+ "add x24, x26, x17\n"
+ "add x13, x13, x20, LSL #2\n" // outptrs[0] += offset * sizeof(float)
+ "ld1w { z1.s }, p3/Z, [x10, #2, MUL VL]\n"
+ "ld1w { z2.s }, p3/Z, [x10, #3, MUL VL]\n"
+ "cmp x12, %x[n_channels]\n"
+ "add x23, x25, x23, LSL #2\n"
+ "ld1w { z3.s }, p3/Z, [x10, #4, MUL VL]\n"
+ "ld1w { z4.s }, p3/Z, [x10, #5, MUL VL]\n"
+ "add x22, x13, x21, LSL #2\n"
+ "mov x21, #0x0\n"
+ "ld1w { z5.s }, p2/Z, [x14]\n"
+ "ld1w { z6.s }, p2/Z, [x14, x17, LSL #2]\n"
+ "sub x20, XZR, x12\n"
+ "ld1w { z7.s }, p2/Z, [x11]\n"
+ "ld1w { z8.s }, p2/Z, [x11, x17, LSL #2]\n"
+ "addvl x10, x10, #6\n"
+ "ld1w { z9.s }, p2/Z, [x14, x15, LSL #2]\n"
+ "ld1w { z13.s }, p2/Z, [x11, x15, LSL #2]\n"
+ "ld1w { z11.s }, p2/Z, [x14, x28, LSL #2]\n"
+ "ld1w { z12.s }, p2/Z, [x14, x26, LSL #2]\n"
+ "ld1w { z10.s }, p2/Z, [x11, x24, LSL #2]\n"
+ "ld1w { z14.s }, p2/Z, [x9]\n"
+ "bge 3f\n"
+ // Label 2: steady-state channel loop -- one full SVE vector of
+ // channels per iteration, accumulating the 25 taps into the four
+ // output accumulators (one per output pixel of the 2x2 tile) while
+ // prefetching the next iteration's inputs under predicate p1.
+ "2:" // Tile loop: Channel loop
+ "movprfx z27, z29\n fmla z27.s, p3/M, z0.s, z5.s\n"
+ "movprfx z31, z29\n fmla z31.s, p3/M, z0.s, z6.s\n"
+ "ld1w { z24.s }, p2/Z, [x11, x28, LSL #2]\n"
+ "whilelt p1.s, x12, %x[n_channels]\n"
+ "movprfx z26, z29\n fmla z26.s, p3/M, z0.s, z7.s\n"
+ "movprfx z30, z29\n fmla z30.s, p3/M, z0.s, z8.s\n"
+ "ld1w { z18.s }, p3/Z, [x10]\n"
+ "incw x21\n"
+ "fmla z27.s, p3/M, z1.s, z6.s\n"
+ "fmla z31.s, p3/M, z1.s, z9.s\n"
+ "ld1w { z23.s }, p2/Z, [x11, x26, LSL #2]\n"
+ "incw x12\n"
+ "fmla z26.s, p3/M, z1.s, z8.s\n"
+ "fmla z30.s, p3/M, z1.s, z13.s\n"
+ "ld1w { z22.s }, p3/Z, [x10, #1, MUL VL]\n"
+ "mov p0.b, p2.b\n"
+ "fmla z27.s, p3/M, z2.s, z9.s\n"
+ "fmla z31.s, p3/M, z2.s, z11.s\n"
+ "ld1w { z16.s }, p2/Z, [x14, x24, LSL #2]\n"
+ "addvl x14, x14, #1\n"
+ "fmla z26.s, p3/M, z2.s, z13.s\n"
+ "fmla z30.s, p3/M, z2.s, z24.s\n"
+ "ld1w { z20.s }, p3/Z, [x10, #2, MUL VL]\n"
+ "addvl x11, x11, #1\n"
+ "fmla z27.s, p3/M, z3.s, z11.s\n"
+ "fmla z31.s, p3/M, z3.s, z12.s\n"
+ "ld1w { z0.s }, p2/Z, [x9, x17, LSL #2]\n"
+ "incw x20\n"
+ "fmla z26.s, p3/M, z3.s, z24.s\n"
+ "fmla z30.s, p3/M, z3.s, z23.s\n"
+ "ld1w { z17.s }, p3/Z, [x10, #3, MUL VL]\n"
+ "fmla z27.s, p3/M, z4.s, z12.s\n"
+ "fmla z31.s, p3/M, z4.s, z16.s\n"
+ "ld1w { z19.s }, p2/Z, [x9, x15, LSL #2]\n"
+ "ld1w { z5.s }, p2/Z, [x9, x28, LSL #2]\n"
+ "fmla z26.s, p3/M, z4.s, z23.s\n"
+ "fmla z30.s, p3/M, z4.s, z10.s\n"
+ "ld1w { z21.s }, p3/Z, [x10, #4, MUL VL]\n"
+ "fmla z27.s, p3/M, z18.s, z7.s\n"
+ "fmla z31.s, p3/M, z18.s, z8.s\n"
+ "ld1w { z7.s }, p1/Z, [x11]\n"
+ "fmla z26.s, p3/M, z18.s, z14.s\n"
+ "fmla z30.s, p3/M, z18.s, z0.s\n"
+ "ld1w { z18.s }, p3/Z, [x10, #5, MUL VL]\n"
+ "fmla z27.s, p3/M, z22.s, z8.s\n"
+ "fmla z31.s, p3/M, z22.s, z13.s\n"
+ "ld1w { z3.s }, p2/Z, [x9, x24, LSL #2]\n"
+ "fmla z26.s, p3/M, z22.s, z0.s\n"
+ "fmla z30.s, p3/M, z22.s, z19.s\n"
+ "ld1w { z8.s }, p3/Z, [x10, #6, MUL VL]\n"
+ "fmla z27.s, p3/M, z20.s, z13.s\n"
+ "fmla z31.s, p3/M, z20.s, z24.s\n"
+ "ld1w { z2.s }, p2/Z, [x9, x26, LSL #2]\n"
+ "addvl x9, x9, #1\n"
+ "fmla z26.s, p3/M, z20.s, z19.s\n"
+ "fmla z30.s, p3/M, z20.s, z5.s\n"
+ "ld1w { z16.s }, p3/Z, [x10, #7, MUL VL]\n"
+ "addvl x10, x10, #16\n"
+ "fmla z27.s, p3/M, z17.s, z24.s\n"
+ "fmla z31.s, p3/M, z17.s, z23.s\n"
+ "ld1w { z25.s }, p2/Z, [x27]\n"
+ "ld1w { z29.s }, p3/Z, [x10, #4, MUL VL]\n"
+ "fmla z26.s, p3/M, z17.s, z5.s\n"
+ "fmla z30.s, p3/M, z17.s, z2.s\n"
+ "ld1w { z17.s }, p3/Z, [x10, #-8, MUL VL]\n"
+ "fmla z27.s, p3/M, z21.s, z23.s\n"
+ "fmla z31.s, p3/M, z21.s, z10.s\n"
+ "ld1w { z24.s }, p2/Z, [x27, x17, LSL #2]\n"
+ "ld1w { z22.s }, p2/Z, [x27, x15, LSL #2]\n"
+ "fmla z26.s, p3/M, z21.s, z2.s\n"
+ "fmla z30.s, p3/M, z21.s, z3.s\n"
+ "ld1w { z21.s }, p3/Z, [x10, #-7, MUL VL]\n"
+ "fmla z27.s, p3/M, z18.s, z14.s\n"
+ "fmla z31.s, p3/M, z18.s, z0.s\n"
+ "ld1w { z1.s }, p2/Z, [x27, x24, LSL #2]\n"
+ "fmla z26.s, p3/M, z18.s, z25.s\n"
+ "fmla z30.s, p3/M, z18.s, z24.s\n"
+ "ld1w { z23.s }, p3/Z, [x10, #-6, MUL VL]\n"
+ "fmla z27.s, p3/M, z8.s, z0.s\n"
+ "fmla z31.s, p3/M, z8.s, z19.s\n"
+ "ld1w { z0.s }, p2/Z, [x27, x28, LSL #2]\n"
+ "fmla z26.s, p3/M, z8.s, z24.s\n"
+ "fmla z30.s, p3/M, z8.s, z22.s\n"
+ "ld1w { z20.s }, p3/Z, [x10, #-5, MUL VL]\n"
+ "fmla z27.s, p3/M, z16.s, z19.s\n"
+ "fmla z31.s, p3/M, z16.s, z5.s\n"
+ "ld1w { z19.s }, p2/Z, [x27, x26, LSL #2]\n"
+ "addvl x27, x27, #1\n"
+ "fmla z26.s, p3/M, z16.s, z22.s\n"
+ "fmla z30.s, p3/M, z16.s, z0.s\n"
+ "ld1w { z18.s }, p3/Z, [x10, #-4, MUL VL]\n"
+ "fmla z27.s, p3/M, z17.s, z5.s\n"
+ "fmla z31.s, p3/M, z17.s, z2.s\n"
+ "ld1w { z16.s }, p2/Z, [x25]\n"
+ "fmla z26.s, p3/M, z17.s, z0.s\n"
+ "fmla z30.s, p3/M, z17.s, z19.s\n"
+ "ld1w { z17.s }, p3/Z, [x10, #-3, MUL VL]\n"
+ "fmla z27.s, p3/M, z21.s, z2.s\n"
+ "fmla z31.s, p3/M, z21.s, z3.s\n"
+ "ld1w { z4.s }, p2/Z, [x25, x17, LSL #2]\n"
+ "ld1w { z8.s }, p2/Z, [x25, x26, LSL #2]\n"
+ "fmla z26.s, p3/M, z21.s, z19.s\n"
+ "fmla z30.s, p3/M, z21.s, z1.s\n"
+ "ld1w { z13.s }, p3/Z, [x10, #-2, MUL VL]\n"
+ "fmla z27.s, p3/M, z23.s, z25.s\n"
+ "fmla z31.s, p3/M, z23.s, z24.s\n"
+ "ld1w { z25.s }, p2/Z, [x25, x15, LSL #2]\n"
+ "fmla z26.s, p3/M, z23.s, z16.s\n"
+ "fmla z30.s, p3/M, z23.s, z4.s\n"
+ "ld1w { z5.s }, p3/Z, [x10, #-1, MUL VL]\n"
+ "fmla z27.s, p3/M, z20.s, z24.s\n"
+ "fmla z31.s, p3/M, z20.s, z22.s\n"
+ "ld1w { z24.s }, p2/Z, [x25, x28, LSL #2]\n"
+ "fmla z26.s, p3/M, z20.s, z4.s\n"
+ "fmla z30.s, p3/M, z20.s, z25.s\n"
+ "ld1w { z23.s }, p3/Z, [x10]\n"
+ "fmla z27.s, p3/M, z18.s, z22.s\n"
+ "fmla z31.s, p3/M, z18.s, z0.s\n"
+ "ld1w { z22.s }, p2/Z, [x25, x24, LSL #2]\n"
+ "addvl x25, x25, #1\n"
+ "fmla z26.s, p3/M, z18.s, z25.s\n"
+ "fmla z30.s, p3/M, z18.s, z24.s\n"
+ "ld1w { z21.s }, p3/Z, [x10, #1, MUL VL]\n"
+ "fmla z27.s, p3/M, z17.s, z0.s\n"
+ "fmla z31.s, p3/M, z17.s, z19.s\n"
+ "ld1w { z18.s }, p2/Z, [x23]\n"
+ "fmla z26.s, p3/M, z17.s, z24.s\n"
+ "fmla z30.s, p3/M, z17.s, z8.s\n"
+ "ld1w { z20.s }, p3/Z, [x10, #2, MUL VL]\n"
+ "fmla z27.s, p3/M, z13.s, z19.s\n"
+ "fmla z31.s, p3/M, z13.s, z1.s\n"
+ "ld1w { z17.s }, p2/Z, [x23, x17, LSL #2]\n"
+ "ld1w { z14.s }, p1/Z, [x9]\n"
+ "fmla z26.s, p3/M, z13.s, z8.s\n"
+ "fmla z30.s, p3/M, z13.s, z22.s\n"
+ "ld1w { z19.s }, p3/Z, [x10, #3, MUL VL]\n"
+ "fmla z27.s, p3/M, z5.s, z16.s\n"
+ "fmla z31.s, p3/M, z5.s, z4.s\n"
+ "ld1w { z16.s }, p2/Z, [x23, x15, LSL #2]\n"
+ "fmla z26.s, p3/M, z5.s, z18.s\n"
+ "fmla z30.s, p3/M, z5.s, z17.s\n"
+ "ld1w { z18.s }, p2/Z, [x23, x28, LSL #2]\n"
+ "ld1w { z0.s }, p3/Z, [x10, #5, MUL VL]\n"
+ "fmla z27.s, p3/M, z23.s, z4.s\n"
+ "fmla z31.s, p3/M, z23.s, z25.s\n"
+ "ld1w { z13.s }, p1/Z, [x11, x15, LSL #2]\n"
+ "fmla z26.s, p3/M, z23.s, z17.s\n"
+ "fmla z30.s, p3/M, z23.s, z16.s\n"
+ "ld1w { z17.s }, p2/Z, [x23, x26, LSL #2]\n"
+ "ld1w { z1.s }, p3/Z, [x10, #6, MUL VL]\n"
+ "fmla z27.s, p3/M, z21.s, z25.s\n"
+ "fmla z31.s, p3/M, z21.s, z24.s\n"
+ "ld1w { z5.s }, p1/Z, [x14]\n"
+ "fmla z26.s, p3/M, z21.s, z16.s\n"
+ "fmla z30.s, p3/M, z21.s, z18.s\n"
+ "ld1w { z16.s }, p2/Z, [x23, x24, LSL #2]\n"
+ "ld1w { z2.s }, p3/Z, [x10, #7, MUL VL]\n"
+ "fmla z27.s, p3/M, z20.s, z24.s\n"
+ "fmla z31.s, p3/M, z20.s, z8.s\n"
+ "addvl x10, x10, #16\n"
+ "whilelt p2.s, x21, %x[n_channels]\n"
+ "fmla z26.s, p3/M, z20.s, z18.s\n"
+ "fmla z30.s, p3/M, z20.s, z17.s\n"
+ "cmp x12, %x[n_channels]\n"
+ "addvl x23, x23, #1\n"
+ "fmla z27.s, p3/M, z19.s, z8.s\n"
+ "fmla z31.s, p3/M, z19.s, z22.s\n"
+ "fmax z27.s, p3/M, z27.s, z15.s\n"
+ "fmax z31.s, p3/M, z31.s, z15.s\n"
+ "fmla z26.s, p3/M, z19.s, z17.s\n"
+ "fmla z30.s, p3/M, z19.s, z16.s\n"
+ "fmax z26.s, p3/M, z26.s, z15.s\n"
+ "fmax z30.s, p3/M, z30.s, z15.s\n"
+ "fmin z27.s, p3/M, z27.s, z28.s\n"
+ "fmin z31.s, p3/M, z31.s, z28.s\n"
+ "ld1w { z6.s }, p1/Z, [x14, x17, LSL #2]\n"
+ "ld1w { z8.s }, p1/Z, [x11, x17, LSL #2]\n"
+ "fmin z26.s, p3/M, z26.s, z28.s\n"
+ "fmin z30.s, p3/M, z30.s, z28.s\n"
+ "ld1w { z9.s }, p1/Z, [x14, x15, LSL #2]\n"
+ "ld1w { z11.s }, p1/Z, [x14, x28, LSL #2]\n"
+ "ld1w { z12.s }, p1/Z, [x14, x26, LSL #2]\n"
+ "ld1w { z10.s }, p1/Z, [x11, x24, LSL #2]\n"
+ "st1w { z27.s }, p0, [x13]\n"
+ "st1w { z31.s }, p0, [x13, x16, LSL #2]\n"
+ "addvl x13, x13, #1\n"
+ "ld1w { z3.s }, p3/Z, [x10, #-8, MUL VL]\n"
+ "ld1w { z4.s }, p3/Z, [x10, #-7, MUL VL]\n"
+ "st1w { z26.s }, p0, [x22]\n"
+ "addvl x10, x10, #-6\n"
+ "st1w { z30.s }, p0, [x22, x16, LSL #2]\n"
+ "addvl x22, x22, #1\n"
+ "blt 2b\n"
+ // Label 3: channel tail -- final (possibly partial) vector of
+ // channels; also advances tile_j/tile_i with csel wrap-around and
+ // falls through to branch back to the tile loop (label 1) while
+ // tiles remain.
+ "3:" // Tile loop: Channel tail
+ "movprfx z30, z29\n fmla z30.s, p3/M, z0.s, z5.s\n"
+ "movprfx z31, z29\n fmla z31.s, p3/M, z0.s, z6.s\n"
+ "ld1w { z22.s }, p2/Z, [x11, x28, LSL #2]\n"
+ "ldr x8, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "movprfx z5, z29\n fmla z5.s, p3/M, z0.s, z7.s\n"
+ "fmla z29.s, p3/M, z0.s, z8.s\n"
+ "ld1w { z20.s }, p3/Z, [x10]\n"
+ "ldr x12, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "fmla z30.s, p3/M, z1.s, z6.s\n"
+ "fmla z31.s, p3/M, z1.s, z9.s\n"
+ "ld1w { z6.s }, p2/Z, [x11, x26, LSL #2]\n"
+ "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+ "fmla z5.s, p3/M, z1.s, z8.s\n"
+ "fmla z29.s, p3/M, z1.s, z13.s\n"
+ "ld1w { z19.s }, p3/Z, [x10, #1, MUL VL]\n"
+ "add x8, x8, #0x1\n"
+ "fmla z30.s, p3/M, z2.s, z9.s\n"
+ "fmla z31.s, p3/M, z2.s, z11.s\n"
+ "ld1w { z16.s }, p2/Z, [x14, x24, LSL #2]\n"
+ "cmp x8, x20\n"
+ "fmla z5.s, p3/M, z2.s, z13.s\n"
+ "fmla z29.s, p3/M, z2.s, z22.s\n"
+ "ld1w { z18.s }, p3/Z, [x10, #2, MUL VL]\n"
+ "add x21, x12, #0x1\n"
+ "fmla z30.s, p3/M, z3.s, z11.s\n"
+ "fmla z31.s, p3/M, z3.s, z12.s\n"
+ "ld1w { z1.s }, p2/Z, [x9, x17, LSL #2]\n"
+ "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
+ "fmla z5.s, p3/M, z3.s, z22.s\n"
+ "fmla z29.s, p3/M, z3.s, z6.s\n"
+ "ld1w { z17.s }, p3/Z, [x10, #3, MUL VL]\n"
+ "csel x12, x12, x21, LT\n"
+ "fmla z30.s, p3/M, z4.s, z12.s\n"
+ "fmla z31.s, p3/M, z4.s, z16.s\n"
+ "ld1w { z0.s }, p2/Z, [x9, x15, LSL #2]\n"
+ "ld1w { z27.s }, p2/Z, [x9, x28, LSL #2]\n"
+ "fmla z5.s, p3/M, z4.s, z6.s\n"
+ "fmla z29.s, p3/M, z4.s, z10.s\n"
+ "ld1w { z16.s }, p3/Z, [x10, #4, MUL VL]\n"
+ "mov p0.b, p2.b\n"
+ "fmla z30.s, p3/M, z20.s, z7.s\n"
+ "fmla z31.s, p3/M, z20.s, z8.s\n"
+ "csel x8, x8, XZR, LT\n"
+ "cmp x12, x20\n"
+ "fmla z5.s, p3/M, z20.s, z14.s\n"
+ "fmla z29.s, p3/M, z20.s, z1.s\n"
+ "ld1w { z21.s }, p3/Z, [x10, #5, MUL VL]\n"
+ "fmla z30.s, p3/M, z19.s, z8.s\n"
+ "fmla z31.s, p3/M, z19.s, z13.s\n"
+ "ld1w { z26.s }, p2/Z, [x9, x24, LSL #2]\n"
+ "fmla z5.s, p3/M, z19.s, z1.s\n"
+ "fmla z29.s, p3/M, z19.s, z0.s\n"
+ "ld1w { z25.s }, p3/Z, [x10, #6, MUL VL]\n"
+ "fmla z30.s, p3/M, z18.s, z13.s\n"
+ "fmla z31.s, p3/M, z18.s, z22.s\n"
+ "ld1w { z24.s }, p2/Z, [x9, x26, LSL #2]\n"
+ "fmla z5.s, p3/M, z18.s, z0.s\n"
+ "fmla z29.s, p3/M, z18.s, z27.s\n"
+ "ld1w { z23.s }, p3/Z, [x10, #7, MUL VL]\n"
+ "addvl x10, x10, #16\n"
+ "fmla z30.s, p3/M, z17.s, z22.s\n"
+ "fmla z31.s, p3/M, z17.s, z6.s\n"
+ "ld1w { z22.s }, p2/Z, [x27]\n"
+ "fmla z5.s, p3/M, z17.s, z27.s\n"
+ "fmla z29.s, p3/M, z17.s, z24.s\n"
+ "ld1w { z20.s }, p3/Z, [x10, #-8, MUL VL]\n"
+ "fmla z30.s, p3/M, z16.s, z6.s\n"
+ "fmla z31.s, p3/M, z16.s, z10.s\n"
+ "ld1w { z19.s }, p2/Z, [x27, x17, LSL #2]\n"
+ "ld1w { z18.s }, p2/Z, [x27, x15, LSL #2]\n"
+ "fmla z5.s, p3/M, z16.s, z24.s\n"
+ "fmla z29.s, p3/M, z16.s, z26.s\n"
+ "ld1w { z16.s }, p3/Z, [x10, #-7, MUL VL]\n"
+ "fmla z30.s, p3/M, z21.s, z14.s\n"
+ "fmla z31.s, p3/M, z21.s, z1.s\n"
+ "ld1w { z17.s }, p2/Z, [x27, x24, LSL #2]\n"
+ "fmla z5.s, p3/M, z21.s, z22.s\n"
+ "fmla z29.s, p3/M, z21.s, z19.s\n"
+ "ld1w { z21.s }, p3/Z, [x10, #-6, MUL VL]\n"
+ "fmla z30.s, p3/M, z25.s, z1.s\n"
+ "fmla z31.s, p3/M, z25.s, z0.s\n"
+ "ld1w { z7.s }, p2/Z, [x27, x28, LSL #2]\n"
+ "fmla z5.s, p3/M, z25.s, z19.s\n"
+ "fmla z29.s, p3/M, z25.s, z18.s\n"
+ "ld1w { z10.s }, p3/Z, [x10, #-5, MUL VL]\n"
+ "fmla z30.s, p3/M, z23.s, z0.s\n"
+ "fmla z31.s, p3/M, z23.s, z27.s\n"
+ "ld1w { z11.s }, p2/Z, [x27, x26, LSL #2]\n"
+ "fmla z5.s, p3/M, z23.s, z18.s\n"
+ "fmla z29.s, p3/M, z23.s, z7.s\n"
+ "ld1w { z6.s }, p3/Z, [x10, #-4, MUL VL]\n"
+ "fmla z30.s, p3/M, z20.s, z27.s\n"
+ "fmla z31.s, p3/M, z20.s, z24.s\n"
+ "ld1w { z0.s }, p2/Z, [x25]\n"
+ "fmla z5.s, p3/M, z20.s, z7.s\n"
+ "fmla z29.s, p3/M, z20.s, z11.s\n"
+ "ld1w { z9.s }, p3/Z, [x10, #-3, MUL VL]\n"
+ "fmla z30.s, p3/M, z16.s, z24.s\n"
+ "fmla z31.s, p3/M, z16.s, z26.s\n"
+ "ld1w { z3.s }, p2/Z, [x25, x17, LSL #2]\n"
+ "ld1w { z27.s }, p2/Z, [x25, x26, LSL #2]\n"
+ "fmla z5.s, p3/M, z16.s, z11.s\n"
+ "fmla z29.s, p3/M, z16.s, z17.s\n"
+ "ld1w { z16.s }, p3/Z, [x10, #-2, MUL VL]\n"
+ "fmla z30.s, p3/M, z21.s, z22.s\n"
+ "fmla z31.s, p3/M, z21.s, z19.s\n"
+ "ld1w { z26.s }, p2/Z, [x25, x15, LSL #2]\n"
+ "fmla z5.s, p3/M, z21.s, z0.s\n"
+ "fmla z29.s, p3/M, z21.s, z3.s\n"
+ "ld1w { z25.s }, p3/Z, [x10, #-1, MUL VL]\n"
+ "fmla z30.s, p3/M, z10.s, z19.s\n"
+ "fmla z31.s, p3/M, z10.s, z18.s\n"
+ "ld1w { z24.s }, p2/Z, [x25, x28, LSL #2]\n"
+ "fmla z5.s, p3/M, z10.s, z3.s\n"
+ "fmla z29.s, p3/M, z10.s, z26.s\n"
+ "ld1w { z23.s }, p3/Z, [x10]\n"
+ "fmla z30.s, p3/M, z6.s, z18.s\n"
+ "fmla z31.s, p3/M, z6.s, z7.s\n"
+ "ld1w { z22.s }, p2/Z, [x25, x24, LSL #2]\n"
+ "fmla z5.s, p3/M, z6.s, z26.s\n"
+ "fmla z29.s, p3/M, z6.s, z24.s\n"
+ "ld1w { z21.s }, p3/Z, [x10, #1, MUL VL]\n"
+ "fmla z30.s, p3/M, z9.s, z7.s\n"
+ "fmla z31.s, p3/M, z9.s, z11.s\n"
+ "ld1w { z18.s }, p2/Z, [x23]\n"
+ "fmla z5.s, p3/M, z9.s, z24.s\n"
+ "fmla z29.s, p3/M, z9.s, z27.s\n"
+ "ld1w { z20.s }, p3/Z, [x10, #2, MUL VL]\n"
+ "fmla z30.s, p3/M, z16.s, z11.s\n"
+ "fmla z31.s, p3/M, z16.s, z17.s\n"
+ "ld1w { z17.s }, p2/Z, [x23, x17, LSL #2]\n"
+ "fmla z5.s, p3/M, z16.s, z27.s\n"
+ "fmla z29.s, p3/M, z16.s, z22.s\n"
+ "ld1w { z19.s }, p3/Z, [x10, #3, MUL VL]\n"
+ "fmla z30.s, p3/M, z25.s, z0.s\n"
+ "fmla z31.s, p3/M, z25.s, z3.s\n"
+ "ld1w { z16.s }, p2/Z, [x23, x15, LSL #2]\n"
+ "fmla z5.s, p3/M, z25.s, z18.s\n"
+ "fmla z29.s, p3/M, z25.s, z17.s\n"
+ "ld1w { z18.s }, p2/Z, [x23, x28, LSL #2]\n"
+ "fmla z30.s, p3/M, z23.s, z3.s\n"
+ "fmla z31.s, p3/M, z23.s, z26.s\n"
+ "fmla z5.s, p3/M, z23.s, z17.s\n"
+ "fmla z29.s, p3/M, z23.s, z16.s\n"
+ "ld1w { z17.s }, p2/Z, [x23, x26, LSL #2]\n"
+ "fmla z30.s, p3/M, z21.s, z26.s\n"
+ "fmla z31.s, p3/M, z21.s, z24.s\n"
+ "fmla z5.s, p3/M, z21.s, z16.s\n"
+ "fmla z29.s, p3/M, z21.s, z18.s\n"
+ "ld1w { z16.s }, p2/Z, [x23, x24, LSL #2]\n"
+ "fmla z30.s, p3/M, z20.s, z24.s\n"
+ "fmla z31.s, p3/M, z20.s, z27.s\n"
+ "fmla z5.s, p3/M, z20.s, z18.s\n"
+ "fmla z29.s, p3/M, z20.s, z17.s\n"
+ "fmla z30.s, p3/M, z19.s, z27.s\n"
+ "fmla z31.s, p3/M, z19.s, z22.s\n"
+ "fmax z30.s, p3/M, z30.s, z15.s\n"
+ "fmax z31.s, p3/M, z31.s, z15.s\n"
+ "fmla z5.s, p3/M, z19.s, z17.s\n"
+ "fmla z29.s, p3/M, z19.s, z16.s\n"
+ "fmax z5.s, p3/M, z5.s, z15.s\n"
+ "fmax z29.s, p3/M, z29.s, z15.s\n"
+ "fmin z30.s, p3/M, z30.s, z28.s\n"
+ "fmin z31.s, p3/M, z31.s, z28.s\n"
+ "st1w { z30.s }, p0, [x13]\n"
+ "fmin z5.s, p3/M, z5.s, z28.s\n"
+ "fmin z29.s, p3/M, z29.s, z28.s\n"
+ "st1w { z31.s }, p0, [x13, x16, LSL #2]\n"
+ "st1w { z5.s }, p0, [x22]\n"
+ "st1w { z29.s }, p0, [x22, x16, LSL #2]\n"
+ "blt 1b\n"
+ // No outputs; the kernel reads/writes memory through the pointers in
+ // params_struct, hence the "memory" clobber.
+ :
+ : [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_indirect.cpp
new file mode 100644
index 0000000000..bf65e04d32
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_indirect.cpp
@@ -0,0 +1,551 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst_indirect_impl(
+ const float *const *const input_ptrs,
+ float *const *const outptrs,
+ const void *params,
+ unsigned int n_channels,
+ const float activation_min,
+ const float activation_max
+)
+{
+ struct Args
+ {
+ float *const *outptrs;
+ const void *params;
+ const float min, max;
+ const float *inptrs[36];
+
+ Args(
+ const float *const *const input_ptrs,
+ float *const *const outptrs,
+ const void *const params,
+ const float min,
+ const float max
+ ) : outptrs(outptrs), params(params), min(min), max(max)
+ {
+ inptrs[0] = input_ptrs[0];
+ inptrs[1] = input_ptrs[1];
+ inptrs[2] = input_ptrs[6];
+ inptrs[3] = input_ptrs[7];
+ inptrs[4] = input_ptrs[2];
+ inptrs[5] = input_ptrs[8];
+ inptrs[6] = input_ptrs[3];
+ inptrs[7] = input_ptrs[4];
+ inptrs[8] = input_ptrs[11];
+ inptrs[9] = input_ptrs[12];
+ inptrs[10] = input_ptrs[9];
+ inptrs[11] = input_ptrs[10];
+ inptrs[12] = input_ptrs[5];
+ inptrs[13] = input_ptrs[13];
+ inptrs[14] = input_ptrs[14];
+ inptrs[15] = input_ptrs[15];
+ inptrs[16] = input_ptrs[16];
+ inptrs[17] = input_ptrs[17];
+ inptrs[18] = input_ptrs[18];
+ inptrs[19] = input_ptrs[19];
+ inptrs[20] = input_ptrs[20];
+ inptrs[21] = input_ptrs[21];
+ inptrs[22] = input_ptrs[22];
+ inptrs[23] = input_ptrs[23];
+ inptrs[24] = input_ptrs[24];
+ inptrs[25] = input_ptrs[25];
+ inptrs[26] = input_ptrs[26];
+ inptrs[27] = input_ptrs[27];
+ inptrs[28] = input_ptrs[28];
+ inptrs[29] = input_ptrs[29];
+ inptrs[30] = input_ptrs[30];
+ inptrs[31] = input_ptrs[31];
+ inptrs[32] = input_ptrs[32];
+ inptrs[33] = input_ptrs[33];
+ inptrs[34] = input_ptrs[34];
+ inptrs[35] = input_ptrs[35];
+
+ }
+ };
+
+ Args params_struct(input_ptrs, outptrs, params,
+ activation_min, activation_max);
+
+ __asm__ __volatile__(
+ "ldr x20, [%x[params_struct], %[offsetof_args_outptrs]]\n"
+ "add x16, %x[params_struct], %[offsetof_Args_inptrs]\n"
+ "ldp x15, x14, [x20, #0x0]\n"
+ "mov x13, #0x0\n"
+ "ldp x12, x11, [x20, #0x10]\n"
+ "whilelt p3.s, XZR, %x[n_channels]\n"
+ "ldp x21, x20, [x16, #0x0]\n"
+ "cntw x10\n"
+ "ptrue p2.b\n"
+ "ldr x9, [%x[params_struct], %[offsetof_args_params]]\n"
+ "ld1w { z5.s }, p3/Z, [x21, x13, LSL #2]\n"
+ "cmp x10, %x[n_channels]\n"
+ "ld1w { z6.s }, p3/Z, [x20, x13, LSL #2]\n"
+ "ldp x27, x26, [x16, #0x10]\n"
+ "sub x28, XZR, x10\n"
+ "ldp x25, x24, [x16, #0x20]\n"
+ "ldp x23, x22, [x16, #0x30]\n"
+ "ldp x21, x20, [x16, #0x40]\n"
+ "ld1rw { z15.s }, p2/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+ "ld1rw { z28.s }, p2/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+ "ld1w { z29.s }, p2/Z, [x9]\n"
+ "ld1w { z0.s }, p2/Z, [x9, #1, MUL VL]\n"
+ "ld1w { z1.s }, p2/Z, [x9, #2, MUL VL]\n"
+ "ld1w { z2.s }, p2/Z, [x9, #3, MUL VL]\n"
+ "ld1w { z3.s }, p2/Z, [x9, #4, MUL VL]\n"
+ "ld1w { z4.s }, p2/Z, [x9, #5, MUL VL]\n"
+ "ld1w { z7.s }, p3/Z, [x27, x13, LSL #2]\n"
+ "addvl x9, x9, #6\n"
+ "ld1w { z8.s }, p3/Z, [x26, x13, LSL #2]\n"
+ "ld1w { z9.s }, p3/Z, [x25, x13, LSL #2]\n"
+ "ld1w { z13.s }, p3/Z, [x24, x13, LSL #2]\n"
+ "ld1w { z11.s }, p3/Z, [x23, x13, LSL #2]\n"
+ "ld1w { z12.s }, p3/Z, [x22, x13, LSL #2]\n"
+ "ld1w { z10.s }, p3/Z, [x21, x13, LSL #2]\n"
+ "ld1w { z14.s }, p3/Z, [x20, x13, LSL #2]\n"
+ "bge 2f\n"
+ "1:" // Channel loop
+ "movprfx z30, z29\n fmla z30.s, p2/M, z0.s, z5.s\n"
+ "movprfx z27, z29\n fmla z27.s, p2/M, z0.s, z6.s\n"
+ "ldr x20, [x16, #0x50]\n"
+ "ld1w { z5.s }, p3/Z, [x20, x13, LSL #2]\n"
+ "movprfx z31, z29\n fmla z31.s, p2/M, z0.s, z7.s\n"
+ "movprfx z26, z29\n fmla z26.s, p2/M, z0.s, z8.s\n"
+ "ldr x20, [x16, #0x58]\n"
+ "ldr x21, [x16, #0x60]\n"
+ "fmla z30.s, p2/M, z1.s, z6.s\n"
+ "fmla z27.s, p2/M, z1.s, z9.s\n"
+ "ld1w { z22.s }, p3/Z, [x20, x13, LSL #2]\n"
+ "ldr x20, [x16, #0x68]\n"
+ "fmla z31.s, p2/M, z1.s, z8.s\n"
+ "fmla z26.s, p2/M, z1.s, z13.s\n"
+ "ld1w { z21.s }, p2/Z, [x9]\n"
+ "ldr x23, [x16, #0x70]\n"
+ "fmla z30.s, p2/M, z2.s, z9.s\n"
+ "fmla z27.s, p2/M, z2.s, z11.s\n"
+ "ld1w { z20.s }, p3/Z, [x21, x13, LSL #2]\n"
+ "ld1w { z18.s }, p2/Z, [x9, #1, MUL VL]\n"
+ "fmla z31.s, p2/M, z2.s, z13.s\n"
+ "fmla z26.s, p2/M, z2.s, z5.s\n"
+ "ldr x22, [x16, #0x78]\n"
+ "ld1w { z17.s }, p2/Z, [x9, #2, MUL VL]\n"
+ "fmla z30.s, p2/M, z3.s, z11.s\n"
+ "fmla z27.s, p2/M, z3.s, z12.s\n"
+ "ld1w { z11.s }, p3/Z, [x20, x13, LSL #2]\n"
+ "ldr x21, [x16, #0x80]\n"
+ "fmla z31.s, p2/M, z3.s, z5.s\n"
+ "fmla z26.s, p2/M, z3.s, z22.s\n"
+ "ld1w { z16.s }, p2/Z, [x9, #3, MUL VL]\n"
+ "ldr x20, [x16, #0x88]\n"
+ "fmla z30.s, p2/M, z4.s, z12.s\n"
+ "fmla z27.s, p2/M, z4.s, z20.s\n"
+ "ld1w { z0.s }, p3/Z, [x23, x13, LSL #2]\n"
+ "ld1w { z29.s }, p3/Z, [x22, x13, LSL #2]\n"
+ "fmla z31.s, p2/M, z4.s, z22.s\n"
+ "fmla z26.s, p2/M, z4.s, z10.s\n"
+ "ld1w { z19.s }, p2/Z, [x9, #4, MUL VL]\n"
+ "ldr x23, [x16, #0x90]\n"
+ "fmla z30.s, p2/M, z21.s, z7.s\n"
+ "fmla z27.s, p2/M, z21.s, z8.s\n"
+ "ldr x26, [x16, #0x98]\n"
+ "ldr x22, [x16, #0xa0]\n"
+ "fmla z31.s, p2/M, z21.s, z14.s\n"
+ "fmla z26.s, p2/M, z21.s, z11.s\n"
+ "ld1w { z25.s }, p2/Z, [x9, #5, MUL VL]\n"
+ "ldr x25, [x16, #0xa8]\n"
+ "fmla z30.s, p2/M, z18.s, z8.s\n"
+ "fmla z27.s, p2/M, z18.s, z13.s\n"
+ "ld1w { z24.s }, p3/Z, [x20, x13, LSL #2]\n"
+ "ldr x24, [x16, #0xb0]\n"
+ "fmla z31.s, p2/M, z18.s, z11.s\n"
+ "fmla z26.s, p2/M, z18.s, z0.s\n"
+ "ld1w { z18.s }, p2/Z, [x9, #6, MUL VL]\n"
+ "ldr x20, [x16, #0xb8]\n"
+ "fmla z30.s, p2/M, z17.s, z13.s\n"
+ "fmla z27.s, p2/M, z17.s, z5.s\n"
+ "ld1w { z3.s }, p3/Z, [x21, x13, LSL #2]\n"
+ "ldr x21, [x16, #0xc0]\n"
+ "fmla z31.s, p2/M, z17.s, z0.s\n"
+ "fmla z26.s, p2/M, z17.s, z29.s\n"
+ "ld1w { z17.s }, p2/Z, [x9, #7, MUL VL]\n"
+ "addvl x9, x9, #16\n"
+ "fmla z30.s, p2/M, z16.s, z5.s\n"
+ "fmla z27.s, p2/M, z16.s, z22.s\n"
+ "ld1w { z6.s }, p3/Z, [x23, x13, LSL #2]\n"
+ "ldr x27, [x16, #0xc8]\n"
+ "fmla z31.s, p2/M, z16.s, z29.s\n"
+ "fmla z26.s, p2/M, z16.s, z3.s\n"
+ "ld1w { z16.s }, p2/Z, [x9, #-8, MUL VL]\n"
+ "ldr x23, [x16, #0xd0]\n"
+ "fmla z30.s, p2/M, z19.s, z22.s\n"
+ "fmla z27.s, p2/M, z19.s, z10.s\n"
+ "ld1w { z23.s }, p3/Z, [x26, x13, LSL #2]\n"
+ "ld1w { z22.s }, p3/Z, [x22, x13, LSL #2]\n"
+ "fmla z31.s, p2/M, z19.s, z3.s\n"
+ "fmla z26.s, p2/M, z19.s, z24.s\n"
+ "ld1w { z21.s }, p2/Z, [x9, #-7, MUL VL]\n"
+ "ldr x22, [x16, #0xd8]\n"
+ "fmla z30.s, p2/M, z25.s, z14.s\n"
+ "fmla z27.s, p2/M, z25.s, z11.s\n"
+ "ld1w { z1.s }, p3/Z, [x20, x13, LSL #2]\n"
+ "ldr x20, [x16, #0xe0]\n"
+ "fmla z31.s, p2/M, z25.s, z6.s\n"
+ "fmla z26.s, p2/M, z25.s, z23.s\n"
+ "ld1w { z20.s }, p2/Z, [x9, #-6, MUL VL]\n"
+ "ldr x26, [x16, #0xf8]\n"
+ "fmla z30.s, p2/M, z18.s, z11.s\n"
+ "fmla z27.s, p2/M, z18.s, z0.s\n"
+ "ld1w { z7.s }, p3/Z, [x25, x13, LSL #2]\n"
+ "ldr x25, [x16, #0xe8]\n"
+ "fmla z31.s, p2/M, z18.s, z23.s\n"
+ "fmla z26.s, p2/M, z18.s, z22.s\n"
+ "ld1w { z18.s }, p2/Z, [x9, #-5, MUL VL]\n"
+ "whilelt p1.s, x10, %x[n_channels]\n"
+ "fmla z30.s, p2/M, z17.s, z0.s\n"
+ "fmla z27.s, p2/M, z17.s, z29.s\n"
+ "ld1w { z19.s }, p3/Z, [x24, x13, LSL #2]\n"
+ "ldr x24, [x16, #0xf0]\n"
+ "fmla z31.s, p2/M, z17.s, z22.s\n"
+ "fmla z26.s, p2/M, z17.s, z7.s\n"
+ "ld1w { z17.s }, p2/Z, [x9, #-4, MUL VL]\n"
+ "incw x28\n"
+ "fmla z30.s, p2/M, z16.s, z29.s\n"
+ "fmla z27.s, p2/M, z16.s, z3.s\n"
+ "ld1w { z0.s }, p3/Z, [x21, x13, LSL #2]\n"
+ "ldr x21, [x16, #0x100]\n"
+ "fmla z31.s, p2/M, z16.s, z7.s\n"
+ "fmla z26.s, p2/M, z16.s, z19.s\n"
+ "ld1w { z16.s }, p2/Z, [x9, #-3, MUL VL]\n"
+ "mov p0.b, p3.b\n"
+ "fmla z30.s, p2/M, z21.s, z3.s\n"
+ "fmla z27.s, p2/M, z21.s, z24.s\n"
+ "ld1w { z11.s }, p3/Z, [x27, x13, LSL #2]\n"
+ "ld1w { z13.s }, p3/Z, [x20, x13, LSL #2]\n"
+ "fmla z31.s, p2/M, z21.s, z19.s\n"
+ "fmla z26.s, p2/M, z21.s, z1.s\n"
+ "ld1w { z10.s }, p2/Z, [x9, #-2, MUL VL]\n"
+ "ldr x20, [x16, #0x108]\n"
+ "fmla z30.s, p2/M, z20.s, z6.s\n"
+ "fmla z27.s, p2/M, z20.s, z23.s\n"
+ "ld1w { z25.s }, p3/Z, [x23, x13, LSL #2]\n"
+ "ldr x23, [x16, #0x110]\n"
+ "fmla z31.s, p2/M, z20.s, z0.s\n"
+ "fmla z26.s, p2/M, z20.s, z11.s\n"
+ "ld1w { z8.s }, p2/Z, [x9, #-1, MUL VL]\n"
+ "ld1w { z29.s }, p2/Z, [x9, #4, MUL VL]\n"
+ "fmla z30.s, p2/M, z18.s, z23.s\n"
+ "fmla z27.s, p2/M, z18.s, z22.s\n"
+ "ld1w { z24.s }, p3/Z, [x22, x13, LSL #2]\n"
+ "ldr x22, [x16, #0x118]\n"
+ "fmla z31.s, p2/M, z18.s, z11.s\n"
+ "fmla z26.s, p2/M, z18.s, z25.s\n"
+ "ld1w { z23.s }, p2/Z, [x9]\n"
+ "fmla z30.s, p2/M, z17.s, z22.s\n"
+ "fmla z27.s, p2/M, z17.s, z7.s\n"
+ "ld1w { z22.s }, p3/Z, [x25, x13, LSL #2]\n"
+ "fmla z31.s, p2/M, z17.s, z25.s\n"
+ "fmla z26.s, p2/M, z17.s, z24.s\n"
+ "ld1w { z21.s }, p2/Z, [x9, #1, MUL VL]\n"
+ "fmla z30.s, p2/M, z16.s, z7.s\n"
+ "fmla z27.s, p2/M, z16.s, z19.s\n"
+ "ld1w { z18.s }, p3/Z, [x24, x13, LSL #2]\n"
+ "fmla z31.s, p2/M, z16.s, z24.s\n"
+ "fmla z26.s, p2/M, z16.s, z13.s\n"
+ "ld1w { z20.s }, p2/Z, [x9, #2, MUL VL]\n"
+ "fmla z30.s, p2/M, z10.s, z19.s\n"
+ "fmla z27.s, p2/M, z10.s, z1.s\n"
+ "ld1w { z17.s }, p3/Z, [x26, x13, LSL #2]\n"
+ "fmla z31.s, p2/M, z10.s, z13.s\n"
+ "fmla z26.s, p2/M, z10.s, z22.s\n"
+ "ld1w { z19.s }, p2/Z, [x9, #3, MUL VL]\n"
+ "fmla z30.s, p2/M, z8.s, z0.s\n"
+ "fmla z27.s, p2/M, z8.s, z11.s\n"
+ "ld1w { z16.s }, p3/Z, [x21, x13, LSL #2]\n"
+ "fmla z31.s, p2/M, z8.s, z18.s\n"
+ "fmla z26.s, p2/M, z8.s, z17.s\n"
+ "ld1w { z18.s }, p3/Z, [x20, x13, LSL #2]\n"
+ "ldp x21, x20, [x16, #0x0]\n"
+ "fmla z30.s, p2/M, z23.s, z11.s\n"
+ "fmla z27.s, p2/M, z23.s, z25.s\n"
+ "ld1w { z0.s }, p2/Z, [x9, #5, MUL VL]\n"
+ "fmla z31.s, p2/M, z23.s, z17.s\n"
+ "fmla z26.s, p2/M, z23.s, z16.s\n"
+ "ld1w { z17.s }, p3/Z, [x23, x13, LSL #2]\n"
+ "ld1w { z1.s }, p2/Z, [x9, #6, MUL VL]\n"
+ "fmla z30.s, p2/M, z21.s, z25.s\n"
+ "fmla z27.s, p2/M, z21.s, z24.s\n"
+ "ld1w { z5.s }, p1/Z, [x21, x10, LSL #2]\n"
+ "fmla z31.s, p2/M, z21.s, z16.s\n"
+ "fmla z26.s, p2/M, z21.s, z18.s\n"
+ "ld1w { z16.s }, p3/Z, [x22, x13, LSL #2]\n"
+ "ldp x27, x26, [x16, #0x10]\n"
+ "fmla z30.s, p2/M, z20.s, z24.s\n"
+ "fmla z27.s, p2/M, z20.s, z13.s\n"
+ "ld1w { z6.s }, p1/Z, [x20, x10, LSL #2]\n"
+ "ldp x25, x24, [x16, #0x20]\n"
+ "fmla z31.s, p2/M, z20.s, z18.s\n"
+ "fmla z26.s, p2/M, z20.s, z17.s\n"
+ "ldp x23, x22, [x16, #0x30]\n"
+ "ldp x21, x20, [x16, #0x40]\n"
+ "fmla z30.s, p2/M, z19.s, z13.s\n"
+ "fmla z27.s, p2/M, z19.s, z22.s\n"
+ "incw x13\n"
+ "ld1w { z7.s }, p1/Z, [x27, x10, LSL #2]\n"
+ "fmla z31.s, p2/M, z19.s, z17.s\n"
+ "fmla z26.s, p2/M, z19.s, z16.s\n"
+ "ld1w { z8.s }, p1/Z, [x26, x10, LSL #2]\n"
+ "ld1w { z9.s }, p1/Z, [x25, x10, LSL #2]\n"
+ "ld1w { z13.s }, p1/Z, [x24, x10, LSL #2]\n"
+ "ld1w { z11.s }, p1/Z, [x23, x10, LSL #2]\n"
+ "fmax z30.s, p2/M, z30.s, z15.s\n"
+ "fmax z27.s, p2/M, z27.s, z15.s\n"
+ "ld1w { z12.s }, p1/Z, [x22, x10, LSL #2]\n"
+ "ld1w { z10.s }, p1/Z, [x21, x10, LSL #2]\n"
+ "fmax z31.s, p2/M, z31.s, z15.s\n"
+ "fmax z26.s, p2/M, z26.s, z15.s\n"
+ "ld1w { z14.s }, p1/Z, [x20, x10, LSL #2]\n"
+ "incw x10\n"
+ "ld1w { z2.s }, p2/Z, [x9, #7, MUL VL]\n"
+ "addvl x9, x9, #16\n"
+ "whilelt p3.s, x13, %x[n_channels]\n"
+ "cmp x10, %x[n_channels]\n"
+ "ld1w { z3.s }, p2/Z, [x9, #-8, MUL VL]\n"
+ "ld1w { z4.s }, p2/Z, [x9, #-7, MUL VL]\n"
+ "fmin z30.s, p2/M, z30.s, z28.s\n"
+ "fmin z27.s, p2/M, z27.s, z28.s\n"
+ "st1w { z30.s }, p0, [x15, x28, LSL #2]\n"
+ "fmin z31.s, p2/M, z31.s, z28.s\n"
+ "fmin z26.s, p2/M, z26.s, z28.s\n"
+ "st1w { z27.s }, p0, [x14, x28, LSL #2]\n"
+ "st1w { z31.s }, p0, [x12, x28, LSL #2]\n"
+ "addvl x9, x9, #-6\n"
+ "st1w { z26.s }, p0, [x11, x28, LSL #2]\n"
+ "blt 1b\n"
+ "2:" // Channel tail
+ "movprfx z30, z29\n fmla z30.s, p2/M, z0.s, z5.s\n"
+ "movprfx z31, z29\n fmla z31.s, p2/M, z0.s, z6.s\n"
+ "ldr x20, [x16, #0x50]\n"
+ "ld1w { z22.s }, p3/Z, [x20, x13, LSL #2]\n"
+ "movprfx z5, z29\n fmla z5.s, p2/M, z0.s, z7.s\n"
+ "fmla z29.s, p2/M, z0.s, z8.s\n"
+ "ldr x20, [x16, #0x58]\n"
+ "ldr x21, [x16, #0x60]\n"
+ "fmla z30.s, p2/M, z1.s, z6.s\n"
+ "fmla z31.s, p2/M, z1.s, z9.s\n"
+ "ld1w { z6.s }, p3/Z, [x20, x13, LSL #2]\n"
+ "ldr x20, [x16, #0x68]\n"
+ "fmla z5.s, p2/M, z1.s, z8.s\n"
+ "fmla z29.s, p2/M, z1.s, z13.s\n"
+ "ld1w { z20.s }, p2/Z, [x9]\n"
+ "ldr x23, [x16, #0x70]\n"
+ "fmla z30.s, p2/M, z2.s, z9.s\n"
+ "fmla z31.s, p2/M, z2.s, z11.s\n"
+ "ld1w { z16.s }, p3/Z, [x21, x13, LSL #2]\n"
+ "ld1w { z19.s }, p2/Z, [x9, #1, MUL VL]\n"
+ "fmla z5.s, p2/M, z2.s, z13.s\n"
+ "fmla z29.s, p2/M, z2.s, z22.s\n"
+ "ldr x21, [x16, #0x78]\n"
+ "ld1w { z18.s }, p2/Z, [x9, #2, MUL VL]\n"
+ "fmla z30.s, p2/M, z3.s, z11.s\n"
+ "fmla z31.s, p2/M, z3.s, z12.s\n"
+ "ld1w { z1.s }, p3/Z, [x20, x13, LSL #2]\n"
+ "ldr x22, [x16, #0x80]\n"
+ "fmla z5.s, p2/M, z3.s, z22.s\n"
+ "fmla z29.s, p2/M, z3.s, z6.s\n"
+ "ld1w { z17.s }, p2/Z, [x9, #3, MUL VL]\n"
+ "ldr x20, [x16, #0x88]\n"
+ "fmla z30.s, p2/M, z4.s, z12.s\n"
+ "fmla z31.s, p2/M, z4.s, z16.s\n"
+ "ld1w { z0.s }, p3/Z, [x23, x13, LSL #2]\n"
+ "ld1w { z27.s }, p3/Z, [x21, x13, LSL #2]\n"
+ "fmla z5.s, p2/M, z4.s, z6.s\n"
+ "fmla z29.s, p2/M, z4.s, z10.s\n"
+ "ld1w { z16.s }, p2/Z, [x9, #4, MUL VL]\n"
+ "ldr x21, [x16, #0x90]\n"
+ "fmla z30.s, p2/M, z20.s, z7.s\n"
+ "fmla z31.s, p2/M, z20.s, z8.s\n"
+ "ldr x27, [x16, #0x98]\n"
+ "ldr x26, [x16, #0xa0]\n"
+ "fmla z5.s, p2/M, z20.s, z14.s\n"
+ "fmla z29.s, p2/M, z20.s, z1.s\n"
+ "ld1w { z21.s }, p2/Z, [x9, #5, MUL VL]\n"
+ "ldr x25, [x16, #0xa8]\n"
+ "fmla z30.s, p2/M, z19.s, z8.s\n"
+ "fmla z31.s, p2/M, z19.s, z13.s\n"
+ "ld1w { z26.s }, p3/Z, [x20, x13, LSL #2]\n"
+ "ldr x24, [x16, #0xb0]\n"
+ "fmla z5.s, p2/M, z19.s, z1.s\n"
+ "fmla z29.s, p2/M, z19.s, z0.s\n"
+ "ld1w { z25.s }, p2/Z, [x9, #6, MUL VL]\n"
+ "ldr x20, [x16, #0xb8]\n"
+ "fmla z30.s, p2/M, z18.s, z13.s\n"
+ "fmla z31.s, p2/M, z18.s, z22.s\n"
+ "ld1w { z24.s }, p3/Z, [x22, x13, LSL #2]\n"
+ "ldr x23, [x16, #0xc0]\n"
+ "fmla z5.s, p2/M, z18.s, z0.s\n"
+ "fmla z29.s, p2/M, z18.s, z27.s\n"
+ "ld1w { z23.s }, p2/Z, [x9, #7, MUL VL]\n"
+ "addvl x9, x9, #16\n"
+ "fmla z30.s, p2/M, z17.s, z22.s\n"
+ "fmla z31.s, p2/M, z17.s, z6.s\n"
+ "ld1w { z22.s }, p3/Z, [x21, x13, LSL #2]\n"
+ "ldr x22, [x16, #0xc8]\n"
+ "fmla z5.s, p2/M, z17.s, z27.s\n"
+ "fmla z29.s, p2/M, z17.s, z24.s\n"
+ "ld1w { z20.s }, p2/Z, [x9, #-8, MUL VL]\n"
+ "ldr x21, [x16, #0xd0]\n"
+ "fmla z30.s, p2/M, z16.s, z6.s\n"
+ "fmla z31.s, p2/M, z16.s, z10.s\n"
+ "ld1w { z19.s }, p3/Z, [x27, x13, LSL #2]\n"
+ "ld1w { z18.s }, p3/Z, [x26, x13, LSL #2]\n"
+ "fmla z5.s, p2/M, z16.s, z24.s\n"
+ "fmla z29.s, p2/M, z16.s, z26.s\n"
+ "ld1w { z16.s }, p2/Z, [x9, #-7, MUL VL]\n"
+ "ldr x27, [x16, #0xd8]\n"
+ "fmla z30.s, p2/M, z21.s, z14.s\n"
+ "fmla z31.s, p2/M, z21.s, z1.s\n"
+ "ld1w { z17.s }, p3/Z, [x20, x13, LSL #2]\n"
+ "ldr x20, [x16, #0xe0]\n"
+ "fmla z5.s, p2/M, z21.s, z22.s\n"
+ "fmla z29.s, p2/M, z21.s, z19.s\n"
+ "ld1w { z21.s }, p2/Z, [x9, #-6, MUL VL]\n"
+ "ldr x26, [x16, #0xf8]\n"
+ "fmla z30.s, p2/M, z25.s, z1.s\n"
+ "fmla z31.s, p2/M, z25.s, z0.s\n"
+ "ld1w { z9.s }, p3/Z, [x25, x13, LSL #2]\n"
+ "ldr x25, [x16, #0xe8]\n"
+ "fmla z5.s, p2/M, z25.s, z19.s\n"
+ "fmla z29.s, p2/M, z25.s, z18.s\n"
+ "ld1w { z4.s }, p2/Z, [x9, #-5, MUL VL]\n"
+ "incw x28\n"
+ "fmla z30.s, p2/M, z23.s, z0.s\n"
+ "fmla z31.s, p2/M, z23.s, z27.s\n"
+ "ld1w { z8.s }, p3/Z, [x24, x13, LSL #2]\n"
+ "ldr x24, [x16, #0xf0]\n"
+ "fmla z5.s, p2/M, z23.s, z18.s\n"
+ "fmla z29.s, p2/M, z23.s, z9.s\n"
+ "ld1w { z6.s }, p2/Z, [x9, #-4, MUL VL]\n"
+ "mov p0.b, p3.b\n"
+ "fmla z30.s, p2/M, z20.s, z27.s\n"
+ "fmla z31.s, p2/M, z20.s, z24.s\n"
+ "ld1w { z10.s }, p3/Z, [x23, x13, LSL #2]\n"
+ "ldr x23, [x16, #0x100]\n"
+ "fmla z5.s, p2/M, z20.s, z9.s\n"
+ "fmla z29.s, p2/M, z20.s, z8.s\n"
+ "ld1w { z11.s }, p2/Z, [x9, #-3, MUL VL]\n"
+ "fmla z30.s, p2/M, z16.s, z24.s\n"
+ "fmla z31.s, p2/M, z16.s, z26.s\n"
+ "ld1w { z0.s }, p3/Z, [x22, x13, LSL #2]\n"
+ "ld1w { z27.s }, p3/Z, [x20, x13, LSL #2]\n"
+ "fmla z5.s, p2/M, z16.s, z8.s\n"
+ "fmla z29.s, p2/M, z16.s, z17.s\n"
+ "ld1w { z16.s }, p2/Z, [x9, #-2, MUL VL]\n"
+ "ldr x22, [x16, #0x108]\n"
+ "fmla z30.s, p2/M, z21.s, z22.s\n"
+ "fmla z31.s, p2/M, z21.s, z19.s\n"
+ "ld1w { z26.s }, p3/Z, [x21, x13, LSL #2]\n"
+ "ldr x21, [x16, #0x110]\n"
+ "fmla z5.s, p2/M, z21.s, z10.s\n"
+ "fmla z29.s, p2/M, z21.s, z0.s\n"
+ "ld1w { z25.s }, p2/Z, [x9, #-1, MUL VL]\n"
+ "fmla z30.s, p2/M, z4.s, z19.s\n"
+ "fmla z31.s, p2/M, z4.s, z18.s\n"
+ "ld1w { z24.s }, p3/Z, [x27, x13, LSL #2]\n"
+ "ldr x20, [x16, #0x118]\n"
+ "fmla z5.s, p2/M, z4.s, z0.s\n"
+ "fmla z29.s, p2/M, z4.s, z26.s\n"
+ "ld1w { z23.s }, p2/Z, [x9]\n"
+ "fmla z30.s, p2/M, z6.s, z18.s\n"
+ "fmla z31.s, p2/M, z6.s, z9.s\n"
+ "ld1w { z22.s }, p3/Z, [x25, x13, LSL #2]\n"
+ "fmla z5.s, p2/M, z6.s, z26.s\n"
+ "fmla z29.s, p2/M, z6.s, z24.s\n"
+ "ld1w { z21.s }, p2/Z, [x9, #1, MUL VL]\n"
+ "fmla z30.s, p2/M, z11.s, z9.s\n"
+ "fmla z31.s, p2/M, z11.s, z8.s\n"
+ "ld1w { z18.s }, p3/Z, [x24, x13, LSL #2]\n"
+ "fmla z5.s, p2/M, z11.s, z24.s\n"
+ "fmla z29.s, p2/M, z11.s, z27.s\n"
+ "ld1w { z20.s }, p2/Z, [x9, #2, MUL VL]\n"
+ "fmla z30.s, p2/M, z16.s, z8.s\n"
+ "fmla z31.s, p2/M, z16.s, z17.s\n"
+ "ld1w { z17.s }, p3/Z, [x26, x13, LSL #2]\n"
+ "fmla z5.s, p2/M, z16.s, z27.s\n"
+ "fmla z29.s, p2/M, z16.s, z22.s\n"
+ "ld1w { z19.s }, p2/Z, [x9, #3, MUL VL]\n"
+ "fmla z30.s, p2/M, z25.s, z10.s\n"
+ "fmla z31.s, p2/M, z25.s, z0.s\n"
+ "ld1w { z16.s }, p3/Z, [x23, x13, LSL #2]\n"
+ "fmla z5.s, p2/M, z25.s, z18.s\n"
+ "fmla z29.s, p2/M, z25.s, z17.s\n"
+ "ld1w { z18.s }, p3/Z, [x22, x13, LSL #2]\n"
+ "fmla z30.s, p2/M, z23.s, z0.s\n"
+ "fmla z31.s, p2/M, z23.s, z26.s\n"
+ "fmla z5.s, p2/M, z23.s, z17.s\n"
+ "fmla z29.s, p2/M, z23.s, z16.s\n"
+ "ld1w { z17.s }, p3/Z, [x21, x13, LSL #2]\n"
+ "fmla z30.s, p2/M, z21.s, z26.s\n"
+ "fmla z31.s, p2/M, z21.s, z24.s\n"
+ "fmla z5.s, p2/M, z21.s, z16.s\n"
+ "fmla z29.s, p2/M, z21.s, z18.s\n"
+ "ld1w { z16.s }, p3/Z, [x20, x13, LSL #2]\n"
+ "fmla z30.s, p2/M, z20.s, z24.s\n"
+ "fmla z31.s, p2/M, z20.s, z27.s\n"
+ "fmla z5.s, p2/M, z20.s, z18.s\n"
+ "fmla z29.s, p2/M, z20.s, z17.s\n"
+ "fmla z30.s, p2/M, z19.s, z27.s\n"
+ "fmla z31.s, p2/M, z19.s, z22.s\n"
+ "fmax z30.s, p2/M, z30.s, z15.s\n"
+ "fmax z31.s, p2/M, z31.s, z15.s\n"
+ "fmla z5.s, p2/M, z19.s, z17.s\n"
+ "fmla z29.s, p2/M, z19.s, z16.s\n"
+ "fmax z5.s, p2/M, z5.s, z15.s\n"
+ "fmax z29.s, p2/M, z29.s, z15.s\n"
+ "fmin z30.s, p2/M, z30.s, z28.s\n"
+ "fmin z31.s, p2/M, z31.s, z28.s\n"
+ "st1w { z30.s }, p0, [x15, x28, LSL #2]\n"
+ "fmin z5.s, p2/M, z5.s, z28.s\n"
+ "fmin z29.s, p2/M, z29.s, z28.s\n"
+ "st1w { z31.s }, p0, [x14, x28, LSL #2]\n"
+ "st1w { z5.s }, p0, [x12, x28, LSL #2]\n"
+ "st1w { z29.s }, p0, [x11, x28, LSL #2]\n"
+ :
+ : [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_generic_output9_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_generic_output9_mla_depthfirst.hpp
new file mode 100644
index 0000000000..6b155fc855
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_generic_output9_mla_depthfirst.hpp
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_fp32_nhwc_generic_output9_mla_depthfirst_impl(const float *const *const, float *const *const, const void *, const void *, const unsigned int, const unsigned int, const float, const float);
+
+class sve_fp32_nhwc_generic_output9_mla_depthfirst : public GenericDepthfirstKernelStrategy<float, float, float, float>
+{
+ KernelType kernel = sve_fp32_nhwc_generic_output9_mla_depthfirst_impl;
+
+ public:
+ sve_fp32_nhwc_generic_output9_mla_depthfirst(const CPUInfo *) : GenericDepthfirstKernelStrategy<float, float, float, float>(9, arm_gemm::VLType::SVE) {}
+
+ KernelType get_kernel() const override { return kernel; }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_generic_output9_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_generic_output9_mla_depthfirst/generic.cpp
new file mode 100644
index 0000000000..d53daaa8a0
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_generic_output9_mla_depthfirst/generic.cpp
@@ -0,0 +1,166 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_fp32_nhwc_generic_output9_mla_depthfirst_impl(
+ const float *const *const inptrs,
+ float *const *const outptrs,
+ const void *params,
+ const void *bias,
+ const unsigned int n_points,
+ const unsigned int n_channels,
+ const float activation_min,
+ const float activation_max
+)
+{
+ const float minmax_vals[2] = { activation_min, activation_max };
+
+ __asm__ __volatile__(
+ "ptrue p1.b\n"
+ "mov x11, #0x0\n"
+ "ld1rw { z2.s }, p1/Z, [%x[minmax_vals]]\n"
+ "ld1rw { z1.s }, p1/Z, [%x[minmax_vals], #4]\n"
+ "whilelt p0.s, x11, %x[n_channels]\n"
+ "1:" // Channel loop
+ "mov z23.b, #0x0\n"
+ "cbz %x[bias], 2f\n"
+ "ld1w { z23.s }, p0/Z, [%x[bias], x11, LSL #2]\n"
+ "2:" // Channel loop: Load bias: Done
+ "mov x10, %x[inptrs]\n"
+ "ldp x28, x27, [x10], #0x10\n"
+ "ldp x26, x25, [x10], #0x10\n"
+ "subs x9, %x[n_points], #0x1\n"
+ "ldp x24, x23, [x10], #0x10\n"
+ "ldp x22, x21, [x10], #0x10\n"
+ "mov z24.d, z23.d\n"
+ "mov z25.d, z23.d\n"
+ "ldr x20, [x10], #0x8\n"
+ "mov z26.d, z23.d\n"
+ "mov z27.d, z23.d\n"
+ "ld1w { z0.s }, p1/Z, [%x[params]]\n"
+ "mov z28.d, z23.d\n"
+ "mov z29.d, z23.d\n"
+ "ld1w { z14.s }, p0/Z, [x28, x11, LSL #2]\n"
+ "ld1w { z15.s }, p0/Z, [x27, x11, LSL #2]\n"
+ "mov z30.d, z23.d\n"
+ "mov z31.d, z23.d\n"
+ "ld1w { z16.s }, p0/Z, [x26, x11, LSL #2]\n"
+ "ld1w { z17.s }, p0/Z, [x25, x11, LSL #2]\n"
+ "ld1w { z18.s }, p0/Z, [x24, x11, LSL #2]\n"
+ "ld1w { z19.s }, p0/Z, [x23, x11, LSL #2]\n"
+ "addvl %x[params], %x[params], #1\n"
+ "ld1w { z20.s }, p0/Z, [x22, x11, LSL #2]\n"
+ "ld1w { z21.s }, p0/Z, [x21, x11, LSL #2]\n"
+ "ld1w { z22.s }, p0/Z, [x20, x11, LSL #2]\n"
+ "ble 4f\n"
+ "3:" // Channel loop: Planar loop
+ "ldp x28, x27, [x10], #0x10\n"
+ "ldp x26, x25, [x10], #0x10\n"
+ "subs x9, x9, #0x1\n"
+ "fmla z23.s, p1/M, z14.s, z0.s\n"
+ "ldp x24, x23, [x10], #0x10\n"
+ "ldp x22, x21, [x10], #0x10\n"
+ "fmla z24.s, p1/M, z15.s, z0.s\n"
+ "fmla z25.s, p1/M, z16.s, z0.s\n"
+ "ldr x20, [x10], #0x8\n"
+ "fmla z26.s, p1/M, z17.s, z0.s\n"
+ "fmla z27.s, p1/M, z18.s, z0.s\n"
+ "ld1w { z14.s }, p0/Z, [x28, x11, LSL #2]\n"
+ "fmla z28.s, p1/M, z19.s, z0.s\n"
+ "fmla z29.s, p1/M, z20.s, z0.s\n"
+ "ld1w { z15.s }, p0/Z, [x27, x11, LSL #2]\n"
+ "ld1w { z16.s }, p0/Z, [x26, x11, LSL #2]\n"
+ "fmla z30.s, p1/M, z21.s, z0.s\n"
+ "fmla z31.s, p1/M, z22.s, z0.s\n"
+ "ld1w { z0.s }, p1/Z, [%x[params]]\n"
+ "ld1w { z17.s }, p0/Z, [x25, x11, LSL #2]\n"
+ "ld1w { z18.s }, p0/Z, [x24, x11, LSL #2]\n"
+ "ld1w { z19.s }, p0/Z, [x23, x11, LSL #2]\n"
+ "addvl %x[params], %x[params], #1\n"
+ "ld1w { z20.s }, p0/Z, [x22, x11, LSL #2]\n"
+ "ld1w { z21.s }, p0/Z, [x21, x11, LSL #2]\n"
+ "ld1w { z22.s }, p0/Z, [x20, x11, LSL #2]\n"
+ "bgt 3b\n"
+ "4:" // Channel loop: Planar tail
+ "fmla z23.s, p1/M, z14.s, z0.s\n"
+ "fmla z24.s, p1/M, z15.s, z0.s\n"
+ "fmax z23.s, p1/M, z23.s, z2.s\n"
+ "fmax z24.s, p1/M, z24.s, z2.s\n"
+ "fmla z25.s, p1/M, z16.s, z0.s\n"
+ "fmla z26.s, p1/M, z17.s, z0.s\n"
+ "fmax z25.s, p1/M, z25.s, z2.s\n"
+ "fmax z26.s, p1/M, z26.s, z2.s\n"
+ "fmla z27.s, p1/M, z18.s, z0.s\n"
+ "fmla z28.s, p1/M, z19.s, z0.s\n"
+ "fmax z27.s, p1/M, z27.s, z2.s\n"
+ "fmax z28.s, p1/M, z28.s, z2.s\n"
+ "fmla z29.s, p1/M, z20.s, z0.s\n"
+ "fmla z30.s, p1/M, z21.s, z0.s\n"
+ "fmax z29.s, p1/M, z29.s, z2.s\n"
+ "fmax z30.s, p1/M, z30.s, z2.s\n"
+ "fmla z31.s, p1/M, z22.s, z0.s\n"
+ "fmax z31.s, p1/M, z31.s, z2.s\n"
+ "ldp x28, x27, [%x[outptrs], #0x0]\n"
+ "ldp x26, x25, [%x[outptrs], #0x10]\n"
+ "ldp x24, x23, [%x[outptrs], #0x20]\n"
+ "ldp x22, x21, [%x[outptrs], #0x30]\n"
+ "fmin z23.s, p1/M, z23.s, z1.s\n"
+ "fmin z24.s, p1/M, z24.s, z1.s\n"
+ "ldr x20, [%x[outptrs], #0x40]\n"
+ "fmin z25.s, p1/M, z25.s, z1.s\n"
+ "fmin z26.s, p1/M, z26.s, z1.s\n"
+ "st1w { z23.s }, p0, [x28, x11, LSL #2]\n"
+ "fmin z27.s, p1/M, z27.s, z1.s\n"
+ "fmin z28.s, p1/M, z28.s, z1.s\n"
+ "st1w { z24.s }, p0, [x27, x11, LSL #2]\n"
+ "fmin z29.s, p1/M, z29.s, z1.s\n"
+ "fmin z30.s, p1/M, z30.s, z1.s\n"
+ "st1w { z25.s }, p0, [x26, x11, LSL #2]\n"
+ "fmin z31.s, p1/M, z31.s, z1.s\n"
+ "st1w { z26.s }, p0, [x25, x11, LSL #2]\n"
+ "st1w { z27.s }, p0, [x24, x11, LSL #2]\n"
+ "st1w { z28.s }, p0, [x23, x11, LSL #2]\n"
+ "st1w { z29.s }, p0, [x22, x11, LSL #2]\n"
+ "st1w { z30.s }, p0, [x21, x11, LSL #2]\n"
+ "st1w { z31.s }, p0, [x20, x11, LSL #2]\n"
+ "incw x11\n"
+ "whilelt p0.s, x11, %x[n_channels]\n"
+ "b.any 1b\n"
+ : [params] "+&r" (params)
+ : [bias] "r" (bias), [inptrs] "r" (inptrs), [minmax_vals] "r" (minmax_vals), [n_channels] "r" ((uint64_t) n_channels), [n_points] "r" ((uint64_t) n_points), [outptrs] "r" (outptrs)
+ : "cc", "memory", "p0", "p1", "x9", "x10", "x11", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst.hpp
new file mode 100644
index 0000000000..eb1b111c36
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst.hpp
@@ -0,0 +1,61 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst_impl(const float *const *const, float *const *const, const void *, const unsigned int, const float, const float);
+
+struct sve_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst : DepthfirstMultiplierStrategy<float, float, float, float>
+{
+ using Parent = DepthfirstMultiplierStrategy<float, float, float, float>;
+ constexpr static unsigned int kernel_rows = 3;
+ constexpr static unsigned int kernel_cols = 3;
+
+ constexpr static unsigned int stride_rows = 2;
+ constexpr static unsigned int stride_cols = 2;
+
+ sve_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst(const CPUInfo *)
+ : Parent(3, 3, kernel_rows, kernel_cols, stride_rows, stride_cols)
+ {
+ }
+
+ arm_gemm::VLType get_vl_type() const override { return arm_gemm::VLType::SVE; }
+
+ Parent::KernelType kernel = sve_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst_impl;
+ Parent::KernelType get_kernel(void) const override { return kernel; }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst/generic.cpp
new file mode 100644
index 0000000000..3a71baaf61
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst/generic.cpp
@@ -0,0 +1,259 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst_impl(
+ const float *const *const inptrs,
+ float *const *const outptrs,
+ const void *params,
+ const unsigned int n_output_channels,
+ const float activation_min,
+ const float activation_max
+)
+{
+ const float minmax_vals[2] = { activation_min, activation_max };
+
+ __asm__ __volatile__(
+ "mov x17, #0x0\n"
+ "whilelt p2.s, x17, %x[channel_multiplier]\n"
+ "ldr x16, [%x[inptrs], #0x0]\n"
+ "ldr x15, [%x[inptrs], #0x8]\n"
+ "ptrue p1.b\n"
+ "ldr x14, [%x[inptrs], #0x10]\n"
+ "ldr x13, [%x[inptrs], #0x18]\n"
+ "mov x12, #0x0\n"
+ "ldr x11, [%x[inptrs], #0x20]\n"
+ "ldr x10, [%x[inptrs], #0x28]\n"
+ "ldr x9, [%x[inptrs], #0x30]\n"
+ "ld1w { z24.s }, p2/Z, [%x[params]]\n"
+ "mov z21.d, z24.d\n"
+ "mov z25.d, z24.d\n"
+ "ldp x28, x27, [%x[outptrs], #0x0]\n"
+ "ldp x26, x25, [%x[outptrs], #0x10]\n"
+ "mov z27.d, z24.d\n"
+ "mov z26.d, z24.d\n"
+ "ldp x24, x23, [%x[outptrs], #0x20]\n"
+ "ldp x22, x21, [%x[outptrs], #0x30]\n"
+ "mov z28.d, z24.d\n"
+ "mov z20.d, z24.d\n"
+ "ldr x20, [%x[outptrs], #0x40]\n"
+ "ld1rqw { z2.s }, p1/Z, [x16]\n"
+ "mov z23.d, z24.d\n"
+ "mov z19.d, z24.d\n"
+ "ld1rqw { z3.s }, p1/Z, [x16, #16]\n"
+ "ld1rqw { z4.s }, p1/Z, [x15]\n"
+ "ld1rqw { z5.s }, p1/Z, [x15, #16]\n"
+ "ld1rqw { z6.s }, p1/Z, [x14]\n"
+ "ld1rqw { z7.s }, p1/Z, [x14, #16]\n"
+ "ld1rqw { z8.s }, p1/Z, [x13]\n"
+ "ld1rqw { z9.s }, p1/Z, [x13, #16]\n"
+ "ld1rqw { z10.s }, p1/Z, [x11]\n"
+ "ld1rqw { z11.s }, p1/Z, [x11, #16]\n"
+ "ld1rqw { z12.s }, p1/Z, [x10]\n"
+ "ld1rqw { z13.s }, p1/Z, [x10, #16]\n"
+ "ld1rqw { z14.s }, p1/Z, [x9]\n"
+ "ld1rqw { z15.s }, p1/Z, [x9, #16]\n"
+ "ld1rw { z22.s }, p1/Z, [%x[clamps]]\n"
+ "ld1rw { z16.s }, p1/Z, [%x[clamps], #4]\n"
+ "ld1w { z31.s }, p2/Z, [%x[params], #1, MUL VL]\n"
+ "ld1w { z30.s }, p2/Z, [%x[params], #2, MUL VL]\n"
+ "ld1w { z29.s }, p2/Z, [%x[params], #3, MUL VL]\n"
+ "addvl %x[params], %x[params], #4\n"
+ "1:" // Output channel complete vector loop
+ "fmla z24.s, z31.s, z2.s[0]\n"
+ "fmla z27.s, z31.s, z6.s[0]\n"
+ "mov z1.d, z10.d\n"
+ "incw x17\n"
+ "fmla z26.s, z31.s, z6.s[2]\n"
+ "fmla z28.s, z31.s, z7.s[0]\n"
+ "mov z0.d, z11.d\n"
+ "mov p0.b, p2.b\n"
+ "fmla z21.s, z31.s, z2.s[2]\n"
+ "fmla z25.s, z31.s, z3.s[0]\n"
+ "whilelt p2.s, x17, %x[channel_multiplier]\n"
+ "fmla z20.s, z31.s, z1.s[0]\n"
+ "fmla z23.s, z31.s, z1.s[2]\n"
+ "fmla z19.s, z31.s, z0.s[0]\n"
+ "fmla z24.s, z30.s, z2.s[1]\n"
+ "ld1w { z18.s }, p1/Z, [%x[params]]\n"
+ "fmla z27.s, z30.s, z6.s[1]\n"
+ "fmla z26.s, z30.s, z6.s[3]\n"
+ "fmla z28.s, z30.s, z7.s[1]\n"
+ "fmla z21.s, z30.s, z2.s[3]\n"
+ "fmla z25.s, z30.s, z3.s[1]\n"
+ "fmla z20.s, z30.s, z1.s[1]\n"
+ "fmla z23.s, z30.s, z1.s[3]\n"
+ "fmla z19.s, z30.s, z0.s[1]\n"
+ "ld1w { z17.s }, p1/Z, [%x[params], #1, MUL VL]\n"
+ "fmla z24.s, z29.s, z2.s[2]\n"
+ "fmla z27.s, z29.s, z6.s[2]\n"
+ "fmla z26.s, z29.s, z7.s[0]\n"
+ "fmla z28.s, z29.s, z7.s[2]\n"
+ "fmla z21.s, z29.s, z3.s[0]\n"
+ "fmla z25.s, z29.s, z3.s[2]\n"
+ "fmla z20.s, z29.s, z1.s[2]\n"
+ "fmla z23.s, z29.s, z0.s[0]\n"
+ "mov z1.d, z8.d\n"
+ "fmla z19.s, z29.s, z0.s[2]\n"
+ "mov z0.d, z9.d\n"
+ "fmla z24.s, z18.s, z4.s[0]\n"
+ "ld1w { z31.s }, p1/Z, [%x[params], #2, MUL VL]\n"
+ "fmla z27.s, z18.s, z1.s[0]\n"
+ "fmla z26.s, z18.s, z1.s[2]\n"
+ "mov z1.d, z12.d\n"
+ "fmla z28.s, z18.s, z0.s[0]\n"
+ "mov z0.d, z13.d\n"
+ "fmla z21.s, z18.s, z4.s[2]\n"
+ "fmla z25.s, z18.s, z5.s[0]\n"
+ "fmla z20.s, z18.s, z1.s[0]\n"
+ "fmla z23.s, z18.s, z1.s[2]\n"
+ "fmla z19.s, z18.s, z0.s[0]\n"
+ "mov z1.d, z8.d\n"
+ "ld1w { z18.s }, p1/Z, [%x[params], #3, MUL VL]\n"
+ "mov z0.d, z9.d\n"
+ "fmla z24.s, z17.s, z4.s[1]\n"
+ "fmla z27.s, z17.s, z1.s[1]\n"
+ "fmla z26.s, z17.s, z1.s[3]\n"
+ "fmla z28.s, z17.s, z0.s[1]\n"
+ "mov z1.d, z12.d\n"
+ "mov z0.d, z13.d\n"
+ "fmla z21.s, z17.s, z4.s[3]\n"
+ "fmla z25.s, z17.s, z5.s[1]\n"
+ "fmla z20.s, z17.s, z1.s[1]\n"
+ "fmla z23.s, z17.s, z1.s[3]\n"
+ "mov z1.d, z8.d\n"
+ "fmla z19.s, z17.s, z0.s[1]\n"
+ "mov z0.d, z9.d\n"
+ "fmla z24.s, z31.s, z4.s[2]\n"
+ "ld1w { z17.s }, p1/Z, [%x[params], #4, MUL VL]\n"
+ "fmla z27.s, z31.s, z1.s[2]\n"
+ "fmla z26.s, z31.s, z0.s[0]\n"
+ "mov z1.d, z12.d\n"
+ "fmla z28.s, z31.s, z0.s[2]\n"
+ "mov z0.d, z13.d\n"
+ "fmla z21.s, z31.s, z5.s[0]\n"
+ "fmla z25.s, z31.s, z5.s[2]\n"
+ "fmla z20.s, z31.s, z1.s[2]\n"
+ "mov z1.d, z10.d\n"
+ "fmla z23.s, z31.s, z0.s[0]\n"
+ "fmla z19.s, z31.s, z0.s[2]\n"
+ "mov z0.d, z11.d\n"
+ "ld1w { z29.s }, p1/Z, [%x[params], #5, MUL VL]\n"
+ "fmla z24.s, z18.s, z6.s[0]\n"
+ "fmla z27.s, z18.s, z1.s[0]\n"
+ "fmla z26.s, z18.s, z1.s[2]\n"
+ "fmla z28.s, z18.s, z0.s[0]\n"
+ "mov z1.d, z14.d\n"
+ "mov z0.d, z15.d\n"
+ "fmla z21.s, z18.s, z6.s[2]\n"
+ "fmla z25.s, z18.s, z7.s[0]\n"
+ "fmla z20.s, z18.s, z1.s[0]\n"
+ "fmla z23.s, z18.s, z1.s[2]\n"
+ "mov z1.d, z10.d\n"
+ "fmla z19.s, z18.s, z0.s[0]\n"
+ "mov z0.d, z11.d\n"
+ "fmla z24.s, z17.s, z6.s[1]\n"
+ "ld1w { z31.s }, p2/Z, [%x[params], #7, MUL VL]\n"
+ "fmla z27.s, z17.s, z1.s[1]\n"
+ "fmla z26.s, z17.s, z1.s[3]\n"
+ "mov z1.d, z14.d\n"
+ "fmla z28.s, z17.s, z0.s[1]\n"
+ "mov z0.d, z15.d\n"
+ "fmla z21.s, z17.s, z6.s[3]\n"
+ "fmla z25.s, z17.s, z7.s[1]\n"
+ "fmla z20.s, z17.s, z1.s[1]\n"
+ "fmla z23.s, z17.s, z1.s[3]\n"
+ "fmla z19.s, z17.s, z0.s[1]\n"
+ "mov z1.d, z10.d\n"
+ "mov z0.d, z11.d\n"
+ "fmla z24.s, z29.s, z6.s[2]\n"
+ "fmla z27.s, z29.s, z1.s[2]\n"
+ "fmin z24.s, p1/M, z24.s, z16.s\n"
+ "fmla z26.s, z29.s, z0.s[0]\n"
+ "fmla z28.s, z29.s, z0.s[2]\n"
+ "mov z1.d, z14.d\n"
+ "fmax z24.s, p1/M, z24.s, z22.s\n"
+ "mov z0.d, z15.d\n"
+ "fmla z21.s, z29.s, z7.s[0]\n"
+ "fmla z25.s, z29.s, z7.s[2]\n"
+ "fmin z21.s, p1/M, z21.s, z16.s\n"
+ "fmla z20.s, z29.s, z1.s[2]\n"
+ "fmla z23.s, z29.s, z0.s[0]\n"
+ "fmin z25.s, p1/M, z25.s, z16.s\n"
+ "fmin z27.s, p1/M, z27.s, z16.s\n"
+ "fmla z19.s, z29.s, z0.s[2]\n"
+ "fmin z26.s, p1/M, z26.s, z16.s\n"
+ "fmin z28.s, p1/M, z28.s, z16.s\n"
+ "st1w { z24.s }, p0, [x28, x12, LSL #2]\n"
+ "fmin z20.s, p1/M, z20.s, z16.s\n"
+ "fmin z23.s, p1/M, z23.s, z16.s\n"
+ "ld1w { z24.s }, p2/Z, [%x[params], #6, MUL VL]\n"
+ "fmin z19.s, p1/M, z19.s, z16.s\n"
+ "addvl %x[params], %x[params], #16\n"
+ "ld1w { z30.s }, p2/Z, [%x[params], #-8, MUL VL]\n"
+ "ld1w { z29.s }, p2/Z, [%x[params], #-7, MUL VL]\n"
+ "fmax z21.s, p1/M, z21.s, z22.s\n"
+ "fmax z25.s, p1/M, z25.s, z22.s\n"
+ "st1w { z21.s }, p0, [x27, x12, LSL #2]\n"
+ "mov z21.d, z24.d\n"
+ "fmax z27.s, p1/M, z27.s, z22.s\n"
+ "fmax z26.s, p1/M, z26.s, z22.s\n"
+ "st1w { z25.s }, p0, [x26, x12, LSL #2]\n"
+ "mov z25.d, z24.d\n"
+ "fmax z28.s, p1/M, z28.s, z22.s\n"
+ "fmax z20.s, p1/M, z20.s, z22.s\n"
+ "st1w { z27.s }, p0, [x25, x12, LSL #2]\n"
+ "mov z27.d, z24.d\n"
+ "fmax z23.s, p1/M, z23.s, z22.s\n"
+ "fmax z19.s, p1/M, z19.s, z22.s\n"
+ "st1w { z26.s }, p0, [x24, x12, LSL #2]\n"
+ "mov z26.d, z24.d\n"
+ "st1w { z28.s }, p0, [x23, x12, LSL #2]\n"
+ "mov z28.d, z24.d\n"
+ "addvl %x[params], %x[params], #-6\n"
+ "st1w { z20.s }, p0, [x22, x12, LSL #2]\n"
+ "mov z20.d, z24.d\n"
+ "st1w { z23.s }, p0, [x21, x12, LSL #2]\n"
+ "mov z23.d, z24.d\n"
+ "st1w { z19.s }, p0, [x20, x12, LSL #2]\n"
+ "incw x12\n"
+ "mov z19.d, z24.d\n"
+ "b.any 1b\n"
+ : [params] "+&r" (params)
+ : [channel_multiplier] "r" (n_output_channels), [clamps] "r" (minmax_vals), [inptrs] "r" (inptrs), [outptrs] "r" (outptrs)
+ : "cc", "memory", "p0", "p1", "p2", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst.hpp
new file mode 100644
index 0000000000..cc0c4236a8
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst.hpp
@@ -0,0 +1,61 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst_impl(const float *const *const, float *const *const, const void *, const unsigned int, const float, const float);
+
+struct sve_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst : DepthfirstMultiplierStrategy<float, float, float, float>  // Strategy descriptor binding the hand-written 5x5, stride-1, channel-multiplier FP32 SVE kernel into the depthfirst driver.
+{
+ using Parent = DepthfirstMultiplierStrategy<float, float, float, float>;  // input, weight, output and accumulator types are all fp32
+ constexpr static unsigned int kernel_rows = 5;  // 5x5 convolution window
+ constexpr static unsigned int kernel_cols = 5;
+
+ constexpr static unsigned int stride_rows = 1;  // unit stride in both spatial dimensions
+ constexpr static unsigned int stride_cols = 1;
+
+ sve_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst(const CPUInfo *)  // CPUInfo argument is unused here; the parent records the 2x4 output tile and the kernel geometry
+ : Parent(2, 4, kernel_rows, kernel_cols, stride_rows, stride_cols)
+ {
+ }
+
+ arm_gemm::VLType get_vl_type() const override { return arm_gemm::VLType::SVE; }  // vector length is determined by the SVE hardware at run time
+
+ Parent::KernelType kernel = sve_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst_impl;  // assembly implementation (defined in the matching generic.cpp)
+ Parent::KernelType get_kernel(void) const override { return kernel; }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst/generic.cpp
new file mode 100644
index 0000000000..84ab4b5035
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst/generic.cpp
@@ -0,0 +1,392 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst_impl(  // Hand-scheduled SVE kernel: 5x5, stride-1 depthwise convolution with channel multiplier, producing a 2x4 spatial output tile per call.
+ const float *const *const inptrs,  // input row pointers; the asm reads six of them (offsets 0x0..0x28) and loads two 128-bit quadwords from each
+ float *const *const outptrs,  // output pointers; the asm reads eight of them (offsets 0x0..0x38), one per output element of the 2x4 tile
+ const void *params,  // packed per-channel weights (and, presumably, bias in the leading vector — confirm against the packing routine); advanced in place via addvl
+ const unsigned int n_output_channels,  // bound for the per-channel whilelt predicate loop
+ const float activation_min,  // lower activation clamp bound
+ const float activation_max  // upper activation clamp bound
+)
+{
+ const float minmax_vals[2] = { activation_min, activation_max };  // packed pair so both clamp bounds are loadable from one base pointer in the asm
+
+ __asm__ __volatile__(
+ "mov x15, #0x0\n"
+ "whilelt p2.s, x15, %x[channel_multiplier]\n"
+ "ldr x14, [%x[inptrs], #0x0]\n"
+ "ldr x13, [%x[inptrs], #0x8]\n"
+ "ptrue p1.b\n"
+ "ldr x12, [%x[inptrs], #0x10]\n"
+ "ldr x11, [%x[inptrs], #0x18]\n"
+ "mov x10, #0x0\n"
+ "ldr x9, [%x[inptrs], #0x20]\n"
+ "ldr x28, [%x[inptrs], #0x28]\n"
+ "ld1w { z16.s }, p2/Z, [%x[params]]\n"
+ "ldp x27, x26, [%x[outptrs], #0x0]\n"
+ "mov z25.d, z16.d\n"
+ "mov z15.d, z16.d\n"
+ "ldp x25, x24, [%x[outptrs], #0x10]\n"
+ "ldp x23, x22, [%x[outptrs], #0x20]\n"
+ "mov z24.d, z16.d\n"
+ "mov z14.d, z16.d\n"
+ "ldp x21, x20, [%x[outptrs], #0x30]\n"
+ "ld1rqw { z2.s }, p1/Z, [x14]\n"
+ "mov z26.d, z16.d\n"
+ "mov z17.d, z16.d\n"
+ "ld1rqw { z3.s }, p1/Z, [x14, #16]\n"
+ "ld1rqw { z4.s }, p1/Z, [x13]\n"
+ "mov z23.d, z16.d\n"
+ "ld1rqw { z5.s }, p1/Z, [x13, #16]\n"
+ "ld1rqw { z6.s }, p1/Z, [x12]\n"
+ "ld1rqw { z7.s }, p1/Z, [x12, #16]\n"
+ "ld1rqw { z8.s }, p1/Z, [x11]\n"
+ "ld1rqw { z9.s }, p1/Z, [x11, #16]\n"
+ "ld1rqw { z10.s }, p1/Z, [x9]\n"
+ "ld1rqw { z11.s }, p1/Z, [x9, #16]\n"
+ "ld1rqw { z12.s }, p1/Z, [x28]\n"
+ "ld1rqw { z13.s }, p1/Z, [x28, #16]\n"
+ "ld1rw { z21.s }, p1/Z, [%x[clamps]]\n"
+ "ld1rw { z22.s }, p1/Z, [%x[clamps], #4]\n"
+ "ld1w { z31.s }, p2/Z, [%x[params], #1, MUL VL]\n"
+ "ld1w { z30.s }, p2/Z, [%x[params], #2, MUL VL]\n"
+ "ld1w { z29.s }, p2/Z, [%x[params], #3, MUL VL]\n"
+ "ld1w { z28.s }, p2/Z, [%x[params], #4, MUL VL]\n"
+ "ld1w { z27.s }, p2/Z, [%x[params], #5, MUL VL]\n"
+ "addvl %x[params], %x[params], #6\n"
+ "1:" // Output channel complete vector loop
+ "fmla z16.s, z31.s, z2.s[0]\n"
+ "fmla z25.s, z31.s, z2.s[1]\n"
+ "mov z0.d, z8.d\n"
+ "incw x15\n"
+ "fmla z15.s, z31.s, z2.s[2]\n"
+ "fmla z24.s, z31.s, z2.s[3]\n"
+ "mov z1.d, z9.d\n"
+ "mov p0.b, p2.b\n"
+ "fmla z14.s, z31.s, z4.s[0]\n"
+ "fmla z26.s, z31.s, z4.s[1]\n"
+ "whilelt p2.s, x15, %x[channel_multiplier]\n"
+ "fmla z17.s, z31.s, z4.s[2]\n"
+ "fmla z23.s, z31.s, z4.s[3]\n"
+ "ld1w { z20.s }, p1/Z, [%x[params]]\n"
+ "fmla z16.s, z30.s, z2.s[1]\n"
+ "fmla z25.s, z30.s, z2.s[2]\n"
+ "fmla z15.s, z30.s, z2.s[3]\n"
+ "fmla z24.s, z30.s, z3.s[0]\n"
+ "fmla z14.s, z30.s, z4.s[1]\n"
+ "fmla z26.s, z30.s, z4.s[2]\n"
+ "fmla z17.s, z30.s, z4.s[3]\n"
+ "fmla z23.s, z30.s, z5.s[0]\n"
+ "ld1w { z19.s }, p1/Z, [%x[params], #1, MUL VL]\n"
+ "fmla z16.s, z29.s, z2.s[2]\n"
+ "fmla z25.s, z29.s, z2.s[3]\n"
+ "fmla z15.s, z29.s, z3.s[0]\n"
+ "fmla z24.s, z29.s, z3.s[1]\n"
+ "fmla z14.s, z29.s, z4.s[2]\n"
+ "fmla z26.s, z29.s, z4.s[3]\n"
+ "fmla z17.s, z29.s, z5.s[0]\n"
+ "fmla z23.s, z29.s, z5.s[1]\n"
+ "ld1w { z18.s }, p1/Z, [%x[params], #2, MUL VL]\n"
+ "fmla z16.s, z28.s, z2.s[3]\n"
+ "fmla z25.s, z28.s, z3.s[0]\n"
+ "fmla z15.s, z28.s, z3.s[1]\n"
+ "fmla z24.s, z28.s, z3.s[2]\n"
+ "fmla z14.s, z28.s, z4.s[3]\n"
+ "fmla z26.s, z28.s, z5.s[0]\n"
+ "fmla z17.s, z28.s, z5.s[1]\n"
+ "fmla z23.s, z28.s, z5.s[2]\n"
+ "ld1w { z28.s }, p1/Z, [%x[params], #3, MUL VL]\n"
+ "fmla z16.s, z27.s, z3.s[0]\n"
+ "fmla z25.s, z27.s, z3.s[1]\n"
+ "fmla z15.s, z27.s, z3.s[2]\n"
+ "fmla z24.s, z27.s, z3.s[3]\n"
+ "fmla z14.s, z27.s, z5.s[0]\n"
+ "fmla z26.s, z27.s, z5.s[1]\n"
+ "fmla z17.s, z27.s, z5.s[2]\n"
+ "fmla z23.s, z27.s, z5.s[3]\n"
+ "ld1w { z27.s }, p1/Z, [%x[params], #4, MUL VL]\n"
+ "fmla z16.s, z20.s, z4.s[0]\n"
+ "fmla z25.s, z20.s, z4.s[1]\n"
+ "fmla z15.s, z20.s, z4.s[2]\n"
+ "fmla z24.s, z20.s, z4.s[3]\n"
+ "fmla z14.s, z20.s, z6.s[0]\n"
+ "fmla z26.s, z20.s, z6.s[1]\n"
+ "fmla z17.s, z20.s, z6.s[2]\n"
+ "fmla z23.s, z20.s, z6.s[3]\n"
+ "ld1w { z20.s }, p1/Z, [%x[params], #5, MUL VL]\n"
+ "fmla z16.s, z19.s, z4.s[1]\n"
+ "fmla z25.s, z19.s, z4.s[2]\n"
+ "fmla z15.s, z19.s, z4.s[3]\n"
+ "fmla z24.s, z19.s, z5.s[0]\n"
+ "fmla z14.s, z19.s, z6.s[1]\n"
+ "fmla z26.s, z19.s, z6.s[2]\n"
+ "fmla z17.s, z19.s, z6.s[3]\n"
+ "fmla z23.s, z19.s, z7.s[0]\n"
+ "ld1w { z19.s }, p1/Z, [%x[params], #6, MUL VL]\n"
+ "fmla z16.s, z18.s, z4.s[2]\n"
+ "fmla z25.s, z18.s, z4.s[3]\n"
+ "fmla z15.s, z18.s, z5.s[0]\n"
+ "fmla z24.s, z18.s, z5.s[1]\n"
+ "fmla z14.s, z18.s, z6.s[2]\n"
+ "fmla z26.s, z18.s, z6.s[3]\n"
+ "fmla z17.s, z18.s, z7.s[0]\n"
+ "fmla z23.s, z18.s, z7.s[1]\n"
+ "ld1w { z18.s }, p1/Z, [%x[params], #7, MUL VL]\n"
+ "addvl %x[params], %x[params], #16\n"
+ "fmla z16.s, z28.s, z4.s[3]\n"
+ "fmla z25.s, z28.s, z5.s[0]\n"
+ "fmla z15.s, z28.s, z5.s[1]\n"
+ "fmla z24.s, z28.s, z5.s[2]\n"
+ "fmla z14.s, z28.s, z6.s[3]\n"
+ "fmla z26.s, z28.s, z7.s[0]\n"
+ "fmla z17.s, z28.s, z7.s[1]\n"
+ "fmla z23.s, z28.s, z7.s[2]\n"
+ "ld1w { z30.s }, p1/Z, [%x[params], #-8, MUL VL]\n"
+ "fmla z16.s, z27.s, z5.s[0]\n"
+ "fmla z25.s, z27.s, z5.s[1]\n"
+ "fmla z15.s, z27.s, z5.s[2]\n"
+ "fmla z24.s, z27.s, z5.s[3]\n"
+ "fmla z14.s, z27.s, z7.s[0]\n"
+ "fmla z26.s, z27.s, z7.s[1]\n"
+ "fmla z17.s, z27.s, z7.s[2]\n"
+ "fmla z23.s, z27.s, z7.s[3]\n"
+ "ld1w { z27.s }, p1/Z, [%x[params], #-7, MUL VL]\n"
+ "fmla z16.s, z20.s, z6.s[0]\n"
+ "fmla z25.s, z20.s, z6.s[1]\n"
+ "fmla z15.s, z20.s, z6.s[2]\n"
+ "fmla z24.s, z20.s, z6.s[3]\n"
+ "fmla z14.s, z20.s, z0.s[0]\n"
+ "fmla z26.s, z20.s, z0.s[1]\n"
+ "fmla z17.s, z20.s, z0.s[2]\n"
+ "fmla z23.s, z20.s, z0.s[3]\n"
+ "ld1w { z20.s }, p1/Z, [%x[params], #-6, MUL VL]\n"
+ "fmla z16.s, z19.s, z6.s[1]\n"
+ "fmla z25.s, z19.s, z6.s[2]\n"
+ "fmla z15.s, z19.s, z6.s[3]\n"
+ "fmla z24.s, z19.s, z7.s[0]\n"
+ "fmla z14.s, z19.s, z0.s[1]\n"
+ "fmla z26.s, z19.s, z0.s[2]\n"
+ "fmla z17.s, z19.s, z0.s[3]\n"
+ "fmla z23.s, z19.s, z1.s[0]\n"
+ "ld1w { z19.s }, p1/Z, [%x[params], #-5, MUL VL]\n"
+ "fmla z16.s, z18.s, z6.s[2]\n"
+ "fmla z25.s, z18.s, z6.s[3]\n"
+ "fmla z15.s, z18.s, z7.s[0]\n"
+ "fmla z24.s, z18.s, z7.s[1]\n"
+ "fmla z14.s, z18.s, z0.s[2]\n"
+ "fmla z26.s, z18.s, z0.s[3]\n"
+ "fmla z17.s, z18.s, z1.s[0]\n"
+ "fmla z23.s, z18.s, z1.s[1]\n"
+ "ld1w { z18.s }, p1/Z, [%x[params], #-4, MUL VL]\n"
+ "fmla z16.s, z30.s, z6.s[3]\n"
+ "fmla z25.s, z30.s, z7.s[0]\n"
+ "fmla z15.s, z30.s, z7.s[1]\n"
+ "fmla z24.s, z30.s, z7.s[2]\n"
+ "fmla z14.s, z30.s, z0.s[3]\n"
+ "fmla z26.s, z30.s, z1.s[0]\n"
+ "fmla z17.s, z30.s, z1.s[1]\n"
+ "fmla z23.s, z30.s, z1.s[2]\n"
+ "ld1w { z31.s }, p1/Z, [%x[params], #-3, MUL VL]\n"
+ "fmla z16.s, z27.s, z7.s[0]\n"
+ "fmla z25.s, z27.s, z7.s[1]\n"
+ "fmla z15.s, z27.s, z7.s[2]\n"
+ "fmla z24.s, z27.s, z7.s[3]\n"
+ "fmla z14.s, z27.s, z1.s[0]\n"
+ "fmla z26.s, z27.s, z1.s[1]\n"
+ "fmla z17.s, z27.s, z1.s[2]\n"
+ "fmla z23.s, z27.s, z1.s[3]\n"
+ "ld1w { z27.s }, p1/Z, [%x[params], #-2, MUL VL]\n"
+ "fmla z16.s, z20.s, z0.s[0]\n"
+ "fmla z25.s, z20.s, z0.s[1]\n"
+ "fmla z15.s, z20.s, z0.s[2]\n"
+ "fmla z24.s, z20.s, z0.s[3]\n"
+ "mov z0.d, z10.d\n"
+ "fmla z14.s, z20.s, z0.s[0]\n"
+ "fmla z26.s, z20.s, z0.s[1]\n"
+ "fmla z17.s, z20.s, z0.s[2]\n"
+ "fmla z23.s, z20.s, z0.s[3]\n"
+ "mov z0.d, z8.d\n"
+ "ld1w { z20.s }, p1/Z, [%x[params], #-1, MUL VL]\n"
+ "fmla z16.s, z19.s, z0.s[1]\n"
+ "fmla z25.s, z19.s, z0.s[2]\n"
+ "fmla z15.s, z19.s, z0.s[3]\n"
+ "fmla z24.s, z19.s, z1.s[0]\n"
+ "mov z1.d, z10.d\n"
+ "mov z0.d, z11.d\n"
+ "fmla z14.s, z19.s, z1.s[1]\n"
+ "fmla z26.s, z19.s, z1.s[2]\n"
+ "fmla z17.s, z19.s, z1.s[3]\n"
+ "fmla z23.s, z19.s, z0.s[0]\n"
+ "mov z1.d, z8.d\n"
+ "ld1w { z19.s }, p1/Z, [%x[params]]\n"
+ "mov z0.d, z9.d\n"
+ "fmla z16.s, z18.s, z1.s[2]\n"
+ "fmla z25.s, z18.s, z1.s[3]\n"
+ "fmla z15.s, z18.s, z0.s[0]\n"
+ "fmla z24.s, z18.s, z0.s[1]\n"
+ "mov z1.d, z10.d\n"
+ "mov z0.d, z11.d\n"
+ "fmla z14.s, z18.s, z1.s[2]\n"
+ "fmla z26.s, z18.s, z1.s[3]\n"
+ "fmla z17.s, z18.s, z0.s[0]\n"
+ "fmla z23.s, z18.s, z0.s[1]\n"
+ "mov z1.d, z8.d\n"
+ "ld1w { z18.s }, p1/Z, [%x[params], #1, MUL VL]\n"
+ "mov z0.d, z9.d\n"
+ "fmla z16.s, z31.s, z1.s[3]\n"
+ "fmla z25.s, z31.s, z0.s[0]\n"
+ "fmla z15.s, z31.s, z0.s[1]\n"
+ "fmla z24.s, z31.s, z0.s[2]\n"
+ "mov z0.d, z10.d\n"
+ "mov z1.d, z11.d\n"
+ "fmla z14.s, z31.s, z0.s[3]\n"
+ "fmla z26.s, z31.s, z1.s[0]\n"
+ "fmla z17.s, z31.s, z1.s[1]\n"
+ "fmla z23.s, z31.s, z1.s[2]\n"
+ "mov z1.d, z9.d\n"
+ "ld1w { z28.s }, p1/Z, [%x[params], #2, MUL VL]\n"
+ "fmla z16.s, z27.s, z1.s[0]\n"
+ "fmla z25.s, z27.s, z1.s[1]\n"
+ "fmla z15.s, z27.s, z1.s[2]\n"
+ "fmla z24.s, z27.s, z1.s[3]\n"
+ "mov z1.d, z11.d\n"
+ "fmla z14.s, z27.s, z1.s[0]\n"
+ "fmla z26.s, z27.s, z1.s[1]\n"
+ "fmla z17.s, z27.s, z1.s[2]\n"
+ "fmla z23.s, z27.s, z1.s[3]\n"
+ "ld1w { z27.s }, p1/Z, [%x[params], #3, MUL VL]\n"
+ "fmla z16.s, z20.s, z0.s[0]\n"
+ "fmla z25.s, z20.s, z0.s[1]\n"
+ "fmla z15.s, z20.s, z0.s[2]\n"
+ "fmla z24.s, z20.s, z0.s[3]\n"
+ "mov z0.d, z12.d\n"
+ "fmla z14.s, z20.s, z0.s[0]\n"
+ "fmla z26.s, z20.s, z0.s[1]\n"
+ "fmla z17.s, z20.s, z0.s[2]\n"
+ "fmla z23.s, z20.s, z0.s[3]\n"
+ "mov z0.d, z10.d\n"
+ "ld1w { z31.s }, p2/Z, [%x[params], #5, MUL VL]\n"
+ "fmla z16.s, z19.s, z0.s[1]\n"
+ "fmla z25.s, z19.s, z0.s[2]\n"
+ "fmla z15.s, z19.s, z0.s[3]\n"
+ "fmla z24.s, z19.s, z1.s[0]\n"
+ "mov z1.d, z12.d\n"
+ "mov z0.d, z13.d\n"
+ "fmla z14.s, z19.s, z1.s[1]\n"
+ "fmla z26.s, z19.s, z1.s[2]\n"
+ "fmla z17.s, z19.s, z1.s[3]\n"
+ "fmla z23.s, z19.s, z0.s[0]\n"
+ "mov z1.d, z10.d\n"
+ "ld1w { z30.s }, p2/Z, [%x[params], #6, MUL VL]\n"
+ "mov z0.d, z11.d\n"
+ "fmla z16.s, z18.s, z1.s[2]\n"
+ "fmla z25.s, z18.s, z1.s[3]\n"
+ "fmla z15.s, z18.s, z0.s[0]\n"
+ "fmla z24.s, z18.s, z0.s[1]\n"
+ "mov z1.d, z12.d\n"
+ "mov z0.d, z13.d\n"
+ "fmla z14.s, z18.s, z1.s[2]\n"
+ "fmla z26.s, z18.s, z1.s[3]\n"
+ "fmla z17.s, z18.s, z0.s[0]\n"
+ "fmla z23.s, z18.s, z0.s[1]\n"
+ "mov z1.d, z10.d\n"
+ "ld1w { z29.s }, p2/Z, [%x[params], #7, MUL VL]\n"
+ "mov z0.d, z11.d\n"
+ "fmla z16.s, z28.s, z1.s[3]\n"
+ "fmla z25.s, z28.s, z0.s[0]\n"
+ "fmla z15.s, z28.s, z0.s[1]\n"
+ "fmla z24.s, z28.s, z0.s[2]\n"
+ "mov z0.d, z13.d\n"
+ "mov z1.d, z12.d\n"
+ "fmla z26.s, z28.s, z0.s[0]\n"
+ "fmla z17.s, z28.s, z0.s[1]\n"
+ "fmla z23.s, z28.s, z0.s[2]\n"
+ "mov z0.d, z11.d\n"
+ "fmla z14.s, z28.s, z1.s[3]\n"
+ "fmla z16.s, z27.s, z0.s[0]\n"
+ "fmla z25.s, z27.s, z0.s[1]\n"
+ "fmin z16.s, p1/M, z16.s, z22.s\n" // activation clamp begins: z22 holds activation_max, z21 holds activation_min
+ "fmax z16.s, p1/M, z16.s, z21.s\n"
+ "fmla z15.s, z27.s, z0.s[2]\n"
+ "fmla z24.s, z27.s, z0.s[3]\n"
+ "mov z0.d, z13.d\n"
+ "fmin z25.s, p1/M, z25.s, z22.s\n"
+ "fmla z14.s, z27.s, z0.s[0]\n"
+ "fmla z26.s, z27.s, z0.s[1]\n"
+ "fmin z15.s, p1/M, z15.s, z22.s\n"
+ "fmin z24.s, p1/M, z24.s, z22.s\n"
+ "fmla z17.s, z27.s, z0.s[2]\n"
+ "fmla z23.s, z27.s, z0.s[3]\n"
+ "fmin z14.s, p1/M, z14.s, z22.s\n"
+ "fmin z26.s, p1/M, z26.s, z22.s\n"
+ "fmin z17.s, p1/M, z17.s, z22.s\n"
+ "fmin z23.s, p1/M, z23.s, z22.s\n"
+ "st1w { z16.s }, p0, [x27, x10, LSL #2]\n" // store the 2x4 tile, one output pointer per element, then re-seed accumulators from the next channel's leading params vector
+ "ld1w { z16.s }, p2/Z, [%x[params], #4, MUL VL]\n"
+ "addvl %x[params], %x[params], #16\n"
+ "fmax z25.s, p1/M, z25.s, z21.s\n"
+ "st1w { z25.s }, p0, [x26, x10, LSL #2]\n"
+ "mov z25.d, z16.d\n"
+ "fmax z15.s, p1/M, z15.s, z21.s\n"
+ "fmax z24.s, p1/M, z24.s, z21.s\n"
+ "st1w { z15.s }, p0, [x25, x10, LSL #2]\n"
+ "mov z15.d, z16.d\n"
+ "fmax z14.s, p1/M, z14.s, z21.s\n"
+ "fmax z26.s, p1/M, z26.s, z21.s\n"
+ "st1w { z24.s }, p0, [x24, x10, LSL #2]\n"
+ "mov z24.d, z16.d\n"
+ "fmax z17.s, p1/M, z17.s, z21.s\n"
+ "fmax z23.s, p1/M, z23.s, z21.s\n"
+ "st1w { z14.s }, p0, [x23, x10, LSL #2]\n"
+ "mov z14.d, z16.d\n"
+ "st1w { z26.s }, p0, [x22, x10, LSL #2]\n"
+ "mov z26.d, z16.d\n"
+ "ld1w { z28.s }, p2/Z, [%x[params], #-8, MUL VL]\n"
+ "ld1w { z27.s }, p2/Z, [%x[params], #-7, MUL VL]\n"
+ "st1w { z17.s }, p0, [x21, x10, LSL #2]\n"
+ "mov z17.d, z16.d\n"
+ "addvl %x[params], %x[params], #-6\n"
+ "st1w { z23.s }, p0, [x20, x10, LSL #2]\n"
+ "incw x10\n" // x10 indexes the output channel vector; whilelt above keeps lanes past n_output_channels inactive
+ "mov z23.d, z16.d\n"
+ "b.any 1b\n" // loop while whilelt left any channel lane active
+ : [params] "+&r" (params)
+ : [channel_multiplier] "r" (n_output_channels), [clamps] "r" (minmax_vals), [inptrs] "r" (inptrs), [outptrs] "r" (outptrs)
+ : "cc", "memory", "p0", "p1", "p2", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp
new file mode 100644
index 0000000000..f83767d8ae
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp
@@ -0,0 +1,52 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_impl(const float *const *const, float *const *const, const float *, const float *, const unsigned int, const unsigned int, const float, const float);
+
+struct sve_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst : GenericDepthfirstMultiplierKernelStrategy<float, float, float, float>  // Strategy descriptor for the generic (arbitrary kernel-point count) channel-multiplier FP32 SVE kernel producing a 2x8 output tile.
+{
+ using Parent = GenericDepthfirstMultiplierKernelStrategy<float, float, float, float>;  // input, weight, output and accumulator types are all fp32
+ sve_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst(const CPUInfo *)  // CPUInfo argument is unused; the parent records the 2x8 output tile and the SVE vector-length type
+ : Parent(2, 8, arm_gemm::VLType::SVE)
+ {
+ }
+ Parent::KernelType kernel = sve_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_impl;  // assembly implementation (defined in the matching generic.cpp)
+ Parent::KernelType get_kernel(void) const override { return kernel; }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp
new file mode 100644
index 0000000000..1770ec182c
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp
@@ -0,0 +1,454 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_impl(
+ const float *const *const inptrs,
+ float *const *const outptrs,
+ const float *weights,
+ const float *bias,
+ const unsigned int kernel_points,
+ const unsigned int n_output_channels,
+ const float activation_min,
+ const float activation_max
+)
+{
+ const float minmax_vals[2] = { activation_min, activation_max };
+
+ __asm__ __volatile__(
+ "ptrue p1.b\n"
+ "mov x9, #0x0\n"
+ "ld1rw { z15.s }, p1/Z, [%x[minmax_vals]]\n"
+ "ld1rw { z14.s }, p1/Z, [%x[minmax_vals], #4]\n"
+ "whilelt p0.s, x9, %x[n_output_channels]\n"
+ "1:" // Output channel loop
+ "mov z31.b, #0x0\n"
+ "cbz %x[bias], 2f\n"
+ "ld1w { z31.s }, p0/Z, [%x[bias], x9, LSL #2]\n"
+ "2:" // Output channel loop: Load bias: Done
+ "mov x23, %x[inptrs]\n"
+ "ldp x21, x20, [x23], #0x10\n"
+ "lsr x22, %x[kernel_points], #0x1\n"
+ "mov z16.d, z31.d\n"
+ "mov z17.d, z31.d\n"
+ "mov z18.d, z31.d\n"
+ "ld1rqw { z6.s }, p1/Z, [x21]\n"
+ "ld1rqw { z5.s }, p1/Z, [x21, #16]\n"
+ "mov z19.d, z31.d\n"
+ "mov z20.d, z31.d\n"
+ "ld1rqw { z1.s }, p1/Z, [x20]\n"
+ "ld1rqw { z2.s }, p1/Z, [x20, #16]\n"
+ "mov z21.d, z31.d\n"
+ "mov z22.d, z31.d\n"
+ "ld1w { z8.s }, p1/Z, [%x[weights]]\n"
+ "addvl %x[weights], %x[weights], #1\n"
+ "mov z23.d, z31.d\n"
+ "mov z24.d, z31.d\n"
+ "mov z25.d, z31.d\n"
+ "mov z26.d, z31.d\n"
+ "mov z27.d, z31.d\n"
+ "mov z28.d, z31.d\n"
+ "mov z29.d, z31.d\n"
+ "mov z30.d, z31.d\n"
+ "mov z31.d, z31.d\n"
+ "cbz x22, 6f\n"
+ "ldp x21, x20, [x23], #0x10\n"
+ "subs x22, x22, #0x1\n"
+ "ld1rqw { z0.s }, p1/Z, [x21]\n"
+ "ld1rqw { z4.s }, p1/Z, [x21, #16]\n"
+ "ld1rqw { z7.s }, p1/Z, [x20]\n"
+ "ld1rqw { z3.s }, p1/Z, [x20, #16]\n"
+ "ld1w { z11.s }, p1/Z, [%x[weights]]\n"
+ "addvl %x[weights], %x[weights], #1\n"
+ "beq 4f\n"
+ "3:" // Output channel loop: Kernel loop
+ "ldp x21, x20, [x23], #0x10\n"
+ "fmla z16.s, z8.s, z6.s[0]\n"
+ "fmla z17.s, z8.s, z6.s[1]\n"
+ "subs x22, x22, #0x1\n"
+ "fmla z18.s, z8.s, z6.s[2]\n"
+ "fmla z19.s, z8.s, z6.s[3]\n"
+ "ld1rqw { z6.s }, p1/Z, [x21]\n"
+ "fmla z20.s, z8.s, z5.s[0]\n"
+ "fmla z21.s, z8.s, z5.s[1]\n"
+ "fmla z22.s, z8.s, z5.s[2]\n"
+ "fmla z23.s, z8.s, z5.s[3]\n"
+ "ld1rqw { z5.s }, p1/Z, [x21, #16]\n"
+ "fmla z24.s, z8.s, z1.s[0]\n"
+ "fmla z25.s, z8.s, z1.s[1]\n"
+ "fmla z26.s, z8.s, z1.s[2]\n"
+ "fmla z27.s, z8.s, z1.s[3]\n"
+ "ld1rqw { z1.s }, p1/Z, [x20]\n"
+ "fmla z28.s, z8.s, z2.s[0]\n"
+ "fmla z29.s, z8.s, z2.s[1]\n"
+ "fmla z30.s, z8.s, z2.s[2]\n"
+ "fmla z31.s, z8.s, z2.s[3]\n"
+ "ld1rqw { z2.s }, p1/Z, [x20, #16]\n"
+ "ldp x21, x20, [x23], #0x10\n"
+ "ld1w { z8.s }, p1/Z, [%x[weights]]\n"
+ "fmla z16.s, z11.s, z0.s[0]\n"
+ "fmla z17.s, z11.s, z0.s[1]\n"
+ "fmla z18.s, z11.s, z0.s[2]\n"
+ "fmla z19.s, z11.s, z0.s[3]\n"
+ "ld1rqw { z0.s }, p1/Z, [x21]\n"
+ "fmla z20.s, z11.s, z4.s[0]\n"
+ "fmla z21.s, z11.s, z4.s[1]\n"
+ "fmla z22.s, z11.s, z4.s[2]\n"
+ "fmla z23.s, z11.s, z4.s[3]\n"
+ "ld1rqw { z4.s }, p1/Z, [x21, #16]\n"
+ "fmla z24.s, z11.s, z7.s[0]\n"
+ "fmla z25.s, z11.s, z7.s[1]\n"
+ "fmla z26.s, z11.s, z7.s[2]\n"
+ "fmla z27.s, z11.s, z7.s[3]\n"
+ "ld1rqw { z7.s }, p1/Z, [x20]\n"
+ "fmla z28.s, z11.s, z3.s[0]\n"
+ "fmla z29.s, z11.s, z3.s[1]\n"
+ "fmla z30.s, z11.s, z3.s[2]\n"
+ "fmla z31.s, z11.s, z3.s[3]\n"
+ "ld1rqw { z3.s }, p1/Z, [x20, #16]\n"
+ "ld1w { z11.s }, p1/Z, [%x[weights], #1, MUL VL]\n"
+ "addvl %x[weights], %x[weights], #2\n"
+ "bgt 3b\n"
+ "4:" // Output channel loop: Kernel loop tail
+ "tbnz %x[kernel_points], #0, 5f\n"
+ "fmla z16.s, z8.s, z6.s[0]\n"
+ "fmla z17.s, z8.s, z6.s[1]\n"
+ "ldr x27, [%x[outptrs], #0x0]\n"
+ "ldr x26, [%x[outptrs], #0x8]\n"
+ "fmla z18.s, z8.s, z6.s[2]\n"
+ "fmla z19.s, z8.s, z6.s[3]\n"
+ "ldr x25, [%x[outptrs], #0x10]\n"
+ "ldr x24, [%x[outptrs], #0x18]\n"
+ "fmla z20.s, z8.s, z5.s[0]\n"
+ "fmla z21.s, z8.s, z5.s[1]\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "ldr x22, [%x[outptrs], #0x28]\n"
+ "fmla z22.s, z8.s, z5.s[2]\n"
+ "fmla z23.s, z8.s, z5.s[3]\n"
+ "ldr x21, [%x[outptrs], #0x30]\n"
+ "ldr x20, [%x[outptrs], #0x38]\n"
+ "fmla z24.s, z8.s, z1.s[0]\n"
+ "fmla z25.s, z8.s, z1.s[1]\n"
+ "fmla z26.s, z8.s, z1.s[2]\n"
+ "fmla z27.s, z8.s, z1.s[3]\n"
+ "fmla z28.s, z8.s, z2.s[0]\n"
+ "fmla z29.s, z8.s, z2.s[1]\n"
+ "fmla z30.s, z8.s, z2.s[2]\n"
+ "fmla z31.s, z8.s, z2.s[3]\n"
+ "fmla z16.s, z11.s, z0.s[0]\n"
+ "fmla z17.s, z11.s, z0.s[1]\n"
+ "fmin z16.s, p1/M, z16.s, z14.s\n"
+ "fmin z17.s, p1/M, z17.s, z14.s\n"
+ "fmla z18.s, z11.s, z0.s[2]\n"
+ "fmla z19.s, z11.s, z0.s[3]\n"
+ "fmin z18.s, p1/M, z18.s, z14.s\n"
+ "fmin z19.s, p1/M, z19.s, z14.s\n"
+ "fmla z20.s, z11.s, z4.s[0]\n"
+ "fmla z21.s, z11.s, z4.s[1]\n"
+ "fmin z20.s, p1/M, z20.s, z14.s\n"
+ "fmin z21.s, p1/M, z21.s, z14.s\n"
+ "fmla z22.s, z11.s, z4.s[2]\n"
+ "fmla z23.s, z11.s, z4.s[3]\n"
+ "fmin z22.s, p1/M, z22.s, z14.s\n"
+ "fmin z23.s, p1/M, z23.s, z14.s\n"
+ "fmla z24.s, z11.s, z7.s[0]\n"
+ "fmla z25.s, z11.s, z7.s[1]\n"
+ "fmax z16.s, p1/M, z16.s, z15.s\n"
+ "fmax z17.s, p1/M, z17.s, z15.s\n"
+ "fmla z26.s, z11.s, z7.s[2]\n"
+ "fmla z27.s, z11.s, z7.s[3]\n"
+ "fmax z18.s, p1/M, z18.s, z15.s\n"
+ "fmax z19.s, p1/M, z19.s, z15.s\n"
+ "fmla z28.s, z11.s, z3.s[0]\n"
+ "fmla z29.s, z11.s, z3.s[1]\n"
+ "fmax z20.s, p1/M, z20.s, z15.s\n"
+ "fmax z21.s, p1/M, z21.s, z15.s\n"
+ "fmla z30.s, z11.s, z3.s[2]\n"
+ "fmla z31.s, z11.s, z3.s[3]\n"
+ "fmax z22.s, p1/M, z22.s, z15.s\n"
+ "fmax z23.s, p1/M, z23.s, z15.s\n"
+ "fmin z24.s, p1/M, z24.s, z14.s\n"
+ "fmin z25.s, p1/M, z25.s, z14.s\n"
+ "st1w { z16.s }, p0, [x27, x9, LSL #2]\n"
+ "ldr x27, [%x[outptrs], #0x40]\n"
+ "fmin z26.s, p1/M, z26.s, z14.s\n"
+ "fmin z27.s, p1/M, z27.s, z14.s\n"
+ "st1w { z17.s }, p0, [x26, x9, LSL #2]\n"
+ "ldr x26, [%x[outptrs], #0x48]\n"
+ "fmin z28.s, p1/M, z28.s, z14.s\n"
+ "fmin z29.s, p1/M, z29.s, z14.s\n"
+ "st1w { z18.s }, p0, [x25, x9, LSL #2]\n"
+ "ldr x25, [%x[outptrs], #0x50]\n"
+ "fmin z30.s, p1/M, z30.s, z14.s\n"
+ "fmin z31.s, p1/M, z31.s, z14.s\n"
+ "st1w { z19.s }, p0, [x24, x9, LSL #2]\n"
+ "ldr x24, [%x[outptrs], #0x58]\n"
+ "st1w { z20.s }, p0, [x23, x9, LSL #2]\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
+ "fmax z24.s, p1/M, z24.s, z15.s\n"
+ "fmax z25.s, p1/M, z25.s, z15.s\n"
+ "st1w { z21.s }, p0, [x22, x9, LSL #2]\n"
+ "ldr x22, [%x[outptrs], #0x68]\n"
+ "fmax z26.s, p1/M, z26.s, z15.s\n"
+ "fmax z27.s, p1/M, z27.s, z15.s\n"
+ "st1w { z22.s }, p0, [x21, x9, LSL #2]\n"
+ "ldr x21, [%x[outptrs], #0x70]\n"
+ "fmax z28.s, p1/M, z28.s, z15.s\n"
+ "fmax z29.s, p1/M, z29.s, z15.s\n"
+ "st1w { z23.s }, p0, [x20, x9, LSL #2]\n"
+ "ldr x20, [%x[outptrs], #0x78]\n"
+ "fmax z30.s, p1/M, z30.s, z15.s\n"
+ "fmax z31.s, p1/M, z31.s, z15.s\n"
+ "st1w { z24.s }, p0, [x27, x9, LSL #2]\n"
+ "st1w { z25.s }, p0, [x26, x9, LSL #2]\n"
+ "st1w { z26.s }, p0, [x25, x9, LSL #2]\n"
+ "st1w { z27.s }, p0, [x24, x9, LSL #2]\n"
+ "st1w { z28.s }, p0, [x23, x9, LSL #2]\n"
+ "st1w { z29.s }, p0, [x22, x9, LSL #2]\n"
+ "st1w { z30.s }, p0, [x21, x9, LSL #2]\n"
+ "st1w { z31.s }, p0, [x20, x9, LSL #2]\n"
+ "b 7f\n"
+ "5:" // Output channel loop: Odd tail
+ "fmla z16.s, z8.s, z6.s[0]\n"
+ "fmla z17.s, z8.s, z6.s[1]\n"
+ "ldp x20, x28, [x23], #0x10\n"
+ "ldr x27, [%x[outptrs], #0x0]\n"
+ "fmla z18.s, z8.s, z6.s[2]\n"
+ "fmla z19.s, z8.s, z6.s[3]\n"
+ "ld1rqw { z6.s }, p1/Z, [x20]\n"
+ "ldr x26, [%x[outptrs], #0x8]\n"
+ "fmla z20.s, z8.s, z5.s[0]\n"
+ "fmla z21.s, z8.s, z5.s[1]\n"
+ "ldr x25, [%x[outptrs], #0x10]\n"
+ "ldr x24, [%x[outptrs], #0x18]\n"
+ "fmla z22.s, z8.s, z5.s[2]\n"
+ "fmla z23.s, z8.s, z5.s[3]\n"
+ "ld1rqw { z5.s }, p1/Z, [x20, #16]\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "fmla z24.s, z8.s, z1.s[0]\n"
+ "fmla z25.s, z8.s, z1.s[1]\n"
+ "ldr x22, [%x[outptrs], #0x28]\n"
+ "ldr x21, [%x[outptrs], #0x30]\n"
+ "fmla z26.s, z8.s, z1.s[2]\n"
+ "fmla z27.s, z8.s, z1.s[3]\n"
+ "ld1rqw { z1.s }, p1/Z, [x28]\n"
+ "ldr x20, [%x[outptrs], #0x38]\n"
+ "fmla z28.s, z8.s, z2.s[0]\n"
+ "fmla z29.s, z8.s, z2.s[1]\n"
+ "fmla z30.s, z8.s, z2.s[2]\n"
+ "fmla z31.s, z8.s, z2.s[3]\n"
+ "ld1w { z10.s }, p1/Z, [%x[weights]]\n"
+ "ld1rqw { z2.s }, p1/Z, [x28, #16]\n"
+ "fmla z16.s, z11.s, z0.s[0]\n"
+ "fmla z17.s, z11.s, z0.s[1]\n"
+ "addvl %x[weights], %x[weights], #1\n"
+ "fmla z18.s, z11.s, z0.s[2]\n"
+ "fmla z19.s, z11.s, z0.s[3]\n"
+ "fmla z20.s, z11.s, z4.s[0]\n"
+ "fmla z21.s, z11.s, z4.s[1]\n"
+ "fmla z22.s, z11.s, z4.s[2]\n"
+ "fmla z23.s, z11.s, z4.s[3]\n"
+ "fmla z24.s, z11.s, z7.s[0]\n"
+ "fmla z25.s, z11.s, z7.s[1]\n"
+ "fmla z26.s, z11.s, z7.s[2]\n"
+ "fmla z27.s, z11.s, z7.s[3]\n"
+ "fmla z28.s, z11.s, z3.s[0]\n"
+ "fmla z29.s, z11.s, z3.s[1]\n"
+ "fmla z30.s, z11.s, z3.s[2]\n"
+ "fmla z31.s, z11.s, z3.s[3]\n"
+ "fmla z16.s, z10.s, z6.s[0]\n"
+ "fmla z17.s, z10.s, z6.s[1]\n"
+ "fmin z16.s, p1/M, z16.s, z14.s\n"
+ "fmin z17.s, p1/M, z17.s, z14.s\n"
+ "fmla z18.s, z10.s, z6.s[2]\n"
+ "fmla z19.s, z10.s, z6.s[3]\n"
+ "fmin z18.s, p1/M, z18.s, z14.s\n"
+ "fmin z19.s, p1/M, z19.s, z14.s\n"
+ "fmla z20.s, z10.s, z5.s[0]\n"
+ "fmla z21.s, z10.s, z5.s[1]\n"
+ "fmin z20.s, p1/M, z20.s, z14.s\n"
+ "fmin z21.s, p1/M, z21.s, z14.s\n"
+ "fmla z22.s, z10.s, z5.s[2]\n"
+ "fmla z23.s, z10.s, z5.s[3]\n"
+ "fmin z22.s, p1/M, z22.s, z14.s\n"
+ "fmin z23.s, p1/M, z23.s, z14.s\n"
+ "fmla z24.s, z10.s, z1.s[0]\n"
+ "fmla z25.s, z10.s, z1.s[1]\n"
+ "fmax z16.s, p1/M, z16.s, z15.s\n"
+ "fmax z17.s, p1/M, z17.s, z15.s\n"
+ "fmla z26.s, z10.s, z1.s[2]\n"
+ "fmla z27.s, z10.s, z1.s[3]\n"
+ "fmax z18.s, p1/M, z18.s, z15.s\n"
+ "fmax z19.s, p1/M, z19.s, z15.s\n"
+ "fmla z28.s, z10.s, z2.s[0]\n"
+ "fmla z29.s, z10.s, z2.s[1]\n"
+ "fmax z20.s, p1/M, z20.s, z15.s\n"
+ "fmax z21.s, p1/M, z21.s, z15.s\n"
+ "fmla z30.s, z10.s, z2.s[2]\n"
+ "fmla z31.s, z10.s, z2.s[3]\n"
+ "fmax z22.s, p1/M, z22.s, z15.s\n"
+ "fmax z23.s, p1/M, z23.s, z15.s\n"
+ "fmin z24.s, p1/M, z24.s, z14.s\n"
+ "fmin z25.s, p1/M, z25.s, z14.s\n"
+ "st1w { z16.s }, p0, [x27, x9, LSL #2]\n"
+ "ldr x27, [%x[outptrs], #0x40]\n"
+ "fmin z26.s, p1/M, z26.s, z14.s\n"
+ "fmin z27.s, p1/M, z27.s, z14.s\n"
+ "st1w { z17.s }, p0, [x26, x9, LSL #2]\n"
+ "ldr x26, [%x[outptrs], #0x48]\n"
+ "fmin z28.s, p1/M, z28.s, z14.s\n"
+ "fmin z29.s, p1/M, z29.s, z14.s\n"
+ "st1w { z18.s }, p0, [x25, x9, LSL #2]\n"
+ "ldr x25, [%x[outptrs], #0x50]\n"
+ "fmin z30.s, p1/M, z30.s, z14.s\n"
+ "fmin z31.s, p1/M, z31.s, z14.s\n"
+ "st1w { z19.s }, p0, [x24, x9, LSL #2]\n"
+ "ldr x24, [%x[outptrs], #0x58]\n"
+ "st1w { z20.s }, p0, [x23, x9, LSL #2]\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
+ "fmax z24.s, p1/M, z24.s, z15.s\n"
+ "fmax z25.s, p1/M, z25.s, z15.s\n"
+ "st1w { z21.s }, p0, [x22, x9, LSL #2]\n"
+ "ldr x22, [%x[outptrs], #0x68]\n"
+ "fmax z26.s, p1/M, z26.s, z15.s\n"
+ "fmax z27.s, p1/M, z27.s, z15.s\n"
+ "st1w { z22.s }, p0, [x21, x9, LSL #2]\n"
+ "ldr x21, [%x[outptrs], #0x70]\n"
+ "fmax z28.s, p1/M, z28.s, z15.s\n"
+ "fmax z29.s, p1/M, z29.s, z15.s\n"
+ "st1w { z23.s }, p0, [x20, x9, LSL #2]\n"
+ "ldr x20, [%x[outptrs], #0x78]\n"
+ "fmax z30.s, p1/M, z30.s, z15.s\n"
+ "fmax z31.s, p1/M, z31.s, z15.s\n"
+ "st1w { z24.s }, p0, [x27, x9, LSL #2]\n"
+ "st1w { z25.s }, p0, [x26, x9, LSL #2]\n"
+ "st1w { z26.s }, p0, [x25, x9, LSL #2]\n"
+ "st1w { z27.s }, p0, [x24, x9, LSL #2]\n"
+ "st1w { z28.s }, p0, [x23, x9, LSL #2]\n"
+ "st1w { z29.s }, p0, [x22, x9, LSL #2]\n"
+ "st1w { z30.s }, p0, [x21, x9, LSL #2]\n"
+ "st1w { z31.s }, p0, [x20, x9, LSL #2]\n"
+ "b 7f\n"
+ "6:" // Output channel loop: Single kernel point
+ "fmla z16.s, z8.s, z6.s[0]\n"
+ "fmla z17.s, z8.s, z6.s[1]\n"
+ "fmin z16.s, p1/M, z16.s, z14.s\n"
+ "fmin z17.s, p1/M, z17.s, z14.s\n"
+ "fmla z18.s, z8.s, z6.s[2]\n"
+ "fmla z19.s, z8.s, z6.s[3]\n"
+ "fmin z18.s, p1/M, z18.s, z14.s\n"
+ "fmin z19.s, p1/M, z19.s, z14.s\n"
+ "fmla z20.s, z8.s, z5.s[0]\n"
+ "fmla z21.s, z8.s, z5.s[1]\n"
+ "fmin z20.s, p1/M, z20.s, z14.s\n"
+ "fmin z21.s, p1/M, z21.s, z14.s\n"
+ "fmla z22.s, z8.s, z5.s[2]\n"
+ "fmla z23.s, z8.s, z5.s[3]\n"
+ "fmin z22.s, p1/M, z22.s, z14.s\n"
+ "fmin z23.s, p1/M, z23.s, z14.s\n"
+ "fmla z24.s, z8.s, z1.s[0]\n"
+ "fmla z25.s, z8.s, z1.s[1]\n"
+ "ldr x27, [%x[outptrs], #0x0]\n"
+ "ldr x26, [%x[outptrs], #0x8]\n"
+ "fmla z26.s, z8.s, z1.s[2]\n"
+ "fmla z27.s, z8.s, z1.s[3]\n"
+ "ldr x25, [%x[outptrs], #0x10]\n"
+ "ldr x24, [%x[outptrs], #0x18]\n"
+ "fmla z28.s, z8.s, z2.s[0]\n"
+ "fmla z29.s, z8.s, z2.s[1]\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "ldr x22, [%x[outptrs], #0x28]\n"
+ "fmla z30.s, z8.s, z2.s[2]\n"
+ "fmla z31.s, z8.s, z2.s[3]\n"
+ "ldr x21, [%x[outptrs], #0x30]\n"
+ "ldr x20, [%x[outptrs], #0x38]\n"
+ "fmax z16.s, p1/M, z16.s, z15.s\n"
+ "fmax z17.s, p1/M, z17.s, z15.s\n"
+ "st1w { z16.s }, p0, [x27, x9, LSL #2]\n"
+ "ldr x27, [%x[outptrs], #0x40]\n"
+ "fmax z18.s, p1/M, z18.s, z15.s\n"
+ "fmax z19.s, p1/M, z19.s, z15.s\n"
+ "st1w { z17.s }, p0, [x26, x9, LSL #2]\n"
+ "ldr x26, [%x[outptrs], #0x48]\n"
+ "fmax z20.s, p1/M, z20.s, z15.s\n"
+ "fmax z21.s, p1/M, z21.s, z15.s\n"
+ "st1w { z18.s }, p0, [x25, x9, LSL #2]\n"
+ "ldr x25, [%x[outptrs], #0x50]\n"
+ "fmax z22.s, p1/M, z22.s, z15.s\n"
+ "fmax z23.s, p1/M, z23.s, z15.s\n"
+ "st1w { z19.s }, p0, [x24, x9, LSL #2]\n"
+ "ldr x24, [%x[outptrs], #0x58]\n"
+ "fmin z24.s, p1/M, z24.s, z14.s\n"
+ "fmin z25.s, p1/M, z25.s, z14.s\n"
+ "st1w { z20.s }, p0, [x23, x9, LSL #2]\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
+ "fmin z26.s, p1/M, z26.s, z14.s\n"
+ "fmin z27.s, p1/M, z27.s, z14.s\n"
+ "st1w { z21.s }, p0, [x22, x9, LSL #2]\n"
+ "ldr x22, [%x[outptrs], #0x68]\n"
+ "fmin z28.s, p1/M, z28.s, z14.s\n"
+ "fmin z29.s, p1/M, z29.s, z14.s\n"
+ "st1w { z22.s }, p0, [x21, x9, LSL #2]\n"
+ "ldr x21, [%x[outptrs], #0x70]\n"
+ "fmin z30.s, p1/M, z30.s, z14.s\n"
+ "fmin z31.s, p1/M, z31.s, z14.s\n"
+ "st1w { z23.s }, p0, [x20, x9, LSL #2]\n"
+ "ldr x20, [%x[outptrs], #0x78]\n"
+ "fmax z24.s, p1/M, z24.s, z15.s\n"
+ "fmax z25.s, p1/M, z25.s, z15.s\n"
+ "st1w { z24.s }, p0, [x27, x9, LSL #2]\n"
+ "fmax z26.s, p1/M, z26.s, z15.s\n"
+ "fmax z27.s, p1/M, z27.s, z15.s\n"
+ "st1w { z25.s }, p0, [x26, x9, LSL #2]\n"
+ "fmax z28.s, p1/M, z28.s, z15.s\n"
+ "fmax z29.s, p1/M, z29.s, z15.s\n"
+ "st1w { z26.s }, p0, [x25, x9, LSL #2]\n"
+ "fmax z30.s, p1/M, z30.s, z15.s\n"
+ "fmax z31.s, p1/M, z31.s, z15.s\n"
+ "st1w { z27.s }, p0, [x24, x9, LSL #2]\n"
+ "st1w { z28.s }, p0, [x23, x9, LSL #2]\n"
+ "st1w { z29.s }, p0, [x22, x9, LSL #2]\n"
+ "st1w { z30.s }, p0, [x21, x9, LSL #2]\n"
+ "st1w { z31.s }, p0, [x20, x9, LSL #2]\n"
+ "7:" // Output channel loop: Done
+ "incw x9\n"
+ "whilelt p0.s, x9, %x[n_output_channels]\n"
+ "b.any 1b\n"
+ : [weights] "+&r" (weights)
+ : [bias] "r" (bias), [inptrs] "r" (inptrs), [kernel_points] "r" ((uint64_t) kernel_points), [minmax_vals] "r" (minmax_vals), [n_output_channels] "r" ((uint64_t) n_output_channels), [outptrs] "r" (outptrs)
+ : "cc", "memory", "p0", "p1", "x9", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z10", "z11", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp
new file mode 100644
index 0000000000..04cf0d4036
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp
@@ -0,0 +1,76 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "utils.hpp"
+#include "src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst_impl(unsigned int, const int8_t *const *, const int8_t *, const int32_t *, const arm_gemm::Requantize32&, const int32_t *, const int32_t *, int8_t *const *);
+
+class sve_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst : public DepthwiseDepthfirstStrategy<int8_t, int8_t, int8_t, int32_t>
+{
+ using Parent = DepthwiseDepthfirstStrategy<int8_t, int8_t, int8_t, int32_t>;
+
+ public:
+ constexpr static unsigned int kernel_rows = 3;
+ constexpr static unsigned int kernel_cols = 3;
+
+ constexpr static unsigned int stride_rows = 1;
+ constexpr static unsigned int stride_cols = 1;
+
+ sve_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst(const CPUInfo *) : Parent(2, 2, 3, 3, 1, 1) {}
+
+ arm_gemm::VLType get_vl_type(void) const override { return arm_gemm::VLType::SVE; }
+
+ Parent::KernelType kernel = sve_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst_impl;
+ Parent::KernelType get_kernel(void) const override { return kernel; }
+ size_t get_storage_size(const DepthwiseArgs &args) const override
+ {
+ return interleave_sve_s8q_3x3_dot::get_packed_size(args);
+ }
+
+ void pack_parameters(
+ const DepthwiseArgs &args, void *buffer, const void *biases, const arm_gemm::Requantize32 &qp,
+ const void *weights, size_t ld_weight_col, size_t ld_weight_row
+ ) const override
+ {
+ interleave_sve_s8q_3x3_dot::pack_parameters(
+ args.input_channels * args.channel_multiplier, buffer, reinterpret_cast<const int32_t *>(biases),
+ reinterpret_cast<const int8_t *>(weights), qp, ld_weight_col, ld_weight_row
+ );
+ }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp
new file mode 100644
index 0000000000..0cee302c56
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp
@@ -0,0 +1,497 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+
+#include "arm_gemm.hpp"
+#include <cstdint>
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst_impl(const unsigned int n_channels, const int8_t *const *const inptrs, const int8_t *params, const int32_t *, const arm_gemm::Requantize32& qp, const int32_t *, const int32_t *, int8_t *const *const outptrs)
+{
+ __asm__ __volatile__(
+ "mov x14, #0x0\n"
+ "whilelt p0.b, x14, %x[n_channels]\n"
+ "ldp x27, x26, [%x[inptrs], #0x0]\n"
+ "ldp x25, x24, [%x[inptrs], #0x10]\n"
+ "ldp x23, x22, [%x[inptrs], #0x20]\n"
+ "ldp x13, x21, [%x[inptrs], #0x30]\n"
+ "mov x20, #0x1\n"
+ "ptrue p2.b\n"
+ "ldp x12, x11, [%x[outptrs], #0x0]\n"
+ "ldp x10, x9, [%x[outptrs], #0x10]\n"
+ "orr x20, x20, #0x100\n"
+ "orr x20, x20, #0x10000\n"
+ "ld1b { z15.b }, p0/Z, [x27, x14]\n"
+ "ld1b { z21.b }, p0/Z, [x26, x14]\n"
+ "dup z25.s, w20\n"
+ "mov x28, #0x0\n"
+ "ldp x27, x26, [%x[inptrs], #0x40]\n"
+ "ld1b { z31.b }, p0/Z, [x25, x14]\n"
+ "zip2 z16.b, z15.b, z31.b\n"
+ "zip1 z15.b, z15.b, z31.b\n"
+ "ld1b { z29.b }, p0/Z, [x24, x14]\n"
+ "ldp x25, x24, [%x[inptrs], #0x50]\n"
+ "zip1 z30.b, z21.b, z29.b\n"
+ "zip2 z29.b, z21.b, z29.b\n"
+ "ld1b { z9.b }, p0/Z, [x23, x14]\n"
+ "ld1b { z20.b }, p0/Z, [x22, x14]\n"
+ "zip2 z13.b, z15.b, z30.b\n"
+ "zip1 z15.b, z15.b, z30.b\n"
+ "ldp x23, x22, [%x[inptrs], #0x60]\n"
+ "ld1b { z5.b }, p0/Z, [x13, x14]\n"
+ "zip1 z14.b, z16.b, z29.b\n"
+ "zip2 z29.b, z16.b, z29.b\n"
+ "ld1b { z17.b }, p0/Z, [x21, x14]\n"
+ "ldp x21, x20, [%x[inptrs], #0x70]\n"
+ "zip2 z31.b, z9.b, z5.b\n"
+ "zip1 z9.b, z9.b, z5.b\n"
+ "ld1b { z18.b }, p0/Z, [x27, x14]\n"
+ "ld1b { z28.b }, p0/Z, [x26, x14]\n"
+ "zip1 z21.b, z20.b, z17.b\n"
+ "zip2 z17.b, z20.b, z17.b\n"
+ "ld1b { z6.b }, p0/Z, [x25, x14]\n"
+ "ld1b { z4.b }, p0/Z, [x24, x14]\n"
+ "zip2 z23.b, z18.b, z6.b\n"
+ "zip1 z18.b, z18.b, z6.b\n"
+ "ld1b { z2.b }, p0/Z, [x23, x14]\n"
+ "ld1b { z19.b }, p0/Z, [x22, x14]\n"
+ "zip1 z24.b, z28.b, z4.b\n"
+ "zip2 z4.b, z28.b, z4.b\n"
+ "ld1b { z16.b }, p0/Z, [x21, x14]\n"
+ "ld1b { z5.b }, p0/Z, [x20, x14]\n"
+ "zip2 z22.b, z2.b, z16.b\n"
+ "zip1 z2.b, z2.b, z16.b\n"
+ "zip1 z0.b, z19.b, z5.b\n"
+ "zip2 z5.b, z19.b, z5.b\n"
+ "ld1w { z10.s }, p2/Z, [%x[params]]\n"
+ "ld1rw { z7.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_minval]]\n"
+ "ld1rw { z6.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_maxval]]\n"
+ "ld1rw { z8.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_b_offset]]\n"
+ "zip2 z19.b, z9.b, z21.b\n"
+ "zip1 z9.b, z9.b, z21.b\n"
+ "ld1rw { z16.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_c_offset]]\n"
+ "ldp x27, x26, [%x[inptrs], #0x0]\n"
+ "zip1 z11.b, z31.b, z17.b\n"
+ "zip2 z17.b, z31.b, z17.b\n"
+ "ldp x25, x23, [%x[inptrs], #0x10]\n"
+ "ldp x24, x22, [%x[inptrs], #0x20]\n"
+ "zip2 z12.b, z18.b, z24.b\n"
+ "zip1 z18.b, z18.b, z24.b\n"
+ "ldp x21, x20, [%x[inptrs], #0x30]\n"
+ "zip1 z20.b, z23.b, z4.b\n"
+ "zip2 z4.b, z23.b, z4.b\n"
+ "ld1b { z26.b }, p2/Z, [%x[params], #1, MUL VL]\n"
+ "zip2 z24.b, z2.b, z0.b\n"
+ "zip1 z2.b, z2.b, z0.b\n"
+ "ld1b { z3.b }, p2/Z, [%x[params], #2, MUL VL]\n"
+ "ld1b { z1.b }, p2/Z, [%x[params], #3, MUL VL]\n"
+ "zip1 z0.b, z22.b, z5.b\n"
+ "zip2 z5.b, z22.b, z5.b\n"
+ "addvl %x[params], %x[params], #4\n"
+ "mov z22.d, z10.d\n"
+ "mov z31.d, z10.d\n"
+ "mov z21.d, z10.d\n"
+ "1:" // Loop
+ "mov z30.s, #0x0\n"
+ "sdot z30.s, z25.b, z9.b\n"
+ "sdot z10.s, z26.b, z15.b\n"
+ "whilelt p0.s, x28, %x[n_channels]\n"
+ "sdot z30.s, z25.b, z18.b\n"
+ "sdot z31.s, z26.b, z9.b\n"
+ "mov z27.s, #0x0\n"
+ "incw x14, ALL, MUL #4\n"
+ "sdot z10.s, z3.b, z9.b\n"
+ "ext z9.b, z9.b, z9.b, #0x1\n"
+ "movprfx z28, z30\n sdot z28.s, z25.b, z2.b\n"
+ "sdot z30.s, z25.b, z15.b\n"
+ "ext z15.b, z15.b, z15.b, #0x1\n"
+ "sdot z27.s, z25.b, z9.b\n"
+ "sdot z31.s, z3.b, z18.b\n"
+ "sdot z10.s, z1.b, z18.b\n"
+ "ext z18.b, z18.b, z18.b, #0x1\n"
+ "sdot z22.s, z26.b, z15.b\n"
+ "sdot z21.s, z26.b, z9.b\n"
+ "sdot z27.s, z25.b, z18.b\n"
+ "sdot z31.s, z1.b, z2.b\n"
+ "ext z2.b, z2.b, z2.b, #0x1\n"
+ "sdot z22.s, z3.b, z9.b\n"
+ "sdot z21.s, z3.b, z18.b\n"
+ "ld1w { z3.s }, p2/Z, [%x[params], #1, MUL VL]\n"
+ "mls z10.s, p2/M, z30.s, z8.s\n"
+ "movprfx z26, z27\n sdot z26.s, z25.b, z2.b\n"
+ "mov z9.s, #0x0\n"
+ "sdot z27.s, z25.b, z15.b\n"
+ "ld1w { z23.s }, p2/Z, [%x[params]]\n"
+ "sdot z22.s, z1.b, z18.b\n"
+ ".inst 0x04b7754a // sqrdmulh z10.s, z10.s, z23.s\n"
+ "sdot z21.s, z1.b, z2.b\n"
+ "mls z22.s, p2/M, z27.s, z8.s\n"
+ "and z18.d, z10.d, z3.d\n"
+ "mls z31.s, p2/M, z28.s, z8.s\n"
+ "mls z21.s, p2/M, z26.s, z8.s\n"
+ "asr z18.s, z18.s, #0x1f\n"
+ ".inst 0x04b776d6 // sqrdmulh z22.s, z22.s, z23.s\n"
+ ".inst 0x04b777ff // sqrdmulh z31.s, z31.s, z23.s\n"
+ "sdot z9.s, z25.b, z19.b\n"
+ ".inst 0x04b776b5 // sqrdmulh z21.s, z21.s, z23.s\n"
+ "sqadd z10.s, z10.s, z18.s\n"
+ ".inst 0x4482886a // srshl z10.s, p2/M, z10.s, z3.s\n"
+ "sdot z9.s, z25.b, z12.b\n"
+ "and z28.d, z22.d, z3.d\n"
+ "and z23.d, z31.d, z3.d\n"
+ "movprfx z27, z9\n sdot z27.s, z25.b, z24.b\n"
+ "ld1w { z30.s }, p2/Z, [%x[params], #6, MUL VL]\n"
+ "and z18.d, z21.d, z3.d\n"
+ "asr z28.s, z28.s, #0x1f\n"
+ "sdot z9.s, z25.b, z13.b\n"
+ "asr z23.s, z23.s, #0x1f\n"
+ "asr z18.s, z18.s, #0x1f\n"
+ "sqadd z22.s, z22.s, z28.s\n"
+ "sqadd z31.s, z31.s, z23.s\n"
+ ".inst 0x44828876 // srshl z22.s, p2/M, z22.s, z3.s\n"
+ ".inst 0x4482887f // srshl z31.s, p2/M, z31.s, z3.s\n"
+ "sqadd z21.s, z21.s, z18.s\n"
+ "add z10.s, z10.s, z16.s\n"
+ ".inst 0x44828875 // srshl z21.s, p2/M, z21.s, z3.s\n"
+ "smax z10.s, p2/M, z10.s, z7.s\n"
+ "add z22.s, z22.s, z16.s\n"
+ "add z31.s, z31.s, z16.s\n"
+ "smin z10.s, p2/M, z10.s, z6.s\n"
+ "smax z22.s, p2/M, z22.s, z7.s\n"
+ "add z21.s, z21.s, z16.s\n"
+ "smax z31.s, p2/M, z31.s, z7.s\n"
+ "smax z21.s, p2/M, z21.s, z7.s\n"
+ "st1b { z10.s }, p0, [x12, x28]\n"
+ "ld1w { z28.s }, p2/Z, [%x[params], #2, MUL VL]\n"
+ "ld1b { z1.b }, p2/Z, [%x[params], #3, MUL VL]\n"
+ "smin z22.s, p2/M, z22.s, z6.s\n"
+ "smin z31.s, p2/M, z31.s, z6.s\n"
+ "smin z21.s, p2/M, z21.s, z6.s\n"
+ "st1b { z22.s }, p0, [x11, x28]\n"
+ "mov z26.d, z28.d\n"
+ "ld1b { z15.b }, p2/Z, [%x[params], #4, MUL VL]\n"
+ "st1b { z31.s }, p0, [x10, x28]\n"
+ "mov z31.d, z28.d\n"
+ "sdot z31.s, z1.b, z19.b\n"
+ "ld1b { z23.b }, p2/Z, [%x[params], #5, MUL VL]\n"
+ "st1b { z21.s }, p0, [x9, x28]\n"
+ "mov z22.d, z28.d\n"
+ "sdot z28.s, z1.b, z13.b\n"
+ "sdot z28.s, z15.b, z19.b\n"
+ "ext z13.b, z13.b, z13.b, #0x1\n"
+ "ext z19.b, z19.b, z19.b, #0x1\n"
+ "sdot z26.s, z1.b, z13.b\n"
+ "ld1w { z21.s }, p2/Z, [%x[params], #7, MUL VL]\n"
+ "mov z18.s, #0x0\n"
+ "sdot z22.s, z1.b, z19.b\n"
+ "sdot z18.s, z25.b, z19.b\n"
+ "incw x28\n"
+ "sdot z31.s, z15.b, z12.b\n"
+ "sdot z28.s, z23.b, z12.b\n"
+ "ext z12.b, z12.b, z12.b, #0x1\n"
+ "whilelt p0.s, x28, %x[n_channels]\n"
+ "sdot z26.s, z15.b, z19.b\n"
+ "sdot z22.s, z15.b, z12.b\n"
+ "addvl %x[params], %x[params], #16\n"
+ "sdot z18.s, z25.b, z12.b\n"
+ "sdot z31.s, z23.b, z24.b\n"
+ "ext z24.b, z24.b, z24.b, #0x1\n"
+ "mls z28.s, p2/M, z9.s, z8.s\n"
+ "sdot z26.s, z23.b, z12.b\n"
+ ".inst 0x04be779c // sqrdmulh z28.s, z28.s, z30.s\n"
+ "sdot z22.s, z23.b, z24.b\n"
+ "movprfx z12, z18\n sdot z12.s, z25.b, z24.b\n"
+ "and z2.d, z28.d, z21.d\n"
+ "sdot z18.s, z25.b, z13.b\n"
+ "mls z26.s, p2/M, z18.s, z8.s\n"
+ "asr z2.s, z2.s, #0x1f\n"
+ "mls z31.s, p2/M, z27.s, z8.s\n"
+ "mls z22.s, p2/M, z12.s, z8.s\n"
+ ".inst 0x04be775a // sqrdmulh z26.s, z26.s, z30.s\n"
+ ".inst 0x04be77ff // sqrdmulh z31.s, z31.s, z30.s\n"
+ ".inst 0x04be76d6 // sqrdmulh z22.s, z22.s, z30.s\n"
+ "ld1w { z1.s }, p2/Z, [%x[params], #-4, MUL VL]\n"
+ "sqadd z28.s, z28.s, z2.s\n"
+ "and z24.d, z26.d, z21.d\n"
+ ".inst 0x44828abc // srshl z28.s, p2/M, z28.s, z21.s\n"
+ "and z23.d, z31.d, z21.d\n"
+ "and z18.d, z22.d, z21.d\n"
+ "asr z24.s, z24.s, #0x1f\n"
+ "asr z23.s, z23.s, #0x1f\n"
+ "asr z18.s, z18.s, #0x1f\n"
+ "sqadd z26.s, z26.s, z24.s\n"
+ ".inst 0x44828aba // srshl z26.s, p2/M, z26.s, z21.s\n"
+ "ld1b { z30.b }, p2/Z, [%x[params], #-6, MUL VL]\n"
+ "sqadd z31.s, z31.s, z23.s\n"
+ "sqadd z22.s, z22.s, z18.s\n"
+ ".inst 0x44828abf // srshl z31.s, p2/M, z31.s, z21.s\n"
+ ".inst 0x44828ab6 // srshl z22.s, p2/M, z22.s, z21.s\n"
+ "add z28.s, z28.s, z16.s\n"
+ "smax z28.s, p2/M, z28.s, z7.s\n"
+ "add z26.s, z26.s, z16.s\n"
+ "smin z28.s, p2/M, z28.s, z6.s\n"
+ "add z31.s, z31.s, z16.s\n"
+ "add z22.s, z22.s, z16.s\n"
+ "smax z26.s, p2/M, z26.s, z7.s\n"
+ "smax z31.s, p2/M, z31.s, z7.s\n"
+ "mov z24.s, #0x0\n"
+ "sdot z24.s, z25.b, z11.b\n"
+ "smax z22.s, p2/M, z22.s, z7.s\n"
+ "st1b { z28.s }, p0, [x12, x28]\n"
+ "ld1w { z23.s }, p2/Z, [%x[params], #-8, MUL VL]\n"
+ "ld1b { z19.b }, p2/Z, [%x[params], #-7, MUL VL]\n"
+ "smin z26.s, p2/M, z26.s, z6.s\n"
+ "smin z31.s, p2/M, z31.s, z6.s\n"
+ "smin z22.s, p2/M, z22.s, z6.s\n"
+ "st1b { z26.s }, p0, [x11, x28]\n"
+ "mov z28.d, z23.d\n"
+ "sdot z24.s, z25.b, z20.b\n"
+ "st1b { z31.s }, p0, [x10, x28]\n"
+ "mov z27.d, z23.d\n"
+ "sdot z27.s, z19.b, z11.b\n"
+ "movprfx z13, z24\n sdot z13.s, z25.b, z0.b\n"
+ "st1b { z22.s }, p0, [x9, x28]\n"
+ "mov z26.d, z23.d\n"
+ "sdot z23.s, z19.b, z14.b\n"
+ "sdot z23.s, z30.b, z11.b\n"
+ "sdot z24.s, z25.b, z14.b\n"
+ "ext z14.b, z14.b, z14.b, #0x1\n"
+ "ld1b { z21.b }, p2/Z, [%x[params], #-5, MUL VL]\n"
+ "sdot z28.s, z19.b, z14.b\n"
+ "ext z11.b, z11.b, z11.b, #0x1\n"
+ "mov z12.s, #0x0\n"
+ "sdot z26.s, z19.b, z11.b\n"
+ "ld1w { z22.s }, p2/Z, [%x[params], #-3, MUL VL]\n"
+ "sdot z12.s, z25.b, z11.b\n"
+ "sdot z27.s, z30.b, z20.b\n"
+ "incw x28\n"
+ "whilelt p0.s, x28, %x[n_channels]\n"
+ "sdot z23.s, z21.b, z20.b\n"
+ "ext z20.b, z20.b, z20.b, #0x1\n"
+ "sdot z28.s, z30.b, z11.b\n"
+ "sdot z26.s, z30.b, z20.b\n"
+ "sdot z12.s, z25.b, z20.b\n"
+ "sdot z27.s, z21.b, z0.b\n"
+ "ext z0.b, z0.b, z0.b, #0x1\n"
+ "mls z23.s, p2/M, z24.s, z8.s\n"
+ "sdot z28.s, z21.b, z20.b\n"
+ "sdot z26.s, z21.b, z0.b\n"
+ ".inst 0x04a176f7 // sqrdmulh z23.s, z23.s, z1.s\n"
+ "movprfx z19, z12\n sdot z19.s, z25.b, z0.b\n"
+ "sdot z12.s, z25.b, z14.b\n"
+ "and z18.d, z23.d, z22.d\n"
+ "mls z28.s, p2/M, z12.s, z8.s\n"
+ "mls z27.s, p2/M, z13.s, z8.s\n"
+ "asr z18.s, z18.s, #0x1f\n"
+ "mls z26.s, p2/M, z19.s, z8.s\n"
+ ".inst 0x04a1779c // sqrdmulh z28.s, z28.s, z1.s\n"
+ ".inst 0x04a1777b // sqrdmulh z27.s, z27.s, z1.s\n"
+ ".inst 0x04a1775a // sqrdmulh z26.s, z26.s, z1.s\n"
+ "ld1w { z2.s }, p2/Z, [%x[params], #2, MUL VL]\n"
+ "sqadd z23.s, z23.s, z18.s\n"
+ "and z20.d, z28.d, z22.d\n"
+ ".inst 0x44828ad7 // srshl z23.s, p2/M, z23.s, z22.s\n"
+ "and z19.d, z27.d, z22.d\n"
+ "and z18.d, z26.d, z22.d\n"
+ "asr z20.s, z20.s, #0x1f\n"
+ "asr z19.s, z19.s, #0x1f\n"
+ "asr z18.s, z18.s, #0x1f\n"
+ "sqadd z28.s, z28.s, z20.s\n"
+ ".inst 0x44828adc // srshl z28.s, p2/M, z28.s, z22.s\n"
+ "ld1b { z13.b }, p2/Z, [%x[params]]\n"
+ "sqadd z27.s, z27.s, z19.s\n"
+ "sqadd z26.s, z26.s, z18.s\n"
+ ".inst 0x44828adb // srshl z27.s, p2/M, z27.s, z22.s\n"
+ ".inst 0x44828ada // srshl z26.s, p2/M, z26.s, z22.s\n"
+ "add z23.s, z23.s, z16.s\n"
+ "smax z23.s, p2/M, z23.s, z7.s\n"
+ "add z28.s, z28.s, z16.s\n"
+ "smin z23.s, p2/M, z23.s, z6.s\n"
+ "add z27.s, z27.s, z16.s\n"
+ "add z26.s, z26.s, z16.s\n"
+ "smax z28.s, p2/M, z28.s, z7.s\n"
+ "smax z27.s, p2/M, z27.s, z7.s\n"
+ "mov z24.s, #0x0\n"
+ "sdot z24.s, z25.b, z17.b\n"
+ "smax z26.s, p2/M, z26.s, z7.s\n"
+ "st1b { z23.s }, p0, [x12, x28]\n"
+ "ld1w { z1.s }, p2/Z, [%x[params], #-2, MUL VL]\n"
+ "ld1b { z21.b }, p2/Z, [%x[params], #-1, MUL VL]\n"
+ "smin z28.s, p2/M, z28.s, z6.s\n"
+ "smin z27.s, p2/M, z27.s, z6.s\n"
+ "smin z26.s, p2/M, z26.s, z6.s\n"
+ "st1b { z28.s }, p0, [x11, x28]\n"
+ "mov z0.d, z1.d\n"
+ "sdot z24.s, z25.b, z4.b\n"
+ "st1b { z27.s }, p0, [x10, x28]\n"
+ "mov z31.d, z1.d\n"
+ "sdot z31.s, z21.b, z17.b\n"
+ "movprfx z23, z24\n sdot z23.s, z25.b, z5.b\n"
+ "st1b { z26.s }, p0, [x9, x28]\n"
+ "mov z30.d, z1.d\n"
+ "sdot z1.s, z21.b, z29.b\n"
+ "sdot z1.s, z13.b, z17.b\n"
+ "sdot z24.s, z25.b, z29.b\n"
+ "ext z29.b, z29.b, z29.b, #0x1\n"
+ "ld1b { z20.b }, p2/Z, [%x[params], #1, MUL VL]\n"
+ "sdot z0.s, z21.b, z29.b\n"
+ "ext z17.b, z17.b, z17.b, #0x1\n"
+ "mov z19.s, #0x0\n"
+ "sdot z30.s, z21.b, z17.b\n"
+ "ld1w { z22.s }, p2/Z, [%x[params], #3, MUL VL]\n"
+ "sdot z19.s, z25.b, z17.b\n"
+ "sdot z31.s, z13.b, z4.b\n"
+ "incw x28\n"
+ "whilelt p1.s, x28, %x[n_channels]\n"
+ "sdot z1.s, z20.b, z4.b\n"
+ "ext z4.b, z4.b, z4.b, #0x1\n"
+ "sdot z0.s, z13.b, z17.b\n"
+ "whilelt p0.b, x14, %x[n_channels]\n"
+ "sdot z30.s, z13.b, z4.b\n"
+ "sdot z19.s, z25.b, z4.b\n"
+ "ld1b { z13.b }, p0/Z, [x26, x14]\n"
+ "ld1b { z28.b }, p0/Z, [x25, x14]\n"
+ "sdot z31.s, z20.b, z5.b\n"
+ "ext z5.b, z5.b, z5.b, #0x1\n"
+ "mls z1.s, p2/M, z24.s, z8.s\n"
+ "ld1b { z27.b }, p0/Z, [x22, x14]\n"
+ "sdot z0.s, z20.b, z4.b\n"
+ "sdot z30.s, z20.b, z5.b\n"
+ ".inst 0x04a27421 // sqrdmulh z1.s, z1.s, z2.s\n"
+ "ld1b { z26.b }, p0/Z, [x21, x14]\n"
+ "movprfx z18, z19\n sdot z18.s, z25.b, z5.b\n"
+ "sdot z19.s, z25.b, z29.b\n"
+ "and z11.d, z1.d, z22.d\n"
+ "ld1b { z29.b }, p0/Z, [x23, x14]\n"
+ "mls z0.s, p2/M, z19.s, z8.s\n"
+ "mls z31.s, p2/M, z23.s, z8.s\n"
+ "asr z11.s, z11.s, #0x1f\n"
+ "ld1b { z17.b }, p0/Z, [x20, x14]\n"
+ "mls z30.s, p2/M, z18.s, z8.s\n"
+ ".inst 0x04a27400 // sqrdmulh z0.s, z0.s, z2.s\n"
+ ".inst 0x04a277ff // sqrdmulh z31.s, z31.s, z2.s\n"
+ ".inst 0x04a277de // sqrdmulh z30.s, z30.s, z2.s\n"
+ "ld1b { z15.b }, p0/Z, [x27, x14]\n"
+ "ldp x23, x22, [%x[inptrs], #0x40]\n"
+ "sqadd z1.s, z1.s, z11.s\n"
+ "and z21.d, z0.d, z22.d\n"
+ ".inst 0x44828ac1 // srshl z1.s, p2/M, z1.s, z22.s\n"
+ "ldp x21, x20, [%x[inptrs], #0x50]\n"
+ "and z20.d, z31.d, z22.d\n"
+ "and z19.d, z30.d, z22.d\n"
+ "ld1b { z18.b }, p0/Z, [x23, x14]\n"
+ "ld1b { z11.b }, p0/Z, [x22, x14]\n"
+ "asr z21.s, z21.s, #0x1f\n"
+ "asr z20.s, z20.s, #0x1f\n"
+ "ld1b { z24.b }, p0/Z, [x21, x14]\n"
+ "ld1b { z4.b }, p0/Z, [x20, x14]\n"
+ "asr z19.s, z19.s, #0x1f\n"
+ "sqadd z0.s, z0.s, z21.s\n"
+ ".inst 0x44828ac0 // srshl z0.s, p2/M, z0.s, z22.s\n"
+ "ld1b { z3.b }, p2/Z, [%x[params], #6, MUL VL]\n"
+ "sqadd z31.s, z31.s, z20.s\n"
+ "sqadd z30.s, z30.s, z19.s\n"
+ ".inst 0x44828adf // srshl z31.s, p2/M, z31.s, z22.s\n"
+ ".inst 0x44828ade // srshl z30.s, p2/M, z30.s, z22.s\n"
+ "add z1.s, z1.s, z16.s\n"
+ "smax z1.s, p2/M, z1.s, z7.s\n"
+ "add z0.s, z0.s, z16.s\n"
+ "ld1b { z9.b }, p0/Z, [x24, x14]\n"
+ "add z31.s, z31.s, z16.s\n"
+ "add z30.s, z30.s, z16.s\n"
+ "ldp x23, x22, [%x[inptrs], #0x60]\n"
+ "ldp x21, x20, [%x[inptrs], #0x70]\n"
+ "smin z1.s, p2/M, z1.s, z6.s\n"
+ "smax z0.s, p2/M, z0.s, z7.s\n"
+ "st1b { z1.s }, p1, [x12, x28]\n"
+ "ld1b { z2.b }, p0/Z, [x23, x14]\n"
+ "smax z31.s, p2/M, z31.s, z7.s\n"
+ "smax z30.s, p2/M, z30.s, z7.s\n"
+ "ld1b { z23.b }, p0/Z, [x22, x14]\n"
+ "ld1b { z22.b }, p0/Z, [x21, x14]\n"
+ "ld1b { z5.b }, p0/Z, [x20, x14]\n"
+ "zip2 z20.b, z15.b, z28.b\n"
+ "zip1 z15.b, z15.b, z28.b\n"
+ "smin z0.s, p2/M, z0.s, z6.s\n"
+ "zip1 z19.b, z13.b, z29.b\n"
+ "zip2 z29.b, z13.b, z29.b\n"
+ "smin z31.s, p2/M, z31.s, z6.s\n"
+ "smin z30.s, p2/M, z30.s, z6.s\n"
+ "st1b { z0.s }, p1, [x11, x28]\n"
+ "zip2 z13.b, z15.b, z19.b\n"
+ "zip1 z15.b, z15.b, z19.b\n"
+ "ldp x27, x26, [%x[inptrs], #0x0]\n"
+ "st1b { z31.s }, p1, [x10, x28]\n"
+ "zip1 z14.b, z20.b, z29.b\n"
+ "zip2 z29.b, z20.b, z29.b\n"
+ "ld1w { z10.s }, p2/Z, [%x[params], #4, MUL VL]\n"
+ "st1b { z30.s }, p1, [x9, x28]\n"
+ "zip2 z21.b, z9.b, z26.b\n"
+ "zip1 z9.b, z9.b, z26.b\n"
+ "incw x28\n"
+ "zip1 z20.b, z27.b, z17.b\n"
+ "zip2 z17.b, z27.b, z17.b\n"
+ "ldp x25, x23, [%x[inptrs], #0x10]\n"
+ "ldp x24, x22, [%x[inptrs], #0x20]\n"
+ "zip2 z31.b, z18.b, z24.b\n"
+ "zip1 z18.b, z18.b, z24.b\n"
+ "ldp x21, x20, [%x[inptrs], #0x30]\n"
+ "ld1b { z26.b }, p2/Z, [%x[params], #5, MUL VL]\n"
+ "zip1 z27.b, z11.b, z4.b\n"
+ "zip2 z4.b, z11.b, z4.b\n"
+ "ld1b { z1.b }, p2/Z, [%x[params], #7, MUL VL]\n"
+ "addvl %x[params], %x[params], #8\n"
+ "zip2 z30.b, z2.b, z22.b\n"
+ "zip1 z2.b, z2.b, z22.b\n"
+ "zip1 z28.b, z23.b, z5.b\n"
+ "zip2 z5.b, z23.b, z5.b\n"
+ "zip2 z19.b, z9.b, z20.b\n"
+ "zip1 z9.b, z9.b, z20.b\n"
+ "zip1 z11.b, z21.b, z17.b\n"
+ "zip2 z17.b, z21.b, z17.b\n"
+ "zip2 z12.b, z18.b, z27.b\n"
+ "zip1 z18.b, z18.b, z27.b\n"
+ "zip1 z20.b, z31.b, z4.b\n"
+ "zip2 z4.b, z31.b, z4.b\n"
+ "zip2 z24.b, z2.b, z28.b\n"
+ "zip1 z2.b, z2.b, z28.b\n"
+ "zip1 z0.b, z30.b, z5.b\n"
+ "zip2 z5.b, z30.b, z5.b\n"
+ "mov z22.d, z10.d\n"
+ "mov z31.d, z10.d\n"
+ "mov z21.d, z10.d\n"
+ "b.any 1b\n"
+ : [params] "+&r" (params)
+ : [inptrs] "r" (inptrs), [n_channels] "r" (n_channels), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [outptrs] "r" (outptrs), [qp] "r" (&qp)
+ : "cc", "memory", "p0", "p1", "p2", "x9", "x10", "x11", "x12", "x13", "x14", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
new file mode 100644
index 0000000000..c9b4daf334
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "utils.hpp"
+#include "src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl(unsigned int, const int8_t *const *, const int8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, int8_t *const *);
+
+class sve_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy<int8_t, int8_t, int8_t, int32_t>
+{
+ using Parent = DepthwiseDepthfirstStrategy<int8_t, int8_t, int8_t, int32_t>;
+
+ public:
+ constexpr static unsigned int kernel_rows = 3;
+ constexpr static unsigned int kernel_cols = 3;
+
+ constexpr static unsigned int stride_rows = 1;
+ constexpr static unsigned int stride_cols = 1;
+
+ sve_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst(const CPUInfo *) : Parent(2, 2, 3, 3, 1, 1) {}
+
+ arm_gemm::VLType get_vl_type(void) const override { return arm_gemm::VLType::SVE; }
+
+ Parent::KernelType kernel = sve_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl;
+ Parent::KernelType get_kernel(void) const override { return kernel; }
+ unsigned int get_accumulator_depth_vl(void) const override { return 2; }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp
new file mode 100644
index 0000000000..8ac522dc9a
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp
@@ -0,0 +1,410 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_gemm.hpp"
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+// SVE implementation of the s8q NHWC 3x3 stride-1 depthwise kernel.
+// Computes a 2x2 spatial output tile, vectorised over the channel
+// dimension (two vector-lengths of 32-bit accumulators per iteration of
+// the channel loop). The body is hand-scheduled inline assembly; many SVE2
+// quantisation instructions are emitted as raw encodings via ".inst".
+//
+//   n_channels      number of depthwise channels to process
+//   inptrs          16 input-point pointers (4x4 input patch for a 3x3
+//                   kernel producing a 2x2 stride-1 output tile)
+//   weights         packed kernel weights, consumed sequentially by the asm
+//   bias            per-channel bias, loaded into the initial accumulators
+//   qp              requantisation parameters (a/b/c offsets, min/max clamp)
+//   requant_muls    per-channel requantise multipliers (sqrdmulh operand)
+//   requant_shifts  per-channel requantise shifts (srshl operand)
+//   outptrs         four output pointers, one per output tile point
+void sve_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl(
+ const unsigned int n_channels,
+ const int8_t *const *const inptrs,
+ const int8_t *const weights,
+ const int32_t *const bias,
+ const arm_gemm::Requantize32 &qp,
+ const int32_t *const requant_muls,
+ const int32_t *const requant_shifts,
+ int8_t *const *const outptrs
+)
+{
+ // Parameter block read by the assembly via offsetof(); the layout must
+ // stay in sync with the offsetof_Params_* operands at the bottom.
+ struct Params
+ {
+ long unsigned int n_channels;
+ const void *weights;
+ const int32_t *bias;
+ const arm_gemm::Requantize32 *requant;
+ const int32_t *const requant_muls;
+ const int32_t *const requant_shifts;
+ int8_t *const *const outptrs;
+ const int8_t *inptrs[16];
+
+ Params(
+ long unsigned int n_channels,
+ const int8_t *const *inptrs_raw,
+ const void *const weights,
+ const int32_t *const bias,
+ const arm_gemm::Requantize32 &qp,
+ const int32_t *const requant_muls,
+ const int32_t *const requant_shifts,
+ int8_t *const *outptrs
+ ) : n_channels(n_channels), weights(weights), bias(bias),
+ requant(&qp), requant_muls(requant_muls),
+ requant_shifts(requant_shifts), outptrs(outptrs)
+ {
+ // Permute the caller's input pointers into the order in which the
+ // assembly visits them (presumably chosen to suit its instruction
+ // schedule — the asm simply reads inptrs[] sequentially by offset).
+ inptrs[0] = inptrs_raw[5];
+ inptrs[1] = inptrs_raw[0];
+ inptrs[2] = inptrs_raw[3];
+ inptrs[3] = inptrs_raw[6];
+ inptrs[4] = inptrs_raw[9];
+ inptrs[5] = inptrs_raw[12];
+ inptrs[6] = inptrs_raw[15];
+ inptrs[7] = inptrs_raw[1];
+ inptrs[8] = inptrs_raw[2];
+ inptrs[9] = inptrs_raw[10];
+ inptrs[10] = inptrs_raw[4];
+ inptrs[11] = inptrs_raw[7];
+ inptrs[12] = inptrs_raw[8];
+ inptrs[13] = inptrs_raw[11];
+ inptrs[14] = inptrs_raw[13];
+ inptrs[15] = inptrs_raw[14];
+
+ }
+ };
+
+ const Params params(n_channels, inptrs, weights, bias, qp,
+ requant_muls, requant_shifts, outptrs);
+
+ // Preamble: load requantisation constants and the first set of weights,
+ // biases and inputs; "1:" is the channel loop — widening signed
+ // multiply-accumulates (smlalb/smlalt), then sqrdmulh/srshl requantise,
+ // saturating narrow, clamp, and store; loops while lanes remain (b.any).
+ __asm__ __volatile__(
+ "mov x16, #0x0\n"
+ "ldr x25, [%x[params], %[offsetof_Params_requant]]\n"
+ "ptrue p4.b\n"
+ "ldr x24, [%x[params], %[offsetof_Params_outptrs]]\n"
+ "mov x23, x16\n"
+ "add x21, x25, %[offsetof_Requantize32_a_offset]\n"
+ "ldr x15, [%x[params], %[offsetof_Params_n_channels]]\n"
+ "ldr x14, [%x[params], %[offsetof_Params_weights]]\n"
+ "add x20, x25, %[offsetof_Requantize32_b_offset]\n"
+ "add x22, x25, %[offsetof_Requantize32_c_offset]\n"
+ "ld1rb { z12.b }, p4/Z, [x21]\n"
+ "ld1rb { z30.b }, p4/Z, [x20]\n"
+ "add x21, x25, %[offsetof_Requantize32_minval]\n"
+ "add x20, x25, %[offsetof_Requantize32_maxval]\n"
+ "ld1rh { z24.h }, p4/Z, [x22]\n"
+ "ld1rh { z11.h }, p4/Z, [x21]\n"
+ "ld1rh { z26.h }, p4/Z, [x20]\n"
+ "ldp x13, x12, [x24, #0x0]\n"
+ "incw x23\n"
+ "whilelt p3.h, x16, x15\n"
+ "ldp x11, x10, [x24, #0x10]\n"
+ "whilelt p2.s, x16, x15\n"
+ "whilelt p1.s, x23, x15\n"
+ "ldr x9, [%x[params], %[offsetof_Params_bias]]\n"
+ "ld1sb { z14.h }, p4/Z, [x14]\n"
+ "ld1sb { z21.h }, p4/Z, [x14, #1, MUL VL]\n"
+ "add x28, %x[params], %[offsetof_Params_inptrs]\n"
+ "mov x27, #0x0\n"
+ "ld1sb { z1.h }, p4/Z, [x14, #2, MUL VL]\n"
+ "ld1sb { z6.h }, p4/Z, [x14, #3, MUL VL]\n"
+ ".inst 0x455e11ce // ssublb z14.h, z14.b, z30.b\n"
+ ".inst 0x455e12b5 // ssublb z21.h, z21.b, z30.b\n"
+ "ld1sb { z2.h }, p4/Z, [x14, #4, MUL VL]\n"
+ "ld1sb { z18.h }, p4/Z, [x14, #5, MUL VL]\n"
+ ".inst 0x455e1021 // ssublb z1.h, z1.b, z30.b\n"
+ ".inst 0x455e10c6 // ssublb z6.h, z6.b, z30.b\n"
+ "ld1sb { z7.h }, p4/Z, [x14, #6, MUL VL]\n"
+ "ld1sb { z10.h }, p4/Z, [x14, #7, MUL VL]\n"
+ "inch x14, ALL, MUL #8\n"
+ ".inst 0x455e1042 // ssublb z2.h, z2.b, z30.b\n"
+ "ld1w { z17.s }, p2/Z, [x9]\n"
+ "ld1w { z16.s }, p1/Z, [x9, #1, MUL VL]\n"
+ "uzp1 z5.s, z17.s, z16.s\n"
+ "uzp2 z9.s, z17.s, z16.s\n"
+ "ld1sb { z8.h }, p4/Z, [x14]\n"
+ "ldp x24, x23, [x28, #0x0]\n"
+ "addvl x9, x9, #2\n"
+ "mov z17.d, z5.d\n"
+ "ldp x22, x21, [x28, #0x10]\n"
+ "ldr x20, [x28, #0x20]\n"
+ "mov z25.d, z9.d\n"
+ "mov z16.d, z5.d\n"
+ "ld1sb { z0.h }, p3/Z, [x24, x16]\n"
+ "ld1sb { z29.h }, p3/Z, [x23, x16]\n"
+ "mov z23.d, z9.d\n"
+ "mov z22.d, z5.d\n"
+ "ld1sb { z4.h }, p3/Z, [x22, x16]\n"
+ "ld1sb { z13.h }, p3/Z, [x21, x16]\n"
+ "mov z27.d, z9.d\n"
+ ".inst 0x455e1252 // ssublb z18.h, z18.b, z30.b\n"
+ "ld1sb { z20.h }, p3/Z, [x20, x16]\n"
+ "ldr x26, [%x[params], %[offsetof_Params_requant_muls]]\n"
+ ".inst 0x455e10e7 // ssublb z7.h, z7.b, z30.b\n"
+ ".inst 0x455e114a // ssublb z10.h, z10.b, z30.b\n"
+ "ldr x25, [%x[params], %[offsetof_Params_requant_shifts]]\n"
+ "str x9, [%x[params], %[offsetof_Params_bias]]\n"
+ ".inst 0x455e1108 // ssublb z8.h, z8.b, z30.b\n"
+ ".inst 0x454c1000 // ssublb z0.h, z0.b, z12.b\n"
+ ".inst 0x454c13bd // ssublb z29.h, z29.b, z12.b\n"
+ ".inst 0x454c1084 // ssublb z4.h, z4.b, z12.b\n"
+ ".inst 0x454c11ad // ssublb z13.h, z13.b, z12.b\n"
+ ".inst 0x454c1294 // ssublb z20.h, z20.b, z12.b\n"
+ "1:" // Loop
+ ".inst 0x44824005 // smlalb z5.s, p4/M, z0.h, z2.h\n"
+ ".inst 0x44824409 // smlalt z9.s, p4/M, z0.h, z2.h\n"
+ "ldr x20, [x28, #0x28]\n"
+ "ldr x21, [x28, #0x38]\n"
+ ".inst 0x448e43a5 // smlalb z5.s, p4/M, z29.h, z14.h\n"
+ ".inst 0x44864011 // smlalb z17.s, p4/M, z0.h, z6.h\n"
+ "ld1sb { z3.h }, p3/Z, [x20, x16]\n"
+ "ldr x20, [x28, #0x30]\n"
+ ".inst 0x44954010 // smlalb z16.s, p4/M, z0.h, z21.h\n"
+ ".inst 0x448e4016 // smlalb z22.s, p4/M, z0.h, z14.h\n"
+ "ld1sb { z31.h }, p3/Z, [x21, x16]\n"
+ ".inst 0x454c1063 // ssublb z3.h, z3.b, z12.b\n"
+ ".inst 0x448e47a9 // smlalt z9.s, p4/M, z29.h, z14.h\n"
+ ".inst 0x449241a5 // smlalb z5.s, p4/M, z13.h, z18.h\n"
+ "ldr x21, [x28, #0x40]\n"
+ "ld1sb { z15.h }, p3/Z, [x20, x16]\n"
+ ".inst 0x44864419 // smlalt z25.s, p4/M, z0.h, z6.h\n"
+ ".inst 0x44954417 // smlalt z23.s, p4/M, z0.h, z21.h\n"
+ ".inst 0x454c13ff // ssublb z31.h, z31.b, z12.b\n"
+ "ldr x20, [x28, #0x48]\n"
+ ".inst 0x448e441b // smlalt z27.s, p4/M, z0.h, z14.h\n"
+ ".inst 0x44814091 // smlalb z17.s, p4/M, z4.h, z1.h\n"
+ "ld1sb { z19.h }, p3/Z, [x21, x16]\n"
+ ".inst 0x454c11ef // ssublb z15.h, z15.b, z12.b\n"
+ ".inst 0x448141b0 // smlalb z16.s, p4/M, z13.h, z1.h\n"
+ ".inst 0x449541b6 // smlalb z22.s, p4/M, z13.h, z21.h\n"
+ "ld1sb { z28.h }, p3/Z, [x20, x16]\n"
+ ".inst 0x454c1273 // ssublb z19.h, z19.b, z12.b\n"
+ ".inst 0x449245a9 // smlalt z9.s, p4/M, z13.h, z18.h\n"
+ ".inst 0x448a4285 // smlalb z5.s, p4/M, z20.h, z10.h\n"
+ "ldr x21, [x28, #0x50]\n"
+ "ldr x20, [x28, #0x58]\n"
+ ".inst 0x44814499 // smlalt z25.s, p4/M, z4.h, z1.h\n"
+ ".inst 0x448145b7 // smlalt z23.s, p4/M, z13.h, z1.h\n"
+ ".inst 0x454c139c // ssublb z28.h, z28.b, z12.b\n"
+ "ld1sb { z4.h }, p3/Z, [x21, x16]\n"
+ ".inst 0x449545bb // smlalt z27.s, p4/M, z13.h, z21.h\n"
+ ".inst 0x448241b1 // smlalb z17.s, p4/M, z13.h, z2.h\n"
+ "ld1sb { z29.h }, p3/Z, [x20, x16]\n"
+ "ldr x21, [x28, #0x60]\n"
+ ".inst 0x44874070 // smlalb z16.s, p4/M, z3.h, z7.h\n"
+ ".inst 0x44864296 // smlalb z22.s, p4/M, z20.h, z6.h\n"
+ "ldr x20, [x28, #0x68]\n"
+ ".inst 0x454c1084 // ssublb z4.h, z4.b, z12.b\n"
+ ".inst 0x448a4689 // smlalt z9.s, p4/M, z20.h, z10.h\n"
+ ".inst 0x449543e5 // smlalb z5.s, p4/M, z31.h, z21.h\n"
+ ".inst 0x454c13bd // ssublb z29.h, z29.b, z12.b\n"
+ "ld1sb { z0.h }, p3/Z, [x21, x16]\n"
+ ".inst 0x448245b9 // smlalt z25.s, p4/M, z13.h, z2.h\n"
+ ".inst 0x44874477 // smlalt z23.s, p4/M, z3.h, z7.h\n"
+ "ld1sb { z3.h }, p3/Z, [x20, x16]\n"
+ "ldr x20, [x28, #0x70]\n"
+ ".inst 0x4486469b // smlalt z27.s, p4/M, z20.h, z6.h\n"
+ ".inst 0x44874291 // smlalb z17.s, p4/M, z20.h, z7.h\n"
+ ".inst 0x454c1000 // ssublb z0.h, z0.b, z12.b\n"
+ "ld1sb { z13.h }, p3/Z, [x20, x16]\n"
+ ".inst 0x44824290 // smlalb z16.s, p4/M, z20.h, z2.h\n"
+ ".inst 0x448841f6 // smlalb z22.s, p4/M, z15.h, z8.h\n"
+ ".inst 0x454c1063 // ssublb z3.h, z3.b, z12.b\n"
+ "ldr x20, [x28, #0x78]\n"
+ ".inst 0x449547e9 // smlalt z9.s, p4/M, z31.h, z21.h\n"
+ ".inst 0x44814265 // smlalb z5.s, p4/M, z19.h, z1.h\n"
+ ".inst 0x454c11ad // ssublb z13.h, z13.b, z12.b\n"
+ "whilelt p0.h, x27, x15\n"
+ ".inst 0x44874699 // smlalt z25.s, p4/M, z20.h, z7.h\n"
+ ".inst 0x44824697 // smlalt z23.s, p4/M, z20.h, z2.h\n"
+ "ld1w { z20.s }, p2/Z, [x26]\n"
+ "inch x14\n"
+ ".inst 0x448845fb // smlalt z27.s, p4/M, z15.h, z8.h\n"
+ ".inst 0x448e43f1 // smlalb z17.s, p4/M, z31.h, z14.h\n"
+ "ld1w { z15.s }, p1/Z, [x26, #1, MUL VL]\n"
+ "ldr x21, [%x[params], %[offsetof_Params_bias]]\n"
+ ".inst 0x44924390 // smlalb z16.s, p4/M, z28.h, z18.h\n"
+ ".inst 0x44824396 // smlalb z22.s, p4/M, z28.h, z2.h\n"
+ "addvl x26, x26, #2\n"
+ ".inst 0x44814669 // smlalt z9.s, p4/M, z19.h, z1.h\n"
+ ".inst 0x44884385 // smlalb z5.s, p4/M, z28.h, z8.h\n"
+ ".inst 0x448e47f9 // smlalt z25.s, p4/M, z31.h, z14.h\n"
+ ".inst 0x44924797 // smlalt z23.s, p4/M, z28.h, z18.h\n"
+ "ld1sb { z31.h }, p3/Z, [x20, x16]\n"
+ ".inst 0x454c13ff // ssublb z31.h, z31.b, z12.b\n"
+ ".inst 0x4482479b // smlalt z27.s, p4/M, z28.h, z2.h\n"
+ ".inst 0x44954271 // smlalb z17.s, p4/M, z19.h, z21.h\n"
+ "uzp1 z2.s, z20.s, z15.s\n"
+ "inch x16\n"
+ ".inst 0x448e4090 // smlalb z16.s, p4/M, z4.h, z14.h\n"
+ ".inst 0x448143b6 // smlalb z22.s, p4/M, z29.h, z1.h\n"
+ "uzp2 z15.s, z20.s, z15.s\n"
+ "ld1w { z20.s }, p2/Z, [x25]\n"
+ ".inst 0x44884789 // smlalt z9.s, p4/M, z28.h, z8.h\n"
+ ".inst 0x44864085 // smlalb z5.s, p4/M, z4.h, z6.h\n"
+ "mov x20, x16\n"
+ "incw x20\n"
+ ".inst 0x44954679 // smlalt z25.s, p4/M, z19.h, z21.h\n"
+ ".inst 0x448e4497 // smlalt z23.s, p4/M, z4.h, z14.h\n"
+ "ld1w { z19.s }, p1/Z, [x25, #1, MUL VL]\n"
+ "uzp1 z21.s, z20.s, z19.s\n"
+ ".inst 0x448147bb // smlalt z27.s, p4/M, z29.h, z1.h\n"
+ ".inst 0x448a4391 // smlalb z17.s, p4/M, z28.h, z10.h\n"
+ "uzp2 z1.s, z20.s, z19.s\n"
+ "whilelt p2.s, x16, x15\n"
+ ".inst 0x44864010 // smlalb z16.s, p4/M, z0.h, z6.h\n"
+ ".inst 0x44924076 // smlalb z22.s, p4/M, z3.h, z18.h\n"
+ "whilelt p1.s, x20, x15\n"
+ "whilelt p3.h, x16, x15\n"
+ ".inst 0x44864489 // smlalt z9.s, p4/M, z4.h, z6.h\n"
+ ".inst 0x44874005 // smlalb z5.s, p4/M, z0.h, z7.h\n"
+ ".inst 0x04a274a5 // sqrdmulh z5.s, z5.s, z2.s\n"
+ "addvl x25, x25, #2\n"
+ ".inst 0x448a4799 // smlalt z25.s, p4/M, z28.h, z10.h\n"
+ ".inst 0x44864417 // smlalt z23.s, p4/M, z0.h, z6.h\n"
+ "and z19.d, z5.d, z21.d\n"
+ ".inst 0x4492447b // smlalt z27.s, p4/M, z3.h, z18.h\n"
+ ".inst 0x449243b1 // smlalb z17.s, p4/M, z29.h, z18.h\n"
+ "asr z19.s, z19.s, #0x1f\n"
+ ".inst 0x448a41b0 // smlalb z16.s, p4/M, z13.h, z10.h\n"
+ ".inst 0x448741b6 // smlalb z22.s, p4/M, z13.h, z7.h\n"
+ "sqadd z5.s, z5.s, z19.s\n"
+ ".inst 0x448292a5 // srshl z5.s, p4/M, z5.s, z21.s\n"
+ ".inst 0x44874409 // smlalt z9.s, p4/M, z0.h, z7.h\n"
+ ".inst 0x449247b9 // smlalt z25.s, p4/M, z29.h, z18.h\n"
+ ".inst 0x04af7529 // sqrdmulh z9.s, z9.s, z15.s\n"
+ ".inst 0x448a45b7 // smlalt z23.s, p4/M, z13.h, z10.h\n"
+ ".inst 0x448745bb // smlalt z27.s, p4/M, z13.h, z7.h\n"
+ "and z29.d, z9.d, z1.d\n"
+ ".inst 0x44884071 // smlalb z17.s, p4/M, z3.h, z8.h\n"
+ ".inst 0x448843f0 // smlalb z16.s, p4/M, z31.h, z8.h\n"
+ ".inst 0x04a27631 // sqrdmulh z17.s, z17.s, z2.s\n"
+ ".inst 0x448a43f6 // smlalb z22.s, p4/M, z31.h, z10.h\n"
+ ".inst 0x44884479 // smlalt z25.s, p4/M, z3.h, z8.h\n"
+ ".inst 0x04a27610 // sqrdmulh z16.s, z16.s, z2.s\n"
+ ".inst 0x448847f7 // smlalt z23.s, p4/M, z31.h, z8.h\n"
+ ".inst 0x448a47fb // smlalt z27.s, p4/M, z31.h, z10.h\n"
+ ".inst 0x04a276d6 // sqrdmulh z22.s, z22.s, z2.s\n"
+ "asr z29.s, z29.s, #0x1f\n"
+ "and z18.d, z17.d, z21.d\n"
+ ".inst 0x04af7739 // sqrdmulh z25.s, z25.s, z15.s\n"
+ "and z20.d, z16.d, z21.d\n"
+ ".inst 0x04af76f7 // sqrdmulh z23.s, z23.s, z15.s\n"
+ "and z19.d, z22.d, z21.d\n"
+ ".inst 0x04af777b // sqrdmulh z27.s, z27.s, z15.s\n"
+ "sqadd z9.s, z9.s, z29.s\n"
+ ".inst 0x44829029 // srshl z9.s, p4/M, z9.s, z1.s\n"
+ "asr z18.s, z18.s, #0x1f\n"
+ "and z7.d, z25.d, z1.d\n"
+ "asr z20.s, z20.s, #0x1f\n"
+ "and z6.d, z23.d, z1.d\n"
+ "asr z19.s, z19.s, #0x1f\n"
+ "and z2.d, z27.d, z1.d\n"
+ "sqadd z17.s, z17.s, z18.s\n"
+ "asr z7.s, z7.s, #0x1f\n"
+ ".inst 0x448292b1 // srshl z17.s, p4/M, z17.s, z21.s\n"
+ "sqadd z16.s, z16.s, z20.s\n"
+ "asr z6.s, z6.s, #0x1f\n"
+ ".inst 0x448292b0 // srshl z16.s, p4/M, z16.s, z21.s\n"
+ "sqadd z22.s, z22.s, z19.s\n"
+ "asr z2.s, z2.s, #0x1f\n"
+ ".inst 0x448292b6 // srshl z22.s, p4/M, z22.s, z21.s\n"
+ "sqadd z25.s, z25.s, z7.s\n"
+ "sqadd z23.s, z23.s, z6.s\n"
+ ".inst 0x44829039 // srshl z25.s, p4/M, z25.s, z1.s\n"
+ ".inst 0x44829037 // srshl z23.s, p4/M, z23.s, z1.s\n"
+ "sqadd z27.s, z27.s, z2.s\n"
+ ".inst 0x453040a5 // sqxtnb z5.h, z5.s\n"
+ ".inst 0x4482903b // srshl z27.s, p4/M, z27.s, z1.s\n"
+ ".inst 0x45304231 // sqxtnb z17.h, z17.s\n"
+ ".inst 0x45304210 // sqxtnb z16.h, z16.s\n"
+ ".inst 0x453042d6 // sqxtnb z22.h, z22.s\n"
+ ".inst 0x45304525 // sqxtnt z5.h, z9.s\n"
+ ".inst 0x45304731 // sqxtnt z17.h, z25.s\n"
+ ".inst 0x453046f0 // sqxtnt z16.h, z23.s\n"
+ ".inst 0x45304776 // sqxtnt z22.h, z27.s\n"
+ "sqadd z5.h, z5.h, z24.h\n"
+ "smax z5.h, p4/M, z5.h, z11.h\n"
+ "smin z5.h, p4/M, z5.h, z26.h\n"
+ "sqadd z17.h, z17.h, z24.h\n"
+ "sqadd z16.h, z16.h, z24.h\n"
+ "smax z17.h, p4/M, z17.h, z11.h\n"
+ "smax z16.h, p4/M, z16.h, z11.h\n"
+ "sqadd z22.h, z22.h, z24.h\n"
+ "smax z22.h, p4/M, z22.h, z11.h\n"
+ "smin z17.h, p4/M, z17.h, z26.h\n"
+ "st1b { z5.h }, p0, [x13, x27]\n"
+ "smin z16.h, p4/M, z16.h, z26.h\n"
+ "smin z22.h, p4/M, z22.h, z26.h\n"
+ "st1b { z17.h }, p0, [x12, x27]\n"
+ "st1b { z16.h }, p0, [x11, x27]\n"
+ "st1b { z22.h }, p0, [x10, x27]\n"
+ "ld1sb { z14.h }, p4/Z, [x14]\n"
+ "ld1sb { z21.h }, p4/Z, [x14, #1, MUL VL]\n"
+ "inch x27\n"
+ "ld1sb { z1.h }, p4/Z, [x14, #2, MUL VL]\n"
+ "ld1sb { z6.h }, p4/Z, [x14, #3, MUL VL]\n"
+ ".inst 0x455e11ce // ssublb z14.h, z14.b, z30.b\n"
+ ".inst 0x455e12b5 // ssublb z21.h, z21.b, z30.b\n"
+ "ld1sb { z2.h }, p4/Z, [x14, #4, MUL VL]\n"
+ "ld1sb { z18.h }, p4/Z, [x14, #5, MUL VL]\n"
+ ".inst 0x455e1021 // ssublb z1.h, z1.b, z30.b\n"
+ ".inst 0x455e10c6 // ssublb z6.h, z6.b, z30.b\n"
+ "ld1sb { z7.h }, p4/Z, [x14, #6, MUL VL]\n"
+ "ld1sb { z10.h }, p4/Z, [x14, #7, MUL VL]\n"
+ "inch x14, ALL, MUL #8\n"
+ ".inst 0x455e1042 // ssublb z2.h, z2.b, z30.b\n"
+ "ld1w { z17.s }, p2/Z, [x21]\n"
+ "ld1w { z16.s }, p1/Z, [x21, #1, MUL VL]\n"
+ "uzp1 z5.s, z17.s, z16.s\n"
+ "uzp2 z9.s, z17.s, z16.s\n"
+ "ld1sb { z8.h }, p4/Z, [x14]\n"
+ "ldp x24, x23, [x28, #0x0]\n"
+ "addvl x21, x21, #2\n"
+ "str x21, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldp x22, x21, [x28, #0x10]\n"
+ "ldr x20, [x28, #0x20]\n"
+ "mov z17.d, z5.d\n"
+ "mov z25.d, z9.d\n"
+ "ld1sb { z0.h }, p3/Z, [x24, x16]\n"
+ "ld1sb { z29.h }, p3/Z, [x23, x16]\n"
+ "mov z16.d, z5.d\n"
+ "mov z23.d, z9.d\n"
+ "ld1sb { z4.h }, p3/Z, [x22, x16]\n"
+ "ld1sb { z13.h }, p3/Z, [x21, x16]\n"
+ "mov z22.d, z5.d\n"
+ "mov z27.d, z9.d\n"
+ "ld1sb { z20.h }, p3/Z, [x20, x16]\n"
+ ".inst 0x455e1252 // ssublb z18.h, z18.b, z30.b\n"
+ ".inst 0x455e10e7 // ssublb z7.h, z7.b, z30.b\n"
+ ".inst 0x455e114a // ssublb z10.h, z10.b, z30.b\n"
+ ".inst 0x455e1108 // ssublb z8.h, z8.b, z30.b\n"
+ ".inst 0x454c1000 // ssublb z0.h, z0.b, z12.b\n"
+ ".inst 0x454c13bd // ssublb z29.h, z29.b, z12.b\n"
+ ".inst 0x454c1084 // ssublb z4.h, z4.b, z12.b\n"
+ ".inst 0x454c11ad // ssublb z13.h, z13.b, z12.b\n"
+ ".inst 0x454c1294 // ssublb z20.h, z20.b, z12.b\n"
+ "b.any 1b\n"
+ :
+ : [offsetof_Params_bias] "I" (offsetof(Params, bias)), [offsetof_Params_inptrs] "I" (offsetof(Params, inptrs)), [offsetof_Params_n_channels] "I" (offsetof(Params, n_channels)), [offsetof_Params_outptrs] "I" (offsetof(Params, outptrs)), [offsetof_Params_requant] "I" (offsetof(Params, requant)), [offsetof_Params_requant_muls] "I" (offsetof(Params, requant_muls)), [offsetof_Params_requant_shifts] "I" (offsetof(Params, requant_shifts)), [offsetof_Params_weights] "I" (offsetof(Params, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [params] "r" (&params)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
new file mode 100644
index 0000000000..7a9b8a5bde
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "utils.hpp"
+#include "src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+// Entry point of the assembly kernel (defined in the accompanying
+// generic.cpp). Arguments: channel count, array of input-point pointers,
+// packed weights, per-channel bias, requantisation parameters, per-channel
+// requantise multipliers, per-channel requantise shifts, output pointers.
+void sve_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(unsigned int, const int8_t *const *, const int8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, int8_t *const *);
+
+// Strategy wrapper exposing the SVE signed-8-bit-quantized (s8q) NHWC
+// 3x3 stride-2 depthwise kernel (2x2 output tile) to the depthfirst driver.
+class sve_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy<int8_t, int8_t, int8_t, int32_t>
+{
+ using Parent = DepthwiseDepthfirstStrategy<int8_t, int8_t, int8_t, int32_t>;
+
+ public:
+ constexpr static unsigned int kernel_rows = 3;
+ constexpr static unsigned int kernel_cols = 3;
+
+ constexpr static unsigned int stride_rows = 2;
+ constexpr static unsigned int stride_cols = 2;
+
+ // Parent(output_rows, output_cols, kernel_rows, kernel_cols, stride_rows, stride_cols)
+ sve_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst(const CPUInfo *) : Parent(2, 2, 3, 3, 2, 2) {}
+
+ arm_gemm::VLType get_vl_type(void) const override { return arm_gemm::VLType::SVE; }
+
+ Parent::KernelType kernel = sve_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl;
+ Parent::KernelType get_kernel(void) const override { return kernel; }
+ // The kernel accumulates across two vector-lengths of channels per pass.
+ unsigned int get_accumulator_depth_vl(void) const override { return 2; }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp
new file mode 100644
index 0000000000..fc9a48bb46
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp
@@ -0,0 +1,451 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_gemm.hpp"
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(
+ const unsigned int n_channels,
+ const int8_t *const *const inptrs,
+ const int8_t *const weights,
+ const int32_t *const bias,
+ const arm_gemm::Requantize32 &qp,
+ const int32_t *const requant_muls,
+ const int32_t *const requant_shifts,
+ int8_t *const *const outptrs
+)
+{
+ struct Params
+ {
+ long unsigned int n_channels;
+ const void *weights;
+ const int32_t *bias;
+ const arm_gemm::Requantize32 *requant;
+ const int32_t *const requant_muls;
+ const int32_t *const requant_shifts;
+ int8_t *const *const outptrs;
+ const int8_t *inptrs[25];
+
+ Params(
+ long unsigned int n_channels,
+ const int8_t *const *inptrs_raw,
+ const void *const weights,
+ const int32_t *const bias,
+ const arm_gemm::Requantize32 &qp,
+ const int32_t *const requant_muls,
+ const int32_t *const requant_shifts,
+ int8_t *const *outptrs
+ ) : n_channels(n_channels), weights(weights), bias(bias),
+ requant(&qp), requant_muls(requant_muls),
+ requant_shifts(requant_shifts), outptrs(outptrs)
+ {
+ inptrs[0] = inptrs_raw[12];
+ inptrs[1] = inptrs_raw[0];
+ inptrs[2] = inptrs_raw[1];
+ inptrs[3] = inptrs_raw[3];
+ inptrs[4] = inptrs_raw[4];
+ inptrs[5] = inptrs_raw[5];
+ inptrs[6] = inptrs_raw[6];
+ inptrs[7] = inptrs_raw[2];
+ inptrs[8] = inptrs_raw[8];
+ inptrs[9] = inptrs_raw[9];
+ inptrs[10] = inptrs_raw[7];
+ inptrs[11] = inptrs_raw[15];
+ inptrs[12] = inptrs_raw[10];
+ inptrs[13] = inptrs_raw[16];
+ inptrs[14] = inptrs_raw[11];
+ inptrs[15] = inptrs_raw[18];
+ inptrs[16] = inptrs_raw[13];
+ inptrs[17] = inptrs_raw[19];
+ inptrs[18] = inptrs_raw[20];
+ inptrs[19] = inptrs_raw[14];
+ inptrs[20] = inptrs_raw[21];
+ inptrs[21] = inptrs_raw[17];
+ inptrs[22] = inptrs_raw[23];
+ inptrs[23] = inptrs_raw[22];
+ inptrs[24] = inptrs_raw[24];
+
+ }
+ };
+
+ const Params params(n_channels, inptrs, weights, bias, qp,
+ requant_muls, requant_shifts, outptrs);
+
+ __asm__ __volatile__(
+ "mov x7, #0x0\n"
+ "ldr x25, [%x[params], %[offsetof_Params_requant]]\n"
+ "ptrue p4.b\n"
+ "ldr x24, [%x[params], %[offsetof_Params_outptrs]]\n"
+ "mov x23, x7\n"
+ "add x21, x25, %[offsetof_Requantize32_a_offset]\n"
+ "ldr x8, [%x[params], %[offsetof_Params_n_channels]]\n"
+ "ldr x17, [%x[params], %[offsetof_Params_weights]]\n"
+ "add x20, x25, %[offsetof_Requantize32_b_offset]\n"
+ "add x22, x25, %[offsetof_Requantize32_c_offset]\n"
+ "ld1rb { z26.b }, p4/Z, [x21]\n"
+ "ld1rb { z13.b }, p4/Z, [x20]\n"
+ "add x21, x25, %[offsetof_Requantize32_minval]\n"
+ "add x20, x25, %[offsetof_Requantize32_maxval]\n"
+ "ld1rh { z19.h }, p4/Z, [x22]\n"
+ "ld1rh { z12.h }, p4/Z, [x21]\n"
+ "ld1rh { z9.h }, p4/Z, [x20]\n"
+ "ldp x16, x15, [x24, #0x0]\n"
+ "incw x23\n"
+ "whilelt p3.h, x7, x8\n"
+ "ldp x14, x13, [x24, #0x10]\n"
+ "whilelt p2.s, x7, x8\n"
+ "whilelt p1.s, x23, x8\n"
+ "ldr x12, [%x[params], %[offsetof_Params_bias]]\n"
+ "ld1sb { z25.h }, p4/Z, [x17]\n"
+ "ld1sb { z30.h }, p4/Z, [x17, #1, MUL VL]\n"
+ "add x11, %x[params], %[offsetof_Params_inptrs]\n"
+ "mov x10, #0x0\n"
+ "ld1sb { z14.h }, p4/Z, [x17, #2, MUL VL]\n"
+ "ld1sb { z4.h }, p4/Z, [x17, #3, MUL VL]\n"
+ ".inst 0x454d1339 // ssublb z25.h, z25.b, z13.b\n"
+ ".inst 0x454d13de // ssublb z30.h, z30.b, z13.b\n"
+ "ld1sb { z10.h }, p4/Z, [x17, #4, MUL VL]\n"
+ "ld1sb { z3.h }, p4/Z, [x17, #5, MUL VL]\n"
+ ".inst 0x454d11ce // ssublb z14.h, z14.b, z13.b\n"
+ ".inst 0x454d1084 // ssublb z4.h, z4.b, z13.b\n"
+ "ld1sb { z23.h }, p4/Z, [x17, #6, MUL VL]\n"
+ "ld1sb { z7.h }, p4/Z, [x17, #7, MUL VL]\n"
+ "inch x17, ALL, MUL #8\n"
+ ".inst 0x454d114a // ssublb z10.h, z10.b, z13.b\n"
+ "ld1w { z17.s }, p2/Z, [x12]\n"
+ "ld1w { z16.s }, p1/Z, [x12, #1, MUL VL]\n"
+ "uzp1 z8.s, z17.s, z16.s\n"
+ "uzp2 z24.s, z17.s, z16.s\n"
+ "ld1sb { z2.h }, p4/Z, [x17]\n"
+ "ldp x27, x26, [x11, #0x0]\n"
+ "addvl x12, x12, #2\n"
+ "mov z18.d, z8.d\n"
+ "ldp x25, x24, [x11, #0x10]\n"
+ "ldp x23, x22, [x11, #0x20]\n"
+ "mov z0.d, z24.d\n"
+ "mov z15.d, z8.d\n"
+ "ldp x21, x20, [x11, #0x30]\n"
+ "ld1sb { z21.h }, p3/Z, [x27, x7]\n"
+ "mov z1.d, z24.d\n"
+ "mov z5.d, z8.d\n"
+ "ld1sb { z22.h }, p3/Z, [x26, x7]\n"
+ "ld1sb { z11.h }, p3/Z, [x25, x7]\n"
+ "mov z6.d, z24.d\n"
+ ".inst 0x454d1063 // ssublb z3.h, z3.b, z13.b\n"
+ "ld1sb { z20.h }, p3/Z, [x24, x7]\n"
+ "ld1sb { z27.h }, p3/Z, [x23, x7]\n"
+ ".inst 0x454d12f7 // ssublb z23.h, z23.b, z13.b\n"
+ ".inst 0x454d10e7 // ssublb z7.h, z7.b, z13.b\n"
+ "ld1sb { z28.h }, p3/Z, [x22, x7]\n"
+ "ld1sb { z16.h }, p3/Z, [x21, x7]\n"
+ ".inst 0x454d1042 // ssublb z2.h, z2.b, z13.b\n"
+ ".inst 0x455a12b5 // ssublb z21.h, z21.b, z26.b\n"
+ "ld1sb { z31.h }, p3/Z, [x20, x7]\n"
+ "ldr x9, [%x[params], %[offsetof_Params_requant_muls]]\n"
+ ".inst 0x455a12d6 // ssublb z22.h, z22.b, z26.b\n"
+ ".inst 0x455a116b // ssublb z11.h, z11.b, z26.b\n"
+ "ldr x28, [%x[params], %[offsetof_Params_requant_shifts]]\n"
+ "str x12, [%x[params], %[offsetof_Params_bias]]\n"
+ ".inst 0x455a1294 // ssublb z20.h, z20.b, z26.b\n"
+ ".inst 0x455a137b // ssublb z27.h, z27.b, z26.b\n"
+ ".inst 0x455a139c // ssublb z28.h, z28.b, z26.b\n"
+ ".inst 0x455a1210 // ssublb z16.h, z16.b, z26.b\n"
+ ".inst 0x455a13ff // ssublb z31.h, z31.b, z26.b\n"
+ "1:" // Loop
+ ".inst 0x448242a8 // smlalb z8.s, p4/M, z21.h, z2.h\n"
+ "ldr x21, [x11, #0x58]\n"
+ "ldr x20, [x11, #0x78]\n"
+ ".inst 0x448246b8 // smlalt z24.s, p4/M, z21.h, z2.h\n"
+ ".inst 0x449942c8 // smlalb z8.s, p4/M, z22.h, z25.h\n"
+ "ld1sb { z17.h }, p3/Z, [x21, x7]\n"
+ "ld1sb { z29.h }, p3/Z, [x20, x7]\n"
+ ".inst 0x449742b2 // smlalb z18.s, p4/M, z21.h, z23.h\n"
+ "ldr x21, [x11, #0x60]\n"
+ "ldr x20, [x11, #0x80]\n"
+ ".inst 0x448e42af // smlalb z15.s, p4/M, z21.h, z14.h\n"
+ ".inst 0x449942a5 // smlalb z5.s, p4/M, z21.h, z25.h\n"
+ ".inst 0x449946d8 // smlalt z24.s, p4/M, z22.h, z25.h\n"
+ ".inst 0x455a1231 // ssublb z17.h, z17.b, z26.b\n"
+ ".inst 0x449e4168 // smlalb z8.s, p4/M, z11.h, z30.h\n"
+ "ld1sb { z22.h }, p3/Z, [x21, x7]\n"
+ ".inst 0x455a13bd // ssublb z29.h, z29.b, z26.b\n"
+ ".inst 0x449746a0 // smlalt z0.s, p4/M, z21.h, z23.h\n"
+ ".inst 0x448e46a1 // smlalt z1.s, p4/M, z21.h, z14.h\n"
+ "ldr x21, [x11, #0x68]\n"
+ ".inst 0x449946a6 // smlalt z6.s, p4/M, z21.h, z25.h\n"
+ "ld1sb { z21.h }, p3/Z, [x20, x7]\n"
+ "ldr x20, [x11, #0x88]\n"
+ ".inst 0x449e4292 // smlalb z18.s, p4/M, z20.h, z30.h\n"
+ ".inst 0x4484422f // smlalb z15.s, p4/M, z17.h, z4.h\n"
+ ".inst 0x448a43a5 // smlalb z5.s, p4/M, z29.h, z10.h\n"
+ ".inst 0x455a12d6 // ssublb z22.h, z22.b, z26.b\n"
+ "ldr x22, [x11, #0x40]\n"
+ ".inst 0x449e4578 // smlalt z24.s, p4/M, z11.h, z30.h\n"
+ ".inst 0x455a12b5 // ssublb z21.h, z21.b, z26.b\n"
+ ".inst 0x44844388 // smlalb z8.s, p4/M, z28.h, z4.h\n"
+ "ld1sb { z11.h }, p3/Z, [x21, x7]\n"
+ ".inst 0x449e4680 // smlalt z0.s, p4/M, z20.h, z30.h\n"
+ "ld1sb { z20.h }, p3/Z, [x20, x7]\n"
+ ".inst 0x44844621 // smlalt z1.s, p4/M, z17.h, z4.h\n"
+ "ldr x21, [x11, #0x70]\n"
+ ".inst 0x448a47a6 // smlalt z6.s, p4/M, z29.h, z10.h\n"
+ "ldr x20, [x11, #0x98]\n"
+ ".inst 0x448e4372 // smlalb z18.s, p4/M, z27.h, z14.h\n"
+ "ldr x23, [x11, #0x50]\n"
+ ".inst 0x449942cf // smlalb z15.s, p4/M, z22.h, z25.h\n"
+ ".inst 0x449e42a5 // smlalb z5.s, p4/M, z21.h, z30.h\n"
+ ".inst 0x455a116b // ssublb z11.h, z11.b, z26.b\n"
+ "ld1sb { z17.h }, p3/Z, [x22, x7]\n"
+ ".inst 0x44844798 // smlalt z24.s, p4/M, z28.h, z4.h\n"
+ ".inst 0x455a1294 // ssublb z20.h, z20.b, z26.b\n"
+ ".inst 0x448a4208 // smlalb z8.s, p4/M, z16.h, z10.h\n"
+ "ld1sb { z29.h }, p3/Z, [x21, x7]\n"
+ "ld1sb { z28.h }, p3/Z, [x20, x7]\n"
+ ".inst 0x448e4760 // smlalt z0.s, p4/M, z27.h, z14.h\n"
+ "ldr x22, [x11, #0x48]\n"
+ ".inst 0x449946c1 // smlalt z1.s, p4/M, z22.h, z25.h\n"
+ ".inst 0x449e46a6 // smlalt z6.s, p4/M, z21.h, z30.h\n"
+ "ldr x21, [x11, #0x90]\n"
+ "ldr x20, [x11, #0xa8]\n"
+ ".inst 0x449943f2 // smlalb z18.s, p4/M, z31.h, z25.h\n"
+ "ld1sb { z27.h }, p3/Z, [x23, x7]\n"
+ ".inst 0x448a416f // smlalb z15.s, p4/M, z11.h, z10.h\n"
+ ".inst 0x44834285 // smlalb z5.s, p4/M, z20.h, z3.h\n"
+ ".inst 0x455a1231 // ssublb z17.h, z17.b, z26.b\n"
+ ".inst 0x448a4618 // smlalt z24.s, p4/M, z16.h, z10.h\n"
+ ".inst 0x455a13bd // ssublb z29.h, z29.b, z26.b\n"
+ ".inst 0x448e43e8 // smlalb z8.s, p4/M, z31.h, z14.h\n"
+ "ld1sb { z16.h }, p3/Z, [x22, x7]\n"
+ ".inst 0x455a139c // ssublb z28.h, z28.b, z26.b\n"
+ ".inst 0x449947e0 // smlalt z0.s, p4/M, z31.h, z25.h\n"
+ "ld1sb { z25.h }, p3/Z, [x21, x7]\n"
+ ".inst 0x448a4561 // smlalt z1.s, p4/M, z11.h, z10.h\n"
+ "ld1sb { z11.h }, p3/Z, [x20, x7]\n"
+ ".inst 0x455a137b // ssublb z27.h, z27.b, z26.b\n"
+ ".inst 0x44834686 // smlalt z6.s, p4/M, z20.h, z3.h\n"
+ "ldr x21, [x11, #0xa0]\n"
+ "ldr x20, [x11, #0xb0]\n"
+ ".inst 0x448a4232 // smlalb z18.s, p4/M, z17.h, z10.h\n"
+ ".inst 0x449e43af // smlalb z15.s, p4/M, z29.h, z30.h\n"
+ ".inst 0x455a1210 // ssublb z16.h, z16.b, z26.b\n"
+ ".inst 0x448e4385 // smlalb z5.s, p4/M, z28.h, z14.h\n"
+ ".inst 0x448e47f8 // smlalt z24.s, p4/M, z31.h, z14.h\n"
+ ".inst 0x455a1339 // ssublb z25.h, z25.b, z26.b\n"
+ "ld1sb { z20.h }, p3/Z, [x21, x7]\n"
+ ".inst 0x455a116b // ssublb z11.h, z11.b, z26.b\n"
+ ".inst 0x44834368 // smlalb z8.s, p4/M, z27.h, z3.h\n"
+ "ld1sb { z31.h }, p3/Z, [x20, x7]\n"
+ ".inst 0x448a4620 // smlalt z0.s, p4/M, z17.h, z10.h\n"
+ ".inst 0x449e47a1 // smlalt z1.s, p4/M, z29.h, z30.h\n"
+ ".inst 0x448e4786 // smlalt z6.s, p4/M, z28.h, z14.h\n"
+ "ldr x20, [x11, #0xb8]\n"
+ ".inst 0x455a1294 // ssublb z20.h, z20.b, z26.b\n"
+ ".inst 0x44834212 // smlalb z18.s, p4/M, z16.h, z3.h\n"
+ ".inst 0x4497432f // smlalb z15.s, p4/M, z25.h, z23.h\n"
+ ".inst 0x455a13ff // ssublb z31.h, z31.b, z26.b\n"
+ "ld1sb { z30.h }, p3/Z, [x20, x7]\n"
+ ".inst 0x44844165 // smlalb z5.s, p4/M, z11.h, z4.h\n"
+ ".inst 0x44834778 // smlalt z24.s, p4/M, z27.h, z3.h\n"
+ "ldr x20, [x11, #0xc0]\n"
+ "ld1w { z17.s }, p2/Z, [x9]\n"
+ ".inst 0x449742c8 // smlalb z8.s, p4/M, z22.h, z23.h\n"
+ ".inst 0x44834600 // smlalt z0.s, p4/M, z16.h, z3.h\n"
+ "ld1w { z14.s }, p1/Z, [x9, #1, MUL VL]\n"
+ ".inst 0x455a13de // ssublb z30.h, z30.b, z26.b\n"
+ ".inst 0x44974721 // smlalt z1.s, p4/M, z25.h, z23.h\n"
+ ".inst 0x44844566 // smlalt z6.s, p4/M, z11.h, z4.h\n"
+ "ld1sb { z25.h }, p3/Z, [x20, x7]\n"
+ "uzp1 z10.s, z17.s, z14.s\n"
+ ".inst 0x44844372 // smlalb z18.s, p4/M, z27.h, z4.h\n"
+ ".inst 0x4487428f // smlalb z15.s, p4/M, z20.h, z7.h\n"
+ "uzp2 z14.s, z17.s, z14.s\n"
+ "ld1w { z17.s }, p2/Z, [x28]\n"
+ ".inst 0x448743e5 // smlalb z5.s, p4/M, z31.h, z7.h\n"
+ ".inst 0x449746d8 // smlalt z24.s, p4/M, z22.h, z23.h\n"
+ "ld1w { z16.s }, p1/Z, [x28, #1, MUL VL]\n"
+ ".inst 0x455a1339 // ssublb z25.h, z25.b, z26.b\n"
+ ".inst 0x448743a8 // smlalb z8.s, p4/M, z29.h, z7.h\n"
+ ".inst 0x44844760 // smlalt z0.s, p4/M, z27.h, z4.h\n"
+ "uzp1 z4.s, z17.s, z16.s\n"
+ "inch x7\n"
+ ".inst 0x44874681 // smlalt z1.s, p4/M, z20.h, z7.h\n"
+ ".inst 0x448747e6 // smlalt z6.s, p4/M, z31.h, z7.h\n"
+ ".inst 0x04aa7508 // sqrdmulh z8.s, z8.s, z10.s\n"
+ "whilelt p0.h, x10, x8\n"
+ ".inst 0x448742b2 // smlalb z18.s, p4/M, z21.h, z7.h\n"
+ ".inst 0x4483416f // smlalb z15.s, p4/M, z11.h, z3.h\n"
+ "uzp2 z22.s, z17.s, z16.s\n"
+ "mov x20, x7\n"
+ ".inst 0x449743c5 // smlalb z5.s, p4/M, z30.h, z23.h\n"
+ ".inst 0x448747b8 // smlalt z24.s, p4/M, z29.h, z7.h\n"
+ "and z17.d, z8.d, z4.d\n"
+ "inch x17\n"
+ ".inst 0x448746a0 // smlalt z0.s, p4/M, z21.h, z7.h\n"
+ ".inst 0x44834561 // smlalt z1.s, p4/M, z11.h, z3.h\n"
+ ".inst 0x04ae7718 // sqrdmulh z24.s, z24.s, z14.s\n"
+ "incw x20\n"
+ ".inst 0x449747c6 // smlalt z6.s, p4/M, z30.h, z23.h\n"
+ ".inst 0x44824392 // smlalb z18.s, p4/M, z28.h, z2.h\n"
+ "asr z17.s, z17.s, #0x1f\n"
+ "whilelt p2.s, x7, x8\n"
+ ".inst 0x448243cf // smlalb z15.s, p4/M, z30.h, z2.h\n"
+ ".inst 0x44824325 // smlalb z5.s, p4/M, z25.h, z2.h\n"
+ "and z16.d, z24.d, z22.d\n"
+ "whilelt p1.s, x20, x8\n"
+ ".inst 0x44824780 // smlalt z0.s, p4/M, z28.h, z2.h\n"
+ ".inst 0x448247c1 // smlalt z1.s, p4/M, z30.h, z2.h\n"
+ ".inst 0x04aa7652 // sqrdmulh z18.s, z18.s, z10.s\n"
+ "ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
+ ".inst 0x44824726 // smlalt z6.s, p4/M, z25.h, z2.h\n"
+ ".inst 0x04aa75ef // sqrdmulh z15.s, z15.s, z10.s\n"
+ "whilelt p3.h, x7, x8\n"
+ "addvl x9, x9, #2\n"
+ ".inst 0x04aa74a5 // sqrdmulh z5.s, z5.s, z10.s\n"
+ "sqadd z8.s, z8.s, z17.s\n"
+ ".inst 0x44829088 // srshl z8.s, p4/M, z8.s, z4.s\n"
+ "addvl x28, x28, #2\n"
+ "asr z16.s, z16.s, #0x1f\n"
+ "and z21.d, z18.d, z4.d\n"
+ ".inst 0x04ae7400 // sqrdmulh z0.s, z0.s, z14.s\n"
+ "and z20.d, z15.d, z4.d\n"
+ ".inst 0x04ae7421 // sqrdmulh z1.s, z1.s, z14.s\n"
+ "and z28.d, z5.d, z4.d\n"
+ ".inst 0x04ae74c6 // sqrdmulh z6.s, z6.s, z14.s\n"
+ "sqadd z24.s, z24.s, z16.s\n"
+ ".inst 0x448292d8 // srshl z24.s, p4/M, z24.s, z22.s\n"
+ "asr z21.s, z21.s, #0x1f\n"
+ "and z25.d, z0.d, z22.d\n"
+ "asr z20.s, z20.s, #0x1f\n"
+ "and z17.d, z1.d, z22.d\n"
+ "asr z28.s, z28.s, #0x1f\n"
+ "and z16.d, z6.d, z22.d\n"
+ "sqadd z18.s, z18.s, z21.s\n"
+ "asr z25.s, z25.s, #0x1f\n"
+ ".inst 0x44829092 // srshl z18.s, p4/M, z18.s, z4.s\n"
+ "sqadd z15.s, z15.s, z20.s\n"
+ "asr z17.s, z17.s, #0x1f\n"
+ ".inst 0x4482908f // srshl z15.s, p4/M, z15.s, z4.s\n"
+ "sqadd z5.s, z5.s, z28.s\n"
+ "asr z16.s, z16.s, #0x1f\n"
+ ".inst 0x44829085 // srshl z5.s, p4/M, z5.s, z4.s\n"
+ "sqadd z0.s, z0.s, z25.s\n"
+ "sqadd z1.s, z1.s, z17.s\n"
+ ".inst 0x448292c0 // srshl z0.s, p4/M, z0.s, z22.s\n"
+ ".inst 0x448292c1 // srshl z1.s, p4/M, z1.s, z22.s\n"
+ "sqadd z6.s, z6.s, z16.s\n"
+ ".inst 0x45304108 // sqxtnb z8.h, z8.s\n"
+ ".inst 0x448292c6 // srshl z6.s, p4/M, z6.s, z22.s\n"
+ ".inst 0x45304252 // sqxtnb z18.h, z18.s\n"
+ ".inst 0x453041ef // sqxtnb z15.h, z15.s\n"
+ ".inst 0x453040a5 // sqxtnb z5.h, z5.s\n"
+ ".inst 0x45304708 // sqxtnt z8.h, z24.s\n"
+ ".inst 0x45304412 // sqxtnt z18.h, z0.s\n"
+ ".inst 0x4530442f // sqxtnt z15.h, z1.s\n"
+ ".inst 0x453044c5 // sqxtnt z5.h, z6.s\n"
+ "sqadd z8.h, z8.h, z19.h\n"
+ "smax z8.h, p4/M, z8.h, z12.h\n"
+ "smin z8.h, p4/M, z8.h, z9.h\n"
+ "sqadd z18.h, z18.h, z19.h\n"
+ "sqadd z15.h, z15.h, z19.h\n"
+ "smax z18.h, p4/M, z18.h, z12.h\n"
+ "smax z15.h, p4/M, z15.h, z12.h\n"
+ "sqadd z5.h, z5.h, z19.h\n"
+ "smax z5.h, p4/M, z5.h, z12.h\n"
+ "smin z18.h, p4/M, z18.h, z9.h\n"
+ "st1b { z8.h }, p0, [x16, x10]\n"
+ "smin z15.h, p4/M, z15.h, z9.h\n"
+ "smin z5.h, p4/M, z5.h, z9.h\n"
+ "st1b { z18.h }, p0, [x15, x10]\n"
+ "st1b { z15.h }, p0, [x14, x10]\n"
+ "st1b { z5.h }, p0, [x13, x10]\n"
+ "ld1sb { z25.h }, p4/Z, [x17]\n"
+ "ld1sb { z30.h }, p4/Z, [x17, #1, MUL VL]\n"
+ "inch x10\n"
+ "ld1sb { z14.h }, p4/Z, [x17, #2, MUL VL]\n"
+ "ld1sb { z4.h }, p4/Z, [x17, #3, MUL VL]\n"
+ ".inst 0x454d1339 // ssublb z25.h, z25.b, z13.b\n"
+ ".inst 0x454d13de // ssublb z30.h, z30.b, z13.b\n"
+ "ld1sb { z10.h }, p4/Z, [x17, #4, MUL VL]\n"
+ "ld1sb { z3.h }, p4/Z, [x17, #5, MUL VL]\n"
+ ".inst 0x454d11ce // ssublb z14.h, z14.b, z13.b\n"
+ ".inst 0x454d1084 // ssublb z4.h, z4.b, z13.b\n"
+ "ld1sb { z23.h }, p4/Z, [x17, #6, MUL VL]\n"
+ "ld1sb { z7.h }, p4/Z, [x17, #7, MUL VL]\n"
+ "inch x17, ALL, MUL #8\n"
+ ".inst 0x454d114a // ssublb z10.h, z10.b, z13.b\n"
+ "ld1w { z17.s }, p2/Z, [x20]\n"
+ "ld1w { z16.s }, p1/Z, [x20, #1, MUL VL]\n"
+ "uzp1 z8.s, z17.s, z16.s\n"
+ "uzp2 z24.s, z17.s, z16.s\n"
+ "ld1sb { z2.h }, p4/Z, [x17]\n"
+ "ldp x27, x26, [x11, #0x0]\n"
+ "addvl x20, x20, #2\n"
+ "str x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldp x25, x24, [x11, #0x10]\n"
+ "ldp x23, x22, [x11, #0x20]\n"
+ "mov z18.d, z8.d\n"
+ "mov z0.d, z24.d\n"
+ "ldp x21, x20, [x11, #0x30]\n"
+ "ld1sb { z21.h }, p3/Z, [x27, x7]\n"
+ "mov z15.d, z8.d\n"
+ "mov z1.d, z24.d\n"
+ "ld1sb { z22.h }, p3/Z, [x26, x7]\n"
+ "ld1sb { z11.h }, p3/Z, [x25, x7]\n"
+ "mov z5.d, z8.d\n"
+ "mov z6.d, z24.d\n"
+ "ld1sb { z20.h }, p3/Z, [x24, x7]\n"
+ "ld1sb { z27.h }, p3/Z, [x23, x7]\n"
+ ".inst 0x454d1063 // ssublb z3.h, z3.b, z13.b\n"
+ ".inst 0x454d12f7 // ssublb z23.h, z23.b, z13.b\n"
+ "ld1sb { z28.h }, p3/Z, [x22, x7]\n"
+ "ld1sb { z16.h }, p3/Z, [x21, x7]\n"
+ ".inst 0x454d10e7 // ssublb z7.h, z7.b, z13.b\n"
+ ".inst 0x454d1042 // ssublb z2.h, z2.b, z13.b\n"
+ "ld1sb { z31.h }, p3/Z, [x20, x7]\n"
+ ".inst 0x455a12b5 // ssublb z21.h, z21.b, z26.b\n"
+ ".inst 0x455a12d6 // ssublb z22.h, z22.b, z26.b\n"
+ ".inst 0x455a116b // ssublb z11.h, z11.b, z26.b\n"
+ ".inst 0x455a1294 // ssublb z20.h, z20.b, z26.b\n"
+ ".inst 0x455a137b // ssublb z27.h, z27.b, z26.b\n"
+ ".inst 0x455a139c // ssublb z28.h, z28.b, z26.b\n"
+ ".inst 0x455a1210 // ssublb z16.h, z16.b, z26.b\n"
+ ".inst 0x455a13ff // ssublb z31.h, z31.b, z26.b\n"
+ "b.any 1b\n"
+ :
+ : [offsetof_Params_bias] "I" (offsetof(Params, bias)), [offsetof_Params_inptrs] "I" (offsetof(Params, inptrs)), [offsetof_Params_n_channels] "I" (offsetof(Params, n_channels)), [offsetof_Params_outptrs] "I" (offsetof(Params, outptrs)), [offsetof_Params_requant] "I" (offsetof(Params, requant)), [offsetof_Params_requant_muls] "I" (offsetof(Params, requant_muls)), [offsetof_Params_requant_shifts] "I" (offsetof(Params, requant_shifts)), [offsetof_Params_weights] "I" (offsetof(Params, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [params] "r" (&params)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
new file mode 100644
index 0000000000..1f8d6c5213
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "utils.hpp"
+#include "src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl(unsigned int, const int8_t *const *, const int8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, int8_t *const *);
+
+// Strategy descriptor for an SVE depthwise-convolution kernel:
+//   - s8q  : signed 8-bit quantized input/output (int32 accumulators)
+//   - nhwc : channels-last data layout
+//   - 5x5 kernel, stride 1, one 2x2 output tile per kernel invocation
+//   - mla  : implemented with multiply-accumulate instructions
+// The actual computation lives in the generated assembly routine
+// sve_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl (declared above).
+class sve_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy<int8_t, int8_t, int8_t, int32_t>
+{
+ using Parent = DepthwiseDepthfirstStrategy<int8_t, int8_t, int8_t, int32_t>;
+
+ public:
+ // Static shape/stride traits describing the convolution this kernel computes.
+ constexpr static unsigned int kernel_rows = 5;
+ constexpr static unsigned int kernel_cols = 5;
+
+ constexpr static unsigned int stride_rows = 1;
+ constexpr static unsigned int stride_cols = 1;
+
+ // NOTE(review): Parent(2, 2, 5, 5, 1, 1) presumably encodes (output_rows,
+ // output_cols, kernel_rows, kernel_cols, stride_rows, stride_cols) — the
+ // values match the traits above; confirm against DepthwiseDepthfirstStrategy.
+ sve_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst(const CPUInfo *) : Parent(2, 2, 5, 5, 1, 1) {}
+
+ // This kernel requires the SVE (scalable vector) instruction set.
+ arm_gemm::VLType get_vl_type(void) const override { return arm_gemm::VLType::SVE; }
+
+ // Entry point handed to the depthfirst driver; points at the generated asm.
+ Parent::KernelType kernel = sve_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl;
+ Parent::KernelType get_kernel(void) const override { return kernel; }
+ // The kernel processes two vector-lengths of channels per accumulator pass.
+ unsigned int get_accumulator_depth_vl(void) const override { return 2; }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp
new file mode 100644
index 0000000000..7ff724ddd8
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp
@@ -0,0 +1,652 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_gemm.hpp"
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl(
+ const unsigned int n_channels,
+ const int8_t *const *const inptrs,
+ const int8_t *const weights,
+ const int32_t *const bias,
+ const arm_gemm::Requantize32 &qp,
+ const int32_t *const requant_muls,
+ const int32_t *const requant_shifts,
+ int8_t *const *const outptrs
+)
+{
+ // Argument bundle handed to the inline-asm kernel. The assembly reads the
+ // fields through offsetof(Params, ...) operands (see the asm constraint
+ // list), so the member set and their order are part of the kernel ABI —
+ // do not reorder.
+ struct Params
+ {
+ long unsigned int n_channels;
+ const void *weights;
+ const int32_t *bias;
+ const arm_gemm::Requantize32 *requant;
+ const int32_t *const requant_muls;
+ const int32_t *const requant_shifts;
+ int8_t *const *const outptrs;
+ // 36 input-row pointers = the 6x6 input patch needed for a 5x5 kernel
+ // producing a 2x2 output at stride 1.
+ const int8_t *inptrs[36];
+
+ Params(
+ long unsigned int n_channels,
+ const int8_t *const *inptrs_raw,
+ const void *const weights,
+ const int32_t *const bias,
+ const arm_gemm::Requantize32 &qp,
+ const int32_t *const requant_muls,
+ const int32_t *const requant_shifts,
+ int8_t *const *outptrs
+ ) : n_channels(n_channels), weights(weights), bias(bias),
+ requant(&qp), requant_muls(requant_muls),
+ requant_shifts(requant_shifts), outptrs(outptrs)
+ {
+ // Entries 0-13 are a fixed permutation of the caller's pointer array
+ // (presumably the order in which the assembly loop first consumes the
+ // input rows — generated alongside the asm; verify against the kernel
+ // if this table is ever touched). Entries 14-35 are copied verbatim.
+ inptrs[0] = inptrs_raw[0];
+ inptrs[1] = inptrs_raw[1];
+ inptrs[2] = inptrs_raw[6];
+ inptrs[3] = inptrs_raw[7];
+ inptrs[4] = inptrs_raw[2];
+ inptrs[5] = inptrs_raw[8];
+ inptrs[6] = inptrs_raw[3];
+ inptrs[7] = inptrs_raw[4];
+ inptrs[8] = inptrs_raw[11];
+ inptrs[9] = inptrs_raw[12];
+ inptrs[10] = inptrs_raw[9];
+ inptrs[11] = inptrs_raw[10];
+ inptrs[12] = inptrs_raw[5];
+ inptrs[13] = inptrs_raw[13];
+ inptrs[14] = inptrs_raw[14];
+ inptrs[15] = inptrs_raw[15];
+ inptrs[16] = inptrs_raw[16];
+ inptrs[17] = inptrs_raw[17];
+ inptrs[18] = inptrs_raw[18];
+ inptrs[19] = inptrs_raw[19];
+ inptrs[20] = inptrs_raw[20];
+ inptrs[21] = inptrs_raw[21];
+ inptrs[22] = inptrs_raw[22];
+ inptrs[23] = inptrs_raw[23];
+ inptrs[24] = inptrs_raw[24];
+ inptrs[25] = inptrs_raw[25];
+ inptrs[26] = inptrs_raw[26];
+ inptrs[27] = inptrs_raw[27];
+ inptrs[28] = inptrs_raw[28];
+ inptrs[29] = inptrs_raw[29];
+ inptrs[30] = inptrs_raw[30];
+ inptrs[31] = inptrs_raw[31];
+ inptrs[32] = inptrs_raw[32];
+ inptrs[33] = inptrs_raw[33];
+ inptrs[34] = inptrs_raw[34];
+ inptrs[35] = inptrs_raw[35];
+
+ }
+ };
+
+ const Params params(n_channels, inptrs, weights, bias, qp,
+ requant_muls, requant_shifts, outptrs);
+
+ __asm__ __volatile__(
+ "mov x2, #0x0\n"
+ "mov x24, x2\n"
+ "ldr x23, [%x[params], %[offsetof_Params_requant]]\n"
+ "ldr x3, [%x[params], %[offsetof_Params_n_channels]]\n"
+ "ptrue p4.b\n"
+ "ldr x22, [%x[params], %[offsetof_Params_outptrs]]\n"
+ "incw x24\n"
+ "ldr x4, [%x[params], %[offsetof_Params_weights]]\n"
+ "add x21, x23, %[offsetof_Requantize32_a_offset]\n"
+ "add x20, x23, %[offsetof_Requantize32_b_offset]\n"
+ "ld1rb { z30.b }, p4/Z, [x21]\n"
+ "ld1rb { z10.b }, p4/Z, [x20]\n"
+ "add x21, x23, %[offsetof_Requantize32_c_offset]\n"
+ "add x20, x23, %[offsetof_Requantize32_minval]\n"
+ "ld1rh { z15.h }, p4/Z, [x21]\n"
+ "ld1rh { z12.h }, p4/Z, [x20]\n"
+ "add x20, x23, %[offsetof_Requantize32_maxval]\n"
+ "ld1rh { z13.h }, p4/Z, [x20]\n"
+ "ldp x5, x6, [x22, #0x0]\n"
+ "whilelt p3.h, x2, x3\n"
+ "ldp x7, x8, [x22, #0x10]\n"
+ "whilelt p2.s, x2, x3\n"
+ "whilelt p1.s, x24, x3\n"
+ "ldr x10, [%x[params], %[offsetof_Params_bias]]\n"
+ "add x17, %x[params], %[offsetof_Params_inptrs]\n"
+ "ld1w { z17.s }, p2/Z, [x10]\n"
+ "ld1w { z16.s }, p1/Z, [x10, #1, MUL VL]\n"
+ "uzp1 z14.s, z17.s, z16.s\n"
+ "ld1sb { z26.h }, p4/Z, [x4]\n"
+ "ld1sb { z8.h }, p4/Z, [x4, #1, MUL VL]\n"
+ "uzp2 z23.s, z17.s, z16.s\n"
+ "addvl x10, x10, #2\n"
+ "ld1sb { z16.h }, p4/Z, [x4, #2, MUL VL]\n"
+ "ld1sb { z21.h }, p4/Z, [x4, #3, MUL VL]\n"
+ "mov x16, #0x0\n"
+ "mov z6.d, z14.d\n"
+ "ld1sb { z17.h }, p4/Z, [x4, #4, MUL VL]\n"
+ "ldp x9, x28, [x17, #0x0]\n"
+ "mov z18.d, z23.d\n"
+ "mov z9.d, z14.d\n"
+ "ldp x27, x26, [x17, #0x10]\n"
+ "ldp x25, x24, [x17, #0x20]\n"
+ "mov z20.d, z23.d\n"
+ "mov z7.d, z14.d\n"
+ "ldp x23, x22, [x17, #0x30]\n"
+ "ldp x21, x20, [x17, #0x40]\n"
+ "mov z1.d, z23.d\n"
+ ".inst 0x454a135a // ssublb z26.h, z26.b, z10.b\n"
+ "ld1sb { z22.h }, p3/Z, [x9, x2]\n"
+ "ld1sb { z2.h }, p3/Z, [x28, x2]\n"
+ ".inst 0x454a1108 // ssublb z8.h, z8.b, z10.b\n"
+ ".inst 0x454a1210 // ssublb z16.h, z16.b, z10.b\n"
+ "ld1sb { z11.h }, p3/Z, [x27, x2]\n"
+ "ld1sb { z3.h }, p3/Z, [x26, x2]\n"
+ ".inst 0x454a12b5 // ssublb z21.h, z21.b, z10.b\n"
+ ".inst 0x454a1231 // ssublb z17.h, z17.b, z10.b\n"
+ "ld1sb { z29.h }, p3/Z, [x25, x2]\n"
+ "ld1sb { z4.h }, p3/Z, [x24, x2]\n"
+ ".inst 0x455e12d6 // ssublb z22.h, z22.b, z30.b\n"
+ ".inst 0x455e1042 // ssublb z2.h, z2.b, z30.b\n"
+ "ld1sb { z31.h }, p3/Z, [x23, x2]\n"
+ "ld1sb { z0.h }, p3/Z, [x22, x2]\n"
+ ".inst 0x455e116b // ssublb z11.h, z11.b, z30.b\n"
+ ".inst 0x455e1063 // ssublb z3.h, z3.b, z30.b\n"
+ "ld1sb { z19.h }, p3/Z, [x21, x2]\n"
+ "ld1sb { z28.h }, p3/Z, [x20, x2]\n"
+ ".inst 0x455e13bd // ssublb z29.h, z29.b, z30.b\n"
+ ".inst 0x455e1084 // ssublb z4.h, z4.b, z30.b\n"
+ "ldr x15, [%x[params], %[offsetof_Params_requant_muls]]\n"
+ "ldr x14, [%x[params], %[offsetof_Params_requant_shifts]]\n"
+ "str x10, [%x[params], %[offsetof_Params_bias]]\n"
+ ".inst 0x455e13ff // ssublb z31.h, z31.b, z30.b\n"
+ ".inst 0x455e1000 // ssublb z0.h, z0.b, z30.b\n"
+ ".inst 0x455e1273 // ssublb z19.h, z19.b, z30.b\n"
+ ".inst 0x455e139c // ssublb z28.h, z28.b, z30.b\n"
+ "1:" // Loop
+ ".inst 0x449a42ce // smlalb z14.s, p4/M, z22.h, z26.h\n"
+ ".inst 0x449a46d7 // smlalt z23.s, p4/M, z22.h, z26.h\n"
+ "ldr x20, [x17, #0x50]\n"
+ "ld1sb { z27.h }, p3/Z, [x20, x2]\n"
+ ".inst 0x4488404e // smlalb z14.s, p4/M, z2.h, z8.h\n"
+ ".inst 0x449a4046 // smlalb z6.s, p4/M, z2.h, z26.h\n"
+ "ldr x20, [x17, #0x58]\n"
+ ".inst 0x455e137b // ssublb z27.h, z27.b, z30.b\n"
+ ".inst 0x449a4169 // smlalb z9.s, p4/M, z11.h, z26.h\n"
+ ".inst 0x449a4067 // smlalb z7.s, p4/M, z3.h, z26.h\n"
+ "ld1sb { z5.h }, p3/Z, [x20, x2]\n"
+ "ldr x20, [x17, #0x60]\n"
+ ".inst 0x44884457 // smlalt z23.s, p4/M, z2.h, z8.h\n"
+ ".inst 0x449043ae // smlalb z14.s, p4/M, z29.h, z16.h\n"
+ "ld1sb { z25.h }, p4/Z, [x4, #5, MUL VL]\n"
+ ".inst 0x455e10a5 // ssublb z5.h, z5.b, z30.b\n"
+ ".inst 0x449a4452 // smlalt z18.s, p4/M, z2.h, z26.h\n"
+ ".inst 0x449a4574 // smlalt z20.s, p4/M, z11.h, z26.h\n"
+ "ld1sb { z22.h }, p3/Z, [x20, x2]\n"
+ ".inst 0x454a1339 // ssublb z25.h, z25.b, z10.b\n"
+ ".inst 0x449a4461 // smlalt z1.s, p4/M, z3.h, z26.h\n"
+ ".inst 0x448843a6 // smlalb z6.s, p4/M, z29.h, z8.h\n"
+ "ldr x20, [x17, #0x68]\n"
+ "ld1sb { z2.h }, p4/Z, [x4, #6, MUL VL]\n"
+ ".inst 0x44884069 // smlalb z9.s, p4/M, z3.h, z8.h\n"
+ ".inst 0x44884087 // smlalb z7.s, p4/M, z4.h, z8.h\n"
+ ".inst 0x455e12d6 // ssublb z22.h, z22.b, z30.b\n"
+ "ld1sb { z26.h }, p3/Z, [x20, x2]\n"
+ ".inst 0x449047b7 // smlalt z23.s, p4/M, z29.h, z16.h\n"
+ ".inst 0x449543ee // smlalb z14.s, p4/M, z31.h, z21.h\n"
+ ".inst 0x454a1042 // ssublb z2.h, z2.b, z10.b\n"
+ "ldr x20, [x17, #0x70]\n"
+ ".inst 0x448847b2 // smlalt z18.s, p4/M, z29.h, z8.h\n"
+ ".inst 0x44884474 // smlalt z20.s, p4/M, z3.h, z8.h\n"
+ "ld1sb { z29.h }, p4/Z, [x4, #7, MUL VL]\n"
+ ".inst 0x455e135a // ssublb z26.h, z26.b, z30.b\n"
+ ".inst 0x44884481 // smlalt z1.s, p4/M, z4.h, z8.h\n"
+ ".inst 0x449043e6 // smlalb z6.s, p4/M, z31.h, z16.h\n"
+ "inch x4, ALL, MUL #8\n"
+ "ld1sb { z8.h }, p3/Z, [x20, x2]\n"
+ ".inst 0x44904089 // smlalb z9.s, p4/M, z4.h, z16.h\n"
+ ".inst 0x44904367 // smlalb z7.s, p4/M, z27.h, z16.h\n"
+ ".inst 0x454a13bd // ssublb z29.h, z29.b, z10.b\n"
+ "ldr x20, [x17, #0x78]\n"
+ ".inst 0x449547f7 // smlalt z23.s, p4/M, z31.h, z21.h\n"
+ ".inst 0x4491400e // smlalb z14.s, p4/M, z0.h, z17.h\n"
+ "ld1sb { z24.h }, p4/Z, [x4]\n"
+ ".inst 0x455e1108 // ssublb z8.h, z8.b, z30.b\n"
+ ".inst 0x449047f2 // smlalt z18.s, p4/M, z31.h, z16.h\n"
+ ".inst 0x44904494 // smlalt z20.s, p4/M, z4.h, z16.h\n"
+ "ld1sb { z31.h }, p3/Z, [x20, x2]\n"
+ ".inst 0x454a1318 // ssublb z24.h, z24.b, z10.b\n"
+ ".inst 0x44904761 // smlalt z1.s, p4/M, z27.h, z16.h\n"
+ ".inst 0x44954006 // smlalb z6.s, p4/M, z0.h, z21.h\n"
+ "ldr x22, [x17, #0x80]\n"
+ "ld1sb { z16.h }, p4/Z, [x4, #1, MUL VL]\n"
+ ".inst 0x44954369 // smlalb z9.s, p4/M, z27.h, z21.h\n"
+ ".inst 0x449540a7 // smlalb z7.s, p4/M, z5.h, z21.h\n"
+ ".inst 0x455e13ff // ssublb z31.h, z31.b, z30.b\n"
+ "ldr x21, [x17, #0x88]\n"
+ ".inst 0x44914417 // smlalt z23.s, p4/M, z0.h, z17.h\n"
+ ".inst 0x4499416e // smlalb z14.s, p4/M, z11.h, z25.h\n"
+ ".inst 0x454a1210 // ssublb z16.h, z16.b, z10.b\n"
+ "ldr x20, [x17, #0x90]\n"
+ ".inst 0x44954412 // smlalt z18.s, p4/M, z0.h, z21.h\n"
+ ".inst 0x44954774 // smlalt z20.s, p4/M, z27.h, z21.h\n"
+ "ld1sb { z0.h }, p3/Z, [x22, x2]\n"
+ ".inst 0x455e1000 // ssublb z0.h, z0.b, z30.b\n"
+ ".inst 0x449544a1 // smlalt z1.s, p4/M, z5.h, z21.h\n"
+ ".inst 0x449142c6 // smlalb z6.s, p4/M, z22.h, z17.h\n"
+ "ld1sb { z21.h }, p4/Z, [x4, #2, MUL VL]\n"
+ ".inst 0x454a12b5 // ssublb z21.h, z21.b, z10.b\n"
+ ".inst 0x449140a9 // smlalb z9.s, p4/M, z5.h, z17.h\n"
+ ".inst 0x44914267 // smlalb z7.s, p4/M, z19.h, z17.h\n"
+ "ldr x23, [x17, #0x98]\n"
+ "ldr x22, [x17, #0xa0]\n"
+ ".inst 0x44994577 // smlalt z23.s, p4/M, z11.h, z25.h\n"
+ ".inst 0x4482406e // smlalb z14.s, p4/M, z3.h, z2.h\n"
+ "ld1sb { z11.h }, p3/Z, [x21, x2]\n"
+ ".inst 0x455e116b // ssublb z11.h, z11.b, z30.b\n"
+ ".inst 0x449146d2 // smlalt z18.s, p4/M, z22.h, z17.h\n"
+ ".inst 0x449144b4 // smlalt z20.s, p4/M, z5.h, z17.h\n"
+ "ld1sb { z22.h }, p4/Z, [x4, #3, MUL VL]\n"
+ ".inst 0x454a12d6 // ssublb z22.h, z22.b, z10.b\n"
+ ".inst 0x44914661 // smlalt z1.s, p4/M, z19.h, z17.h\n"
+ ".inst 0x44994066 // smlalb z6.s, p4/M, z3.h, z25.h\n"
+ "ld1sb { z17.h }, p3/Z, [x20, x2]\n"
+ ".inst 0x455e1231 // ssublb z17.h, z17.b, z30.b\n"
+ ".inst 0x44994389 // smlalb z9.s, p4/M, z28.h, z25.h\n"
+ ".inst 0x44994347 // smlalb z7.s, p4/M, z26.h, z25.h\n"
+ "ldr x20, [x17, #0xa8]\n"
+ "ldr x21, [x17, #0xb0]\n"
+ ".inst 0x44824477 // smlalt z23.s, p4/M, z3.h, z2.h\n"
+ ".inst 0x449d408e // smlalb z14.s, p4/M, z4.h, z29.h\n"
+ "ldr x13, [x17, #0xb8]\n"
+ "ldr x12, [x17, #0xc0]\n"
+ ".inst 0x44994472 // smlalt z18.s, p4/M, z3.h, z25.h\n"
+ ".inst 0x44994794 // smlalt z20.s, p4/M, z28.h, z25.h\n"
+ "ld1sb { z3.h }, p3/Z, [x23, x2]\n"
+ ".inst 0x455e1063 // ssublb z3.h, z3.b, z30.b\n"
+ ".inst 0x44994741 // smlalt z1.s, p4/M, z26.h, z25.h\n"
+ ".inst 0x44824086 // smlalb z6.s, p4/M, z4.h, z2.h\n"
+ "ld1sb { z25.h }, p4/Z, [x4, #4, MUL VL]\n"
+ ".inst 0x454a1339 // ssublb z25.h, z25.b, z10.b\n"
+ ".inst 0x44824349 // smlalb z9.s, p4/M, z26.h, z2.h\n"
+ ".inst 0x44824107 // smlalb z7.s, p4/M, z8.h, z2.h\n"
+ "ldr x11, [x17, #0xc8]\n"
+ "ldr x10, [x17, #0xd0]\n"
+ ".inst 0x449d4497 // smlalt z23.s, p4/M, z4.h, z29.h\n"
+ ".inst 0x4498436e // smlalb z14.s, p4/M, z27.h, z24.h\n"
+ "ldr x9, [x17, #0xd8]\n"
+ "ldr x28, [x17, #0xe0]\n"
+ ".inst 0x44824492 // smlalt z18.s, p4/M, z4.h, z2.h\n"
+ ".inst 0x44824754 // smlalt z20.s, p4/M, z26.h, z2.h\n"
+ "ld1sb { z4.h }, p3/Z, [x22, x2]\n"
+ ".inst 0x455e1084 // ssublb z4.h, z4.b, z30.b\n"
+ ".inst 0x44824501 // smlalt z1.s, p4/M, z8.h, z2.h\n"
+ ".inst 0x449d4366 // smlalb z6.s, p4/M, z27.h, z29.h\n"
+ "ld1sb { z2.h }, p4/Z, [x4, #5, MUL VL]\n"
+ ".inst 0x454a1042 // ssublb z2.h, z2.b, z10.b\n"
+ ".inst 0x449d4109 // smlalb z9.s, p4/M, z8.h, z29.h\n"
+ ".inst 0x449d43e7 // smlalb z7.s, p4/M, z31.h, z29.h\n"
+ "ldr x27, [x17, #0xe8]\n"
+ "ldr x26, [x17, #0xf0]\n"
+ ".inst 0x44984777 // smlalt z23.s, p4/M, z27.h, z24.h\n"
+ ".inst 0x449040ae // smlalb z14.s, p4/M, z5.h, z16.h\n"
+ "ldr x25, [x17, #0xf8]\n"
+ "ldr x24, [x17, #0x100]\n"
+ ".inst 0x449d4772 // smlalt z18.s, p4/M, z27.h, z29.h\n"
+ ".inst 0x449d4514 // smlalt z20.s, p4/M, z8.h, z29.h\n"
+ "ld1sb { z27.h }, p3/Z, [x20, x2]\n"
+ ".inst 0x455e137b // ssublb z27.h, z27.b, z30.b\n"
+ ".inst 0x449d47e1 // smlalt z1.s, p4/M, z31.h, z29.h\n"
+ ".inst 0x449840a6 // smlalb z6.s, p4/M, z5.h, z24.h\n"
+ "ld1sb { z29.h }, p4/Z, [x4, #6, MUL VL]\n"
+ ".inst 0x454a13bd // ssublb z29.h, z29.b, z10.b\n"
+ ".inst 0x449843e9 // smlalb z9.s, p4/M, z31.h, z24.h\n"
+ ".inst 0x44984007 // smlalb z7.s, p4/M, z0.h, z24.h\n"
+ "ldr x23, [x17, #0x108]\n"
+ "ldr x22, [x17, #0x110]\n"
+ ".inst 0x449044b7 // smlalt z23.s, p4/M, z5.h, z16.h\n"
+ ".inst 0x4495438e // smlalb z14.s, p4/M, z28.h, z21.h\n"
+ "ldr x20, [x17, #0x118]\n"
+ "whilelt p0.h, x16, x3\n"
+ ".inst 0x449844b2 // smlalt z18.s, p4/M, z5.h, z24.h\n"
+ ".inst 0x449847f4 // smlalt z20.s, p4/M, z31.h, z24.h\n"
+ "ld1sb { z5.h }, p3/Z, [x21, x2]\n"
+ ".inst 0x455e10a5 // ssublb z5.h, z5.b, z30.b\n"
+ ".inst 0x44984401 // smlalt z1.s, p4/M, z0.h, z24.h\n"
+ ".inst 0x44904266 // smlalb z6.s, p4/M, z19.h, z16.h\n"
+ "ld1sb { z24.h }, p4/Z, [x4, #7, MUL VL]\n"
+ "inch x4, ALL, MUL #8\n"
+ ".inst 0x44904009 // smlalb z9.s, p4/M, z0.h, z16.h\n"
+ ".inst 0x44904167 // smlalb z7.s, p4/M, z11.h, z16.h\n"
+ ".inst 0x454a1318 // ssublb z24.h, z24.b, z10.b\n"
+ "ldr x21, [%x[params], %[offsetof_Params_bias]]\n"
+ ".inst 0x44954797 // smlalt z23.s, p4/M, z28.h, z21.h\n"
+ ".inst 0x4496434e // smlalb z14.s, p4/M, z26.h, z22.h\n"
+ "ld1sb { z28.h }, p3/Z, [x13, x2]\n"
+ ".inst 0x455e139c // ssublb z28.h, z28.b, z30.b\n"
+ ".inst 0x44904672 // smlalt z18.s, p4/M, z19.h, z16.h\n"
+ ".inst 0x44904414 // smlalt z20.s, p4/M, z0.h, z16.h\n"
+ "ld1sb { z19.h }, p4/Z, [x4]\n"
+ ".inst 0x454a1273 // ssublb z19.h, z19.b, z10.b\n"
+ ".inst 0x44904561 // smlalt z1.s, p4/M, z11.h, z16.h\n"
+ ".inst 0x44954346 // smlalb z6.s, p4/M, z26.h, z21.h\n"
+ "ld1sb { z16.h }, p3/Z, [x12, x2]\n"
+ ".inst 0x455e1210 // ssublb z16.h, z16.b, z30.b\n"
+ ".inst 0x44954229 // smlalb z9.s, p4/M, z17.h, z21.h\n"
+ ".inst 0x44954067 // smlalb z7.s, p4/M, z3.h, z21.h\n"
+ ".inst 0x44964757 // smlalt z23.s, p4/M, z26.h, z22.h\n"
+ ".inst 0x4499410e // smlalb z14.s, p4/M, z8.h, z25.h\n"
+ ".inst 0x44954752 // smlalt z18.s, p4/M, z26.h, z21.h\n"
+ ".inst 0x44954634 // smlalt z20.s, p4/M, z17.h, z21.h\n"
+ "ld1sb { z26.h }, p3/Z, [x11, x2]\n"
+ ".inst 0x455e135a // ssublb z26.h, z26.b, z30.b\n"
+ ".inst 0x44954461 // smlalt z1.s, p4/M, z3.h, z21.h\n"
+ ".inst 0x44964106 // smlalb z6.s, p4/M, z8.h, z22.h\n"
+ "ld1sb { z21.h }, p4/Z, [x4, #1, MUL VL]\n"
+ ".inst 0x454a12b5 // ssublb z21.h, z21.b, z10.b\n"
+ ".inst 0x44964069 // smlalb z9.s, p4/M, z3.h, z22.h\n"
+ ".inst 0x44964087 // smlalb z7.s, p4/M, z4.h, z22.h\n"
+ ".inst 0x44994517 // smlalt z23.s, p4/M, z8.h, z25.h\n"
+ ".inst 0x448243ee // smlalb z14.s, p4/M, z31.h, z2.h\n"
+ ".inst 0x44964512 // smlalt z18.s, p4/M, z8.h, z22.h\n"
+ ".inst 0x44964474 // smlalt z20.s, p4/M, z3.h, z22.h\n"
+ "ld1sb { z8.h }, p3/Z, [x10, x2]\n"
+ ".inst 0x455e1108 // ssublb z8.h, z8.b, z30.b\n"
+ ".inst 0x44964481 // smlalt z1.s, p4/M, z4.h, z22.h\n"
+ ".inst 0x449943e6 // smlalb z6.s, p4/M, z31.h, z25.h\n"
+ "ld1sb { z22.h }, p4/Z, [x4, #2, MUL VL]\n"
+ ".inst 0x454a12d6 // ssublb z22.h, z22.b, z10.b\n"
+ ".inst 0x44994089 // smlalb z9.s, p4/M, z4.h, z25.h\n"
+ ".inst 0x44994367 // smlalb z7.s, p4/M, z27.h, z25.h\n"
+ ".inst 0x448247f7 // smlalt z23.s, p4/M, z31.h, z2.h\n"
+ ".inst 0x449d400e // smlalb z14.s, p4/M, z0.h, z29.h\n"
+ ".inst 0x449947f2 // smlalt z18.s, p4/M, z31.h, z25.h\n"
+ ".inst 0x44994494 // smlalt z20.s, p4/M, z4.h, z25.h\n"
+ "ld1sb { z31.h }, p3/Z, [x9, x2]\n"
+ ".inst 0x455e13ff // ssublb z31.h, z31.b, z30.b\n"
+ ".inst 0x44994761 // smlalt z1.s, p4/M, z27.h, z25.h\n"
+ ".inst 0x44824006 // smlalb z6.s, p4/M, z0.h, z2.h\n"
+ "ld1sb { z25.h }, p4/Z, [x4, #3, MUL VL]\n"
+ ".inst 0x454a1339 // ssublb z25.h, z25.b, z10.b\n"
+ ".inst 0x44824369 // smlalb z9.s, p4/M, z27.h, z2.h\n"
+ ".inst 0x448240a7 // smlalb z7.s, p4/M, z5.h, z2.h\n"
+ ".inst 0x449d4417 // smlalt z23.s, p4/M, z0.h, z29.h\n"
+ ".inst 0x4498422e // smlalb z14.s, p4/M, z17.h, z24.h\n"
+ ".inst 0x44824412 // smlalt z18.s, p4/M, z0.h, z2.h\n"
+ ".inst 0x44824774 // smlalt z20.s, p4/M, z27.h, z2.h\n"
+ "ld1sb { z0.h }, p3/Z, [x28, x2]\n"
+ ".inst 0x455e1000 // ssublb z0.h, z0.b, z30.b\n"
+ ".inst 0x448244a1 // smlalt z1.s, p4/M, z5.h, z2.h\n"
+ ".inst 0x449d4166 // smlalb z6.s, p4/M, z11.h, z29.h\n"
+ "ld1sb { z2.h }, p4/Z, [x4, #4, MUL VL]\n"
+ ".inst 0x454a1042 // ssublb z2.h, z2.b, z10.b\n"
+ ".inst 0x449d40a9 // smlalb z9.s, p4/M, z5.h, z29.h\n"
+ ".inst 0x449d4387 // smlalb z7.s, p4/M, z28.h, z29.h\n"
+ ".inst 0x44984637 // smlalt z23.s, p4/M, z17.h, z24.h\n"
+ ".inst 0x4493406e // smlalb z14.s, p4/M, z3.h, z19.h\n"
+ "ld1sb { z17.h }, p3/Z, [x27, x2]\n"
+ ".inst 0x455e1231 // ssublb z17.h, z17.b, z30.b\n"
+ ".inst 0x449d4572 // smlalt z18.s, p4/M, z11.h, z29.h\n"
+ ".inst 0x449d44b4 // smlalt z20.s, p4/M, z5.h, z29.h\n"
+ "ld1sb { z11.h }, p4/Z, [x4, #5, MUL VL]\n"
+ ".inst 0x454a116b // ssublb z11.h, z11.b, z10.b\n"
+ ".inst 0x449d4781 // smlalt z1.s, p4/M, z28.h, z29.h\n"
+ ".inst 0x44984066 // smlalb z6.s, p4/M, z3.h, z24.h\n"
+ "ld1sb { z29.h }, p3/Z, [x26, x2]\n"
+ ".inst 0x455e13bd // ssublb z29.h, z29.b, z30.b\n"
+ ".inst 0x44984209 // smlalb z9.s, p4/M, z16.h, z24.h\n"
+ ".inst 0x44984347 // smlalb z7.s, p4/M, z26.h, z24.h\n"
+ ".inst 0x44934477 // smlalt z23.s, p4/M, z3.h, z19.h\n"
+ ".inst 0x4495408e // smlalb z14.s, p4/M, z4.h, z21.h\n"
+ ".inst 0x44984472 // smlalt z18.s, p4/M, z3.h, z24.h\n"
+ ".inst 0x44984614 // smlalt z20.s, p4/M, z16.h, z24.h\n"
+ "ld1sb { z3.h }, p3/Z, [x25, x2]\n"
+ ".inst 0x455e1063 // ssublb z3.h, z3.b, z30.b\n"
+ ".inst 0x44984741 // smlalt z1.s, p4/M, z26.h, z24.h\n"
+ ".inst 0x44934086 // smlalb z6.s, p4/M, z4.h, z19.h\n"
+ "ld1sb { z24.h }, p4/Z, [x4, #6, MUL VL]\n"
+ ".inst 0x454a1318 // ssublb z24.h, z24.b, z10.b\n"
+ ".inst 0x44934349 // smlalb z9.s, p4/M, z26.h, z19.h\n"
+ ".inst 0x44934107 // smlalb z7.s, p4/M, z8.h, z19.h\n"
+ ".inst 0x44954497 // smlalt z23.s, p4/M, z4.h, z21.h\n"
+ ".inst 0x4496436e // smlalb z14.s, p4/M, z27.h, z22.h\n"
+ ".inst 0x44934492 // smlalt z18.s, p4/M, z4.h, z19.h\n"
+ ".inst 0x44934754 // smlalt z20.s, p4/M, z26.h, z19.h\n"
+ "ld1sb { z4.h }, p3/Z, [x24, x2]\n"
+ ".inst 0x455e1084 // ssublb z4.h, z4.b, z30.b\n"
+ ".inst 0x44934501 // smlalt z1.s, p4/M, z8.h, z19.h\n"
+ ".inst 0x44954366 // smlalb z6.s, p4/M, z27.h, z21.h\n"
+ "ld1sb { z19.h }, p4/Z, [x4, #7, MUL VL]\n"
+ "inch x4, ALL, MUL #8\n"
+ ".inst 0x44954109 // smlalb z9.s, p4/M, z8.h, z21.h\n"
+ ".inst 0x449543e7 // smlalb z7.s, p4/M, z31.h, z21.h\n"
+ ".inst 0x454a1273 // ssublb z19.h, z19.b, z10.b\n"
+ ".inst 0x44964777 // smlalt z23.s, p4/M, z27.h, z22.h\n"
+ ".inst 0x449940ae // smlalb z14.s, p4/M, z5.h, z25.h\n"
+ ".inst 0x44954772 // smlalt z18.s, p4/M, z27.h, z21.h\n"
+ ".inst 0x44954514 // smlalt z20.s, p4/M, z8.h, z21.h\n"
+ "ld1sb { z27.h }, p3/Z, [x23, x2]\n"
+ ".inst 0x455e137b // ssublb z27.h, z27.b, z30.b\n"
+ ".inst 0x449547e1 // smlalt z1.s, p4/M, z31.h, z21.h\n"
+ ".inst 0x449640a6 // smlalb z6.s, p4/M, z5.h, z22.h\n"
+ "ld1sb { z21.h }, p4/Z, [x4]\n"
+ ".inst 0x454a12b5 // ssublb z21.h, z21.b, z10.b\n"
+ ".inst 0x449643e9 // smlalb z9.s, p4/M, z31.h, z22.h\n"
+ ".inst 0x44964007 // smlalb z7.s, p4/M, z0.h, z22.h\n"
+ "inch x4\n"
+ ".inst 0x449944b7 // smlalt z23.s, p4/M, z5.h, z25.h\n"
+ ".inst 0x4482420e // smlalb z14.s, p4/M, z16.h, z2.h\n"
+ ".inst 0x449644b2 // smlalt z18.s, p4/M, z5.h, z22.h\n"
+ ".inst 0x449647f4 // smlalt z20.s, p4/M, z31.h, z22.h\n"
+ "ld1sb { z5.h }, p3/Z, [x22, x2]\n"
+ ".inst 0x455e10a5 // ssublb z5.h, z5.b, z30.b\n"
+ ".inst 0x44964401 // smlalt z1.s, p4/M, z0.h, z22.h\n"
+ ".inst 0x44994386 // smlalb z6.s, p4/M, z28.h, z25.h\n"
+ "ld1w { z22.s }, p2/Z, [x15]\n"
+ ".inst 0x44994009 // smlalb z9.s, p4/M, z0.h, z25.h\n"
+ ".inst 0x44994227 // smlalb z7.s, p4/M, z17.h, z25.h\n"
+ ".inst 0x44824617 // smlalt z23.s, p4/M, z16.h, z2.h\n"
+ ".inst 0x448b434e // smlalb z14.s, p4/M, z26.h, z11.h\n"
+ "ld1w { z16.s }, p1/Z, [x15, #1, MUL VL]\n"
+ "addvl x15, x15, #2\n"
+ ".inst 0x44994792 // smlalt z18.s, p4/M, z28.h, z25.h\n"
+ ".inst 0x44994414 // smlalt z20.s, p4/M, z0.h, z25.h\n"
+ "ld1sb { z28.h }, p3/Z, [x20, x2]\n"
+ ".inst 0x455e139c // ssublb z28.h, z28.b, z30.b\n"
+ ".inst 0x44994621 // smlalt z1.s, p4/M, z17.h, z25.h\n"
+ ".inst 0x44824346 // smlalb z6.s, p4/M, z26.h, z2.h\n"
+ "uzp1 z25.s, z22.s, z16.s\n"
+ "inch x2\n"
+ ".inst 0x448243a9 // smlalb z9.s, p4/M, z29.h, z2.h\n"
+ ".inst 0x44824067 // smlalb z7.s, p4/M, z3.h, z2.h\n"
+ "uzp2 z16.s, z22.s, z16.s\n"
+ "ld1w { z22.s }, p2/Z, [x14]\n"
+ ".inst 0x448b4757 // smlalt z23.s, p4/M, z26.h, z11.h\n"
+ ".inst 0x4498410e // smlalb z14.s, p4/M, z8.h, z24.h\n"
+ "mov x20, x2\n"
+ "incw x20\n"
+ ".inst 0x44824752 // smlalt z18.s, p4/M, z26.h, z2.h\n"
+ ".inst 0x448247b4 // smlalt z20.s, p4/M, z29.h, z2.h\n"
+ "ld1w { z26.s }, p1/Z, [x14, #1, MUL VL]\n"
+ "uzp1 z29.s, z22.s, z26.s\n"
+ ".inst 0x44824461 // smlalt z1.s, p4/M, z3.h, z2.h\n"
+ ".inst 0x448b4106 // smlalb z6.s, p4/M, z8.h, z11.h\n"
+ "uzp2 z22.s, z22.s, z26.s\n"
+ "whilelt p2.s, x2, x3\n"
+ ".inst 0x448b4069 // smlalb z9.s, p4/M, z3.h, z11.h\n"
+ ".inst 0x448b4087 // smlalb z7.s, p4/M, z4.h, z11.h\n"
+ "whilelt p1.s, x20, x3\n"
+ "whilelt p3.h, x2, x3\n"
+ ".inst 0x44984517 // smlalt z23.s, p4/M, z8.h, z24.h\n"
+ ".inst 0x449343ee // smlalb z14.s, p4/M, z31.h, z19.h\n"
+ "addvl x14, x14, #2\n"
+ ".inst 0x448b4512 // smlalt z18.s, p4/M, z8.h, z11.h\n"
+ ".inst 0x448b4474 // smlalt z20.s, p4/M, z3.h, z11.h\n"
+ ".inst 0x448b4481 // smlalt z1.s, p4/M, z4.h, z11.h\n"
+ ".inst 0x449843e6 // smlalb z6.s, p4/M, z31.h, z24.h\n"
+ ".inst 0x44984089 // smlalb z9.s, p4/M, z4.h, z24.h\n"
+ ".inst 0x44984367 // smlalb z7.s, p4/M, z27.h, z24.h\n"
+ ".inst 0x449347f7 // smlalt z23.s, p4/M, z31.h, z19.h\n"
+ ".inst 0x4495400e // smlalb z14.s, p4/M, z0.h, z21.h\n"
+ ".inst 0x04b975ce // sqrdmulh z14.s, z14.s, z25.s\n"
+ ".inst 0x449847f2 // smlalt z18.s, p4/M, z31.h, z24.h\n"
+ ".inst 0x44984494 // smlalt z20.s, p4/M, z4.h, z24.h\n"
+ "and z3.d, z14.d, z29.d\n"
+ ".inst 0x44984761 // smlalt z1.s, p4/M, z27.h, z24.h\n"
+ ".inst 0x44934006 // smlalb z6.s, p4/M, z0.h, z19.h\n"
+ "asr z3.s, z3.s, #0x1f\n"
+ ".inst 0x44934369 // smlalb z9.s, p4/M, z27.h, z19.h\n"
+ ".inst 0x449340a7 // smlalb z7.s, p4/M, z5.h, z19.h\n"
+ "sqadd z14.s, z14.s, z3.s\n"
+ ".inst 0x448293ae // srshl z14.s, p4/M, z14.s, z29.s\n"
+ ".inst 0x44954417 // smlalt z23.s, p4/M, z0.h, z21.h\n"
+ ".inst 0x44934412 // smlalt z18.s, p4/M, z0.h, z19.h\n"
+ ".inst 0x04b076f7 // sqrdmulh z23.s, z23.s, z16.s\n"
+ ".inst 0x44934774 // smlalt z20.s, p4/M, z27.h, z19.h\n"
+ ".inst 0x449344a1 // smlalt z1.s, p4/M, z5.h, z19.h\n"
+ "and z31.d, z23.d, z22.d\n"
+ ".inst 0x44954226 // smlalb z6.s, p4/M, z17.h, z21.h\n"
+ ".inst 0x449540a9 // smlalb z9.s, p4/M, z5.h, z21.h\n"
+ ".inst 0x04b974c6 // sqrdmulh z6.s, z6.s, z25.s\n"
+ ".inst 0x44954387 // smlalb z7.s, p4/M, z28.h, z21.h\n"
+ ".inst 0x44954632 // smlalt z18.s, p4/M, z17.h, z21.h\n"
+ ".inst 0x04b97529 // sqrdmulh z9.s, z9.s, z25.s\n"
+ ".inst 0x449544b4 // smlalt z20.s, p4/M, z5.h, z21.h\n"
+ ".inst 0x44954781 // smlalt z1.s, p4/M, z28.h, z21.h\n"
+ ".inst 0x04b974e7 // sqrdmulh z7.s, z7.s, z25.s\n"
+ "asr z31.s, z31.s, #0x1f\n"
+ "and z3.d, z6.d, z29.d\n"
+ ".inst 0x04b07652 // sqrdmulh z18.s, z18.s, z16.s\n"
+ "and z0.d, z9.d, z29.d\n"
+ ".inst 0x04b07694 // sqrdmulh z20.s, z20.s, z16.s\n"
+ "and z19.d, z7.d, z29.d\n"
+ ".inst 0x04b07421 // sqrdmulh z1.s, z1.s, z16.s\n"
+ "sqadd z23.s, z23.s, z31.s\n"
+ ".inst 0x448292d7 // srshl z23.s, p4/M, z23.s, z22.s\n"
+ "asr z3.s, z3.s, #0x1f\n"
+ "and z21.d, z18.d, z22.d\n"
+ "asr z0.s, z0.s, #0x1f\n"
+ "and z17.d, z20.d, z22.d\n"
+ "asr z19.s, z19.s, #0x1f\n"
+ "and z16.d, z1.d, z22.d\n"
+ "sqadd z6.s, z6.s, z3.s\n"
+ "asr z21.s, z21.s, #0x1f\n"
+ ".inst 0x448293a6 // srshl z6.s, p4/M, z6.s, z29.s\n"
+ "sqadd z9.s, z9.s, z0.s\n"
+ "asr z17.s, z17.s, #0x1f\n"
+ ".inst 0x448293a9 // srshl z9.s, p4/M, z9.s, z29.s\n"
+ "sqadd z7.s, z7.s, z19.s\n"
+ "asr z16.s, z16.s, #0x1f\n"
+ ".inst 0x448293a7 // srshl z7.s, p4/M, z7.s, z29.s\n"
+ "sqadd z18.s, z18.s, z21.s\n"
+ "sqadd z20.s, z20.s, z17.s\n"
+ ".inst 0x448292d2 // srshl z18.s, p4/M, z18.s, z22.s\n"
+ ".inst 0x448292d4 // srshl z20.s, p4/M, z20.s, z22.s\n"
+ "sqadd z1.s, z1.s, z16.s\n"
+ ".inst 0x453041ce // sqxtnb z14.h, z14.s\n"
+ ".inst 0x448292c1 // srshl z1.s, p4/M, z1.s, z22.s\n"
+ ".inst 0x453040c6 // sqxtnb z6.h, z6.s\n"
+ ".inst 0x45304129 // sqxtnb z9.h, z9.s\n"
+ ".inst 0x453040e7 // sqxtnb z7.h, z7.s\n"
+ ".inst 0x453046ee // sqxtnt z14.h, z23.s\n"
+ ".inst 0x45304646 // sqxtnt z6.h, z18.s\n"
+ ".inst 0x45304689 // sqxtnt z9.h, z20.s\n"
+ ".inst 0x45304427 // sqxtnt z7.h, z1.s\n"
+ "sqadd z14.h, z14.h, z15.h\n"
+ "smax z14.h, p4/M, z14.h, z12.h\n"
+ "smin z14.h, p4/M, z14.h, z13.h\n"
+ "sqadd z6.h, z6.h, z15.h\n"
+ "sqadd z9.h, z9.h, z15.h\n"
+ "smax z6.h, p4/M, z6.h, z12.h\n"
+ "smax z9.h, p4/M, z9.h, z12.h\n"
+ "sqadd z7.h, z7.h, z15.h\n"
+ "smax z7.h, p4/M, z7.h, z12.h\n"
+ "smin z6.h, p4/M, z6.h, z13.h\n"
+ "st1b { z14.h }, p0, [x5, x16]\n"
+ "smin z9.h, p4/M, z9.h, z13.h\n"
+ "smin z7.h, p4/M, z7.h, z13.h\n"
+ "st1b { z6.h }, p0, [x6, x16]\n"
+ "st1b { z9.h }, p0, [x7, x16]\n"
+ "st1b { z7.h }, p0, [x8, x16]\n"
+ "ld1w { z17.s }, p2/Z, [x21]\n"
+ "ld1w { z16.s }, p1/Z, [x21, #1, MUL VL]\n"
+ "uzp1 z14.s, z17.s, z16.s\n"
+ "ld1sb { z26.h }, p4/Z, [x4]\n"
+ "ld1sb { z8.h }, p4/Z, [x4, #1, MUL VL]\n"
+ "uzp2 z23.s, z17.s, z16.s\n"
+ "addvl x21, x21, #2\n"
+ "ld1sb { z16.h }, p4/Z, [x4, #2, MUL VL]\n"
+ "ld1sb { z21.h }, p4/Z, [x4, #3, MUL VL]\n"
+ "inch x16\n"
+ "str x21, [%x[params], %[offsetof_Params_bias]]\n"
+ "ld1sb { z17.h }, p4/Z, [x4, #4, MUL VL]\n"
+ "ldp x9, x28, [x17, #0x0]\n"
+ "mov z6.d, z14.d\n"
+ "mov z18.d, z23.d\n"
+ "ldp x27, x26, [x17, #0x10]\n"
+ "ldp x25, x24, [x17, #0x20]\n"
+ "mov z9.d, z14.d\n"
+ "mov z20.d, z23.d\n"
+ "ldp x23, x22, [x17, #0x30]\n"
+ "ldp x21, x20, [x17, #0x40]\n"
+ "mov z7.d, z14.d\n"
+ "mov z1.d, z23.d\n"
+ "ld1sb { z22.h }, p3/Z, [x9, x2]\n"
+ "ld1sb { z2.h }, p3/Z, [x28, x2]\n"
+ ".inst 0x454a135a // ssublb z26.h, z26.b, z10.b\n"
+ ".inst 0x454a1108 // ssublb z8.h, z8.b, z10.b\n"
+ "ld1sb { z11.h }, p3/Z, [x27, x2]\n"
+ "ld1sb { z3.h }, p3/Z, [x26, x2]\n"
+ ".inst 0x454a1210 // ssublb z16.h, z16.b, z10.b\n"
+ ".inst 0x454a12b5 // ssublb z21.h, z21.b, z10.b\n"
+ "ld1sb { z29.h }, p3/Z, [x25, x2]\n"
+ "ld1sb { z4.h }, p3/Z, [x24, x2]\n"
+ ".inst 0x454a1231 // ssublb z17.h, z17.b, z10.b\n"
+ ".inst 0x455e12d6 // ssublb z22.h, z22.b, z30.b\n"
+ "ld1sb { z31.h }, p3/Z, [x23, x2]\n"
+ "ld1sb { z0.h }, p3/Z, [x22, x2]\n"
+ ".inst 0x455e1042 // ssublb z2.h, z2.b, z30.b\n"
+ ".inst 0x455e116b // ssublb z11.h, z11.b, z30.b\n"
+ "ld1sb { z19.h }, p3/Z, [x21, x2]\n"
+ "ld1sb { z28.h }, p3/Z, [x20, x2]\n"
+ ".inst 0x455e1063 // ssublb z3.h, z3.b, z30.b\n"
+ ".inst 0x455e13bd // ssublb z29.h, z29.b, z30.b\n"
+ ".inst 0x455e1084 // ssublb z4.h, z4.b, z30.b\n"
+ ".inst 0x455e13ff // ssublb z31.h, z31.b, z30.b\n"
+ ".inst 0x455e1000 // ssublb z0.h, z0.b, z30.b\n"
+ ".inst 0x455e1273 // ssublb z19.h, z19.b, z30.b\n"
+ ".inst 0x455e139c // ssublb z28.h, z28.b, z30.b\n"
+ "b.any 1b\n"
+ :
+ : [offsetof_Params_bias] "I" (offsetof(Params, bias)), [offsetof_Params_inptrs] "I" (offsetof(Params, inptrs)), [offsetof_Params_n_channels] "I" (offsetof(Params, n_channels)), [offsetof_Params_outptrs] "I" (offsetof(Params, outptrs)), [offsetof_Params_requant] "I" (offsetof(Params, requant)), [offsetof_Params_requant_muls] "I" (offsetof(Params, requant_muls)), [offsetof_Params_requant_shifts] "I" (offsetof(Params, requant_shifts)), [offsetof_Params_weights] "I" (offsetof(Params, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [params] "r" (&params)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst.hpp
new file mode 100644
index 0000000000..abc09ee5a3
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst.hpp
@@ -0,0 +1,61 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst_impl(const int8_t *const *const, int8_t *const *const, const void *, unsigned int, const arm_gemm::Requantize32&);
+
+struct sve_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst : DepthfirstMultiplierStrategy<int8_t, int8_t, int8_t, int32_t>
+{
+ using Parent = DepthfirstMultiplierStrategy<int8_t, int8_t, int8_t, int32_t>;
+ constexpr static unsigned int kernel_rows = 3;
+ constexpr static unsigned int kernel_cols = 3;
+
+ constexpr static unsigned int stride_rows = 2;
+ constexpr static unsigned int stride_cols = 2;
+
+ sve_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst(const CPUInfo *)
+ : Parent(2, 4, kernel_rows, kernel_cols, stride_rows, stride_cols)
+ {
+ }
+
+ arm_gemm::VLType get_vl_type() const override { return arm_gemm::VLType::SVE; }
+
+ Parent::KernelType kernel = sve_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst_impl;
+ Parent::KernelType get_kernel(void) const override { return kernel; }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst/generic.cpp
new file mode 100644
index 0000000000..274b29dcfc
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst/generic.cpp
@@ -0,0 +1,339 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+
+#include "arm_gemm.hpp"
+#include <cstddef>
+#include <cstdint>
+
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst_impl(
+ const int8_t *const *const inptrs,
+ int8_t *const *const outptrs,
+ const void *params,
+ unsigned int n_output_channels,
+ const arm_gemm::Requantize32& qp
+)
+{
+ __asm__ __volatile__(
+ "mov x20, #0x9\n"
+ "whilelt p0.b, XZR, x20\n"
+ "ldr x23, [%x[inptrs], #0x8]\n"
+ "ldr x20, [%x[inptrs], #0x10]\n"
+ "ldr x22, [%x[inptrs], #0x20]\n"
+ "ldr x21, [%x[inptrs], #0x0]\n"
+ "mov z13.b, #0x1\n"
+ "lsr z13.s, z13.s, #0x8\n"
+ "ld1b { z1.b }, p0/Z, [x23]\n"
+ "ld1b { z2.b }, p0/Z, [x20]\n"
+ "mov z8.d, z1.d\n"
+ "mov z27.d, z1.d\n"
+ "ldr x20, [%x[inptrs], #0x18]\n"
+ "ld1b { z4.b }, p0/Z, [x22]\n"
+ "mov z31.d, z1.d\n"
+ "mov z28.d, z2.d\n"
+ "ld1b { z0.b }, p0/Z, [x21]\n"
+ "mov z30.d, z2.d\n"
+ "mov z26.d, z2.d\n"
+ "ld1b { z3.b }, p0/Z, [x20]\n"
+ "mov z22.d, z4.d\n"
+ "mov z10.d, z4.d\n"
+ "ptrue p2.b\n"
+ "ld1rw { z11.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_b_offset]]\n"
+ "mov z18.d, z4.d\n"
+ "ext z8.b, z8.b, z8.b, #0x2\n"
+ "lsl x10, %x[n_channels], #0x2\n"
+ "neg z11.s, p2/M, z11.s\n"
+ "ext z27.b, z27.b, z27.b, #0x4\n"
+ "ext z31.b, z31.b, z31.b, #0x6\n"
+ "mov x9, #0x0\n"
+ "whilelt p0.b, x9, x10\n"
+ "ext z28.b, z28.b, z28.b, #0x2\n"
+ "ext z30.b, z30.b, z30.b, #0x4\n"
+ "ld1w { z14.s }, p0/Z, [%x[params]]\n"
+ "mov x28, #0x0\n"
+ "ext z26.b, z26.b, z26.b, #0x6\n"
+ "ext z22.b, z22.b, z22.b, #0x2\n"
+ "ldp x27, x26, [%x[outptrs], #0x0]\n"
+ "ldp x25, x24, [%x[outptrs], #0x10]\n"
+ "ext z10.b, z10.b, z10.b, #0x4\n"
+ "ext z18.b, z18.b, z18.b, #0x6\n"
+ "ldp x23, x22, [%x[outptrs], #0x20]\n"
+ "ldp x21, x20, [%x[outptrs], #0x30]\n"
+ "mov z21.d, z0.d\n"
+ "mov z20.d, z0.d\n"
+ "ld1rw { z9.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_c_offset]]\n"
+ "ld1rw { z15.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_minval]]\n"
+ "mov z19.d, z0.d\n"
+ "mov z24.d, z3.d\n"
+ "ld1rw { z12.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_maxval]]\n"
+ "ld1b { z5.b }, p0/Z, [%x[params], #1, MUL VL]\n"
+ "mov z17.d, z3.d\n"
+ "mov z16.d, z3.d\n"
+ "ld1b { z6.b }, p0/Z, [%x[params], #2, MUL VL]\n"
+ "ld1b { z7.b }, p0/Z, [%x[params], #3, MUL VL]\n"
+ "ext z21.b, z21.b, z21.b, #0x2\n"
+ "ext z20.b, z20.b, z20.b, #0x4\n"
+ "addvl %x[params], %x[params], #4\n"
+ "ext z19.b, z19.b, z19.b, #0x6\n"
+ "zip1 z1.s, z1.s, z27.s\n"
+ "zip1 z8.s, z8.s, z31.s\n"
+ "zip1 z2.s, z2.s, z30.s\n"
+ "zip1 z28.s, z28.s, z26.s\n"
+ "ext z24.b, z24.b, z24.b, #0x2\n"
+ "ext z17.b, z17.b, z17.b, #0x4\n"
+ "ext z16.b, z16.b, z16.b, #0x6\n"
+ "zip1 z4.s, z4.s, z10.s\n"
+ "zip1 z22.s, z22.s, z18.s\n"
+ "zip1 z0.s, z0.s, z20.s\n"
+ "zip1 z21.s, z21.s, z19.s\n"
+ "zip1 z1.s, z1.s, z8.s\n"
+ "zip1 z2.s, z2.s, z28.s\n"
+ "zip1 z3.s, z3.s, z17.s\n"
+ "zip1 z24.s, z24.s, z16.s\n"
+ "zip1 z4.s, z4.s, z22.s\n"
+ "zip1 z0.s, z0.s, z21.s\n"
+ "mov z1.q, z1.q[0]\n"
+ "mov z2.q, z2.q[0]\n"
+ "zip1 z3.s, z3.s, z24.s\n"
+ "mov z4.q, z4.q[0]\n"
+ "mov z24.s, #0x0\n"
+ "mov z25.s, #0x0\n"
+ "sdot z24.s, z13.b, z1.b[0]\n"
+ "mov z23.s, #0x0\n"
+ "mov z22.s, #0x0\n"
+ "sdot z25.s, z13.b, z1.b[1]\n"
+ "mov z21.s, #0x0\n"
+ "mov z19.s, #0x0\n"
+ "sdot z23.s, z13.b, z1.b[2]\n"
+ "mov z10.s, #0x0\n"
+ "mov z8.s, #0x0\n"
+ "sdot z22.s, z13.b, z1.b[3]\n"
+ "mov z20.s, #0x0\n"
+ "mov z18.s, #0x0\n"
+ "sdot z21.s, z13.b, z2.b[0]\n"
+ "mov z17.s, #0x0\n"
+ "mov z16.s, #0x0\n"
+ "sdot z19.s, z13.b, z2.b[1]\n"
+ "sdot z10.s, z13.b, z2.b[2]\n"
+ "sdot z8.s, z13.b, z2.b[3]\n"
+ "mov z0.q, z0.q[0]\n"
+ "sdot z20.s, z13.b, z4.b[0]\n"
+ "sdot z18.s, z13.b, z4.b[1]\n"
+ "mov z3.q, z3.q[0]\n"
+ "sdot z17.s, z13.b, z4.b[2]\n"
+ "sdot z16.s, z13.b, z4.b[3]\n"
+ "mov z31.s, #0x0\n"
+ "mov z30.s, #0x0\n"
+ "mov z26.s, #0x0\n"
+ "sdot z31.s, z13.b, z0.b[0]\n"
+ "mov z27.s, #0x0\n"
+ "mov z28.s, #0x0\n"
+ "sdot z30.s, z13.b, z0.b[1]\n"
+ "mov z29.s, #0x0\n"
+ "sdot z26.s, z13.b, z0.b[2]\n"
+ "sdot z27.s, z13.b, z0.b[3]\n"
+ "sdot z28.s, z13.b, z3.b[0]\n"
+ "sdot z29.s, z13.b, z3.b[1]\n"
+ "add z24.s, z24.s, z21.s\n"
+ "add z25.s, z25.s, z19.s\n"
+ "add z23.s, z23.s, z10.s\n"
+ "add z22.s, z22.s, z8.s\n"
+ "add z21.s, z20.s, z21.s\n"
+ "mov z20.s, #0x0\n"
+ "sdot z20.s, z13.b, z3.b[2]\n"
+ "add z19.s, z18.s, z19.s\n"
+ "mov z18.s, #0x0\n"
+ "sdot z18.s, z13.b, z3.b[3]\n"
+ "add z17.s, z17.s, z10.s\n"
+ "add z16.s, z16.s, z8.s\n"
+ "add z24.s, z24.s, z31.s\n"
+ "add z25.s, z25.s, z30.s\n"
+ "mul z24.s, p2/M, z24.s, z11.s\n"
+ "mul z25.s, p2/M, z25.s, z11.s\n"
+ "add z26.s, z23.s, z26.s\n"
+ "add z27.s, z22.s, z27.s\n"
+ "mul z26.s, p2/M, z26.s, z11.s\n"
+ "mul z27.s, p2/M, z27.s, z11.s\n"
+ "add z28.s, z21.s, z28.s\n"
+ "add z29.s, z19.s, z29.s\n"
+ "mul z28.s, p2/M, z28.s, z11.s\n"
+ "mul z29.s, p2/M, z29.s, z11.s\n"
+ "add z30.s, z17.s, z20.s\n"
+ "add z31.s, z16.s, z18.s\n"
+ "mul z30.s, p2/M, z30.s, z11.s\n"
+ "mul z31.s, p2/M, z31.s, z11.s\n"
+ "zip1 z19.s, z24.s, z26.s\n"
+ "zip1 z18.s, z25.s, z27.s\n"
+ "zip1 z17.s, z28.s, z30.s\n"
+ "zip1 z16.s, z29.s, z31.s\n"
+ "zip1 z22.s, z19.s, z18.s\n"
+ "zip1 z23.s, z17.s, z16.s\n"
+ "add z24.s, z24.s, z14.s\n"
+ "add z25.s, z25.s, z14.s\n"
+ "add z26.s, z26.s, z14.s\n"
+ "add z27.s, z27.s, z14.s\n"
+ "add z28.s, z28.s, z14.s\n"
+ "add z29.s, z29.s, z14.s\n"
+ "add z30.s, z30.s, z14.s\n"
+ "add z31.s, z31.s, z14.s\n"
+ "1:" // Loop
+ "sdot z24.s, z5.b, z0.b[0]\n"
+ "sdot z25.s, z5.b, z0.b[1]\n"
+ "ld1w { z8.s }, p2/Z, [%x[params]]\n"
+ "ld1w { z21.s }, p2/Z, [%x[params], #1, MUL VL]\n"
+ "sdot z26.s, z5.b, z0.b[2]\n"
+ "sdot z27.s, z5.b, z0.b[3]\n"
+ "incb x9\n"
+ "whilelt p1.s, x28, %x[n_channels]\n"
+ "sdot z24.s, z6.b, z1.b[0]\n"
+ "sdot z25.s, z6.b, z1.b[1]\n"
+ "whilelt p0.b, x9, x10\n"
+ "ld1w { z20.s }, p0/Z, [%x[params], #2, MUL VL]\n"
+ "sdot z26.s, z6.b, z1.b[2]\n"
+ "sdot z27.s, z6.b, z1.b[3]\n"
+ "sdot z28.s, z5.b, z2.b[0]\n"
+ "sdot z29.s, z5.b, z2.b[1]\n"
+ "sdot z30.s, z5.b, z2.b[2]\n"
+ "sdot z31.s, z5.b, z2.b[3]\n"
+ "ld1b { z5.b }, p0/Z, [%x[params], #3, MUL VL]\n"
+ "sdot z24.s, z7.b, z2.b[0]\n"
+ "sdot z25.s, z7.b, z2.b[1]\n"
+ ".inst 0x04a87718 // sqrdmulh z24.s, z24.s, z8.s\n"
+ "sdot z26.s, z7.b, z2.b[2]\n"
+ "sdot z27.s, z7.b, z2.b[3]\n"
+ ".inst 0x04a87739 // sqrdmulh z25.s, z25.s, z8.s\n"
+ "sdot z28.s, z6.b, z3.b[0]\n"
+ "sdot z29.s, z6.b, z3.b[1]\n"
+ ".inst 0x04a8775a // sqrdmulh z26.s, z26.s, z8.s\n"
+ "sdot z30.s, z6.b, z3.b[2]\n"
+ "sdot z31.s, z6.b, z3.b[3]\n"
+ ".inst 0x04a8777b // sqrdmulh z27.s, z27.s, z8.s\n"
+ "ld1b { z6.b }, p0/Z, [%x[params], #4, MUL VL]\n"
+ "sdot z28.s, z7.b, z4.b[0]\n"
+ "sdot z29.s, z7.b, z4.b[1]\n"
+ "and z19.d, z24.d, z21.d\n"
+ "sdot z30.s, z7.b, z4.b[2]\n"
+ "sdot z31.s, z7.b, z4.b[3]\n"
+ "and z18.d, z25.d, z21.d\n"
+ "ld1b { z7.b }, p0/Z, [%x[params], #5, MUL VL]\n"
+ "and z17.d, z26.d, z21.d\n"
+ "and z16.d, z27.d, z21.d\n"
+ "addvl %x[params], %x[params], #6\n"
+ "asr z19.s, z19.s, #0x1f\n"
+ "asr z18.s, z18.s, #0x1f\n"
+ "asr z17.s, z17.s, #0x1f\n"
+ "asr z16.s, z16.s, #0x1f\n"
+ ".inst 0x04a8779c // sqrdmulh z28.s, z28.s, z8.s\n"
+ ".inst 0x04a877bd // sqrdmulh z29.s, z29.s, z8.s\n"
+ ".inst 0x04a877de // sqrdmulh z30.s, z30.s, z8.s\n"
+ ".inst 0x04a877ff // sqrdmulh z31.s, z31.s, z8.s\n"
+ "sqadd z24.s, z24.s, z19.s\n"
+ "sqadd z25.s, z25.s, z18.s\n"
+ ".inst 0x44828ab8 // srshl z24.s, p2/M, z24.s, z21.s\n"
+ ".inst 0x44828ab9 // srshl z25.s, p2/M, z25.s, z21.s\n"
+ "sqadd z26.s, z26.s, z17.s\n"
+ "sqadd z27.s, z27.s, z16.s\n"
+ ".inst 0x44828aba // srshl z26.s, p2/M, z26.s, z21.s\n"
+ ".inst 0x44828abb // srshl z27.s, p2/M, z27.s, z21.s\n"
+ "and z19.d, z28.d, z21.d\n"
+ "and z18.d, z29.d, z21.d\n"
+ "and z17.d, z30.d, z21.d\n"
+ "and z16.d, z31.d, z21.d\n"
+ "asr z19.s, z19.s, #0x1f\n"
+ "asr z18.s, z18.s, #0x1f\n"
+ "asr z17.s, z17.s, #0x1f\n"
+ "asr z16.s, z16.s, #0x1f\n"
+ "sqadd z28.s, z28.s, z19.s\n"
+ "sqadd z29.s, z29.s, z18.s\n"
+ ".inst 0x44828abc // srshl z28.s, p2/M, z28.s, z21.s\n"
+ ".inst 0x44828abd // srshl z29.s, p2/M, z29.s, z21.s\n"
+ "sqadd z30.s, z30.s, z17.s\n"
+ "sqadd z31.s, z31.s, z16.s\n"
+ ".inst 0x44828abe // srshl z30.s, p2/M, z30.s, z21.s\n"
+ ".inst 0x44828abf // srshl z31.s, p2/M, z31.s, z21.s\n"
+ "add z24.s, z24.s, z9.s\n"
+ "add z25.s, z25.s, z9.s\n"
+ "smin z24.s, p2/M, z24.s, z12.s\n"
+ "smin z25.s, p2/M, z25.s, z12.s\n"
+ "add z26.s, z26.s, z9.s\n"
+ "add z27.s, z27.s, z9.s\n"
+ "smin z26.s, p2/M, z26.s, z12.s\n"
+ "smin z27.s, p2/M, z27.s, z12.s\n"
+ "add z28.s, z28.s, z9.s\n"
+ "add z29.s, z29.s, z9.s\n"
+ "smin z28.s, p2/M, z28.s, z12.s\n"
+ "smin z29.s, p2/M, z29.s, z12.s\n"
+ "add z30.s, z30.s, z9.s\n"
+ "add z31.s, z31.s, z9.s\n"
+ "smin z30.s, p2/M, z30.s, z12.s\n"
+ "smin z31.s, p2/M, z31.s, z12.s\n"
+ "smax z24.s, p2/M, z24.s, z15.s\n"
+ "smax z25.s, p2/M, z25.s, z15.s\n"
+ "st1b { z24.s }, p1, [x27, x28]\n"
+ "mov z24.s, z22.s[0]\n"
+ "smax z26.s, p2/M, z26.s, z15.s\n"
+ "smax z27.s, p2/M, z27.s, z15.s\n"
+ "st1b { z25.s }, p1, [x26, x28]\n"
+ "mov z25.s, z22.s[1]\n"
+ "smax z28.s, p2/M, z28.s, z15.s\n"
+ "smax z29.s, p2/M, z29.s, z15.s\n"
+ "st1b { z26.s }, p1, [x25, x28]\n"
+ "mov z26.s, z22.s[2]\n"
+ "smax z30.s, p2/M, z30.s, z15.s\n"
+ "smax z31.s, p2/M, z31.s, z15.s\n"
+ "st1b { z27.s }, p1, [x24, x28]\n"
+ "mov z27.s, z22.s[3]\n"
+ "st1b { z28.s }, p1, [x23, x28]\n"
+ "mov z28.s, z23.s[0]\n"
+ "add z24.s, z24.s, z20.s\n"
+ "st1b { z29.s }, p1, [x22, x28]\n"
+ "mov z29.s, z23.s[1]\n"
+ "add z25.s, z25.s, z20.s\n"
+ "st1b { z30.s }, p1, [x21, x28]\n"
+ "mov z30.s, z23.s[2]\n"
+ "add z26.s, z26.s, z20.s\n"
+ "st1b { z31.s }, p1, [x20, x28]\n"
+ "mov z31.s, z23.s[3]\n"
+ "incw x28\n"
+ "add z27.s, z27.s, z20.s\n"
+ "add z28.s, z28.s, z20.s\n"
+ "add z29.s, z29.s, z20.s\n"
+ "add z30.s, z30.s, z20.s\n"
+ "add z31.s, z31.s, z20.s\n"
+ "b.any 1b\n"
+ : [params] "+&r" (params)
+ : [inptrs] "r" (inptrs), [n_channels] "r" (n_output_channels), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [outptrs] "r" (outptrs), [qp] "r" (&qp)
+ : "cc", "memory", "p0", "p1", "p2", "x9", "x10", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst.hpp
new file mode 100644
index 0000000000..701948f264
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst.hpp
@@ -0,0 +1,61 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst_impl(const int8_t *const *const, int8_t *const *const, const void *, unsigned int, const arm_gemm::Requantize32&);
+
+struct sve_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst : DepthfirstMultiplierStrategy<int8_t, int8_t, int8_t, int32_t>
+{
+ using Parent = DepthfirstMultiplierStrategy<int8_t, int8_t, int8_t, int32_t>;
+ constexpr static unsigned int kernel_rows = 5;
+ constexpr static unsigned int kernel_cols = 5;
+
+ constexpr static unsigned int stride_rows = 1;
+ constexpr static unsigned int stride_cols = 1;
+
+ sve_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst(const CPUInfo *)
+ : Parent(4, 2, kernel_rows, kernel_cols, stride_rows, stride_cols)
+ {
+ }
+
+ arm_gemm::VLType get_vl_type() const override { return arm_gemm::VLType::SVE; }
+
+ Parent::KernelType kernel = sve_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst_impl;
+ Parent::KernelType get_kernel(void) const override { return kernel; }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst/generic.cpp
new file mode 100644
index 0000000000..a3b2b429c0
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst/generic.cpp
@@ -0,0 +1,402 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+
+#include "arm_gemm.hpp"
+#include <cstddef>
+#include <cstdint>
+
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst_impl(
+ const int8_t *const *const inptrs,
+ int8_t *const *const outptrs,
+ const void *params,
+ unsigned int n_output_channels,
+ const arm_gemm::Requantize32& qp
+)
+{
+ __asm__ __volatile__(
+ "mov x20, #0x6\n"
+ "whilelt p0.b, XZR, x20\n"
+ "ldr x22, [%x[inptrs], #0x18]\n"
+ "ldr x21, [%x[inptrs], #0x20]\n"
+ "ldr x20, [%x[inptrs], #0x10]\n"
+ "ld1b { z3.b }, p0/Z, [x22]\n"
+ "mov z23.d, z3.d\n"
+ "ext z23.b, z23.b, z23.b, #0x1\n"
+ "ld1b { z4.b }, p0/Z, [x21]\n"
+ "ldr x24, [%x[inptrs], #0x8]\n"
+ "mov z18.d, z4.d\n"
+ "ext z18.b, z18.b, z18.b, #0x1\n"
+ "ld1b { z2.b }, p0/Z, [x20]\n"
+ "ldr x23, [%x[inptrs], #0x28]\n"
+ "mov z15.d, z2.d\n"
+ "ext z15.b, z15.b, z15.b, #0x1\n"
+ "ldr x22, [%x[inptrs], #0x30]\n"
+ "ldr x21, [%x[inptrs], #0x38]\n"
+ "zip1 z3.d, z3.d, z23.d\n"
+ "zip1 z4.d, z4.d, z18.d\n"
+ "ldr x20, [%x[inptrs], #0x0]\n"
+ "ld1b { z1.b }, p0/Z, [x24]\n"
+ "mov z19.d, z1.d\n"
+ "ext z19.b, z19.b, z19.b, #0x1\n"
+ "ld1b { z5.b }, p0/Z, [x23]\n"
+ "ld1b { z6.b }, p0/Z, [x22]\n"
+ "mov z18.d, z5.d\n"
+ "mov z22.d, z6.d\n"
+ "ld1b { z7.b }, p0/Z, [x21]\n"
+ "ld1b { z0.b }, p0/Z, [x20]\n"
+ "mov z8.d, z7.d\n"
+ "zip1 z2.d, z2.d, z15.d\n"
+ "mov z3.q, z3.q[0]\n"
+ "mov z4.q, z4.q[0]\n"
+ "ptrue p2.b\n"
+ "ld1rw { z23.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_b_offset]]\n"
+ "ext z18.b, z18.b, z18.b, #0x1\n"
+ "ext z22.b, z22.b, z22.b, #0x1\n"
+ "lsl x10, %x[n_channels], #0x2\n"
+ "neg z23.s, p2/M, z23.s\n"
+ "ext z8.b, z8.b, z8.b, #0x1\n"
+ "mov z28.b, #0x1\n"
+ "mov x9, #0x0\n"
+ "whilelt p0.b, x9, x10\n"
+ "mov z25.s, #0x0\n"
+ "mov z24.s, #0x0\n"
+ "sdot z25.s, z28.b, z3.b[0]\n"
+ "ld1w { z12.s }, p0/Z, [%x[params]]\n"
+ "mov z17.s, #0x0\n"
+ "mov z16.s, #0x0\n"
+ "sdot z24.s, z28.b, z3.b[2]\n"
+ "mov x28, #0x0\n"
+ "mov z27.d, z0.d\n"
+ "sdot z17.s, z28.b, z4.b[0]\n"
+ "sdot z16.s, z28.b, z4.b[2]\n"
+ "ldp x27, x26, [%x[outptrs], #0x0]\n"
+ "ext z27.b, z27.b, z27.b, #0x1\n"
+ "zip1 z1.d, z1.d, z19.d\n"
+ "ldp x25, x24, [%x[outptrs], #0x10]\n"
+ "ldp x23, x22, [%x[outptrs], #0x20]\n"
+ "mov z2.q, z2.q[0]\n"
+ "zip1 z5.d, z5.d, z18.d\n"
+ "ldp x21, x20, [%x[outptrs], #0x30]\n"
+ "ld1rw { z13.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_c_offset]]\n"
+ "zip1 z6.d, z6.d, z22.d\n"
+ "zip1 z7.d, z7.d, z8.d\n"
+ "ld1rw { z14.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_minval]]\n"
+ "ld1rw { z15.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_maxval]]\n"
+ "mov z30.s, #0x0\n"
+ "mov z31.s, #0x0\n"
+ "sdot z30.s, z28.b, z2.b[0]\n"
+ "ld1b { z8.b }, p0/Z, [%x[params], #1, MUL VL]\n"
+ "mov z29.s, #0x1\n"
+ "sdot z31.s, z28.b, z2.b[2]\n"
+ "sdot z25.s, z29.b, z3.b[1]\n"
+ "ld1b { z9.b }, p0/Z, [%x[params], #2, MUL VL]\n"
+ "zip1 z0.d, z0.d, z27.d\n"
+ "mov z1.q, z1.q[0]\n"
+ "sdot z24.s, z29.b, z3.b[3]\n"
+ "ld1b { z10.b }, p0/Z, [%x[params], #3, MUL VL]\n"
+ "mov z5.q, z5.q[0]\n"
+ "mov z6.q, z6.q[0]\n"
+ "sdot z17.s, z29.b, z4.b[1]\n"
+ "ld1b { z11.b }, p0/Z, [%x[params], #4, MUL VL]\n"
+ "mov z7.q, z7.q[0]\n"
+ "mov z22.s, #0x0\n"
+ "sdot z16.s, z29.b, z4.b[3]\n"
+ "addvl %x[params], %x[params], #5\n"
+ "mov z21.s, #0x0\n"
+ "mov z26.s, #0x0\n"
+ "sdot z22.s, z28.b, z1.b[0]\n"
+ "mov z27.s, #0x0\n"
+ "mov z20.s, #0x0\n"
+ "sdot z21.s, z28.b, z1.b[2]\n"
+ "mov z19.s, #0x0\n"
+ "mov z18.s, #0x0\n"
+ "sdot z26.s, z28.b, z5.b[0]\n"
+ "sdot z27.s, z28.b, z5.b[2]\n"
+ "sdot z20.s, z28.b, z6.b[0]\n"
+ "mov z0.q, z0.q[0]\n"
+ "sdot z19.s, z28.b, z6.b[2]\n"
+ "sdot z18.s, z28.b, z7.b[0]\n"
+ "add z17.s, z25.s, z17.s\n"
+ "mov z25.s, #0x0\n"
+ "sdot z25.s, z28.b, z7.b[2]\n"
+ "sdot z30.s, z29.b, z2.b[1]\n"
+ "sdot z31.s, z29.b, z2.b[3]\n"
+ "add z16.s, z24.s, z16.s\n"
+ "sdot z22.s, z29.b, z1.b[1]\n"
+ "mov z24.s, #0x0\n"
+ "sdot z24.s, z28.b, z0.b[0]\n"
+ "sdot z21.s, z29.b, z1.b[3]\n"
+ "sdot z26.s, z29.b, z5.b[1]\n"
+ "sdot z27.s, z29.b, z5.b[3]\n"
+ "add z30.s, z30.s, z17.s\n"
+ "sdot z20.s, z29.b, z6.b[1]\n"
+ "sdot z19.s, z29.b, z6.b[3]\n"
+ "add z31.s, z31.s, z16.s\n"
+ "sdot z18.s, z29.b, z7.b[1]\n"
+ "sdot z25.s, z29.b, z7.b[3]\n"
+ "add z22.s, z22.s, z30.s\n"
+ "sdot z24.s, z29.b, z0.b[1]\n"
+ "add z21.s, z21.s, z31.s\n"
+ "add z20.s, z26.s, z20.s\n"
+ "add z19.s, z27.s, z19.s\n"
+ "add z18.s, z18.s, z17.s\n"
+ "mov z17.s, #0x0\n"
+ "sdot z17.s, z28.b, z0.b[2]\n"
+ "sdot z17.s, z29.b, z0.b[3]\n"
+ "add z16.s, z25.s, z16.s\n"
+ "add z24.s, z22.s, z24.s\n"
+ "add z25.s, z21.s, z17.s\n"
+ "mul z24.s, p2/M, z24.s, z23.s\n"
+ "mul z25.s, p2/M, z25.s, z23.s\n"
+ "add z26.s, z26.s, z22.s\n"
+ "add z27.s, z27.s, z21.s\n"
+ "mul z26.s, p2/M, z26.s, z23.s\n"
+ "mul z27.s, p2/M, z27.s, z23.s\n"
+ "add z28.s, z20.s, z30.s\n"
+ "add z29.s, z19.s, z31.s\n"
+ "mul z28.s, p2/M, z28.s, z23.s\n"
+ "mul z29.s, p2/M, z29.s, z23.s\n"
+ "add z30.s, z20.s, z18.s\n"
+ "add z31.s, z19.s, z16.s\n"
+ "mul z30.s, p2/M, z30.s, z23.s\n"
+ "mul z31.s, p2/M, z31.s, z23.s\n"
+ "zip1 z19.s, z24.s, z26.s\n"
+ "zip1 z18.s, z25.s, z27.s\n"
+ "zip1 z17.s, z28.s, z30.s\n"
+ "zip1 z16.s, z29.s, z31.s\n"
+ "zip1 z22.s, z19.s, z18.s\n"
+ "zip1 z23.s, z17.s, z16.s\n"
+ "add z24.s, z24.s, z12.s\n"
+ "add z25.s, z25.s, z12.s\n"
+ "add z26.s, z26.s, z12.s\n"
+ "add z27.s, z27.s, z12.s\n"
+ "add z28.s, z28.s, z12.s\n"
+ "add z29.s, z29.s, z12.s\n"
+ "add z30.s, z30.s, z12.s\n"
+ "add z31.s, z31.s, z12.s\n"
+ "1:" // Loop
+ "sdot z24.s, z8.b, z0.b[0]\n"
+ "sdot z25.s, z8.b, z0.b[2]\n"
+ "ld1w { z12.s }, p2/Z, [%x[params], #6, MUL VL]\n"
+ "ld1w { z21.s }, p2/Z, [%x[params], #7, MUL VL]\n"
+ "sdot z26.s, z8.b, z1.b[0]\n"
+ "sdot z27.s, z8.b, z1.b[2]\n"
+ "incb x9\n"
+ "whilelt p1.s, x28, %x[n_channels]\n"
+ "sdot z24.s, z9.b, z0.b[1]\n"
+ "sdot z25.s, z9.b, z0.b[3]\n"
+ "whilelt p0.b, x9, x10\n"
+ "sdot z26.s, z9.b, z1.b[1]\n"
+ "sdot z27.s, z9.b, z1.b[3]\n"
+ "sdot z28.s, z8.b, z2.b[0]\n"
+ "sdot z29.s, z8.b, z2.b[2]\n"
+ "sdot z30.s, z8.b, z3.b[0]\n"
+ "sdot z31.s, z8.b, z3.b[2]\n"
+ "ld1b { z17.b }, p2/Z, [%x[params]]\n"
+ "sdot z24.s, z10.b, z1.b[0]\n"
+ "sdot z25.s, z10.b, z1.b[2]\n"
+ "sdot z26.s, z10.b, z2.b[0]\n"
+ "sdot z27.s, z10.b, z2.b[2]\n"
+ "sdot z28.s, z9.b, z2.b[1]\n"
+ "sdot z29.s, z9.b, z2.b[3]\n"
+ "sdot z30.s, z9.b, z3.b[1]\n"
+ "sdot z31.s, z9.b, z3.b[3]\n"
+ "ld1b { z16.b }, p2/Z, [%x[params], #1, MUL VL]\n"
+ "sdot z24.s, z11.b, z1.b[1]\n"
+ "sdot z25.s, z11.b, z1.b[3]\n"
+ "sdot z26.s, z11.b, z2.b[1]\n"
+ "sdot z27.s, z11.b, z2.b[3]\n"
+ "sdot z28.s, z10.b, z3.b[0]\n"
+ "sdot z29.s, z10.b, z3.b[2]\n"
+ "sdot z30.s, z10.b, z4.b[0]\n"
+ "sdot z31.s, z10.b, z4.b[2]\n"
+ "ld1b { z19.b }, p2/Z, [%x[params], #2, MUL VL]\n"
+ "sdot z24.s, z17.b, z2.b[0]\n"
+ "sdot z25.s, z17.b, z2.b[2]\n"
+ "sdot z26.s, z17.b, z3.b[0]\n"
+ "sdot z27.s, z17.b, z3.b[2]\n"
+ "sdot z28.s, z11.b, z3.b[1]\n"
+ "sdot z29.s, z11.b, z3.b[3]\n"
+ "sdot z30.s, z11.b, z4.b[1]\n"
+ "sdot z31.s, z11.b, z4.b[3]\n"
+ "ld1b { z18.b }, p2/Z, [%x[params], #3, MUL VL]\n"
+ "sdot z24.s, z16.b, z2.b[1]\n"
+ "sdot z25.s, z16.b, z2.b[3]\n"
+ "sdot z26.s, z16.b, z3.b[1]\n"
+ "sdot z27.s, z16.b, z3.b[3]\n"
+ "sdot z28.s, z17.b, z4.b[0]\n"
+ "sdot z29.s, z17.b, z4.b[2]\n"
+ "sdot z30.s, z17.b, z5.b[0]\n"
+ "sdot z31.s, z17.b, z5.b[2]\n"
+ "ld1b { z17.b }, p2/Z, [%x[params], #4, MUL VL]\n"
+ "sdot z24.s, z19.b, z3.b[0]\n"
+ "sdot z25.s, z19.b, z3.b[2]\n"
+ "sdot z26.s, z19.b, z4.b[0]\n"
+ "sdot z27.s, z19.b, z4.b[2]\n"
+ "sdot z28.s, z16.b, z4.b[1]\n"
+ "sdot z29.s, z16.b, z4.b[3]\n"
+ "sdot z30.s, z16.b, z5.b[1]\n"
+ "sdot z31.s, z16.b, z5.b[3]\n"
+ "ld1b { z16.b }, p2/Z, [%x[params], #5, MUL VL]\n"
+ "addvl %x[params], %x[params], #16\n"
+ "sdot z24.s, z18.b, z3.b[1]\n"
+ "sdot z25.s, z18.b, z3.b[3]\n"
+ "ld1w { z20.s }, p0/Z, [%x[params], #-8, MUL VL]\n"
+ "sdot z26.s, z18.b, z4.b[1]\n"
+ "sdot z27.s, z18.b, z4.b[3]\n"
+ "sdot z28.s, z19.b, z5.b[0]\n"
+ "sdot z29.s, z19.b, z5.b[2]\n"
+ "sdot z30.s, z19.b, z6.b[0]\n"
+ "sdot z31.s, z19.b, z6.b[2]\n"
+ "ld1b { z10.b }, p0/Z, [%x[params], #-5, MUL VL]\n"
+ "sdot z24.s, z17.b, z4.b[0]\n"
+ "sdot z25.s, z17.b, z4.b[2]\n"
+ "sdot z26.s, z17.b, z5.b[0]\n"
+ "sdot z27.s, z17.b, z5.b[2]\n"
+ "sdot z28.s, z18.b, z5.b[1]\n"
+ "sdot z29.s, z18.b, z5.b[3]\n"
+ "sdot z30.s, z18.b, z6.b[1]\n"
+ "sdot z31.s, z18.b, z6.b[3]\n"
+ "ld1b { z11.b }, p0/Z, [%x[params], #-4, MUL VL]\n"
+ "sdot z24.s, z16.b, z4.b[1]\n"
+ "sdot z25.s, z16.b, z4.b[3]\n"
+ ".inst 0x04ac7718 // sqrdmulh z24.s, z24.s, z12.s\n"
+ "sdot z26.s, z16.b, z5.b[1]\n"
+ "sdot z27.s, z16.b, z5.b[3]\n"
+ ".inst 0x04ac7739 // sqrdmulh z25.s, z25.s, z12.s\n"
+ "sdot z28.s, z17.b, z6.b[0]\n"
+ "sdot z29.s, z17.b, z6.b[2]\n"
+ ".inst 0x04ac775a // sqrdmulh z26.s, z26.s, z12.s\n"
+ "sdot z30.s, z17.b, z7.b[0]\n"
+ "sdot z31.s, z17.b, z7.b[2]\n"
+ ".inst 0x04ac777b // sqrdmulh z27.s, z27.s, z12.s\n"
+ "ld1b { z8.b }, p0/Z, [%x[params], #-7, MUL VL]\n"
+ "sdot z28.s, z16.b, z6.b[1]\n"
+ "sdot z29.s, z16.b, z6.b[3]\n"
+ "and z19.d, z24.d, z21.d\n"
+ "sdot z30.s, z16.b, z7.b[1]\n"
+ "sdot z31.s, z16.b, z7.b[3]\n"
+ "and z18.d, z25.d, z21.d\n"
+ "ld1b { z9.b }, p0/Z, [%x[params], #-6, MUL VL]\n"
+ "and z17.d, z26.d, z21.d\n"
+ "and z16.d, z27.d, z21.d\n"
+ "addvl %x[params], %x[params], #-3\n"
+ "asr z19.s, z19.s, #0x1f\n"
+ "asr z18.s, z18.s, #0x1f\n"
+ "asr z17.s, z17.s, #0x1f\n"
+ "asr z16.s, z16.s, #0x1f\n"
+ ".inst 0x04ac779c // sqrdmulh z28.s, z28.s, z12.s\n"
+ ".inst 0x04ac77bd // sqrdmulh z29.s, z29.s, z12.s\n"
+ ".inst 0x04ac77de // sqrdmulh z30.s, z30.s, z12.s\n"
+ ".inst 0x04ac77ff // sqrdmulh z31.s, z31.s, z12.s\n"
+ "sqadd z24.s, z24.s, z19.s\n"
+ "sqadd z25.s, z25.s, z18.s\n"
+ ".inst 0x44828ab8 // srshl z24.s, p2/M, z24.s, z21.s\n"
+ ".inst 0x44828ab9 // srshl z25.s, p2/M, z25.s, z21.s\n"
+ "sqadd z26.s, z26.s, z17.s\n"
+ "sqadd z27.s, z27.s, z16.s\n"
+ ".inst 0x44828aba // srshl z26.s, p2/M, z26.s, z21.s\n"
+ ".inst 0x44828abb // srshl z27.s, p2/M, z27.s, z21.s\n"
+ "and z19.d, z28.d, z21.d\n"
+ "and z18.d, z29.d, z21.d\n"
+ "and z17.d, z30.d, z21.d\n"
+ "and z16.d, z31.d, z21.d\n"
+ "asr z19.s, z19.s, #0x1f\n"
+ "asr z18.s, z18.s, #0x1f\n"
+ "asr z17.s, z17.s, #0x1f\n"
+ "asr z16.s, z16.s, #0x1f\n"
+ "sqadd z28.s, z28.s, z19.s\n"
+ "sqadd z29.s, z29.s, z18.s\n"
+ ".inst 0x44828abc // srshl z28.s, p2/M, z28.s, z21.s\n"
+ ".inst 0x44828abd // srshl z29.s, p2/M, z29.s, z21.s\n"
+ "sqadd z30.s, z30.s, z17.s\n"
+ "sqadd z31.s, z31.s, z16.s\n"
+ ".inst 0x44828abe // srshl z30.s, p2/M, z30.s, z21.s\n"
+ ".inst 0x44828abf // srshl z31.s, p2/M, z31.s, z21.s\n"
+ "add z24.s, z24.s, z13.s\n"
+ "add z25.s, z25.s, z13.s\n"
+ "smin z24.s, p2/M, z24.s, z15.s\n"
+ "smin z25.s, p2/M, z25.s, z15.s\n"
+ "add z26.s, z26.s, z13.s\n"
+ "add z27.s, z27.s, z13.s\n"
+ "smin z26.s, p2/M, z26.s, z15.s\n"
+ "smin z27.s, p2/M, z27.s, z15.s\n"
+ "add z28.s, z28.s, z13.s\n"
+ "add z29.s, z29.s, z13.s\n"
+ "smin z28.s, p2/M, z28.s, z15.s\n"
+ "smin z29.s, p2/M, z29.s, z15.s\n"
+ "add z30.s, z30.s, z13.s\n"
+ "add z31.s, z31.s, z13.s\n"
+ "smin z30.s, p2/M, z30.s, z15.s\n"
+ "smin z31.s, p2/M, z31.s, z15.s\n"
+ "smax z24.s, p2/M, z24.s, z14.s\n"
+ "smax z25.s, p2/M, z25.s, z14.s\n"
+ "st1b { z24.s }, p1, [x27, x28]\n"
+ "mov z24.s, z22.s[0]\n"
+ "smax z26.s, p2/M, z26.s, z14.s\n"
+ "smax z27.s, p2/M, z27.s, z14.s\n"
+ "st1b { z25.s }, p1, [x26, x28]\n"
+ "mov z25.s, z22.s[1]\n"
+ "smax z28.s, p2/M, z28.s, z14.s\n"
+ "smax z29.s, p2/M, z29.s, z14.s\n"
+ "st1b { z26.s }, p1, [x25, x28]\n"
+ "mov z26.s, z22.s[2]\n"
+ "smax z30.s, p2/M, z30.s, z14.s\n"
+ "smax z31.s, p2/M, z31.s, z14.s\n"
+ "st1b { z27.s }, p1, [x24, x28]\n"
+ "mov z27.s, z22.s[3]\n"
+ "st1b { z28.s }, p1, [x23, x28]\n"
+ "mov z28.s, z23.s[0]\n"
+ "add z24.s, z24.s, z20.s\n"
+ "st1b { z29.s }, p1, [x22, x28]\n"
+ "mov z29.s, z23.s[1]\n"
+ "add z25.s, z25.s, z20.s\n"
+ "st1b { z30.s }, p1, [x21, x28]\n"
+ "mov z30.s, z23.s[2]\n"
+ "add z26.s, z26.s, z20.s\n"
+ "st1b { z31.s }, p1, [x20, x28]\n"
+ "mov z31.s, z23.s[3]\n"
+ "incw x28\n"
+ "add z27.s, z27.s, z20.s\n"
+ "add z28.s, z28.s, z20.s\n"
+ "add z29.s, z29.s, z20.s\n"
+ "add z30.s, z30.s, z20.s\n"
+ "add z31.s, z31.s, z20.s\n"
+ "b.any 1b\n"
+ : [params] "+&r" (params)
+ : [inptrs] "r" (inptrs), [n_channels] "r" (n_output_channels), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [outptrs] "r" (outptrs), [qp] "r" (&qp)
+ : "cc", "memory", "p0", "p1", "p2", "x9", "x10", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp
new file mode 100644
index 0000000000..6799b10ed9
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp
@@ -0,0 +1,76 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "utils.hpp"
+#include "src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst_impl(unsigned int, const int8_t *const *, const int8_t *, const int32_t *, const arm_gemm::Requantize32&, const int32_t *, const int32_t *, int8_t *const *);
+
+class sve_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst : public DepthwiseDepthfirstStrategy<int8_t, int8_t, int8_t, int32_t>
+{
+ using Parent = DepthwiseDepthfirstStrategy<int8_t, int8_t, int8_t, int32_t>;
+
+ public:
+ constexpr static unsigned int kernel_rows = 3;
+ constexpr static unsigned int kernel_cols = 3;
+
+ constexpr static unsigned int stride_rows = 1;
+ constexpr static unsigned int stride_cols = 1;
+
+ sve_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst(const CPUInfo *) : Parent(2, 2, 3, 3, 1, 1) {}
+
+ arm_gemm::VLType get_vl_type(void) const override { return arm_gemm::VLType::SVE; }
+
+ Parent::KernelType kernel = sve_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst_impl;
+ Parent::KernelType get_kernel(void) const override { return kernel; }
+ size_t get_storage_size(const DepthwiseArgs &args) const override
+ {
+ return interleave_sve_s8q_3x3_dot::get_packed_size(args);
+ }
+
+ void pack_parameters(
+ const DepthwiseArgs &args, void *buffer, const void *biases, const arm_gemm::Requantize32 &qp,
+ const void *weights, size_t ld_weight_col, size_t ld_weight_row
+ ) const override
+ {
+ interleave_sve_s8q_3x3_dot::pack_parameters(
+ args.input_channels * args.channel_multiplier, buffer, reinterpret_cast<const int32_t *>(biases),
+ reinterpret_cast<const int8_t *>(weights), qp, ld_weight_col, ld_weight_row
+ );
+ }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp
new file mode 100644
index 0000000000..d9c8644fc4
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp
@@ -0,0 +1,436 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+
+#include "arm_gemm.hpp"
+#include <cstdint>
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst_impl(const unsigned int n_channels, const int8_t *const *const inptrs, const int8_t *params, const int32_t *, const arm_gemm::Requantize32& qp, const int32_t *, const int32_t *, int8_t *const *const outptrs)
+{
+ __asm__ __volatile__(
+ "mov x13, #0x0\n"
+ "whilelt p0.b, x13, %x[n_channels]\n"
+ "ldp x27, x26, [%x[inptrs], #0x0]\n"
+ "ldp x25, x24, [%x[inptrs], #0x10]\n"
+ "ldp x23, x22, [%x[inptrs], #0x20]\n"
+ "ldp x21, x20, [%x[inptrs], #0x30]\n"
+ "ptrue p2.b\n"
+ "mov x12, #0x0\n"
+ "ldp x11, x10, [%x[outptrs], #0x0]\n"
+ "ldp x9, x28, [%x[outptrs], #0x10]\n"
+ "ld1b { z15.b }, p0/Z, [x27, x13]\n"
+ "ld1b { z18.b }, p0/Z, [x26, x13]\n"
+ "ldp x27, x26, [%x[inptrs], #0x40]\n"
+ "ld1b { z16.b }, p0/Z, [x25, x13]\n"
+ "zip2 z17.b, z15.b, z16.b\n"
+ "zip1 z15.b, z15.b, z16.b\n"
+ "ld1b { z14.b }, p0/Z, [x24, x13]\n"
+ "ldp x25, x24, [%x[inptrs], #0x50]\n"
+ "zip1 z16.b, z18.b, z14.b\n"
+ "zip2 z14.b, z18.b, z14.b\n"
+ "ld1b { z13.b }, p0/Z, [x23, x13]\n"
+ "ld1b { z18.b }, p0/Z, [x22, x13]\n"
+ "zip2 z12.b, z15.b, z16.b\n"
+ "zip1 z15.b, z15.b, z16.b\n"
+ "ldp x23, x22, [%x[inptrs], #0x60]\n"
+ "ld1b { z16.b }, p0/Z, [x21, x13]\n"
+ "zip1 z11.b, z17.b, z14.b\n"
+ "zip2 z14.b, z17.b, z14.b\n"
+ "ld1b { z10.b }, p0/Z, [x20, x13]\n"
+ "ldp x21, x20, [%x[inptrs], #0x70]\n"
+ "zip2 z22.b, z13.b, z16.b\n"
+ "zip1 z13.b, z13.b, z16.b\n"
+ "ld1b { z9.b }, p0/Z, [x27, x13]\n"
+ "ld1b { z17.b }, p0/Z, [x26, x13]\n"
+ "zip1 z21.b, z18.b, z10.b\n"
+ "zip2 z10.b, z18.b, z10.b\n"
+ "ld1b { z16.b }, p0/Z, [x25, x13]\n"
+ "ld1b { z8.b }, p0/Z, [x24, x13]\n"
+ "zip2 z20.b, z9.b, z16.b\n"
+ "zip1 z9.b, z9.b, z16.b\n"
+ "ld1b { z7.b }, p0/Z, [x23, x13]\n"
+ "ld1b { z19.b }, p0/Z, [x22, x13]\n"
+ "zip1 z18.b, z17.b, z8.b\n"
+ "zip2 z8.b, z17.b, z8.b\n"
+ "ld1b { z16.b }, p0/Z, [x21, x13]\n"
+ "ld1b { z6.b }, p0/Z, [x20, x13]\n"
+ "zip2 z17.b, z7.b, z16.b\n"
+ "zip1 z7.b, z7.b, z16.b\n"
+ "zip1 z16.b, z19.b, z6.b\n"
+ "zip2 z6.b, z19.b, z6.b\n"
+ "ld1w { z5.s }, p2/Z, [%x[params]]\n"
+ "ld1rw { z4.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_minval]]\n"
+ "ld1rw { z3.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_maxval]]\n"
+ "ld1rw { z2.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_c_offset]]\n"
+ "zip2 z1.b, z13.b, z21.b\n"
+ "zip1 z13.b, z13.b, z21.b\n"
+ "ldp x27, x26, [%x[inptrs], #0x0]\n"
+ "ldp x25, x23, [%x[inptrs], #0x10]\n"
+ "zip1 z0.b, z22.b, z10.b\n"
+ "zip2 z10.b, z22.b, z10.b\n"
+ "ldp x24, x22, [%x[inptrs], #0x20]\n"
+ "ldp x21, x20, [%x[inptrs], #0x30]\n"
+ "zip2 z31.b, z9.b, z18.b\n"
+ "zip1 z9.b, z9.b, z18.b\n"
+ "zip1 z30.b, z20.b, z8.b\n"
+ "zip2 z8.b, z20.b, z8.b\n"
+ "ld1b { z29.b }, p2/Z, [%x[params], #1, MUL VL]\n"
+ "ld1b { z28.b }, p2/Z, [%x[params], #2, MUL VL]\n"
+ "zip2 z27.b, z7.b, z16.b\n"
+ "zip1 z7.b, z7.b, z16.b\n"
+ "ld1b { z26.b }, p2/Z, [%x[params], #3, MUL VL]\n"
+ "addvl %x[params], %x[params], #4\n"
+ "zip1 z25.b, z17.b, z6.b\n"
+ "zip2 z6.b, z17.b, z6.b\n"
+ "mov z24.d, z5.d\n"
+ "mov z22.d, z5.d\n"
+ "mov z21.d, z5.d\n"
+ "1:" // Loop
+ "sdot z5.s, z29.b, z15.b\n"
+ "sdot z22.s, z29.b, z13.b\n"
+ "ext z15.b, z15.b, z15.b, #0x1\n"
+ "whilelt p0.s, x12, %x[n_channels]\n"
+ "sdot z5.s, z28.b, z13.b\n"
+ "ext z13.b, z13.b, z13.b, #0x1\n"
+ "sdot z24.s, z29.b, z15.b\n"
+ "ld1w { z17.s }, p2/Z, [%x[params]]\n"
+ "sdot z21.s, z29.b, z13.b\n"
+ "sdot z22.s, z28.b, z9.b\n"
+ "incw x13, ALL, MUL #4\n"
+ "sdot z5.s, z26.b, z9.b\n"
+ "ext z9.b, z9.b, z9.b, #0x1\n"
+ "sdot z24.s, z28.b, z13.b\n"
+ "ld1w { z20.s }, p2/Z, [%x[params], #1, MUL VL]\n"
+ "sdot z21.s, z28.b, z9.b\n"
+ "sdot z22.s, z26.b, z7.b\n"
+ "ext z7.b, z7.b, z7.b, #0x1\n"
+ ".inst 0x04b174a5 // sqrdmulh z5.s, z5.s, z17.s\n"
+ "sdot z24.s, z26.b, z9.b\n"
+ "sdot z21.s, z26.b, z7.b\n"
+ "and z16.d, z5.d, z20.d\n"
+ "asr z16.s, z16.s, #0x1f\n"
+ ".inst 0x04b17718 // sqrdmulh z24.s, z24.s, z17.s\n"
+ ".inst 0x04b176d6 // sqrdmulh z22.s, z22.s, z17.s\n"
+ ".inst 0x04b176b5 // sqrdmulh z21.s, z21.s, z17.s\n"
+ "sqadd z5.s, z5.s, z16.s\n"
+ ".inst 0x44828a85 // srshl z5.s, p2/M, z5.s, z20.s\n"
+ "ld1w { z19.s }, p2/Z, [%x[params], #6, MUL VL]\n"
+ "and z18.d, z24.d, z20.d\n"
+ "and z17.d, z22.d, z20.d\n"
+ "and z16.d, z21.d, z20.d\n"
+ "asr z18.s, z18.s, #0x1f\n"
+ "asr z17.s, z17.s, #0x1f\n"
+ "asr z16.s, z16.s, #0x1f\n"
+ "sqadd z24.s, z24.s, z18.s\n"
+ "sqadd z22.s, z22.s, z17.s\n"
+ ".inst 0x44828a98 // srshl z24.s, p2/M, z24.s, z20.s\n"
+ ".inst 0x44828a96 // srshl z22.s, p2/M, z22.s, z20.s\n"
+ "sqadd z21.s, z21.s, z16.s\n"
+ "add z5.s, z5.s, z2.s\n"
+ ".inst 0x44828a95 // srshl z21.s, p2/M, z21.s, z20.s\n"
+ "smax z5.s, p2/M, z5.s, z4.s\n"
+ "add z24.s, z24.s, z2.s\n"
+ "add z22.s, z22.s, z2.s\n"
+ "smin z5.s, p2/M, z5.s, z3.s\n"
+ "smax z24.s, p2/M, z24.s, z4.s\n"
+ "add z21.s, z21.s, z2.s\n"
+ "smax z22.s, p2/M, z22.s, z4.s\n"
+ "smax z21.s, p2/M, z21.s, z4.s\n"
+ "st1b { z5.s }, p0, [x11, x12]\n"
+ "ld1w { z23.s }, p2/Z, [%x[params], #2, MUL VL]\n"
+ "ld1b { z18.b }, p2/Z, [%x[params], #3, MUL VL]\n"
+ "smin z24.s, p2/M, z24.s, z3.s\n"
+ "smin z22.s, p2/M, z22.s, z3.s\n"
+ "smin z21.s, p2/M, z21.s, z3.s\n"
+ "st1b { z24.s }, p0, [x10, x12]\n"
+ "mov z24.d, z23.d\n"
+ "ld1b { z17.b }, p2/Z, [%x[params], #4, MUL VL]\n"
+ "st1b { z22.s }, p0, [x9, x12]\n"
+ "mov z22.d, z23.d\n"
+ "sdot z22.s, z18.b, z1.b\n"
+ "ld1b { z16.b }, p2/Z, [%x[params], #5, MUL VL]\n"
+ "st1b { z21.s }, p0, [x28, x12]\n"
+ "mov z21.d, z23.d\n"
+ "sdot z23.s, z18.b, z12.b\n"
+ "sdot z23.s, z17.b, z1.b\n"
+ "ext z12.b, z12.b, z12.b, #0x1\n"
+ "ext z1.b, z1.b, z1.b, #0x1\n"
+ "sdot z24.s, z18.b, z12.b\n"
+ "ld1w { z20.s }, p2/Z, [%x[params], #7, MUL VL]\n"
+ "sdot z21.s, z18.b, z1.b\n"
+ "sdot z22.s, z17.b, z31.b\n"
+ "incw x12\n"
+ "whilelt p0.s, x12, %x[n_channels]\n"
+ "sdot z23.s, z16.b, z31.b\n"
+ "ext z31.b, z31.b, z31.b, #0x1\n"
+ "sdot z24.s, z17.b, z1.b\n"
+ "addvl %x[params], %x[params], #16\n"
+ "sdot z21.s, z17.b, z31.b\n"
+ "sdot z22.s, z16.b, z27.b\n"
+ "ext z27.b, z27.b, z27.b, #0x1\n"
+ ".inst 0x04b376f7 // sqrdmulh z23.s, z23.s, z19.s\n"
+ "sdot z24.s, z16.b, z31.b\n"
+ "sdot z21.s, z16.b, z27.b\n"
+ "and z16.d, z23.d, z20.d\n"
+ "asr z16.s, z16.s, #0x1f\n"
+ ".inst 0x04b37718 // sqrdmulh z24.s, z24.s, z19.s\n"
+ ".inst 0x04b376d6 // sqrdmulh z22.s, z22.s, z19.s\n"
+ ".inst 0x04b376b5 // sqrdmulh z21.s, z21.s, z19.s\n"
+ "sqadd z23.s, z23.s, z16.s\n"
+ ".inst 0x44828a97 // srshl z23.s, p2/M, z23.s, z20.s\n"
+ "ld1w { z19.s }, p2/Z, [%x[params], #-4, MUL VL]\n"
+ "and z18.d, z24.d, z20.d\n"
+ "and z17.d, z22.d, z20.d\n"
+ "and z16.d, z21.d, z20.d\n"
+ "asr z18.s, z18.s, #0x1f\n"
+ "asr z17.s, z17.s, #0x1f\n"
+ "asr z16.s, z16.s, #0x1f\n"
+ "sqadd z24.s, z24.s, z18.s\n"
+ "sqadd z22.s, z22.s, z17.s\n"
+ ".inst 0x44828a98 // srshl z24.s, p2/M, z24.s, z20.s\n"
+ ".inst 0x44828a96 // srshl z22.s, p2/M, z22.s, z20.s\n"
+ "sqadd z21.s, z21.s, z16.s\n"
+ "add z23.s, z23.s, z2.s\n"
+ ".inst 0x44828a95 // srshl z21.s, p2/M, z21.s, z20.s\n"
+ "smax z23.s, p2/M, z23.s, z4.s\n"
+ "add z24.s, z24.s, z2.s\n"
+ "add z22.s, z22.s, z2.s\n"
+ "smin z23.s, p2/M, z23.s, z3.s\n"
+ "smax z24.s, p2/M, z24.s, z4.s\n"
+ "add z21.s, z21.s, z2.s\n"
+ "smax z22.s, p2/M, z22.s, z4.s\n"
+ "smax z21.s, p2/M, z21.s, z4.s\n"
+ "st1b { z23.s }, p0, [x11, x12]\n"
+ "ld1w { z23.s }, p2/Z, [%x[params], #-8, MUL VL]\n"
+ "ld1b { z18.b }, p2/Z, [%x[params], #-7, MUL VL]\n"
+ "smin z24.s, p2/M, z24.s, z3.s\n"
+ "smin z22.s, p2/M, z22.s, z3.s\n"
+ "smin z21.s, p2/M, z21.s, z3.s\n"
+ "st1b { z24.s }, p0, [x10, x12]\n"
+ "mov z24.d, z23.d\n"
+ "ld1b { z17.b }, p2/Z, [%x[params], #-6, MUL VL]\n"
+ "st1b { z22.s }, p0, [x9, x12]\n"
+ "mov z22.d, z23.d\n"
+ "sdot z22.s, z18.b, z0.b\n"
+ "ld1b { z16.b }, p2/Z, [%x[params], #-5, MUL VL]\n"
+ "st1b { z21.s }, p0, [x28, x12]\n"
+ "mov z21.d, z23.d\n"
+ "sdot z23.s, z18.b, z11.b\n"
+ "sdot z23.s, z17.b, z0.b\n"
+ "ext z11.b, z11.b, z11.b, #0x1\n"
+ "ext z0.b, z0.b, z0.b, #0x1\n"
+ "sdot z24.s, z18.b, z11.b\n"
+ "ld1w { z20.s }, p2/Z, [%x[params], #-3, MUL VL]\n"
+ "sdot z21.s, z18.b, z0.b\n"
+ "sdot z22.s, z17.b, z30.b\n"
+ "incw x12\n"
+ "whilelt p0.s, x12, %x[n_channels]\n"
+ "sdot z23.s, z16.b, z30.b\n"
+ "ext z30.b, z30.b, z30.b, #0x1\n"
+ "sdot z24.s, z17.b, z0.b\n"
+ "sdot z21.s, z17.b, z30.b\n"
+ "sdot z22.s, z16.b, z25.b\n"
+ "ext z25.b, z25.b, z25.b, #0x1\n"
+ ".inst 0x04b376f7 // sqrdmulh z23.s, z23.s, z19.s\n"
+ "sdot z24.s, z16.b, z30.b\n"
+ "sdot z21.s, z16.b, z25.b\n"
+ "and z16.d, z23.d, z20.d\n"
+ "asr z16.s, z16.s, #0x1f\n"
+ ".inst 0x04b37718 // sqrdmulh z24.s, z24.s, z19.s\n"
+ ".inst 0x04b376d6 // sqrdmulh z22.s, z22.s, z19.s\n"
+ ".inst 0x04b376b5 // sqrdmulh z21.s, z21.s, z19.s\n"
+ "sqadd z23.s, z23.s, z16.s\n"
+ ".inst 0x44828a97 // srshl z23.s, p2/M, z23.s, z20.s\n"
+ "ld1w { z19.s }, p2/Z, [%x[params], #2, MUL VL]\n"
+ "and z18.d, z24.d, z20.d\n"
+ "and z17.d, z22.d, z20.d\n"
+ "and z16.d, z21.d, z20.d\n"
+ "asr z18.s, z18.s, #0x1f\n"
+ "asr z17.s, z17.s, #0x1f\n"
+ "asr z16.s, z16.s, #0x1f\n"
+ "sqadd z24.s, z24.s, z18.s\n"
+ "sqadd z22.s, z22.s, z17.s\n"
+ ".inst 0x44828a98 // srshl z24.s, p2/M, z24.s, z20.s\n"
+ ".inst 0x44828a96 // srshl z22.s, p2/M, z22.s, z20.s\n"
+ "sqadd z21.s, z21.s, z16.s\n"
+ "add z23.s, z23.s, z2.s\n"
+ ".inst 0x44828a95 // srshl z21.s, p2/M, z21.s, z20.s\n"
+ "smax z23.s, p2/M, z23.s, z4.s\n"
+ "add z24.s, z24.s, z2.s\n"
+ "add z22.s, z22.s, z2.s\n"
+ "smin z23.s, p2/M, z23.s, z3.s\n"
+ "smax z24.s, p2/M, z24.s, z4.s\n"
+ "add z21.s, z21.s, z2.s\n"
+ "smax z22.s, p2/M, z22.s, z4.s\n"
+ "smax z21.s, p2/M, z21.s, z4.s\n"
+ "st1b { z23.s }, p0, [x11, x12]\n"
+ "ld1w { z23.s }, p2/Z, [%x[params], #-2, MUL VL]\n"
+ "ld1b { z18.b }, p2/Z, [%x[params], #-1, MUL VL]\n"
+ "smin z24.s, p2/M, z24.s, z3.s\n"
+ "smin z22.s, p2/M, z22.s, z3.s\n"
+ "smin z21.s, p2/M, z21.s, z3.s\n"
+ "st1b { z24.s }, p0, [x10, x12]\n"
+ "mov z29.d, z23.d\n"
+ "ld1b { z17.b }, p2/Z, [%x[params]]\n"
+ "st1b { z22.s }, p0, [x9, x12]\n"
+ "mov z28.d, z23.d\n"
+ "sdot z28.s, z18.b, z10.b\n"
+ "ld1b { z16.b }, p2/Z, [%x[params], #1, MUL VL]\n"
+ "st1b { z21.s }, p0, [x28, x12]\n"
+ "mov z27.d, z23.d\n"
+ "sdot z23.s, z18.b, z14.b\n"
+ "sdot z23.s, z17.b, z10.b\n"
+ "ext z14.b, z14.b, z14.b, #0x1\n"
+ "ext z10.b, z10.b, z10.b, #0x1\n"
+ "sdot z29.s, z18.b, z14.b\n"
+ "ld1w { z22.s }, p2/Z, [%x[params], #3, MUL VL]\n"
+ "sdot z27.s, z18.b, z10.b\n"
+ "sdot z28.s, z17.b, z8.b\n"
+ "incw x12\n"
+ "whilelt p1.s, x12, %x[n_channels]\n"
+ "sdot z23.s, z16.b, z8.b\n"
+ "ext z8.b, z8.b, z8.b, #0x1\n"
+ "sdot z29.s, z17.b, z10.b\n"
+ "whilelt p0.b, x13, %x[n_channels]\n"
+ "sdot z27.s, z17.b, z8.b\n"
+ "sdot z28.s, z16.b, z6.b\n"
+ "ext z6.b, z6.b, z6.b, #0x1\n"
+ "ld1b { z26.b }, p0/Z, [x26, x13]\n"
+ ".inst 0x04b376f7 // sqrdmulh z23.s, z23.s, z19.s\n"
+ "sdot z29.s, z16.b, z8.b\n"
+ "sdot z27.s, z16.b, z6.b\n"
+ "ld1b { z21.b }, p0/Z, [x25, x13]\n"
+ "and z16.d, z23.d, z22.d\n"
+ "asr z16.s, z16.s, #0x1f\n"
+ "ld1b { z14.b }, p0/Z, [x23, x13]\n"
+ "ld1b { z25.b }, p0/Z, [x22, x13]\n"
+ ".inst 0x04b377bd // sqrdmulh z29.s, z29.s, z19.s\n"
+ ".inst 0x04b3779c // sqrdmulh z28.s, z28.s, z19.s\n"
+ "ld1b { z20.b }, p0/Z, [x21, x13]\n"
+ "ld1b { z10.b }, p0/Z, [x20, x13]\n"
+ ".inst 0x04b3777b // sqrdmulh z27.s, z27.s, z19.s\n"
+ "sqadd z23.s, z23.s, z16.s\n"
+ ".inst 0x44828ad7 // srshl z23.s, p2/M, z23.s, z22.s\n"
+ "ld1b { z15.b }, p0/Z, [x27, x13]\n"
+ "and z19.d, z29.d, z22.d\n"
+ "and z17.d, z28.d, z22.d\n"
+ "ldp x23, x22, [%x[inptrs], #0x40]\n"
+ "ldp x21, x20, [%x[inptrs], #0x50]\n"
+ "and z16.d, z27.d, z22.d\n"
+ "asr z19.s, z19.s, #0x1f\n"
+ "ld1b { z9.b }, p0/Z, [x23, x13]\n"
+ "ld1b { z24.b }, p0/Z, [x22, x13]\n"
+ "asr z17.s, z17.s, #0x1f\n"
+ "asr z16.s, z16.s, #0x1f\n"
+ "ld1b { z18.b }, p0/Z, [x21, x13]\n"
+ "ld1b { z8.b }, p0/Z, [x20, x13]\n"
+ "sqadd z29.s, z29.s, z19.s\n"
+ "sqadd z28.s, z28.s, z17.s\n"
+ ".inst 0x44828add // srshl z29.s, p2/M, z29.s, z22.s\n"
+ ".inst 0x44828adc // srshl z28.s, p2/M, z28.s, z22.s\n"
+ "sqadd z27.s, z27.s, z16.s\n"
+ "add z23.s, z23.s, z2.s\n"
+ ".inst 0x44828adb // srshl z27.s, p2/M, z27.s, z22.s\n"
+ "smax z23.s, p2/M, z23.s, z4.s\n"
+ "add z29.s, z29.s, z2.s\n"
+ "add z28.s, z28.s, z2.s\n"
+ "ld1b { z13.b }, p0/Z, [x24, x13]\n"
+ "ldp x23, x22, [%x[inptrs], #0x60]\n"
+ "add z27.s, z27.s, z2.s\n"
+ "ldp x21, x20, [%x[inptrs], #0x70]\n"
+ "smin z23.s, p2/M, z23.s, z3.s\n"
+ "smax z29.s, p2/M, z29.s, z4.s\n"
+ "smax z28.s, p2/M, z28.s, z4.s\n"
+ "smax z27.s, p2/M, z27.s, z4.s\n"
+ "st1b { z23.s }, p1, [x11, x12]\n"
+ "ld1b { z7.b }, p0/Z, [x23, x13]\n"
+ "ld1b { z23.b }, p0/Z, [x22, x13]\n"
+ "ld1b { z22.b }, p0/Z, [x21, x13]\n"
+ "zip2 z17.b, z15.b, z21.b\n"
+ "zip1 z15.b, z15.b, z21.b\n"
+ "ld1b { z6.b }, p0/Z, [x20, x13]\n"
+ "zip1 z16.b, z26.b, z14.b\n"
+ "zip2 z14.b, z26.b, z14.b\n"
+ "smin z29.s, p2/M, z29.s, z3.s\n"
+ "smin z28.s, p2/M, z28.s, z3.s\n"
+ "smin z27.s, p2/M, z27.s, z3.s\n"
+ "st1b { z29.s }, p1, [x10, x12]\n"
+ "zip2 z12.b, z15.b, z16.b\n"
+ "st1b { z28.s }, p1, [x9, x12]\n"
+ "zip1 z15.b, z15.b, z16.b\n"
+ "zip1 z11.b, z17.b, z14.b\n"
+ "ldp x27, x26, [%x[inptrs], #0x0]\n"
+ "st1b { z27.s }, p1, [x28, x12]\n"
+ "zip2 z14.b, z17.b, z14.b\n"
+ "zip2 z21.b, z13.b, z20.b\n"
+ "ld1w { z5.s }, p2/Z, [%x[params], #4, MUL VL]\n"
+ "zip1 z13.b, z13.b, z20.b\n"
+ "zip1 z20.b, z25.b, z10.b\n"
+ "incw x12\n"
+ "ldp x25, x23, [%x[inptrs], #0x10]\n"
+ "zip2 z10.b, z25.b, z10.b\n"
+ "zip2 z19.b, z9.b, z18.b\n"
+ "ldp x24, x22, [%x[inptrs], #0x20]\n"
+ "ldp x21, x20, [%x[inptrs], #0x30]\n"
+ "zip1 z9.b, z9.b, z18.b\n"
+ "zip1 z18.b, z24.b, z8.b\n"
+ "ld1b { z29.b }, p2/Z, [%x[params], #5, MUL VL]\n"
+ "ld1b { z28.b }, p2/Z, [%x[params], #6, MUL VL]\n"
+ "zip2 z8.b, z24.b, z8.b\n"
+ "zip2 z17.b, z7.b, z22.b\n"
+ "ld1b { z26.b }, p2/Z, [%x[params], #7, MUL VL]\n"
+ "addvl %x[params], %x[params], #8\n"
+ "zip1 z7.b, z7.b, z22.b\n"
+ "zip1 z16.b, z23.b, z6.b\n"
+ "zip2 z6.b, z23.b, z6.b\n"
+ "zip2 z1.b, z13.b, z20.b\n"
+ "zip1 z13.b, z13.b, z20.b\n"
+ "zip1 z0.b, z21.b, z10.b\n"
+ "zip2 z10.b, z21.b, z10.b\n"
+ "zip2 z31.b, z9.b, z18.b\n"
+ "zip1 z9.b, z9.b, z18.b\n"
+ "zip1 z30.b, z19.b, z8.b\n"
+ "zip2 z8.b, z19.b, z8.b\n"
+ "zip2 z27.b, z7.b, z16.b\n"
+ "zip1 z7.b, z7.b, z16.b\n"
+ "zip1 z25.b, z17.b, z6.b\n"
+ "zip2 z6.b, z17.b, z6.b\n"
+ "mov z24.d, z5.d\n"
+ "mov z22.d, z5.d\n"
+ "mov z21.d, z5.d\n"
+ "b.any 1b\n"
+ : [params] "+&r" (params)
+ : [inptrs] "r" (inptrs), [n_channels] "r" (n_channels), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [outptrs] "r" (outptrs), [qp] "r" (&qp)
+ : "cc", "memory", "p0", "p1", "p2", "x9", "x10", "x11", "x12", "x13", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp
new file mode 100644
index 0000000000..6b006e8d51
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp
@@ -0,0 +1,76 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "utils.hpp"
+#include "src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst_impl(unsigned int, const uint8_t *const *, const uint8_t *, const int32_t *, const arm_gemm::Requantize32&, const int32_t *, const int32_t *, uint8_t *const *);
+
+/** SVE depthwise strategy: 3x3 kernel, stride 1, 2x2 output tile, NHWC
+ * layout, quantized uint8 in/out (u8q), implemented with dot-product (UDOT)
+ * arithmetic rather than plain multiply-accumulate.
+ *
+ * The dot-product kernel consumes weights and biases in an interleaved
+ * layout, so this strategy overrides get_storage_size() and
+ * pack_parameters() to delegate to interleave_sve_u8q_3x3_dot.
+ */
+class sve_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst : public DepthwiseDepthfirstStrategy<uint8_t, uint8_t, uint8_t, int32_t>
+{
+  // Shorthand for the base strategy (input, weight, output, accumulator types).
+  using Parent = DepthwiseDepthfirstStrategy<uint8_t, uint8_t, uint8_t, int32_t>;
+
+  public:
+  constexpr static unsigned int kernel_rows = 3;
+  constexpr static unsigned int kernel_cols = 3;
+
+  constexpr static unsigned int stride_rows = 1;
+  constexpr static unsigned int stride_cols = 1;
+
+  // Constructor arguments appear to be (output rows, output cols, kernel
+  // rows, kernel cols, stride rows, stride cols), matching the constants
+  // above and the "output2x2" in the class name -- confirm against the
+  // Parent constructor before modifying.
+  sve_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst(const CPUInfo *) : Parent(2, 2, 3, 3, 1, 1) {}
+
+  arm_gemm::VLType get_vl_type(void) const override { return arm_gemm::VLType::SVE; }
+
+  // Assembly kernel entry point (implemented in the accompanying generic.cpp).
+  Parent::KernelType kernel = sve_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst_impl;
+  Parent::KernelType get_kernel(void) const override { return kernel; }
+  /** Bytes required for the interleaved weight/bias buffer for @p args. */
+  size_t get_storage_size(const DepthwiseArgs &args) const override
+  {
+    return interleave_sve_u8q_3x3_dot::get_packed_size(args);
+  }
+
+  /** Interleave weights and biases into @p buffer for the dot-product kernel.
+   *
+   * The channel count forwarded to the interleaver is
+   * input_channels * channel_multiplier; @p ld_weight_col and
+   * @p ld_weight_row are the strides of the source weight tensor.
+   */
+  void pack_parameters(
+    const DepthwiseArgs &args, void *buffer, const void *biases, const arm_gemm::Requantize32 &qp,
+    const void *weights, size_t ld_weight_col, size_t ld_weight_row
+  ) const override
+  {
+    interleave_sve_u8q_3x3_dot::pack_parameters(
+      args.input_channels * args.channel_multiplier, buffer, reinterpret_cast<const int32_t *>(biases),
+      reinterpret_cast<const uint8_t *>(weights), qp, ld_weight_col, ld_weight_row
+    );
+  }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp
new file mode 100644
index 0000000000..f0860c98b9
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp
@@ -0,0 +1,497 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+
+#include "arm_gemm.hpp"
+#include <cstdint>
+
+namespace arm_conv {
+namespace depthwise {
+
+/** Assembly implementation of the 3x3 / stride-1 / 2x2-output u8q depthwise
+ * dot-product kernel (the strategy class lives in the accompanying .hpp).
+ *
+ * @param n_channels Number of channels to process; drives the WHILELT
+ *                   predicates and the loop's "b.any 1b" termination test.
+ * @param inptrs     Array of input pointers, read in pairs via LDP.
+ * @param params     Packed weights / biases / requantize data as produced by
+ *                   interleave_sve_u8q_3x3_dot; advanced in place by ADDVL
+ *                   (note the "+&r" read-write constraint below).
+ * @param qp         Requantize32 block: the b_offset, c_offset, minval and
+ *                   maxval fields are loaded via the offsetof_* operands.
+ * @param outptrs    Four output pointers (held in x12/x11/x10/x9).
+ *
+ * The two unnamed `const int32_t *` arguments (named requant_muls /
+ * requant_shifts and bias in the sibling MLA kernel) are unused here --
+ * presumably folded into the packed params stream; TODO confirm against
+ * the interleaver.
+ *
+ * NOTE(review): the asm body is generated/hand-scheduled; do not edit the
+ * instruction stream by hand -- regenerate it instead.
+ */
+void sve_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst_impl(const unsigned int n_channels, const uint8_t *const *const inptrs, const uint8_t *params, const int32_t *, const arm_gemm::Requantize32& qp, const int32_t *, const int32_t *, uint8_t *const *const outptrs)
+{
+  __asm__ __volatile__(
+    "mov x14, #0x0\n"
+    "whilelt p0.b, x14, %x[n_channels]\n"
+    "ldp x27, x26, [%x[inptrs], #0x0]\n"
+    "ldp x25, x24, [%x[inptrs], #0x10]\n"
+    "ldp x23, x22, [%x[inptrs], #0x20]\n"
+    "ldp x13, x21, [%x[inptrs], #0x30]\n"
+    "mov x20, #0x1\n"
+    "ptrue p2.b\n"
+    "ldp x12, x11, [%x[outptrs], #0x0]\n"
+    "ldp x10, x9, [%x[outptrs], #0x10]\n"
+    "orr x20, x20, #0x100\n"
+    "orr x20, x20, #0x10000\n"
+    "ld1b { z15.b }, p0/Z, [x27, x14]\n"
+    "ld1b { z21.b }, p0/Z, [x26, x14]\n"
+    "dup z25.s, w20\n"
+    "mov x28, #0x0\n"
+    "ldp x27, x26, [%x[inptrs], #0x40]\n"
+    "ld1b { z31.b }, p0/Z, [x25, x14]\n"
+    "zip2 z16.b, z15.b, z31.b\n"
+    "zip1 z15.b, z15.b, z31.b\n"
+    "ld1b { z29.b }, p0/Z, [x24, x14]\n"
+    "ldp x25, x24, [%x[inptrs], #0x50]\n"
+    "zip1 z30.b, z21.b, z29.b\n"
+    "zip2 z29.b, z21.b, z29.b\n"
+    "ld1b { z9.b }, p0/Z, [x23, x14]\n"
+    "ld1b { z20.b }, p0/Z, [x22, x14]\n"
+    "zip2 z13.b, z15.b, z30.b\n"
+    "zip1 z15.b, z15.b, z30.b\n"
+    "ldp x23, x22, [%x[inptrs], #0x60]\n"
+    "ld1b { z5.b }, p0/Z, [x13, x14]\n"
+    "zip1 z14.b, z16.b, z29.b\n"
+    "zip2 z29.b, z16.b, z29.b\n"
+    "ld1b { z17.b }, p0/Z, [x21, x14]\n"
+    "ldp x21, x20, [%x[inptrs], #0x70]\n"
+    "zip2 z31.b, z9.b, z5.b\n"
+    "zip1 z9.b, z9.b, z5.b\n"
+    "ld1b { z18.b }, p0/Z, [x27, x14]\n"
+    "ld1b { z28.b }, p0/Z, [x26, x14]\n"
+    "zip1 z21.b, z20.b, z17.b\n"
+    "zip2 z17.b, z20.b, z17.b\n"
+    "ld1b { z6.b }, p0/Z, [x25, x14]\n"
+    "ld1b { z4.b }, p0/Z, [x24, x14]\n"
+    "zip2 z23.b, z18.b, z6.b\n"
+    "zip1 z18.b, z18.b, z6.b\n"
+    "ld1b { z2.b }, p0/Z, [x23, x14]\n"
+    "ld1b { z19.b }, p0/Z, [x22, x14]\n"
+    "zip1 z24.b, z28.b, z4.b\n"
+    "zip2 z4.b, z28.b, z4.b\n"
+    "ld1b { z16.b }, p0/Z, [x21, x14]\n"
+    "ld1b { z5.b }, p0/Z, [x20, x14]\n"
+    "zip2 z22.b, z2.b, z16.b\n"
+    "zip1 z2.b, z2.b, z16.b\n"
+    "zip1 z0.b, z19.b, z5.b\n"
+    "zip2 z5.b, z19.b, z5.b\n"
+    "ld1w { z10.s }, p2/Z, [%x[params]]\n"
+    "ld1rw { z7.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_minval]]\n"
+    "ld1rw { z6.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_maxval]]\n"
+    "ld1rw { z8.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_b_offset]]\n"
+    "zip2 z19.b, z9.b, z21.b\n"
+    "zip1 z9.b, z9.b, z21.b\n"
+    "ld1rw { z16.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_c_offset]]\n"
+    "ldp x27, x26, [%x[inptrs], #0x0]\n"
+    "zip1 z11.b, z31.b, z17.b\n"
+    "zip2 z17.b, z31.b, z17.b\n"
+    "ldp x25, x23, [%x[inptrs], #0x10]\n"
+    "ldp x24, x22, [%x[inptrs], #0x20]\n"
+    "zip2 z12.b, z18.b, z24.b\n"
+    "zip1 z18.b, z18.b, z24.b\n"
+    "ldp x21, x20, [%x[inptrs], #0x30]\n"
+    "zip1 z20.b, z23.b, z4.b\n"
+    "zip2 z4.b, z23.b, z4.b\n"
+    "ld1b { z26.b }, p2/Z, [%x[params], #1, MUL VL]\n"
+    "zip2 z24.b, z2.b, z0.b\n"
+    "zip1 z2.b, z2.b, z0.b\n"
+    "ld1b { z3.b }, p2/Z, [%x[params], #2, MUL VL]\n"
+    "ld1b { z1.b }, p2/Z, [%x[params], #3, MUL VL]\n"
+    "zip1 z0.b, z22.b, z5.b\n"
+    "zip2 z5.b, z22.b, z5.b\n"
+    "addvl %x[params], %x[params], #4\n"
+    "mov z22.d, z10.d\n"
+    "mov z31.d, z10.d\n"
+    "mov z21.d, z10.d\n"
+    "1:"  // Loop
+    "mov z30.s, #0x0\n"
+    "udot z30.s, z25.b, z9.b\n"
+    "udot z10.s, z26.b, z15.b\n"
+    "whilelt p0.s, x28, %x[n_channels]\n"
+    "udot z30.s, z25.b, z18.b\n"
+    "udot z31.s, z26.b, z9.b\n"
+    "mov z27.s, #0x0\n"
+    "incw x14, ALL, MUL #4\n"
+    "udot z10.s, z3.b, z9.b\n"
+    "ext z9.b, z9.b, z9.b, #0x1\n"
+    "movprfx z28, z30\n udot z28.s, z25.b, z2.b\n"
+    "udot z30.s, z25.b, z15.b\n"
+    "ext z15.b, z15.b, z15.b, #0x1\n"
+    "udot z27.s, z25.b, z9.b\n"
+    "udot z31.s, z3.b, z18.b\n"
+    "udot z10.s, z1.b, z18.b\n"
+    "ext z18.b, z18.b, z18.b, #0x1\n"
+    "udot z22.s, z26.b, z15.b\n"
+    "udot z21.s, z26.b, z9.b\n"
+    "udot z27.s, z25.b, z18.b\n"
+    "udot z31.s, z1.b, z2.b\n"
+    "ext z2.b, z2.b, z2.b, #0x1\n"
+    "udot z22.s, z3.b, z9.b\n"
+    "udot z21.s, z3.b, z18.b\n"
+    "ld1w { z3.s }, p2/Z, [%x[params], #1, MUL VL]\n"
+    "mls z10.s, p2/M, z30.s, z8.s\n"
+    "movprfx z26, z27\n udot z26.s, z25.b, z2.b\n"
+    "mov z9.s, #0x0\n"
+    "udot z27.s, z25.b, z15.b\n"
+    "ld1w { z23.s }, p2/Z, [%x[params]]\n"
+    "udot z22.s, z1.b, z18.b\n"
+    ".inst 0x04b7754a  // sqrdmulh z10.s, z10.s, z23.s\n"
+    "udot z21.s, z1.b, z2.b\n"
+    "mls z22.s, p2/M, z27.s, z8.s\n"
+    "and z18.d, z10.d, z3.d\n"
+    "mls z31.s, p2/M, z28.s, z8.s\n"
+    "mls z21.s, p2/M, z26.s, z8.s\n"
+    "asr z18.s, z18.s, #0x1f\n"
+    ".inst 0x04b776d6  // sqrdmulh z22.s, z22.s, z23.s\n"
+    ".inst 0x04b777ff  // sqrdmulh z31.s, z31.s, z23.s\n"
+    "udot z9.s, z25.b, z19.b\n"
+    ".inst 0x04b776b5  // sqrdmulh z21.s, z21.s, z23.s\n"
+    "sqadd z10.s, z10.s, z18.s\n"
+    ".inst 0x4482886a  // srshl z10.s, p2/M, z10.s, z3.s\n"
+    "udot z9.s, z25.b, z12.b\n"
+    "and z28.d, z22.d, z3.d\n"
+    "and z23.d, z31.d, z3.d\n"
+    "movprfx z27, z9\n udot z27.s, z25.b, z24.b\n"
+    "ld1w { z30.s }, p2/Z, [%x[params], #6, MUL VL]\n"
+    "and z18.d, z21.d, z3.d\n"
+    "asr z28.s, z28.s, #0x1f\n"
+    "udot z9.s, z25.b, z13.b\n"
+    "asr z23.s, z23.s, #0x1f\n"
+    "asr z18.s, z18.s, #0x1f\n"
+    "sqadd z22.s, z22.s, z28.s\n"
+    "sqadd z31.s, z31.s, z23.s\n"
+    ".inst 0x44828876  // srshl z22.s, p2/M, z22.s, z3.s\n"
+    ".inst 0x4482887f  // srshl z31.s, p2/M, z31.s, z3.s\n"
+    "sqadd z21.s, z21.s, z18.s\n"
+    "add z10.s, z10.s, z16.s\n"
+    ".inst 0x44828875  // srshl z21.s, p2/M, z21.s, z3.s\n"
+    "smax z10.s, p2/M, z10.s, z7.s\n"
+    "add z22.s, z22.s, z16.s\n"
+    "add z31.s, z31.s, z16.s\n"
+    "smin z10.s, p2/M, z10.s, z6.s\n"
+    "smax z22.s, p2/M, z22.s, z7.s\n"
+    "add z21.s, z21.s, z16.s\n"
+    "smax z31.s, p2/M, z31.s, z7.s\n"
+    "smax z21.s, p2/M, z21.s, z7.s\n"
+    "st1b { z10.s }, p0, [x12, x28]\n"
+    "ld1w { z28.s }, p2/Z, [%x[params], #2, MUL VL]\n"
+    "ld1b { z1.b }, p2/Z, [%x[params], #3, MUL VL]\n"
+    "smin z22.s, p2/M, z22.s, z6.s\n"
+    "smin z31.s, p2/M, z31.s, z6.s\n"
+    "smin z21.s, p2/M, z21.s, z6.s\n"
+    "st1b { z22.s }, p0, [x11, x28]\n"
+    "mov z26.d, z28.d\n"
+    "ld1b { z15.b }, p2/Z, [%x[params], #4, MUL VL]\n"
+    "st1b { z31.s }, p0, [x10, x28]\n"
+    "mov z31.d, z28.d\n"
+    "udot z31.s, z1.b, z19.b\n"
+    "ld1b { z23.b }, p2/Z, [%x[params], #5, MUL VL]\n"
+    "st1b { z21.s }, p0, [x9, x28]\n"
+    "mov z22.d, z28.d\n"
+    "udot z28.s, z1.b, z13.b\n"
+    "udot z28.s, z15.b, z19.b\n"
+    "ext z13.b, z13.b, z13.b, #0x1\n"
+    "ext z19.b, z19.b, z19.b, #0x1\n"
+    "udot z26.s, z1.b, z13.b\n"
+    "ld1w { z21.s }, p2/Z, [%x[params], #7, MUL VL]\n"
+    "mov z18.s, #0x0\n"
+    "udot z22.s, z1.b, z19.b\n"
+    "udot z18.s, z25.b, z19.b\n"
+    "incw x28\n"
+    "udot z31.s, z15.b, z12.b\n"
+    "udot z28.s, z23.b, z12.b\n"
+    "ext z12.b, z12.b, z12.b, #0x1\n"
+    "whilelt p0.s, x28, %x[n_channels]\n"
+    "udot z26.s, z15.b, z19.b\n"
+    "udot z22.s, z15.b, z12.b\n"
+    "addvl %x[params], %x[params], #16\n"
+    "udot z18.s, z25.b, z12.b\n"
+    "udot z31.s, z23.b, z24.b\n"
+    "ext z24.b, z24.b, z24.b, #0x1\n"
+    "mls z28.s, p2/M, z9.s, z8.s\n"
+    "udot z26.s, z23.b, z12.b\n"
+    ".inst 0x04be779c  // sqrdmulh z28.s, z28.s, z30.s\n"
+    "udot z22.s, z23.b, z24.b\n"
+    "movprfx z12, z18\n udot z12.s, z25.b, z24.b\n"
+    "and z2.d, z28.d, z21.d\n"
+    "udot z18.s, z25.b, z13.b\n"
+    "mls z26.s, p2/M, z18.s, z8.s\n"
+    "asr z2.s, z2.s, #0x1f\n"
+    "mls z31.s, p2/M, z27.s, z8.s\n"
+    "mls z22.s, p2/M, z12.s, z8.s\n"
+    ".inst 0x04be775a  // sqrdmulh z26.s, z26.s, z30.s\n"
+    ".inst 0x04be77ff  // sqrdmulh z31.s, z31.s, z30.s\n"
+    ".inst 0x04be76d6  // sqrdmulh z22.s, z22.s, z30.s\n"
+    "ld1w { z1.s }, p2/Z, [%x[params], #-4, MUL VL]\n"
+    "sqadd z28.s, z28.s, z2.s\n"
+    "and z24.d, z26.d, z21.d\n"
+    ".inst 0x44828abc  // srshl z28.s, p2/M, z28.s, z21.s\n"
+    "and z23.d, z31.d, z21.d\n"
+    "and z18.d, z22.d, z21.d\n"
+    "asr z24.s, z24.s, #0x1f\n"
+    "asr z23.s, z23.s, #0x1f\n"
+    "asr z18.s, z18.s, #0x1f\n"
+    "sqadd z26.s, z26.s, z24.s\n"
+    ".inst 0x44828aba  // srshl z26.s, p2/M, z26.s, z21.s\n"
+    "ld1b { z30.b }, p2/Z, [%x[params], #-6, MUL VL]\n"
+    "sqadd z31.s, z31.s, z23.s\n"
+    "sqadd z22.s, z22.s, z18.s\n"
+    ".inst 0x44828abf  // srshl z31.s, p2/M, z31.s, z21.s\n"
+    ".inst 0x44828ab6  // srshl z22.s, p2/M, z22.s, z21.s\n"
+    "add z28.s, z28.s, z16.s\n"
+    "smax z28.s, p2/M, z28.s, z7.s\n"
+    "add z26.s, z26.s, z16.s\n"
+    "smin z28.s, p2/M, z28.s, z6.s\n"
+    "add z31.s, z31.s, z16.s\n"
+    "add z22.s, z22.s, z16.s\n"
+    "smax z26.s, p2/M, z26.s, z7.s\n"
+    "smax z31.s, p2/M, z31.s, z7.s\n"
+    "mov z24.s, #0x0\n"
+    "udot z24.s, z25.b, z11.b\n"
+    "smax z22.s, p2/M, z22.s, z7.s\n"
+    "st1b { z28.s }, p0, [x12, x28]\n"
+    "ld1w { z23.s }, p2/Z, [%x[params], #-8, MUL VL]\n"
+    "ld1b { z19.b }, p2/Z, [%x[params], #-7, MUL VL]\n"
+    "smin z26.s, p2/M, z26.s, z6.s\n"
+    "smin z31.s, p2/M, z31.s, z6.s\n"
+    "smin z22.s, p2/M, z22.s, z6.s\n"
+    "st1b { z26.s }, p0, [x11, x28]\n"
+    "mov z28.d, z23.d\n"
+    "udot z24.s, z25.b, z20.b\n"
+    "st1b { z31.s }, p0, [x10, x28]\n"
+    "mov z27.d, z23.d\n"
+    "udot z27.s, z19.b, z11.b\n"
+    "movprfx z13, z24\n udot z13.s, z25.b, z0.b\n"
+    "st1b { z22.s }, p0, [x9, x28]\n"
+    "mov z26.d, z23.d\n"
+    "udot z23.s, z19.b, z14.b\n"
+    "udot z23.s, z30.b, z11.b\n"
+    "udot z24.s, z25.b, z14.b\n"
+    "ext z14.b, z14.b, z14.b, #0x1\n"
+    "ld1b { z21.b }, p2/Z, [%x[params], #-5, MUL VL]\n"
+    "udot z28.s, z19.b, z14.b\n"
+    "ext z11.b, z11.b, z11.b, #0x1\n"
+    "mov z12.s, #0x0\n"
+    "udot z26.s, z19.b, z11.b\n"
+    "ld1w { z22.s }, p2/Z, [%x[params], #-3, MUL VL]\n"
+    "udot z12.s, z25.b, z11.b\n"
+    "udot z27.s, z30.b, z20.b\n"
+    "incw x28\n"
+    "whilelt p0.s, x28, %x[n_channels]\n"
+    "udot z23.s, z21.b, z20.b\n"
+    "ext z20.b, z20.b, z20.b, #0x1\n"
+    "udot z28.s, z30.b, z11.b\n"
+    "udot z26.s, z30.b, z20.b\n"
+    "udot z12.s, z25.b, z20.b\n"
+    "udot z27.s, z21.b, z0.b\n"
+    "ext z0.b, z0.b, z0.b, #0x1\n"
+    "mls z23.s, p2/M, z24.s, z8.s\n"
+    "udot z28.s, z21.b, z20.b\n"
+    "udot z26.s, z21.b, z0.b\n"
+    ".inst 0x04a176f7  // sqrdmulh z23.s, z23.s, z1.s\n"
+    "movprfx z19, z12\n udot z19.s, z25.b, z0.b\n"
+    "udot z12.s, z25.b, z14.b\n"
+    "and z18.d, z23.d, z22.d\n"
+    "mls z28.s, p2/M, z12.s, z8.s\n"
+    "mls z27.s, p2/M, z13.s, z8.s\n"
+    "asr z18.s, z18.s, #0x1f\n"
+    "mls z26.s, p2/M, z19.s, z8.s\n"
+    ".inst 0x04a1779c  // sqrdmulh z28.s, z28.s, z1.s\n"
+    ".inst 0x04a1777b  // sqrdmulh z27.s, z27.s, z1.s\n"
+    ".inst 0x04a1775a  // sqrdmulh z26.s, z26.s, z1.s\n"
+    "ld1w { z2.s }, p2/Z, [%x[params], #2, MUL VL]\n"
+    "sqadd z23.s, z23.s, z18.s\n"
+    "and z20.d, z28.d, z22.d\n"
+    ".inst 0x44828ad7  // srshl z23.s, p2/M, z23.s, z22.s\n"
+    "and z19.d, z27.d, z22.d\n"
+    "and z18.d, z26.d, z22.d\n"
+    "asr z20.s, z20.s, #0x1f\n"
+    "asr z19.s, z19.s, #0x1f\n"
+    "asr z18.s, z18.s, #0x1f\n"
+    "sqadd z28.s, z28.s, z20.s\n"
+    ".inst 0x44828adc  // srshl z28.s, p2/M, z28.s, z22.s\n"
+    "ld1b { z13.b }, p2/Z, [%x[params]]\n"
+    "sqadd z27.s, z27.s, z19.s\n"
+    "sqadd z26.s, z26.s, z18.s\n"
+    ".inst 0x44828adb  // srshl z27.s, p2/M, z27.s, z22.s\n"
+    ".inst 0x44828ada  // srshl z26.s, p2/M, z26.s, z22.s\n"
+    "add z23.s, z23.s, z16.s\n"
+    "smax z23.s, p2/M, z23.s, z7.s\n"
+    "add z28.s, z28.s, z16.s\n"
+    "smin z23.s, p2/M, z23.s, z6.s\n"
+    "add z27.s, z27.s, z16.s\n"
+    "add z26.s, z26.s, z16.s\n"
+    "smax z28.s, p2/M, z28.s, z7.s\n"
+    "smax z27.s, p2/M, z27.s, z7.s\n"
+    "mov z24.s, #0x0\n"
+    "udot z24.s, z25.b, z17.b\n"
+    "smax z26.s, p2/M, z26.s, z7.s\n"
+    "st1b { z23.s }, p0, [x12, x28]\n"
+    "ld1w { z1.s }, p2/Z, [%x[params], #-2, MUL VL]\n"
+    "ld1b { z21.b }, p2/Z, [%x[params], #-1, MUL VL]\n"
+    "smin z28.s, p2/M, z28.s, z6.s\n"
+    "smin z27.s, p2/M, z27.s, z6.s\n"
+    "smin z26.s, p2/M, z26.s, z6.s\n"
+    "st1b { z28.s }, p0, [x11, x28]\n"
+    "mov z0.d, z1.d\n"
+    "udot z24.s, z25.b, z4.b\n"
+    "st1b { z27.s }, p0, [x10, x28]\n"
+    "mov z31.d, z1.d\n"
+    "udot z31.s, z21.b, z17.b\n"
+    "movprfx z23, z24\n udot z23.s, z25.b, z5.b\n"
+    "st1b { z26.s }, p0, [x9, x28]\n"
+    "mov z30.d, z1.d\n"
+    "udot z1.s, z21.b, z29.b\n"
+    "udot z1.s, z13.b, z17.b\n"
+    "udot z24.s, z25.b, z29.b\n"
+    "ext z29.b, z29.b, z29.b, #0x1\n"
+    "ld1b { z20.b }, p2/Z, [%x[params], #1, MUL VL]\n"
+    "udot z0.s, z21.b, z29.b\n"
+    "ext z17.b, z17.b, z17.b, #0x1\n"
+    "mov z19.s, #0x0\n"
+    "udot z30.s, z21.b, z17.b\n"
+    "ld1w { z22.s }, p2/Z, [%x[params], #3, MUL VL]\n"
+    "udot z19.s, z25.b, z17.b\n"
+    "udot z31.s, z13.b, z4.b\n"
+    "incw x28\n"
+    "whilelt p1.s, x28, %x[n_channels]\n"
+    "udot z1.s, z20.b, z4.b\n"
+    "ext z4.b, z4.b, z4.b, #0x1\n"
+    "udot z0.s, z13.b, z17.b\n"
+    "whilelt p0.b, x14, %x[n_channels]\n"
+    "udot z30.s, z13.b, z4.b\n"
+    "udot z19.s, z25.b, z4.b\n"
+    "ld1b { z13.b }, p0/Z, [x26, x14]\n"
+    "ld1b { z28.b }, p0/Z, [x25, x14]\n"
+    "udot z31.s, z20.b, z5.b\n"
+    "ext z5.b, z5.b, z5.b, #0x1\n"
+    "mls z1.s, p2/M, z24.s, z8.s\n"
+    "ld1b { z27.b }, p0/Z, [x22, x14]\n"
+    "udot z0.s, z20.b, z4.b\n"
+    "udot z30.s, z20.b, z5.b\n"
+    ".inst 0x04a27421  // sqrdmulh z1.s, z1.s, z2.s\n"
+    "ld1b { z26.b }, p0/Z, [x21, x14]\n"
+    "movprfx z18, z19\n udot z18.s, z25.b, z5.b\n"
+    "udot z19.s, z25.b, z29.b\n"
+    "and z11.d, z1.d, z22.d\n"
+    "ld1b { z29.b }, p0/Z, [x23, x14]\n"
+    "mls z0.s, p2/M, z19.s, z8.s\n"
+    "mls z31.s, p2/M, z23.s, z8.s\n"
+    "asr z11.s, z11.s, #0x1f\n"
+    "ld1b { z17.b }, p0/Z, [x20, x14]\n"
+    "mls z30.s, p2/M, z18.s, z8.s\n"
+    ".inst 0x04a27400  // sqrdmulh z0.s, z0.s, z2.s\n"
+    ".inst 0x04a277ff  // sqrdmulh z31.s, z31.s, z2.s\n"
+    ".inst 0x04a277de  // sqrdmulh z30.s, z30.s, z2.s\n"
+    "ld1b { z15.b }, p0/Z, [x27, x14]\n"
+    "ldp x23, x22, [%x[inptrs], #0x40]\n"
+    "sqadd z1.s, z1.s, z11.s\n"
+    "and z21.d, z0.d, z22.d\n"
+    ".inst 0x44828ac1  // srshl z1.s, p2/M, z1.s, z22.s\n"
+    "ldp x21, x20, [%x[inptrs], #0x50]\n"
+    "and z20.d, z31.d, z22.d\n"
+    "and z19.d, z30.d, z22.d\n"
+    "ld1b { z18.b }, p0/Z, [x23, x14]\n"
+    "ld1b { z11.b }, p0/Z, [x22, x14]\n"
+    "asr z21.s, z21.s, #0x1f\n"
+    "asr z20.s, z20.s, #0x1f\n"
+    "ld1b { z24.b }, p0/Z, [x21, x14]\n"
+    "ld1b { z4.b }, p0/Z, [x20, x14]\n"
+    "asr z19.s, z19.s, #0x1f\n"
+    "sqadd z0.s, z0.s, z21.s\n"
+    ".inst 0x44828ac0  // srshl z0.s, p2/M, z0.s, z22.s\n"
+    "ld1b { z3.b }, p2/Z, [%x[params], #6, MUL VL]\n"
+    "sqadd z31.s, z31.s, z20.s\n"
+    "sqadd z30.s, z30.s, z19.s\n"
+    ".inst 0x44828adf  // srshl z31.s, p2/M, z31.s, z22.s\n"
+    ".inst 0x44828ade  // srshl z30.s, p2/M, z30.s, z22.s\n"
+    "add z1.s, z1.s, z16.s\n"
+    "smax z1.s, p2/M, z1.s, z7.s\n"
+    "add z0.s, z0.s, z16.s\n"
+    "ld1b { z9.b }, p0/Z, [x24, x14]\n"
+    "add z31.s, z31.s, z16.s\n"
+    "add z30.s, z30.s, z16.s\n"
+    "ldp x23, x22, [%x[inptrs], #0x60]\n"
+    "ldp x21, x20, [%x[inptrs], #0x70]\n"
+    "smin z1.s, p2/M, z1.s, z6.s\n"
+    "smax z0.s, p2/M, z0.s, z7.s\n"
+    "st1b { z1.s }, p1, [x12, x28]\n"
+    "ld1b { z2.b }, p0/Z, [x23, x14]\n"
+    "smax z31.s, p2/M, z31.s, z7.s\n"
+    "smax z30.s, p2/M, z30.s, z7.s\n"
+    "ld1b { z23.b }, p0/Z, [x22, x14]\n"
+    "ld1b { z22.b }, p0/Z, [x21, x14]\n"
+    "ld1b { z5.b }, p0/Z, [x20, x14]\n"
+    "zip2 z20.b, z15.b, z28.b\n"
+    "zip1 z15.b, z15.b, z28.b\n"
+    "smin z0.s, p2/M, z0.s, z6.s\n"
+    "zip1 z19.b, z13.b, z29.b\n"
+    "zip2 z29.b, z13.b, z29.b\n"
+    "smin z31.s, p2/M, z31.s, z6.s\n"
+    "smin z30.s, p2/M, z30.s, z6.s\n"
+    "st1b { z0.s }, p1, [x11, x28]\n"
+    "zip2 z13.b, z15.b, z19.b\n"
+    "zip1 z15.b, z15.b, z19.b\n"
+    "ldp x27, x26, [%x[inptrs], #0x0]\n"
+    "st1b { z31.s }, p1, [x10, x28]\n"
+    "zip1 z14.b, z20.b, z29.b\n"
+    "zip2 z29.b, z20.b, z29.b\n"
+    "ld1w { z10.s }, p2/Z, [%x[params], #4, MUL VL]\n"
+    "st1b { z30.s }, p1, [x9, x28]\n"
+    "zip2 z21.b, z9.b, z26.b\n"
+    "zip1 z9.b, z9.b, z26.b\n"
+    "incw x28\n"
+    "zip1 z20.b, z27.b, z17.b\n"
+    "zip2 z17.b, z27.b, z17.b\n"
+    "ldp x25, x23, [%x[inptrs], #0x10]\n"
+    "ldp x24, x22, [%x[inptrs], #0x20]\n"
+    "zip2 z31.b, z18.b, z24.b\n"
+    "zip1 z18.b, z18.b, z24.b\n"
+    "ldp x21, x20, [%x[inptrs], #0x30]\n"
+    "ld1b { z26.b }, p2/Z, [%x[params], #5, MUL VL]\n"
+    "zip1 z27.b, z11.b, z4.b\n"
+    "zip2 z4.b, z11.b, z4.b\n"
+    "ld1b { z1.b }, p2/Z, [%x[params], #7, MUL VL]\n"
+    "addvl %x[params], %x[params], #8\n"
+    "zip2 z30.b, z2.b, z22.b\n"
+    "zip1 z2.b, z2.b, z22.b\n"
+    "zip1 z28.b, z23.b, z5.b\n"
+    "zip2 z5.b, z23.b, z5.b\n"
+    "zip2 z19.b, z9.b, z20.b\n"
+    "zip1 z9.b, z9.b, z20.b\n"
+    "zip1 z11.b, z21.b, z17.b\n"
+    "zip2 z17.b, z21.b, z17.b\n"
+    "zip2 z12.b, z18.b, z27.b\n"
+    "zip1 z18.b, z18.b, z27.b\n"
+    "zip1 z20.b, z31.b, z4.b\n"
+    "zip2 z4.b, z31.b, z4.b\n"
+    "zip2 z24.b, z2.b, z28.b\n"
+    "zip1 z2.b, z2.b, z28.b\n"
+    "zip1 z0.b, z30.b, z5.b\n"
+    "zip2 z5.b, z30.b, z5.b\n"
+    "mov z22.d, z10.d\n"
+    "mov z31.d, z10.d\n"
+    "mov z21.d, z10.d\n"
+    "b.any 1b\n"
+    : [params] "+&r" (params)
+    : [inptrs] "r" (inptrs), [n_channels] "r" (n_channels), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [outptrs] "r" (outptrs), [qp] "r" (&qp)
+    : "cc", "memory", "p0", "p1", "p2", "x9", "x10", "x11", "x12", "x13", "x14", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+  );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
new file mode 100644
index 0000000000..0300b71d7c
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "utils.hpp"
+#include "src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl(unsigned int, const uint8_t *const *, const uint8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, uint8_t *const *);
+
+/** SVE depthwise strategy: 3x3 kernel, stride 1, 2x2 output tile, NHWC
+ * layout, quantized uint8 in/out (u8q), using plain multiply-accumulate
+ * (MLA) arithmetic.
+ *
+ * Unlike the dot-product variant, this class overrides neither
+ * get_storage_size() nor pack_parameters(), so the base-class parameter
+ * handling is used as-is.
+ */
+class sve_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy<uint8_t, uint8_t, uint8_t, int32_t>
+{
+  // Shorthand for the base strategy (input, weight, output, accumulator types).
+  using Parent = DepthwiseDepthfirstStrategy<uint8_t, uint8_t, uint8_t, int32_t>;
+
+  public:
+  constexpr static unsigned int kernel_rows = 3;
+  constexpr static unsigned int kernel_cols = 3;
+
+  constexpr static unsigned int stride_rows = 1;
+  constexpr static unsigned int stride_cols = 1;
+
+  // Constructor arguments appear to be (output rows, output cols, kernel
+  // rows, kernel cols, stride rows, stride cols), matching the constants
+  // above -- confirm against the Parent constructor before modifying.
+  sve_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst(const CPUInfo *) : Parent(2, 2, 3, 3, 1, 1) {}
+
+  arm_gemm::VLType get_vl_type(void) const override { return arm_gemm::VLType::SVE; }
+
+  // Assembly kernel entry point (implemented in the accompanying generic.cpp).
+  Parent::KernelType kernel = sve_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl;
+  Parent::KernelType get_kernel(void) const override { return kernel; }
+  // Advertise an accumulator depth of two vector lengths to the framework.
+  unsigned int get_accumulator_depth_vl(void) const override { return 2; }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp
new file mode 100644
index 0000000000..5c26010c0d
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp
@@ -0,0 +1,410 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_gemm.hpp"
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl(
+ const unsigned int n_channels,
+ const uint8_t *const *const inptrs,
+ const uint8_t *const weights,
+ const int32_t *const bias,
+ const arm_gemm::Requantize32 &qp,
+ const int32_t *const requant_muls,
+ const int32_t *const requant_shifts,
+ uint8_t *const *const outptrs
+)
+{
+ struct Params
+ {
+ long unsigned int n_channels;
+ const void *weights;
+ const int32_t *bias;
+ const arm_gemm::Requantize32 *requant;
+ const int32_t *const requant_muls;
+ const int32_t *const requant_shifts;
+ uint8_t *const *const outptrs;
+ const uint8_t *inptrs[16];
+
+ Params(
+ long unsigned int n_channels,
+ const uint8_t *const *inptrs_raw,
+ const void *const weights,
+ const int32_t *const bias,
+ const arm_gemm::Requantize32 &qp,
+ const int32_t *const requant_muls,
+ const int32_t *const requant_shifts,
+ uint8_t *const *outptrs
+ ) : n_channels(n_channels), weights(weights), bias(bias),
+ requant(&qp), requant_muls(requant_muls),
+ requant_shifts(requant_shifts), outptrs(outptrs)
+ {
+ inptrs[0] = inptrs_raw[5];
+ inptrs[1] = inptrs_raw[0];
+ inptrs[2] = inptrs_raw[3];
+ inptrs[3] = inptrs_raw[6];
+ inptrs[4] = inptrs_raw[9];
+ inptrs[5] = inptrs_raw[12];
+ inptrs[6] = inptrs_raw[15];
+ inptrs[7] = inptrs_raw[1];
+ inptrs[8] = inptrs_raw[2];
+ inptrs[9] = inptrs_raw[10];
+ inptrs[10] = inptrs_raw[4];
+ inptrs[11] = inptrs_raw[7];
+ inptrs[12] = inptrs_raw[8];
+ inptrs[13] = inptrs_raw[11];
+ inptrs[14] = inptrs_raw[13];
+ inptrs[15] = inptrs_raw[14];
+
+ }
+ };
+
+ const Params params(n_channels, inptrs, weights, bias, qp,
+ requant_muls, requant_shifts, outptrs);
+
+ __asm__ __volatile__(
+ "mov x16, #0x0\n"
+ "ldr x25, [%x[params], %[offsetof_Params_requant]]\n"
+ "ptrue p4.b\n"
+ "ldr x24, [%x[params], %[offsetof_Params_outptrs]]\n"
+ "mov x23, x16\n"
+ "add x21, x25, %[offsetof_Requantize32_a_offset]\n"
+ "ldr x15, [%x[params], %[offsetof_Params_n_channels]]\n"
+ "ldr x14, [%x[params], %[offsetof_Params_weights]]\n"
+ "add x20, x25, %[offsetof_Requantize32_b_offset]\n"
+ "add x22, x25, %[offsetof_Requantize32_c_offset]\n"
+ "ld1rb { z12.b }, p4/Z, [x21]\n"
+ "ld1rb { z30.b }, p4/Z, [x20]\n"
+ "add x21, x25, %[offsetof_Requantize32_minval]\n"
+ "add x20, x25, %[offsetof_Requantize32_maxval]\n"
+ "ld1rh { z24.h }, p4/Z, [x22]\n"
+ "ld1rh { z11.h }, p4/Z, [x21]\n"
+ "ld1rh { z26.h }, p4/Z, [x20]\n"
+ "ldp x13, x12, [x24, #0x0]\n"
+ "incw x23\n"
+ "whilelt p3.h, x16, x15\n"
+ "ldp x11, x10, [x24, #0x10]\n"
+ "whilelt p2.s, x16, x15\n"
+ "whilelt p1.s, x23, x15\n"
+ "ldr x9, [%x[params], %[offsetof_Params_bias]]\n"
+ "ld1b { z14.h }, p4/Z, [x14]\n"
+ "ld1b { z21.h }, p4/Z, [x14, #1, MUL VL]\n"
+ "add x28, %x[params], %[offsetof_Params_inptrs]\n"
+ "mov x27, #0x0\n"
+ "ld1b { z1.h }, p4/Z, [x14, #2, MUL VL]\n"
+ "ld1b { z6.h }, p4/Z, [x14, #3, MUL VL]\n"
+ ".inst 0x455e19ce // usublb z14.h, z14.b, z30.b\n"
+ ".inst 0x455e1ab5 // usublb z21.h, z21.b, z30.b\n"
+ "ld1b { z2.h }, p4/Z, [x14, #4, MUL VL]\n"
+ "ld1b { z18.h }, p4/Z, [x14, #5, MUL VL]\n"
+ ".inst 0x455e1821 // usublb z1.h, z1.b, z30.b\n"
+ ".inst 0x455e18c6 // usublb z6.h, z6.b, z30.b\n"
+ "ld1b { z7.h }, p4/Z, [x14, #6, MUL VL]\n"
+ "ld1b { z10.h }, p4/Z, [x14, #7, MUL VL]\n"
+ "inch x14, ALL, MUL #8\n"
+ ".inst 0x455e1842 // usublb z2.h, z2.b, z30.b\n"
+ "ld1w { z17.s }, p2/Z, [x9]\n"
+ "ld1w { z16.s }, p1/Z, [x9, #1, MUL VL]\n"
+ "uzp1 z5.s, z17.s, z16.s\n"
+ "uzp2 z9.s, z17.s, z16.s\n"
+ "ld1b { z8.h }, p4/Z, [x14]\n"
+ "ldp x24, x23, [x28, #0x0]\n"
+ "addvl x9, x9, #2\n"
+ "mov z17.d, z5.d\n"
+ "ldp x22, x21, [x28, #0x10]\n"
+ "ldr x20, [x28, #0x20]\n"
+ "mov z25.d, z9.d\n"
+ "mov z16.d, z5.d\n"
+ "ld1b { z0.h }, p3/Z, [x24, x16]\n"
+ "ld1b { z29.h }, p3/Z, [x23, x16]\n"
+ "mov z23.d, z9.d\n"
+ "mov z22.d, z5.d\n"
+ "ld1b { z4.h }, p3/Z, [x22, x16]\n"
+ "ld1b { z13.h }, p3/Z, [x21, x16]\n"
+ "mov z27.d, z9.d\n"
+ ".inst 0x455e1a52 // usublb z18.h, z18.b, z30.b\n"
+ "ld1b { z20.h }, p3/Z, [x20, x16]\n"
+ "ldr x26, [%x[params], %[offsetof_Params_requant_muls]]\n"
+ ".inst 0x455e18e7 // usublb z7.h, z7.b, z30.b\n"
+ ".inst 0x455e194a // usublb z10.h, z10.b, z30.b\n"
+ "ldr x25, [%x[params], %[offsetof_Params_requant_shifts]]\n"
+ "str x9, [%x[params], %[offsetof_Params_bias]]\n"
+ ".inst 0x455e1908 // usublb z8.h, z8.b, z30.b\n"
+ ".inst 0x454c1800 // usublb z0.h, z0.b, z12.b\n"
+ ".inst 0x454c1bbd // usublb z29.h, z29.b, z12.b\n"
+ ".inst 0x454c1884 // usublb z4.h, z4.b, z12.b\n"
+ ".inst 0x454c19ad // usublb z13.h, z13.b, z12.b\n"
+ ".inst 0x454c1a94 // usublb z20.h, z20.b, z12.b\n"
+ "1:" // Loop
+ ".inst 0x44824005 // smlalb z5.s, p4/M, z0.h, z2.h\n"
+ ".inst 0x44824409 // smlalt z9.s, p4/M, z0.h, z2.h\n"
+ "ldr x20, [x28, #0x28]\n"
+ "ldr x21, [x28, #0x38]\n"
+ ".inst 0x448e43a5 // smlalb z5.s, p4/M, z29.h, z14.h\n"
+ ".inst 0x44864011 // smlalb z17.s, p4/M, z0.h, z6.h\n"
+ "ld1b { z3.h }, p3/Z, [x20, x16]\n"
+ "ldr x20, [x28, #0x30]\n"
+ ".inst 0x44954010 // smlalb z16.s, p4/M, z0.h, z21.h\n"
+ ".inst 0x448e4016 // smlalb z22.s, p4/M, z0.h, z14.h\n"
+ "ld1b { z31.h }, p3/Z, [x21, x16]\n"
+ ".inst 0x454c1863 // usublb z3.h, z3.b, z12.b\n"
+ ".inst 0x448e47a9 // smlalt z9.s, p4/M, z29.h, z14.h\n"
+ ".inst 0x449241a5 // smlalb z5.s, p4/M, z13.h, z18.h\n"
+ "ldr x21, [x28, #0x40]\n"
+ "ld1b { z15.h }, p3/Z, [x20, x16]\n"
+ ".inst 0x44864419 // smlalt z25.s, p4/M, z0.h, z6.h\n"
+ ".inst 0x44954417 // smlalt z23.s, p4/M, z0.h, z21.h\n"
+ ".inst 0x454c1bff // usublb z31.h, z31.b, z12.b\n"
+ "ldr x20, [x28, #0x48]\n"
+ ".inst 0x448e441b // smlalt z27.s, p4/M, z0.h, z14.h\n"
+ ".inst 0x44814091 // smlalb z17.s, p4/M, z4.h, z1.h\n"
+ "ld1b { z19.h }, p3/Z, [x21, x16]\n"
+ ".inst 0x454c19ef // usublb z15.h, z15.b, z12.b\n"
+ ".inst 0x448141b0 // smlalb z16.s, p4/M, z13.h, z1.h\n"
+ ".inst 0x449541b6 // smlalb z22.s, p4/M, z13.h, z21.h\n"
+ "ld1b { z28.h }, p3/Z, [x20, x16]\n"
+ ".inst 0x454c1a73 // usublb z19.h, z19.b, z12.b\n"
+ ".inst 0x449245a9 // smlalt z9.s, p4/M, z13.h, z18.h\n"
+ ".inst 0x448a4285 // smlalb z5.s, p4/M, z20.h, z10.h\n"
+ "ldr x21, [x28, #0x50]\n"
+ "ldr x20, [x28, #0x58]\n"
+ ".inst 0x44814499 // smlalt z25.s, p4/M, z4.h, z1.h\n"
+ ".inst 0x448145b7 // smlalt z23.s, p4/M, z13.h, z1.h\n"
+ ".inst 0x454c1b9c // usublb z28.h, z28.b, z12.b\n"
+ "ld1b { z4.h }, p3/Z, [x21, x16]\n"
+ ".inst 0x449545bb // smlalt z27.s, p4/M, z13.h, z21.h\n"
+ ".inst 0x448241b1 // smlalb z17.s, p4/M, z13.h, z2.h\n"
+ "ld1b { z29.h }, p3/Z, [x20, x16]\n"
+ "ldr x21, [x28, #0x60]\n"
+ ".inst 0x44874070 // smlalb z16.s, p4/M, z3.h, z7.h\n"
+ ".inst 0x44864296 // smlalb z22.s, p4/M, z20.h, z6.h\n"
+ "ldr x20, [x28, #0x68]\n"
+ ".inst 0x454c1884 // usublb z4.h, z4.b, z12.b\n"
+ ".inst 0x448a4689 // smlalt z9.s, p4/M, z20.h, z10.h\n"
+ ".inst 0x449543e5 // smlalb z5.s, p4/M, z31.h, z21.h\n"
+ ".inst 0x454c1bbd // usublb z29.h, z29.b, z12.b\n"
+ "ld1b { z0.h }, p3/Z, [x21, x16]\n"
+ ".inst 0x448245b9 // smlalt z25.s, p4/M, z13.h, z2.h\n"
+ ".inst 0x44874477 // smlalt z23.s, p4/M, z3.h, z7.h\n"
+ "ld1b { z3.h }, p3/Z, [x20, x16]\n"
+ "ldr x20, [x28, #0x70]\n"
+ ".inst 0x4486469b // smlalt z27.s, p4/M, z20.h, z6.h\n"
+ ".inst 0x44874291 // smlalb z17.s, p4/M, z20.h, z7.h\n"
+ ".inst 0x454c1800 // usublb z0.h, z0.b, z12.b\n"
+ "ld1b { z13.h }, p3/Z, [x20, x16]\n"
+ ".inst 0x44824290 // smlalb z16.s, p4/M, z20.h, z2.h\n"
+ ".inst 0x448841f6 // smlalb z22.s, p4/M, z15.h, z8.h\n"
+ ".inst 0x454c1863 // usublb z3.h, z3.b, z12.b\n"
+ "ldr x20, [x28, #0x78]\n"
+ ".inst 0x449547e9 // smlalt z9.s, p4/M, z31.h, z21.h\n"
+ ".inst 0x44814265 // smlalb z5.s, p4/M, z19.h, z1.h\n"
+ ".inst 0x454c19ad // usublb z13.h, z13.b, z12.b\n"
+ "whilelt p0.h, x27, x15\n"
+ ".inst 0x44874699 // smlalt z25.s, p4/M, z20.h, z7.h\n"
+ ".inst 0x44824697 // smlalt z23.s, p4/M, z20.h, z2.h\n"
+ "ld1w { z20.s }, p2/Z, [x26]\n"
+ "inch x14\n"
+ ".inst 0x448845fb // smlalt z27.s, p4/M, z15.h, z8.h\n"
+ ".inst 0x448e43f1 // smlalb z17.s, p4/M, z31.h, z14.h\n"
+ "ld1w { z15.s }, p1/Z, [x26, #1, MUL VL]\n"
+ "ldr x21, [%x[params], %[offsetof_Params_bias]]\n"
+ ".inst 0x44924390 // smlalb z16.s, p4/M, z28.h, z18.h\n"
+ ".inst 0x44824396 // smlalb z22.s, p4/M, z28.h, z2.h\n"
+ "addvl x26, x26, #2\n"
+ ".inst 0x44814669 // smlalt z9.s, p4/M, z19.h, z1.h\n"
+ ".inst 0x44884385 // smlalb z5.s, p4/M, z28.h, z8.h\n"
+ ".inst 0x448e47f9 // smlalt z25.s, p4/M, z31.h, z14.h\n"
+ ".inst 0x44924797 // smlalt z23.s, p4/M, z28.h, z18.h\n"
+ "ld1b { z31.h }, p3/Z, [x20, x16]\n"
+ ".inst 0x454c1bff // usublb z31.h, z31.b, z12.b\n"
+ ".inst 0x4482479b // smlalt z27.s, p4/M, z28.h, z2.h\n"
+ ".inst 0x44954271 // smlalb z17.s, p4/M, z19.h, z21.h\n"
+ "uzp1 z2.s, z20.s, z15.s\n"
+ "inch x16\n"
+ ".inst 0x448e4090 // smlalb z16.s, p4/M, z4.h, z14.h\n"
+ ".inst 0x448143b6 // smlalb z22.s, p4/M, z29.h, z1.h\n"
+ "uzp2 z15.s, z20.s, z15.s\n"
+ "ld1w { z20.s }, p2/Z, [x25]\n"
+ ".inst 0x44884789 // smlalt z9.s, p4/M, z28.h, z8.h\n"
+ ".inst 0x44864085 // smlalb z5.s, p4/M, z4.h, z6.h\n"
+ "mov x20, x16\n"
+ "incw x20\n"
+ ".inst 0x44954679 // smlalt z25.s, p4/M, z19.h, z21.h\n"
+ ".inst 0x448e4497 // smlalt z23.s, p4/M, z4.h, z14.h\n"
+ "ld1w { z19.s }, p1/Z, [x25, #1, MUL VL]\n"
+ "uzp1 z21.s, z20.s, z19.s\n"
+ ".inst 0x448147bb // smlalt z27.s, p4/M, z29.h, z1.h\n"
+ ".inst 0x448a4391 // smlalb z17.s, p4/M, z28.h, z10.h\n"
+ "uzp2 z1.s, z20.s, z19.s\n"
+ "whilelt p2.s, x16, x15\n"
+ ".inst 0x44864010 // smlalb z16.s, p4/M, z0.h, z6.h\n"
+ ".inst 0x44924076 // smlalb z22.s, p4/M, z3.h, z18.h\n"
+ "whilelt p1.s, x20, x15\n"
+ "whilelt p3.h, x16, x15\n"
+ ".inst 0x44864489 // smlalt z9.s, p4/M, z4.h, z6.h\n"
+ ".inst 0x44874005 // smlalb z5.s, p4/M, z0.h, z7.h\n"
+ ".inst 0x04a274a5 // sqrdmulh z5.s, z5.s, z2.s\n"
+ "addvl x25, x25, #2\n"
+ ".inst 0x448a4799 // smlalt z25.s, p4/M, z28.h, z10.h\n"
+ ".inst 0x44864417 // smlalt z23.s, p4/M, z0.h, z6.h\n"
+ "and z19.d, z5.d, z21.d\n"
+ ".inst 0x4492447b // smlalt z27.s, p4/M, z3.h, z18.h\n"
+ ".inst 0x449243b1 // smlalb z17.s, p4/M, z29.h, z18.h\n"
+ "asr z19.s, z19.s, #0x1f\n"
+ ".inst 0x448a41b0 // smlalb z16.s, p4/M, z13.h, z10.h\n"
+ ".inst 0x448741b6 // smlalb z22.s, p4/M, z13.h, z7.h\n"
+ "sqadd z5.s, z5.s, z19.s\n"
+ ".inst 0x448292a5 // srshl z5.s, p4/M, z5.s, z21.s\n"
+ ".inst 0x44874409 // smlalt z9.s, p4/M, z0.h, z7.h\n"
+ ".inst 0x449247b9 // smlalt z25.s, p4/M, z29.h, z18.h\n"
+ ".inst 0x04af7529 // sqrdmulh z9.s, z9.s, z15.s\n"
+ ".inst 0x448a45b7 // smlalt z23.s, p4/M, z13.h, z10.h\n"
+ ".inst 0x448745bb // smlalt z27.s, p4/M, z13.h, z7.h\n"
+ "and z29.d, z9.d, z1.d\n"
+ ".inst 0x44884071 // smlalb z17.s, p4/M, z3.h, z8.h\n"
+ ".inst 0x448843f0 // smlalb z16.s, p4/M, z31.h, z8.h\n"
+ ".inst 0x04a27631 // sqrdmulh z17.s, z17.s, z2.s\n"
+ ".inst 0x448a43f6 // smlalb z22.s, p4/M, z31.h, z10.h\n"
+ ".inst 0x44884479 // smlalt z25.s, p4/M, z3.h, z8.h\n"
+ ".inst 0x04a27610 // sqrdmulh z16.s, z16.s, z2.s\n"
+ ".inst 0x448847f7 // smlalt z23.s, p4/M, z31.h, z8.h\n"
+ ".inst 0x448a47fb // smlalt z27.s, p4/M, z31.h, z10.h\n"
+ ".inst 0x04a276d6 // sqrdmulh z22.s, z22.s, z2.s\n"
+ "asr z29.s, z29.s, #0x1f\n"
+ "and z18.d, z17.d, z21.d\n"
+ ".inst 0x04af7739 // sqrdmulh z25.s, z25.s, z15.s\n"
+ "and z20.d, z16.d, z21.d\n"
+ ".inst 0x04af76f7 // sqrdmulh z23.s, z23.s, z15.s\n"
+ "and z19.d, z22.d, z21.d\n"
+ ".inst 0x04af777b // sqrdmulh z27.s, z27.s, z15.s\n"
+ "sqadd z9.s, z9.s, z29.s\n"
+ ".inst 0x44829029 // srshl z9.s, p4/M, z9.s, z1.s\n"
+ "asr z18.s, z18.s, #0x1f\n"
+ "and z7.d, z25.d, z1.d\n"
+ "asr z20.s, z20.s, #0x1f\n"
+ "and z6.d, z23.d, z1.d\n"
+ "asr z19.s, z19.s, #0x1f\n"
+ "and z2.d, z27.d, z1.d\n"
+ "sqadd z17.s, z17.s, z18.s\n"
+ "asr z7.s, z7.s, #0x1f\n"
+ ".inst 0x448292b1 // srshl z17.s, p4/M, z17.s, z21.s\n"
+ "sqadd z16.s, z16.s, z20.s\n"
+ "asr z6.s, z6.s, #0x1f\n"
+ ".inst 0x448292b0 // srshl z16.s, p4/M, z16.s, z21.s\n"
+ "sqadd z22.s, z22.s, z19.s\n"
+ "asr z2.s, z2.s, #0x1f\n"
+ ".inst 0x448292b6 // srshl z22.s, p4/M, z22.s, z21.s\n"
+ "sqadd z25.s, z25.s, z7.s\n"
+ "sqadd z23.s, z23.s, z6.s\n"
+ ".inst 0x44829039 // srshl z25.s, p4/M, z25.s, z1.s\n"
+ ".inst 0x44829037 // srshl z23.s, p4/M, z23.s, z1.s\n"
+ "sqadd z27.s, z27.s, z2.s\n"
+ ".inst 0x453040a5 // sqxtnb z5.h, z5.s\n"
+ ".inst 0x4482903b // srshl z27.s, p4/M, z27.s, z1.s\n"
+ ".inst 0x45304231 // sqxtnb z17.h, z17.s\n"
+ ".inst 0x45304210 // sqxtnb z16.h, z16.s\n"
+ ".inst 0x453042d6 // sqxtnb z22.h, z22.s\n"
+ ".inst 0x45304525 // sqxtnt z5.h, z9.s\n"
+ ".inst 0x45304731 // sqxtnt z17.h, z25.s\n"
+ ".inst 0x453046f0 // sqxtnt z16.h, z23.s\n"
+ ".inst 0x45304776 // sqxtnt z22.h, z27.s\n"
+ "sqadd z5.h, z5.h, z24.h\n"
+ "smax z5.h, p4/M, z5.h, z11.h\n"
+ "smin z5.h, p4/M, z5.h, z26.h\n"
+ "sqadd z17.h, z17.h, z24.h\n"
+ "sqadd z16.h, z16.h, z24.h\n"
+ "smax z17.h, p4/M, z17.h, z11.h\n"
+ "smax z16.h, p4/M, z16.h, z11.h\n"
+ "sqadd z22.h, z22.h, z24.h\n"
+ "smax z22.h, p4/M, z22.h, z11.h\n"
+ "smin z17.h, p4/M, z17.h, z26.h\n"
+ "st1b { z5.h }, p0, [x13, x27]\n"
+ "smin z16.h, p4/M, z16.h, z26.h\n"
+ "smin z22.h, p4/M, z22.h, z26.h\n"
+ "st1b { z17.h }, p0, [x12, x27]\n"
+ "st1b { z16.h }, p0, [x11, x27]\n"
+ "st1b { z22.h }, p0, [x10, x27]\n"
+ "ld1b { z14.h }, p4/Z, [x14]\n"
+ "ld1b { z21.h }, p4/Z, [x14, #1, MUL VL]\n"
+ "inch x27\n"
+ "ld1b { z1.h }, p4/Z, [x14, #2, MUL VL]\n"
+ "ld1b { z6.h }, p4/Z, [x14, #3, MUL VL]\n"
+ ".inst 0x455e19ce // usublb z14.h, z14.b, z30.b\n"
+ ".inst 0x455e1ab5 // usublb z21.h, z21.b, z30.b\n"
+ "ld1b { z2.h }, p4/Z, [x14, #4, MUL VL]\n"
+ "ld1b { z18.h }, p4/Z, [x14, #5, MUL VL]\n"
+ ".inst 0x455e1821 // usublb z1.h, z1.b, z30.b\n"
+ ".inst 0x455e18c6 // usublb z6.h, z6.b, z30.b\n"
+ "ld1b { z7.h }, p4/Z, [x14, #6, MUL VL]\n"
+ "ld1b { z10.h }, p4/Z, [x14, #7, MUL VL]\n"
+ "inch x14, ALL, MUL #8\n"
+ ".inst 0x455e1842 // usublb z2.h, z2.b, z30.b\n"
+ "ld1w { z17.s }, p2/Z, [x21]\n"
+ "ld1w { z16.s }, p1/Z, [x21, #1, MUL VL]\n"
+ "uzp1 z5.s, z17.s, z16.s\n"
+ "uzp2 z9.s, z17.s, z16.s\n"
+ "ld1b { z8.h }, p4/Z, [x14]\n"
+ "ldp x24, x23, [x28, #0x0]\n"
+ "addvl x21, x21, #2\n"
+ "str x21, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldp x22, x21, [x28, #0x10]\n"
+ "ldr x20, [x28, #0x20]\n"
+ "mov z17.d, z5.d\n"
+ "mov z25.d, z9.d\n"
+ "ld1b { z0.h }, p3/Z, [x24, x16]\n"
+ "ld1b { z29.h }, p3/Z, [x23, x16]\n"
+ "mov z16.d, z5.d\n"
+ "mov z23.d, z9.d\n"
+ "ld1b { z4.h }, p3/Z, [x22, x16]\n"
+ "ld1b { z13.h }, p3/Z, [x21, x16]\n"
+ "mov z22.d, z5.d\n"
+ "mov z27.d, z9.d\n"
+ "ld1b { z20.h }, p3/Z, [x20, x16]\n"
+ ".inst 0x455e1a52 // usublb z18.h, z18.b, z30.b\n"
+ ".inst 0x455e18e7 // usublb z7.h, z7.b, z30.b\n"
+ ".inst 0x455e194a // usublb z10.h, z10.b, z30.b\n"
+ ".inst 0x455e1908 // usublb z8.h, z8.b, z30.b\n"
+ ".inst 0x454c1800 // usublb z0.h, z0.b, z12.b\n"
+ ".inst 0x454c1bbd // usublb z29.h, z29.b, z12.b\n"
+ ".inst 0x454c1884 // usublb z4.h, z4.b, z12.b\n"
+ ".inst 0x454c19ad // usublb z13.h, z13.b, z12.b\n"
+ ".inst 0x454c1a94 // usublb z20.h, z20.b, z12.b\n"
+ "b.any 1b\n"
+ :
+ : [offsetof_Params_bias] "I" (offsetof(Params, bias)), [offsetof_Params_inptrs] "I" (offsetof(Params, inptrs)), [offsetof_Params_n_channels] "I" (offsetof(Params, n_channels)), [offsetof_Params_outptrs] "I" (offsetof(Params, outptrs)), [offsetof_Params_requant] "I" (offsetof(Params, requant)), [offsetof_Params_requant_muls] "I" (offsetof(Params, requant_muls)), [offsetof_Params_requant_shifts] "I" (offsetof(Params, requant_shifts)), [offsetof_Params_weights] "I" (offsetof(Params, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [params] "r" (&params)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
new file mode 100644
index 0000000000..bcd0d60d3c
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "utils.hpp"
+#include "src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(unsigned int, const uint8_t *const *, const uint8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, uint8_t *const *);
+
+// Strategy descriptor for the asymmetric-quantized uint8 ("u8q") NHWC depthwise
+// kernel: 3x3 window, stride 2, producing a 2x2 output tile per invocation,
+// implemented with SVE MLA-style instructions (see the matching generic.cpp).
+// Template parameters <input, weight, output, accumulator> = <u8, u8, u8, s32>.
+class sve_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy<uint8_t, uint8_t, uint8_t, int32_t>
+{
+ using Parent = DepthwiseDepthfirstStrategy<uint8_t, uint8_t, uint8_t, int32_t>;
+
+ public:
+ // Static kernel geometry, mirrored in the Parent(...) ctor arguments below.
+ constexpr static unsigned int kernel_rows = 3;
+ constexpr static unsigned int kernel_cols = 3;
+
+ constexpr static unsigned int stride_rows = 2;
+ constexpr static unsigned int stride_cols = 2;
+
+ // Parent args: output rows/cols (2, 2), kernel rows/cols (3, 3), strides (2, 2).
+ // The CPUInfo pointer is accepted for interface uniformity but unused here.
+ sve_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst(const CPUInfo *) : Parent(2, 2, 3, 3, 2, 2) {}
+
+ // This strategy requires (vector-length-agnostic) SVE.
+ arm_gemm::VLType get_vl_type(void) const override { return arm_gemm::VLType::SVE; }
+
+ // The actual compute routine lives in the companion generic.cpp.
+ Parent::KernelType kernel = sve_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl;
+ Parent::KernelType get_kernel(void) const override { return kernel; }
+ // The kernel accumulates two vectors' worth of channels per pass
+ // (the asm maintains paired even/odd 32-bit accumulator registers).
+ unsigned int get_accumulator_depth_vl(void) const override { return 2; }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp
new file mode 100644
index 0000000000..1ea2fcbfbd
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp
@@ -0,0 +1,451 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_gemm.hpp"
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+// Compute routine for the u8q NHWC 3x3 stride-2 depthwise kernel (2x2 output
+// tile per call, per the kernel name). NOTE: the inline-assembly body is
+// machine generated — do not hand edit; regenerate instead.
+//
+// Parameters:
+//   n_channels      — number of channels to process (channel-innermost NHWC).
+//   inptrs          — 25 input-point pointers (one per tap/output position).
+//   weights         — packed 3x3-per-channel weights consumed in-order by the asm.
+//   bias            — per-channel int32 bias values.
+//   qp              — requantization parameters (a/b/c offsets, min/max clamp).
+//   requant_muls    — per-channel requantize multipliers (sqrdmulh operands).
+//   requant_shifts  — per-channel requantize shifts (srshl operands).
+//   outptrs         — 4 output pointers, one per output point of the 2x2 tile.
+void sve_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(
+ const unsigned int n_channels,
+ const uint8_t *const *const inptrs,
+ const uint8_t *const weights,
+ const int32_t *const bias,
+ const arm_gemm::Requantize32 &qp,
+ const int32_t *const requant_muls,
+ const int32_t *const requant_shifts,
+ uint8_t *const *const outptrs
+)
+{
+ // Argument block marshalled into a single struct so the asm can address
+ // everything relative to one base register via the offsetof_* constants.
+ struct Params
+ {
+ long unsigned int n_channels;
+ const void *weights;
+ const int32_t *bias;
+ const arm_gemm::Requantize32 *requant;
+ const int32_t *const requant_muls;
+ const int32_t *const requant_shifts;
+ uint8_t *const *const outptrs;
+ const uint8_t *inptrs[25];
+
+ Params(
+ long unsigned int n_channels,
+ const uint8_t *const *inptrs_raw,
+ const void *const weights,
+ const int32_t *const bias,
+ const arm_gemm::Requantize32 &qp,
+ const int32_t *const requant_muls,
+ const int32_t *const requant_shifts,
+ uint8_t *const *outptrs
+ ) : n_channels(n_channels), weights(weights), bias(bias),
+ requant(&qp), requant_muls(requant_muls),
+ requant_shifts(requant_shifts), outptrs(outptrs)
+ {
+ // Permute the caller's input pointers into the fixed order the generated
+ // asm consumes (presumably its tap-visitation order — determined by the
+ // generator; keep in sync with it).
+ inptrs[0] = inptrs_raw[12];
+ inptrs[1] = inptrs_raw[0];
+ inptrs[2] = inptrs_raw[1];
+ inptrs[3] = inptrs_raw[3];
+ inptrs[4] = inptrs_raw[4];
+ inptrs[5] = inptrs_raw[5];
+ inptrs[6] = inptrs_raw[6];
+ inptrs[7] = inptrs_raw[2];
+ inptrs[8] = inptrs_raw[8];
+ inptrs[9] = inptrs_raw[9];
+ inptrs[10] = inptrs_raw[7];
+ inptrs[11] = inptrs_raw[15];
+ inptrs[12] = inptrs_raw[10];
+ inptrs[13] = inptrs_raw[16];
+ inptrs[14] = inptrs_raw[11];
+ inptrs[15] = inptrs_raw[18];
+ inptrs[16] = inptrs_raw[13];
+ inptrs[17] = inptrs_raw[19];
+ inptrs[18] = inptrs_raw[20];
+ inptrs[19] = inptrs_raw[14];
+ inptrs[20] = inptrs_raw[21];
+ inptrs[21] = inptrs_raw[17];
+ inptrs[22] = inptrs_raw[23];
+ inptrs[23] = inptrs_raw[22];
+ inptrs[24] = inptrs_raw[24];
+
+ }
+ };
+
+ const Params params(n_channels, inptrs, weights, bias, qp,
+ requant_muls, requant_shifts, outptrs);
+
+ // Generated SVE assembly. Outline (from the instruction stream below):
+ //  - prologue: load requant offsets/clamps (ld1rb/ld1rh), widen weights to
+ //    16-bit with the b_offset subtracted (usublb), load bias into paired
+ //    even/odd s32 accumulators (uzp1/uzp2), load first input vectors;
+ //  - loop "1:": widening multiply-accumulate (smlalb/smlalt) of all taps,
+ //    then requantize — sqrdmulh by the per-channel multiplier, rounding
+ //    shift (srshl) with the correction term, narrow (sqxtnb/sqxtnt), add
+ //    c_offset, clamp with smax/smin — and store one byte-vector per output
+ //    point; predicate registers (whilelt/b.any) drive the channel loop so
+ //    the tail needs no scalar fallback.
+ // The ".inst 0x…" words are pre-encoded instructions (mnemonic in the
+ // trailing comment) for assemblers lacking the corresponding SVE mnemonics.
+ __asm__ __volatile__(
+ "mov x7, #0x0\n"
+ "ldr x25, [%x[params], %[offsetof_Params_requant]]\n"
+ "ptrue p4.b\n"
+ "ldr x24, [%x[params], %[offsetof_Params_outptrs]]\n"
+ "mov x23, x7\n"
+ "add x21, x25, %[offsetof_Requantize32_a_offset]\n"
+ "ldr x8, [%x[params], %[offsetof_Params_n_channels]]\n"
+ "ldr x17, [%x[params], %[offsetof_Params_weights]]\n"
+ "add x20, x25, %[offsetof_Requantize32_b_offset]\n"
+ "add x22, x25, %[offsetof_Requantize32_c_offset]\n"
+ "ld1rb { z26.b }, p4/Z, [x21]\n"
+ "ld1rb { z13.b }, p4/Z, [x20]\n"
+ "add x21, x25, %[offsetof_Requantize32_minval]\n"
+ "add x20, x25, %[offsetof_Requantize32_maxval]\n"
+ "ld1rh { z19.h }, p4/Z, [x22]\n"
+ "ld1rh { z12.h }, p4/Z, [x21]\n"
+ "ld1rh { z9.h }, p4/Z, [x20]\n"
+ "ldp x16, x15, [x24, #0x0]\n"
+ "incw x23\n"
+ "whilelt p3.h, x7, x8\n"
+ "ldp x14, x13, [x24, #0x10]\n"
+ "whilelt p2.s, x7, x8\n"
+ "whilelt p1.s, x23, x8\n"
+ "ldr x12, [%x[params], %[offsetof_Params_bias]]\n"
+ "ld1b { z25.h }, p4/Z, [x17]\n"
+ "ld1b { z30.h }, p4/Z, [x17, #1, MUL VL]\n"
+ "add x11, %x[params], %[offsetof_Params_inptrs]\n"
+ "mov x10, #0x0\n"
+ "ld1b { z14.h }, p4/Z, [x17, #2, MUL VL]\n"
+ "ld1b { z4.h }, p4/Z, [x17, #3, MUL VL]\n"
+ ".inst 0x454d1b39 // usublb z25.h, z25.b, z13.b\n"
+ ".inst 0x454d1bde // usublb z30.h, z30.b, z13.b\n"
+ "ld1b { z10.h }, p4/Z, [x17, #4, MUL VL]\n"
+ "ld1b { z3.h }, p4/Z, [x17, #5, MUL VL]\n"
+ ".inst 0x454d19ce // usublb z14.h, z14.b, z13.b\n"
+ ".inst 0x454d1884 // usublb z4.h, z4.b, z13.b\n"
+ "ld1b { z23.h }, p4/Z, [x17, #6, MUL VL]\n"
+ "ld1b { z7.h }, p4/Z, [x17, #7, MUL VL]\n"
+ "inch x17, ALL, MUL #8\n"
+ ".inst 0x454d194a // usublb z10.h, z10.b, z13.b\n"
+ "ld1w { z17.s }, p2/Z, [x12]\n"
+ "ld1w { z16.s }, p1/Z, [x12, #1, MUL VL]\n"
+ "uzp1 z8.s, z17.s, z16.s\n"
+ "uzp2 z24.s, z17.s, z16.s\n"
+ "ld1b { z2.h }, p4/Z, [x17]\n"
+ "ldp x27, x26, [x11, #0x0]\n"
+ "addvl x12, x12, #2\n"
+ "mov z18.d, z8.d\n"
+ "ldp x25, x24, [x11, #0x10]\n"
+ "ldp x23, x22, [x11, #0x20]\n"
+ "mov z0.d, z24.d\n"
+ "mov z15.d, z8.d\n"
+ "ldp x21, x20, [x11, #0x30]\n"
+ "ld1b { z21.h }, p3/Z, [x27, x7]\n"
+ "mov z1.d, z24.d\n"
+ "mov z5.d, z8.d\n"
+ "ld1b { z22.h }, p3/Z, [x26, x7]\n"
+ "ld1b { z11.h }, p3/Z, [x25, x7]\n"
+ "mov z6.d, z24.d\n"
+ ".inst 0x454d1863 // usublb z3.h, z3.b, z13.b\n"
+ "ld1b { z20.h }, p3/Z, [x24, x7]\n"
+ "ld1b { z27.h }, p3/Z, [x23, x7]\n"
+ ".inst 0x454d1af7 // usublb z23.h, z23.b, z13.b\n"
+ ".inst 0x454d18e7 // usublb z7.h, z7.b, z13.b\n"
+ "ld1b { z28.h }, p3/Z, [x22, x7]\n"
+ "ld1b { z16.h }, p3/Z, [x21, x7]\n"
+ ".inst 0x454d1842 // usublb z2.h, z2.b, z13.b\n"
+ ".inst 0x455a1ab5 // usublb z21.h, z21.b, z26.b\n"
+ "ld1b { z31.h }, p3/Z, [x20, x7]\n"
+ "ldr x9, [%x[params], %[offsetof_Params_requant_muls]]\n"
+ ".inst 0x455a1ad6 // usublb z22.h, z22.b, z26.b\n"
+ ".inst 0x455a196b // usublb z11.h, z11.b, z26.b\n"
+ "ldr x28, [%x[params], %[offsetof_Params_requant_shifts]]\n"
+ "str x12, [%x[params], %[offsetof_Params_bias]]\n"
+ ".inst 0x455a1a94 // usublb z20.h, z20.b, z26.b\n"
+ ".inst 0x455a1b7b // usublb z27.h, z27.b, z26.b\n"
+ ".inst 0x455a1b9c // usublb z28.h, z28.b, z26.b\n"
+ ".inst 0x455a1a10 // usublb z16.h, z16.b, z26.b\n"
+ ".inst 0x455a1bff // usublb z31.h, z31.b, z26.b\n"
+ "1:" // Loop
+ ".inst 0x448242a8 // smlalb z8.s, p4/M, z21.h, z2.h\n"
+ "ldr x21, [x11, #0x58]\n"
+ "ldr x20, [x11, #0x78]\n"
+ ".inst 0x448246b8 // smlalt z24.s, p4/M, z21.h, z2.h\n"
+ ".inst 0x449942c8 // smlalb z8.s, p4/M, z22.h, z25.h\n"
+ "ld1b { z17.h }, p3/Z, [x21, x7]\n"
+ "ld1b { z29.h }, p3/Z, [x20, x7]\n"
+ ".inst 0x449742b2 // smlalb z18.s, p4/M, z21.h, z23.h\n"
+ "ldr x21, [x11, #0x60]\n"
+ "ldr x20, [x11, #0x80]\n"
+ ".inst 0x448e42af // smlalb z15.s, p4/M, z21.h, z14.h\n"
+ ".inst 0x449942a5 // smlalb z5.s, p4/M, z21.h, z25.h\n"
+ ".inst 0x449946d8 // smlalt z24.s, p4/M, z22.h, z25.h\n"
+ ".inst 0x455a1a31 // usublb z17.h, z17.b, z26.b\n"
+ ".inst 0x449e4168 // smlalb z8.s, p4/M, z11.h, z30.h\n"
+ "ld1b { z22.h }, p3/Z, [x21, x7]\n"
+ ".inst 0x455a1bbd // usublb z29.h, z29.b, z26.b\n"
+ ".inst 0x449746a0 // smlalt z0.s, p4/M, z21.h, z23.h\n"
+ ".inst 0x448e46a1 // smlalt z1.s, p4/M, z21.h, z14.h\n"
+ "ldr x21, [x11, #0x68]\n"
+ ".inst 0x449946a6 // smlalt z6.s, p4/M, z21.h, z25.h\n"
+ "ld1b { z21.h }, p3/Z, [x20, x7]\n"
+ "ldr x20, [x11, #0x88]\n"
+ ".inst 0x449e4292 // smlalb z18.s, p4/M, z20.h, z30.h\n"
+ ".inst 0x4484422f // smlalb z15.s, p4/M, z17.h, z4.h\n"
+ ".inst 0x448a43a5 // smlalb z5.s, p4/M, z29.h, z10.h\n"
+ ".inst 0x455a1ad6 // usublb z22.h, z22.b, z26.b\n"
+ "ldr x22, [x11, #0x40]\n"
+ ".inst 0x449e4578 // smlalt z24.s, p4/M, z11.h, z30.h\n"
+ ".inst 0x455a1ab5 // usublb z21.h, z21.b, z26.b\n"
+ ".inst 0x44844388 // smlalb z8.s, p4/M, z28.h, z4.h\n"
+ "ld1b { z11.h }, p3/Z, [x21, x7]\n"
+ ".inst 0x449e4680 // smlalt z0.s, p4/M, z20.h, z30.h\n"
+ "ld1b { z20.h }, p3/Z, [x20, x7]\n"
+ ".inst 0x44844621 // smlalt z1.s, p4/M, z17.h, z4.h\n"
+ "ldr x21, [x11, #0x70]\n"
+ ".inst 0x448a47a6 // smlalt z6.s, p4/M, z29.h, z10.h\n"
+ "ldr x20, [x11, #0x98]\n"
+ ".inst 0x448e4372 // smlalb z18.s, p4/M, z27.h, z14.h\n"
+ "ldr x23, [x11, #0x50]\n"
+ ".inst 0x449942cf // smlalb z15.s, p4/M, z22.h, z25.h\n"
+ ".inst 0x449e42a5 // smlalb z5.s, p4/M, z21.h, z30.h\n"
+ ".inst 0x455a196b // usublb z11.h, z11.b, z26.b\n"
+ "ld1b { z17.h }, p3/Z, [x22, x7]\n"
+ ".inst 0x44844798 // smlalt z24.s, p4/M, z28.h, z4.h\n"
+ ".inst 0x455a1a94 // usublb z20.h, z20.b, z26.b\n"
+ ".inst 0x448a4208 // smlalb z8.s, p4/M, z16.h, z10.h\n"
+ "ld1b { z29.h }, p3/Z, [x21, x7]\n"
+ "ld1b { z28.h }, p3/Z, [x20, x7]\n"
+ ".inst 0x448e4760 // smlalt z0.s, p4/M, z27.h, z14.h\n"
+ "ldr x22, [x11, #0x48]\n"
+ ".inst 0x449946c1 // smlalt z1.s, p4/M, z22.h, z25.h\n"
+ ".inst 0x449e46a6 // smlalt z6.s, p4/M, z21.h, z30.h\n"
+ "ldr x21, [x11, #0x90]\n"
+ "ldr x20, [x11, #0xa8]\n"
+ ".inst 0x449943f2 // smlalb z18.s, p4/M, z31.h, z25.h\n"
+ "ld1b { z27.h }, p3/Z, [x23, x7]\n"
+ ".inst 0x448a416f // smlalb z15.s, p4/M, z11.h, z10.h\n"
+ ".inst 0x44834285 // smlalb z5.s, p4/M, z20.h, z3.h\n"
+ ".inst 0x455a1a31 // usublb z17.h, z17.b, z26.b\n"
+ ".inst 0x448a4618 // smlalt z24.s, p4/M, z16.h, z10.h\n"
+ ".inst 0x455a1bbd // usublb z29.h, z29.b, z26.b\n"
+ ".inst 0x448e43e8 // smlalb z8.s, p4/M, z31.h, z14.h\n"
+ "ld1b { z16.h }, p3/Z, [x22, x7]\n"
+ ".inst 0x455a1b9c // usublb z28.h, z28.b, z26.b\n"
+ ".inst 0x449947e0 // smlalt z0.s, p4/M, z31.h, z25.h\n"
+ "ld1b { z25.h }, p3/Z, [x21, x7]\n"
+ ".inst 0x448a4561 // smlalt z1.s, p4/M, z11.h, z10.h\n"
+ "ld1b { z11.h }, p3/Z, [x20, x7]\n"
+ ".inst 0x455a1b7b // usublb z27.h, z27.b, z26.b\n"
+ ".inst 0x44834686 // smlalt z6.s, p4/M, z20.h, z3.h\n"
+ "ldr x21, [x11, #0xa0]\n"
+ "ldr x20, [x11, #0xb0]\n"
+ ".inst 0x448a4232 // smlalb z18.s, p4/M, z17.h, z10.h\n"
+ ".inst 0x449e43af // smlalb z15.s, p4/M, z29.h, z30.h\n"
+ ".inst 0x455a1a10 // usublb z16.h, z16.b, z26.b\n"
+ ".inst 0x448e4385 // smlalb z5.s, p4/M, z28.h, z14.h\n"
+ ".inst 0x448e47f8 // smlalt z24.s, p4/M, z31.h, z14.h\n"
+ ".inst 0x455a1b39 // usublb z25.h, z25.b, z26.b\n"
+ "ld1b { z20.h }, p3/Z, [x21, x7]\n"
+ ".inst 0x455a196b // usublb z11.h, z11.b, z26.b\n"
+ ".inst 0x44834368 // smlalb z8.s, p4/M, z27.h, z3.h\n"
+ "ld1b { z31.h }, p3/Z, [x20, x7]\n"
+ ".inst 0x448a4620 // smlalt z0.s, p4/M, z17.h, z10.h\n"
+ ".inst 0x449e47a1 // smlalt z1.s, p4/M, z29.h, z30.h\n"
+ ".inst 0x448e4786 // smlalt z6.s, p4/M, z28.h, z14.h\n"
+ "ldr x20, [x11, #0xb8]\n"
+ ".inst 0x455a1a94 // usublb z20.h, z20.b, z26.b\n"
+ ".inst 0x44834212 // smlalb z18.s, p4/M, z16.h, z3.h\n"
+ ".inst 0x4497432f // smlalb z15.s, p4/M, z25.h, z23.h\n"
+ ".inst 0x455a1bff // usublb z31.h, z31.b, z26.b\n"
+ "ld1b { z30.h }, p3/Z, [x20, x7]\n"
+ ".inst 0x44844165 // smlalb z5.s, p4/M, z11.h, z4.h\n"
+ ".inst 0x44834778 // smlalt z24.s, p4/M, z27.h, z3.h\n"
+ "ldr x20, [x11, #0xc0]\n"
+ "ld1w { z17.s }, p2/Z, [x9]\n"
+ ".inst 0x449742c8 // smlalb z8.s, p4/M, z22.h, z23.h\n"
+ ".inst 0x44834600 // smlalt z0.s, p4/M, z16.h, z3.h\n"
+ "ld1w { z14.s }, p1/Z, [x9, #1, MUL VL]\n"
+ ".inst 0x455a1bde // usublb z30.h, z30.b, z26.b\n"
+ ".inst 0x44974721 // smlalt z1.s, p4/M, z25.h, z23.h\n"
+ ".inst 0x44844566 // smlalt z6.s, p4/M, z11.h, z4.h\n"
+ "ld1b { z25.h }, p3/Z, [x20, x7]\n"
+ "uzp1 z10.s, z17.s, z14.s\n"
+ ".inst 0x44844372 // smlalb z18.s, p4/M, z27.h, z4.h\n"
+ ".inst 0x4487428f // smlalb z15.s, p4/M, z20.h, z7.h\n"
+ "uzp2 z14.s, z17.s, z14.s\n"
+ "ld1w { z17.s }, p2/Z, [x28]\n"
+ ".inst 0x448743e5 // smlalb z5.s, p4/M, z31.h, z7.h\n"
+ ".inst 0x449746d8 // smlalt z24.s, p4/M, z22.h, z23.h\n"
+ "ld1w { z16.s }, p1/Z, [x28, #1, MUL VL]\n"
+ ".inst 0x455a1b39 // usublb z25.h, z25.b, z26.b\n"
+ ".inst 0x448743a8 // smlalb z8.s, p4/M, z29.h, z7.h\n"
+ ".inst 0x44844760 // smlalt z0.s, p4/M, z27.h, z4.h\n"
+ "uzp1 z4.s, z17.s, z16.s\n"
+ "inch x7\n"
+ ".inst 0x44874681 // smlalt z1.s, p4/M, z20.h, z7.h\n"
+ ".inst 0x448747e6 // smlalt z6.s, p4/M, z31.h, z7.h\n"
+ ".inst 0x04aa7508 // sqrdmulh z8.s, z8.s, z10.s\n"
+ "whilelt p0.h, x10, x8\n"
+ ".inst 0x448742b2 // smlalb z18.s, p4/M, z21.h, z7.h\n"
+ ".inst 0x4483416f // smlalb z15.s, p4/M, z11.h, z3.h\n"
+ "uzp2 z22.s, z17.s, z16.s\n"
+ "mov x20, x7\n"
+ ".inst 0x449743c5 // smlalb z5.s, p4/M, z30.h, z23.h\n"
+ ".inst 0x448747b8 // smlalt z24.s, p4/M, z29.h, z7.h\n"
+ "and z17.d, z8.d, z4.d\n"
+ "inch x17\n"
+ ".inst 0x448746a0 // smlalt z0.s, p4/M, z21.h, z7.h\n"
+ ".inst 0x44834561 // smlalt z1.s, p4/M, z11.h, z3.h\n"
+ ".inst 0x04ae7718 // sqrdmulh z24.s, z24.s, z14.s\n"
+ "incw x20\n"
+ ".inst 0x449747c6 // smlalt z6.s, p4/M, z30.h, z23.h\n"
+ ".inst 0x44824392 // smlalb z18.s, p4/M, z28.h, z2.h\n"
+ "asr z17.s, z17.s, #0x1f\n"
+ "whilelt p2.s, x7, x8\n"
+ ".inst 0x448243cf // smlalb z15.s, p4/M, z30.h, z2.h\n"
+ ".inst 0x44824325 // smlalb z5.s, p4/M, z25.h, z2.h\n"
+ "and z16.d, z24.d, z22.d\n"
+ "whilelt p1.s, x20, x8\n"
+ ".inst 0x44824780 // smlalt z0.s, p4/M, z28.h, z2.h\n"
+ ".inst 0x448247c1 // smlalt z1.s, p4/M, z30.h, z2.h\n"
+ ".inst 0x04aa7652 // sqrdmulh z18.s, z18.s, z10.s\n"
+ "ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
+ ".inst 0x44824726 // smlalt z6.s, p4/M, z25.h, z2.h\n"
+ ".inst 0x04aa75ef // sqrdmulh z15.s, z15.s, z10.s\n"
+ "whilelt p3.h, x7, x8\n"
+ "addvl x9, x9, #2\n"
+ ".inst 0x04aa74a5 // sqrdmulh z5.s, z5.s, z10.s\n"
+ "sqadd z8.s, z8.s, z17.s\n"
+ ".inst 0x44829088 // srshl z8.s, p4/M, z8.s, z4.s\n"
+ "addvl x28, x28, #2\n"
+ "asr z16.s, z16.s, #0x1f\n"
+ "and z21.d, z18.d, z4.d\n"
+ ".inst 0x04ae7400 // sqrdmulh z0.s, z0.s, z14.s\n"
+ "and z20.d, z15.d, z4.d\n"
+ ".inst 0x04ae7421 // sqrdmulh z1.s, z1.s, z14.s\n"
+ "and z28.d, z5.d, z4.d\n"
+ ".inst 0x04ae74c6 // sqrdmulh z6.s, z6.s, z14.s\n"
+ "sqadd z24.s, z24.s, z16.s\n"
+ ".inst 0x448292d8 // srshl z24.s, p4/M, z24.s, z22.s\n"
+ "asr z21.s, z21.s, #0x1f\n"
+ "and z25.d, z0.d, z22.d\n"
+ "asr z20.s, z20.s, #0x1f\n"
+ "and z17.d, z1.d, z22.d\n"
+ "asr z28.s, z28.s, #0x1f\n"
+ "and z16.d, z6.d, z22.d\n"
+ "sqadd z18.s, z18.s, z21.s\n"
+ "asr z25.s, z25.s, #0x1f\n"
+ ".inst 0x44829092 // srshl z18.s, p4/M, z18.s, z4.s\n"
+ "sqadd z15.s, z15.s, z20.s\n"
+ "asr z17.s, z17.s, #0x1f\n"
+ ".inst 0x4482908f // srshl z15.s, p4/M, z15.s, z4.s\n"
+ "sqadd z5.s, z5.s, z28.s\n"
+ "asr z16.s, z16.s, #0x1f\n"
+ ".inst 0x44829085 // srshl z5.s, p4/M, z5.s, z4.s\n"
+ "sqadd z0.s, z0.s, z25.s\n"
+ "sqadd z1.s, z1.s, z17.s\n"
+ ".inst 0x448292c0 // srshl z0.s, p4/M, z0.s, z22.s\n"
+ ".inst 0x448292c1 // srshl z1.s, p4/M, z1.s, z22.s\n"
+ "sqadd z6.s, z6.s, z16.s\n"
+ ".inst 0x45304108 // sqxtnb z8.h, z8.s\n"
+ ".inst 0x448292c6 // srshl z6.s, p4/M, z6.s, z22.s\n"
+ ".inst 0x45304252 // sqxtnb z18.h, z18.s\n"
+ ".inst 0x453041ef // sqxtnb z15.h, z15.s\n"
+ ".inst 0x453040a5 // sqxtnb z5.h, z5.s\n"
+ ".inst 0x45304708 // sqxtnt z8.h, z24.s\n"
+ ".inst 0x45304412 // sqxtnt z18.h, z0.s\n"
+ ".inst 0x4530442f // sqxtnt z15.h, z1.s\n"
+ ".inst 0x453044c5 // sqxtnt z5.h, z6.s\n"
+ "sqadd z8.h, z8.h, z19.h\n"
+ "smax z8.h, p4/M, z8.h, z12.h\n"
+ "smin z8.h, p4/M, z8.h, z9.h\n"
+ "sqadd z18.h, z18.h, z19.h\n"
+ "sqadd z15.h, z15.h, z19.h\n"
+ "smax z18.h, p4/M, z18.h, z12.h\n"
+ "smax z15.h, p4/M, z15.h, z12.h\n"
+ "sqadd z5.h, z5.h, z19.h\n"
+ "smax z5.h, p4/M, z5.h, z12.h\n"
+ "smin z18.h, p4/M, z18.h, z9.h\n"
+ "st1b { z8.h }, p0, [x16, x10]\n"
+ "smin z15.h, p4/M, z15.h, z9.h\n"
+ "smin z5.h, p4/M, z5.h, z9.h\n"
+ "st1b { z18.h }, p0, [x15, x10]\n"
+ "st1b { z15.h }, p0, [x14, x10]\n"
+ "st1b { z5.h }, p0, [x13, x10]\n"
+ "ld1b { z25.h }, p4/Z, [x17]\n"
+ "ld1b { z30.h }, p4/Z, [x17, #1, MUL VL]\n"
+ "inch x10\n"
+ "ld1b { z14.h }, p4/Z, [x17, #2, MUL VL]\n"
+ "ld1b { z4.h }, p4/Z, [x17, #3, MUL VL]\n"
+ ".inst 0x454d1b39 // usublb z25.h, z25.b, z13.b\n"
+ ".inst 0x454d1bde // usublb z30.h, z30.b, z13.b\n"
+ "ld1b { z10.h }, p4/Z, [x17, #4, MUL VL]\n"
+ "ld1b { z3.h }, p4/Z, [x17, #5, MUL VL]\n"
+ ".inst 0x454d19ce // usublb z14.h, z14.b, z13.b\n"
+ ".inst 0x454d1884 // usublb z4.h, z4.b, z13.b\n"
+ "ld1b { z23.h }, p4/Z, [x17, #6, MUL VL]\n"
+ "ld1b { z7.h }, p4/Z, [x17, #7, MUL VL]\n"
+ "inch x17, ALL, MUL #8\n"
+ ".inst 0x454d194a // usublb z10.h, z10.b, z13.b\n"
+ "ld1w { z17.s }, p2/Z, [x20]\n"
+ "ld1w { z16.s }, p1/Z, [x20, #1, MUL VL]\n"
+ "uzp1 z8.s, z17.s, z16.s\n"
+ "uzp2 z24.s, z17.s, z16.s\n"
+ "ld1b { z2.h }, p4/Z, [x17]\n"
+ "ldp x27, x26, [x11, #0x0]\n"
+ "addvl x20, x20, #2\n"
+ "str x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldp x25, x24, [x11, #0x10]\n"
+ "ldp x23, x22, [x11, #0x20]\n"
+ "mov z18.d, z8.d\n"
+ "mov z0.d, z24.d\n"
+ "ldp x21, x20, [x11, #0x30]\n"
+ "ld1b { z21.h }, p3/Z, [x27, x7]\n"
+ "mov z15.d, z8.d\n"
+ "mov z1.d, z24.d\n"
+ "ld1b { z22.h }, p3/Z, [x26, x7]\n"
+ "ld1b { z11.h }, p3/Z, [x25, x7]\n"
+ "mov z5.d, z8.d\n"
+ "mov z6.d, z24.d\n"
+ "ld1b { z20.h }, p3/Z, [x24, x7]\n"
+ "ld1b { z27.h }, p3/Z, [x23, x7]\n"
+ ".inst 0x454d1863 // usublb z3.h, z3.b, z13.b\n"
+ ".inst 0x454d1af7 // usublb z23.h, z23.b, z13.b\n"
+ "ld1b { z28.h }, p3/Z, [x22, x7]\n"
+ "ld1b { z16.h }, p3/Z, [x21, x7]\n"
+ ".inst 0x454d18e7 // usublb z7.h, z7.b, z13.b\n"
+ ".inst 0x454d1842 // usublb z2.h, z2.b, z13.b\n"
+ "ld1b { z31.h }, p3/Z, [x20, x7]\n"
+ ".inst 0x455a1ab5 // usublb z21.h, z21.b, z26.b\n"
+ ".inst 0x455a1ad6 // usublb z22.h, z22.b, z26.b\n"
+ ".inst 0x455a196b // usublb z11.h, z11.b, z26.b\n"
+ ".inst 0x455a1a94 // usublb z20.h, z20.b, z26.b\n"
+ ".inst 0x455a1b7b // usublb z27.h, z27.b, z26.b\n"
+ ".inst 0x455a1b9c // usublb z28.h, z28.b, z26.b\n"
+ ".inst 0x455a1a10 // usublb z16.h, z16.b, z26.b\n"
+ ".inst 0x455a1bff // usublb z31.h, z31.b, z26.b\n"
+ "b.any 1b\n"
+ :
+ : [offsetof_Params_bias] "I" (offsetof(Params, bias)), [offsetof_Params_inptrs] "I" (offsetof(Params, inptrs)), [offsetof_Params_n_channels] "I" (offsetof(Params, n_channels)), [offsetof_Params_outptrs] "I" (offsetof(Params, outptrs)), [offsetof_Params_requant] "I" (offsetof(Params, requant)), [offsetof_Params_requant_muls] "I" (offsetof(Params, requant_muls)), [offsetof_Params_requant_shifts] "I" (offsetof(Params, requant_shifts)), [offsetof_Params_weights] "I" (offsetof(Params, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [params] "r" (&params)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
new file mode 100644
index 0000000000..dfaa059e9f
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "utils.hpp"
+#include "src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl(unsigned int, const uint8_t *const *, const uint8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, uint8_t *const *);
+
+// Strategy descriptor that binds the generated SVE kernel declared above to
+// the depthwise depth-first framework: 5x5 kernel, stride 1, one 2x2 output
+// tile per call, uint8 quantized input/weights/output with int32 accumulators.
+class sve_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy<uint8_t, uint8_t, uint8_t, int32_t>
+{
+ using Parent = DepthwiseDepthfirstStrategy<uint8_t, uint8_t, uint8_t, int32_t>;
+
+ public:
+ constexpr static unsigned int kernel_rows = 5;
+ constexpr static unsigned int kernel_cols = 5;
+
+ constexpr static unsigned int stride_rows = 1;
+ constexpr static unsigned int stride_cols = 1;
+
+ // Parent ctor arguments (2, 2, 5, 5, 1, 1) mirror the constants above --
+ // presumably (output rows, output cols, kernel rows, kernel cols, stride
+ // rows, stride cols); confirm against DepthwiseDepthfirstStrategy.
+ sve_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst(const CPUInfo *) : Parent(2, 2, 5, 5, 1, 1) {}
+
+ arm_gemm::VLType get_vl_type(void) const override { return arm_gemm::VLType::SVE; }
+
+ Parent::KernelType kernel = sve_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl;
+ Parent::KernelType get_kernel(void) const override { return kernel; }
+ // Returns 2: per the method name, the kernel works on two vector-lengths of
+ // channels per accumulator pass (the asm keeps paired p2/p1 predicated halves).
+ unsigned int get_accumulator_depth_vl(void) const override { return 2; }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp
new file mode 100644
index 0000000000..b8adbb8262
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp
@@ -0,0 +1,652 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_gemm.hpp"
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+// SVE kernel: uint8 quantized depthwise 5x5 convolution, stride 1, producing
+// a 2x2 output tile over n_channels (NHWC layout per the filename). Inputs
+// are the 36 (6x6) input-point pointers, packed weights, per-channel bias and
+// requantize multipliers/shifts, plus the Requantize32 parameters (zero
+// points, c_offset, min/max clamp). The assembly block below is generated --
+// do not hand-edit; regenerate instead.
+void sve_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl(
+ const unsigned int n_channels,
+ const uint8_t *const *const inptrs,
+ const uint8_t *const weights,
+ const int32_t *const bias,
+ const arm_gemm::Requantize32 &qp,
+ const int32_t *const requant_muls,
+ const int32_t *const requant_shifts,
+ uint8_t *const *const outptrs
+)
+{
+ // Argument block handed to the assembly through a single pointer; the asm
+ // reads each field via the offsetof_Params_* immediate operands declared in
+ // the input-operand list at the bottom of this function.
+ struct Params
+ {
+ long unsigned int n_channels;
+ const void *weights;
+ const int32_t *bias;
+ const arm_gemm::Requantize32 *requant;
+ const int32_t *const requant_muls;
+ const int32_t *const requant_shifts;
+ uint8_t *const *const outptrs;
+ const uint8_t *inptrs[36];
+
+ Params(
+ long unsigned int n_channels,
+ const uint8_t *const *inptrs_raw,
+ const void *const weights,
+ const int32_t *const bias,
+ const arm_gemm::Requantize32 &qp,
+ const int32_t *const requant_muls,
+ const int32_t *const requant_shifts,
+ uint8_t *const *outptrs
+ ) : n_channels(n_channels), weights(weights), bias(bias),
+ requant(&qp), requant_muls(requant_muls),
+ requant_shifts(requant_shifts), outptrs(outptrs)
+ {
+ // The first 14 input pointers are permuted (indices 0..13 remapped as
+ // below) into the order the generated asm consumes them; entries 14..35
+ // pass through unchanged. The permutation matches the kernel's load
+ // schedule -- do not reorder independently of the asm.
+ inptrs[0] = inptrs_raw[0];
+ inptrs[1] = inptrs_raw[1];
+ inptrs[2] = inptrs_raw[6];
+ inptrs[3] = inptrs_raw[7];
+ inptrs[4] = inptrs_raw[2];
+ inptrs[5] = inptrs_raw[8];
+ inptrs[6] = inptrs_raw[3];
+ inptrs[7] = inptrs_raw[4];
+ inptrs[8] = inptrs_raw[11];
+ inptrs[9] = inptrs_raw[12];
+ inptrs[10] = inptrs_raw[9];
+ inptrs[11] = inptrs_raw[10];
+ inptrs[12] = inptrs_raw[5];
+ inptrs[13] = inptrs_raw[13];
+ inptrs[14] = inptrs_raw[14];
+ inptrs[15] = inptrs_raw[15];
+ inptrs[16] = inptrs_raw[16];
+ inptrs[17] = inptrs_raw[17];
+ inptrs[18] = inptrs_raw[18];
+ inptrs[19] = inptrs_raw[19];
+ inptrs[20] = inptrs_raw[20];
+ inptrs[21] = inptrs_raw[21];
+ inptrs[22] = inptrs_raw[22];
+ inptrs[23] = inptrs_raw[23];
+ inptrs[24] = inptrs_raw[24];
+ inptrs[25] = inptrs_raw[25];
+ inptrs[26] = inptrs_raw[26];
+ inptrs[27] = inptrs_raw[27];
+ inptrs[28] = inptrs_raw[28];
+ inptrs[29] = inptrs_raw[29];
+ inptrs[30] = inptrs_raw[30];
+ inptrs[31] = inptrs_raw[31];
+ inptrs[32] = inptrs_raw[32];
+ inptrs[33] = inptrs_raw[33];
+ inptrs[34] = inptrs_raw[34];
+ inptrs[35] = inptrs_raw[35];
+
+ }
+ };
+
+ const Params params(n_channels, inptrs, weights, bias, qp,
+ requant_muls, requant_shifts, outptrs);
+
+ // Generated SVE assembly. Channel loop "1:" runs whilelt-predicated over
+ // n_channels (x3); uint8 data/weights are widened with usublb (subtracting
+ // the a/b zero points held in z30/z10), accumulated with smlalb/smlalt,
+ // requantized with sqrdmulh+srshl, offset by c_offset (z15) and clamped to
+ // [minval, maxval] (z12/z13) before the narrowing stores. NOTE(review):
+ // many opcodes are emitted as raw ".inst" words with the mnemonic in a
+ // trailing comment -- the encodings, not the comments, are authoritative.
+ __asm__ __volatile__(
+ "mov x2, #0x0\n"
+ "mov x24, x2\n"
+ "ldr x23, [%x[params], %[offsetof_Params_requant]]\n"
+ "ldr x3, [%x[params], %[offsetof_Params_n_channels]]\n"
+ "ptrue p4.b\n"
+ "ldr x22, [%x[params], %[offsetof_Params_outptrs]]\n"
+ "incw x24\n"
+ "ldr x4, [%x[params], %[offsetof_Params_weights]]\n"
+ "add x21, x23, %[offsetof_Requantize32_a_offset]\n"
+ "add x20, x23, %[offsetof_Requantize32_b_offset]\n"
+ "ld1rb { z30.b }, p4/Z, [x21]\n"
+ "ld1rb { z10.b }, p4/Z, [x20]\n"
+ "add x21, x23, %[offsetof_Requantize32_c_offset]\n"
+ "add x20, x23, %[offsetof_Requantize32_minval]\n"
+ "ld1rh { z15.h }, p4/Z, [x21]\n"
+ "ld1rh { z12.h }, p4/Z, [x20]\n"
+ "add x20, x23, %[offsetof_Requantize32_maxval]\n"
+ "ld1rh { z13.h }, p4/Z, [x20]\n"
+ "ldp x5, x6, [x22, #0x0]\n"
+ "whilelt p3.h, x2, x3\n"
+ "ldp x7, x8, [x22, #0x10]\n"
+ "whilelt p2.s, x2, x3\n"
+ "whilelt p1.s, x24, x3\n"
+ "ldr x10, [%x[params], %[offsetof_Params_bias]]\n"
+ "add x17, %x[params], %[offsetof_Params_inptrs]\n"
+ "ld1w { z17.s }, p2/Z, [x10]\n"
+ "ld1w { z16.s }, p1/Z, [x10, #1, MUL VL]\n"
+ "uzp1 z14.s, z17.s, z16.s\n"
+ "ld1b { z26.h }, p4/Z, [x4]\n"
+ "ld1b { z8.h }, p4/Z, [x4, #1, MUL VL]\n"
+ "uzp2 z23.s, z17.s, z16.s\n"
+ "addvl x10, x10, #2\n"
+ "ld1b { z16.h }, p4/Z, [x4, #2, MUL VL]\n"
+ "ld1b { z21.h }, p4/Z, [x4, #3, MUL VL]\n"
+ "mov x16, #0x0\n"
+ "mov z6.d, z14.d\n"
+ "ld1b { z17.h }, p4/Z, [x4, #4, MUL VL]\n"
+ "ldp x9, x28, [x17, #0x0]\n"
+ "mov z18.d, z23.d\n"
+ "mov z9.d, z14.d\n"
+ "ldp x27, x26, [x17, #0x10]\n"
+ "ldp x25, x24, [x17, #0x20]\n"
+ "mov z20.d, z23.d\n"
+ "mov z7.d, z14.d\n"
+ "ldp x23, x22, [x17, #0x30]\n"
+ "ldp x21, x20, [x17, #0x40]\n"
+ "mov z1.d, z23.d\n"
+ ".inst 0x454a1b5a // usublb z26.h, z26.b, z10.b\n"
+ "ld1b { z22.h }, p3/Z, [x9, x2]\n"
+ "ld1b { z2.h }, p3/Z, [x28, x2]\n"
+ ".inst 0x454a1908 // usublb z8.h, z8.b, z10.b\n"
+ ".inst 0x454a1a10 // usublb z16.h, z16.b, z10.b\n"
+ "ld1b { z11.h }, p3/Z, [x27, x2]\n"
+ "ld1b { z3.h }, p3/Z, [x26, x2]\n"
+ ".inst 0x454a1ab5 // usublb z21.h, z21.b, z10.b\n"
+ ".inst 0x454a1a31 // usublb z17.h, z17.b, z10.b\n"
+ "ld1b { z29.h }, p3/Z, [x25, x2]\n"
+ "ld1b { z4.h }, p3/Z, [x24, x2]\n"
+ ".inst 0x455e1ad6 // usublb z22.h, z22.b, z30.b\n"
+ ".inst 0x455e1842 // usublb z2.h, z2.b, z30.b\n"
+ "ld1b { z31.h }, p3/Z, [x23, x2]\n"
+ "ld1b { z0.h }, p3/Z, [x22, x2]\n"
+ ".inst 0x455e196b // usublb z11.h, z11.b, z30.b\n"
+ ".inst 0x455e1863 // usublb z3.h, z3.b, z30.b\n"
+ "ld1b { z19.h }, p3/Z, [x21, x2]\n"
+ "ld1b { z28.h }, p3/Z, [x20, x2]\n"
+ ".inst 0x455e1bbd // usublb z29.h, z29.b, z30.b\n"
+ ".inst 0x455e1884 // usublb z4.h, z4.b, z30.b\n"
+ "ldr x15, [%x[params], %[offsetof_Params_requant_muls]]\n"
+ "ldr x14, [%x[params], %[offsetof_Params_requant_shifts]]\n"
+ "str x10, [%x[params], %[offsetof_Params_bias]]\n"
+ ".inst 0x455e1bff // usublb z31.h, z31.b, z30.b\n"
+ ".inst 0x455e1800 // usublb z0.h, z0.b, z30.b\n"
+ ".inst 0x455e1a73 // usublb z19.h, z19.b, z30.b\n"
+ ".inst 0x455e1b9c // usublb z28.h, z28.b, z30.b\n"
+ "1:" // Loop
+ ".inst 0x449a42ce // smlalb z14.s, p4/M, z22.h, z26.h\n"
+ ".inst 0x449a46d7 // smlalt z23.s, p4/M, z22.h, z26.h\n"
+ "ldr x20, [x17, #0x50]\n"
+ "ld1b { z27.h }, p3/Z, [x20, x2]\n"
+ ".inst 0x4488404e // smlalb z14.s, p4/M, z2.h, z8.h\n"
+ ".inst 0x449a4046 // smlalb z6.s, p4/M, z2.h, z26.h\n"
+ "ldr x20, [x17, #0x58]\n"
+ ".inst 0x455e1b7b // usublb z27.h, z27.b, z30.b\n"
+ ".inst 0x449a4169 // smlalb z9.s, p4/M, z11.h, z26.h\n"
+ ".inst 0x449a4067 // smlalb z7.s, p4/M, z3.h, z26.h\n"
+ "ld1b { z5.h }, p3/Z, [x20, x2]\n"
+ "ldr x20, [x17, #0x60]\n"
+ ".inst 0x44884457 // smlalt z23.s, p4/M, z2.h, z8.h\n"
+ ".inst 0x449043ae // smlalb z14.s, p4/M, z29.h, z16.h\n"
+ "ld1b { z25.h }, p4/Z, [x4, #5, MUL VL]\n"
+ ".inst 0x455e18a5 // usublb z5.h, z5.b, z30.b\n"
+ ".inst 0x449a4452 // smlalt z18.s, p4/M, z2.h, z26.h\n"
+ ".inst 0x449a4574 // smlalt z20.s, p4/M, z11.h, z26.h\n"
+ "ld1b { z22.h }, p3/Z, [x20, x2]\n"
+ ".inst 0x454a1b39 // usublb z25.h, z25.b, z10.b\n"
+ ".inst 0x449a4461 // smlalt z1.s, p4/M, z3.h, z26.h\n"
+ ".inst 0x448843a6 // smlalb z6.s, p4/M, z29.h, z8.h\n"
+ "ldr x20, [x17, #0x68]\n"
+ "ld1b { z2.h }, p4/Z, [x4, #6, MUL VL]\n"
+ ".inst 0x44884069 // smlalb z9.s, p4/M, z3.h, z8.h\n"
+ ".inst 0x44884087 // smlalb z7.s, p4/M, z4.h, z8.h\n"
+ ".inst 0x455e1ad6 // usublb z22.h, z22.b, z30.b\n"
+ "ld1b { z26.h }, p3/Z, [x20, x2]\n"
+ ".inst 0x449047b7 // smlalt z23.s, p4/M, z29.h, z16.h\n"
+ ".inst 0x449543ee // smlalb z14.s, p4/M, z31.h, z21.h\n"
+ ".inst 0x454a1842 // usublb z2.h, z2.b, z10.b\n"
+ "ldr x20, [x17, #0x70]\n"
+ ".inst 0x448847b2 // smlalt z18.s, p4/M, z29.h, z8.h\n"
+ ".inst 0x44884474 // smlalt z20.s, p4/M, z3.h, z8.h\n"
+ "ld1b { z29.h }, p4/Z, [x4, #7, MUL VL]\n"
+ ".inst 0x455e1b5a // usublb z26.h, z26.b, z30.b\n"
+ ".inst 0x44884481 // smlalt z1.s, p4/M, z4.h, z8.h\n"
+ ".inst 0x449043e6 // smlalb z6.s, p4/M, z31.h, z16.h\n"
+ "inch x4, ALL, MUL #8\n"
+ "ld1b { z8.h }, p3/Z, [x20, x2]\n"
+ ".inst 0x44904089 // smlalb z9.s, p4/M, z4.h, z16.h\n"
+ ".inst 0x44904367 // smlalb z7.s, p4/M, z27.h, z16.h\n"
+ ".inst 0x454a1bbd // usublb z29.h, z29.b, z10.b\n"
+ "ldr x20, [x17, #0x78]\n"
+ ".inst 0x449547f7 // smlalt z23.s, p4/M, z31.h, z21.h\n"
+ ".inst 0x4491400e // smlalb z14.s, p4/M, z0.h, z17.h\n"
+ "ld1b { z24.h }, p4/Z, [x4]\n"
+ ".inst 0x455e1908 // usublb z8.h, z8.b, z30.b\n"
+ ".inst 0x449047f2 // smlalt z18.s, p4/M, z31.h, z16.h\n"
+ ".inst 0x44904494 // smlalt z20.s, p4/M, z4.h, z16.h\n"
+ "ld1b { z31.h }, p3/Z, [x20, x2]\n"
+ ".inst 0x454a1b18 // usublb z24.h, z24.b, z10.b\n"
+ ".inst 0x44904761 // smlalt z1.s, p4/M, z27.h, z16.h\n"
+ ".inst 0x44954006 // smlalb z6.s, p4/M, z0.h, z21.h\n"
+ "ldr x22, [x17, #0x80]\n"
+ "ld1b { z16.h }, p4/Z, [x4, #1, MUL VL]\n"
+ ".inst 0x44954369 // smlalb z9.s, p4/M, z27.h, z21.h\n"
+ ".inst 0x449540a7 // smlalb z7.s, p4/M, z5.h, z21.h\n"
+ ".inst 0x455e1bff // usublb z31.h, z31.b, z30.b\n"
+ "ldr x21, [x17, #0x88]\n"
+ ".inst 0x44914417 // smlalt z23.s, p4/M, z0.h, z17.h\n"
+ ".inst 0x4499416e // smlalb z14.s, p4/M, z11.h, z25.h\n"
+ ".inst 0x454a1a10 // usublb z16.h, z16.b, z10.b\n"
+ "ldr x20, [x17, #0x90]\n"
+ ".inst 0x44954412 // smlalt z18.s, p4/M, z0.h, z21.h\n"
+ ".inst 0x44954774 // smlalt z20.s, p4/M, z27.h, z21.h\n"
+ "ld1b { z0.h }, p3/Z, [x22, x2]\n"
+ ".inst 0x455e1800 // usublb z0.h, z0.b, z30.b\n"
+ ".inst 0x449544a1 // smlalt z1.s, p4/M, z5.h, z21.h\n"
+ ".inst 0x449142c6 // smlalb z6.s, p4/M, z22.h, z17.h\n"
+ "ld1b { z21.h }, p4/Z, [x4, #2, MUL VL]\n"
+ ".inst 0x454a1ab5 // usublb z21.h, z21.b, z10.b\n"
+ ".inst 0x449140a9 // smlalb z9.s, p4/M, z5.h, z17.h\n"
+ ".inst 0x44914267 // smlalb z7.s, p4/M, z19.h, z17.h\n"
+ "ldr x23, [x17, #0x98]\n"
+ "ldr x22, [x17, #0xa0]\n"
+ ".inst 0x44994577 // smlalt z23.s, p4/M, z11.h, z25.h\n"
+ ".inst 0x4482406e // smlalb z14.s, p4/M, z3.h, z2.h\n"
+ "ld1b { z11.h }, p3/Z, [x21, x2]\n"
+ ".inst 0x455e196b // usublb z11.h, z11.b, z30.b\n"
+ ".inst 0x449146d2 // smlalt z18.s, p4/M, z22.h, z17.h\n"
+ ".inst 0x449144b4 // smlalt z20.s, p4/M, z5.h, z17.h\n"
+ "ld1b { z22.h }, p4/Z, [x4, #3, MUL VL]\n"
+ ".inst 0x454a1ad6 // usublb z22.h, z22.b, z10.b\n"
+ ".inst 0x44914661 // smlalt z1.s, p4/M, z19.h, z17.h\n"
+ ".inst 0x44994066 // smlalb z6.s, p4/M, z3.h, z25.h\n"
+ "ld1b { z17.h }, p3/Z, [x20, x2]\n"
+ ".inst 0x455e1a31 // usublb z17.h, z17.b, z30.b\n"
+ ".inst 0x44994389 // smlalb z9.s, p4/M, z28.h, z25.h\n"
+ ".inst 0x44994347 // smlalb z7.s, p4/M, z26.h, z25.h\n"
+ "ldr x20, [x17, #0xa8]\n"
+ "ldr x21, [x17, #0xb0]\n"
+ ".inst 0x44824477 // smlalt z23.s, p4/M, z3.h, z2.h\n"
+ ".inst 0x449d408e // smlalb z14.s, p4/M, z4.h, z29.h\n"
+ "ldr x13, [x17, #0xb8]\n"
+ "ldr x12, [x17, #0xc0]\n"
+ ".inst 0x44994472 // smlalt z18.s, p4/M, z3.h, z25.h\n"
+ ".inst 0x44994794 // smlalt z20.s, p4/M, z28.h, z25.h\n"
+ "ld1b { z3.h }, p3/Z, [x23, x2]\n"
+ ".inst 0x455e1863 // usublb z3.h, z3.b, z30.b\n"
+ ".inst 0x44994741 // smlalt z1.s, p4/M, z26.h, z25.h\n"
+ ".inst 0x44824086 // smlalb z6.s, p4/M, z4.h, z2.h\n"
+ "ld1b { z25.h }, p4/Z, [x4, #4, MUL VL]\n"
+ ".inst 0x454a1b39 // usublb z25.h, z25.b, z10.b\n"
+ ".inst 0x44824349 // smlalb z9.s, p4/M, z26.h, z2.h\n"
+ ".inst 0x44824107 // smlalb z7.s, p4/M, z8.h, z2.h\n"
+ "ldr x11, [x17, #0xc8]\n"
+ "ldr x10, [x17, #0xd0]\n"
+ ".inst 0x449d4497 // smlalt z23.s, p4/M, z4.h, z29.h\n"
+ ".inst 0x4498436e // smlalb z14.s, p4/M, z27.h, z24.h\n"
+ "ldr x9, [x17, #0xd8]\n"
+ "ldr x28, [x17, #0xe0]\n"
+ ".inst 0x44824492 // smlalt z18.s, p4/M, z4.h, z2.h\n"
+ ".inst 0x44824754 // smlalt z20.s, p4/M, z26.h, z2.h\n"
+ "ld1b { z4.h }, p3/Z, [x22, x2]\n"
+ ".inst 0x455e1884 // usublb z4.h, z4.b, z30.b\n"
+ ".inst 0x44824501 // smlalt z1.s, p4/M, z8.h, z2.h\n"
+ ".inst 0x449d4366 // smlalb z6.s, p4/M, z27.h, z29.h\n"
+ "ld1b { z2.h }, p4/Z, [x4, #5, MUL VL]\n"
+ ".inst 0x454a1842 // usublb z2.h, z2.b, z10.b\n"
+ ".inst 0x449d4109 // smlalb z9.s, p4/M, z8.h, z29.h\n"
+ ".inst 0x449d43e7 // smlalb z7.s, p4/M, z31.h, z29.h\n"
+ "ldr x27, [x17, #0xe8]\n"
+ "ldr x26, [x17, #0xf0]\n"
+ ".inst 0x44984777 // smlalt z23.s, p4/M, z27.h, z24.h\n"
+ ".inst 0x449040ae // smlalb z14.s, p4/M, z5.h, z16.h\n"
+ "ldr x25, [x17, #0xf8]\n"
+ "ldr x24, [x17, #0x100]\n"
+ ".inst 0x449d4772 // smlalt z18.s, p4/M, z27.h, z29.h\n"
+ ".inst 0x449d4514 // smlalt z20.s, p4/M, z8.h, z29.h\n"
+ "ld1b { z27.h }, p3/Z, [x20, x2]\n"
+ ".inst 0x455e1b7b // usublb z27.h, z27.b, z30.b\n"
+ ".inst 0x449d47e1 // smlalt z1.s, p4/M, z31.h, z29.h\n"
+ ".inst 0x449840a6 // smlalb z6.s, p4/M, z5.h, z24.h\n"
+ "ld1b { z29.h }, p4/Z, [x4, #6, MUL VL]\n"
+ ".inst 0x454a1bbd // usublb z29.h, z29.b, z10.b\n"
+ ".inst 0x449843e9 // smlalb z9.s, p4/M, z31.h, z24.h\n"
+ ".inst 0x44984007 // smlalb z7.s, p4/M, z0.h, z24.h\n"
+ "ldr x23, [x17, #0x108]\n"
+ "ldr x22, [x17, #0x110]\n"
+ ".inst 0x449044b7 // smlalt z23.s, p4/M, z5.h, z16.h\n"
+ ".inst 0x4495438e // smlalb z14.s, p4/M, z28.h, z21.h\n"
+ "ldr x20, [x17, #0x118]\n"
+ "whilelt p0.h, x16, x3\n"
+ ".inst 0x449844b2 // smlalt z18.s, p4/M, z5.h, z24.h\n"
+ ".inst 0x449847f4 // smlalt z20.s, p4/M, z31.h, z24.h\n"
+ "ld1b { z5.h }, p3/Z, [x21, x2]\n"
+ ".inst 0x455e18a5 // usublb z5.h, z5.b, z30.b\n"
+ ".inst 0x44984401 // smlalt z1.s, p4/M, z0.h, z24.h\n"
+ ".inst 0x44904266 // smlalb z6.s, p4/M, z19.h, z16.h\n"
+ "ld1b { z24.h }, p4/Z, [x4, #7, MUL VL]\n"
+ "inch x4, ALL, MUL #8\n"
+ ".inst 0x44904009 // smlalb z9.s, p4/M, z0.h, z16.h\n"
+ ".inst 0x44904167 // smlalb z7.s, p4/M, z11.h, z16.h\n"
+ ".inst 0x454a1b18 // usublb z24.h, z24.b, z10.b\n"
+ "ldr x21, [%x[params], %[offsetof_Params_bias]]\n"
+ ".inst 0x44954797 // smlalt z23.s, p4/M, z28.h, z21.h\n"
+ ".inst 0x4496434e // smlalb z14.s, p4/M, z26.h, z22.h\n"
+ "ld1b { z28.h }, p3/Z, [x13, x2]\n"
+ ".inst 0x455e1b9c // usublb z28.h, z28.b, z30.b\n"
+ ".inst 0x44904672 // smlalt z18.s, p4/M, z19.h, z16.h\n"
+ ".inst 0x44904414 // smlalt z20.s, p4/M, z0.h, z16.h\n"
+ "ld1b { z19.h }, p4/Z, [x4]\n"
+ ".inst 0x454a1a73 // usublb z19.h, z19.b, z10.b\n"
+ ".inst 0x44904561 // smlalt z1.s, p4/M, z11.h, z16.h\n"
+ ".inst 0x44954346 // smlalb z6.s, p4/M, z26.h, z21.h\n"
+ "ld1b { z16.h }, p3/Z, [x12, x2]\n"
+ ".inst 0x455e1a10 // usublb z16.h, z16.b, z30.b\n"
+ ".inst 0x44954229 // smlalb z9.s, p4/M, z17.h, z21.h\n"
+ ".inst 0x44954067 // smlalb z7.s, p4/M, z3.h, z21.h\n"
+ ".inst 0x44964757 // smlalt z23.s, p4/M, z26.h, z22.h\n"
+ ".inst 0x4499410e // smlalb z14.s, p4/M, z8.h, z25.h\n"
+ ".inst 0x44954752 // smlalt z18.s, p4/M, z26.h, z21.h\n"
+ ".inst 0x44954634 // smlalt z20.s, p4/M, z17.h, z21.h\n"
+ "ld1b { z26.h }, p3/Z, [x11, x2]\n"
+ ".inst 0x455e1b5a // usublb z26.h, z26.b, z30.b\n"
+ ".inst 0x44954461 // smlalt z1.s, p4/M, z3.h, z21.h\n"
+ ".inst 0x44964106 // smlalb z6.s, p4/M, z8.h, z22.h\n"
+ "ld1b { z21.h }, p4/Z, [x4, #1, MUL VL]\n"
+ ".inst 0x454a1ab5 // usublb z21.h, z21.b, z10.b\n"
+ ".inst 0x44964069 // smlalb z9.s, p4/M, z3.h, z22.h\n"
+ ".inst 0x44964087 // smlalb z7.s, p4/M, z4.h, z22.h\n"
+ ".inst 0x44994517 // smlalt z23.s, p4/M, z8.h, z25.h\n"
+ ".inst 0x448243ee // smlalb z14.s, p4/M, z31.h, z2.h\n"
+ ".inst 0x44964512 // smlalt z18.s, p4/M, z8.h, z22.h\n"
+ ".inst 0x44964474 // smlalt z20.s, p4/M, z3.h, z22.h\n"
+ "ld1b { z8.h }, p3/Z, [x10, x2]\n"
+ ".inst 0x455e1908 // usublb z8.h, z8.b, z30.b\n"
+ ".inst 0x44964481 // smlalt z1.s, p4/M, z4.h, z22.h\n"
+ ".inst 0x449943e6 // smlalb z6.s, p4/M, z31.h, z25.h\n"
+ "ld1b { z22.h }, p4/Z, [x4, #2, MUL VL]\n"
+ ".inst 0x454a1ad6 // usublb z22.h, z22.b, z10.b\n"
+ ".inst 0x44994089 // smlalb z9.s, p4/M, z4.h, z25.h\n"
+ ".inst 0x44994367 // smlalb z7.s, p4/M, z27.h, z25.h\n"
+ ".inst 0x448247f7 // smlalt z23.s, p4/M, z31.h, z2.h\n"
+ ".inst 0x449d400e // smlalb z14.s, p4/M, z0.h, z29.h\n"
+ ".inst 0x449947f2 // smlalt z18.s, p4/M, z31.h, z25.h\n"
+ ".inst 0x44994494 // smlalt z20.s, p4/M, z4.h, z25.h\n"
+ "ld1b { z31.h }, p3/Z, [x9, x2]\n"
+ ".inst 0x455e1bff // usublb z31.h, z31.b, z30.b\n"
+ ".inst 0x44994761 // smlalt z1.s, p4/M, z27.h, z25.h\n"
+ ".inst 0x44824006 // smlalb z6.s, p4/M, z0.h, z2.h\n"
+ "ld1b { z25.h }, p4/Z, [x4, #3, MUL VL]\n"
+ ".inst 0x454a1b39 // usublb z25.h, z25.b, z10.b\n"
+ ".inst 0x44824369 // smlalb z9.s, p4/M, z27.h, z2.h\n"
+ ".inst 0x448240a7 // smlalb z7.s, p4/M, z5.h, z2.h\n"
+ ".inst 0x449d4417 // smlalt z23.s, p4/M, z0.h, z29.h\n"
+ ".inst 0x4498422e // smlalb z14.s, p4/M, z17.h, z24.h\n"
+ ".inst 0x44824412 // smlalt z18.s, p4/M, z0.h, z2.h\n"
+ ".inst 0x44824774 // smlalt z20.s, p4/M, z27.h, z2.h\n"
+ "ld1b { z0.h }, p3/Z, [x28, x2]\n"
+ ".inst 0x455e1800 // usublb z0.h, z0.b, z30.b\n"
+ ".inst 0x448244a1 // smlalt z1.s, p4/M, z5.h, z2.h\n"
+ ".inst 0x449d4166 // smlalb z6.s, p4/M, z11.h, z29.h\n"
+ "ld1b { z2.h }, p4/Z, [x4, #4, MUL VL]\n"
+ ".inst 0x454a1842 // usublb z2.h, z2.b, z10.b\n"
+ ".inst 0x449d40a9 // smlalb z9.s, p4/M, z5.h, z29.h\n"
+ ".inst 0x449d4387 // smlalb z7.s, p4/M, z28.h, z29.h\n"
+ ".inst 0x44984637 // smlalt z23.s, p4/M, z17.h, z24.h\n"
+ ".inst 0x4493406e // smlalb z14.s, p4/M, z3.h, z19.h\n"
+ "ld1b { z17.h }, p3/Z, [x27, x2]\n"
+ ".inst 0x455e1a31 // usublb z17.h, z17.b, z30.b\n"
+ ".inst 0x449d4572 // smlalt z18.s, p4/M, z11.h, z29.h\n"
+ ".inst 0x449d44b4 // smlalt z20.s, p4/M, z5.h, z29.h\n"
+ "ld1b { z11.h }, p4/Z, [x4, #5, MUL VL]\n"
+ ".inst 0x454a196b // usublb z11.h, z11.b, z10.b\n"
+ ".inst 0x449d4781 // smlalt z1.s, p4/M, z28.h, z29.h\n"
+ ".inst 0x44984066 // smlalb z6.s, p4/M, z3.h, z24.h\n"
+ "ld1b { z29.h }, p3/Z, [x26, x2]\n"
+ ".inst 0x455e1bbd // usublb z29.h, z29.b, z30.b\n"
+ ".inst 0x44984209 // smlalb z9.s, p4/M, z16.h, z24.h\n"
+ ".inst 0x44984347 // smlalb z7.s, p4/M, z26.h, z24.h\n"
+ ".inst 0x44934477 // smlalt z23.s, p4/M, z3.h, z19.h\n"
+ ".inst 0x4495408e // smlalb z14.s, p4/M, z4.h, z21.h\n"
+ ".inst 0x44984472 // smlalt z18.s, p4/M, z3.h, z24.h\n"
+ ".inst 0x44984614 // smlalt z20.s, p4/M, z16.h, z24.h\n"
+ "ld1b { z3.h }, p3/Z, [x25, x2]\n"
+ ".inst 0x455e1863 // usublb z3.h, z3.b, z30.b\n"
+ ".inst 0x44984741 // smlalt z1.s, p4/M, z26.h, z24.h\n"
+ ".inst 0x44934086 // smlalb z6.s, p4/M, z4.h, z19.h\n"
+ "ld1b { z24.h }, p4/Z, [x4, #6, MUL VL]\n"
+ ".inst 0x454a1b18 // usublb z24.h, z24.b, z10.b\n"
+ ".inst 0x44934349 // smlalb z9.s, p4/M, z26.h, z19.h\n"
+ ".inst 0x44934107 // smlalb z7.s, p4/M, z8.h, z19.h\n"
+ ".inst 0x44954497 // smlalt z23.s, p4/M, z4.h, z21.h\n"
+ ".inst 0x4496436e // smlalb z14.s, p4/M, z27.h, z22.h\n"
+ ".inst 0x44934492 // smlalt z18.s, p4/M, z4.h, z19.h\n"
+ ".inst 0x44934754 // smlalt z20.s, p4/M, z26.h, z19.h\n"
+ "ld1b { z4.h }, p3/Z, [x24, x2]\n"
+ ".inst 0x455e1884 // usublb z4.h, z4.b, z30.b\n"
+ ".inst 0x44934501 // smlalt z1.s, p4/M, z8.h, z19.h\n"
+ ".inst 0x44954366 // smlalb z6.s, p4/M, z27.h, z21.h\n"
+ "ld1b { z19.h }, p4/Z, [x4, #7, MUL VL]\n"
+ "inch x4, ALL, MUL #8\n"
+ ".inst 0x44954109 // smlalb z9.s, p4/M, z8.h, z21.h\n"
+ ".inst 0x449543e7 // smlalb z7.s, p4/M, z31.h, z21.h\n"
+ ".inst 0x454a1a73 // usublb z19.h, z19.b, z10.b\n"
+ ".inst 0x44964777 // smlalt z23.s, p4/M, z27.h, z22.h\n"
+ ".inst 0x449940ae // smlalb z14.s, p4/M, z5.h, z25.h\n"
+ ".inst 0x44954772 // smlalt z18.s, p4/M, z27.h, z21.h\n"
+ ".inst 0x44954514 // smlalt z20.s, p4/M, z8.h, z21.h\n"
+ "ld1b { z27.h }, p3/Z, [x23, x2]\n"
+ ".inst 0x455e1b7b // usublb z27.h, z27.b, z30.b\n"
+ ".inst 0x449547e1 // smlalt z1.s, p4/M, z31.h, z21.h\n"
+ ".inst 0x449640a6 // smlalb z6.s, p4/M, z5.h, z22.h\n"
+ "ld1b { z21.h }, p4/Z, [x4]\n"
+ ".inst 0x454a1ab5 // usublb z21.h, z21.b, z10.b\n"
+ ".inst 0x449643e9 // smlalb z9.s, p4/M, z31.h, z22.h\n"
+ ".inst 0x44964007 // smlalb z7.s, p4/M, z0.h, z22.h\n"
+ "inch x4\n"
+ ".inst 0x449944b7 // smlalt z23.s, p4/M, z5.h, z25.h\n"
+ ".inst 0x4482420e // smlalb z14.s, p4/M, z16.h, z2.h\n"
+ ".inst 0x449644b2 // smlalt z18.s, p4/M, z5.h, z22.h\n"
+ ".inst 0x449647f4 // smlalt z20.s, p4/M, z31.h, z22.h\n"
+ "ld1b { z5.h }, p3/Z, [x22, x2]\n"
+ ".inst 0x455e18a5 // usublb z5.h, z5.b, z30.b\n"
+ ".inst 0x44964401 // smlalt z1.s, p4/M, z0.h, z22.h\n"
+ ".inst 0x44994386 // smlalb z6.s, p4/M, z28.h, z25.h\n"
+ "ld1w { z22.s }, p2/Z, [x15]\n"
+ ".inst 0x44994009 // smlalb z9.s, p4/M, z0.h, z25.h\n"
+ ".inst 0x44994227 // smlalb z7.s, p4/M, z17.h, z25.h\n"
+ ".inst 0x44824617 // smlalt z23.s, p4/M, z16.h, z2.h\n"
+ ".inst 0x448b434e // smlalb z14.s, p4/M, z26.h, z11.h\n"
+ "ld1w { z16.s }, p1/Z, [x15, #1, MUL VL]\n"
+ "addvl x15, x15, #2\n"
+ ".inst 0x44994792 // smlalt z18.s, p4/M, z28.h, z25.h\n"
+ ".inst 0x44994414 // smlalt z20.s, p4/M, z0.h, z25.h\n"
+ "ld1b { z28.h }, p3/Z, [x20, x2]\n"
+ ".inst 0x455e1b9c // usublb z28.h, z28.b, z30.b\n"
+ ".inst 0x44994621 // smlalt z1.s, p4/M, z17.h, z25.h\n"
+ ".inst 0x44824346 // smlalb z6.s, p4/M, z26.h, z2.h\n"
+ "uzp1 z25.s, z22.s, z16.s\n"
+ "inch x2\n"
+ ".inst 0x448243a9 // smlalb z9.s, p4/M, z29.h, z2.h\n"
+ ".inst 0x44824067 // smlalb z7.s, p4/M, z3.h, z2.h\n"
+ "uzp2 z16.s, z22.s, z16.s\n"
+ "ld1w { z22.s }, p2/Z, [x14]\n"
+ ".inst 0x448b4757 // smlalt z23.s, p4/M, z26.h, z11.h\n"
+ ".inst 0x4498410e // smlalb z14.s, p4/M, z8.h, z24.h\n"
+ "mov x20, x2\n"
+ "incw x20\n"
+ ".inst 0x44824752 // smlalt z18.s, p4/M, z26.h, z2.h\n"
+ ".inst 0x448247b4 // smlalt z20.s, p4/M, z29.h, z2.h\n"
+ "ld1w { z26.s }, p1/Z, [x14, #1, MUL VL]\n"
+ "uzp1 z29.s, z22.s, z26.s\n"
+ ".inst 0x44824461 // smlalt z1.s, p4/M, z3.h, z2.h\n"
+ ".inst 0x448b4106 // smlalb z6.s, p4/M, z8.h, z11.h\n"
+ "uzp2 z22.s, z22.s, z26.s\n"
+ "whilelt p2.s, x2, x3\n"
+ ".inst 0x448b4069 // smlalb z9.s, p4/M, z3.h, z11.h\n"
+ ".inst 0x448b4087 // smlalb z7.s, p4/M, z4.h, z11.h\n"
+ "whilelt p1.s, x20, x3\n"
+ "whilelt p3.h, x2, x3\n"
+ ".inst 0x44984517 // smlalt z23.s, p4/M, z8.h, z24.h\n"
+ ".inst 0x449343ee // smlalb z14.s, p4/M, z31.h, z19.h\n"
+ "addvl x14, x14, #2\n"
+ ".inst 0x448b4512 // smlalt z18.s, p4/M, z8.h, z11.h\n"
+ ".inst 0x448b4474 // smlalt z20.s, p4/M, z3.h, z11.h\n"
+ ".inst 0x448b4481 // smlalt z1.s, p4/M, z4.h, z11.h\n"
+ ".inst 0x449843e6 // smlalb z6.s, p4/M, z31.h, z24.h\n"
+ ".inst 0x44984089 // smlalb z9.s, p4/M, z4.h, z24.h\n"
+ ".inst 0x44984367 // smlalb z7.s, p4/M, z27.h, z24.h\n"
+ ".inst 0x449347f7 // smlalt z23.s, p4/M, z31.h, z19.h\n"
+ ".inst 0x4495400e // smlalb z14.s, p4/M, z0.h, z21.h\n"
+ ".inst 0x04b975ce // sqrdmulh z14.s, z14.s, z25.s\n"
+ ".inst 0x449847f2 // smlalt z18.s, p4/M, z31.h, z24.h\n"
+ ".inst 0x44984494 // smlalt z20.s, p4/M, z4.h, z24.h\n"
+ "and z3.d, z14.d, z29.d\n"
+ ".inst 0x44984761 // smlalt z1.s, p4/M, z27.h, z24.h\n"
+ ".inst 0x44934006 // smlalb z6.s, p4/M, z0.h, z19.h\n"
+ "asr z3.s, z3.s, #0x1f\n"
+ ".inst 0x44934369 // smlalb z9.s, p4/M, z27.h, z19.h\n"
+ ".inst 0x449340a7 // smlalb z7.s, p4/M, z5.h, z19.h\n"
+ "sqadd z14.s, z14.s, z3.s\n"
+ ".inst 0x448293ae // srshl z14.s, p4/M, z14.s, z29.s\n"
+ ".inst 0x44954417 // smlalt z23.s, p4/M, z0.h, z21.h\n"
+ ".inst 0x44934412 // smlalt z18.s, p4/M, z0.h, z19.h\n"
+ ".inst 0x04b076f7 // sqrdmulh z23.s, z23.s, z16.s\n"
+ ".inst 0x44934774 // smlalt z20.s, p4/M, z27.h, z19.h\n"
+ ".inst 0x449344a1 // smlalt z1.s, p4/M, z5.h, z19.h\n"
+ "and z31.d, z23.d, z22.d\n"
+ ".inst 0x44954226 // smlalb z6.s, p4/M, z17.h, z21.h\n"
+ ".inst 0x449540a9 // smlalb z9.s, p4/M, z5.h, z21.h\n"
+ ".inst 0x04b974c6 // sqrdmulh z6.s, z6.s, z25.s\n"
+ ".inst 0x44954387 // smlalb z7.s, p4/M, z28.h, z21.h\n"
+ ".inst 0x44954632 // smlalt z18.s, p4/M, z17.h, z21.h\n"
+ ".inst 0x04b97529 // sqrdmulh z9.s, z9.s, z25.s\n"
+ ".inst 0x449544b4 // smlalt z20.s, p4/M, z5.h, z21.h\n"
+ ".inst 0x44954781 // smlalt z1.s, p4/M, z28.h, z21.h\n"
+ ".inst 0x04b974e7 // sqrdmulh z7.s, z7.s, z25.s\n"
+ "asr z31.s, z31.s, #0x1f\n"
+ "and z3.d, z6.d, z29.d\n"
+ ".inst 0x04b07652 // sqrdmulh z18.s, z18.s, z16.s\n"
+ "and z0.d, z9.d, z29.d\n"
+ ".inst 0x04b07694 // sqrdmulh z20.s, z20.s, z16.s\n"
+ "and z19.d, z7.d, z29.d\n"
+ ".inst 0x04b07421 // sqrdmulh z1.s, z1.s, z16.s\n"
+ "sqadd z23.s, z23.s, z31.s\n"
+ ".inst 0x448292d7 // srshl z23.s, p4/M, z23.s, z22.s\n"
+ "asr z3.s, z3.s, #0x1f\n"
+ "and z21.d, z18.d, z22.d\n"
+ "asr z0.s, z0.s, #0x1f\n"
+ "and z17.d, z20.d, z22.d\n"
+ "asr z19.s, z19.s, #0x1f\n"
+ "and z16.d, z1.d, z22.d\n"
+ "sqadd z6.s, z6.s, z3.s\n"
+ "asr z21.s, z21.s, #0x1f\n"
+ ".inst 0x448293a6 // srshl z6.s, p4/M, z6.s, z29.s\n"
+ "sqadd z9.s, z9.s, z0.s\n"
+ "asr z17.s, z17.s, #0x1f\n"
+ ".inst 0x448293a9 // srshl z9.s, p4/M, z9.s, z29.s\n"
+ "sqadd z7.s, z7.s, z19.s\n"
+ "asr z16.s, z16.s, #0x1f\n"
+ ".inst 0x448293a7 // srshl z7.s, p4/M, z7.s, z29.s\n"
+ "sqadd z18.s, z18.s, z21.s\n"
+ "sqadd z20.s, z20.s, z17.s\n"
+ ".inst 0x448292d2 // srshl z18.s, p4/M, z18.s, z22.s\n"
+ ".inst 0x448292d4 // srshl z20.s, p4/M, z20.s, z22.s\n"
+ "sqadd z1.s, z1.s, z16.s\n"
+ ".inst 0x453041ce // sqxtnb z14.h, z14.s\n"
+ ".inst 0x448292c1 // srshl z1.s, p4/M, z1.s, z22.s\n"
+ ".inst 0x453040c6 // sqxtnb z6.h, z6.s\n"
+ ".inst 0x45304129 // sqxtnb z9.h, z9.s\n"
+ ".inst 0x453040e7 // sqxtnb z7.h, z7.s\n"
+ ".inst 0x453046ee // sqxtnt z14.h, z23.s\n"
+ ".inst 0x45304646 // sqxtnt z6.h, z18.s\n"
+ ".inst 0x45304689 // sqxtnt z9.h, z20.s\n"
+ ".inst 0x45304427 // sqxtnt z7.h, z1.s\n"
+ "sqadd z14.h, z14.h, z15.h\n"
+ "smax z14.h, p4/M, z14.h, z12.h\n"
+ "smin z14.h, p4/M, z14.h, z13.h\n"
+ "sqadd z6.h, z6.h, z15.h\n"
+ "sqadd z9.h, z9.h, z15.h\n"
+ "smax z6.h, p4/M, z6.h, z12.h\n"
+ "smax z9.h, p4/M, z9.h, z12.h\n"
+ "sqadd z7.h, z7.h, z15.h\n"
+ "smax z7.h, p4/M, z7.h, z12.h\n"
+ "smin z6.h, p4/M, z6.h, z13.h\n"
+ "st1b { z14.h }, p0, [x5, x16]\n"
+ "smin z9.h, p4/M, z9.h, z13.h\n"
+ "smin z7.h, p4/M, z7.h, z13.h\n"
+ "st1b { z6.h }, p0, [x6, x16]\n"
+ "st1b { z9.h }, p0, [x7, x16]\n"
+ "st1b { z7.h }, p0, [x8, x16]\n"
+ "ld1w { z17.s }, p2/Z, [x21]\n"
+ "ld1w { z16.s }, p1/Z, [x21, #1, MUL VL]\n"
+ "uzp1 z14.s, z17.s, z16.s\n"
+ "ld1b { z26.h }, p4/Z, [x4]\n"
+ "ld1b { z8.h }, p4/Z, [x4, #1, MUL VL]\n"
+ "uzp2 z23.s, z17.s, z16.s\n"
+ "addvl x21, x21, #2\n"
+ "ld1b { z16.h }, p4/Z, [x4, #2, MUL VL]\n"
+ "ld1b { z21.h }, p4/Z, [x4, #3, MUL VL]\n"
+ "inch x16\n"
+ "str x21, [%x[params], %[offsetof_Params_bias]]\n"
+ "ld1b { z17.h }, p4/Z, [x4, #4, MUL VL]\n"
+ "ldp x9, x28, [x17, #0x0]\n"
+ "mov z6.d, z14.d\n"
+ "mov z18.d, z23.d\n"
+ "ldp x27, x26, [x17, #0x10]\n"
+ "ldp x25, x24, [x17, #0x20]\n"
+ "mov z9.d, z14.d\n"
+ "mov z20.d, z23.d\n"
+ "ldp x23, x22, [x17, #0x30]\n"
+ "ldp x21, x20, [x17, #0x40]\n"
+ "mov z7.d, z14.d\n"
+ "mov z1.d, z23.d\n"
+ "ld1b { z22.h }, p3/Z, [x9, x2]\n"
+ "ld1b { z2.h }, p3/Z, [x28, x2]\n"
+ ".inst 0x454a1b5a // usublb z26.h, z26.b, z10.b\n"
+ ".inst 0x454a1908 // usublb z8.h, z8.b, z10.b\n"
+ "ld1b { z11.h }, p3/Z, [x27, x2]\n"
+ "ld1b { z3.h }, p3/Z, [x26, x2]\n"
+ ".inst 0x454a1a10 // usublb z16.h, z16.b, z30.b\n"
+ ".inst 0x454a1ab5 // usublb z21.h, z21.b, z10.b\n"
+ "ld1b { z29.h }, p3/Z, [x25, x2]\n"
+ "ld1b { z4.h }, p3/Z, [x24, x2]\n"
+ ".inst 0x454a1a31 // usublb z17.h, z17.b, z10.b\n"
+ ".inst 0x455e1ad6 // usublb z22.h, z22.b, z30.b\n"
+ "ld1b { z31.h }, p3/Z, [x23, x2]\n"
+ "ld1b { z0.h }, p3/Z, [x22, x2]\n"
+ ".inst 0x455e1842 // usublb z2.h, z2.b, z30.b\n"
+ ".inst 0x455e196b // usublb z11.h, z11.b, z30.b\n"
+ "ld1b { z19.h }, p3/Z, [x21, x2]\n"
+ "ld1b { z28.h }, p3/Z, [x20, x2]\n"
+ ".inst 0x455e1863 // usublb z3.h, z3.b, z30.b\n"
+ ".inst 0x455e1bbd // usublb z29.h, z29.b, z30.b\n"
+ ".inst 0x455e1884 // usublb z4.h, z4.b, z30.b\n"
+ ".inst 0x455e1bff // usublb z31.h, z31.b, z30.b\n"
+ ".inst 0x455e1800 // usublb z0.h, z0.b, z30.b\n"
+ ".inst 0x455e1a73 // usublb z19.h, z19.b, z30.b\n"
+ ".inst 0x455e1b9c // usublb z28.h, z28.b, z30.b\n"
+ "b.any 1b\n"
+ :
+ : [offsetof_Params_bias] "I" (offsetof(Params, bias)), [offsetof_Params_inptrs] "I" (offsetof(Params, inptrs)), [offsetof_Params_n_channels] "I" (offsetof(Params, n_channels)), [offsetof_Params_outptrs] "I" (offsetof(Params, outptrs)), [offsetof_Params_requant] "I" (offsetof(Params, requant)), [offsetof_Params_requant_muls] "I" (offsetof(Params, requant_muls)), [offsetof_Params_requant_shifts] "I" (offsetof(Params, requant_shifts)), [offsetof_Params_weights] "I" (offsetof(Params, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [params] "r" (&params)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst.hpp
new file mode 100644
index 0000000000..d5382533a8
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst.hpp
@@ -0,0 +1,61 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst_impl(const uint8_t *const *const, uint8_t *const *const, const void *, unsigned int, const arm_gemm::Requantize32&);
+
+struct sve_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst : DepthfirstMultiplierStrategy<uint8_t, uint8_t, uint8_t, int32_t>
+{
+ using Parent = DepthfirstMultiplierStrategy<uint8_t, uint8_t, uint8_t, int32_t>;
+ constexpr static unsigned int kernel_rows = 3;
+ constexpr static unsigned int kernel_cols = 3;
+
+ constexpr static unsigned int stride_rows = 2;
+ constexpr static unsigned int stride_cols = 2;
+
+ sve_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst(const CPUInfo *)
+ : Parent(2, 4, kernel_rows, kernel_cols, stride_rows, stride_cols)
+ {
+ }
+
+ arm_gemm::VLType get_vl_type() const override { return arm_gemm::VLType::SVE; }
+
+ Parent::KernelType kernel = sve_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst_impl;
+ Parent::KernelType get_kernel(void) const override { return kernel; }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst/generic.cpp
new file mode 100644
index 0000000000..a9cd8a7fa9
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst/generic.cpp
@@ -0,0 +1,339 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+
+#include "arm_gemm.hpp"
+#include <cstddef>
+#include <cstdint>
+
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst_impl(
+ const uint8_t *const *const inptrs,
+ uint8_t *const *const outptrs,
+ const void *params,
+ unsigned int n_output_channels,
+ const arm_gemm::Requantize32& qp
+)
+{
+ __asm__ __volatile__(
+ "mov x20, #0x9\n"
+ "whilelt p0.b, XZR, x20\n"
+ "ldr x23, [%x[inptrs], #0x8]\n"
+ "ldr x20, [%x[inptrs], #0x10]\n"
+ "ldr x22, [%x[inptrs], #0x20]\n"
+ "ldr x21, [%x[inptrs], #0x0]\n"
+ "mov z13.b, #0x1\n"
+ "lsr z13.s, z13.s, #0x8\n"
+ "ld1b { z1.b }, p0/Z, [x23]\n"
+ "ld1b { z2.b }, p0/Z, [x20]\n"
+ "mov z8.d, z1.d\n"
+ "mov z27.d, z1.d\n"
+ "ldr x20, [%x[inptrs], #0x18]\n"
+ "ld1b { z4.b }, p0/Z, [x22]\n"
+ "mov z31.d, z1.d\n"
+ "mov z28.d, z2.d\n"
+ "ld1b { z0.b }, p0/Z, [x21]\n"
+ "mov z30.d, z2.d\n"
+ "mov z26.d, z2.d\n"
+ "ld1b { z3.b }, p0/Z, [x20]\n"
+ "mov z22.d, z4.d\n"
+ "mov z10.d, z4.d\n"
+ "ptrue p2.b\n"
+ "ld1rw { z11.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_b_offset]]\n"
+ "mov z18.d, z4.d\n"
+ "ext z8.b, z8.b, z8.b, #0x2\n"
+ "lsl x10, %x[n_channels], #0x2\n"
+ "neg z11.s, p2/M, z11.s\n"
+ "ext z27.b, z27.b, z27.b, #0x4\n"
+ "ext z31.b, z31.b, z31.b, #0x6\n"
+ "mov x9, #0x0\n"
+ "whilelt p0.b, x9, x10\n"
+ "ext z28.b, z28.b, z28.b, #0x2\n"
+ "ext z30.b, z30.b, z30.b, #0x4\n"
+ "ld1w { z14.s }, p0/Z, [%x[params]]\n"
+ "mov x28, #0x0\n"
+ "ext z26.b, z26.b, z26.b, #0x6\n"
+ "ext z22.b, z22.b, z22.b, #0x2\n"
+ "ldp x27, x26, [%x[outptrs], #0x0]\n"
+ "ldp x25, x24, [%x[outptrs], #0x10]\n"
+ "ext z10.b, z10.b, z10.b, #0x4\n"
+ "ext z18.b, z18.b, z18.b, #0x6\n"
+ "ldp x23, x22, [%x[outptrs], #0x20]\n"
+ "ldp x21, x20, [%x[outptrs], #0x30]\n"
+ "mov z21.d, z0.d\n"
+ "mov z20.d, z0.d\n"
+ "ld1rw { z9.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_c_offset]]\n"
+ "ld1rw { z15.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_minval]]\n"
+ "mov z19.d, z0.d\n"
+ "mov z24.d, z3.d\n"
+ "ld1rw { z12.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_maxval]]\n"
+ "ld1b { z5.b }, p0/Z, [%x[params], #1, MUL VL]\n"
+ "mov z17.d, z3.d\n"
+ "mov z16.d, z3.d\n"
+ "ld1b { z6.b }, p0/Z, [%x[params], #2, MUL VL]\n"
+ "ld1b { z7.b }, p0/Z, [%x[params], #3, MUL VL]\n"
+ "ext z21.b, z21.b, z21.b, #0x2\n"
+ "ext z20.b, z20.b, z20.b, #0x4\n"
+ "addvl %x[params], %x[params], #4\n"
+ "ext z19.b, z19.b, z19.b, #0x6\n"
+ "zip1 z1.s, z1.s, z27.s\n"
+ "zip1 z8.s, z8.s, z31.s\n"
+ "zip1 z2.s, z2.s, z30.s\n"
+ "zip1 z28.s, z28.s, z26.s\n"
+ "ext z24.b, z24.b, z24.b, #0x2\n"
+ "ext z17.b, z17.b, z17.b, #0x4\n"
+ "ext z16.b, z16.b, z16.b, #0x6\n"
+ "zip1 z4.s, z4.s, z10.s\n"
+ "zip1 z22.s, z22.s, z18.s\n"
+ "zip1 z0.s, z0.s, z20.s\n"
+ "zip1 z21.s, z21.s, z19.s\n"
+ "zip1 z1.s, z1.s, z8.s\n"
+ "zip1 z2.s, z2.s, z28.s\n"
+ "zip1 z3.s, z3.s, z17.s\n"
+ "zip1 z24.s, z24.s, z16.s\n"
+ "zip1 z4.s, z4.s, z22.s\n"
+ "zip1 z0.s, z0.s, z21.s\n"
+ "mov z1.q, z1.q[0]\n"
+ "mov z2.q, z2.q[0]\n"
+ "zip1 z3.s, z3.s, z24.s\n"
+ "mov z4.q, z4.q[0]\n"
+ "mov z24.s, #0x0\n"
+ "mov z25.s, #0x0\n"
+ "udot z24.s, z13.b, z1.b[0]\n"
+ "mov z23.s, #0x0\n"
+ "mov z22.s, #0x0\n"
+ "udot z25.s, z13.b, z1.b[1]\n"
+ "mov z21.s, #0x0\n"
+ "mov z19.s, #0x0\n"
+ "udot z23.s, z13.b, z1.b[2]\n"
+ "mov z10.s, #0x0\n"
+ "mov z8.s, #0x0\n"
+ "udot z22.s, z13.b, z1.b[3]\n"
+ "mov z20.s, #0x0\n"
+ "mov z18.s, #0x0\n"
+ "udot z21.s, z13.b, z2.b[0]\n"
+ "mov z17.s, #0x0\n"
+ "mov z16.s, #0x0\n"
+ "udot z19.s, z13.b, z2.b[1]\n"
+ "udot z10.s, z13.b, z2.b[2]\n"
+ "udot z8.s, z13.b, z2.b[3]\n"
+ "mov z0.q, z0.q[0]\n"
+ "udot z20.s, z13.b, z4.b[0]\n"
+ "udot z18.s, z13.b, z4.b[1]\n"
+ "mov z3.q, z3.q[0]\n"
+ "udot z17.s, z13.b, z4.b[2]\n"
+ "udot z16.s, z13.b, z4.b[3]\n"
+ "mov z31.s, #0x0\n"
+ "mov z30.s, #0x0\n"
+ "mov z26.s, #0x0\n"
+ "udot z31.s, z13.b, z0.b[0]\n"
+ "mov z27.s, #0x0\n"
+ "mov z28.s, #0x0\n"
+ "udot z30.s, z13.b, z0.b[1]\n"
+ "mov z29.s, #0x0\n"
+ "udot z26.s, z13.b, z0.b[2]\n"
+ "udot z27.s, z13.b, z0.b[3]\n"
+ "udot z28.s, z13.b, z3.b[0]\n"
+ "udot z29.s, z13.b, z3.b[1]\n"
+ "add z24.s, z24.s, z21.s\n"
+ "add z25.s, z25.s, z19.s\n"
+ "add z23.s, z23.s, z10.s\n"
+ "add z22.s, z22.s, z8.s\n"
+ "add z21.s, z20.s, z21.s\n"
+ "mov z20.s, #0x0\n"
+ "udot z20.s, z13.b, z3.b[2]\n"
+ "add z19.s, z18.s, z19.s\n"
+ "mov z18.s, #0x0\n"
+ "udot z18.s, z13.b, z3.b[3]\n"
+ "add z17.s, z17.s, z10.s\n"
+ "add z16.s, z16.s, z8.s\n"
+ "add z24.s, z24.s, z31.s\n"
+ "add z25.s, z25.s, z30.s\n"
+ "mul z24.s, p2/M, z24.s, z11.s\n"
+ "mul z25.s, p2/M, z25.s, z11.s\n"
+ "add z26.s, z23.s, z26.s\n"
+ "add z27.s, z22.s, z27.s\n"
+ "mul z26.s, p2/M, z26.s, z11.s\n"
+ "mul z27.s, p2/M, z27.s, z11.s\n"
+ "add z28.s, z21.s, z28.s\n"
+ "add z29.s, z19.s, z29.s\n"
+ "mul z28.s, p2/M, z28.s, z11.s\n"
+ "mul z29.s, p2/M, z29.s, z11.s\n"
+ "add z30.s, z17.s, z20.s\n"
+ "add z31.s, z16.s, z18.s\n"
+ "mul z30.s, p2/M, z30.s, z11.s\n"
+ "mul z31.s, p2/M, z31.s, z11.s\n"
+ "zip1 z19.s, z24.s, z26.s\n"
+ "zip1 z18.s, z25.s, z27.s\n"
+ "zip1 z17.s, z28.s, z30.s\n"
+ "zip1 z16.s, z29.s, z31.s\n"
+ "zip1 z22.s, z19.s, z18.s\n"
+ "zip1 z23.s, z17.s, z16.s\n"
+ "add z24.s, z24.s, z14.s\n"
+ "add z25.s, z25.s, z14.s\n"
+ "add z26.s, z26.s, z14.s\n"
+ "add z27.s, z27.s, z14.s\n"
+ "add z28.s, z28.s, z14.s\n"
+ "add z29.s, z29.s, z14.s\n"
+ "add z30.s, z30.s, z14.s\n"
+ "add z31.s, z31.s, z14.s\n"
+ "1:" // Loop
+ "udot z24.s, z5.b, z0.b[0]\n"
+ "udot z25.s, z5.b, z0.b[1]\n"
+ "ld1w { z8.s }, p2/Z, [%x[params]]\n"
+ "ld1w { z21.s }, p2/Z, [%x[params], #1, MUL VL]\n"
+ "udot z26.s, z5.b, z0.b[2]\n"
+ "udot z27.s, z5.b, z0.b[3]\n"
+ "incb x9\n"
+ "whilelt p1.s, x28, %x[n_channels]\n"
+ "udot z24.s, z6.b, z1.b[0]\n"
+ "udot z25.s, z6.b, z1.b[1]\n"
+ "whilelt p0.b, x9, x10\n"
+ "ld1w { z20.s }, p0/Z, [%x[params], #2, MUL VL]\n"
+ "udot z26.s, z6.b, z1.b[2]\n"
+ "udot z27.s, z6.b, z1.b[3]\n"
+ "udot z28.s, z5.b, z2.b[0]\n"
+ "udot z29.s, z5.b, z2.b[1]\n"
+ "udot z30.s, z5.b, z2.b[2]\n"
+ "udot z31.s, z5.b, z2.b[3]\n"
+ "ld1b { z5.b }, p0/Z, [%x[params], #3, MUL VL]\n"
+ "udot z24.s, z7.b, z2.b[0]\n"
+ "udot z25.s, z7.b, z2.b[1]\n"
+ ".inst 0x04a87718 // sqrdmulh z24.s, z24.s, z8.s\n"
+ "udot z26.s, z7.b, z2.b[2]\n"
+ "udot z27.s, z7.b, z2.b[3]\n"
+ ".inst 0x04a87739 // sqrdmulh z25.s, z25.s, z8.s\n"
+ "udot z28.s, z6.b, z3.b[0]\n"
+ "udot z29.s, z6.b, z3.b[1]\n"
+ ".inst 0x04a8775a // sqrdmulh z26.s, z26.s, z8.s\n"
+ "udot z30.s, z6.b, z3.b[2]\n"
+ "udot z31.s, z6.b, z3.b[3]\n"
+ ".inst 0x04a8777b // sqrdmulh z27.s, z27.s, z8.s\n"
+ "ld1b { z6.b }, p0/Z, [%x[params], #4, MUL VL]\n"
+ "udot z28.s, z7.b, z4.b[0]\n"
+ "udot z29.s, z7.b, z4.b[1]\n"
+ "and z19.d, z24.d, z21.d\n"
+ "udot z30.s, z7.b, z4.b[2]\n"
+ "udot z31.s, z7.b, z4.b[3]\n"
+ "and z18.d, z25.d, z21.d\n"
+ "ld1b { z7.b }, p0/Z, [%x[params], #5, MUL VL]\n"
+ "and z17.d, z26.d, z21.d\n"
+ "and z16.d, z27.d, z21.d\n"
+ "addvl %x[params], %x[params], #6\n"
+ "asr z19.s, z19.s, #0x1f\n"
+ "asr z18.s, z18.s, #0x1f\n"
+ "asr z17.s, z17.s, #0x1f\n"
+ "asr z16.s, z16.s, #0x1f\n"
+ ".inst 0x04a8779c // sqrdmulh z28.s, z28.s, z8.s\n"
+ ".inst 0x04a877bd // sqrdmulh z29.s, z29.s, z8.s\n"
+ ".inst 0x04a877de // sqrdmulh z30.s, z30.s, z8.s\n"
+ ".inst 0x04a877ff // sqrdmulh z31.s, z31.s, z8.s\n"
+ "sqadd z24.s, z24.s, z19.s\n"
+ "sqadd z25.s, z25.s, z18.s\n"
+ ".inst 0x44828ab8 // srshl z24.s, p2/M, z24.s, z21.s\n"
+ ".inst 0x44828ab9 // srshl z25.s, p2/M, z25.s, z21.s\n"
+ "sqadd z26.s, z26.s, z17.s\n"
+ "sqadd z27.s, z27.s, z16.s\n"
+ ".inst 0x44828aba // srshl z26.s, p2/M, z26.s, z21.s\n"
+ ".inst 0x44828abb // srshl z27.s, p2/M, z27.s, z21.s\n"
+ "and z19.d, z28.d, z21.d\n"
+ "and z18.d, z29.d, z21.d\n"
+ "and z17.d, z30.d, z21.d\n"
+ "and z16.d, z31.d, z21.d\n"
+ "asr z19.s, z19.s, #0x1f\n"
+ "asr z18.s, z18.s, #0x1f\n"
+ "asr z17.s, z17.s, #0x1f\n"
+ "asr z16.s, z16.s, #0x1f\n"
+ "sqadd z28.s, z28.s, z19.s\n"
+ "sqadd z29.s, z29.s, z18.s\n"
+ ".inst 0x44828abc // srshl z28.s, p2/M, z28.s, z21.s\n"
+ ".inst 0x44828abd // srshl z29.s, p2/M, z29.s, z21.s\n"
+ "sqadd z30.s, z30.s, z17.s\n"
+ "sqadd z31.s, z31.s, z16.s\n"
+ ".inst 0x44828abe // srshl z30.s, p2/M, z30.s, z21.s\n"
+ ".inst 0x44828abf // srshl z31.s, p2/M, z31.s, z21.s\n"
+ "add z24.s, z24.s, z9.s\n"
+ "add z25.s, z25.s, z9.s\n"
+ "smin z24.s, p2/M, z24.s, z12.s\n"
+ "smin z25.s, p2/M, z25.s, z12.s\n"
+ "add z26.s, z26.s, z9.s\n"
+ "add z27.s, z27.s, z9.s\n"
+ "smin z26.s, p2/M, z26.s, z12.s\n"
+ "smin z27.s, p2/M, z27.s, z12.s\n"
+ "add z28.s, z28.s, z9.s\n"
+ "add z29.s, z29.s, z9.s\n"
+ "smin z28.s, p2/M, z28.s, z12.s\n"
+ "smin z29.s, p2/M, z29.s, z12.s\n"
+ "add z30.s, z30.s, z9.s\n"
+ "add z31.s, z31.s, z9.s\n"
+ "smin z30.s, p2/M, z30.s, z12.s\n"
+ "smin z31.s, p2/M, z31.s, z12.s\n"
+ "smax z24.s, p2/M, z24.s, z15.s\n"
+ "smax z25.s, p2/M, z25.s, z15.s\n"
+ "st1b { z24.s }, p1, [x27, x28]\n"
+ "mov z24.s, z22.s[0]\n"
+ "smax z26.s, p2/M, z26.s, z15.s\n"
+ "smax z27.s, p2/M, z27.s, z15.s\n"
+ "st1b { z25.s }, p1, [x26, x28]\n"
+ "mov z25.s, z22.s[1]\n"
+ "smax z28.s, p2/M, z28.s, z15.s\n"
+ "smax z29.s, p2/M, z29.s, z15.s\n"
+ "st1b { z26.s }, p1, [x25, x28]\n"
+ "mov z26.s, z22.s[2]\n"
+ "smax z30.s, p2/M, z30.s, z15.s\n"
+ "smax z31.s, p2/M, z31.s, z15.s\n"
+ "st1b { z27.s }, p1, [x24, x28]\n"
+ "mov z27.s, z22.s[3]\n"
+ "st1b { z28.s }, p1, [x23, x28]\n"
+ "mov z28.s, z23.s[0]\n"
+ "add z24.s, z24.s, z20.s\n"
+ "st1b { z29.s }, p1, [x22, x28]\n"
+ "mov z29.s, z23.s[1]\n"
+ "add z25.s, z25.s, z20.s\n"
+ "st1b { z30.s }, p1, [x21, x28]\n"
+ "mov z30.s, z23.s[2]\n"
+ "add z26.s, z26.s, z20.s\n"
+ "st1b { z31.s }, p1, [x20, x28]\n"
+ "mov z31.s, z23.s[3]\n"
+ "incw x28\n"
+ "add z27.s, z27.s, z20.s\n"
+ "add z28.s, z28.s, z20.s\n"
+ "add z29.s, z29.s, z20.s\n"
+ "add z30.s, z30.s, z20.s\n"
+ "add z31.s, z31.s, z20.s\n"
+ "b.any 1b\n"
+ : [params] "+&r" (params)
+ : [inptrs] "r" (inptrs), [n_channels] "r" (n_output_channels), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [outptrs] "r" (outptrs), [qp] "r" (&qp)
+ : "cc", "memory", "p0", "p1", "p2", "x9", "x10", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst.hpp
new file mode 100644
index 0000000000..55b6edea2c
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst.hpp
@@ -0,0 +1,61 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst_impl(const uint8_t *const *const, uint8_t *const *const, const void *, unsigned int, const arm_gemm::Requantize32&);
+
+struct sve_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst : DepthfirstMultiplierStrategy<uint8_t, uint8_t, uint8_t, int32_t>
+{
+ using Parent = DepthfirstMultiplierStrategy<uint8_t, uint8_t, uint8_t, int32_t>;
+ constexpr static unsigned int kernel_rows = 5;
+ constexpr static unsigned int kernel_cols = 5;
+
+ constexpr static unsigned int stride_rows = 1;
+ constexpr static unsigned int stride_cols = 1;
+
+ sve_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst(const CPUInfo *)
+ : Parent(4, 2, kernel_rows, kernel_cols, stride_rows, stride_cols)
+ {
+ }
+
+ arm_gemm::VLType get_vl_type() const override { return arm_gemm::VLType::SVE; }
+
+ Parent::KernelType kernel = sve_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst_impl;
+ Parent::KernelType get_kernel(void) const override { return kernel; }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst/generic.cpp
new file mode 100644
index 0000000000..4b65a67309
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst/generic.cpp
@@ -0,0 +1,402 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+
+#include "arm_gemm.hpp"
+#include <cstddef>
+#include <cstdint>
+
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst_impl(
+ const uint8_t *const *const inptrs,
+ uint8_t *const *const outptrs,
+ const void *params,
+ unsigned int n_output_channels,
+ const arm_gemm::Requantize32& qp
+)
+{
+ __asm__ __volatile__(
+ "mov x20, #0x6\n"
+ "whilelt p0.b, XZR, x20\n"
+ "ldr x22, [%x[inptrs], #0x18]\n"
+ "ldr x21, [%x[inptrs], #0x20]\n"
+ "ldr x20, [%x[inptrs], #0x10]\n"
+ "ld1b { z3.b }, p0/Z, [x22]\n"
+ "mov z23.d, z3.d\n"
+ "ext z23.b, z23.b, z23.b, #0x1\n"
+ "ld1b { z4.b }, p0/Z, [x21]\n"
+ "ldr x24, [%x[inptrs], #0x8]\n"
+ "mov z18.d, z4.d\n"
+ "ext z18.b, z18.b, z18.b, #0x1\n"
+ "ld1b { z2.b }, p0/Z, [x20]\n"
+ "ldr x23, [%x[inptrs], #0x28]\n"
+ "mov z15.d, z2.d\n"
+ "ext z15.b, z15.b, z15.b, #0x1\n"
+ "ldr x22, [%x[inptrs], #0x30]\n"
+ "ldr x21, [%x[inptrs], #0x38]\n"
+ "zip1 z3.d, z3.d, z23.d\n"
+ "zip1 z4.d, z4.d, z18.d\n"
+ "ldr x20, [%x[inptrs], #0x0]\n"
+ "ld1b { z1.b }, p0/Z, [x24]\n"
+ "mov z19.d, z1.d\n"
+ "ext z19.b, z19.b, z19.b, #0x1\n"
+ "ld1b { z5.b }, p0/Z, [x23]\n"
+ "ld1b { z6.b }, p0/Z, [x22]\n"
+ "mov z18.d, z5.d\n"
+ "mov z22.d, z6.d\n"
+ "ld1b { z7.b }, p0/Z, [x21]\n"
+ "ld1b { z0.b }, p0/Z, [x20]\n"
+ "mov z8.d, z7.d\n"
+ "zip1 z2.d, z2.d, z15.d\n"
+ "mov z3.q, z3.q[0]\n"
+ "mov z4.q, z4.q[0]\n"
+ "ptrue p2.b\n"
+ "ld1rw { z23.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_b_offset]]\n"
+ "ext z18.b, z18.b, z18.b, #0x1\n"
+ "ext z22.b, z22.b, z22.b, #0x1\n"
+ "lsl x10, %x[n_channels], #0x2\n"
+ "neg z23.s, p2/M, z23.s\n"
+ "ext z8.b, z8.b, z8.b, #0x1\n"
+ "mov z28.b, #0x1\n"
+ "mov x9, #0x0\n"
+ "whilelt p0.b, x9, x10\n"
+ "mov z25.s, #0x0\n"
+ "mov z24.s, #0x0\n"
+ "udot z25.s, z28.b, z3.b[0]\n"
+ "ld1w { z12.s }, p0/Z, [%x[params]]\n"
+ "mov z17.s, #0x0\n"
+ "mov z16.s, #0x0\n"
+ "udot z24.s, z28.b, z3.b[2]\n"
+ "mov x28, #0x0\n"
+ "mov z27.d, z0.d\n"
+ "udot z17.s, z28.b, z4.b[0]\n"
+ "udot z16.s, z28.b, z4.b[2]\n"
+ "ldp x27, x26, [%x[outptrs], #0x0]\n"
+ "ext z27.b, z27.b, z27.b, #0x1\n"
+ "zip1 z1.d, z1.d, z19.d\n"
+ "ldp x25, x24, [%x[outptrs], #0x10]\n"
+ "ldp x23, x22, [%x[outptrs], #0x20]\n"
+ "mov z2.q, z2.q[0]\n"
+ "zip1 z5.d, z5.d, z18.d\n"
+ "ldp x21, x20, [%x[outptrs], #0x30]\n"
+ "ld1rw { z13.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_c_offset]]\n"
+ "zip1 z6.d, z6.d, z22.d\n"
+ "zip1 z7.d, z7.d, z8.d\n"
+ "ld1rw { z14.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_minval]]\n"
+ "ld1rw { z15.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_maxval]]\n"
+ "mov z30.s, #0x0\n"
+ "mov z31.s, #0x0\n"
+ "udot z30.s, z28.b, z2.b[0]\n"
+ "ld1b { z8.b }, p0/Z, [%x[params], #1, MUL VL]\n"
+ "mov z29.s, #0x1\n"
+ "udot z31.s, z28.b, z2.b[2]\n"
+ "udot z25.s, z29.b, z3.b[1]\n"
+ "ld1b { z9.b }, p0/Z, [%x[params], #2, MUL VL]\n"
+ "zip1 z0.d, z0.d, z27.d\n"
+ "mov z1.q, z1.q[0]\n"
+ "udot z24.s, z29.b, z3.b[3]\n"
+ "ld1b { z10.b }, p0/Z, [%x[params], #3, MUL VL]\n"
+ "mov z5.q, z5.q[0]\n"
+ "mov z6.q, z6.q[0]\n"
+ "udot z17.s, z29.b, z4.b[1]\n"
+ "ld1b { z11.b }, p0/Z, [%x[params], #4, MUL VL]\n"
+ "mov z7.q, z7.q[0]\n"
+ "mov z22.s, #0x0\n"
+ "udot z16.s, z29.b, z4.b[3]\n"
+ "addvl %x[params], %x[params], #5\n"
+ "mov z21.s, #0x0\n"
+ "mov z26.s, #0x0\n"
+ "udot z22.s, z28.b, z1.b[0]\n"
+ "mov z27.s, #0x0\n"
+ "mov z20.s, #0x0\n"
+ "udot z21.s, z28.b, z1.b[2]\n"
+ "mov z19.s, #0x0\n"
+ "mov z18.s, #0x0\n"
+ "udot z26.s, z28.b, z5.b[0]\n"
+ "udot z27.s, z28.b, z5.b[2]\n"
+ "udot z20.s, z28.b, z6.b[0]\n"
+ "mov z0.q, z0.q[0]\n"
+ "udot z19.s, z28.b, z6.b[2]\n"
+ "udot z18.s, z28.b, z7.b[0]\n"
+ "add z17.s, z25.s, z17.s\n"
+ "mov z25.s, #0x0\n"
+ "udot z25.s, z28.b, z7.b[2]\n"
+ "udot z30.s, z29.b, z2.b[1]\n"
+ "udot z31.s, z29.b, z2.b[3]\n"
+ "add z16.s, z24.s, z16.s\n"
+ "udot z22.s, z29.b, z1.b[1]\n"
+ "mov z24.s, #0x0\n"
+ "udot z24.s, z28.b, z0.b[0]\n"
+ "udot z21.s, z29.b, z1.b[3]\n"
+ "udot z26.s, z29.b, z5.b[1]\n"
+ "udot z27.s, z29.b, z5.b[3]\n"
+ "add z30.s, z30.s, z17.s\n"
+ "udot z20.s, z29.b, z6.b[1]\n"
+ "udot z19.s, z29.b, z6.b[3]\n"
+ "add z31.s, z31.s, z16.s\n"
+ "udot z18.s, z29.b, z7.b[1]\n"
+ "udot z25.s, z29.b, z7.b[3]\n"
+ "add z22.s, z22.s, z30.s\n"
+ "udot z24.s, z29.b, z0.b[1]\n"
+ "add z21.s, z21.s, z31.s\n"
+ "add z20.s, z26.s, z20.s\n"
+ "add z19.s, z27.s, z19.s\n"
+ "add z18.s, z18.s, z17.s\n"
+ "mov z17.s, #0x0\n"
+ "udot z17.s, z28.b, z0.b[2]\n"
+ "udot z17.s, z29.b, z0.b[3]\n"
+ "add z16.s, z25.s, z16.s\n"
+ "add z24.s, z22.s, z24.s\n"
+ "add z25.s, z21.s, z17.s\n"
+ "mul z24.s, p2/M, z24.s, z23.s\n"
+ "mul z25.s, p2/M, z25.s, z23.s\n"
+ "add z26.s, z26.s, z22.s\n"
+ "add z27.s, z27.s, z21.s\n"
+ "mul z26.s, p2/M, z26.s, z23.s\n"
+ "mul z27.s, p2/M, z27.s, z23.s\n"
+ "add z28.s, z20.s, z30.s\n"
+ "add z29.s, z19.s, z31.s\n"
+ "mul z28.s, p2/M, z28.s, z23.s\n"
+ "mul z29.s, p2/M, z29.s, z23.s\n"
+ "add z30.s, z20.s, z18.s\n"
+ "add z31.s, z19.s, z16.s\n"
+ "mul z30.s, p2/M, z30.s, z23.s\n"
+ "mul z31.s, p2/M, z31.s, z23.s\n"
+ "zip1 z19.s, z24.s, z26.s\n"
+ "zip1 z18.s, z25.s, z27.s\n"
+ "zip1 z17.s, z28.s, z30.s\n"
+ "zip1 z16.s, z29.s, z31.s\n"
+ "zip1 z22.s, z19.s, z18.s\n"
+ "zip1 z23.s, z17.s, z16.s\n"
+ "add z24.s, z24.s, z12.s\n"
+ "add z25.s, z25.s, z12.s\n"
+ "add z26.s, z26.s, z12.s\n"
+ "add z27.s, z27.s, z12.s\n"
+ "add z28.s, z28.s, z12.s\n"
+ "add z29.s, z29.s, z12.s\n"
+ "add z30.s, z30.s, z12.s\n"
+ "add z31.s, z31.s, z12.s\n"
+ "1:" // Loop
+ "udot z24.s, z8.b, z0.b[0]\n"
+ "udot z25.s, z8.b, z0.b[2]\n"
+ "ld1w { z12.s }, p2/Z, [%x[params], #6, MUL VL]\n"
+ "ld1w { z21.s }, p2/Z, [%x[params], #7, MUL VL]\n"
+ "udot z26.s, z8.b, z1.b[0]\n"
+ "udot z27.s, z8.b, z1.b[2]\n"
+ "incb x9\n"
+ "whilelt p1.s, x28, %x[n_channels]\n"
+ "udot z24.s, z9.b, z0.b[1]\n"
+ "udot z25.s, z9.b, z0.b[3]\n"
+ "whilelt p0.b, x9, x10\n"
+ "udot z26.s, z9.b, z1.b[1]\n"
+ "udot z27.s, z9.b, z1.b[3]\n"
+ "udot z28.s, z8.b, z2.b[0]\n"
+ "udot z29.s, z8.b, z2.b[2]\n"
+ "udot z30.s, z8.b, z3.b[0]\n"
+ "udot z31.s, z8.b, z3.b[2]\n"
+ "ld1b { z17.b }, p2/Z, [%x[params]]\n"
+ "udot z24.s, z10.b, z1.b[0]\n"
+ "udot z25.s, z10.b, z1.b[2]\n"
+ "udot z26.s, z10.b, z2.b[0]\n"
+ "udot z27.s, z10.b, z2.b[2]\n"
+ "udot z28.s, z9.b, z2.b[1]\n"
+ "udot z29.s, z9.b, z2.b[3]\n"
+ "udot z30.s, z9.b, z3.b[1]\n"
+ "udot z31.s, z9.b, z3.b[3]\n"
+ "ld1b { z16.b }, p2/Z, [%x[params], #1, MUL VL]\n"
+ "udot z24.s, z11.b, z1.b[1]\n"
+ "udot z25.s, z11.b, z1.b[3]\n"
+ "udot z26.s, z11.b, z2.b[1]\n"
+ "udot z27.s, z11.b, z2.b[3]\n"
+ "udot z28.s, z10.b, z3.b[0]\n"
+ "udot z29.s, z10.b, z3.b[2]\n"
+ "udot z30.s, z10.b, z4.b[0]\n"
+ "udot z31.s, z10.b, z4.b[2]\n"
+ "ld1b { z19.b }, p2/Z, [%x[params], #2, MUL VL]\n"
+ "udot z24.s, z17.b, z2.b[0]\n"
+ "udot z25.s, z17.b, z2.b[2]\n"
+ "udot z26.s, z17.b, z3.b[0]\n"
+ "udot z27.s, z17.b, z3.b[2]\n"
+ "udot z28.s, z11.b, z3.b[1]\n"
+ "udot z29.s, z11.b, z3.b[3]\n"
+ "udot z30.s, z11.b, z4.b[1]\n"
+ "udot z31.s, z11.b, z4.b[3]\n"
+ "ld1b { z18.b }, p2/Z, [%x[params], #3, MUL VL]\n"
+ "udot z24.s, z16.b, z2.b[1]\n"
+ "udot z25.s, z16.b, z2.b[3]\n"
+ "udot z26.s, z16.b, z3.b[1]\n"
+ "udot z27.s, z16.b, z3.b[3]\n"
+ "udot z28.s, z17.b, z4.b[0]\n"
+ "udot z29.s, z17.b, z4.b[2]\n"
+ "udot z30.s, z17.b, z5.b[0]\n"
+ "udot z31.s, z17.b, z5.b[2]\n"
+ "ld1b { z17.b }, p2/Z, [%x[params], #4, MUL VL]\n"
+ "udot z24.s, z19.b, z3.b[0]\n"
+ "udot z25.s, z19.b, z3.b[2]\n"
+ "udot z26.s, z19.b, z4.b[0]\n"
+ "udot z27.s, z19.b, z4.b[2]\n"
+ "udot z28.s, z16.b, z4.b[1]\n"
+ "udot z29.s, z16.b, z4.b[3]\n"
+ "udot z30.s, z16.b, z5.b[1]\n"
+ "udot z31.s, z16.b, z5.b[3]\n"
+ "ld1b { z16.b }, p2/Z, [%x[params], #5, MUL VL]\n"
+ "addvl %x[params], %x[params], #16\n"
+ "udot z24.s, z18.b, z3.b[1]\n"
+ "udot z25.s, z18.b, z3.b[3]\n"
+ "ld1w { z20.s }, p0/Z, [%x[params], #-8, MUL VL]\n"
+ "udot z26.s, z18.b, z4.b[1]\n"
+ "udot z27.s, z18.b, z4.b[3]\n"
+ "udot z28.s, z19.b, z5.b[0]\n"
+ "udot z29.s, z19.b, z5.b[2]\n"
+ "udot z30.s, z19.b, z6.b[0]\n"
+ "udot z31.s, z19.b, z6.b[2]\n"
+ "ld1b { z10.b }, p0/Z, [%x[params], #-5, MUL VL]\n"
+ "udot z24.s, z17.b, z4.b[0]\n"
+ "udot z25.s, z17.b, z4.b[2]\n"
+ "udot z26.s, z17.b, z5.b[0]\n"
+ "udot z27.s, z17.b, z5.b[2]\n"
+ "udot z28.s, z18.b, z5.b[1]\n"
+ "udot z29.s, z18.b, z5.b[3]\n"
+ "udot z30.s, z18.b, z6.b[1]\n"
+ "udot z31.s, z18.b, z6.b[3]\n"
+ "ld1b { z11.b }, p0/Z, [%x[params], #-4, MUL VL]\n"
+ "udot z24.s, z16.b, z4.b[1]\n"
+ "udot z25.s, z16.b, z4.b[3]\n"
+ ".inst 0x04ac7718 // sqrdmulh z24.s, z24.s, z12.s\n"
+ "udot z26.s, z16.b, z5.b[1]\n"
+ "udot z27.s, z16.b, z5.b[3]\n"
+ ".inst 0x04ac7739 // sqrdmulh z25.s, z25.s, z12.s\n"
+ "udot z28.s, z17.b, z6.b[0]\n"
+ "udot z29.s, z17.b, z6.b[2]\n"
+ ".inst 0x04ac775a // sqrdmulh z26.s, z26.s, z12.s\n"
+ "udot z30.s, z17.b, z7.b[0]\n"
+ "udot z31.s, z17.b, z7.b[2]\n"
+ ".inst 0x04ac777b // sqrdmulh z27.s, z27.s, z12.s\n"
+ "ld1b { z8.b }, p0/Z, [%x[params], #-7, MUL VL]\n"
+ "udot z28.s, z16.b, z6.b[1]\n"
+ "udot z29.s, z16.b, z6.b[3]\n"
+ "and z19.d, z24.d, z21.d\n"
+ "udot z30.s, z16.b, z7.b[1]\n"
+ "udot z31.s, z16.b, z7.b[3]\n"
+ "and z18.d, z25.d, z21.d\n"
+ "ld1b { z9.b }, p0/Z, [%x[params], #-6, MUL VL]\n"
+ "and z17.d, z26.d, z21.d\n"
+ "and z16.d, z27.d, z21.d\n"
+ "addvl %x[params], %x[params], #-3\n"
+ "asr z19.s, z19.s, #0x1f\n"
+ "asr z18.s, z18.s, #0x1f\n"
+ "asr z17.s, z17.s, #0x1f\n"
+ "asr z16.s, z16.s, #0x1f\n"
+ ".inst 0x04ac779c // sqrdmulh z28.s, z28.s, z12.s\n"
+ ".inst 0x04ac77bd // sqrdmulh z29.s, z29.s, z12.s\n"
+ ".inst 0x04ac77de // sqrdmulh z30.s, z30.s, z12.s\n"
+ ".inst 0x04ac77ff // sqrdmulh z31.s, z31.s, z12.s\n"
+ "sqadd z24.s, z24.s, z19.s\n"
+ "sqadd z25.s, z25.s, z18.s\n"
+ ".inst 0x44828ab8 // srshl z24.s, p2/M, z24.s, z21.s\n"
+ ".inst 0x44828ab9 // srshl z25.s, p2/M, z25.s, z21.s\n"
+ "sqadd z26.s, z26.s, z17.s\n"
+ "sqadd z27.s, z27.s, z16.s\n"
+ ".inst 0x44828aba // srshl z26.s, p2/M, z26.s, z21.s\n"
+ ".inst 0x44828abb // srshl z27.s, p2/M, z27.s, z21.s\n"
+ "and z19.d, z28.d, z21.d\n"
+ "and z18.d, z29.d, z21.d\n"
+ "and z17.d, z30.d, z21.d\n"
+ "and z16.d, z31.d, z21.d\n"
+ "asr z19.s, z19.s, #0x1f\n"
+ "asr z18.s, z18.s, #0x1f\n"
+ "asr z17.s, z17.s, #0x1f\n"
+ "asr z16.s, z16.s, #0x1f\n"
+ "sqadd z28.s, z28.s, z19.s\n"
+ "sqadd z29.s, z29.s, z18.s\n"
+ ".inst 0x44828abc // srshl z28.s, p2/M, z28.s, z21.s\n"
+ ".inst 0x44828abd // srshl z29.s, p2/M, z29.s, z21.s\n"
+ "sqadd z30.s, z30.s, z17.s\n"
+ "sqadd z31.s, z31.s, z16.s\n"
+ ".inst 0x44828abe // srshl z30.s, p2/M, z30.s, z21.s\n"
+ ".inst 0x44828abf // srshl z31.s, p2/M, z31.s, z21.s\n"
+ "add z24.s, z24.s, z13.s\n"
+ "add z25.s, z25.s, z13.s\n"
+ "smin z24.s, p2/M, z24.s, z15.s\n"
+ "smin z25.s, p2/M, z25.s, z15.s\n"
+ "add z26.s, z26.s, z13.s\n"
+ "add z27.s, z27.s, z13.s\n"
+ "smin z26.s, p2/M, z26.s, z15.s\n"
+ "smin z27.s, p2/M, z27.s, z15.s\n"
+ "add z28.s, z28.s, z13.s\n"
+ "add z29.s, z29.s, z13.s\n"
+ "smin z28.s, p2/M, z28.s, z15.s\n"
+ "smin z29.s, p2/M, z29.s, z15.s\n"
+ "add z30.s, z30.s, z13.s\n"
+ "add z31.s, z31.s, z13.s\n"
+ "smin z30.s, p2/M, z30.s, z15.s\n"
+ "smin z31.s, p2/M, z31.s, z15.s\n"
+ "smax z24.s, p2/M, z24.s, z14.s\n"
+ "smax z25.s, p2/M, z25.s, z14.s\n"
+ "st1b { z24.s }, p1, [x27, x28]\n"
+ "mov z24.s, z22.s[0]\n"
+ "smax z26.s, p2/M, z26.s, z14.s\n"
+ "smax z27.s, p2/M, z27.s, z14.s\n"
+ "st1b { z25.s }, p1, [x26, x28]\n"
+ "mov z25.s, z22.s[1]\n"
+ "smax z28.s, p2/M, z28.s, z14.s\n"
+ "smax z29.s, p2/M, z29.s, z14.s\n"
+ "st1b { z26.s }, p1, [x25, x28]\n"
+ "mov z26.s, z22.s[2]\n"
+ "smax z30.s, p2/M, z30.s, z14.s\n"
+ "smax z31.s, p2/M, z31.s, z14.s\n"
+ "st1b { z27.s }, p1, [x24, x28]\n"
+ "mov z27.s, z22.s[3]\n"
+ "st1b { z28.s }, p1, [x23, x28]\n"
+ "mov z28.s, z23.s[0]\n"
+ "add z24.s, z24.s, z20.s\n"
+ "st1b { z29.s }, p1, [x22, x28]\n"
+ "mov z29.s, z23.s[1]\n"
+ "add z25.s, z25.s, z20.s\n"
+ "st1b { z30.s }, p1, [x21, x28]\n"
+ "mov z30.s, z23.s[2]\n"
+ "add z26.s, z26.s, z20.s\n"
+ "st1b { z31.s }, p1, [x20, x28]\n"
+ "mov z31.s, z23.s[3]\n"
+ "incw x28\n"
+ "add z27.s, z27.s, z20.s\n"
+ "add z28.s, z28.s, z20.s\n"
+ "add z29.s, z29.s, z20.s\n"
+ "add z30.s, z30.s, z20.s\n"
+ "add z31.s, z31.s, z20.s\n"
+ "b.any 1b\n"
+ : [params] "+&r" (params)
+ : [inptrs] "r" (inptrs), [n_channels] "r" (n_output_channels), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [outptrs] "r" (outptrs), [qp] "r" (&qp)
+ : "cc", "memory", "p0", "p1", "p2", "x9", "x10", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
new file mode 100644
index 0000000000..0f1030c0d7
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "utils.hpp"
+#include "src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl(unsigned int, const uint8_t *const *, const int8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, uint8_t *const *);
+
+class sve_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy<uint8_t, int8_t, uint8_t, int32_t> // strategy: u8 input, s8 weights, u8 output, s32 accumulator
+{
+ using Parent = DepthwiseDepthfirstStrategy<uint8_t, int8_t, uint8_t, int32_t>;
+
+ public:
+ constexpr static unsigned int kernel_rows = 3; // 3x3 filter window
+ constexpr static unsigned int kernel_cols = 3;
+
+ constexpr static unsigned int stride_rows = 1; // unit stride in both dimensions
+ constexpr static unsigned int stride_cols = 1;
+
+ sve_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst(const CPUInfo *) : Parent(2, 2, 3, 3, 1, 1) {} // 2x2 output tile, 3x3 kernel, stride 1 - mirrors the constants above
+
+ arm_gemm::VLType get_vl_type(void) const override { return arm_gemm::VLType::SVE; } // backing implementation is SVE assembly
+
+ Parent::KernelType kernel = sve_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl; // hand-written assembly entry point declared above
+ Parent::KernelType get_kernel(void) const override { return kernel; }
+ unsigned int get_accumulator_depth_vl(void) const override { return 2; } // reports two vector lengths of accumulator depth - presumably matches the paired vector loads in the impl; confirm against generic.cpp
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp
new file mode 100644
index 0000000000..887eccf1e9
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp
@@ -0,0 +1,410 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_gemm.hpp"
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl( // depthfirst 3x3 stride-1 depthwise conv: u8 activations x s8 weights -> requantized u8 output
+ const unsigned int n_channels,
+ const uint8_t *const *const inptrs, // one pointer per input element position (re-ordered into Params below)
+ const int8_t *const weights,
+ const int32_t *const bias,
+ const arm_gemm::Requantize32 &qp, // zero-point offsets, c_offset and clamp bounds read by the asm
+ const int32_t *const requant_muls,
+ const int32_t *const requant_shifts,
+ uint8_t *const *const outptrs // one pointer per element of the 2x2 output tile
+)
+{
+ struct Params // aggregates every value the assembly reads via [%x[params]] + offsetof()
+ {
+ long unsigned int n_channels;
+ const void *weights;
+ const int32_t *bias;
+ const arm_gemm::Requantize32 *requant;
+ const int32_t *const requant_muls;
+ const int32_t *const requant_shifts;
+ uint8_t *const *const outptrs;
+ const uint8_t *inptrs[16]; // input pointers, permuted into the kernel's access order
+
+ Params(
+ long unsigned int n_channels,
+ const uint8_t *const *inptrs_raw,
+ const void *const weights,
+ const int32_t *const bias,
+ const arm_gemm::Requantize32 &qp,
+ const int32_t *const requant_muls,
+ const int32_t *const requant_shifts,
+ uint8_t *const *outptrs
+ ) : n_channels(n_channels), weights(weights), bias(bias),
+ requant(&qp), requant_muls(requant_muls),
+ requant_shifts(requant_shifts), outptrs(outptrs)
+ {
+ inptrs[0] = inptrs_raw[5]; // permutation is fixed by the (generated) assembly's load schedule
+ inptrs[1] = inptrs_raw[0];
+ inptrs[2] = inptrs_raw[3];
+ inptrs[3] = inptrs_raw[6];
+ inptrs[4] = inptrs_raw[9];
+ inptrs[5] = inptrs_raw[12];
+ inptrs[6] = inptrs_raw[15];
+ inptrs[7] = inptrs_raw[1];
+ inptrs[8] = inptrs_raw[2];
+ inptrs[9] = inptrs_raw[10];
+ inptrs[10] = inptrs_raw[4];
+ inptrs[11] = inptrs_raw[7];
+ inptrs[12] = inptrs_raw[8];
+ inptrs[13] = inptrs_raw[11];
+ inptrs[14] = inptrs_raw[13];
+ inptrs[15] = inptrs_raw[14];
+
+ }
+ };
+
+ const Params params(n_channels, inptrs, weights, bias, qp,
+ requant_muls, requant_shifts, outptrs);
+
+ __asm__ __volatile__( // generated, vector-length-agnostic SVE; all state passed via the Params block
+ "mov x16, #0x0\n"
+ "ldr x25, [%x[params], %[offsetof_Params_requant]]\n"
+ "ptrue p4.b\n"
+ "ldr x24, [%x[params], %[offsetof_Params_outptrs]]\n"
+ "mov x23, x16\n"
+ "add x21, x25, %[offsetof_Requantize32_a_offset]\n"
+ "ldr x15, [%x[params], %[offsetof_Params_n_channels]]\n"
+ "ldr x14, [%x[params], %[offsetof_Params_weights]]\n"
+ "add x20, x25, %[offsetof_Requantize32_b_offset]\n"
+ "add x22, x25, %[offsetof_Requantize32_c_offset]\n"
+ "ld1rb { z12.b }, p4/Z, [x21]\n"
+ "ld1rb { z30.b }, p4/Z, [x20]\n"
+ "add x21, x25, %[offsetof_Requantize32_minval]\n"
+ "add x20, x25, %[offsetof_Requantize32_maxval]\n"
+ "ld1rh { z24.h }, p4/Z, [x22]\n"
+ "ld1rh { z11.h }, p4/Z, [x21]\n"
+ "ld1rh { z26.h }, p4/Z, [x20]\n"
+ "ldp x13, x12, [x24, #0x0]\n"
+ "incw x23\n"
+ "whilelt p3.h, x16, x15\n"
+ "ldp x11, x10, [x24, #0x10]\n"
+ "whilelt p2.s, x16, x15\n"
+ "whilelt p1.s, x23, x15\n"
+ "ldr x9, [%x[params], %[offsetof_Params_bias]]\n"
+ "ld1sb { z14.h }, p4/Z, [x14]\n"
+ "ld1sb { z21.h }, p4/Z, [x14, #1, MUL VL]\n"
+ "add x28, %x[params], %[offsetof_Params_inptrs]\n"
+ "mov x27, #0x0\n"
+ "ld1sb { z1.h }, p4/Z, [x14, #2, MUL VL]\n"
+ "ld1sb { z6.h }, p4/Z, [x14, #3, MUL VL]\n"
+ ".inst 0x455e11ce // ssublb z14.h, z14.b, z30.b\n"
+ ".inst 0x455e12b5 // ssublb z21.h, z21.b, z30.b\n"
+ "ld1sb { z2.h }, p4/Z, [x14, #4, MUL VL]\n"
+ "ld1sb { z18.h }, p4/Z, [x14, #5, MUL VL]\n"
+ ".inst 0x455e1021 // ssublb z1.h, z1.b, z30.b\n"
+ ".inst 0x455e10c6 // ssublb z6.h, z6.b, z30.b\n"
+ "ld1sb { z7.h }, p4/Z, [x14, #6, MUL VL]\n"
+ "ld1sb { z10.h }, p4/Z, [x14, #7, MUL VL]\n"
+ "inch x14, ALL, MUL #8\n"
+ ".inst 0x455e1042 // ssublb z2.h, z2.b, z30.b\n"
+ "ld1w { z17.s }, p2/Z, [x9]\n"
+ "ld1w { z16.s }, p1/Z, [x9, #1, MUL VL]\n"
+ "uzp1 z5.s, z17.s, z16.s\n"
+ "uzp2 z9.s, z17.s, z16.s\n"
+ "ld1sb { z8.h }, p4/Z, [x14]\n"
+ "ldp x24, x23, [x28, #0x0]\n"
+ "addvl x9, x9, #2\n"
+ "mov z17.d, z5.d\n"
+ "ldp x22, x21, [x28, #0x10]\n"
+ "ldr x20, [x28, #0x20]\n"
+ "mov z25.d, z9.d\n"
+ "mov z16.d, z5.d\n"
+ "ld1b { z0.h }, p3/Z, [x24, x16]\n"
+ "ld1b { z29.h }, p3/Z, [x23, x16]\n"
+ "mov z23.d, z9.d\n"
+ "mov z22.d, z5.d\n"
+ "ld1b { z4.h }, p3/Z, [x22, x16]\n"
+ "ld1b { z13.h }, p3/Z, [x21, x16]\n"
+ "mov z27.d, z9.d\n"
+ ".inst 0x455e1252 // ssublb z18.h, z18.b, z30.b\n"
+ "ld1b { z20.h }, p3/Z, [x20, x16]\n"
+ "ldr x26, [%x[params], %[offsetof_Params_requant_muls]]\n"
+ ".inst 0x455e10e7 // ssublb z7.h, z7.b, z30.b\n"
+ ".inst 0x455e114a // ssublb z10.h, z10.b, z30.b\n"
+ "ldr x25, [%x[params], %[offsetof_Params_requant_shifts]]\n"
+ "str x9, [%x[params], %[offsetof_Params_bias]]\n" // NOTE(review): writes the advanced bias pointer back through the const Params object; it is re-read at offsetof_Params_bias inside the loop - confirm this in-place update is intended
+ ".inst 0x455e1108 // ssublb z8.h, z8.b, z30.b\n"
+ ".inst 0x454c1800 // usublb z0.h, z0.b, z12.b\n"
+ ".inst 0x454c1bbd // usublb z29.h, z29.b, z12.b\n"
+ ".inst 0x454c1884 // usublb z4.h, z4.b, z12.b\n"
+ ".inst 0x454c19ad // usublb z13.h, z13.b, z12.b\n"
+ ".inst 0x454c1a94 // usublb z20.h, z20.b, z12.b\n"
+ "1:" // Loop - iterates while whilelt predicates over n_channels (x15) remain active
+ ".inst 0x44824005 // smlalb z5.s, p4/M, z0.h, z2.h\n"
+ ".inst 0x44824409 // smlalt z9.s, p4/M, z0.h, z2.h\n"
+ "ldr x20, [x28, #0x28]\n"
+ "ldr x21, [x28, #0x38]\n"
+ ".inst 0x448e43a5 // smlalb z5.s, p4/M, z29.h, z14.h\n"
+ ".inst 0x44864011 // smlalb z17.s, p4/M, z0.h, z6.h\n"
+ "ld1b { z3.h }, p3/Z, [x20, x16]\n"
+ "ldr x20, [x28, #0x30]\n"
+ ".inst 0x44954010 // smlalb z16.s, p4/M, z0.h, z21.h\n"
+ ".inst 0x448e4016 // smlalb z22.s, p4/M, z0.h, z14.h\n"
+ "ld1b { z31.h }, p3/Z, [x21, x16]\n"
+ ".inst 0x454c1863 // usublb z3.h, z3.b, z12.b\n"
+ ".inst 0x448e47a9 // smlalt z9.s, p4/M, z29.h, z14.h\n"
+ ".inst 0x449241a5 // smlalb z5.s, p4/M, z13.h, z18.h\n"
+ "ldr x21, [x28, #0x40]\n"
+ "ld1b { z15.h }, p3/Z, [x20, x16]\n"
+ ".inst 0x44864419 // smlalt z25.s, p4/M, z0.h, z6.h\n"
+ ".inst 0x44954417 // smlalt z23.s, p4/M, z0.h, z21.h\n"
+ ".inst 0x454c1bff // usublb z31.h, z31.b, z12.b\n"
+ "ldr x20, [x28, #0x48]\n"
+ ".inst 0x448e441b // smlalt z27.s, p4/M, z0.h, z14.h\n"
+ ".inst 0x44814091 // smlalb z17.s, p4/M, z4.h, z1.h\n"
+ "ld1b { z19.h }, p3/Z, [x21, x16]\n"
+ ".inst 0x454c19ef // usublb z15.h, z15.b, z12.b\n"
+ ".inst 0x448141b0 // smlalb z16.s, p4/M, z13.h, z1.h\n"
+ ".inst 0x449541b6 // smlalb z22.s, p4/M, z13.h, z21.h\n"
+ "ld1b { z28.h }, p3/Z, [x20, x16]\n"
+ ".inst 0x454c1a73 // usublb z19.h, z19.b, z12.b\n"
+ ".inst 0x449245a9 // smlalt z9.s, p4/M, z13.h, z18.h\n"
+ ".inst 0x448a4285 // smlalb z5.s, p4/M, z20.h, z10.h\n"
+ "ldr x21, [x28, #0x50]\n"
+ "ldr x20, [x28, #0x58]\n"
+ ".inst 0x44814499 // smlalt z25.s, p4/M, z4.h, z1.h\n"
+ ".inst 0x448145b7 // smlalt z23.s, p4/M, z13.h, z1.h\n"
+ ".inst 0x454c1b9c // usublb z28.h, z28.b, z12.b\n"
+ "ld1b { z4.h }, p3/Z, [x21, x16]\n"
+ ".inst 0x449545bb // smlalt z27.s, p4/M, z13.h, z21.h\n"
+ ".inst 0x448241b1 // smlalb z17.s, p4/M, z13.h, z2.h\n"
+ "ld1b { z29.h }, p3/Z, [x20, x16]\n"
+ "ldr x21, [x28, #0x60]\n"
+ ".inst 0x44874070 // smlalb z16.s, p4/M, z3.h, z7.h\n"
+ ".inst 0x44864296 // smlalb z22.s, p4/M, z20.h, z6.h\n"
+ "ldr x20, [x28, #0x68]\n"
+ ".inst 0x454c1884 // usublb z4.h, z4.b, z12.b\n"
+ ".inst 0x448a4689 // smlalt z9.s, p4/M, z20.h, z10.h\n"
+ ".inst 0x449543e5 // smlalb z5.s, p4/M, z31.h, z21.h\n"
+ ".inst 0x454c1bbd // usublb z29.h, z29.b, z12.b\n"
+ "ld1b { z0.h }, p3/Z, [x21, x16]\n"
+ ".inst 0x448245b9 // smlalt z25.s, p4/M, z13.h, z2.h\n"
+ ".inst 0x44874477 // smlalt z23.s, p4/M, z3.h, z7.h\n"
+ "ld1b { z3.h }, p3/Z, [x20, x16]\n"
+ "ldr x20, [x28, #0x70]\n"
+ ".inst 0x4486469b // smlalt z27.s, p4/M, z20.h, z6.h\n"
+ ".inst 0x44874291 // smlalb z17.s, p4/M, z20.h, z7.h\n"
+ ".inst 0x454c1800 // usublb z0.h, z0.b, z12.b\n"
+ "ld1b { z13.h }, p3/Z, [x20, x16]\n"
+ ".inst 0x44824290 // smlalb z16.s, p4/M, z20.h, z2.h\n"
+ ".inst 0x448841f6 // smlalb z22.s, p4/M, z15.h, z8.h\n"
+ ".inst 0x454c1863 // usublb z3.h, z3.b, z12.b\n"
+ "ldr x20, [x28, #0x78]\n"
+ ".inst 0x449547e9 // smlalt z9.s, p4/M, z31.h, z21.h\n"
+ ".inst 0x44814265 // smlalb z5.s, p4/M, z19.h, z1.h\n"
+ ".inst 0x454c19ad // usublb z13.h, z13.b, z12.b\n"
+ "whilelt p0.h, x27, x15\n"
+ ".inst 0x44874699 // smlalt z25.s, p4/M, z20.h, z7.h\n"
+ ".inst 0x44824697 // smlalt z23.s, p4/M, z20.h, z2.h\n"
+ "ld1w { z20.s }, p2/Z, [x26]\n"
+ "inch x14\n"
+ ".inst 0x448845fb // smlalt z27.s, p4/M, z15.h, z8.h\n"
+ ".inst 0x448e43f1 // smlalb z17.s, p4/M, z31.h, z14.h\n"
+ "ld1w { z15.s }, p1/Z, [x26, #1, MUL VL]\n"
+ "ldr x21, [%x[params], %[offsetof_Params_bias]]\n"
+ ".inst 0x44924390 // smlalb z16.s, p4/M, z28.h, z18.h\n"
+ ".inst 0x44824396 // smlalb z22.s, p4/M, z28.h, z2.h\n"
+ "addvl x26, x26, #2\n"
+ ".inst 0x44814669 // smlalt z9.s, p4/M, z19.h, z1.h\n"
+ ".inst 0x44884385 // smlalb z5.s, p4/M, z28.h, z8.h\n"
+ ".inst 0x448e47f9 // smlalt z25.s, p4/M, z31.h, z14.h\n"
+ ".inst 0x44924797 // smlalt z23.s, p4/M, z28.h, z18.h\n"
+ "ld1b { z31.h }, p3/Z, [x20, x16]\n"
+ ".inst 0x454c1bff // usublb z31.h, z31.b, z12.b\n"
+ ".inst 0x4482479b // smlalt z27.s, p4/M, z28.h, z2.h\n"
+ ".inst 0x44954271 // smlalb z17.s, p4/M, z19.h, z21.h\n"
+ "uzp1 z2.s, z20.s, z15.s\n"
+ "inch x16\n"
+ ".inst 0x448e4090 // smlalb z16.s, p4/M, z4.h, z14.h\n"
+ ".inst 0x448143b6 // smlalb z22.s, p4/M, z29.h, z1.h\n"
+ "uzp2 z15.s, z20.s, z15.s\n"
+ "ld1w { z20.s }, p2/Z, [x25]\n"
+ ".inst 0x44884789 // smlalt z9.s, p4/M, z28.h, z8.h\n"
+ ".inst 0x44864085 // smlalb z5.s, p4/M, z4.h, z6.h\n"
+ "mov x20, x16\n"
+ "incw x20\n"
+ ".inst 0x44954679 // smlalt z25.s, p4/M, z19.h, z21.h\n"
+ ".inst 0x448e4497 // smlalt z23.s, p4/M, z4.h, z14.h\n"
+ "ld1w { z19.s }, p1/Z, [x25, #1, MUL VL]\n"
+ "uzp1 z21.s, z20.s, z19.s\n"
+ ".inst 0x448147bb // smlalt z27.s, p4/M, z29.h, z1.h\n"
+ ".inst 0x448a4391 // smlalb z17.s, p4/M, z28.h, z10.h\n"
+ "uzp2 z1.s, z20.s, z19.s\n"
+ "whilelt p2.s, x16, x15\n"
+ ".inst 0x44864010 // smlalb z16.s, p4/M, z0.h, z6.h\n"
+ ".inst 0x44924076 // smlalb z22.s, p4/M, z3.h, z18.h\n"
+ "whilelt p1.s, x20, x15\n"
+ "whilelt p3.h, x16, x15\n"
+ ".inst 0x44864489 // smlalt z9.s, p4/M, z4.h, z6.h\n"
+ ".inst 0x44874005 // smlalb z5.s, p4/M, z0.h, z7.h\n"
+ ".inst 0x04a274a5 // sqrdmulh z5.s, z5.s, z2.s\n" // requantize: fixed-point multiply, then rounding-shift (srshl) below
+ "addvl x25, x25, #2\n"
+ ".inst 0x448a4799 // smlalt z25.s, p4/M, z28.h, z10.h\n"
+ ".inst 0x44864417 // smlalt z23.s, p4/M, z0.h, z6.h\n"
+ "and z19.d, z5.d, z21.d\n"
+ ".inst 0x4492447b // smlalt z27.s, p4/M, z3.h, z18.h\n"
+ ".inst 0x449243b1 // smlalb z17.s, p4/M, z29.h, z18.h\n"
+ "asr z19.s, z19.s, #0x1f\n"
+ ".inst 0x448a41b0 // smlalb z16.s, p4/M, z13.h, z10.h\n"
+ ".inst 0x448741b6 // smlalb z22.s, p4/M, z13.h, z7.h\n"
+ "sqadd z5.s, z5.s, z19.s\n"
+ ".inst 0x448292a5 // srshl z5.s, p4/M, z5.s, z21.s\n"
+ ".inst 0x44874409 // smlalt z9.s, p4/M, z0.h, z7.h\n"
+ ".inst 0x449247b9 // smlalt z25.s, p4/M, z29.h, z18.h\n"
+ ".inst 0x04af7529 // sqrdmulh z9.s, z9.s, z15.s\n"
+ ".inst 0x448a45b7 // smlalt z23.s, p4/M, z13.h, z10.h\n"
+ ".inst 0x448745bb // smlalt z27.s, p4/M, z13.h, z7.h\n"
+ "and z29.d, z9.d, z1.d\n"
+ ".inst 0x44884071 // smlalb z17.s, p4/M, z3.h, z8.h\n"
+ ".inst 0x448843f0 // smlalb z16.s, p4/M, z31.h, z8.h\n"
+ ".inst 0x04a27631 // sqrdmulh z17.s, z17.s, z2.s\n"
+ ".inst 0x448a43f6 // smlalb z22.s, p4/M, z31.h, z10.h\n"
+ ".inst 0x44884479 // smlalt z25.s, p4/M, z3.h, z8.h\n"
+ ".inst 0x04a27610 // sqrdmulh z16.s, z16.s, z2.s\n"
+ ".inst 0x448847f7 // smlalt z23.s, p4/M, z31.h, z8.h\n"
+ ".inst 0x448a47fb // smlalt z27.s, p4/M, z31.h, z10.h\n"
+ ".inst 0x04a276d6 // sqrdmulh z22.s, z22.s, z2.s\n"
+ "asr z29.s, z29.s, #0x1f\n"
+ "and z18.d, z17.d, z21.d\n"
+ ".inst 0x04af7739 // sqrdmulh z25.s, z25.s, z15.s\n"
+ "and z20.d, z16.d, z21.d\n"
+ ".inst 0x04af76f7 // sqrdmulh z23.s, z23.s, z15.s\n"
+ "and z19.d, z22.d, z21.d\n"
+ ".inst 0x04af777b // sqrdmulh z27.s, z27.s, z15.s\n"
+ "sqadd z9.s, z9.s, z29.s\n"
+ ".inst 0x44829029 // srshl z9.s, p4/M, z9.s, z1.s\n"
+ "asr z18.s, z18.s, #0x1f\n"
+ "and z7.d, z25.d, z1.d\n"
+ "asr z20.s, z20.s, #0x1f\n"
+ "and z6.d, z23.d, z1.d\n"
+ "asr z19.s, z19.s, #0x1f\n"
+ "and z2.d, z27.d, z1.d\n"
+ "sqadd z17.s, z17.s, z18.s\n"
+ "asr z7.s, z7.s, #0x1f\n"
+ ".inst 0x448292b1 // srshl z17.s, p4/M, z17.s, z21.s\n"
+ "sqadd z16.s, z16.s, z20.s\n"
+ "asr z6.s, z6.s, #0x1f\n"
+ ".inst 0x448292b0 // srshl z16.s, p4/M, z16.s, z21.s\n"
+ "sqadd z22.s, z22.s, z19.s\n"
+ "asr z2.s, z2.s, #0x1f\n"
+ ".inst 0x448292b6 // srshl z22.s, p4/M, z22.s, z21.s\n"
+ "sqadd z25.s, z25.s, z7.s\n"
+ "sqadd z23.s, z23.s, z6.s\n"
+ ".inst 0x44829039 // srshl z25.s, p4/M, z25.s, z1.s\n"
+ ".inst 0x44829037 // srshl z23.s, p4/M, z23.s, z1.s\n"
+ "sqadd z27.s, z27.s, z2.s\n"
+ ".inst 0x453040a5 // sqxtnb z5.h, z5.s\n" // saturating-narrow the 32-bit results to 16-bit halves
+ ".inst 0x4482903b // srshl z27.s, p4/M, z27.s, z1.s\n"
+ ".inst 0x45304231 // sqxtnb z17.h, z17.s\n"
+ ".inst 0x45304210 // sqxtnb z16.h, z16.s\n"
+ ".inst 0x453042d6 // sqxtnb z22.h, z22.s\n"
+ ".inst 0x45304525 // sqxtnt z5.h, z9.s\n"
+ ".inst 0x45304731 // sqxtnt z17.h, z25.s\n"
+ ".inst 0x453046f0 // sqxtnt z16.h, z23.s\n"
+ ".inst 0x45304776 // sqxtnt z22.h, z27.s\n"
+ "sqadd z5.h, z5.h, z24.h\n" // add c_offset (z24), then clamp to [minval (z11), maxval (z26)]
+ "smax z5.h, p4/M, z5.h, z11.h\n"
+ "smin z5.h, p4/M, z5.h, z26.h\n"
+ "sqadd z17.h, z17.h, z24.h\n"
+ "sqadd z16.h, z16.h, z24.h\n"
+ "smax z17.h, p4/M, z17.h, z11.h\n"
+ "smax z16.h, p4/M, z16.h, z11.h\n"
+ "sqadd z22.h, z22.h, z24.h\n"
+ "smax z22.h, p4/M, z22.h, z11.h\n"
+ "smin z17.h, p4/M, z17.h, z26.h\n"
+ "st1b { z5.h }, p0, [x13, x27]\n" // store the four outputs of the 2x2 tile as bytes
+ "smin z16.h, p4/M, z16.h, z26.h\n"
+ "smin z22.h, p4/M, z22.h, z26.h\n"
+ "st1b { z17.h }, p0, [x12, x27]\n"
+ "st1b { z16.h }, p0, [x11, x27]\n"
+ "st1b { z22.h }, p0, [x10, x27]\n"
+ "ld1sb { z14.h }, p4/Z, [x14]\n" // re-load weights, bias and inputs for the next channel block
+ "ld1sb { z21.h }, p4/Z, [x14, #1, MUL VL]\n"
+ "inch x27\n"
+ "ld1sb { z1.h }, p4/Z, [x14, #2, MUL VL]\n"
+ "ld1sb { z6.h }, p4/Z, [x14, #3, MUL VL]\n"
+ ".inst 0x455e11ce // ssublb z14.h, z14.b, z30.b\n"
+ ".inst 0x455e12b5 // ssublb z21.h, z21.b, z30.b\n"
+ "ld1sb { z2.h }, p4/Z, [x14, #4, MUL VL]\n"
+ "ld1sb { z18.h }, p4/Z, [x14, #5, MUL VL]\n"
+ ".inst 0x455e1021 // ssublb z1.h, z1.b, z30.b\n"
+ ".inst 0x455e10c6 // ssublb z6.h, z6.b, z30.b\n"
+ "ld1sb { z7.h }, p4/Z, [x14, #6, MUL VL]\n"
+ "ld1sb { z10.h }, p4/Z, [x14, #7, MUL VL]\n"
+ "inch x14, ALL, MUL #8\n"
+ ".inst 0x455e1042 // ssublb z2.h, z2.b, z30.b\n"
+ "ld1w { z17.s }, p2/Z, [x21]\n"
+ "ld1w { z16.s }, p1/Z, [x21, #1, MUL VL]\n"
+ "uzp1 z5.s, z17.s, z16.s\n"
+ "uzp2 z9.s, z17.s, z16.s\n"
+ "ld1sb { z8.h }, p4/Z, [x14]\n"
+ "ldp x24, x23, [x28, #0x0]\n"
+ "addvl x21, x21, #2\n"
+ "str x21, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldp x22, x21, [x28, #0x10]\n"
+ "ldr x20, [x28, #0x20]\n"
+ "mov z17.d, z5.d\n"
+ "mov z25.d, z9.d\n"
+ "ld1b { z0.h }, p3/Z, [x24, x16]\n"
+ "ld1b { z29.h }, p3/Z, [x23, x16]\n"
+ "mov z16.d, z5.d\n"
+ "mov z23.d, z9.d\n"
+ "ld1b { z4.h }, p3/Z, [x22, x16]\n"
+ "ld1b { z13.h }, p3/Z, [x21, x16]\n"
+ "mov z22.d, z5.d\n"
+ "mov z27.d, z9.d\n"
+ "ld1b { z20.h }, p3/Z, [x20, x16]\n"
+ ".inst 0x455e1252 // ssublb z18.h, z18.b, z30.b\n"
+ ".inst 0x455e10e7 // ssublb z7.h, z7.b, z30.b\n"
+ ".inst 0x455e114a // ssublb z10.h, z10.b, z30.b\n"
+ ".inst 0x455e1108 // ssublb z8.h, z8.b, z30.b\n"
+ ".inst 0x454c1800 // usublb z0.h, z0.b, z12.b\n"
+ ".inst 0x454c1bbd // usublb z29.h, z29.b, z12.b\n"
+ ".inst 0x454c1884 // usublb z4.h, z4.b, z12.b\n"
+ ".inst 0x454c19ad // usublb z13.h, z13.b, z12.b\n"
+ ".inst 0x454c1a94 // usublb z20.h, z20.b, z12.b\n"
+ "b.any 1b\n" // keep looping while any channel lane is still active
+ :
+ : [offsetof_Params_bias] "I" (offsetof(Params, bias)), [offsetof_Params_inptrs] "I" (offsetof(Params, inptrs)), [offsetof_Params_n_channels] "I" (offsetof(Params, n_channels)), [offsetof_Params_outptrs] "I" (offsetof(Params, outptrs)), [offsetof_Params_requant] "I" (offsetof(Params, requant)), [offsetof_Params_requant_muls] "I" (offsetof(Params, requant_muls)), [offsetof_Params_requant_shifts] "I" (offsetof(Params, requant_shifts)), [offsetof_Params_weights] "I" (offsetof(Params, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [params] "r" (&params)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
new file mode 100644
index 0000000000..79e3fd5f54
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "utils.hpp"
+#include "src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(unsigned int, const uint8_t *const *, const int8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, uint8_t *const *);
+
+class sve_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy<uint8_t, int8_t, uint8_t, int32_t> // strategy: u8 input, s8 weights, u8 output, s32 accumulator
+{
+ using Parent = DepthwiseDepthfirstStrategy<uint8_t, int8_t, uint8_t, int32_t>;
+
+ public:
+ constexpr static unsigned int kernel_rows = 3; // 3x3 filter window
+ constexpr static unsigned int kernel_cols = 3;
+
+ constexpr static unsigned int stride_rows = 2; // stride 2 in both dimensions
+ constexpr static unsigned int stride_cols = 2;
+
+ sve_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst(const CPUInfo *) : Parent(2, 2, 3, 3, 2, 2) {} // 2x2 output tile, 3x3 kernel, stride 2 - mirrors the constants above
+
+ arm_gemm::VLType get_vl_type(void) const override { return arm_gemm::VLType::SVE; } // backing implementation is SVE assembly
+
+ Parent::KernelType kernel = sve_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl; // hand-written assembly entry point declared above
+ Parent::KernelType get_kernel(void) const override { return kernel; }
+ unsigned int get_accumulator_depth_vl(void) const override { return 2; } // reports two vector lengths of accumulator depth - presumably matches the paired vector loads in the impl; confirm against generic.cpp
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp
new file mode 100644
index 0000000000..754d06d443
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp
@@ -0,0 +1,451 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_gemm.hpp"
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(
+ const unsigned int n_channels,
+ const uint8_t *const *const inptrs,
+ const int8_t *const weights,
+ const int32_t *const bias,
+ const arm_gemm::Requantize32 &qp,
+ const int32_t *const requant_muls,
+ const int32_t *const requant_shifts,
+ uint8_t *const *const outptrs
+)
+{
+ struct Params
+ {
+ long unsigned int n_channels;
+ const void *weights;
+ const int32_t *bias;
+ const arm_gemm::Requantize32 *requant;
+ const int32_t *const requant_muls;
+ const int32_t *const requant_shifts;
+ uint8_t *const *const outptrs;
+ const uint8_t *inptrs[25];
+
+ Params(
+ long unsigned int n_channels,
+ const uint8_t *const *inptrs_raw,
+ const void *const weights,
+ const int32_t *const bias,
+ const arm_gemm::Requantize32 &qp,
+ const int32_t *const requant_muls,
+ const int32_t *const requant_shifts,
+ uint8_t *const *outptrs
+ ) : n_channels(n_channels), weights(weights), bias(bias),
+ requant(&qp), requant_muls(requant_muls),
+ requant_shifts(requant_shifts), outptrs(outptrs)
+ {
+ inptrs[0] = inptrs_raw[12];
+ inptrs[1] = inptrs_raw[0];
+ inptrs[2] = inptrs_raw[1];
+ inptrs[3] = inptrs_raw[3];
+ inptrs[4] = inptrs_raw[4];
+ inptrs[5] = inptrs_raw[5];
+ inptrs[6] = inptrs_raw[6];
+ inptrs[7] = inptrs_raw[2];
+ inptrs[8] = inptrs_raw[8];
+ inptrs[9] = inptrs_raw[9];
+ inptrs[10] = inptrs_raw[7];
+ inptrs[11] = inptrs_raw[15];
+ inptrs[12] = inptrs_raw[10];
+ inptrs[13] = inptrs_raw[16];
+ inptrs[14] = inptrs_raw[11];
+ inptrs[15] = inptrs_raw[18];
+ inptrs[16] = inptrs_raw[13];
+ inptrs[17] = inptrs_raw[19];
+ inptrs[18] = inptrs_raw[20];
+ inptrs[19] = inptrs_raw[14];
+ inptrs[20] = inptrs_raw[21];
+ inptrs[21] = inptrs_raw[17];
+ inptrs[22] = inptrs_raw[23];
+ inptrs[23] = inptrs_raw[22];
+ inptrs[24] = inptrs_raw[24];
+
+ }
+ };
+
+ const Params params(n_channels, inptrs, weights, bias, qp,
+ requant_muls, requant_shifts, outptrs);
+
+ __asm__ __volatile__(
+ "mov x7, #0x0\n"
+ "ldr x25, [%x[params], %[offsetof_Params_requant]]\n"
+ "ptrue p4.b\n"
+ "ldr x24, [%x[params], %[offsetof_Params_outptrs]]\n"
+ "mov x23, x7\n"
+ "add x21, x25, %[offsetof_Requantize32_a_offset]\n"
+ "ldr x8, [%x[params], %[offsetof_Params_n_channels]]\n"
+ "ldr x17, [%x[params], %[offsetof_Params_weights]]\n"
+ "add x20, x25, %[offsetof_Requantize32_b_offset]\n"
+ "add x22, x25, %[offsetof_Requantize32_c_offset]\n"
+ "ld1rb { z26.b }, p4/Z, [x21]\n"
+ "ld1rb { z13.b }, p4/Z, [x20]\n"
+ "add x21, x25, %[offsetof_Requantize32_minval]\n"
+ "add x20, x25, %[offsetof_Requantize32_maxval]\n"
+ "ld1rh { z19.h }, p4/Z, [x22]\n"
+ "ld1rh { z12.h }, p4/Z, [x21]\n"
+ "ld1rh { z9.h }, p4/Z, [x20]\n"
+ "ldp x16, x15, [x24, #0x0]\n"
+ "incw x23\n"
+ "whilelt p3.h, x7, x8\n"
+ "ldp x14, x13, [x24, #0x10]\n"
+ "whilelt p2.s, x7, x8\n"
+ "whilelt p1.s, x23, x8\n"
+ "ldr x12, [%x[params], %[offsetof_Params_bias]]\n"
+ "ld1sb { z25.h }, p4/Z, [x17]\n"
+ "ld1sb { z30.h }, p4/Z, [x17, #1, MUL VL]\n"
+ "add x11, %x[params], %[offsetof_Params_inptrs]\n"
+ "mov x10, #0x0\n"
+ "ld1sb { z14.h }, p4/Z, [x17, #2, MUL VL]\n"
+ "ld1sb { z4.h }, p4/Z, [x17, #3, MUL VL]\n"
+ ".inst 0x454d1339 // ssublb z25.h, z25.b, z13.b\n"
+ ".inst 0x454d13de // ssublb z30.h, z30.b, z13.b\n"
+ "ld1sb { z10.h }, p4/Z, [x17, #4, MUL VL]\n"
+ "ld1sb { z3.h }, p4/Z, [x17, #5, MUL VL]\n"
+ ".inst 0x454d11ce // ssublb z14.h, z14.b, z13.b\n"
+ ".inst 0x454d1084 // ssublb z4.h, z4.b, z13.b\n"
+ "ld1sb { z23.h }, p4/Z, [x17, #6, MUL VL]\n"
+ "ld1sb { z7.h }, p4/Z, [x17, #7, MUL VL]\n"
+ "inch x17, ALL, MUL #8\n"
+ ".inst 0x454d114a // ssublb z10.h, z10.b, z13.b\n"
+ "ld1w { z17.s }, p2/Z, [x12]\n"
+ "ld1w { z16.s }, p1/Z, [x12, #1, MUL VL]\n"
+ "uzp1 z8.s, z17.s, z16.s\n"
+ "uzp2 z24.s, z17.s, z16.s\n"
+ "ld1sb { z2.h }, p4/Z, [x17]\n"
+ "ldp x27, x26, [x11, #0x0]\n"
+ "addvl x12, x12, #2\n"
+ "mov z18.d, z8.d\n"
+ "ldp x25, x24, [x11, #0x10]\n"
+ "ldp x23, x22, [x11, #0x20]\n"
+ "mov z0.d, z24.d\n"
+ "mov z15.d, z8.d\n"
+ "ldp x21, x20, [x11, #0x30]\n"
+ "ld1b { z21.h }, p3/Z, [x27, x7]\n"
+ "mov z1.d, z24.d\n"
+ "mov z5.d, z8.d\n"
+ "ld1b { z22.h }, p3/Z, [x26, x7]\n"
+ "ld1b { z11.h }, p3/Z, [x25, x7]\n"
+ "mov z6.d, z24.d\n"
+ ".inst 0x454d1063 // ssublb z3.h, z3.b, z13.b\n"
+ "ld1b { z20.h }, p3/Z, [x24, x7]\n"
+ "ld1b { z27.h }, p3/Z, [x23, x7]\n"
+ ".inst 0x454d12f7 // ssublb z23.h, z23.b, z13.b\n"
+ ".inst 0x454d10e7 // ssublb z7.h, z7.b, z13.b\n"
+ "ld1b { z28.h }, p3/Z, [x22, x7]\n"
+ "ld1b { z16.h }, p3/Z, [x21, x7]\n"
+ ".inst 0x454d1042 // ssublb z2.h, z2.b, z13.b\n"
+ ".inst 0x455a1ab5 // usublb z21.h, z21.b, z26.b\n"
+ "ld1b { z31.h }, p3/Z, [x20, x7]\n"
+ "ldr x9, [%x[params], %[offsetof_Params_requant_muls]]\n"
+ ".inst 0x455a1ad6 // usublb z22.h, z22.b, z26.b\n"
+ ".inst 0x455a196b // usublb z11.h, z11.b, z26.b\n"
+ "ldr x28, [%x[params], %[offsetof_Params_requant_shifts]]\n"
+ "str x12, [%x[params], %[offsetof_Params_bias]]\n"
+ ".inst 0x455a1a94 // usublb z20.h, z20.b, z26.b\n"
+ ".inst 0x455a1b7b // usublb z27.h, z27.b, z26.b\n"
+ ".inst 0x455a1b9c // usublb z28.h, z28.b, z26.b\n"
+ ".inst 0x455a1a10 // usublb z16.h, z16.b, z26.b\n"
+ ".inst 0x455a1bff // usublb z31.h, z31.b, z26.b\n"
+ "1:" // Loop
+ ".inst 0x448242a8 // smlalb z8.s, p4/M, z21.h, z2.h\n"
+ "ldr x21, [x11, #0x58]\n"
+ "ldr x20, [x11, #0x78]\n"
+ ".inst 0x448246b8 // smlalt z24.s, p4/M, z21.h, z2.h\n"
+ ".inst 0x449942c8 // smlalb z8.s, p4/M, z22.h, z25.h\n"
+ "ld1b { z17.h }, p3/Z, [x21, x7]\n"
+ "ld1b { z29.h }, p3/Z, [x20, x7]\n"
+ ".inst 0x449742b2 // smlalb z18.s, p4/M, z21.h, z23.h\n"
+ "ldr x21, [x11, #0x60]\n"
+ "ldr x20, [x11, #0x80]\n"
+ ".inst 0x448e42af // smlalb z15.s, p4/M, z21.h, z14.h\n"
+ ".inst 0x449942a5 // smlalb z5.s, p4/M, z21.h, z25.h\n"
+ ".inst 0x449946d8 // smlalt z24.s, p4/M, z22.h, z25.h\n"
+ ".inst 0x455a1a31 // usublb z17.h, z17.b, z26.b\n"
+ ".inst 0x449e4168 // smlalb z8.s, p4/M, z11.h, z30.h\n"
+ "ld1b { z22.h }, p3/Z, [x21, x7]\n"
+ ".inst 0x455a1bbd // usublb z29.h, z29.b, z26.b\n"
+ ".inst 0x449746a0 // smlalt z0.s, p4/M, z21.h, z23.h\n"
+ ".inst 0x448e46a1 // smlalt z1.s, p4/M, z21.h, z14.h\n"
+ "ldr x21, [x11, #0x68]\n"
+ ".inst 0x449946a6 // smlalt z6.s, p4/M, z21.h, z25.h\n"
+ "ld1b { z21.h }, p3/Z, [x20, x7]\n"
+ "ldr x20, [x11, #0x88]\n"
+ ".inst 0x449e4292 // smlalb z18.s, p4/M, z20.h, z30.h\n"
+ ".inst 0x4484422f // smlalb z15.s, p4/M, z17.h, z4.h\n"
+ ".inst 0x448a43a5 // smlalb z5.s, p4/M, z29.h, z10.h\n"
+ ".inst 0x455a1ad6 // usublb z22.h, z22.b, z26.b\n"
+ "ldr x22, [x11, #0x40]\n"
+ ".inst 0x449e4578 // smlalt z24.s, p4/M, z11.h, z30.h\n"
+ ".inst 0x455a1ab5 // usublb z21.h, z21.b, z26.b\n"
+ ".inst 0x44844388 // smlalb z8.s, p4/M, z28.h, z4.h\n"
+ "ld1b { z11.h }, p3/Z, [x21, x7]\n"
+ ".inst 0x449e4680 // smlalt z0.s, p4/M, z20.h, z30.h\n"
+ "ld1b { z20.h }, p3/Z, [x20, x7]\n"
+ ".inst 0x44844621 // smlalt z1.s, p4/M, z17.h, z4.h\n"
+ "ldr x21, [x11, #0x70]\n"
+ ".inst 0x448a47a6 // smlalt z6.s, p4/M, z29.h, z10.h\n"
+ "ldr x20, [x11, #0x98]\n"
+ ".inst 0x448e4372 // smlalb z18.s, p4/M, z27.h, z14.h\n"
+ "ldr x23, [x11, #0x50]\n"
+ ".inst 0x449942cf // smlalb z15.s, p4/M, z22.h, z25.h\n"
+ ".inst 0x449e42a5 // smlalb z5.s, p4/M, z21.h, z30.h\n"
+ ".inst 0x455a196b // usublb z11.h, z11.b, z26.b\n"
+ "ld1b { z17.h }, p3/Z, [x22, x7]\n"
+ ".inst 0x44844798 // smlalt z24.s, p4/M, z28.h, z4.h\n"
+ ".inst 0x455a1a94 // usublb z20.h, z20.b, z26.b\n"
+ ".inst 0x448a4208 // smlalb z8.s, p4/M, z16.h, z10.h\n"
+ "ld1b { z29.h }, p3/Z, [x21, x7]\n"
+ "ld1b { z28.h }, p3/Z, [x20, x7]\n"
+ ".inst 0x448e4760 // smlalt z0.s, p4/M, z27.h, z14.h\n"
+ "ldr x22, [x11, #0x48]\n"
+ ".inst 0x449946c1 // smlalt z1.s, p4/M, z22.h, z25.h\n"
+ ".inst 0x449e46a6 // smlalt z6.s, p4/M, z21.h, z30.h\n"
+ "ldr x21, [x11, #0x90]\n"
+ "ldr x20, [x11, #0xa8]\n"
+ ".inst 0x449943f2 // smlalb z18.s, p4/M, z31.h, z25.h\n"
+ "ld1b { z27.h }, p3/Z, [x23, x7]\n"
+ ".inst 0x448a416f // smlalb z15.s, p4/M, z11.h, z10.h\n"
+ ".inst 0x44834285 // smlalb z5.s, p4/M, z20.h, z3.h\n"
+ ".inst 0x455a1a31 // usublb z17.h, z17.b, z26.b\n"
+ ".inst 0x448a4618 // smlalt z24.s, p4/M, z16.h, z10.h\n"
+ ".inst 0x455a1bbd // usublb z29.h, z29.b, z26.b\n"
+ ".inst 0x448e43e8 // smlalb z8.s, p4/M, z31.h, z14.h\n"
+ "ld1b { z16.h }, p3/Z, [x22, x7]\n"
+ ".inst 0x455a1b9c // usublb z28.h, z28.b, z26.b\n"
+ ".inst 0x449947e0 // smlalt z0.s, p4/M, z31.h, z25.h\n"
+ "ld1b { z25.h }, p3/Z, [x21, x7]\n"
+ ".inst 0x448a4561 // smlalt z1.s, p4/M, z11.h, z10.h\n"
+ "ld1b { z11.h }, p3/Z, [x20, x7]\n"
+ ".inst 0x455a1b7b // usublb z27.h, z27.b, z26.b\n"
+ ".inst 0x44834686 // smlalt z6.s, p4/M, z20.h, z3.h\n"
+ "ldr x21, [x11, #0xa0]\n"
+ "ldr x20, [x11, #0xb0]\n"
+ ".inst 0x448a4232 // smlalb z18.s, p4/M, z17.h, z10.h\n"
+ ".inst 0x449e43af // smlalb z15.s, p4/M, z29.h, z30.h\n"
+ ".inst 0x455a1a10 // usublb z16.h, z16.b, z26.b\n"
+ ".inst 0x448e4385 // smlalb z5.s, p4/M, z28.h, z14.h\n"
+ ".inst 0x448e47f8 // smlalt z24.s, p4/M, z31.h, z14.h\n"
+ ".inst 0x455a1b39 // usublb z25.h, z25.b, z26.b\n"
+ "ld1b { z20.h }, p3/Z, [x21, x7]\n"
+ ".inst 0x455a196b // usublb z11.h, z11.b, z26.b\n"
+ ".inst 0x44834368 // smlalb z8.s, p4/M, z27.h, z3.h\n"
+ "ld1b { z31.h }, p3/Z, [x20, x7]\n"
+ ".inst 0x448a4620 // smlalt z0.s, p4/M, z17.h, z10.h\n"
+ ".inst 0x449e47a1 // smlalt z1.s, p4/M, z29.h, z30.h\n"
+ ".inst 0x448e4786 // smlalt z6.s, p4/M, z28.h, z14.h\n"
+ "ldr x20, [x11, #0xb8]\n"
+ ".inst 0x455a1a94 // usublb z20.h, z20.b, z26.b\n"
+ ".inst 0x44834212 // smlalb z18.s, p4/M, z16.h, z3.h\n"
+ ".inst 0x4497432f // smlalb z15.s, p4/M, z25.h, z23.h\n"
+ ".inst 0x455a1bff // usublb z31.h, z31.b, z26.b\n"
+ "ld1b { z30.h }, p3/Z, [x20, x7]\n"
+ ".inst 0x44844165 // smlalb z5.s, p4/M, z11.h, z4.h\n"
+ ".inst 0x44834778 // smlalt z24.s, p4/M, z27.h, z3.h\n"
+ "ldr x20, [x11, #0xc0]\n"
+ "ld1w { z17.s }, p2/Z, [x9]\n"
+ ".inst 0x449742c8 // smlalb z8.s, p4/M, z22.h, z23.h\n"
+ ".inst 0x44834600 // smlalt z0.s, p4/M, z16.h, z3.h\n"
+ "ld1w { z14.s }, p1/Z, [x9, #1, MUL VL]\n"
+ ".inst 0x455a1bde // usublb z30.h, z30.b, z26.b\n"
+ ".inst 0x44974721 // smlalt z1.s, p4/M, z25.h, z23.h\n"
+ ".inst 0x44844566 // smlalt z6.s, p4/M, z11.h, z4.h\n"
+ "ld1b { z25.h }, p3/Z, [x20, x7]\n"
+ "uzp1 z10.s, z17.s, z14.s\n"
+ ".inst 0x44844372 // smlalb z18.s, p4/M, z27.h, z4.h\n"
+ ".inst 0x4487428f // smlalb z15.s, p4/M, z20.h, z7.h\n"
+ "uzp2 z14.s, z17.s, z14.s\n"
+ "ld1w { z17.s }, p2/Z, [x28]\n"
+ ".inst 0x448743e5 // smlalb z5.s, p4/M, z31.h, z7.h\n"
+ ".inst 0x449746d8 // smlalt z24.s, p4/M, z22.h, z23.h\n"
+ "ld1w { z16.s }, p1/Z, [x28, #1, MUL VL]\n"
+ ".inst 0x455a1b39 // usublb z25.h, z25.b, z26.b\n"
+ ".inst 0x448743a8 // smlalb z8.s, p4/M, z29.h, z7.h\n"
+ ".inst 0x44844760 // smlalt z0.s, p4/M, z27.h, z4.h\n"
+ "uzp1 z4.s, z17.s, z16.s\n"
+ "inch x7\n"
+ ".inst 0x44874681 // smlalt z1.s, p4/M, z20.h, z7.h\n"
+ ".inst 0x448747e6 // smlalt z6.s, p4/M, z31.h, z7.h\n"
+ ".inst 0x04aa7508 // sqrdmulh z8.s, z8.s, z10.s\n"
+ "whilelt p0.h, x10, x8\n"
+ ".inst 0x448742b2 // smlalb z18.s, p4/M, z21.h, z7.h\n"
+ ".inst 0x4483416f // smlalb z15.s, p4/M, z11.h, z3.h\n"
+ "uzp2 z22.s, z17.s, z16.s\n"
+ "mov x20, x7\n"
+ ".inst 0x449743c5 // smlalb z5.s, p4/M, z30.h, z23.h\n"
+ ".inst 0x448747b8 // smlalt z24.s, p4/M, z29.h, z7.h\n"
+ "and z17.d, z8.d, z4.d\n"
+ "inch x17\n"
+ ".inst 0x448746a0 // smlalt z0.s, p4/M, z21.h, z7.h\n"
+ ".inst 0x44834561 // smlalt z1.s, p4/M, z11.h, z3.h\n"
+ ".inst 0x04ae7718 // sqrdmulh z24.s, z24.s, z14.s\n"
+ "incw x20\n"
+ ".inst 0x449747c6 // smlalt z6.s, p4/M, z30.h, z23.h\n"
+ ".inst 0x44824392 // smlalb z18.s, p4/M, z28.h, z2.h\n"
+ "asr z17.s, z17.s, #0x1f\n"
+ "whilelt p2.s, x7, x8\n"
+ ".inst 0x448243cf // smlalb z15.s, p4/M, z30.h, z2.h\n"
+ ".inst 0x44824325 // smlalb z5.s, p4/M, z25.h, z2.h\n"
+ "and z16.d, z24.d, z22.d\n"
+ "whilelt p1.s, x20, x8\n"
+ ".inst 0x44824780 // smlalt z0.s, p4/M, z28.h, z2.h\n"
+ ".inst 0x448247c1 // smlalt z1.s, p4/M, z30.h, z2.h\n"
+ ".inst 0x04aa7652 // sqrdmulh z18.s, z18.s, z10.s\n"
+ "ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
+ ".inst 0x44824726 // smlalt z6.s, p4/M, z25.h, z2.h\n"
+ ".inst 0x04aa75ef // sqrdmulh z15.s, z15.s, z10.s\n"
+ "whilelt p3.h, x7, x8\n"
+ "addvl x9, x9, #2\n"
+ ".inst 0x04aa74a5 // sqrdmulh z5.s, z5.s, z10.s\n"
+ "sqadd z8.s, z8.s, z17.s\n"
+ ".inst 0x44829088 // srshl z8.s, p4/M, z8.s, z4.s\n"
+ "addvl x28, x28, #2\n"
+ "asr z16.s, z16.s, #0x1f\n"
+ "and z21.d, z18.d, z4.d\n"
+ ".inst 0x04ae7400 // sqrdmulh z0.s, z0.s, z14.s\n"
+ "and z20.d, z15.d, z4.d\n"
+ ".inst 0x04ae7421 // sqrdmulh z1.s, z1.s, z14.s\n"
+ "and z28.d, z5.d, z4.d\n"
+ ".inst 0x04ae74c6 // sqrdmulh z6.s, z6.s, z14.s\n"
+ "sqadd z24.s, z24.s, z16.s\n"
+ ".inst 0x448292d8 // srshl z24.s, p4/M, z24.s, z22.s\n"
+ "asr z21.s, z21.s, #0x1f\n"
+ "and z25.d, z0.d, z22.d\n"
+ "asr z20.s, z20.s, #0x1f\n"
+ "and z17.d, z1.d, z22.d\n"
+ "asr z28.s, z28.s, #0x1f\n"
+ "and z16.d, z6.d, z22.d\n"
+ "sqadd z18.s, z18.s, z21.s\n"
+ "asr z25.s, z25.s, #0x1f\n"
+ ".inst 0x44829092 // srshl z18.s, p4/M, z18.s, z4.s\n"
+ "sqadd z15.s, z15.s, z20.s\n"
+ "asr z17.s, z17.s, #0x1f\n"
+ ".inst 0x4482908f // srshl z15.s, p4/M, z15.s, z4.s\n"
+ "sqadd z5.s, z5.s, z28.s\n"
+ "asr z16.s, z16.s, #0x1f\n"
+ ".inst 0x44829085 // srshl z5.s, p4/M, z5.s, z4.s\n"
+ "sqadd z0.s, z0.s, z25.s\n"
+ "sqadd z1.s, z1.s, z17.s\n"
+ ".inst 0x448292c0 // srshl z0.s, p4/M, z0.s, z22.s\n"
+ ".inst 0x448292c1 // srshl z1.s, p4/M, z1.s, z22.s\n"
+ "sqadd z6.s, z6.s, z16.s\n"
+ ".inst 0x45304108 // sqxtnb z8.h, z8.s\n"
+ ".inst 0x448292c6 // srshl z6.s, p4/M, z6.s, z22.s\n"
+ ".inst 0x45304252 // sqxtnb z18.h, z18.s\n"
+ ".inst 0x453041ef // sqxtnb z15.h, z15.s\n"
+ ".inst 0x453040a5 // sqxtnb z5.h, z5.s\n"
+ ".inst 0x45304708 // sqxtnt z8.h, z24.s\n"
+ ".inst 0x45304412 // sqxtnt z18.h, z0.s\n"
+ ".inst 0x4530442f // sqxtnt z15.h, z1.s\n"
+ ".inst 0x453044c5 // sqxtnt z5.h, z6.s\n"
+ "sqadd z8.h, z8.h, z19.h\n"
+ "smax z8.h, p4/M, z8.h, z12.h\n"
+ "smin z8.h, p4/M, z8.h, z9.h\n"
+ "sqadd z18.h, z18.h, z19.h\n"
+ "sqadd z15.h, z15.h, z19.h\n"
+ "smax z18.h, p4/M, z18.h, z12.h\n"
+ "smax z15.h, p4/M, z15.h, z12.h\n"
+ "sqadd z5.h, z5.h, z19.h\n"
+ "smax z5.h, p4/M, z5.h, z12.h\n"
+ "smin z18.h, p4/M, z18.h, z9.h\n"
+ "st1b { z8.h }, p0, [x16, x10]\n"
+ "smin z15.h, p4/M, z15.h, z9.h\n"
+ "smin z5.h, p4/M, z5.h, z9.h\n"
+ "st1b { z18.h }, p0, [x15, x10]\n"
+ "st1b { z15.h }, p0, [x14, x10]\n"
+ "st1b { z5.h }, p0, [x13, x10]\n"
+ "ld1sb { z25.h }, p4/Z, [x17]\n"
+ "ld1sb { z30.h }, p4/Z, [x17, #1, MUL VL]\n"
+ "inch x10\n"
+ "ld1sb { z14.h }, p4/Z, [x17, #2, MUL VL]\n"
+ "ld1sb { z4.h }, p4/Z, [x17, #3, MUL VL]\n"
+ ".inst 0x454d1339 // ssublb z25.h, z25.b, z13.b\n"
+ ".inst 0x454d13de // ssublb z30.h, z30.b, z13.b\n"
+ "ld1sb { z10.h }, p4/Z, [x17, #4, MUL VL]\n"
+ "ld1sb { z3.h }, p4/Z, [x17, #5, MUL VL]\n"
+ ".inst 0x454d11ce // ssublb z14.h, z14.b, z13.b\n"
+ ".inst 0x454d1084 // ssublb z4.h, z4.b, z13.b\n"
+ "ld1sb { z23.h }, p4/Z, [x17, #6, MUL VL]\n"
+ "ld1sb { z7.h }, p4/Z, [x17, #7, MUL VL]\n"
+ "inch x17, ALL, MUL #8\n"
+ ".inst 0x454d114a // ssublb z10.h, z10.b, z13.b\n"
+ "ld1w { z17.s }, p2/Z, [x20]\n"
+ "ld1w { z16.s }, p1/Z, [x20, #1, MUL VL]\n"
+ "uzp1 z8.s, z17.s, z16.s\n"
+ "uzp2 z24.s, z17.s, z16.s\n"
+ "ld1sb { z2.h }, p4/Z, [x17]\n"
+ "ldp x27, x26, [x11, #0x0]\n"
+ "addvl x20, x20, #2\n"
+ "str x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldp x25, x24, [x11, #0x10]\n"
+ "ldp x23, x22, [x11, #0x20]\n"
+ "mov z18.d, z8.d\n"
+ "mov z0.d, z24.d\n"
+ "ldp x21, x20, [x11, #0x30]\n"
+ "ld1b { z21.h }, p3/Z, [x27, x7]\n"
+ "mov z15.d, z8.d\n"
+ "mov z1.d, z24.d\n"
+ "ld1b { z22.h }, p3/Z, [x26, x7]\n"
+ "ld1b { z11.h }, p3/Z, [x25, x7]\n"
+ "mov z5.d, z8.d\n"
+ "mov z6.d, z24.d\n"
+ "ld1b { z20.h }, p3/Z, [x24, x7]\n"
+ "ld1b { z27.h }, p3/Z, [x23, x7]\n"
+ ".inst 0x454d1063 // ssublb z3.h, z3.b, z13.b\n"
+ ".inst 0x454d12f7 // ssublb z23.h, z23.b, z13.b\n"
+ "ld1b { z28.h }, p3/Z, [x22, x7]\n"
+ "ld1b { z16.h }, p3/Z, [x21, x7]\n"
+ ".inst 0x454d10e7 // ssublb z7.h, z7.b, z13.b\n"
+ ".inst 0x454d1042 // ssublb z2.h, z2.b, z13.b\n"
+ "ld1b { z31.h }, p3/Z, [x20, x7]\n"
+ ".inst 0x455a1ab5 // usublb z21.h, z21.b, z26.b\n"
+ ".inst 0x455a1ad6 // usublb z22.h, z22.b, z26.b\n"
+ ".inst 0x455a196b // usublb z11.h, z11.b, z26.b\n"
+ ".inst 0x455a1a94 // usublb z20.h, z20.b, z26.b\n"
+ ".inst 0x455a1b7b // usublb z27.h, z27.b, z26.b\n"
+ ".inst 0x455a1b9c // usublb z28.h, z28.b, z26.b\n"
+ ".inst 0x455a1a10 // usublb z16.h, z16.b, z26.b\n"
+ ".inst 0x455a1bff // usublb z31.h, z31.b, z26.b\n"
+ "b.any 1b\n"
+ :
+ : [offsetof_Params_bias] "I" (offsetof(Params, bias)), [offsetof_Params_inptrs] "I" (offsetof(Params, inptrs)), [offsetof_Params_n_channels] "I" (offsetof(Params, n_channels)), [offsetof_Params_outptrs] "I" (offsetof(Params, outptrs)), [offsetof_Params_requant] "I" (offsetof(Params, requant)), [offsetof_Params_requant_muls] "I" (offsetof(Params, requant_muls)), [offsetof_Params_requant_shifts] "I" (offsetof(Params, requant_shifts)), [offsetof_Params_weights] "I" (offsetof(Params, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [params] "r" (&params)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
new file mode 100644
index 0000000000..0ff853ec2d
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "utils.hpp"
+#include "src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl(unsigned int, const uint8_t *const *, const int8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, uint8_t *const *);
+
+class sve_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy<uint8_t, int8_t, uint8_t, int32_t>
+{  // Strategy shim binding the hand-written SVE asm kernel to the depthfirst driver (u8 input, s8 weights, u8 output, s32 accumulators).
+  using Parent = DepthwiseDepthfirstStrategy<uint8_t, int8_t, uint8_t, int32_t>;
+
+  public:
+  constexpr static unsigned int kernel_rows = 5;  // 5x5 convolution window
+  constexpr static unsigned int kernel_cols = 5;
+
+  constexpr static unsigned int stride_rows = 1;  // unit stride in both spatial dimensions
+  constexpr static unsigned int stride_cols = 1;
+
+  sve_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst(const CPUInfo *) : Parent(2, 2, 5, 5, 1, 1) {}  // presumably (out_rows, out_cols, kernel_rows, kernel_cols, stride_rows, stride_cols) — TODO confirm against Parent ctor
+
+  arm_gemm::VLType get_vl_type(void) const override { return arm_gemm::VLType::SVE; }  // vector length is scalable, decided at runtime
+
+  Parent::KernelType kernel = sve_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl;  // points at the inline-asm implementation declared above
+  Parent::KernelType get_kernel(void) const override { return kernel; }
+  unsigned int get_accumulator_depth_vl(void) const override { return 2; }  // kernel consumes two vectors' worth of channels per iteration
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp
new file mode 100644
index 0000000000..f24a258484
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp
@@ -0,0 +1,652 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_gemm.hpp"
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl(
+ const unsigned int n_channels,
+ const uint8_t *const *const inptrs,
+ const int8_t *const weights,
+ const int32_t *const bias,
+ const arm_gemm::Requantize32 &qp,
+ const int32_t *const requant_muls,
+ const int32_t *const requant_shifts,
+ uint8_t *const *const outptrs
+)
+{
+ struct Params  // Flat argument block handed to the inline asm through a single pointer (%x[params]); field layout is ABI-coupled to the asm's offsetof() operands — do not reorder.
+ {
+ long unsigned int n_channels;
+ const void *weights;
+ const int32_t *bias;
+ const arm_gemm::Requantize32 *requant;  // quantisation offsets and clamp bounds (a/b/c offsets, minval, maxval — see asm constraint list)
+ const int32_t *const requant_muls;  // per-channel requantisation multipliers
+ const int32_t *const requant_shifts;  // per-channel requantisation shifts
+ uint8_t *const *const outptrs;
+ const uint8_t *inptrs[36];  // 36 input pointers; presumably a 6x6 patch feeding the 2x2 output of a 5x5 stride-1 kernel — TODO confirm
+
+ Params(
+ long unsigned int n_channels,
+ const uint8_t *const *inptrs_raw,
+ const void *const weights,
+ const int32_t *const bias,
+ const arm_gemm::Requantize32 &qp,
+ const int32_t *const requant_muls,
+ const int32_t *const requant_shifts,
+ uint8_t *const *outptrs
+ ) : n_channels(n_channels), weights(weights), bias(bias),
+ requant(&qp), requant_muls(requant_muls),
+ requant_shifts(requant_shifts), outptrs(outptrs)
+ {
+ inptrs[0] = inptrs_raw[0];
+ inptrs[1] = inptrs_raw[1];
+ inptrs[2] = inptrs_raw[6];  // indices 2-13 are permuted, presumably to match the order the asm loads them — TODO confirm
+ inptrs[3] = inptrs_raw[7];
+ inptrs[4] = inptrs_raw[2];
+ inptrs[5] = inptrs_raw[8];
+ inptrs[6] = inptrs_raw[3];
+ inptrs[7] = inptrs_raw[4];
+ inptrs[8] = inptrs_raw[11];
+ inptrs[9] = inptrs_raw[12];
+ inptrs[10] = inptrs_raw[9];
+ inptrs[11] = inptrs_raw[10];
+ inptrs[12] = inptrs_raw[5];
+ inptrs[13] = inptrs_raw[13];
+ inptrs[14] = inptrs_raw[14];  // indices 14-35 pass through unchanged
+ inptrs[15] = inptrs_raw[15];
+ inptrs[16] = inptrs_raw[16];
+ inptrs[17] = inptrs_raw[17];
+ inptrs[18] = inptrs_raw[18];
+ inptrs[19] = inptrs_raw[19];
+ inptrs[20] = inptrs_raw[20];
+ inptrs[21] = inptrs_raw[21];
+ inptrs[22] = inptrs_raw[22];
+ inptrs[23] = inptrs_raw[23];
+ inptrs[24] = inptrs_raw[24];
+ inptrs[25] = inptrs_raw[25];
+ inptrs[26] = inptrs_raw[26];
+ inptrs[27] = inptrs_raw[27];
+ inptrs[28] = inptrs_raw[28];
+ inptrs[29] = inptrs_raw[29];
+ inptrs[30] = inptrs_raw[30];
+ inptrs[31] = inptrs_raw[31];
+ inptrs[32] = inptrs_raw[32];
+ inptrs[33] = inptrs_raw[33];
+ inptrs[34] = inptrs_raw[34];
+ inptrs[35] = inptrs_raw[35];
+
+ }
+ };
+
+ const Params params(n_channels, inptrs, weights, bias, qp,
+ requant_muls, requant_shifts, outptrs);
+
+ __asm__ __volatile__(
+ "mov x2, #0x0\n"
+ "mov x24, x2\n"
+ "ldr x23, [%x[params], %[offsetof_Params_requant]]\n"
+ "ldr x3, [%x[params], %[offsetof_Params_n_channels]]\n"
+ "ptrue p4.b\n"
+ "ldr x22, [%x[params], %[offsetof_Params_outptrs]]\n"
+ "incw x24\n"
+ "ldr x4, [%x[params], %[offsetof_Params_weights]]\n"
+ "add x21, x23, %[offsetof_Requantize32_a_offset]\n"
+ "add x20, x23, %[offsetof_Requantize32_b_offset]\n"
+ "ld1rb { z30.b }, p4/Z, [x21]\n"
+ "ld1rb { z10.b }, p4/Z, [x20]\n"
+ "add x21, x23, %[offsetof_Requantize32_c_offset]\n"
+ "add x20, x23, %[offsetof_Requantize32_minval]\n"
+ "ld1rh { z15.h }, p4/Z, [x21]\n"
+ "ld1rh { z12.h }, p4/Z, [x20]\n"
+ "add x20, x23, %[offsetof_Requantize32_maxval]\n"
+ "ld1rh { z13.h }, p4/Z, [x20]\n"
+ "ldp x5, x6, [x22, #0x0]\n"
+ "whilelt p3.h, x2, x3\n"
+ "ldp x7, x8, [x22, #0x10]\n"
+ "whilelt p2.s, x2, x3\n"
+ "whilelt p1.s, x24, x3\n"
+ "ldr x10, [%x[params], %[offsetof_Params_bias]]\n"
+ "add x17, %x[params], %[offsetof_Params_inptrs]\n"
+ "ld1w { z17.s }, p2/Z, [x10]\n"
+ "ld1w { z16.s }, p1/Z, [x10, #1, MUL VL]\n"
+ "uzp1 z14.s, z17.s, z16.s\n"
+ "ld1sb { z26.h }, p4/Z, [x4]\n"
+ "ld1sb { z8.h }, p4/Z, [x4, #1, MUL VL]\n"
+ "uzp2 z23.s, z17.s, z16.s\n"
+ "addvl x10, x10, #2\n"
+ "ld1sb { z16.h }, p4/Z, [x4, #2, MUL VL]\n"
+ "ld1sb { z21.h }, p4/Z, [x4, #3, MUL VL]\n"
+ "mov x16, #0x0\n"
+ "mov z6.d, z14.d\n"
+ "ld1sb { z17.h }, p4/Z, [x4, #4, MUL VL]\n"
+ "ldp x9, x28, [x17, #0x0]\n"
+ "mov z18.d, z23.d\n"
+ "mov z9.d, z14.d\n"
+ "ldp x27, x26, [x17, #0x10]\n"
+ "ldp x25, x24, [x17, #0x20]\n"
+ "mov z20.d, z23.d\n"
+ "mov z7.d, z14.d\n"
+ "ldp x23, x22, [x17, #0x30]\n"
+ "ldp x21, x20, [x17, #0x40]\n"
+ "mov z1.d, z23.d\n"
+ ".inst 0x454a135a // ssublb z26.h, z26.b, z10.b\n"
+ "ld1b { z22.h }, p3/Z, [x9, x2]\n"
+ "ld1b { z2.h }, p3/Z, [x28, x2]\n"
+ ".inst 0x454a1108 // ssublb z8.h, z8.b, z10.b\n"
+ ".inst 0x454a1210 // ssublb z16.h, z16.b, z10.b\n"
+ "ld1b { z11.h }, p3/Z, [x27, x2]\n"
+ "ld1b { z3.h }, p3/Z, [x26, x2]\n"
+ ".inst 0x454a12b5 // ssublb z21.h, z21.b, z10.b\n"
+ ".inst 0x454a1231 // ssublb z17.h, z17.b, z10.b\n"
+ "ld1b { z29.h }, p3/Z, [x25, x2]\n"
+ "ld1b { z4.h }, p3/Z, [x24, x2]\n"
+ ".inst 0x455e1ad6 // usublb z22.h, z22.b, z30.b\n"
+ ".inst 0x455e1842 // usublb z2.h, z2.b, z30.b\n"
+ "ld1b { z31.h }, p3/Z, [x23, x2]\n"
+ "ld1b { z0.h }, p3/Z, [x22, x2]\n"
+ ".inst 0x455e196b // usublb z11.h, z11.b, z30.b\n"
+ ".inst 0x455e1863 // usublb z3.h, z3.b, z30.b\n"
+ "ld1b { z19.h }, p3/Z, [x21, x2]\n"
+ "ld1b { z28.h }, p3/Z, [x20, x2]\n"
+ ".inst 0x455e1bbd // usublb z29.h, z29.b, z30.b\n"
+ ".inst 0x455e1884 // usublb z4.h, z4.b, z30.b\n"
+ "ldr x15, [%x[params], %[offsetof_Params_requant_muls]]\n"
+ "ldr x14, [%x[params], %[offsetof_Params_requant_shifts]]\n"
+ "str x10, [%x[params], %[offsetof_Params_bias]]\n"
+ ".inst 0x455e1bff // usublb z31.h, z31.b, z30.b\n"
+ ".inst 0x455e1800 // usublb z0.h, z0.b, z30.b\n"
+ ".inst 0x455e1a73 // usublb z19.h, z19.b, z30.b\n"
+ ".inst 0x455e1b9c // usublb z28.h, z28.b, z30.b\n"
+ "1:" // Loop
+ ".inst 0x449a42ce // smlalb z14.s, p4/M, z22.h, z26.h\n"
+ ".inst 0x449a46d7 // smlalt z23.s, p4/M, z22.h, z26.h\n"
+ "ldr x20, [x17, #0x50]\n"
+ "ld1b { z27.h }, p3/Z, [x20, x2]\n"
+ ".inst 0x4488404e // smlalb z14.s, p4/M, z2.h, z8.h\n"
+ ".inst 0x449a4046 // smlalb z6.s, p4/M, z2.h, z26.h\n"
+ "ldr x20, [x17, #0x58]\n"
+ ".inst 0x455e1b7b // usublb z27.h, z27.b, z30.b\n"
+ ".inst 0x449a4169 // smlalb z9.s, p4/M, z11.h, z26.h\n"
+ ".inst 0x449a4067 // smlalb z7.s, p4/M, z3.h, z26.h\n"
+ "ld1b { z5.h }, p3/Z, [x20, x2]\n"
+ "ldr x20, [x17, #0x60]\n"
+ ".inst 0x44884457 // smlalt z23.s, p4/M, z2.h, z8.h\n"
+ ".inst 0x449043ae // smlalb z14.s, p4/M, z29.h, z16.h\n"
+ "ld1sb { z25.h }, p4/Z, [x4, #5, MUL VL]\n"
+ ".inst 0x455e18a5 // usublb z5.h, z5.b, z30.b\n"
+ ".inst 0x449a4452 // smlalt z18.s, p4/M, z2.h, z26.h\n"
+ ".inst 0x449a4574 // smlalt z20.s, p4/M, z11.h, z26.h\n"
+ "ld1b { z22.h }, p3/Z, [x20, x2]\n"
+ ".inst 0x454a1339 // ssublb z25.h, z25.b, z10.b\n"
+ ".inst 0x449a4461 // smlalt z1.s, p4/M, z3.h, z26.h\n"
+ ".inst 0x448843a6 // smlalb z6.s, p4/M, z29.h, z8.h\n"
+ "ldr x20, [x17, #0x68]\n"
+ "ld1sb { z2.h }, p4/Z, [x4, #6, MUL VL]\n"
+ ".inst 0x44884069 // smlalb z9.s, p4/M, z3.h, z8.h\n"
+ ".inst 0x44884087 // smlalb z7.s, p4/M, z4.h, z8.h\n"
+ ".inst 0x455e1ad6 // usublb z22.h, z22.b, z30.b\n"
+ "ld1b { z26.h }, p3/Z, [x20, x2]\n"
+ ".inst 0x449047b7 // smlalt z23.s, p4/M, z29.h, z16.h\n"
+ ".inst 0x449543ee // smlalb z14.s, p4/M, z31.h, z21.h\n"
+ ".inst 0x454a1042 // ssublb z2.h, z2.b, z10.b\n"
+ "ldr x20, [x17, #0x70]\n"
+ ".inst 0x448847b2 // smlalt z18.s, p4/M, z29.h, z8.h\n"
+ ".inst 0x44884474 // smlalt z20.s, p4/M, z3.h, z8.h\n"
+ "ld1sb { z29.h }, p4/Z, [x4, #7, MUL VL]\n"
+ ".inst 0x455e1b5a // usublb z26.h, z26.b, z30.b\n"
+ ".inst 0x44884481 // smlalt z1.s, p4/M, z4.h, z8.h\n"
+ ".inst 0x449043e6 // smlalb z6.s, p4/M, z31.h, z16.h\n"
+ "inch x4, ALL, MUL #8\n"
+ "ld1b { z8.h }, p3/Z, [x20, x2]\n"
+ ".inst 0x44904089 // smlalb z9.s, p4/M, z4.h, z16.h\n"
+ ".inst 0x44904367 // smlalb z7.s, p4/M, z27.h, z16.h\n"
+ ".inst 0x454a13bd // ssublb z29.h, z29.b, z10.b\n"
+ "ldr x20, [x17, #0x78]\n"
+ ".inst 0x449547f7 // smlalt z23.s, p4/M, z31.h, z21.h\n"
+ ".inst 0x4491400e // smlalb z14.s, p4/M, z0.h, z17.h\n"
+ "ld1sb { z24.h }, p4/Z, [x4]\n"
+ ".inst 0x455e1908 // usublb z8.h, z8.b, z30.b\n"
+ ".inst 0x449047f2 // smlalt z18.s, p4/M, z31.h, z16.h\n"
+ ".inst 0x44904494 // smlalt z20.s, p4/M, z4.h, z16.h\n"
+ "ld1b { z31.h }, p3/Z, [x20, x2]\n"
+ ".inst 0x454a1318 // ssublb z24.h, z24.b, z10.b\n"
+ ".inst 0x44904761 // smlalt z1.s, p4/M, z27.h, z16.h\n"
+ ".inst 0x44954006 // smlalb z6.s, p4/M, z0.h, z21.h\n"
+ "ldr x22, [x17, #0x80]\n"
+ "ld1sb { z16.h }, p4/Z, [x4, #1, MUL VL]\n"
+ ".inst 0x44954369 // smlalb z9.s, p4/M, z27.h, z21.h\n"
+ ".inst 0x449540a7 // smlalb z7.s, p4/M, z5.h, z21.h\n"
+ ".inst 0x455e1bff // usublb z31.h, z31.b, z30.b\n"
+ "ldr x21, [x17, #0x88]\n"
+ ".inst 0x44914417 // smlalt z23.s, p4/M, z0.h, z17.h\n"
+ ".inst 0x4499416e // smlalb z14.s, p4/M, z11.h, z25.h\n"
+ ".inst 0x454a1210 // ssublb z16.h, z16.b, z10.b\n"
+ "ldr x20, [x17, #0x90]\n"
+ ".inst 0x44954412 // smlalt z18.s, p4/M, z0.h, z21.h\n"
+ ".inst 0x44954774 // smlalt z20.s, p4/M, z27.h, z21.h\n"
+ "ld1b { z0.h }, p3/Z, [x22, x2]\n"
+ ".inst 0x455e1800 // usublb z0.h, z0.b, z30.b\n"
+ ".inst 0x449544a1 // smlalt z1.s, p4/M, z5.h, z21.h\n"
+ ".inst 0x449142c6 // smlalb z6.s, p4/M, z22.h, z17.h\n"
+ "ld1sb { z21.h }, p4/Z, [x4, #2, MUL VL]\n"
+ ".inst 0x454a12b5 // ssublb z21.h, z21.b, z10.b\n"
+ ".inst 0x449140a9 // smlalb z9.s, p4/M, z5.h, z17.h\n"
+ ".inst 0x44914267 // smlalb z7.s, p4/M, z19.h, z17.h\n"
+ "ldr x23, [x17, #0x98]\n"
+ "ldr x22, [x17, #0xa0]\n"
+ ".inst 0x44994577 // smlalt z23.s, p4/M, z11.h, z25.h\n"
+ ".inst 0x4482406e // smlalb z14.s, p4/M, z3.h, z2.h\n"
+ "ld1b { z11.h }, p3/Z, [x21, x2]\n"
+ ".inst 0x455e196b // usublb z11.h, z11.b, z30.b\n"
+ ".inst 0x449146d2 // smlalt z18.s, p4/M, z22.h, z17.h\n"
+ ".inst 0x449144b4 // smlalt z20.s, p4/M, z5.h, z17.h\n"
+ "ld1sb { z22.h }, p4/Z, [x4, #3, MUL VL]\n"
+ ".inst 0x454a12d6 // ssublb z22.h, z22.b, z10.b\n"
+ ".inst 0x44914661 // smlalt z1.s, p4/M, z19.h, z17.h\n"
+ ".inst 0x44994066 // smlalb z6.s, p4/M, z3.h, z25.h\n"
+ "ld1b { z17.h }, p3/Z, [x20, x2]\n"
+ ".inst 0x455e1a31 // usublb z17.h, z17.b, z30.b\n"
+ ".inst 0x44994389 // smlalb z9.s, p4/M, z28.h, z25.h\n"
+ ".inst 0x44994347 // smlalb z7.s, p4/M, z26.h, z25.h\n"
+ "ldr x20, [x17, #0xa8]\n"
+ "ldr x21, [x17, #0xb0]\n"
+ ".inst 0x44824477 // smlalt z23.s, p4/M, z3.h, z2.h\n"
+ ".inst 0x449d408e // smlalb z14.s, p4/M, z4.h, z29.h\n"
+ "ldr x13, [x17, #0xb8]\n"
+ "ldr x12, [x17, #0xc0]\n"
+ ".inst 0x44994472 // smlalt z18.s, p4/M, z3.h, z25.h\n"
+ ".inst 0x44994794 // smlalt z20.s, p4/M, z28.h, z25.h\n"
+ "ld1b { z3.h }, p3/Z, [x23, x2]\n"
+ ".inst 0x455e1863 // usublb z3.h, z3.b, z30.b\n"
+ ".inst 0x44994741 // smlalt z1.s, p4/M, z26.h, z25.h\n"
+ ".inst 0x44824086 // smlalb z6.s, p4/M, z4.h, z2.h\n"
+ "ld1sb { z25.h }, p4/Z, [x4, #4, MUL VL]\n"
+ ".inst 0x454a1339 // ssublb z25.h, z25.b, z10.b\n"
+ ".inst 0x44824349 // smlalb z9.s, p4/M, z26.h, z2.h\n"
+ ".inst 0x44824107 // smlalb z7.s, p4/M, z8.h, z2.h\n"
+ "ldr x11, [x17, #0xc8]\n"
+ "ldr x10, [x17, #0xd0]\n"
+ ".inst 0x449d4497 // smlalt z23.s, p4/M, z4.h, z29.h\n"
+ ".inst 0x4498436e // smlalb z14.s, p4/M, z27.h, z24.h\n"
+ "ldr x9, [x17, #0xd8]\n"
+ "ldr x28, [x17, #0xe0]\n"
+ ".inst 0x44824492 // smlalt z18.s, p4/M, z4.h, z2.h\n"
+ ".inst 0x44824754 // smlalt z20.s, p4/M, z26.h, z2.h\n"
+ "ld1b { z4.h }, p3/Z, [x22, x2]\n"
+ ".inst 0x455e1884 // usublb z4.h, z4.b, z30.b\n"
+ ".inst 0x44824501 // smlalt z1.s, p4/M, z8.h, z2.h\n"
+ ".inst 0x449d4366 // smlalb z6.s, p4/M, z27.h, z29.h\n"
+ "ld1sb { z2.h }, p4/Z, [x4, #5, MUL VL]\n"
+ ".inst 0x454a1042 // ssublb z2.h, z2.b, z10.b\n"
+ ".inst 0x449d4109 // smlalb z9.s, p4/M, z8.h, z29.h\n"
+ ".inst 0x449d43e7 // smlalb z7.s, p4/M, z31.h, z29.h\n"
+ "ldr x27, [x17, #0xe8]\n"
+ "ldr x26, [x17, #0xf0]\n"
+ ".inst 0x44984777 // smlalt z23.s, p4/M, z27.h, z24.h\n"
+ ".inst 0x449040ae // smlalb z14.s, p4/M, z5.h, z16.h\n"
+ "ldr x25, [x17, #0xf8]\n"
+ "ldr x24, [x17, #0x100]\n"
+ ".inst 0x449d4772 // smlalt z18.s, p4/M, z27.h, z29.h\n"
+ ".inst 0x449d4514 // smlalt z20.s, p4/M, z8.h, z29.h\n"
+ "ld1b { z27.h }, p3/Z, [x20, x2]\n"
+ ".inst 0x455e1b7b // usublb z27.h, z27.b, z30.b\n"
+ ".inst 0x449d47e1 // smlalt z1.s, p4/M, z31.h, z29.h\n"
+ ".inst 0x449840a6 // smlalb z6.s, p4/M, z5.h, z24.h\n"
+ "ld1sb { z29.h }, p4/Z, [x4, #6, MUL VL]\n"
+ ".inst 0x454a13bd // ssublb z29.h, z29.b, z10.b\n"
+ ".inst 0x449843e9 // smlalb z9.s, p4/M, z31.h, z24.h\n"
+ ".inst 0x44984007 // smlalb z7.s, p4/M, z0.h, z24.h\n"
+ "ldr x23, [x17, #0x108]\n"
+ "ldr x22, [x17, #0x110]\n"
+ ".inst 0x449044b7 // smlalt z23.s, p4/M, z5.h, z16.h\n"
+ ".inst 0x4495438e // smlalb z14.s, p4/M, z28.h, z21.h\n"
+ "ldr x20, [x17, #0x118]\n"
+ "whilelt p0.h, x16, x3\n"
+ ".inst 0x449844b2 // smlalt z18.s, p4/M, z5.h, z24.h\n"
+ ".inst 0x449847f4 // smlalt z20.s, p4/M, z31.h, z24.h\n"
+ "ld1b { z5.h }, p3/Z, [x21, x2]\n"
+ ".inst 0x455e18a5 // usublb z5.h, z5.b, z30.b\n"
+ ".inst 0x44984401 // smlalt z1.s, p4/M, z0.h, z24.h\n"
+ ".inst 0x44904266 // smlalb z6.s, p4/M, z19.h, z16.h\n"
+ "ld1sb { z24.h }, p4/Z, [x4, #7, MUL VL]\n"
+ "inch x4, ALL, MUL #8\n"
+ ".inst 0x44904009 // smlalb z9.s, p4/M, z0.h, z16.h\n"
+ ".inst 0x44904167 // smlalb z7.s, p4/M, z11.h, z16.h\n"
+ ".inst 0x454a1318 // ssublb z24.h, z24.b, z10.b\n"
+ "ldr x21, [%x[params], %[offsetof_Params_bias]]\n"
+ ".inst 0x44954797 // smlalt z23.s, p4/M, z28.h, z21.h\n"
+ ".inst 0x4496434e // smlalb z14.s, p4/M, z26.h, z22.h\n"
+ "ld1b { z28.h }, p3/Z, [x13, x2]\n"
+ ".inst 0x455e1b9c // usublb z28.h, z28.b, z30.b\n"
+ ".inst 0x44904672 // smlalt z18.s, p4/M, z19.h, z16.h\n"
+ ".inst 0x44904414 // smlalt z20.s, p4/M, z0.h, z16.h\n"
+ "ld1sb { z19.h }, p4/Z, [x4]\n"
+ ".inst 0x454a1273 // ssublb z19.h, z19.b, z10.b\n"
+ ".inst 0x44904561 // smlalt z1.s, p4/M, z11.h, z16.h\n"
+ ".inst 0x44954346 // smlalb z6.s, p4/M, z26.h, z21.h\n"
+ "ld1b { z16.h }, p3/Z, [x12, x2]\n"
+ ".inst 0x455e1a10 // usublb z16.h, z16.b, z30.b\n"
+ ".inst 0x44954229 // smlalb z9.s, p4/M, z17.h, z21.h\n"
+ ".inst 0x44954067 // smlalb z7.s, p4/M, z3.h, z21.h\n"
+ ".inst 0x44964757 // smlalt z23.s, p4/M, z26.h, z22.h\n"
+ ".inst 0x4499410e // smlalb z14.s, p4/M, z8.h, z25.h\n"
+ ".inst 0x44954752 // smlalt z18.s, p4/M, z26.h, z21.h\n"
+ ".inst 0x44954634 // smlalt z20.s, p4/M, z17.h, z21.h\n"
+ "ld1b { z26.h }, p3/Z, [x11, x2]\n"
+ ".inst 0x455e1b5a // usublb z26.h, z26.b, z30.b\n"
+ ".inst 0x44954461 // smlalt z1.s, p4/M, z3.h, z21.h\n"
+ ".inst 0x44964106 // smlalb z6.s, p4/M, z8.h, z22.h\n"
+ "ld1sb { z21.h }, p4/Z, [x4, #1, MUL VL]\n"
+ ".inst 0x454a12b5 // ssublb z21.h, z21.b, z10.b\n"
+ ".inst 0x44964069 // smlalb z9.s, p4/M, z3.h, z22.h\n"
+ ".inst 0x44964087 // smlalb z7.s, p4/M, z4.h, z22.h\n"
+ ".inst 0x44994517 // smlalt z23.s, p4/M, z8.h, z25.h\n"
+ ".inst 0x448243ee // smlalb z14.s, p4/M, z31.h, z2.h\n"
+ ".inst 0x44964512 // smlalt z18.s, p4/M, z8.h, z22.h\n"
+ ".inst 0x44964474 // smlalt z20.s, p4/M, z3.h, z22.h\n"
+ "ld1b { z8.h }, p3/Z, [x10, x2]\n"
+ ".inst 0x455e1908 // usublb z8.h, z8.b, z30.b\n"
+ ".inst 0x44964481 // smlalt z1.s, p4/M, z4.h, z22.h\n"
+ ".inst 0x449943e6 // smlalb z6.s, p4/M, z31.h, z25.h\n"
+ "ld1sb { z22.h }, p4/Z, [x4, #2, MUL VL]\n"
+ ".inst 0x454a12d6 // ssublb z22.h, z22.b, z10.b\n"
+ ".inst 0x44994089 // smlalb z9.s, p4/M, z4.h, z25.h\n"
+ ".inst 0x44994367 // smlalb z7.s, p4/M, z27.h, z25.h\n"
+ ".inst 0x448247f7 // smlalt z23.s, p4/M, z31.h, z2.h\n"
+ ".inst 0x449d400e // smlalb z14.s, p4/M, z0.h, z29.h\n"
+ ".inst 0x449947f2 // smlalt z18.s, p4/M, z31.h, z25.h\n"
+ ".inst 0x44994494 // smlalt z20.s, p4/M, z4.h, z25.h\n"
+ "ld1b { z31.h }, p3/Z, [x9, x2]\n"
+ ".inst 0x455e1bff // usublb z31.h, z31.b, z30.b\n"
+ ".inst 0x44994761 // smlalt z1.s, p4/M, z27.h, z25.h\n"
+ ".inst 0x44824006 // smlalb z6.s, p4/M, z0.h, z2.h\n"
+ "ld1sb { z25.h }, p4/Z, [x4, #3, MUL VL]\n"
+ ".inst 0x454a1339 // ssublb z25.h, z25.b, z10.b\n"
+ ".inst 0x44824369 // smlalb z9.s, p4/M, z27.h, z2.h\n"
+ ".inst 0x448240a7 // smlalb z7.s, p4/M, z5.h, z2.h\n"
+ ".inst 0x449d4417 // smlalt z23.s, p4/M, z0.h, z29.h\n"
+ ".inst 0x4498422e // smlalb z14.s, p4/M, z17.h, z24.h\n"
+ ".inst 0x44824412 // smlalt z18.s, p4/M, z0.h, z2.h\n"
+ ".inst 0x44824774 // smlalt z20.s, p4/M, z27.h, z2.h\n"
+ "ld1b { z0.h }, p3/Z, [x28, x2]\n"
+ ".inst 0x455e1800 // usublb z0.h, z0.b, z30.b\n"
+ ".inst 0x448244a1 // smlalt z1.s, p4/M, z5.h, z2.h\n"
+ ".inst 0x449d4166 // smlalb z6.s, p4/M, z11.h, z29.h\n"
+ "ld1sb { z2.h }, p4/Z, [x4, #4, MUL VL]\n"
+ ".inst 0x454a1042 // ssublb z2.h, z2.b, z10.b\n"
+ ".inst 0x449d40a9 // smlalb z9.s, p4/M, z5.h, z29.h\n"
+ ".inst 0x449d4387 // smlalb z7.s, p4/M, z28.h, z29.h\n"
+ ".inst 0x44984637 // smlalt z23.s, p4/M, z17.h, z24.h\n"
+ ".inst 0x4493406e // smlalb z14.s, p4/M, z3.h, z19.h\n"
+ "ld1b { z17.h }, p3/Z, [x27, x2]\n"
+ ".inst 0x455e1a31 // usublb z17.h, z17.b, z30.b\n"
+ ".inst 0x449d4572 // smlalt z18.s, p4/M, z11.h, z29.h\n"
+ ".inst 0x449d44b4 // smlalt z20.s, p4/M, z5.h, z29.h\n"
+ "ld1sb { z11.h }, p4/Z, [x4, #5, MUL VL]\n"
+ ".inst 0x454a116b // ssublb z11.h, z11.b, z10.b\n"
+ ".inst 0x449d4781 // smlalt z1.s, p4/M, z28.h, z29.h\n"
+ ".inst 0x44984066 // smlalb z6.s, p4/M, z3.h, z24.h\n"
+ "ld1b { z29.h }, p3/Z, [x26, x2]\n"
+ ".inst 0x455e1bbd // usublb z29.h, z29.b, z30.b\n"
+ ".inst 0x44984209 // smlalb z9.s, p4/M, z16.h, z24.h\n"
+ ".inst 0x44984347 // smlalb z7.s, p4/M, z26.h, z24.h\n"
+ ".inst 0x44934477 // smlalt z23.s, p4/M, z3.h, z19.h\n"
+ ".inst 0x4495408e // smlalb z14.s, p4/M, z4.h, z21.h\n"
+ ".inst 0x44984472 // smlalt z18.s, p4/M, z3.h, z24.h\n"
+ ".inst 0x44984614 // smlalt z20.s, p4/M, z16.h, z24.h\n"
+ "ld1b { z3.h }, p3/Z, [x25, x2]\n"
+ ".inst 0x455e1863 // usublb z3.h, z3.b, z30.b\n"
+ ".inst 0x44984741 // smlalt z1.s, p4/M, z26.h, z24.h\n"
+ ".inst 0x44934086 // smlalb z6.s, p4/M, z4.h, z19.h\n"
+ "ld1sb { z24.h }, p4/Z, [x4, #6, MUL VL]\n"
+ ".inst 0x454a1318 // ssublb z24.h, z24.b, z10.b\n"
+ ".inst 0x44934349 // smlalb z9.s, p4/M, z26.h, z19.h\n"
+ ".inst 0x44934107 // smlalb z7.s, p4/M, z8.h, z19.h\n"
+ ".inst 0x44954497 // smlalt z23.s, p4/M, z4.h, z21.h\n"
+ ".inst 0x4496436e // smlalb z14.s, p4/M, z27.h, z22.h\n"
+ ".inst 0x44934492 // smlalt z18.s, p4/M, z4.h, z19.h\n"
+ ".inst 0x44934754 // smlalt z20.s, p4/M, z26.h, z19.h\n"
+ "ld1b { z4.h }, p3/Z, [x24, x2]\n"
+ ".inst 0x455e1884 // usublb z4.h, z4.b, z30.b\n"
+ ".inst 0x44934501 // smlalt z1.s, p4/M, z8.h, z19.h\n"
+ ".inst 0x44954366 // smlalb z6.s, p4/M, z27.h, z21.h\n"
+ "ld1sb { z19.h }, p4/Z, [x4, #7, MUL VL]\n"
+ "inch x4, ALL, MUL #8\n"
+ ".inst 0x44954109 // smlalb z9.s, p4/M, z8.h, z21.h\n"
+ ".inst 0x449543e7 // smlalb z7.s, p4/M, z31.h, z21.h\n"
+ ".inst 0x454a1273 // ssublb z19.h, z19.b, z10.b\n"
+ ".inst 0x44964777 // smlalt z23.s, p4/M, z27.h, z22.h\n"
+ ".inst 0x449940ae // smlalb z14.s, p4/M, z5.h, z25.h\n"
+ ".inst 0x44954772 // smlalt z18.s, p4/M, z27.h, z21.h\n"
+ ".inst 0x44954514 // smlalt z20.s, p4/M, z8.h, z21.h\n"
+ "ld1b { z27.h }, p3/Z, [x23, x2]\n"
+ ".inst 0x455e1b7b // usublb z27.h, z27.b, z30.b\n"
+ ".inst 0x449547e1 // smlalt z1.s, p4/M, z31.h, z21.h\n"
+ ".inst 0x449640a6 // smlalb z6.s, p4/M, z5.h, z22.h\n"
+ "ld1sb { z21.h }, p4/Z, [x4]\n"
+ ".inst 0x454a12b5 // ssublb z21.h, z21.b, z10.b\n"
+ ".inst 0x449643e9 // smlalb z9.s, p4/M, z31.h, z22.h\n"
+ ".inst 0x44964007 // smlalb z7.s, p4/M, z0.h, z22.h\n"
+ "inch x4\n"
+ ".inst 0x449944b7 // smlalt z23.s, p4/M, z5.h, z25.h\n"
+ ".inst 0x4482420e // smlalb z14.s, p4/M, z16.h, z2.h\n"
+ ".inst 0x449644b2 // smlalt z18.s, p4/M, z5.h, z22.h\n"
+ ".inst 0x449647f4 // smlalt z20.s, p4/M, z31.h, z22.h\n"
+ "ld1b { z5.h }, p3/Z, [x22, x2]\n"
+ ".inst 0x455e18a5 // usublb z5.h, z5.b, z30.b\n"
+ ".inst 0x44964401 // smlalt z1.s, p4/M, z0.h, z22.h\n"
+ ".inst 0x44994386 // smlalb z6.s, p4/M, z28.h, z25.h\n"
+ "ld1w { z22.s }, p2/Z, [x15]\n"
+ ".inst 0x44994009 // smlalb z9.s, p4/M, z0.h, z25.h\n"
+ ".inst 0x44994227 // smlalb z7.s, p4/M, z17.h, z25.h\n"
+ ".inst 0x44824617 // smlalt z23.s, p4/M, z16.h, z2.h\n"
+ ".inst 0x448b434e // smlalb z14.s, p4/M, z26.h, z11.h\n"
+ "ld1w { z16.s }, p1/Z, [x15, #1, MUL VL]\n"
+ "addvl x15, x15, #2\n"
+ ".inst 0x44994792 // smlalt z18.s, p4/M, z28.h, z25.h\n"
+ ".inst 0x44994414 // smlalt z20.s, p4/M, z0.h, z25.h\n"
+ "ld1b { z28.h }, p3/Z, [x20, x2]\n"
+ ".inst 0x455e1b9c // usublb z28.h, z28.b, z30.b\n"
+ ".inst 0x44994621 // smlalt z1.s, p4/M, z17.h, z25.h\n"
+ ".inst 0x44824346 // smlalb z6.s, p4/M, z26.h, z2.h\n"
+ "uzp1 z25.s, z22.s, z16.s\n"
+ "inch x2\n"
+ ".inst 0x448243a9 // smlalb z9.s, p4/M, z29.h, z2.h\n"
+ ".inst 0x44824067 // smlalb z7.s, p4/M, z3.h, z2.h\n"
+ "uzp2 z16.s, z22.s, z16.s\n"
+ "ld1w { z22.s }, p2/Z, [x14]\n"
+ ".inst 0x448b4757 // smlalt z23.s, p4/M, z26.h, z11.h\n"
+ ".inst 0x4498410e // smlalb z14.s, p4/M, z8.h, z24.h\n"
+ "mov x20, x2\n"
+ "incw x20\n"
+ ".inst 0x44824752 // smlalt z18.s, p4/M, z26.h, z2.h\n"
+ ".inst 0x448247b4 // smlalt z20.s, p4/M, z29.h, z2.h\n"
+ "ld1w { z26.s }, p1/Z, [x14, #1, MUL VL]\n"
+ "uzp1 z29.s, z22.s, z26.s\n"
+ ".inst 0x44824461 // smlalt z1.s, p4/M, z3.h, z2.h\n"
+ ".inst 0x448b4106 // smlalb z6.s, p4/M, z8.h, z11.h\n"
+ "uzp2 z22.s, z22.s, z26.s\n"
+ "whilelt p2.s, x2, x3\n"
+ ".inst 0x448b4069 // smlalb z9.s, p4/M, z3.h, z11.h\n"
+ ".inst 0x448b4087 // smlalb z7.s, p4/M, z4.h, z11.h\n"
+ "whilelt p1.s, x20, x3\n"
+ "whilelt p3.h, x2, x3\n"
+ ".inst 0x44984517 // smlalt z23.s, p4/M, z8.h, z24.h\n"
+ ".inst 0x449343ee // smlalb z14.s, p4/M, z31.h, z19.h\n"
+ "addvl x14, x14, #2\n"
+ ".inst 0x448b4512 // smlalt z18.s, p4/M, z8.h, z11.h\n"
+ ".inst 0x448b4474 // smlalt z20.s, p4/M, z3.h, z11.h\n"
+ ".inst 0x448b4481 // smlalt z1.s, p4/M, z4.h, z11.h\n"
+ ".inst 0x449843e6 // smlalb z6.s, p4/M, z31.h, z24.h\n"
+ ".inst 0x44984089 // smlalb z9.s, p4/M, z4.h, z24.h\n"
+ ".inst 0x44984367 // smlalb z7.s, p4/M, z27.h, z24.h\n"
+ ".inst 0x449347f7 // smlalt z23.s, p4/M, z31.h, z19.h\n"
+ ".inst 0x4495400e // smlalb z14.s, p4/M, z0.h, z21.h\n"
+ ".inst 0x04b975ce // sqrdmulh z14.s, z14.s, z25.s\n"
+ ".inst 0x449847f2 // smlalt z18.s, p4/M, z31.h, z24.h\n"
+ ".inst 0x44984494 // smlalt z20.s, p4/M, z4.h, z24.h\n"
+ "and z3.d, z14.d, z29.d\n"
+ ".inst 0x44984761 // smlalt z1.s, p4/M, z27.h, z24.h\n"
+ ".inst 0x44934006 // smlalb z6.s, p4/M, z0.h, z19.h\n"
+ "asr z3.s, z3.s, #0x1f\n"
+ ".inst 0x44934369 // smlalb z9.s, p4/M, z27.h, z19.h\n"
+ ".inst 0x449340a7 // smlalb z7.s, p4/M, z5.h, z19.h\n"
+ "sqadd z14.s, z14.s, z3.s\n"
+ ".inst 0x448293ae // srshl z14.s, p4/M, z14.s, z29.s\n"
+ ".inst 0x44954417 // smlalt z23.s, p4/M, z0.h, z21.h\n"
+ ".inst 0x44934412 // smlalt z18.s, p4/M, z0.h, z19.h\n"
+ ".inst 0x04b076f7 // sqrdmulh z23.s, z23.s, z16.s\n"
+ ".inst 0x44934774 // smlalt z20.s, p4/M, z27.h, z19.h\n"
+ ".inst 0x449344a1 // smlalt z1.s, p4/M, z5.h, z19.h\n"
+ "and z31.d, z23.d, z22.d\n"
+ ".inst 0x44954226 // smlalb z6.s, p4/M, z17.h, z21.h\n"
+ ".inst 0x449540a9 // smlalb z9.s, p4/M, z5.h, z21.h\n"
+ ".inst 0x04b974c6 // sqrdmulh z6.s, z6.s, z25.s\n"
+ ".inst 0x44954387 // smlalb z7.s, p4/M, z28.h, z21.h\n"
+ ".inst 0x44954632 // smlalt z18.s, p4/M, z17.h, z21.h\n"
+ ".inst 0x04b97529 // sqrdmulh z9.s, z9.s, z25.s\n"
+ ".inst 0x449544b4 // smlalt z20.s, p4/M, z5.h, z21.h\n"
+ ".inst 0x44954781 // smlalt z1.s, p4/M, z28.h, z21.h\n"
+ ".inst 0x04b974e7 // sqrdmulh z7.s, z7.s, z25.s\n"
+ "asr z31.s, z31.s, #0x1f\n"
+ "and z3.d, z6.d, z29.d\n"
+ ".inst 0x04b07652 // sqrdmulh z18.s, z18.s, z16.s\n"
+ "and z0.d, z9.d, z29.d\n"
+ ".inst 0x04b07694 // sqrdmulh z20.s, z20.s, z16.s\n"
+ "and z19.d, z7.d, z29.d\n"
+ ".inst 0x04b07421 // sqrdmulh z1.s, z1.s, z16.s\n"
+ "sqadd z23.s, z23.s, z31.s\n"
+ ".inst 0x448292d7 // srshl z23.s, p4/M, z23.s, z22.s\n"
+ "asr z3.s, z3.s, #0x1f\n"
+ "and z21.d, z18.d, z22.d\n"
+ "asr z0.s, z0.s, #0x1f\n"
+ "and z17.d, z20.d, z22.d\n"
+ "asr z19.s, z19.s, #0x1f\n"
+ "and z16.d, z1.d, z22.d\n"
+ "sqadd z6.s, z6.s, z3.s\n"
+ "asr z21.s, z21.s, #0x1f\n"
+ ".inst 0x448293a6 // srshl z6.s, p4/M, z6.s, z29.s\n"
+ "sqadd z9.s, z9.s, z0.s\n"
+ "asr z17.s, z17.s, #0x1f\n"
+ ".inst 0x448293a9 // srshl z9.s, p4/M, z9.s, z29.s\n"
+ "sqadd z7.s, z7.s, z19.s\n"
+ "asr z16.s, z16.s, #0x1f\n"
+ ".inst 0x448293a7 // srshl z7.s, p4/M, z7.s, z29.s\n"
+ "sqadd z18.s, z18.s, z21.s\n"
+ "sqadd z20.s, z20.s, z17.s\n"
+ ".inst 0x448292d2 // srshl z18.s, p4/M, z18.s, z22.s\n"
+ ".inst 0x448292d4 // srshl z20.s, p4/M, z20.s, z22.s\n"
+ "sqadd z1.s, z1.s, z16.s\n"
+ ".inst 0x453041ce // sqxtnb z14.h, z14.s\n"
+ ".inst 0x448292c1 // srshl z1.s, p4/M, z1.s, z22.s\n"
+ ".inst 0x453040c6 // sqxtnb z6.h, z6.s\n"
+ ".inst 0x45304129 // sqxtnb z9.h, z9.s\n"
+ ".inst 0x453040e7 // sqxtnb z7.h, z7.s\n"
+ ".inst 0x453046ee // sqxtnt z14.h, z23.s\n"
+ ".inst 0x45304646 // sqxtnt z6.h, z18.s\n"
+ ".inst 0x45304689 // sqxtnt z9.h, z20.s\n"
+ ".inst 0x45304427 // sqxtnt z7.h, z1.s\n"
+ "sqadd z14.h, z14.h, z15.h\n"
+ "smax z14.h, p4/M, z14.h, z12.h\n"
+ "smin z14.h, p4/M, z14.h, z13.h\n"
+ "sqadd z6.h, z6.h, z15.h\n"
+ "sqadd z9.h, z9.h, z15.h\n"
+ "smax z6.h, p4/M, z6.h, z12.h\n"
+ "smax z9.h, p4/M, z9.h, z12.h\n"
+ "sqadd z7.h, z7.h, z15.h\n"
+ "smax z7.h, p4/M, z7.h, z12.h\n"
+ "smin z6.h, p4/M, z6.h, z13.h\n"
+ "st1b { z14.h }, p0, [x5, x16]\n"
+ "smin z9.h, p4/M, z9.h, z13.h\n"
+ "smin z7.h, p4/M, z7.h, z13.h\n"
+ "st1b { z6.h }, p0, [x6, x16]\n"
+ "st1b { z9.h }, p0, [x7, x16]\n"
+ "st1b { z7.h }, p0, [x8, x16]\n"
+ "ld1w { z17.s }, p2/Z, [x21]\n"
+ "ld1w { z16.s }, p1/Z, [x21, #1, MUL VL]\n"
+ "uzp1 z14.s, z17.s, z16.s\n"
+ "ld1sb { z26.h }, p4/Z, [x4]\n"
+ "ld1sb { z8.h }, p4/Z, [x4, #1, MUL VL]\n"
+ "uzp2 z23.s, z17.s, z16.s\n"
+ "addvl x21, x21, #2\n"
+ "ld1sb { z16.h }, p4/Z, [x4, #2, MUL VL]\n"
+ "ld1sb { z21.h }, p4/Z, [x4, #3, MUL VL]\n"
+ "inch x16\n"
+ "str x21, [%x[params], %[offsetof_Params_bias]]\n"
+ "ld1sb { z17.h }, p4/Z, [x4, #4, MUL VL]\n"
+ "ldp x9, x28, [x17, #0x0]\n"
+ "mov z6.d, z14.d\n"
+ "mov z18.d, z23.d\n"
+ "ldp x27, x26, [x17, #0x10]\n"
+ "ldp x25, x24, [x17, #0x20]\n"
+ "mov z9.d, z14.d\n"
+ "mov z20.d, z23.d\n"
+ "ldp x23, x22, [x17, #0x30]\n"
+ "ldp x21, x20, [x17, #0x40]\n"
+ "mov z7.d, z14.d\n"
+ "mov z1.d, z23.d\n"
+ "ld1b { z22.h }, p3/Z, [x9, x2]\n"
+ "ld1b { z2.h }, p3/Z, [x28, x2]\n"
+ ".inst 0x454a135a // ssublb z26.h, z26.b, z10.b\n"
+ ".inst 0x454a1108 // ssublb z8.h, z8.b, z10.b\n"
+ "ld1b { z11.h }, p3/Z, [x27, x2]\n"
+ "ld1b { z3.h }, p3/Z, [x26, x2]\n"
+ ".inst 0x454a1210 // ssublb z16.h, z16.b, z10.b\n"
+ ".inst 0x454a12b5 // ssublb z21.h, z21.b, z10.b\n"
+ "ld1b { z29.h }, p3/Z, [x25, x2]\n"
+ "ld1b { z4.h }, p3/Z, [x24, x2]\n"
+ ".inst 0x454a1231 // ssublb z17.h, z17.b, z10.b\n"
+ ".inst 0x455e1ad6 // usublb z22.h, z22.b, z30.b\n"
+ "ld1b { z31.h }, p3/Z, [x23, x2]\n"
+ "ld1b { z0.h }, p3/Z, [x22, x2]\n"
+ ".inst 0x455e1842 // usublb z2.h, z2.b, z30.b\n"
+ ".inst 0x455e196b // usublb z11.h, z11.b, z30.b\n"
+ "ld1b { z19.h }, p3/Z, [x21, x2]\n"
+ "ld1b { z28.h }, p3/Z, [x20, x2]\n"
+ ".inst 0x455e1863 // usublb z3.h, z3.b, z30.b\n"
+ ".inst 0x455e1bbd // usublb z29.h, z29.b, z30.b\n"
+ ".inst 0x455e1884 // usublb z4.h, z4.b, z30.b\n"
+ ".inst 0x455e1bff // usublb z31.h, z31.b, z30.b\n"
+ ".inst 0x455e1800 // usublb z0.h, z0.b, z30.b\n"
+ ".inst 0x455e1a73 // usublb z19.h, z19.b, z30.b\n"
+ ".inst 0x455e1b9c // usublb z28.h, z28.b, z30.b\n"
+ "b.any 1b\n"
+ :
+ : [offsetof_Params_bias] "I" (offsetof(Params, bias)), [offsetof_Params_inptrs] "I" (offsetof(Params, inptrs)), [offsetof_Params_n_channels] "I" (offsetof(Params, n_channels)), [offsetof_Params_outptrs] "I" (offsetof(Params, outptrs)), [offsetof_Params_requant] "I" (offsetof(Params, requant)), [offsetof_Params_requant_muls] "I" (offsetof(Params, requant_muls)), [offsetof_Params_requant_shifts] "I" (offsetof(Params, requant_shifts)), [offsetof_Params_weights] "I" (offsetof(Params, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [params] "r" (&params)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/premultiply.cpp b/src/core/NEON/kernels/arm_conv/depthwise/premultiply.cpp
new file mode 100644
index 0000000000..8a49c775d3
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/premultiply.cpp
@@ -0,0 +1,84 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <premultiply.hpp>
+
+#define CHANNEL_MULTIPLIER 6
+#define BLOCK_SIZE 4
+
void do_premultiply_float_6(const float *in_ptr,
                            const unsigned int ld_row,
                            const unsigned int ld_col,
                            float *out_ptr,
                            const unsigned int out_ld_row,
                            const unsigned int out_ld_col,
                            const unsigned int tile_rows,
                            const unsigned int tile_cols,
                            const unsigned input_channels)
{
  // Replicate each input channel six times (channel multiplier 6), writing
  // the expanded channels contiguously for every point of the tile.
  constexpr unsigned int multiplier = 6;  // == CHANNEL_MULTIPLIER
  constexpr unsigned int block = 4;       // == BLOCK_SIZE

  for (unsigned int tile_i = 0; tile_i < tile_rows; tile_i++)
  {
    for (unsigned int tile_j = 0; tile_j < tile_cols; tile_j++)
    {
      // Address the current tile point in both tensors.
      const float *src = in_ptr + tile_i * ld_row + tile_j * ld_col;
      float *dst = out_ptr + tile_i * out_ld_row + tile_j * out_ld_col;

      // Bulk of the channels, handled in blocks of four: stage a small group
      // of values, then broadcast each one across the multiplier lanes.
      const unsigned int full_blocks = input_channels / block;
      for (unsigned int b = 0; b < full_blocks; b++)
      {
        float staged[block];
        for (unsigned int v = 0; v < block; v++)
        {
          staged[v] = src[v];
        }
        src += block;

        for (unsigned int v = 0; v < block; v++)
        {
          for (unsigned int r = 0; r < multiplier; r++)
          {
            dst[r] = staged[v];
          }
          dst += multiplier;
        }
      }

      // Remaining channels (fewer than one block), broadcast one at a time.
      const unsigned int remainder = input_channels - full_blocks * block;
      for (unsigned int c = 0; c < remainder; c++)
      {
        const float value = src[c];
        for (unsigned int r = 0; r < multiplier; r++)
        {
          dst[r] = value;
        }
        dst += multiplier;
      }
    }
  }
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/working_space.hpp b/src/core/NEON/kernels/arm_conv/depthwise/working_space.hpp
new file mode 100644
index 0000000000..9805fd354f
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/working_space.hpp
@@ -0,0 +1,461 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+/* Depthwise kernel drivers commonly require a per-thread blob of working space
+ * in which to store parameters required by the depthwise implementations. The
+ * composition of this working space varies with the driver, kernel, and data
+ * types -- but the tasks of requesting sufficient space, allocating buffer
+ * space, and performing initialisation of the working space are common.
+ *
+ * The classes in this file consist of a number of working space "Elements"
+ * (which are logical units of functionality) and a Workspace type which allows
+ * for compile time composition of elements into a single working space type.
+ *
+ * Creating a workspace
+ * ====================
+ *
+ * A new workspace type can be created by combining Elements as an argument to
+ * the Workspace class. For instance:
+ *
+ * Workspace<
+ * depthwise_depthfirst::InputArrayElement<float>,
+ * InputBufferElement<float>,
+ * OutputArrayElement<float>
+ * >
+ *
+ * Creates a new Workspace consisting of the given elements. The workspace type
+ * contained within this class (`Workspace<...>::WorkspaceType`) is equivalent to:
+ *
+ * struct WorkspaceType
+ * {
+ * const float **inptr_array; // From InputArrayElement<float>
+ * float *input_buffer; // From InputBufferElement<float>
+ * float **outptr_array; // From OutputArrayElement<float>
+ * float *output_buffer; // From OutputArrayElement<float>
+ * };
+ *
+ * Calling `Workspace<...>::get_sizeof_workspace(...)` will return the amount
+ * of space required to store the above struct and the elements contained
+ * within it. Once this space has been allocated, the workspace can be
+ * initialised by calling `Workspace<...>::initialise` with a pointer to the
+ * buffer and the same arguments. This will place a struct of type
+ * `Workspace<...>::WorkspaceType` at the start of the buffer, and share the
+ * remaining space between the specified elements. As this is all done at
+ * compile time, later code can access elements from the `WorkspaceType` by
+ * name.
+ *
+ * Writing a new element
+ * =====================
+ *
+ * Each Element must provide:
+ * - A struct called "Workspace" containing the variables contained within
+ * this portion of the workspace.
+ * - A static method called `get_element_size` which returns the amount of
+ * buffer space required by this element of the workspace (NOT including the
+ * size of the Workspace struct). For example, an element which stores a
+ * vector of pointers will return the amount of space required to store the
+ * vector.
+ * - A static method called `initialise` which accepts a pointer to a struct
+ * which will be composed of the Element's `Workspace` struct (along with
+ * other elements), a pointer to the start of the buffer allocated for this
+ * portion of the workspace, and arguments to be used to initialise the
+ * workspace. The Element should consume as much of the buffer as it
+ * requires, initialise the Workspace, and then return the pointer to the
+ * next free byte of the buffer.
+ *
+ * See the below elements for an example of how this should work.
+ */
+
+#pragma once
+
+#include "depthwise.hpp"
+#include "depthfirst_driver.hpp"
+#include "utils.hpp"
+
+namespace arm_conv {
+namespace depthwise {
+namespace { // anonymous because we expect this to appear in several compilation units
+
+/* Arguments to use to size and initialise a workspace.
+ */
+template <class StratType, class OutputStage=Nothing>
+struct WorkspaceArgs
+{
+ const StratType *strategy;
+ const DepthwiseArgs &depthwise_args;
+ const OutputStage &output_stage;
+
+ WorkspaceArgs(const StratType *strat, const DepthwiseArgs &dwargs, const OutputStage &os = {})
+ : strategy(strat), depthwise_args(dwargs), output_stage(os)
+ {
+ }
+};
+
+
+/* Sometimes we use templated structs to fill in workspace types, the Empty
+ * element can be useful for when a blank element is required for some sets of
+ * parameters.
+ */
+struct EmptyElement
+{
+ struct Workspace {};
+
+ template <class StratType, class OutputStage>
+ static size_t get_element_size(const WorkspaceArgs<StratType, OutputStage> &) { return 0; }
+
+ template <class WorkspaceType, class StratType, class OutputStage>
+ static void *initialise(WorkspaceType *, void *buffer, const WorkspaceArgs<StratType, OutputStage> &)
+ {
+ return buffer;
+ }
+};
+
+
+/* Store fused activations for a kernel.
+ *
+ * Activations are set based on the DepthwiseArgs.
+ */
template <typename T, class OutputStage=Nothing>
class ActivationsElement
{
  public:
  struct Workspace
  {
    // Clamp bounds the kernel applies to fuse the activation function.
    T activation_min, activation_max;
  };

  // The clamps live inside the composed workspace struct itself, so no
  // additional buffer space is required.
  template <typename StratType>
  static size_t get_element_size(const WorkspaceArgs<StratType, OutputStage> &)
  {
    return 0;
  }

  // Derive the clamp bounds from the requested activation: the default is an
  // unbounded range; ReLU raises the lower bound to zero, and BoundedReLU
  // additionally lowers the upper bound to the activation parameter (note the
  // deliberate case fall-through below).
  template <class WorkspaceType, class StratType>
  static void *initialise(WorkspaceType *ws, void *buffer, const WorkspaceArgs<StratType, OutputStage> &args)
  {
    ws->activation_min = static_cast<T>(-std::numeric_limits<float>::infinity());
    ws->activation_max = static_cast<T>(std::numeric_limits<float>::infinity());

    switch (args.depthwise_args.activation.type)
    {
      case arm_gemm::Activation::Type::BoundedReLU:
        ws->activation_max = static_cast<T>(args.depthwise_args.activation.param1);
        // Fall through
      case arm_gemm::Activation::Type::ReLU:
        ws->activation_min = static_cast<T>(0);
        break;
      default:
        break;
    }

    // This element consumes no buffer space.
    return buffer;
  }
};
+
+/* Activation clamps are contained within `arm_gemm::Requantize32`, so if the
+ * output stage is one of these we substitute in an empty workspace element.
+ */
template <typename T>
class ActivationsElement<T, arm_gemm::Requantize32> : public EmptyElement
{
  // Inherits the no-op Workspace/initialise from EmptyElement: quantized
  // kernels read their activation clamps from the Requantize32 output stage
  // (minval/maxval) rather than from the workspace.
};
+
+
+/* Get the value with which to fill an input buffer. This defaults to `0`
+ * (which we return as a `char` since it gets used by `memset`).
+ */
// Default padding fill value for non-quantized kernels: plain zero, returned
// as a `char` because it feeds directly into `memset`.
template <typename OutputStage>
char get_input_buffer_fill_value(const OutputStage &)
{
  return static_cast<char>(0);
}
+
+/* In the case of kernels operating on quantized data, we need to fill the
+ * input buffer with the zero offset of the input tensor.
+ */
// Specialisation for quantized kernels: pad with the input zero-point so that
// padded lanes contribute nothing once the offset is subtracted.
// NOTE(review): a_offset is an int32 implicitly narrowed to char here; this
// assumes the zero-point fits in a byte (true for u8/s8 inputs) -- confirm if
// wider input types are ever added.
template <> char get_input_buffer_fill_value(const arm_gemm::Requantize32 &qp) __attribute__ ((unused));
template <> char get_input_buffer_fill_value(const arm_gemm::Requantize32 &qp)
{
  return qp.a_offset;
}
+
+
+/* Container for a vector of padding values which can be safely consumed by the
+ * depthwise kernel. The padding values are initialised to either `0` or the
+ * zero offset of the input tensor (if quantized).
+ */
+template <typename T>
+class InputBufferElement
+{
+ public:
+ struct Workspace
+ {
+ T *input_buffer;
+ };
+
+ template <typename StratType, typename OutputStage>
+ static size_t get_element_size(const WorkspaceArgs<StratType, OutputStage> &args)
+ {
+ return sizeof(T) * args.depthwise_args.input_channels * args.depthwise_args.channel_multiplier;
+ }
+
+ template <class WorkspaceType, typename StratType, typename OutputStage>
+ static void *initialise(WorkspaceType *ws, void *buffer, const WorkspaceArgs<StratType, OutputStage> &args)
+ {
+ ws->input_buffer = reinterpret_cast<T*>(buffer);
+ memset(ws->input_buffer, get_input_buffer_fill_value(args.output_stage), get_element_size(args));
+ return reinterpret_cast<char *>(buffer) + get_element_size(args);
+ }
+};
+
+
+/* Container for an array of output pointers, and a buffer which can be used as
+ * a destination for unnecessary writes.
+ */
+template <typename T>
+class OutputArrayElement
+{
+ public:
+ struct Workspace
+ {
+ T **outptr_array;
+ T *output_buffer;
+ };
+
+ template <typename OutputStage>
+ static size_t get_element_size(const WorkspaceArgs<IDepthfirstStrategy, OutputStage> &args)
+ {
+ return sizeof_outptr_array(args) + sizeof_output_buffer(args);
+ }
+
+ template <class WorkspaceType, typename OutputStage>
+ static void *initialise(WorkspaceType *ws, void *buffer, const WorkspaceArgs<IDepthfirstStrategy, OutputStage> &args)
+ {
+ char *buffer_bytes = reinterpret_cast<char *>(buffer);
+
+ ws->outptr_array = reinterpret_cast<T **>(buffer_bytes);
+ buffer_bytes += sizeof_outptr_array(args);
+
+ ws->output_buffer = reinterpret_cast<T *>(buffer_bytes);
+ buffer_bytes += sizeof_output_buffer(args);
+
+ return buffer_bytes;
+ }
+
+ protected:
+ template <typename OutputStage>
+ static size_t sizeof_outptr_array(const WorkspaceArgs<IDepthfirstStrategy, OutputStage> &args)
+ {
+ return sizeof(T **) * args.strategy->get_output_rows() * args.strategy->get_output_cols();
+ }
+
+ template <typename OutputStage>
+ static size_t sizeof_output_buffer(const WorkspaceArgs<IDepthfirstStrategy, OutputStage> &args)
+ {
+ return sizeof(T) * args.depthwise_args.input_channels * args.depthwise_args.channel_multiplier;
+ }
+};
+
+
+/* Intermediate array to store results of premultiplication.
+ * Used as input to the kernel instead of the original input array.
+ */
+template <typename T>
+class IntermediateBufferElement
+{
+public:
+ struct Workspace
+ {
+ T *intermediate_buffer;
+ };
+
+ template <typename StratType, typename OutputStage>
+ static size_t get_element_size(const WorkspaceArgs<StratType, OutputStage> &args)
+ {
+ auto cols = args.depthwise_args.input_cols + args.depthwise_args.kernel_cols;
+ auto rows = args.strategy->get_input_rows() + args.depthwise_args.kernel_rows;
+ auto channels = args.depthwise_args.input_channels * args.depthwise_args.channel_multiplier;
+ return sizeof(T) * cols * rows * channels;
+ }
+
+ template <class WorkspaceType, typename StratType, typename OutputStage>
+ static void *initialise(WorkspaceType *ws, void *buffer, const WorkspaceArgs<StratType, OutputStage> &args)
+ {
+ ws->intermediate_buffer = reinterpret_cast<T*>(buffer);
+ return reinterpret_cast<char *>(buffer) + get_element_size(args);
+ }
+};
+
+
+/* Container for requantization parameters.
+ *
+ * This removes the distinction between per-layer and per-channel
+ * requantization parameters by providing a vector of requantization parameters
+ * regardless of whether per-layer or per-channel is selected.
+ */
class RequantizationParametersElement
{
  public:
  struct Workspace
  {
    // Per-channel vectors, each input_channels * channel_multiplier long;
    // they point either at the output stage's own arrays or at broadcast
    // copies built inside the workspace buffer.
    const int32_t *bias, *requant_muls, *requant_shifts;
  };

  // Space is needed only for parameters NOT already supplied per-channel by
  // the output stage (each sizeof_* helper returns zero in that case).
  template <typename StratType>
  static size_t get_element_size(const WorkspaceArgs<StratType, arm_gemm::Requantize32> &args)
  {
    return sizeof_bias(args) + sizeof_requant_muls(args) + sizeof_requant_shifts(args);
  }

  template <typename WorkspaceType, typename StratType>
  static void *initialise(WorkspaceType *ws, void *buffer, const WorkspaceArgs<StratType, arm_gemm::Requantize32> &args)
  {
    const auto n_output_channels = args.depthwise_args.input_channels * args.depthwise_args.channel_multiplier;
    char *buffer_bytes = reinterpret_cast<char *>(buffer);

    // Assume the output stage supplies every vector; any nullptr below is
    // replaced with a vector built in the buffer. The carve order (bias,
    // muls, shifts) must match the order used by get_element_size.
    ws->bias = args.output_stage.bias;
    ws->requant_muls = args.output_stage.per_channel_muls;
    ws->requant_shifts = args.output_stage.per_channel_right_shifts;

    // No bias supplied: substitute a zeroed vector.
    if (ws->bias == nullptr)
    {
      ws->bias = reinterpret_cast<const int32_t *>(buffer_bytes);
      memset(buffer_bytes, 0, sizeof_bias(args));
      buffer_bytes += sizeof_bias(args);
    }

    // Per-layer multiplier: broadcast it across a per-channel vector.
    if (ws->requant_muls == nullptr)
    {
      ws->requant_muls = reinterpret_cast<const int32_t *>(buffer_bytes);
      auto muls = reinterpret_cast<int32_t *>(buffer_bytes);
      buffer_bytes += sizeof_requant_muls(args);

      for (auto n = 0u; n < n_output_channels; n++)
      {
        muls[n] = args.output_stage.per_layer_mul;
      }
    }

    // Per-layer right shift: broadcast likewise.
    if (ws->requant_shifts == nullptr)
    {
      ws->requant_shifts = reinterpret_cast<int32_t *>(buffer_bytes);
      auto shifts = reinterpret_cast<int32_t *>(buffer_bytes);
      buffer_bytes += sizeof_requant_shifts(args);

      for (auto n = 0u; n < n_output_channels; n++)
      {
        shifts[n] = args.output_stage.per_layer_right_shift;
      }
    }

    return buffer_bytes;
  }

  protected:
  // Zero if a bias vector was supplied; otherwise one int32 per output channel.
  template <typename StratType>
  static size_t sizeof_bias(const WorkspaceArgs<StratType, arm_gemm::Requantize32> &args)
  {
    return args.output_stage.bias != nullptr ?
      0 : sizeof(int32_t) * args.depthwise_args.channel_multiplier * args.depthwise_args.input_channels;
  }

  // Zero if per-channel multipliers were supplied; otherwise one per channel.
  template <typename StratType>
  static size_t sizeof_requant_muls(const WorkspaceArgs<StratType, arm_gemm::Requantize32> &args)
  {
    return args.output_stage.per_channel_muls != nullptr ?
      0 : sizeof(int32_t) * args.depthwise_args.channel_multiplier * args.depthwise_args.input_channels;
  }

  // Zero if per-channel shifts were supplied; otherwise one per channel.
  template <typename StratType>
  static size_t sizeof_requant_shifts(const WorkspaceArgs<StratType, arm_gemm::Requantize32> &args)
  {
    return args.output_stage.per_channel_right_shifts != nullptr ?
      0 : sizeof(int32_t) * args.depthwise_args.channel_multiplier * args.depthwise_args.input_channels;
  }
};
+
+
// Compile-time composition of workspace Elements into a single workspace
// type; see the file-header comment for usage.
template <typename ...Elements>
class Workspace;

template <typename Element, typename ...Elements>
class Workspace<Element, Elements...>
{
  public:
  // The composed struct multiply-inherits each element's Workspace struct,
  // so later code can access every element's fields by name.
  struct WorkspaceType : Element::Workspace, Workspace<Elements...>::WorkspaceType
  {
  };

  template <class S, class T>
  static void initialise(void *buffer, const WorkspaceArgs<S, T> &args)
  {
    // Allocate sufficient space for the struct, then initialise each of the
    // elements in turn. (`ws + 1` is the first byte after the struct itself,
    // i.e. the start of the shared element buffer space.)
    auto ws = reinterpret_cast<WorkspaceType *>(buffer);
    initialise_elements(ws, ws + 1, args);
  }

  // Total bytes to allocate: the composed struct plus every element's own
  // buffer requirement.
  template <class S, class T=Nothing>
  static size_t get_sizeof_workspace(const WorkspaceArgs<S, T> &args)
  {
    return sizeof(WorkspaceType) + get_element_sizes(args);
  }

  // Recursively sum the buffer requirements of all elements.
  template <class S, class T>
  static inline size_t get_element_sizes(const WorkspaceArgs<S, T> &args)
  {
    return Element::get_element_size(args) + Workspace<Elements...>::get_element_sizes(args);
  }

  // Initialise each element in turn; every element consumes its share of the
  // buffer and returns a pointer to the remaining free space.
  template <class WorkspaceType, class S, class T>
  static void initialise_elements(WorkspaceType *ws, void *buffer, const WorkspaceArgs<S, T> &args)
  {
    buffer = Element::initialise(ws, buffer, args); // Get the next buffer
    Workspace<Elements...>::initialise_elements(ws, buffer, args);
  }
};
+
// Base case of the recursion: an empty element pack contributes an empty
// struct, zero bytes of buffer, and no initialisation work.
template <>
class Workspace<>
{
  public:
  struct WorkspaceType
  {
  };

  template <class S, class T>
  static inline size_t get_element_sizes(const WorkspaceArgs<S, T> &)
  {
    return 0;
  }

  template <class WorkspaceType, class S, class T>
  static void initialise_elements(WorkspaceType *, void *, const WorkspaceArgs<S, T> &)
  {
  }
};
+
+} // namespace {anonymous}
+} // namespace depthwise
+} // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/pooling/depthfirst_driver.hpp b/src/core/NEON/kernels/arm_conv/pooling/depthfirst_driver.hpp
new file mode 100644
index 0000000000..d0e8639229
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/pooling/depthfirst_driver.hpp
@@ -0,0 +1,299 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#include "pooling.hpp"
+#include "utils.hpp"
+
+namespace arm_conv {
+namespace pooling {
+
+class IDepthfirstStrategy
+{
+ public:
+ virtual ~IDepthfirstStrategy() = default;
+
+ virtual unsigned int get_input_rows() const = 0;
+ virtual unsigned int get_input_cols() const = 0;
+
+ virtual unsigned int get_output_rows() const = 0;
+ virtual unsigned int get_output_cols() const = 0;
+};
+
+
+template <typename T>
+struct TensorSpec
+{
+ T base;
+ size_t ld_row, ld_col;
+
+ TensorSpec(T ptr, size_t ld_row, size_t ld_col)
+ : base(ptr), ld_row(ld_row), ld_col(ld_col) {}
+};
+
+
+template <typename TInput, typename TOutput>
+class DepthfirstDriver : public PoolingCommon<TInput, TOutput>
+{
+ protected:
+ using Parent = PoolingCommon<TInput, TOutput>;
+
+ // The strategy which we're applying to solve the pooling problem.
+ std::unique_ptr<const IDepthfirstStrategy> m_strat;
+
+ /* Compute the amount of working space required for a single thread. */
+ virtual size_t get_working_size_per_thread() const = 0;
+
+ /* Initialise the working space for a thread. */
+ virtual void initialise_working_space(void *) const = 0;
+
+ /* Compute a portion of the output tensor with padding. */
+ virtual void compute_tile_padded(
+ unsigned int output_i, unsigned int output_j,
+ unsigned int output_channel_start, unsigned int output_channel_end,
+ const TensorSpec<const TInput *> &input,
+ const TensorSpec<TOutput *> &output,
+ void *working_space
+ ) const = 0;
+
+ /* Compute a portion of the work with only top/bottom padding.
+ *
+ * The default implementation of this repeatedly calls into the padded tile
+ * variant.
+ */
+ virtual void compute_row_padded_tile_row(
+ const unsigned int output_i, unsigned int output_j, unsigned int n_tile_cols,
+ const unsigned int output_channel_start, const unsigned int output_channel_end,
+ const TensorSpec<const TInput *> &input,
+ const TensorSpec<TOutput *> &output,
+ void *working_space
+ ) const
+ {
+ for (; n_tile_cols; n_tile_cols--, output_j += m_strat->get_output_cols())
+ {
+ this->compute_tile_padded(
+ output_i, output_j, output_channel_start, output_channel_end,
+ input, output, working_space
+ );
+ }
+ }
+
+ /* Compute a portion of the output tensor with no padding.
+ *
+ * The default implementation of this repeatedly calls into the padded
+ * variant.
+ */
+ virtual void compute_tiles_unpadded(
+ unsigned int start_output_i, unsigned int start_output_j,
+ unsigned int n_tile_rows, unsigned int n_tile_cols,
+ unsigned int output_channel_start, unsigned int output_channel_end,
+ const TensorSpec<const TInput *> &input,
+ const TensorSpec<TOutput *> &output,
+ void *working_space
+ ) const
+ {
+ for (unsigned int tile_i = 0; tile_i < n_tile_rows; tile_i++)
+ {
+ this->compute_row_padded_tile_row(
+ start_output_i, start_output_j, n_tile_cols,
+ output_channel_start, output_channel_end,
+ input, output, working_space
+ );
+ start_output_i += m_strat->get_output_rows();
+ }
+ }
+
+ void execute_internal(
+ unsigned int n_batches,
+ unsigned int input_height,
+ unsigned int input_width,
+ unsigned int n_channels,
+ const PaddingValues &padding,
+ const void *input,
+ size_t ld_input_col,
+ size_t ld_input_row,
+ size_t ld_input_batch,
+ unsigned int output_height,
+ unsigned int output_width,
+ void *output,
+ size_t ld_output_col,
+ size_t ld_output_row,
+ size_t ld_output_batch,
+ void *working_space,
+ unsigned int thread_id,
+ unsigned int n_threads
+ ) const override
+ {
+ // Get and initialise the working space for this thread.
+ void *thread_working_space =
+ static_cast<uint8_t *>(working_space) + thread_id * this->get_working_size_per_thread();
+ this->initialise_working_space(thread_working_space);
+
+ // Construct convenient representations of the input/output tensors.
+ TensorSpec<const TInput *> input_tensor(reinterpret_cast<const TInput *>(input), ld_input_row, ld_input_col);
+ TensorSpec<TOutput *> output_tensor(reinterpret_cast<TOutput *>(output), ld_output_row, ld_output_col);
+
+ // If the output is a 1x1 tensor, which commonly occurs at the end of a
+ // network, then we change the threading strategy to parallelise over
+ // channels rather than rows of the tensor.
+ if (n_threads > 1 && output_height == 1 && output_width == 1)
+ {
+ // Determine how many channels should be assigned to each thread, we
+ // round up first to ensure we get a reasonable spread across the
+ // threads.
+ const auto channels_per_thread = arm_gemm::roundup(arm_gemm::roundup(n_channels, 16u), n_threads) / n_threads;
+ const auto start_channel = thread_id * channels_per_thread;
+ const auto end_channel = std::min(start_channel + channels_per_thread, n_channels);
+
+ if (start_channel >= end_channel)
+ {
+ // This thread should move on if we have insufficient work to do.
+ return;
+ }
+
+ for (; n_batches; n_batches--)
+ {
+ // We know we don't need to iterate over rows or columns here; so just
+ // execute the tile.
+ this->compute_tile_padded(
+ 0, 0, // Compute the only output point
+ start_channel, end_channel,
+ input_tensor, output_tensor, thread_working_space
+ );
+
+ // Progress the pointers for the next batch.
+ input_tensor.base += ld_input_batch;
+ output_tensor.base += ld_output_batch;
+ }
+
+ // Exit here, since we've done all the work using the different strategy.
+ return;
+ }
+
+ for (unsigned int batch = 0; batch < n_batches; batch++)
+ {
+ // Iterate over rows of the output tensor; we stripe over the tiles.
+ for (unsigned int start_output_i = thread_id * m_strat->get_output_rows();
+ start_output_i < output_height;
+ start_output_i += n_threads * m_strat->get_output_rows())
+ {
+ // Determine what (if any padding) is required on the top/bottom of
+ // this row of the convolution.
+ const auto end_output_i = start_output_i + m_strat->get_output_rows();
+ const bool pad_output_bottom = output_height < end_output_i;
+
+ const int start_input_i = start_output_i * this->m_args.pool_stride.rows - padding.top;
+ const bool pad_input_top = start_input_i < 0;
+ const int end_input_i = start_input_i + m_strat->get_input_rows();
+ const bool pad_input_bottom = static_cast<int>(input_height) < end_input_i;
+ const bool pad_row = pad_input_top || pad_input_bottom || pad_output_bottom;
+
+ // Iterate over the columns of the output tensor; we attempt to grab as
+ // much as possible of the unpadded regions, so the loop structure is a
+ // bit odd.
+ unsigned int start_output_j = 0;
+ while (start_output_j < output_width)
+ {
+ const int start_in_j = start_output_j * this->m_args.pool_stride.cols - padding.left;
+ const bool pad_input_left = start_in_j < 0;
+
+ // Determine if we can process a number of unpadded tiles in one go.
+ int n_unpadded_tiles = 0;
+ if (!pad_input_left)
+ {
+ // Determine the maximum number of tiles we could handle.
+ n_unpadded_tiles = (output_width - start_output_j) / m_strat->get_output_cols();
+
+ // Handle padding on the right hand edge
+ const int tile_stride = m_strat->get_output_cols() * this->m_args.pool_stride.cols;
+ int end_output_j = start_output_j + n_unpadded_tiles * m_strat->get_output_cols();
+ int end_input_j = start_in_j + m_strat->get_input_cols() + (n_unpadded_tiles - 1)*tile_stride;
+
+ while (n_unpadded_tiles > 0 &&
+ (static_cast<int>(output_width) < end_output_j ||
+ static_cast<int>(input_width) < end_input_j))
+ {
+ n_unpadded_tiles--;
+ end_output_j -= m_strat->get_output_cols();
+ end_input_j -= tile_stride;
+ }
+ }
+
+ // Process unpadded tiles, if possible, otherwise process a padded tile.
+ if (n_unpadded_tiles)
+ {
+ if (!pad_row)
+ {
+ // Completely unpadded execution
+ this->compute_tiles_unpadded(
+ start_output_i, start_output_j,
+ 1, n_unpadded_tiles, // Compute a row of unpadded tiles
+ 0, n_channels, // Compute all channels
+ input_tensor, output_tensor, thread_working_space
+ );
+ }
+ else
+ {
+ // Top/bottom padding only
+ this->compute_row_padded_tile_row(
+ start_output_i, start_output_j, n_unpadded_tiles,
+ 0, n_channels, // Compute all channels
+ input_tensor, output_tensor, thread_working_space
+ );
+ }
+ start_output_j += n_unpadded_tiles * m_strat->get_output_cols();
+ }
+ else
+ {
+ this->compute_tile_padded(
+ start_output_i, start_output_j,
+ 0, n_channels, // Compute all channels
+ input_tensor, output_tensor, thread_working_space
+ );
+ start_output_j += m_strat->get_output_cols();
+ }
+ }
+ }
+
+ // Progress the pointers for the next batch.
+ input_tensor.base += ld_input_batch;
+ output_tensor.base += ld_output_batch;
+ }
+ }
+
+ public:
+ DepthfirstDriver(const IDepthfirstStrategy *strategy, const PoolingArgs &args)
+ : Parent(args), m_strat(strategy)
+ {
+ }
+
+ size_t get_working_size(unsigned int n_threads) const override final
+ {
+ return n_threads * this->get_working_size_per_thread();
+ }
+};
+
+} // namespace pooling
+} // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst.hpp
index 178db4a0b0..6b3ebe6664 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,37 +24,28 @@
#pragma once
-#if defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+#if defined(__aarch64__) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
namespace arm_conv {
namespace pooling {
void a64_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst_impl(unsigned int, const __fp16 *const *const, __fp16 *const *const, bool, unsigned int, unsigned int, unsigned int, unsigned int);
-struct a64_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst
+struct a64_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst : public DepthfirstStrategy<__fp16, __fp16>
{
- typedef __fp16 operand_type;
- typedef __fp16 return_type;
+ using Parent = DepthfirstStrategy<__fp16, __fp16>;
- typedef void (*kern_type)(unsigned int, const __fp16 *const *const, __fp16 *const *const, bool, unsigned int, unsigned int, unsigned int, unsigned int);
+ const static auto pooling_type = PoolingType::AVERAGE;
+ const static auto pool_rows = 3u, pool_cols = 3u;
+ const static auto stride_rows = 1u, stride_cols = 1u;
- constexpr static PoolingType pooling_type(void) { return PoolingType::AVERAGE; }
+ a64_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst(const CPUInfo *)
+ : Parent(pool_rows, pool_cols, stride_rows, stride_cols, 2, 2) {}
- constexpr static unsigned int pool_rows(void) { return 3; }
- constexpr static unsigned int pool_cols(void) { return 3; }
-
- constexpr static unsigned int stride_rows(void) { return 1; }
- constexpr static unsigned int stride_cols(void) { return 1; }
-
- constexpr static unsigned int out_rows(void) { return 2; }
- constexpr static unsigned int out_cols(void) { return 2; }
-
- kern_type kernel = a64_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst_impl;
-
- a64_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst(const CPUInfo *) {}
+ Parent::KernelType get_kernel(void) const { return a64_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst_impl; }
};
} // namespace pooling
} // namespace arm_conv
-#endif // defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+#endif // defined(__aarch64__) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst/generic.cpp
index 89dbf5ce02..5df848d1dd 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -26,7 +26,7 @@
#include <cstddef>
#include <cstdint>
-#if defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+#if defined(__aarch64__) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
namespace arm_conv {
namespace pooling {
@@ -82,174 +82,173 @@ void a64_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst_impl(
pad_left, pad_top, pad_right, pad_bottom);
__asm__ __volatile__(
- "ldr x4, [%x[args], %[offsetof_n_channels]]\n"
+ "ldr d7, [%x[args], %[offsetof_rescale]]\n"
+ "ldr x3, [%x[args], %[offsetof_n_channels]]\n"
+ "cmp x3, #0x8\n"
+ "mov x4, #0x0\n"
+ "ldr x21, [%x[args], %[offsetof_outptrs]]\n"
+ "ldr x20, [%x[args], %[offsetof_inptrs]]\n"
"mov x5, #0x0\n"
- "ldr x20, [%x[args], %[offsetof_outptrs]]\n"
- "mov x6, #0x0\n"
- "ldr x19, [%x[args], %[offsetof_inptrs]]\n"
- "cmp x4, #0x8\n"
- "ldp x7, x8, [x20, #0x0]\n"
- "ldp x17, x16, [x20, #0x10]\n"
- "ldp x15, x14, [x19, #0x0]\n"
- "ldp x13, x12, [x19, #0x10]\n"
- "ldp x11, x10, [x19, #0x20]\n"
- "ldp x9, x28, [x19, #0x30]\n"
- "ldp x27, x26, [x19, #0x40]\n"
- "ldp x25, x24, [x19, #0x50]\n"
- "ldp x23, x22, [x19, #0x60]\n"
- "ldp x21, x20, [x19, #0x70]\n"
- "ldr d8, [%x[args], %[offsetof_rescale]]\n"
+ "ldp x6, x7, [x21, #0x0]\n"
+ "ldp x8, x17, [x21, #0x10]\n"
+ "ldp x16, x15, [x20, #0x0]\n"
+ "ldp x14, x13, [x20, #0x10]\n"
+ "ldp x12, x11, [x20, #0x20]\n"
+ "ldp x10, x9, [x20, #0x30]\n"
+ "ldp x28, x27, [x20, #0x40]\n"
+ "ldp x26, x25, [x20, #0x50]\n"
+ "ldp x24, x23, [x20, #0x60]\n"
+ "ldp x22, x21, [x20, #0x70]\n"
"blt 3f\n"
- "ldr q7, [x10, x5]\n"
- "lsr x19, x4, #0x3\n"
- "ldr q6, [x9, x5]\n"
- "sub x4, x4, x19, LSL #3\n"
- "ldr q5, [x26, x5]\n"
- "subs x19, x19, #0x1\n"
- "ldr q4, [x25, x5]\n"
- "ldr q3, [x14, x5]\n"
- "ldr q2, [x13, x5]\n"
- "ldr q1, [x11, x5]\n"
- "ldr q0, [x27, x5]\n"
- "ldr q31, [x28, x5]\n"
- "ldr q30, [x24, x5]\n"
- "ldr q29, [x22, x5]\n"
- "ldr q28, [x21, x5]\n"
- "ldr q27, [x15, x5]\n"
- "ldr q26, [x12, x5]\n"
- "ldr q25, [x23, x5]\n"
- "ldr q24, [x20, x5]\n"
- "add x5, x5, #0x10\n"
+ "ldr q6, [x11, x4]\n"
+ "ldr q5, [x10, x4]\n"
+ "lsr x20, x3, #0x3\n"
+ "sub x3, x3, x20, LSL #3\n"
+ "ldr q4, [x27, x4]\n"
+ "ldr q3, [x26, x4]\n"
+ "subs x20, x20, #0x1\n"
+ "ldr q2, [x15, x4]\n"
+ "ldr q1, [x14, x4]\n"
+ "ldr q0, [x12, x4]\n"
+ "ldr q31, [x28, x4]\n"
+ "ldr q30, [x9, x4]\n"
+ "ldr q29, [x25, x4]\n"
+ "ldr q28, [x23, x4]\n"
+ "ldr q27, [x22, x4]\n"
+ "ldr q26, [x16, x4]\n"
+ "ldr q25, [x13, x4]\n"
+ "ldr q24, [x24, x4]\n"
+ "ldr q23, [x21, x4]\n"
+ "add x4, x4, #0x10\n"
"beq 2f\n"
"1:" // Vector: Loop
- "fadd v17.8h, v7.8h, v6.8h\n"
- "ldr q7, [x10, x5]\n"
- "subs x19, x19, #0x1\n"
- "fadd v16.8h, v5.8h, v4.8h\n"
- "ldr q6, [x9, x5]\n"
- "fadd v18.8h, v3.8h, v2.8h\n"
- "ldr q5, [x26, x5]\n"
- "fadd v23.8h, v1.8h, v0.8h\n"
- "ldr q4, [x25, x5]\n"
- "fadd v22.8h, v31.8h, v30.8h\n"
- "ldr q3, [x14, x5]\n"
- "fadd v17.8h, v17.8h, v16.8h\n"
- "ldr q2, [x13, x5]\n"
- "fadd v16.8h, v29.8h, v28.8h\n"
- "ldr q1, [x11, x5]\n"
- "fadd v19.8h, v27.8h, v23.8h\n"
- "ldr q0, [x27, x5]\n"
- "fadd v21.8h, v18.8h, v17.8h\n"
- "ldr q31, [x28, x5]\n"
- "fadd v20.8h, v16.8h, v17.8h\n"
- "ldr q30, [x24, x5]\n"
- "fadd v18.8h, v26.8h, v22.8h\n"
- "ldr q29, [x22, x5]\n"
- "fadd v17.8h, v25.8h, v23.8h\n"
- "ldr q28, [x21, x5]\n"
- "fadd v16.8h, v24.8h, v22.8h\n"
- "ldr q27, [x15, x5]\n"
+ "fadd v17.8h, v6.8h, v5.8h\n"
+ "ldr q6, [x11, x4]\n"
+ "ldr q5, [x10, x4]\n"
+ "fadd v16.8h, v4.8h, v3.8h\n"
+ "ldr q4, [x27, x4]\n"
+ "ldr q3, [x26, x4]\n"
+ "fadd v19.8h, v17.8h, v16.8h\n"
+ "fadd v18.8h, v2.8h, v1.8h\n"
+ "ldr q2, [x15, x4]\n"
+ "ldr q1, [x14, x4]\n"
+ "fadd v17.8h, v0.8h, v31.8h\n"
+ "fadd v22.8h, v30.8h, v29.8h\n"
+ "ldr q0, [x12, x4]\n"
+ "ldr q31, [x28, x4]\n"
+ "fadd v16.8h, v28.8h, v27.8h\n"
+ "fadd v21.8h, v18.8h, v19.8h\n"
+ "ldr q30, [x9, x4]\n"
+ "ldr q29, [x25, x4]\n"
+ "fadd v20.8h, v16.8h, v19.8h\n"
+ "fadd v19.8h, v26.8h, v17.8h\n"
+ "ldr q28, [x23, x4]\n"
+ "ldr q27, [x22, x4]\n"
+ "fadd v18.8h, v25.8h, v22.8h\n"
+ "fadd v17.8h, v24.8h, v17.8h\n"
+ "ldr q26, [x16, x4]\n"
+ "ldr q25, [x13, x4]\n"
+ "fadd v16.8h, v23.8h, v22.8h\n"
"fadd v19.8h, v21.8h, v19.8h\n"
- "ldr q26, [x12, x5]\n"
+ "ldr q24, [x24, x4]\n"
+ "ldr q23, [x21, x4]\n"
"fadd v18.8h, v21.8h, v18.8h\n"
- "ldr q25, [x23, x5]\n"
"fadd v17.8h, v17.8h, v20.8h\n"
- "ldr q24, [x20, x5]\n"
+ "fadd v16.8h, v16.8h, v20.8h\n"
+ "subs x20, x20, #0x1\n"
+ "fmul v19.8h, v19.8h, v7.h[0]\n"
+ "add x4, x4, #0x10\n"
+ "fmul v18.8h, v18.8h, v7.h[1]\n"
+ "fmul v17.8h, v17.8h, v7.h[2]\n"
+ "str q19, [x6, x5]\n"
+ "fmul v16.8h, v16.8h, v7.h[3]\n"
+ "str q18, [x7, x5]\n"
+ "str q17, [x8, x5]\n"
+ "str q16, [x17, x5]\n"
"add x5, x5, #0x10\n"
- "fadd v16.8h, v20.8h, v16.8h\n"
- "fmul v19.8h, v19.8h, v8.h[0]\n"
- "str q19, [x7, x6]\n"
- "fmul v18.8h, v18.8h, v8.h[1]\n"
- "fmul v17.8h, v17.8h, v8.h[2]\n"
- "str q18, [x8, x6]\n"
- "fmul v16.8h, v16.8h, v8.h[3]\n"
- "str q17, [x17, x6]\n"
- "str q16, [x16, x6]\n"
- "add x6, x6, #0x10\n"
"bgt 1b\n"
"2:" // Vector: Tail
- "fadd v17.8h, v7.8h, v6.8h\n"
- "fadd v16.8h, v5.8h, v4.8h\n"
- "fadd v18.8h, v3.8h, v2.8h\n"
- "fadd v23.8h, v1.8h, v0.8h\n"
- "fadd v17.8h, v17.8h, v16.8h\n"
- "fadd v22.8h, v31.8h, v30.8h\n"
- "fadd v16.8h, v29.8h, v28.8h\n"
- "fadd v21.8h, v18.8h, v17.8h\n"
- "fadd v19.8h, v27.8h, v23.8h\n"
- "fadd v20.8h, v16.8h, v17.8h\n"
- "fadd v18.8h, v26.8h, v22.8h\n"
- "fadd v17.8h, v25.8h, v23.8h\n"
- "fadd v16.8h, v24.8h, v22.8h\n"
+ "fadd v17.8h, v6.8h, v5.8h\n"
+ "fadd v16.8h, v4.8h, v3.8h\n"
+ "fadd v19.8h, v17.8h, v16.8h\n"
+ "fadd v18.8h, v2.8h, v1.8h\n"
+ "fadd v17.8h, v0.8h, v31.8h\n"
+ "fadd v22.8h, v30.8h, v29.8h\n"
+ "fadd v16.8h, v28.8h, v27.8h\n"
+ "fadd v21.8h, v18.8h, v19.8h\n"
+ "fadd v20.8h, v16.8h, v19.8h\n"
+ "fadd v19.8h, v26.8h, v17.8h\n"
+ "fadd v18.8h, v25.8h, v22.8h\n"
+ "fadd v17.8h, v24.8h, v17.8h\n"
+ "fadd v16.8h, v23.8h, v22.8h\n"
"fadd v19.8h, v21.8h, v19.8h\n"
"fadd v18.8h, v21.8h, v18.8h\n"
"fadd v17.8h, v17.8h, v20.8h\n"
- "fadd v16.8h, v20.8h, v16.8h\n"
- "fmul v19.8h, v19.8h, v8.h[0]\n"
- "str q19, [x7, x6]\n"
- "fmul v18.8h, v18.8h, v8.h[1]\n"
- "fmul v17.8h, v17.8h, v8.h[2]\n"
- "str q18, [x8, x6]\n"
- "fmul v16.8h, v16.8h, v8.h[3]\n"
- "str q17, [x17, x6]\n"
- "str q16, [x16, x6]\n"
- "add x6, x6, #0x10\n"
- "cbz x4, 4f\n"
+ "fadd v16.8h, v16.8h, v20.8h\n"
+ "fmul v19.8h, v19.8h, v7.h[0]\n"
+ "str q19, [x6, x5]\n"
+ "fmul v18.8h, v18.8h, v7.h[1]\n"
+ "fmul v17.8h, v17.8h, v7.h[2]\n"
+ "str q18, [x7, x5]\n"
+ "fmul v16.8h, v16.8h, v7.h[3]\n"
+ "str q17, [x8, x5]\n"
+ "str q16, [x17, x5]\n"
+ "add x5, x5, #0x10\n"
+ "cbz x3, 4f\n"
"3:" // Oddments
- "ldr h7, [x10, x5]\n"
- "subs x4, x4, #0x1\n"
- "ldr h6, [x9, x5]\n"
- "fadd v17.8h, v7.8h, v6.8h\n"
- "ldr h5, [x26, x5]\n"
- "ldr h4, [x25, x5]\n"
- "fadd v16.8h, v5.8h, v4.8h\n"
- "ldr h3, [x14, x5]\n"
- "ldr h2, [x13, x5]\n"
- "fadd v17.8h, v17.8h, v16.8h\n"
- "ldr h1, [x11, x5]\n"
- "ldr h0, [x27, x5]\n"
- "fadd v18.8h, v3.8h, v2.8h\n"
- "ldr h31, [x28, x5]\n"
- "fadd v23.8h, v1.8h, v0.8h\n"
- "ldr h30, [x24, x5]\n"
- "fadd v21.8h, v18.8h, v17.8h\n"
- "ldr h29, [x22, x5]\n"
- "ldr h28, [x21, x5]\n"
- "fadd v22.8h, v31.8h, v30.8h\n"
- "ldr h27, [x15, x5]\n"
- "ldr h26, [x12, x5]\n"
- "fadd v16.8h, v29.8h, v28.8h\n"
- "ldr h25, [x23, x5]\n"
- "fadd v20.8h, v16.8h, v17.8h\n"
- "ldr h24, [x20, x5]\n"
- "add x5, x5, #0x2\n"
- "fadd v19.8h, v27.8h, v23.8h\n"
- "fadd v18.8h, v26.8h, v22.8h\n"
- "fadd v17.8h, v25.8h, v23.8h\n"
- "fadd v16.8h, v24.8h, v22.8h\n"
- "fadd v19.8h, v21.8h, v19.8h\n"
- "fadd v18.8h, v21.8h, v18.8h\n"
+ "ldr h17, [x11, x4]\n"
+ "ldr h16, [x10, x4]\n"
+ "fadd v18.8h, v17.8h, v16.8h\n"
+ "subs x3, x3, #0x1\n"
+ "ldr h17, [x27, x4]\n"
+ "ldr h16, [x26, x4]\n"
+ "fadd v16.8h, v17.8h, v16.8h\n"
+ "fadd v18.8h, v18.8h, v16.8h\n"
+ "ldr h17, [x15, x4]\n"
+ "ldr h16, [x14, x4]\n"
+ "fadd v16.8h, v17.8h, v16.8h\n"
+ "fadd v23.8h, v16.8h, v18.8h\n"
+ "ldr h17, [x12, x4]\n"
+ "ldr h16, [x28, x4]\n"
+ "fadd v22.8h, v17.8h, v16.8h\n"
+ "ldr h17, [x9, x4]\n"
+ "ldr h16, [x25, x4]\n"
+ "fadd v21.8h, v17.8h, v16.8h\n"
+ "ldr h17, [x23, x4]\n"
+ "ldr h16, [x22, x4]\n"
+ "fadd v16.8h, v17.8h, v16.8h\n"
+ "fadd v20.8h, v16.8h, v18.8h\n"
+ "ldr h17, [x16, x4]\n"
+ "ldr h16, [x13, x4]\n"
+ "fadd v19.8h, v17.8h, v22.8h\n"
+ "fadd v18.8h, v16.8h, v21.8h\n"
+ "ldr h17, [x24, x4]\n"
+ "ldr h16, [x21, x4]\n"
+ "fadd v17.8h, v17.8h, v22.8h\n"
+ "fadd v16.8h, v16.8h, v21.8h\n"
+ "fadd v19.8h, v23.8h, v19.8h\n"
+ "fadd v18.8h, v23.8h, v18.8h\n"
+ "add x4, x4, #0x2\n"
"fadd v17.8h, v17.8h, v20.8h\n"
- "fadd v16.8h, v20.8h, v16.8h\n"
- "fmul v19.8h, v19.8h, v8.h[0]\n"
- "str h19, [x7, x6]\n"
- "fmul v18.8h, v18.8h, v8.h[1]\n"
- "fmul v17.8h, v17.8h, v8.h[2]\n"
- "str h18, [x8, x6]\n"
- "fmul v16.8h, v16.8h, v8.h[3]\n"
- "str h17, [x17, x6]\n"
- "str h16, [x16, x6]\n"
- "add x6, x6, #0x2\n"
+ "fadd v16.8h, v16.8h, v20.8h\n"
+ "fmul v19.8h, v19.8h, v7.h[0]\n"
+ "fmul v18.8h, v18.8h, v7.h[1]\n"
+ "str h19, [x6, x5]\n"
+ "fmul v17.8h, v17.8h, v7.h[2]\n"
+ "fmul v16.8h, v16.8h, v7.h[3]\n"
+ "str h18, [x7, x5]\n"
+ "str h17, [x8, x5]\n"
+ "str h16, [x17, x5]\n"
+ "add x5, x5, #0x2\n"
"bgt 3b\n"
"4:" // End
-
:
: [args] "r" (&args), [offsetof_inptrs] "I" (offsetof(KernelArgs, inptrs)), [offsetof_n_channels] "I" (offsetof(KernelArgs, n_channels)), [offsetof_outptrs] "I" (offsetof(KernelArgs, outptrs)), [offsetof_rescale] "I" (offsetof(KernelArgs, rescale_vals))
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
} // namespace pooling
} // namespace arm_conv
-#endif // defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+#endif // defined(__aarch64__) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_avg_generic_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_avg_generic_depthfirst.hpp
index 9dc153a764..25e7af1cee 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_avg_generic_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_avg_generic_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -33,19 +33,11 @@ namespace pooling {
void a64_fp16_nhwc_avg_generic_depthfirst_impl(const uint64_t window_cells, const uint64_t n_valid_cells, uint64_t n_channels, const __fp16 *const *const inptrs, __fp16 *outptr);
-struct a64_fp16_nhwc_avg_generic_depthfirst
+struct a64_fp16_nhwc_avg_generic_depthfirst : IGenericDepthfirstStrategy<__fp16, __fp16>
{
- typedef __fp16 operand_type;
- typedef __fp16 return_type;
-
- typedef void (*kern_type)(const uint64_t window_cells, const uint64_t n_valid_cells, uint64_t n_channels, const __fp16 *const *const inptrs, __fp16 *outptr);
-
- constexpr static PoolingType pooling_type(void) { return PoolingType::AVERAGE; }
-
-
- kern_type kernel = a64_fp16_nhwc_avg_generic_depthfirst_impl;
-
+ using Parent = IGenericDepthfirstStrategy<__fp16, __fp16>;
a64_fp16_nhwc_avg_generic_depthfirst(const CPUInfo *) {}
+ typename Parent::KernelType get_kernel(void) const override { return a64_fp16_nhwc_avg_generic_depthfirst_impl; }
};
} // namespace pooling
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_avg_generic_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_avg_generic_depthfirst/generic.cpp
index 5bef7f2bf4..f7be92e53f 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_avg_generic_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_avg_generic_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,6 +23,7 @@
*/
#include <cstdint>
+#include <cstddef>
#if defined(__aarch64__) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
@@ -41,308 +42,306 @@ void a64_fp16_nhwc_avg_generic_depthfirst_impl(
const auto rescale_value = static_cast<__fp16>(1.0f / static_cast<float>(window_cells));
__asm__ __volatile__(
- "ld1r { v8.8h }, [%x[rescale_ptr]]\n"
- "mov x28, #0x0\n"
- "mov x27, #0x10\n" // cntb _, ALL, #1
- "mov x26, #0x20\n" // cntb _, ALL, #2
- "mov x25, #0x30\n" // cntb _, ALL, #3
+ "ld1r { v9.8h }, [%x[rescale_ptr]]\n"
"cmp %x[n_channels], #0x20\n"
+ "mov x27, #0x0\n"
+ "mov x26, #0x10\n" // cntb _, ALL, #1
+ "mov x24, #0x20\n" // cntb _, ALL, #2
+ "mov x23, #0x30\n" // cntb _, ALL, #3
"blt 7f\n"
"1:" // 4-vectors of channels
+ "lsr x25, %x[n_valid_cells], #0x2\n"
+ "movi v8.16b, #0x0\n"
"movi v7.16b, #0x0\n"
- "mov x19, %x[inptrs]\n"
+ "mov x22, %x[inptrs]\n"
"movi v6.16b, #0x0\n"
- "lsr x24, %x[n_valid_cells], #0x2\n"
"movi v5.16b, #0x0\n"
- "movi v4.16b, #0x0\n"
- "cbz x24, 4f\n"
- "ldp x23, x22, [x19, #0x0]\n"
- "ldp x21, x20, [x19, #0x10]\n"
- "add x19, x19, #0x20\n"
- "subs x24, x24, #0x1\n"
- "ldr q3, [x23, x28]\n"
- "ldr q2, [x22, x28]\n"
- "ldr q1, [x21, x28]\n"
- "ldr q0, [x20, x28]\n"
- "ldr q31, [x23, x27]\n"
- "ldr q30, [x22, x27]\n"
- "ldr q29, [x21, x27]\n"
- "ldr q28, [x20, x27]\n"
- "ldr q27, [x23, x26]\n"
- "ldr q21, [x22, x26]\n"
- "ldr q26, [x21, x26]\n"
- "ldr q17, [x20, x26]\n"
- "ldr q25, [x23, x25]\n"
- "ldr q20, [x22, x25]\n"
- "ldr q24, [x21, x25]\n"
- "ldr q16, [x20, x25]\n"
+ "cbz x25, 4f\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q4, [x21, x27]\n"
+ "subs x25, x25, #0x1\n"
+ "ldr q3, [x20, x27]\n"
+ "ldr q2, [x21, x26]\n"
+ "ldr q1, [x20, x26]\n"
+ "ldr q0, [x21, x24]\n"
+ "ldr q31, [x20, x24]\n"
+ "ldr q30, [x21, x23]\n"
+ "ldr q29, [x20, x23]\n"
+ "ldp x21, x20, [x22, #0x10]\n"
+ "add x22, x22, #0x20\n"
+ "ldr q28, [x21, x27]\n"
+ "ldr q22, [x20, x27]\n"
+ "ldr q27, [x21, x26]\n"
+ "ldr q21, [x20, x26]\n"
+ "ldr q26, [x21, x24]\n"
+ "ldr q20, [x20, x24]\n"
+ "ldr q25, [x21, x23]\n"
+ "ldr q24, [x20, x23]\n"
"beq 3f\n"
"2:" // 4-vectors of channels: 4 inputs loop
- "fadd v23.8h, v3.8h, v2.8h\n"
- "ldp x23, x22, [x19, #0x0]\n"
- "subs x24, x24, #0x1\n"
- "fadd v19.8h, v1.8h, v0.8h\n"
- "ldp x21, x20, [x19, #0x10]\n"
- "add x19, x19, #0x20\n"
- "fadd v22.8h, v31.8h, v30.8h\n"
- "ldr q3, [x23, x28]\n"
- "fadd v18.8h, v29.8h, v28.8h\n"
- "fadd v21.8h, v27.8h, v21.8h\n"
- "ldr q2, [x22, x28]\n"
- "fadd v17.8h, v26.8h, v17.8h\n"
- "ldr q1, [x21, x28]\n"
- "fadd v20.8h, v25.8h, v20.8h\n"
- "ldr q0, [x20, x28]\n"
- "fadd v16.8h, v24.8h, v16.8h\n"
- "ldr q31, [x23, x27]\n"
+ "fadd v23.8h, v4.8h, v3.8h\n"
+ "fadd v19.8h, v28.8h, v22.8h\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q4, [x21, x27]\n"
+ "ldr q3, [x20, x27]\n"
+ "fadd v22.8h, v2.8h, v1.8h\n"
+ "ldr q2, [x21, x26]\n"
+ "fadd v18.8h, v27.8h, v21.8h\n"
+ "ldr q1, [x20, x26]\n"
+ "fadd v21.8h, v0.8h, v31.8h\n"
+ "ldr q0, [x21, x24]\n"
+ "fadd v17.8h, v26.8h, v20.8h\n"
+ "ldr q31, [x20, x24]\n"
+ "fadd v20.8h, v30.8h, v29.8h\n"
+ "ldr q30, [x21, x23]\n"
+ "fadd v16.8h, v25.8h, v24.8h\n"
+ "ldr q29, [x20, x23]\n"
"fadd v19.8h, v23.8h, v19.8h\n"
- "ldr q30, [x22, x27]\n"
"fadd v18.8h, v22.8h, v18.8h\n"
- "ldr q29, [x21, x27]\n"
+ "ldp x21, x20, [x22, #0x10]\n"
+ "ldr q28, [x21, x27]\n"
+ "ldr q22, [x20, x27]\n"
"fadd v17.8h, v21.8h, v17.8h\n"
- "ldr q28, [x20, x27]\n"
"fadd v16.8h, v20.8h, v16.8h\n"
- "ldr q27, [x23, x26]\n"
- "fadd v7.8h, v7.8h, v19.8h\n"
- "ldr q21, [x22, x26]\n"
- "fadd v6.8h, v6.8h, v18.8h\n"
- "ldr q26, [x21, x26]\n"
- "fadd v5.8h, v5.8h, v17.8h\n"
- "ldr q17, [x20, x26]\n"
- "fadd v4.8h, v4.8h, v16.8h\n"
- "ldr q25, [x23, x25]\n"
- "ldr q20, [x22, x25]\n"
- "ldr q24, [x21, x25]\n"
- "ldr q16, [x20, x25]\n"
+ "ldr q27, [x21, x26]\n"
+ "ldr q21, [x20, x26]\n"
+ "subs x25, x25, #0x1\n"
+ "fadd v8.8h, v8.8h, v19.8h\n"
+ "ldr q26, [x21, x24]\n"
+ "ldr q20, [x20, x24]\n"
+ "fadd v7.8h, v7.8h, v18.8h\n"
+ "fadd v6.8h, v6.8h, v17.8h\n"
+ "ldr q25, [x21, x23]\n"
+ "ldr q24, [x20, x23]\n"
+ "fadd v5.8h, v5.8h, v16.8h\n"
+ "add x22, x22, #0x20\n"
"bgt 2b\n"
"3:" // 4-vectors of channels: 4 inputs tail
- "fadd v23.8h, v3.8h, v2.8h\n"
- "fadd v19.8h, v1.8h, v0.8h\n"
- "fadd v22.8h, v31.8h, v30.8h\n"
- "fadd v18.8h, v29.8h, v28.8h\n"
- "fadd v21.8h, v27.8h, v21.8h\n"
- "fadd v17.8h, v26.8h, v17.8h\n"
- "fadd v20.8h, v25.8h, v20.8h\n"
- "fadd v16.8h, v24.8h, v16.8h\n"
+ "fadd v23.8h, v4.8h, v3.8h\n"
+ "fadd v19.8h, v28.8h, v22.8h\n"
+ "fadd v22.8h, v2.8h, v1.8h\n"
+ "fadd v18.8h, v27.8h, v21.8h\n"
+ "fadd v21.8h, v0.8h, v31.8h\n"
+ "fadd v17.8h, v26.8h, v20.8h\n"
+ "fadd v20.8h, v30.8h, v29.8h\n"
+ "fadd v16.8h, v25.8h, v24.8h\n"
"fadd v19.8h, v23.8h, v19.8h\n"
"fadd v18.8h, v22.8h, v18.8h\n"
"fadd v17.8h, v21.8h, v17.8h\n"
"fadd v16.8h, v20.8h, v16.8h\n"
- "fadd v7.8h, v7.8h, v19.8h\n"
- "fadd v6.8h, v6.8h, v18.8h\n"
- "fadd v5.8h, v5.8h, v17.8h\n"
- "fadd v4.8h, v4.8h, v16.8h\n"
+ "fadd v8.8h, v8.8h, v19.8h\n"
+ "fadd v7.8h, v7.8h, v18.8h\n"
+ "fadd v6.8h, v6.8h, v17.8h\n"
+ "fadd v5.8h, v5.8h, v16.8h\n"
"4:" // 4-vectors of channels: After loop
- "ands x20, %x[n_valid_cells], #0x3\n"
+ "ands x21, %x[n_valid_cells], #0x3\n"
"beq 6f\n"
"5:" // 4-vectors of channels: Single input loop
- "ldr x23, [x19], #0x8\n"
- "subs x20, x20, #0x1\n"
- "ldr q3, [x23, x28]\n"
- "fadd v7.8h, v7.8h, v3.8h\n"
- "ldr q31, [x23, x27]\n"
- "ldr q27, [x23, x26]\n"
- "fadd v6.8h, v6.8h, v31.8h\n"
- "ldr q25, [x23, x25]\n"
- "fadd v5.8h, v5.8h, v27.8h\n"
- "fadd v4.8h, v4.8h, v25.8h\n"
+ "ldr x20, [x22], #0x8\n"
+ "ldr q16, [x20, x27]\n"
+ "subs x21, x21, #0x1\n"
+ "fadd v8.8h, v8.8h, v16.8h\n"
+ "ldr q17, [x20, x26]\n"
+ "ldr q16, [x20, x24]\n"
+ "fadd v7.8h, v7.8h, v17.8h\n"
+ "fadd v6.8h, v6.8h, v16.8h\n"
+ "ldr q16, [x20, x23]\n"
+ "fadd v5.8h, v5.8h, v16.8h\n"
"bgt 5b\n"
"6:" // 4-vectors of channels: Single input loop: End
- "fmul v7.8h, v7.8h, v8.8h\n"
- "str q7, [%x[outptr], x28]\n"
- "fmul v6.8h, v6.8h, v8.8h\n"
- "add x28, x28, #0x40\n"
- "fmul v5.8h, v5.8h, v8.8h\n"
- "str q6, [%x[outptr], x27]\n"
- "fmul v4.8h, v4.8h, v8.8h\n"
- "add x27, x27, #0x40\n"
- "str q5, [%x[outptr], x26]\n"
- "add x26, x26, #0x40\n"
"sub %x[n_channels], %x[n_channels], #0x20\n"
- "str q4, [%x[outptr], x25]\n"
- "add x25, x25, #0x40\n"
"cmp %x[n_channels], #0x20\n"
+ "fmul v8.8h, v8.8h, v9.8h\n"
+ "fmul v7.8h, v7.8h, v9.8h\n"
+ "fmul v6.8h, v6.8h, v9.8h\n"
+ "fmul v5.8h, v5.8h, v9.8h\n"
+ "str q8, [%x[outptr], x27]\n"
+ "add x27, x27, #0x40\n"
+ "str q7, [%x[outptr], x26]\n"
+ "add x26, x26, #0x40\n"
+ "str q6, [%x[outptr], x24]\n"
+ "add x24, x24, #0x40\n"
+ "str q5, [%x[outptr], x23]\n"
+ "add x23, x23, #0x40\n"
"bge 1b\n"
"cbz %x[n_channels], 31f\n"
"7:" // Single vector of channels
"cmp %x[n_channels], #0x8\n"
"blt 14f\n"
"8:" // Single vector of channels: Loop
- "movi v7.16b, #0x0\n"
- "mov x19, %x[inptrs]\n"
- "lsr x24, %x[n_valid_cells], #0x2\n"
- "cbz x24, 11f\n"
- "ldp x23, x22, [x19, #0x0]\n"
- "ldp x21, x20, [x19, #0x10]\n"
- "add x19, x19, #0x20\n"
- "subs x24, x24, #0x1\n"
- "ldr q3, [x23, x28]\n"
- "ldr q2, [x22, x28]\n"
- "ldr q1, [x21, x28]\n"
- "ldr q0, [x20, x28]\n"
+ "lsr x25, %x[n_valid_cells], #0x2\n"
+ "movi v8.16b, #0x0\n"
+ "mov x22, %x[inptrs]\n"
+ "cbz x25, 11f\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q4, [x21, x27]\n"
+ "subs x25, x25, #0x1\n"
+ "ldr q3, [x20, x27]\n"
+ "ldp x21, x20, [x22, #0x10]\n"
+ "add x22, x22, #0x20\n"
+ "ldr q28, [x21, x27]\n"
+ "ldr q22, [x20, x27]\n"
"beq 10f\n"
"9:" // Single vector of channels: Loop: 4 inputs loop
- "fadd v23.8h, v3.8h, v2.8h\n"
- "ldp x23, x22, [x19, #0x0]\n"
- "subs x24, x24, #0x1\n"
- "fadd v19.8h, v1.8h, v0.8h\n"
- "ldp x21, x20, [x19, #0x10]\n"
- "add x19, x19, #0x20\n"
- "fadd v19.8h, v23.8h, v19.8h\n"
- "ldr q3, [x23, x28]\n"
- "ldr q2, [x22, x28]\n"
- "fadd v7.8h, v7.8h, v19.8h\n"
- "ldr q1, [x21, x28]\n"
- "ldr q0, [x20, x28]\n"
+ "fadd v17.8h, v4.8h, v3.8h\n"
+ "fadd v16.8h, v28.8h, v22.8h\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q4, [x21, x27]\n"
+ "ldr q3, [x20, x27]\n"
+ "fadd v16.8h, v17.8h, v16.8h\n"
+ "ldp x21, x20, [x22, #0x10]\n"
+ "subs x25, x25, #0x1\n"
+ "ldr q28, [x21, x27]\n"
+ "ldr q22, [x20, x27]\n"
+ "fadd v8.8h, v8.8h, v16.8h\n"
+ "add x22, x22, #0x20\n"
"bgt 9b\n"
"10:" // Single vector of channels: Loop: 4 inputs tail
- "fadd v23.8h, v3.8h, v2.8h\n"
- "fadd v19.8h, v1.8h, v0.8h\n"
- "fadd v19.8h, v23.8h, v19.8h\n"
- "fadd v7.8h, v7.8h, v19.8h\n"
+ "fadd v17.8h, v4.8h, v3.8h\n"
+ "fadd v16.8h, v28.8h, v22.8h\n"
+ "fadd v16.8h, v17.8h, v16.8h\n"
+ "fadd v8.8h, v8.8h, v16.8h\n"
"11:" // Single vector of channels: Loop: After loop
- "ands x20, %x[n_valid_cells], #0x3\n"
+ "ands x21, %x[n_valid_cells], #0x3\n"
"beq 13f\n"
"12:" // Single vector of channels: Loop: Single input loop
- "ldr x23, [x19], #0x8\n"
- "subs x20, x20, #0x1\n"
- "ldr q3, [x23, x28]\n"
- "fadd v7.8h, v7.8h, v3.8h\n"
+ "ldr x20, [x22], #0x8\n"
+ "ldr q16, [x20, x27]\n"
+ "subs x21, x21, #0x1\n"
+ "fadd v8.8h, v8.8h, v16.8h\n"
"bgt 12b\n"
"13:" // Single vector of channels: Loop: Single input loop: End
- "fmul v7.8h, v7.8h, v8.8h\n"
- "str q7, [%x[outptr], x28]\n"
- "add x28, x28, #0x10\n"
"sub %x[n_channels], %x[n_channels], #0x8\n"
"cmp %x[n_channels], #0x8\n"
+ "fmul v8.8h, v8.8h, v9.8h\n"
+ "str q8, [%x[outptr], x27]\n"
+ "add x27, x27, #0x10\n"
"bge 8b\n"
"cbz %x[n_channels], 31f\n"
"14:" // Oddments
- "movi v7.16b, #0x0\n"
- "add %x[outptr], %x[outptr], x28\n"
- "mov x19, %x[inptrs]\n"
- "lsr x24, %x[n_valid_cells], #0x2\n"
- "cbz x24, 20f\n"
+ "lsr x25, %x[n_valid_cells], #0x2\n"
+ "add %x[outptr], %x[outptr], x27\n"
+ "movi v8.16b, #0x0\n"
+ "mov x24, %x[inptrs]\n"
+ "cbz x25, 20f\n"
"15:" // Oddments: 4 inputs loop
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x24, x24, #0x20\n"
+ "add x23, x23, x27\n"
+ "add x22, x22, x27\n"
+ "add x21, x21, x27\n"
+ "movi v4.16b, #0x0\n"
"movi v3.16b, #0x0\n"
- "ldp x23, x22, [x19, #0x0]\n"
- "add x23, x23, x28\n"
- "movi v2.16b, #0x0\n"
- "ldp x21, x20, [x19, #0x10]\n"
- "movi v1.16b, #0x0\n"
- "add x19, x19, #0x20\n"
- "movi v0.16b, #0x0\n"
- "add x22, x22, x28\n"
- "add x21, x21, x28\n"
- "add x20, x20, x28\n"
+ "add x20, x20, x27\n"
+ "movi v28.16b, #0x0\n"
+ "movi v22.16b, #0x0\n"
"tbz %x[n_channels], #2, 17f\n"
- "ldr d3, [x23], #0x8\n"
- "ldr d2, [x22], #0x8\n"
- "ldr d1, [x21], #0x8\n"
- "ldr d0, [x20], #0x8\n"
+ "ldr d4, [x23], #0x8\n"
+ "ldr d3, [x22], #0x8\n"
+ "ldr d28, [x21], #0x8\n"
+ "ldr d22, [x20], #0x8\n"
"tbz %x[n_channels], #1, 16f\n"
- "ld1 { v3.s }[2], [x23], #0x4\n"
- "ld1 { v2.s }[2], [x22], #0x4\n"
- "ld1 { v1.s }[2], [x21], #0x4\n"
- "ld1 { v0.s }[2], [x20], #0x4\n"
+ "ld1 { v4.s }[2], [x23], #0x4\n"
+ "ld1 { v3.s }[2], [x22], #0x4\n"
+ "ld1 { v28.s }[2], [x21], #0x4\n"
+ "ld1 { v22.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 19f\n"
- "ld1 { v3.h }[6], [x23], #0x2\n"
- "ld1 { v2.h }[6], [x22], #0x2\n"
- "ld1 { v1.h }[6], [x21], #0x2\n"
- "ld1 { v0.h }[6], [x20], #0x2\n"
+ "ld1 { v4.h }[6], [x23], #0x2\n"
+ "ld1 { v3.h }[6], [x22], #0x2\n"
+ "ld1 { v28.h }[6], [x21], #0x2\n"
+ "ld1 { v22.h }[6], [x20], #0x2\n"
"b 19f\n"
"16:" // Oddments: 4 inputs loop: Load: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 19f\n"
- "ld1 { v3.h }[4], [x23], #0x2\n"
- "ld1 { v2.h }[4], [x22], #0x2\n"
- "ld1 { v1.h }[4], [x21], #0x2\n"
- "ld1 { v0.h }[4], [x20], #0x2\n"
+ "ld1 { v4.h }[4], [x23], #0x2\n"
+ "ld1 { v3.h }[4], [x22], #0x2\n"
+ "ld1 { v28.h }[4], [x21], #0x2\n"
+ "ld1 { v22.h }[4], [x20], #0x2\n"
"b 19f\n"
"17:" // Oddments: 4 inputs loop: Load: Bit 2: Unset
"tbz %x[n_channels], #1, 18f\n"
- "ldr s3, [x23], #0x4\n"
- "ldr s2, [x22], #0x4\n"
- "ldr s1, [x21], #0x4\n"
- "ldr s0, [x20], #0x4\n"
+ "ldr s4, [x23], #0x4\n"
+ "ldr s3, [x22], #0x4\n"
+ "ldr s28, [x21], #0x4\n"
+ "ldr s22, [x20], #0x4\n"
"tbz %x[n_channels], #0, 19f\n"
- "ld1 { v3.h }[2], [x23], #0x2\n"
- "ld1 { v2.h }[2], [x22], #0x2\n"
- "ld1 { v1.h }[2], [x21], #0x2\n"
- "ld1 { v0.h }[2], [x20], #0x2\n"
+ "ld1 { v4.h }[2], [x23], #0x2\n"
+ "ld1 { v3.h }[2], [x22], #0x2\n"
+ "ld1 { v28.h }[2], [x21], #0x2\n"
+ "ld1 { v22.h }[2], [x20], #0x2\n"
"b 19f\n"
"18:" // Oddments: 4 inputs loop: Load: Bit 2: Unset: Bit 1: Unset
"tbz %x[n_channels], #0, 19f\n"
- "ldr h3, [x23], #0x2\n"
- "ldr h2, [x22], #0x2\n"
- "ldr h1, [x21], #0x2\n"
- "ldr h0, [x20], #0x2\n"
+ "ldr h4, [x23], #0x2\n"
+ "ldr h3, [x22], #0x2\n"
+ "ldr h28, [x21], #0x2\n"
+ "ldr h22, [x20], #0x2\n"
"19:" // Oddments: 4 inputs loop: Load: Bit 2: End
- "fadd v23.8h, v3.8h, v2.8h\n"
- "subs x24, x24, #0x1\n"
- "fadd v19.8h, v1.8h, v0.8h\n"
- "fadd v19.8h, v23.8h, v19.8h\n"
- "fadd v7.8h, v7.8h, v19.8h\n"
+ "fadd v17.8h, v4.8h, v3.8h\n"
+ "fadd v16.8h, v28.8h, v22.8h\n"
+ "subs x25, x25, #0x1\n"
+ "fadd v16.8h, v17.8h, v16.8h\n"
+ "fadd v8.8h, v8.8h, v16.8h\n"
"bgt 15b\n"
"20:" // Oddments: After loop
- "ands x20, %x[n_valid_cells], #0x3\n"
+ "ands x21, %x[n_valid_cells], #0x3\n"
"beq 26f\n"
"21:" // Oddments: Single input loop
- "movi v3.16b, #0x0\n"
- "ldr x23, [x19], #0x8\n"
- "add x23, x23, x28\n"
+ "ldr x23, [x24], #0x8\n"
+ "add x23, x23, x27\n"
+ "movi v4.16b, #0x0\n"
"tbz %x[n_channels], #2, 23f\n"
- "ldr d3, [x23], #0x8\n"
+ "ldr d4, [x23], #0x8\n"
"tbz %x[n_channels], #1, 22f\n"
- "ld1 { v3.s }[2], [x23], #0x4\n"
+ "ld1 { v4.s }[2], [x23], #0x4\n"
"tbz %x[n_channels], #0, 25f\n"
- "ld1 { v3.h }[6], [x23], #0x2\n"
+ "ld1 { v4.h }[6], [x23], #0x2\n"
"b 25f\n"
"22:" // Oddments: Single input loop: Load: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 25f\n"
- "ld1 { v3.h }[4], [x23], #0x2\n"
+ "ld1 { v4.h }[4], [x23], #0x2\n"
"b 25f\n"
"23:" // Oddments: Single input loop: Load: Bit 2: Unset
"tbz %x[n_channels], #1, 24f\n"
- "ldr s3, [x23], #0x4\n"
+ "ldr s4, [x23], #0x4\n"
"tbz %x[n_channels], #0, 25f\n"
- "ld1 { v3.h }[2], [x23], #0x2\n"
+ "ld1 { v4.h }[2], [x23], #0x2\n"
"b 25f\n"
"24:" // Oddments: Single input loop: Load: Bit 2: Unset: Bit 1: Unset
"tbz %x[n_channels], #0, 25f\n"
- "ldr h3, [x23], #0x2\n"
+ "ldr h4, [x23], #0x2\n"
"25:" // Oddments: Single input loop: Load: Bit 2: End
- "fadd v7.8h, v7.8h, v3.8h\n"
- "subs x20, x20, #0x1\n"
+ "subs x21, x21, #0x1\n"
+ "fadd v8.8h, v8.8h, v4.8h\n"
"bgt 21b\n"
"26:" // Oddments: Single input loop: End
- "fmul v7.8h, v7.8h, v8.8h\n"
+ "fmul v8.8h, v8.8h, v9.8h\n"
"tbz %x[n_channels], #2, 28f\n"
- "st1 { v7.d }[0], [%x[outptr]], #0x8\n"
+ "st1 { v8.d }[0], [%x[outptr]], #0x8\n"
"tbz %x[n_channels], #1, 27f\n"
- "st1 { v7.s }[2], [%x[outptr]], #0x4\n"
+ "st1 { v8.s }[2], [%x[outptr]], #0x4\n"
"tbz %x[n_channels], #0, 30f\n"
- "st1 { v7.h }[6], [%x[outptr]], #0x2\n"
+ "st1 { v8.h }[6], [%x[outptr]], #0x2\n"
"b 30f\n"
"27:" // Oddments: Store: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 30f\n"
- "st1 { v7.h }[4], [%x[outptr]], #0x2\n"
+ "st1 { v8.h }[4], [%x[outptr]], #0x2\n"
"b 30f\n"
"28:" // Oddments: Store: Bit 2: Unset
"tbz %x[n_channels], #1, 29f\n"
- "st1 { v7.s }[0], [%x[outptr]], #0x4\n"
+ "st1 { v8.s }[0], [%x[outptr]], #0x4\n"
"tbz %x[n_channels], #0, 30f\n"
- "st1 { v7.h }[2], [%x[outptr]], #0x2\n"
+ "st1 { v8.h }[2], [%x[outptr]], #0x2\n"
"b 30f\n"
"29:" // Oddments: Store: Bit 2: Unset: Bit 1: Unset
"tbz %x[n_channels], #0, 30f\n"
- "st1 { v7.h }[0], [%x[outptr]], #0x2\n"
+ "st1 { v8.h }[0], [%x[outptr]], #0x2\n"
"30:" // Oddments: Store: Bit 2: End
-
"31:" // End
-
: [n_channels] "+&r" (n_channels), [outptr] "+&r" (outptr)
: [inptrs] "r" (inptrs), [n_valid_cells] "r" (n_valid_cells), [rescale_ptr] "r" (&rescale_value)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_max_2x2_s1_output2x2_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_max_2x2_s1_output2x2_depthfirst.hpp
index 9950bb8cdb..b65ac7e9fa 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_max_2x2_s1_output2x2_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_max_2x2_s1_output2x2_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,37 +24,28 @@
#pragma once
-#if defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+#if defined(__aarch64__) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
namespace arm_conv {
namespace pooling {
void a64_fp16_nhwc_max_2x2_s1_output2x2_depthfirst_impl(unsigned int, const __fp16 *const *const, __fp16 *const *const, bool, unsigned int, unsigned int, unsigned int, unsigned int);
-struct a64_fp16_nhwc_max_2x2_s1_output2x2_depthfirst
+struct a64_fp16_nhwc_max_2x2_s1_output2x2_depthfirst : public DepthfirstStrategy<__fp16, __fp16>
{
- typedef __fp16 operand_type;
- typedef __fp16 return_type;
+ using Parent = DepthfirstStrategy<__fp16, __fp16>;
- typedef void (*kern_type)(unsigned int, const __fp16 *const *const, __fp16 *const *const, bool, unsigned int, unsigned int, unsigned int, unsigned int);
+ const static auto pooling_type = PoolingType::MAX;
+ const static auto pool_rows = 2u, pool_cols = 2u;
+ const static auto stride_rows = 1u, stride_cols = 1u;
- constexpr static PoolingType pooling_type(void) { return PoolingType::MAX; }
+ a64_fp16_nhwc_max_2x2_s1_output2x2_depthfirst(const CPUInfo *)
+ : Parent(pool_rows, pool_cols, stride_rows, stride_cols, 2, 2) {}
- constexpr static unsigned int pool_rows(void) { return 2; }
- constexpr static unsigned int pool_cols(void) { return 2; }
-
- constexpr static unsigned int stride_rows(void) { return 1; }
- constexpr static unsigned int stride_cols(void) { return 1; }
-
- constexpr static unsigned int out_rows(void) { return 2; }
- constexpr static unsigned int out_cols(void) { return 2; }
-
- kern_type kernel = a64_fp16_nhwc_max_2x2_s1_output2x2_depthfirst_impl;
-
- a64_fp16_nhwc_max_2x2_s1_output2x2_depthfirst(const CPUInfo *) {}
+ Parent::KernelType get_kernel(void) const { return a64_fp16_nhwc_max_2x2_s1_output2x2_depthfirst_impl; }
};
} // namespace pooling
} // namespace arm_conv
-#endif // defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+#endif // defined(__aarch64__) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp
index 1c461ee163..4b073b9076 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -26,7 +26,7 @@
#include <cstddef>
#include <cstdint>
-#if defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+#if defined(__aarch64__) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
namespace arm_conv {
namespace pooling {
@@ -63,116 +63,115 @@ void a64_fp16_nhwc_max_2x2_s1_output2x2_depthfirst_impl(
pad_left, pad_top, pad_right, pad_bottom);
__asm__ __volatile__(
- "ldr x15, [%x[args], %[offsetof_n_channels]]\n"
- "mov x14, #0x0\n"
- "ldr x20, [%x[args], %[offsetof_outptrs]]\n"
- "mov x13, #0x0\n"
- "ldr x19, [%x[args], %[offsetof_inptrs]]\n"
- "cmp x15, #0x8\n"
- "ldp x12, x11, [x20, #0x0]\n"
- "ldp x10, x9, [x20, #0x10]\n"
- "ldp x28, x27, [x19, #0x0]\n"
- "ldp x26, x25, [x19, #0x10]\n"
- "ldp x24, x23, [x19, #0x20]\n"
- "ldp x22, x21, [x19, #0x30]\n"
- "ldr x20, [x19, #0x40]\n"
+ "ldr x16, [%x[args], %[offsetof_n_channels]]\n"
+ "ldr x21, [%x[args], %[offsetof_outptrs]]\n"
+ "cmp x16, #0x8\n"
+ "mov x15, #0x0\n"
+ "ldr x20, [%x[args], %[offsetof_inptrs]]\n"
+ "ldp x14, x13, [x21, #0x0]\n"
+ "mov x12, #0x0\n"
+ "ldp x11, x10, [x21, #0x10]\n"
+ "ldp x9, x28, [x20, #0x0]\n"
+ "ldp x27, x26, [x20, #0x10]\n"
+ "ldp x25, x24, [x20, #0x20]\n"
+ "ldp x23, x22, [x20, #0x30]\n"
+ "ldr x21, [x20, #0x40]\n"
"blt 3f\n"
- "ldr q30, [x27, x14]\n"
- "lsr x19, x15, #0x3\n"
- "ldr q29, [x24, x14]\n"
- "sub x15, x15, x19, LSL #3\n"
- "ldr q28, [x21, x14]\n"
- "subs x19, x19, #0x1\n"
- "ldr q27, [x25, x14]\n"
- "ldr q26, [x28, x14]\n"
- "ldr q25, [x23, x14]\n"
- "ldr q24, [x26, x14]\n"
- "ldr q23, [x22, x14]\n"
- "ldr q22, [x20, x14]\n"
- "add x14, x14, #0x10\n"
+ "ldr q30, [x28, x15]\n"
+ "ldr q29, [x25, x15]\n"
+ "lsr x20, x16, #0x3\n"
+ "sub x16, x16, x20, LSL #3\n"
+ "ldr q28, [x22, x15]\n"
+ "ldr q27, [x26, x15]\n"
+ "subs x20, x20, #0x1\n"
+ "ldr q26, [x9, x15]\n"
+ "ldr q25, [x27, x15]\n"
+ "ldr q24, [x24, x15]\n"
+ "ldr q23, [x23, x15]\n"
+ "ldr q22, [x21, x15]\n"
+ "add x15, x15, #0x10\n"
"beq 2f\n"
"1:" // Vector: Loop
"fmax v21.8h, v30.8h, v29.8h\n"
- "ldr q30, [x27, x14]\n"
- "subs x19, x19, #0x1\n"
+ "ldr q30, [x28, x15]\n"
"fmax v20.8h, v29.8h, v28.8h\n"
- "ldr q29, [x24, x14]\n"
+ "ldr q29, [x25, x15]\n"
+ "ldr q28, [x22, x15]\n"
"fmax v19.8h, v27.8h, v26.8h\n"
- "ldr q28, [x21, x14]\n"
+ "ldr q26, [x9, x15]\n"
"fmax v18.8h, v25.8h, v24.8h\n"
- "ldr q26, [x28, x14]\n"
- "fmax v17.8h, v23.8h, v27.8h\n"
- "ldr q27, [x25, x14]\n"
- "fmax v16.8h, v25.8h, v22.8h\n"
- "ldr q25, [x23, x14]\n"
+ "ldr q25, [x27, x15]\n"
+ "fmax v17.8h, v27.8h, v23.8h\n"
+ "ldr q27, [x26, x15]\n"
+ "fmax v16.8h, v24.8h, v22.8h\n"
+ "ldr q24, [x24, x15]\n"
+ "ldr q23, [x23, x15]\n"
+ "subs x20, x20, #0x1\n"
"fmax v19.8h, v21.8h, v19.8h\n"
- "ldr q24, [x26, x14]\n"
- "fmax v18.8h, v21.8h, v18.8h\n"
- "ldr q23, [x22, x14]\n"
- "fmax v17.8h, v20.8h, v17.8h\n"
- "ldr q22, [x20, x14]\n"
- "add x14, x14, #0x10\n"
+ "ldr q22, [x21, x15]\n"
+ "fmax v18.8h, v18.8h, v21.8h\n"
+ "fmax v17.8h, v17.8h, v20.8h\n"
+ "add x15, x15, #0x10\n"
"fmax v16.8h, v20.8h, v16.8h\n"
- "str q19, [x12, x13]\n"
- "str q18, [x11, x13]\n"
- "str q17, [x10, x13]\n"
- "str q16, [x9, x13]\n"
- "add x13, x13, #0x10\n"
+ "str q19, [x14, x12]\n"
+ "str q18, [x13, x12]\n"
+ "str q17, [x11, x12]\n"
+ "str q16, [x10, x12]\n"
+ "add x12, x12, #0x10\n"
"bgt 1b\n"
"2:" // Vector: Tail
"fmax v21.8h, v30.8h, v29.8h\n"
"fmax v20.8h, v29.8h, v28.8h\n"
- "fmax v19.8h, v27.8h, v26.8h\n"
+ "fmax v16.8h, v27.8h, v26.8h\n"
"fmax v18.8h, v25.8h, v24.8h\n"
- "fmax v17.8h, v23.8h, v27.8h\n"
- "fmax v16.8h, v25.8h, v22.8h\n"
- "fmax v19.8h, v21.8h, v19.8h\n"
- "str q19, [x12, x13]\n"
- "fmax v18.8h, v21.8h, v18.8h\n"
- "fmax v17.8h, v20.8h, v17.8h\n"
- "str q18, [x11, x13]\n"
- "fmax v16.8h, v20.8h, v16.8h\n"
- "str q17, [x10, x13]\n"
- "str q16, [x9, x13]\n"
- "add x13, x13, #0x10\n"
- "cbz x15, 4f\n"
+ "fmax v17.8h, v27.8h, v23.8h\n"
+ "fmax v19.8h, v24.8h, v22.8h\n"
+ "fmax v16.8h, v21.8h, v16.8h\n"
+ "fmax v18.8h, v18.8h, v21.8h\n"
+ "str q16, [x14, x12]\n"
+ "fmax v17.8h, v17.8h, v20.8h\n"
+ "fmax v16.8h, v20.8h, v19.8h\n"
+ "str q18, [x13, x12]\n"
+ "str q17, [x11, x12]\n"
+ "str q16, [x10, x12]\n"
+ "add x12, x12, #0x10\n"
+ "cbz x16, 4f\n"
"3:" // Oddments
- "ldr h30, [x27, x14]\n"
- "subs x15, x15, #0x1\n"
- "ldr h29, [x24, x14]\n"
- "fmax v21.8h, v30.8h, v29.8h\n"
- "ldr h28, [x21, x14]\n"
- "ldr h27, [x25, x14]\n"
- "fmax v20.8h, v29.8h, v28.8h\n"
- "ldr h26, [x28, x14]\n"
- "ldr h25, [x23, x14]\n"
- "fmax v19.8h, v27.8h, v26.8h\n"
- "ldr h24, [x26, x14]\n"
- "ldr h23, [x22, x14]\n"
- "fmax v19.8h, v21.8h, v19.8h\n"
- "ldr h22, [x20, x14]\n"
- "add x14, x14, #0x2\n"
- "fmax v18.8h, v25.8h, v24.8h\n"
- "str h19, [x12, x13]\n"
- "fmax v17.8h, v23.8h, v27.8h\n"
- "fmax v16.8h, v25.8h, v22.8h\n"
- "fmax v18.8h, v21.8h, v18.8h\n"
- "str h18, [x11, x13]\n"
- "fmax v17.8h, v20.8h, v17.8h\n"
- "fmax v16.8h, v20.8h, v16.8h\n"
- "str h17, [x10, x13]\n"
- "str h16, [x9, x13]\n"
- "add x13, x13, #0x2\n"
+ "ldr h16, [x28, x15]\n"
+ "ldr h17, [x25, x15]\n"
+ "fmax v23.8h, v16.8h, v17.8h\n"
+ "subs x16, x16, #0x1\n"
+ "ldr h16, [x22, x15]\n"
+ "ldr h22, [x26, x15]\n"
+ "fmax v21.8h, v17.8h, v16.8h\n"
+ "ldr h16, [x9, x15]\n"
+ "ldr h17, [x27, x15]\n"
+ "fmax v16.8h, v22.8h, v16.8h\n"
+ "fmax v20.8h, v23.8h, v16.8h\n"
+ "ldr h19, [x24, x15]\n"
+ "ldr h16, [x23, x15]\n"
+ "fmax v18.8h, v17.8h, v19.8h\n"
+ "fmax v17.8h, v22.8h, v16.8h\n"
+ "ldr h16, [x21, x15]\n"
+ "fmax v16.8h, v19.8h, v16.8h\n"
+ "add x15, x15, #0x2\n"
+ "fmax v18.8h, v18.8h, v23.8h\n"
+ "fmax v17.8h, v17.8h, v21.8h\n"
+ "fmax v16.8h, v21.8h, v16.8h\n"
+ "str h20, [x14, x12]\n"
+ "str h18, [x13, x12]\n"
+ "str h17, [x11, x12]\n"
+ "str h16, [x10, x12]\n"
+ "add x12, x12, #0x2\n"
"bgt 3b\n"
"4:" // End
-
:
: [args] "r" (&args), [offsetof_inptrs] "I" (offsetof(KernelArgs, inptrs)), [offsetof_n_channels] "I" (offsetof(KernelArgs, n_channels)), [offsetof_outptrs] "I" (offsetof(KernelArgs, outptrs))
- : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
} // namespace pooling
} // namespace arm_conv
-#endif // defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+#endif // defined(__aarch64__) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_max_generic_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_max_generic_depthfirst.hpp
index 8bea0bf5df..4998b37b4b 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_max_generic_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_max_generic_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -33,19 +33,11 @@ namespace pooling {
void a64_fp16_nhwc_max_generic_depthfirst_impl(const uint64_t, const uint64_t n_valid_cells, uint64_t n_channels, const __fp16 *const *const inptrs, __fp16 *outptr);
-struct a64_fp16_nhwc_max_generic_depthfirst
+struct a64_fp16_nhwc_max_generic_depthfirst : IGenericDepthfirstStrategy<__fp16, __fp16>
{
- typedef __fp16 operand_type;
- typedef __fp16 return_type;
-
- typedef void (*kern_type)(const uint64_t, const uint64_t n_valid_cells, uint64_t n_channels, const __fp16 *const *const inptrs, __fp16 *outptr);
-
- constexpr static PoolingType pooling_type(void) { return PoolingType::MAX; }
-
-
- kern_type kernel = a64_fp16_nhwc_max_generic_depthfirst_impl;
-
+ using Parent = IGenericDepthfirstStrategy<__fp16, __fp16>;
a64_fp16_nhwc_max_generic_depthfirst(const CPUInfo *) {}
+ typename Parent::KernelType get_kernel(void) const override { return a64_fp16_nhwc_max_generic_depthfirst_impl; }
};
} // namespace pooling
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_max_generic_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_max_generic_depthfirst/generic.cpp
index e5f7ee3c72..c92e2cdebd 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_max_generic_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_max_generic_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,6 +23,7 @@
*/
#include <cstdint>
+#include <cstddef>
#if defined(__aarch64__) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
@@ -39,304 +40,302 @@ void a64_fp16_nhwc_max_generic_depthfirst_impl(
)
{
__asm__ __volatile__(
- "mov x28, #0x0\n"
- "mov x27, #0x10\n" // cntb _, ALL, #1
- "mov x26, #0x20\n" // cntb _, ALL, #2
- "mov x25, #0x30\n" // cntb _, ALL, #3
"cmp %x[n_channels], #0x20\n"
+ "mov x27, #0x0\n"
+ "mov x26, #0x10\n" // cntb _, ALL, #1
+ "mov x24, #0x20\n" // cntb _, ALL, #2
+ "mov x23, #0x30\n" // cntb _, ALL, #3
"blt 7f\n"
"1:" // 4-vectors of channels
"mov w20, #0xfc00\n"
+ "lsr x25, %x[n_valid_cells], #0x2\n"
+ "dup v8.8h, w20\n"
"dup v7.8h, w20\n"
- "mov x19, %x[inptrs]\n"
"dup v6.8h, w20\n"
- "lsr x24, %x[n_valid_cells], #0x2\n"
"dup v5.8h, w20\n"
- "dup v4.8h, w20\n"
- "cbz x24, 4f\n"
- "ldp x23, x22, [x19, #0x0]\n"
- "ldp x21, x20, [x19, #0x10]\n"
- "add x19, x19, #0x20\n"
- "subs x24, x24, #0x1\n"
- "ldr q3, [x23, x28]\n"
- "ldr q2, [x22, x28]\n"
- "ldr q1, [x21, x28]\n"
- "ldr q0, [x20, x28]\n"
- "ldr q31, [x23, x27]\n"
- "ldr q30, [x22, x27]\n"
- "ldr q29, [x21, x27]\n"
- "ldr q28, [x20, x27]\n"
- "ldr q27, [x23, x26]\n"
- "ldr q21, [x22, x26]\n"
- "ldr q26, [x21, x26]\n"
- "ldr q17, [x20, x26]\n"
- "ldr q25, [x23, x25]\n"
- "ldr q20, [x22, x25]\n"
- "ldr q24, [x21, x25]\n"
- "ldr q16, [x20, x25]\n"
+ "mov x22, %x[inptrs]\n"
+ "cbz x25, 4f\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q4, [x21, x27]\n"
+ "subs x25, x25, #0x1\n"
+ "ldr q3, [x20, x27]\n"
+ "ldr q2, [x21, x26]\n"
+ "ldr q1, [x20, x26]\n"
+ "ldr q0, [x21, x24]\n"
+ "ldr q31, [x20, x24]\n"
+ "ldr q30, [x21, x23]\n"
+ "ldr q29, [x20, x23]\n"
+ "ldp x21, x20, [x22, #0x10]\n"
+ "add x22, x22, #0x20\n"
+ "ldr q28, [x21, x27]\n"
+ "ldr q22, [x20, x27]\n"
+ "ldr q27, [x21, x26]\n"
+ "ldr q21, [x20, x26]\n"
+ "ldr q26, [x21, x24]\n"
+ "ldr q20, [x20, x24]\n"
+ "ldr q25, [x21, x23]\n"
+ "ldr q24, [x20, x23]\n"
"beq 3f\n"
"2:" // 4-vectors of channels: 4 inputs loop
- "fmax v23.8h, v3.8h, v2.8h\n"
- "ldp x23, x22, [x19, #0x0]\n"
- "subs x24, x24, #0x1\n"
- "fmax v19.8h, v1.8h, v0.8h\n"
- "ldp x21, x20, [x19, #0x10]\n"
- "add x19, x19, #0x20\n"
- "fmax v22.8h, v31.8h, v30.8h\n"
- "ldr q3, [x23, x28]\n"
- "fmax v18.8h, v29.8h, v28.8h\n"
- "fmax v21.8h, v27.8h, v21.8h\n"
- "ldr q2, [x22, x28]\n"
- "fmax v17.8h, v26.8h, v17.8h\n"
- "ldr q1, [x21, x28]\n"
- "fmax v20.8h, v25.8h, v20.8h\n"
- "ldr q0, [x20, x28]\n"
- "fmax v16.8h, v24.8h, v16.8h\n"
- "ldr q31, [x23, x27]\n"
+ "fmax v23.8h, v4.8h, v3.8h\n"
+ "fmax v19.8h, v28.8h, v22.8h\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q4, [x21, x27]\n"
+ "ldr q3, [x20, x27]\n"
+ "fmax v22.8h, v2.8h, v1.8h\n"
+ "ldr q2, [x21, x26]\n"
+ "fmax v18.8h, v27.8h, v21.8h\n"
+ "ldr q1, [x20, x26]\n"
+ "fmax v21.8h, v0.8h, v31.8h\n"
+ "ldr q0, [x21, x24]\n"
+ "fmax v17.8h, v26.8h, v20.8h\n"
+ "ldr q31, [x20, x24]\n"
+ "fmax v20.8h, v30.8h, v29.8h\n"
+ "ldr q30, [x21, x23]\n"
+ "fmax v16.8h, v25.8h, v24.8h\n"
+ "ldr q29, [x20, x23]\n"
"fmax v19.8h, v23.8h, v19.8h\n"
- "ldr q30, [x22, x27]\n"
"fmax v18.8h, v22.8h, v18.8h\n"
- "ldr q29, [x21, x27]\n"
+ "ldp x21, x20, [x22, #0x10]\n"
+ "ldr q28, [x21, x27]\n"
+ "ldr q22, [x20, x27]\n"
"fmax v17.8h, v21.8h, v17.8h\n"
- "ldr q28, [x20, x27]\n"
"fmax v16.8h, v20.8h, v16.8h\n"
- "ldr q27, [x23, x26]\n"
- "fmax v7.8h, v7.8h, v19.8h\n"
- "ldr q21, [x22, x26]\n"
- "fmax v6.8h, v6.8h, v18.8h\n"
- "ldr q26, [x21, x26]\n"
- "fmax v5.8h, v5.8h, v17.8h\n"
- "ldr q17, [x20, x26]\n"
- "fmax v4.8h, v4.8h, v16.8h\n"
- "ldr q25, [x23, x25]\n"
- "ldr q20, [x22, x25]\n"
- "ldr q24, [x21, x25]\n"
- "ldr q16, [x20, x25]\n"
+ "ldr q27, [x21, x26]\n"
+ "ldr q21, [x20, x26]\n"
+ "subs x25, x25, #0x1\n"
+ "fmax v8.8h, v8.8h, v19.8h\n"
+ "ldr q26, [x21, x24]\n"
+ "ldr q20, [x20, x24]\n"
+ "fmax v7.8h, v7.8h, v18.8h\n"
+ "fmax v6.8h, v6.8h, v17.8h\n"
+ "ldr q25, [x21, x23]\n"
+ "ldr q24, [x20, x23]\n"
+ "fmax v5.8h, v5.8h, v16.8h\n"
+ "add x22, x22, #0x20\n"
"bgt 2b\n"
"3:" // 4-vectors of channels: 4 inputs tail
- "fmax v23.8h, v3.8h, v2.8h\n"
- "fmax v19.8h, v1.8h, v0.8h\n"
- "fmax v22.8h, v31.8h, v30.8h\n"
- "fmax v18.8h, v29.8h, v28.8h\n"
- "fmax v21.8h, v27.8h, v21.8h\n"
- "fmax v17.8h, v26.8h, v17.8h\n"
- "fmax v20.8h, v25.8h, v20.8h\n"
- "fmax v16.8h, v24.8h, v16.8h\n"
+ "fmax v23.8h, v4.8h, v3.8h\n"
+ "fmax v19.8h, v28.8h, v22.8h\n"
+ "fmax v22.8h, v2.8h, v1.8h\n"
+ "fmax v18.8h, v27.8h, v21.8h\n"
+ "fmax v21.8h, v0.8h, v31.8h\n"
+ "fmax v17.8h, v26.8h, v20.8h\n"
+ "fmax v20.8h, v30.8h, v29.8h\n"
+ "fmax v16.8h, v25.8h, v24.8h\n"
"fmax v19.8h, v23.8h, v19.8h\n"
"fmax v18.8h, v22.8h, v18.8h\n"
"fmax v17.8h, v21.8h, v17.8h\n"
"fmax v16.8h, v20.8h, v16.8h\n"
- "fmax v7.8h, v7.8h, v19.8h\n"
- "fmax v6.8h, v6.8h, v18.8h\n"
- "fmax v5.8h, v5.8h, v17.8h\n"
- "fmax v4.8h, v4.8h, v16.8h\n"
+ "fmax v8.8h, v8.8h, v19.8h\n"
+ "fmax v7.8h, v7.8h, v18.8h\n"
+ "fmax v6.8h, v6.8h, v17.8h\n"
+ "fmax v5.8h, v5.8h, v16.8h\n"
"4:" // 4-vectors of channels: After loop
- "ands x20, %x[n_valid_cells], #0x3\n"
+ "ands x21, %x[n_valid_cells], #0x3\n"
"beq 6f\n"
"5:" // 4-vectors of channels: Single input loop
- "ldr x23, [x19], #0x8\n"
- "subs x20, x20, #0x1\n"
- "ldr q3, [x23, x28]\n"
- "fmax v7.8h, v7.8h, v3.8h\n"
- "ldr q31, [x23, x27]\n"
- "ldr q27, [x23, x26]\n"
- "fmax v6.8h, v6.8h, v31.8h\n"
- "ldr q25, [x23, x25]\n"
- "fmax v5.8h, v5.8h, v27.8h\n"
- "fmax v4.8h, v4.8h, v25.8h\n"
+ "ldr x20, [x22], #0x8\n"
+ "ldr q16, [x20, x27]\n"
+ "subs x21, x21, #0x1\n"
+ "fmax v8.8h, v8.8h, v16.8h\n"
+ "ldr q17, [x20, x26]\n"
+ "ldr q16, [x20, x24]\n"
+ "fmax v7.8h, v7.8h, v17.8h\n"
+ "fmax v6.8h, v6.8h, v16.8h\n"
+ "ldr q16, [x20, x23]\n"
+ "fmax v5.8h, v5.8h, v16.8h\n"
"bgt 5b\n"
"6:" // 4-vectors of channels: Single input loop: End
- "str q7, [%x[outptr], x28]\n"
- "add x28, x28, #0x40\n"
- "str q6, [%x[outptr], x27]\n"
- "add x27, x27, #0x40\n"
- "str q5, [%x[outptr], x26]\n"
- "add x26, x26, #0x40\n"
- "str q4, [%x[outptr], x25]\n"
- "add x25, x25, #0x40\n"
"sub %x[n_channels], %x[n_channels], #0x20\n"
"cmp %x[n_channels], #0x20\n"
+ "str q8, [%x[outptr], x27]\n"
+ "str q7, [%x[outptr], x26]\n"
+ "add x27, x27, #0x40\n"
+ "add x26, x26, #0x40\n"
+ "str q6, [%x[outptr], x24]\n"
+ "add x24, x24, #0x40\n"
+ "str q5, [%x[outptr], x23]\n"
+ "add x23, x23, #0x40\n"
"bge 1b\n"
"cbz %x[n_channels], 31f\n"
"7:" // Single vector of channels
"cmp %x[n_channels], #0x8\n"
"blt 14f\n"
"8:" // Single vector of channels: Loop
- "mov w19, #0xfc00\n"
- "dup v7.8h, w19\n"
- "mov x19, %x[inptrs]\n"
- "lsr x24, %x[n_valid_cells], #0x2\n"
- "cbz x24, 11f\n"
- "ldp x23, x22, [x19, #0x0]\n"
- "ldp x21, x20, [x19, #0x10]\n"
- "add x19, x19, #0x20\n"
- "subs x24, x24, #0x1\n"
- "ldr q3, [x23, x28]\n"
- "ldr q2, [x22, x28]\n"
- "ldr q1, [x21, x28]\n"
- "ldr q0, [x20, x28]\n"
+ "mov w20, #0xfc00\n"
+ "lsr x25, %x[n_valid_cells], #0x2\n"
+ "dup v8.8h, w20\n"
+ "mov x22, %x[inptrs]\n"
+ "cbz x25, 11f\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q4, [x21, x27]\n"
+ "subs x25, x25, #0x1\n"
+ "ldr q3, [x20, x27]\n"
+ "ldp x21, x20, [x22, #0x10]\n"
+ "add x22, x22, #0x20\n"
+ "ldr q28, [x21, x27]\n"
+ "ldr q22, [x20, x27]\n"
"beq 10f\n"
"9:" // Single vector of channels: Loop: 4 inputs loop
- "fmax v23.8h, v3.8h, v2.8h\n"
- "ldp x23, x22, [x19, #0x0]\n"
- "subs x24, x24, #0x1\n"
- "fmax v19.8h, v1.8h, v0.8h\n"
- "ldp x21, x20, [x19, #0x10]\n"
- "add x19, x19, #0x20\n"
- "fmax v19.8h, v23.8h, v19.8h\n"
- "ldr q3, [x23, x28]\n"
- "ldr q2, [x22, x28]\n"
- "fmax v7.8h, v7.8h, v19.8h\n"
- "ldr q1, [x21, x28]\n"
- "ldr q0, [x20, x28]\n"
+ "fmax v17.8h, v4.8h, v3.8h\n"
+ "fmax v16.8h, v28.8h, v22.8h\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q4, [x21, x27]\n"
+ "ldr q3, [x20, x27]\n"
+ "fmax v16.8h, v17.8h, v16.8h\n"
+ "ldp x21, x20, [x22, #0x10]\n"
+ "subs x25, x25, #0x1\n"
+ "ldr q28, [x21, x27]\n"
+ "ldr q22, [x20, x27]\n"
+ "fmax v8.8h, v8.8h, v16.8h\n"
+ "add x22, x22, #0x20\n"
"bgt 9b\n"
"10:" // Single vector of channels: Loop: 4 inputs tail
- "fmax v23.8h, v3.8h, v2.8h\n"
- "fmax v19.8h, v1.8h, v0.8h\n"
- "fmax v19.8h, v23.8h, v19.8h\n"
- "fmax v7.8h, v7.8h, v19.8h\n"
+ "fmax v17.8h, v4.8h, v3.8h\n"
+ "fmax v16.8h, v28.8h, v22.8h\n"
+ "fmax v16.8h, v17.8h, v16.8h\n"
+ "fmax v8.8h, v8.8h, v16.8h\n"
"11:" // Single vector of channels: Loop: After loop
- "ands x20, %x[n_valid_cells], #0x3\n"
+ "ands x21, %x[n_valid_cells], #0x3\n"
"beq 13f\n"
"12:" // Single vector of channels: Loop: Single input loop
- "ldr x23, [x19], #0x8\n"
- "subs x20, x20, #0x1\n"
- "ldr q3, [x23, x28]\n"
- "fmax v7.8h, v7.8h, v3.8h\n"
+ "ldr x20, [x22], #0x8\n"
+ "ldr q16, [x20, x27]\n"
+ "subs x21, x21, #0x1\n"
+ "fmax v8.8h, v8.8h, v16.8h\n"
"bgt 12b\n"
"13:" // Single vector of channels: Loop: Single input loop: End
- "str q7, [%x[outptr], x28]\n"
- "add x28, x28, #0x10\n"
"sub %x[n_channels], %x[n_channels], #0x8\n"
"cmp %x[n_channels], #0x8\n"
+ "str q8, [%x[outptr], x27]\n"
+ "add x27, x27, #0x10\n"
"bge 8b\n"
"cbz %x[n_channels], 31f\n"
"14:" // Oddments
- "add %x[outptr], %x[outptr], x28\n"
- "mov w19, #0xfc00\n"
- "dup v7.8h, w19\n"
- "mov x19, %x[inptrs]\n"
- "lsr x24, %x[n_valid_cells], #0x2\n"
- "cbz x24, 20f\n"
+ "mov w20, #0xfc00\n"
+ "lsr x25, %x[n_valid_cells], #0x2\n"
+ "dup v8.8h, w20\n"
+ "add %x[outptr], %x[outptr], x27\n"
+ "mov x24, %x[inptrs]\n"
+ "cbz x25, 20f\n"
"15:" // Oddments: 4 inputs loop
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x24, x24, #0x20\n"
+ "add x23, x23, x27\n"
+ "add x22, x22, x27\n"
+ "add x21, x21, x27\n"
+ "movi v4.16b, #0x0\n"
"movi v3.16b, #0x0\n"
- "ldp x23, x22, [x19, #0x0]\n"
- "add x23, x23, x28\n"
- "movi v2.16b, #0x0\n"
- "ldp x21, x20, [x19, #0x10]\n"
- "movi v1.16b, #0x0\n"
- "add x19, x19, #0x20\n"
- "movi v0.16b, #0x0\n"
- "add x22, x22, x28\n"
- "add x21, x21, x28\n"
- "add x20, x20, x28\n"
+ "add x20, x20, x27\n"
+ "movi v28.16b, #0x0\n"
+ "movi v22.16b, #0x0\n"
"tbz %x[n_channels], #2, 17f\n"
- "ldr d3, [x23], #0x8\n"
- "ldr d2, [x22], #0x8\n"
- "ldr d1, [x21], #0x8\n"
- "ldr d0, [x20], #0x8\n"
+ "ldr d4, [x23], #0x8\n"
+ "ldr d3, [x22], #0x8\n"
+ "ldr d28, [x21], #0x8\n"
+ "ldr d22, [x20], #0x8\n"
"tbz %x[n_channels], #1, 16f\n"
- "ld1 { v3.s }[2], [x23], #0x4\n"
- "ld1 { v2.s }[2], [x22], #0x4\n"
- "ld1 { v1.s }[2], [x21], #0x4\n"
- "ld1 { v0.s }[2], [x20], #0x4\n"
+ "ld1 { v4.s }[2], [x23], #0x4\n"
+ "ld1 { v3.s }[2], [x22], #0x4\n"
+ "ld1 { v28.s }[2], [x21], #0x4\n"
+ "ld1 { v22.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 19f\n"
- "ld1 { v3.h }[6], [x23], #0x2\n"
- "ld1 { v2.h }[6], [x22], #0x2\n"
- "ld1 { v1.h }[6], [x21], #0x2\n"
- "ld1 { v0.h }[6], [x20], #0x2\n"
+ "ld1 { v4.h }[6], [x23], #0x2\n"
+ "ld1 { v3.h }[6], [x22], #0x2\n"
+ "ld1 { v28.h }[6], [x21], #0x2\n"
+ "ld1 { v22.h }[6], [x20], #0x2\n"
"b 19f\n"
"16:" // Oddments: 4 inputs loop: Load: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 19f\n"
- "ld1 { v3.h }[4], [x23], #0x2\n"
- "ld1 { v2.h }[4], [x22], #0x2\n"
- "ld1 { v1.h }[4], [x21], #0x2\n"
- "ld1 { v0.h }[4], [x20], #0x2\n"
+ "ld1 { v4.h }[4], [x23], #0x2\n"
+ "ld1 { v3.h }[4], [x22], #0x2\n"
+ "ld1 { v28.h }[4], [x21], #0x2\n"
+ "ld1 { v22.h }[4], [x20], #0x2\n"
"b 19f\n"
"17:" // Oddments: 4 inputs loop: Load: Bit 2: Unset
"tbz %x[n_channels], #1, 18f\n"
- "ldr s3, [x23], #0x4\n"
- "ldr s2, [x22], #0x4\n"
- "ldr s1, [x21], #0x4\n"
- "ldr s0, [x20], #0x4\n"
+ "ldr s4, [x23], #0x4\n"
+ "ldr s3, [x22], #0x4\n"
+ "ldr s28, [x21], #0x4\n"
+ "ldr s22, [x20], #0x4\n"
"tbz %x[n_channels], #0, 19f\n"
- "ld1 { v3.h }[2], [x23], #0x2\n"
- "ld1 { v2.h }[2], [x22], #0x2\n"
- "ld1 { v1.h }[2], [x21], #0x2\n"
- "ld1 { v0.h }[2], [x20], #0x2\n"
+ "ld1 { v4.h }[2], [x23], #0x2\n"
+ "ld1 { v3.h }[2], [x22], #0x2\n"
+ "ld1 { v28.h }[2], [x21], #0x2\n"
+ "ld1 { v22.h }[2], [x20], #0x2\n"
"b 19f\n"
"18:" // Oddments: 4 inputs loop: Load: Bit 2: Unset: Bit 1: Unset
"tbz %x[n_channels], #0, 19f\n"
- "ldr h3, [x23], #0x2\n"
- "ldr h2, [x22], #0x2\n"
- "ldr h1, [x21], #0x2\n"
- "ldr h0, [x20], #0x2\n"
+ "ldr h4, [x23], #0x2\n"
+ "ldr h3, [x22], #0x2\n"
+ "ldr h28, [x21], #0x2\n"
+ "ldr h22, [x20], #0x2\n"
"19:" // Oddments: 4 inputs loop: Load: Bit 2: End
- "fmax v23.8h, v3.8h, v2.8h\n"
- "subs x24, x24, #0x1\n"
- "fmax v19.8h, v1.8h, v0.8h\n"
- "fmax v19.8h, v23.8h, v19.8h\n"
- "fmax v7.8h, v7.8h, v19.8h\n"
+ "fmax v17.8h, v4.8h, v3.8h\n"
+ "fmax v16.8h, v28.8h, v22.8h\n"
+ "subs x25, x25, #0x1\n"
+ "fmax v16.8h, v17.8h, v16.8h\n"
+ "fmax v8.8h, v8.8h, v16.8h\n"
"bgt 15b\n"
"20:" // Oddments: After loop
- "ands x20, %x[n_valid_cells], #0x3\n"
+ "ands x21, %x[n_valid_cells], #0x3\n"
"beq 26f\n"
"21:" // Oddments: Single input loop
- "movi v3.16b, #0x0\n"
- "ldr x23, [x19], #0x8\n"
- "add x23, x23, x28\n"
+ "ldr x23, [x24], #0x8\n"
+ "add x23, x23, x27\n"
+ "movi v4.16b, #0x0\n"
"tbz %x[n_channels], #2, 23f\n"
- "ldr d3, [x23], #0x8\n"
+ "ldr d4, [x23], #0x8\n"
"tbz %x[n_channels], #1, 22f\n"
- "ld1 { v3.s }[2], [x23], #0x4\n"
+ "ld1 { v4.s }[2], [x23], #0x4\n"
"tbz %x[n_channels], #0, 25f\n"
- "ld1 { v3.h }[6], [x23], #0x2\n"
+ "ld1 { v4.h }[6], [x23], #0x2\n"
"b 25f\n"
"22:" // Oddments: Single input loop: Load: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 25f\n"
- "ld1 { v3.h }[4], [x23], #0x2\n"
+ "ld1 { v4.h }[4], [x23], #0x2\n"
"b 25f\n"
"23:" // Oddments: Single input loop: Load: Bit 2: Unset
"tbz %x[n_channels], #1, 24f\n"
- "ldr s3, [x23], #0x4\n"
+ "ldr s4, [x23], #0x4\n"
"tbz %x[n_channels], #0, 25f\n"
- "ld1 { v3.h }[2], [x23], #0x2\n"
+ "ld1 { v4.h }[2], [x23], #0x2\n"
"b 25f\n"
"24:" // Oddments: Single input loop: Load: Bit 2: Unset: Bit 1: Unset
"tbz %x[n_channels], #0, 25f\n"
- "ldr h3, [x23], #0x2\n"
+ "ldr h4, [x23], #0x2\n"
"25:" // Oddments: Single input loop: Load: Bit 2: End
- "fmax v7.8h, v7.8h, v3.8h\n"
- "subs x20, x20, #0x1\n"
+ "subs x21, x21, #0x1\n"
+ "fmax v8.8h, v8.8h, v4.8h\n"
"bgt 21b\n"
"26:" // Oddments: Single input loop: End
"tbz %x[n_channels], #2, 28f\n"
- "st1 { v7.d }[0], [%x[outptr]], #0x8\n"
+ "st1 { v8.d }[0], [%x[outptr]], #0x8\n"
"tbz %x[n_channels], #1, 27f\n"
- "st1 { v7.s }[2], [%x[outptr]], #0x4\n"
+ "st1 { v8.s }[2], [%x[outptr]], #0x4\n"
"tbz %x[n_channels], #0, 30f\n"
- "st1 { v7.h }[6], [%x[outptr]], #0x2\n"
+ "st1 { v8.h }[6], [%x[outptr]], #0x2\n"
"b 30f\n"
"27:" // Oddments: Store: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 30f\n"
- "st1 { v7.h }[4], [%x[outptr]], #0x2\n"
+ "st1 { v8.h }[4], [%x[outptr]], #0x2\n"
"b 30f\n"
"28:" // Oddments: Store: Bit 2: Unset
"tbz %x[n_channels], #1, 29f\n"
- "st1 { v7.s }[0], [%x[outptr]], #0x4\n"
+ "st1 { v8.s }[0], [%x[outptr]], #0x4\n"
"tbz %x[n_channels], #0, 30f\n"
- "st1 { v7.h }[2], [%x[outptr]], #0x2\n"
+ "st1 { v8.h }[2], [%x[outptr]], #0x2\n"
"b 30f\n"
"29:" // Oddments: Store: Bit 2: Unset: Bit 1: Unset
"tbz %x[n_channels], #0, 30f\n"
- "st1 { v7.h }[0], [%x[outptr]], #0x2\n"
+ "st1 { v8.h }[0], [%x[outptr]], #0x2\n"
"30:" // Oddments: Store: Bit 2: End
-
"31:" // End
-
: [n_channels] "+&r" (n_channels), [outptr] "+&r" (outptr)
: [inptrs] "r" (inptrs), [n_valid_cells] "r" (n_valid_cells)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst.hpp
index 9a16b99a71..7add5feb1d 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,33 +24,28 @@
#pragma once
+#if defined(__aarch64__)
+
namespace arm_conv {
namespace pooling {
void a64_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst_impl(unsigned int, const float *const *const, float *const *const, bool, unsigned int, unsigned int, unsigned int, unsigned int);
-struct a64_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst
+struct a64_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst : public DepthfirstStrategy<float, float>
{
- typedef float operand_type;
- typedef float return_type;
-
- typedef void (*kern_type)(unsigned int, const float *const *const, float *const *const, bool, unsigned int, unsigned int, unsigned int, unsigned int);
-
- constexpr static PoolingType pooling_type(void) { return PoolingType::AVERAGE; }
+ using Parent = DepthfirstStrategy<float, float>;
- constexpr static unsigned int pool_rows(void) { return 3; }
- constexpr static unsigned int pool_cols(void) { return 3; }
+ const static auto pooling_type = PoolingType::AVERAGE;
+ const static auto pool_rows = 3u, pool_cols = 3u;
+ const static auto stride_rows = 1u, stride_cols = 1u;
- constexpr static unsigned int stride_rows(void) { return 1; }
- constexpr static unsigned int stride_cols(void) { return 1; }
+ a64_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst(const CPUInfo *)
+ : Parent(pool_rows, pool_cols, stride_rows, stride_cols, 2, 2) {}
- constexpr static unsigned int out_rows(void) { return 2; }
- constexpr static unsigned int out_cols(void) { return 2; }
-
- kern_type kernel = a64_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst_impl;
-
- a64_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst(const CPUInfo *) {}
+ Parent::KernelType get_kernel(void) const { return a64_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst_impl; }
};
} // namespace pooling
} // namespace arm_conv
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst/generic.cpp
index ff8d7d8ba1..cf0047638e 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -26,6 +26,8 @@
#include <cstddef>
#include <cstdint>
+#if defined(__aarch64__)
+
namespace arm_conv {
namespace pooling {
@@ -80,172 +82,173 @@ void a64_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst_impl(
pad_left, pad_top, pad_right, pad_bottom);
__asm__ __volatile__(
- "ldr x4, [%x[args], %[offsetof_n_channels]]\n"
+ "ldr q7, [%x[args], %[offsetof_rescale]]\n"
+ "ldr x3, [%x[args], %[offsetof_n_channels]]\n"
+ "cmp x3, #0x4\n"
+ "mov x4, #0x0\n"
+ "ldr x21, [%x[args], %[offsetof_outptrs]]\n"
+ "ldr x20, [%x[args], %[offsetof_inptrs]]\n"
"mov x5, #0x0\n"
- "ldr x20, [%x[args], %[offsetof_outptrs]]\n"
- "mov x6, #0x0\n"
- "ldr x19, [%x[args], %[offsetof_inptrs]]\n"
- "cmp x4, #0x4\n"
- "ldp x7, x8, [x20, #0x0]\n"
- "ldp x17, x16, [x20, #0x10]\n"
- "ldp x15, x14, [x19, #0x0]\n"
- "ldp x13, x12, [x19, #0x10]\n"
- "ldp x11, x10, [x19, #0x20]\n"
- "ldp x9, x28, [x19, #0x30]\n"
- "ldp x27, x26, [x19, #0x40]\n"
- "ldp x25, x24, [x19, #0x50]\n"
- "ldp x23, x22, [x19, #0x60]\n"
- "ldp x21, x20, [x19, #0x70]\n"
- "ldr q8, [%x[args], %[offsetof_rescale]]\n"
+ "ldp x6, x7, [x21, #0x0]\n"
+ "ldp x8, x17, [x21, #0x10]\n"
+ "ldp x16, x15, [x20, #0x0]\n"
+ "ldp x14, x13, [x20, #0x10]\n"
+ "ldp x12, x11, [x20, #0x20]\n"
+ "ldp x10, x9, [x20, #0x30]\n"
+ "ldp x28, x27, [x20, #0x40]\n"
+ "ldp x26, x25, [x20, #0x50]\n"
+ "ldp x24, x23, [x20, #0x60]\n"
+ "ldp x22, x21, [x20, #0x70]\n"
"blt 3f\n"
- "ldr q7, [x10, x5]\n"
- "lsr x19, x4, #0x2\n"
- "ldr q6, [x9, x5]\n"
- "sub x4, x4, x19, LSL #2\n"
- "ldr q5, [x26, x5]\n"
- "subs x19, x19, #0x1\n"
- "ldr q4, [x25, x5]\n"
- "ldr q3, [x14, x5]\n"
- "ldr q2, [x13, x5]\n"
- "ldr q1, [x11, x5]\n"
- "ldr q0, [x27, x5]\n"
- "ldr q31, [x28, x5]\n"
- "ldr q30, [x24, x5]\n"
- "ldr q29, [x22, x5]\n"
- "ldr q28, [x21, x5]\n"
- "ldr q27, [x15, x5]\n"
- "ldr q26, [x12, x5]\n"
- "ldr q25, [x23, x5]\n"
- "ldr q24, [x20, x5]\n"
- "add x5, x5, #0x10\n"
+ "ldr q6, [x11, x4]\n"
+ "ldr q5, [x10, x4]\n"
+ "lsr x20, x3, #0x2\n"
+ "sub x3, x3, x20, LSL #2\n"
+ "ldr q4, [x27, x4]\n"
+ "ldr q3, [x26, x4]\n"
+ "subs x20, x20, #0x1\n"
+ "ldr q2, [x15, x4]\n"
+ "ldr q1, [x14, x4]\n"
+ "ldr q0, [x12, x4]\n"
+ "ldr q31, [x28, x4]\n"
+ "ldr q30, [x9, x4]\n"
+ "ldr q29, [x25, x4]\n"
+ "ldr q28, [x23, x4]\n"
+ "ldr q27, [x22, x4]\n"
+ "ldr q26, [x16, x4]\n"
+ "ldr q25, [x13, x4]\n"
+ "ldr q24, [x24, x4]\n"
+ "ldr q23, [x21, x4]\n"
+ "add x4, x4, #0x10\n"
"beq 2f\n"
"1:" // Vector: Loop
- "fadd v17.4s, v7.4s, v6.4s\n"
- "ldr q7, [x10, x5]\n"
- "subs x19, x19, #0x1\n"
- "fadd v16.4s, v5.4s, v4.4s\n"
- "ldr q6, [x9, x5]\n"
- "fadd v18.4s, v3.4s, v2.4s\n"
- "ldr q5, [x26, x5]\n"
- "fadd v23.4s, v1.4s, v0.4s\n"
- "ldr q4, [x25, x5]\n"
- "fadd v22.4s, v31.4s, v30.4s\n"
- "ldr q3, [x14, x5]\n"
- "fadd v17.4s, v17.4s, v16.4s\n"
- "ldr q2, [x13, x5]\n"
- "fadd v16.4s, v29.4s, v28.4s\n"
- "ldr q1, [x11, x5]\n"
- "fadd v19.4s, v27.4s, v23.4s\n"
- "ldr q0, [x27, x5]\n"
- "fadd v21.4s, v18.4s, v17.4s\n"
- "ldr q31, [x28, x5]\n"
- "fadd v20.4s, v16.4s, v17.4s\n"
- "ldr q30, [x24, x5]\n"
- "fadd v18.4s, v26.4s, v22.4s\n"
- "ldr q29, [x22, x5]\n"
- "fadd v17.4s, v25.4s, v23.4s\n"
- "ldr q28, [x21, x5]\n"
- "fadd v16.4s, v24.4s, v22.4s\n"
- "ldr q27, [x15, x5]\n"
+ "fadd v17.4s, v6.4s, v5.4s\n"
+ "ldr q6, [x11, x4]\n"
+ "ldr q5, [x10, x4]\n"
+ "fadd v16.4s, v4.4s, v3.4s\n"
+ "ldr q4, [x27, x4]\n"
+ "ldr q3, [x26, x4]\n"
+ "fadd v19.4s, v17.4s, v16.4s\n"
+ "fadd v18.4s, v2.4s, v1.4s\n"
+ "ldr q2, [x15, x4]\n"
+ "ldr q1, [x14, x4]\n"
+ "fadd v17.4s, v0.4s, v31.4s\n"
+ "fadd v22.4s, v30.4s, v29.4s\n"
+ "ldr q0, [x12, x4]\n"
+ "ldr q31, [x28, x4]\n"
+ "fadd v16.4s, v28.4s, v27.4s\n"
+ "fadd v21.4s, v18.4s, v19.4s\n"
+ "ldr q30, [x9, x4]\n"
+ "ldr q29, [x25, x4]\n"
+ "fadd v20.4s, v16.4s, v19.4s\n"
+ "fadd v19.4s, v26.4s, v17.4s\n"
+ "ldr q28, [x23, x4]\n"
+ "ldr q27, [x22, x4]\n"
+ "fadd v18.4s, v25.4s, v22.4s\n"
+ "fadd v17.4s, v24.4s, v17.4s\n"
+ "ldr q26, [x16, x4]\n"
+ "ldr q25, [x13, x4]\n"
+ "fadd v16.4s, v23.4s, v22.4s\n"
"fadd v19.4s, v21.4s, v19.4s\n"
- "ldr q26, [x12, x5]\n"
+ "ldr q24, [x24, x4]\n"
+ "ldr q23, [x21, x4]\n"
"fadd v18.4s, v21.4s, v18.4s\n"
- "ldr q25, [x23, x5]\n"
"fadd v17.4s, v17.4s, v20.4s\n"
- "ldr q24, [x20, x5]\n"
+ "fadd v16.4s, v16.4s, v20.4s\n"
+ "subs x20, x20, #0x1\n"
+ "fmul v19.4s, v19.4s, v7.s[0]\n"
+ "add x4, x4, #0x10\n"
+ "fmul v18.4s, v18.4s, v7.s[1]\n"
+ "fmul v17.4s, v17.4s, v7.s[2]\n"
+ "str q19, [x6, x5]\n"
+ "fmul v16.4s, v16.4s, v7.s[3]\n"
+ "str q18, [x7, x5]\n"
+ "str q17, [x8, x5]\n"
+ "str q16, [x17, x5]\n"
"add x5, x5, #0x10\n"
- "fadd v16.4s, v20.4s, v16.4s\n"
- "fmul v19.4s, v19.4s, v8.s[0]\n"
- "str q19, [x7, x6]\n"
- "fmul v18.4s, v18.4s, v8.s[1]\n"
- "fmul v17.4s, v17.4s, v8.s[2]\n"
- "str q18, [x8, x6]\n"
- "fmul v16.4s, v16.4s, v8.s[3]\n"
- "str q17, [x17, x6]\n"
- "str q16, [x16, x6]\n"
- "add x6, x6, #0x10\n"
"bgt 1b\n"
"2:" // Vector: Tail
- "fadd v17.4s, v7.4s, v6.4s\n"
- "fadd v16.4s, v5.4s, v4.4s\n"
- "fadd v18.4s, v3.4s, v2.4s\n"
- "fadd v23.4s, v1.4s, v0.4s\n"
- "fadd v17.4s, v17.4s, v16.4s\n"
- "fadd v22.4s, v31.4s, v30.4s\n"
- "fadd v16.4s, v29.4s, v28.4s\n"
- "fadd v21.4s, v18.4s, v17.4s\n"
- "fadd v19.4s, v27.4s, v23.4s\n"
- "fadd v20.4s, v16.4s, v17.4s\n"
- "fadd v18.4s, v26.4s, v22.4s\n"
- "fadd v17.4s, v25.4s, v23.4s\n"
- "fadd v16.4s, v24.4s, v22.4s\n"
+ "fadd v17.4s, v6.4s, v5.4s\n"
+ "fadd v16.4s, v4.4s, v3.4s\n"
+ "fadd v19.4s, v17.4s, v16.4s\n"
+ "fadd v18.4s, v2.4s, v1.4s\n"
+ "fadd v17.4s, v0.4s, v31.4s\n"
+ "fadd v22.4s, v30.4s, v29.4s\n"
+ "fadd v16.4s, v28.4s, v27.4s\n"
+ "fadd v21.4s, v18.4s, v19.4s\n"
+ "fadd v20.4s, v16.4s, v19.4s\n"
+ "fadd v19.4s, v26.4s, v17.4s\n"
+ "fadd v18.4s, v25.4s, v22.4s\n"
+ "fadd v17.4s, v24.4s, v17.4s\n"
+ "fadd v16.4s, v23.4s, v22.4s\n"
"fadd v19.4s, v21.4s, v19.4s\n"
"fadd v18.4s, v21.4s, v18.4s\n"
"fadd v17.4s, v17.4s, v20.4s\n"
- "fadd v16.4s, v20.4s, v16.4s\n"
- "fmul v19.4s, v19.4s, v8.s[0]\n"
- "str q19, [x7, x6]\n"
- "fmul v18.4s, v18.4s, v8.s[1]\n"
- "fmul v17.4s, v17.4s, v8.s[2]\n"
- "str q18, [x8, x6]\n"
- "fmul v16.4s, v16.4s, v8.s[3]\n"
- "str q17, [x17, x6]\n"
- "str q16, [x16, x6]\n"
- "add x6, x6, #0x10\n"
- "cbz x4, 4f\n"
+ "fadd v16.4s, v16.4s, v20.4s\n"
+ "fmul v19.4s, v19.4s, v7.s[0]\n"
+ "str q19, [x6, x5]\n"
+ "fmul v18.4s, v18.4s, v7.s[1]\n"
+ "fmul v17.4s, v17.4s, v7.s[2]\n"
+ "str q18, [x7, x5]\n"
+ "fmul v16.4s, v16.4s, v7.s[3]\n"
+ "str q17, [x8, x5]\n"
+ "str q16, [x17, x5]\n"
+ "add x5, x5, #0x10\n"
+ "cbz x3, 4f\n"
"3:" // Oddments
- "ldr s7, [x10, x5]\n"
- "subs x4, x4, #0x1\n"
- "ldr s6, [x9, x5]\n"
- "fadd v17.4s, v7.4s, v6.4s\n"
- "ldr s5, [x26, x5]\n"
- "ldr s4, [x25, x5]\n"
- "fadd v16.4s, v5.4s, v4.4s\n"
- "ldr s3, [x14, x5]\n"
- "ldr s2, [x13, x5]\n"
- "fadd v17.4s, v17.4s, v16.4s\n"
- "ldr s1, [x11, x5]\n"
- "ldr s0, [x27, x5]\n"
- "fadd v18.4s, v3.4s, v2.4s\n"
- "ldr s31, [x28, x5]\n"
- "fadd v23.4s, v1.4s, v0.4s\n"
- "ldr s30, [x24, x5]\n"
- "fadd v21.4s, v18.4s, v17.4s\n"
- "ldr s29, [x22, x5]\n"
- "ldr s28, [x21, x5]\n"
- "fadd v22.4s, v31.4s, v30.4s\n"
- "ldr s27, [x15, x5]\n"
- "ldr s26, [x12, x5]\n"
- "fadd v16.4s, v29.4s, v28.4s\n"
- "ldr s25, [x23, x5]\n"
- "fadd v20.4s, v16.4s, v17.4s\n"
- "ldr s24, [x20, x5]\n"
- "add x5, x5, #0x4\n"
- "fadd v19.4s, v27.4s, v23.4s\n"
- "fadd v18.4s, v26.4s, v22.4s\n"
- "fadd v17.4s, v25.4s, v23.4s\n"
- "fadd v16.4s, v24.4s, v22.4s\n"
- "fadd v19.4s, v21.4s, v19.4s\n"
- "fadd v18.4s, v21.4s, v18.4s\n"
+ "ldr s17, [x11, x4]\n"
+ "ldr s16, [x10, x4]\n"
+ "fadd v18.4s, v17.4s, v16.4s\n"
+ "subs x3, x3, #0x1\n"
+ "ldr s17, [x27, x4]\n"
+ "ldr s16, [x26, x4]\n"
+ "fadd v16.4s, v17.4s, v16.4s\n"
+ "fadd v18.4s, v18.4s, v16.4s\n"
+ "ldr s17, [x15, x4]\n"
+ "ldr s16, [x14, x4]\n"
+ "fadd v16.4s, v17.4s, v16.4s\n"
+ "fadd v23.4s, v16.4s, v18.4s\n"
+ "ldr s17, [x12, x4]\n"
+ "ldr s16, [x28, x4]\n"
+ "fadd v22.4s, v17.4s, v16.4s\n"
+ "ldr s17, [x9, x4]\n"
+ "ldr s16, [x25, x4]\n"
+ "fadd v21.4s, v17.4s, v16.4s\n"
+ "ldr s17, [x23, x4]\n"
+ "ldr s16, [x22, x4]\n"
+ "fadd v16.4s, v17.4s, v16.4s\n"
+ "fadd v20.4s, v16.4s, v18.4s\n"
+ "ldr s17, [x16, x4]\n"
+ "ldr s16, [x13, x4]\n"
+ "fadd v19.4s, v17.4s, v22.4s\n"
+ "fadd v18.4s, v16.4s, v21.4s\n"
+ "ldr s17, [x24, x4]\n"
+ "ldr s16, [x21, x4]\n"
+ "fadd v17.4s, v17.4s, v22.4s\n"
+ "fadd v16.4s, v16.4s, v21.4s\n"
+ "fadd v19.4s, v23.4s, v19.4s\n"
+ "fadd v18.4s, v23.4s, v18.4s\n"
+ "add x4, x4, #0x4\n"
"fadd v17.4s, v17.4s, v20.4s\n"
- "fadd v16.4s, v20.4s, v16.4s\n"
- "fmul v19.4s, v19.4s, v8.s[0]\n"
- "str s19, [x7, x6]\n"
- "fmul v18.4s, v18.4s, v8.s[1]\n"
- "fmul v17.4s, v17.4s, v8.s[2]\n"
- "str s18, [x8, x6]\n"
- "fmul v16.4s, v16.4s, v8.s[3]\n"
- "str s17, [x17, x6]\n"
- "str s16, [x16, x6]\n"
- "add x6, x6, #0x4\n"
+ "fadd v16.4s, v16.4s, v20.4s\n"
+ "fmul v19.4s, v19.4s, v7.s[0]\n"
+ "fmul v18.4s, v18.4s, v7.s[1]\n"
+ "str s19, [x6, x5]\n"
+ "fmul v17.4s, v17.4s, v7.s[2]\n"
+ "fmul v16.4s, v16.4s, v7.s[3]\n"
+ "str s18, [x7, x5]\n"
+ "str s17, [x8, x5]\n"
+ "str s16, [x17, x5]\n"
+ "add x5, x5, #0x4\n"
"bgt 3b\n"
"4:" // End
-
:
: [args] "r" (&args), [offsetof_inptrs] "I" (offsetof(KernelArgs, inptrs)), [offsetof_n_channels] "I" (offsetof(KernelArgs, n_channels)), [offsetof_outptrs] "I" (offsetof(KernelArgs, outptrs)), [offsetof_rescale] "I" (offsetof(KernelArgs, rescale_vals))
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
} // namespace pooling
} // namespace arm_conv
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_avg_generic_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_avg_generic_depthfirst.hpp
index 4ef26318d4..26895e610d 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_avg_generic_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_avg_generic_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -33,19 +33,11 @@ namespace pooling {
void a64_fp32_nhwc_avg_generic_depthfirst_impl(const uint64_t window_cells, const uint64_t n_valid_cells, uint64_t n_channels, const float *const *const inptrs, float *outptr);
-struct a64_fp32_nhwc_avg_generic_depthfirst
+struct a64_fp32_nhwc_avg_generic_depthfirst : IGenericDepthfirstStrategy<float, float>
{
- typedef float operand_type;
- typedef float return_type;
-
- typedef void (*kern_type)(const uint64_t window_cells, const uint64_t n_valid_cells, uint64_t n_channels, const float *const *const inptrs, float *outptr);
-
- constexpr static PoolingType pooling_type(void) { return PoolingType::AVERAGE; }
-
-
- kern_type kernel = a64_fp32_nhwc_avg_generic_depthfirst_impl;
-
+ using Parent = IGenericDepthfirstStrategy<float, float>;
a64_fp32_nhwc_avg_generic_depthfirst(const CPUInfo *) {}
+ typename Parent::KernelType get_kernel(void) const override { return a64_fp32_nhwc_avg_generic_depthfirst_impl; }
};
} // namespace pooling
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_avg_generic_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_avg_generic_depthfirst/generic.cpp
index 21f705451a..d236f07b1c 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_avg_generic_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_avg_generic_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,6 +23,7 @@
*/
#include <cstdint>
+#include <cstddef>
#if defined(__aarch64__)
@@ -41,260 +42,258 @@ void a64_fp32_nhwc_avg_generic_depthfirst_impl(
const auto rescale_value = static_cast<float>(1.0f / static_cast<float>(window_cells));
__asm__ __volatile__(
- "ld1r { v8.4s }, [%x[rescale_ptr]]\n"
- "mov x28, #0x0\n"
- "mov x27, #0x10\n" // cntb _, ALL, #1
- "mov x26, #0x20\n" // cntb _, ALL, #2
- "mov x25, #0x30\n" // cntb _, ALL, #3
+ "ld1r { v9.4s }, [%x[rescale_ptr]]\n"
"cmp %x[n_channels], #0x10\n"
+ "mov x27, #0x0\n"
+ "mov x26, #0x10\n" // cntb _, ALL, #1
+ "mov x24, #0x20\n" // cntb _, ALL, #2
+ "mov x23, #0x30\n" // cntb _, ALL, #3
"blt 7f\n"
"1:" // 4-vectors of channels
+ "lsr x25, %x[n_valid_cells], #0x2\n"
+ "movi v8.16b, #0x0\n"
"movi v7.16b, #0x0\n"
- "mov x19, %x[inptrs]\n"
+ "mov x22, %x[inptrs]\n"
"movi v6.16b, #0x0\n"
- "lsr x24, %x[n_valid_cells], #0x2\n"
"movi v5.16b, #0x0\n"
- "movi v4.16b, #0x0\n"
- "cbz x24, 4f\n"
- "ldp x23, x22, [x19, #0x0]\n"
- "ldp x21, x20, [x19, #0x10]\n"
- "add x19, x19, #0x20\n"
- "subs x24, x24, #0x1\n"
- "ldr q3, [x23, x28]\n"
- "ldr q2, [x22, x28]\n"
- "ldr q1, [x21, x28]\n"
- "ldr q0, [x20, x28]\n"
- "ldr q31, [x23, x27]\n"
- "ldr q30, [x22, x27]\n"
- "ldr q29, [x21, x27]\n"
- "ldr q28, [x20, x27]\n"
- "ldr q27, [x23, x26]\n"
- "ldr q21, [x22, x26]\n"
- "ldr q26, [x21, x26]\n"
- "ldr q17, [x20, x26]\n"
- "ldr q25, [x23, x25]\n"
- "ldr q20, [x22, x25]\n"
- "ldr q24, [x21, x25]\n"
- "ldr q16, [x20, x25]\n"
+ "cbz x25, 4f\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q4, [x21, x27]\n"
+ "subs x25, x25, #0x1\n"
+ "ldr q3, [x20, x27]\n"
+ "ldr q2, [x21, x26]\n"
+ "ldr q1, [x20, x26]\n"
+ "ldr q0, [x21, x24]\n"
+ "ldr q31, [x20, x24]\n"
+ "ldr q30, [x21, x23]\n"
+ "ldr q29, [x20, x23]\n"
+ "ldp x21, x20, [x22, #0x10]\n"
+ "add x22, x22, #0x20\n"
+ "ldr q28, [x21, x27]\n"
+ "ldr q22, [x20, x27]\n"
+ "ldr q27, [x21, x26]\n"
+ "ldr q21, [x20, x26]\n"
+ "ldr q26, [x21, x24]\n"
+ "ldr q20, [x20, x24]\n"
+ "ldr q25, [x21, x23]\n"
+ "ldr q24, [x20, x23]\n"
"beq 3f\n"
"2:" // 4-vectors of channels: 4 inputs loop
- "fadd v23.4s, v3.4s, v2.4s\n"
- "ldp x23, x22, [x19, #0x0]\n"
- "subs x24, x24, #0x1\n"
- "fadd v19.4s, v1.4s, v0.4s\n"
- "ldp x21, x20, [x19, #0x10]\n"
- "add x19, x19, #0x20\n"
- "fadd v22.4s, v31.4s, v30.4s\n"
- "ldr q3, [x23, x28]\n"
- "fadd v18.4s, v29.4s, v28.4s\n"
- "fadd v21.4s, v27.4s, v21.4s\n"
- "ldr q2, [x22, x28]\n"
- "fadd v17.4s, v26.4s, v17.4s\n"
- "ldr q1, [x21, x28]\n"
- "fadd v20.4s, v25.4s, v20.4s\n"
- "ldr q0, [x20, x28]\n"
- "fadd v16.4s, v24.4s, v16.4s\n"
- "ldr q31, [x23, x27]\n"
+ "fadd v23.4s, v4.4s, v3.4s\n"
+ "fadd v19.4s, v28.4s, v22.4s\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q4, [x21, x27]\n"
+ "ldr q3, [x20, x27]\n"
+ "fadd v22.4s, v2.4s, v1.4s\n"
+ "ldr q2, [x21, x26]\n"
+ "fadd v18.4s, v27.4s, v21.4s\n"
+ "ldr q1, [x20, x26]\n"
+ "fadd v21.4s, v0.4s, v31.4s\n"
+ "ldr q0, [x21, x24]\n"
+ "fadd v17.4s, v26.4s, v20.4s\n"
+ "ldr q31, [x20, x24]\n"
+ "fadd v20.4s, v30.4s, v29.4s\n"
+ "ldr q30, [x21, x23]\n"
+ "fadd v16.4s, v25.4s, v24.4s\n"
+ "ldr q29, [x20, x23]\n"
"fadd v19.4s, v23.4s, v19.4s\n"
- "ldr q30, [x22, x27]\n"
"fadd v18.4s, v22.4s, v18.4s\n"
- "ldr q29, [x21, x27]\n"
+ "ldp x21, x20, [x22, #0x10]\n"
+ "ldr q28, [x21, x27]\n"
+ "ldr q22, [x20, x27]\n"
"fadd v17.4s, v21.4s, v17.4s\n"
- "ldr q28, [x20, x27]\n"
"fadd v16.4s, v20.4s, v16.4s\n"
- "ldr q27, [x23, x26]\n"
- "fadd v7.4s, v7.4s, v19.4s\n"
- "ldr q21, [x22, x26]\n"
- "fadd v6.4s, v6.4s, v18.4s\n"
- "ldr q26, [x21, x26]\n"
- "fadd v5.4s, v5.4s, v17.4s\n"
- "ldr q17, [x20, x26]\n"
- "fadd v4.4s, v4.4s, v16.4s\n"
- "ldr q25, [x23, x25]\n"
- "ldr q20, [x22, x25]\n"
- "ldr q24, [x21, x25]\n"
- "ldr q16, [x20, x25]\n"
+ "ldr q27, [x21, x26]\n"
+ "ldr q21, [x20, x26]\n"
+ "subs x25, x25, #0x1\n"
+ "fadd v8.4s, v8.4s, v19.4s\n"
+ "ldr q26, [x21, x24]\n"
+ "ldr q20, [x20, x24]\n"
+ "fadd v7.4s, v7.4s, v18.4s\n"
+ "fadd v6.4s, v6.4s, v17.4s\n"
+ "ldr q25, [x21, x23]\n"
+ "ldr q24, [x20, x23]\n"
+ "fadd v5.4s, v5.4s, v16.4s\n"
+ "add x22, x22, #0x20\n"
"bgt 2b\n"
"3:" // 4-vectors of channels: 4 inputs tail
- "fadd v23.4s, v3.4s, v2.4s\n"
- "fadd v19.4s, v1.4s, v0.4s\n"
- "fadd v22.4s, v31.4s, v30.4s\n"
- "fadd v18.4s, v29.4s, v28.4s\n"
- "fadd v21.4s, v27.4s, v21.4s\n"
- "fadd v17.4s, v26.4s, v17.4s\n"
- "fadd v20.4s, v25.4s, v20.4s\n"
- "fadd v16.4s, v24.4s, v16.4s\n"
+ "fadd v23.4s, v4.4s, v3.4s\n"
+ "fadd v19.4s, v28.4s, v22.4s\n"
+ "fadd v22.4s, v2.4s, v1.4s\n"
+ "fadd v18.4s, v27.4s, v21.4s\n"
+ "fadd v21.4s, v0.4s, v31.4s\n"
+ "fadd v17.4s, v26.4s, v20.4s\n"
+ "fadd v20.4s, v30.4s, v29.4s\n"
+ "fadd v16.4s, v25.4s, v24.4s\n"
"fadd v19.4s, v23.4s, v19.4s\n"
"fadd v18.4s, v22.4s, v18.4s\n"
"fadd v17.4s, v21.4s, v17.4s\n"
"fadd v16.4s, v20.4s, v16.4s\n"
- "fadd v7.4s, v7.4s, v19.4s\n"
- "fadd v6.4s, v6.4s, v18.4s\n"
- "fadd v5.4s, v5.4s, v17.4s\n"
- "fadd v4.4s, v4.4s, v16.4s\n"
+ "fadd v8.4s, v8.4s, v19.4s\n"
+ "fadd v7.4s, v7.4s, v18.4s\n"
+ "fadd v6.4s, v6.4s, v17.4s\n"
+ "fadd v5.4s, v5.4s, v16.4s\n"
"4:" // 4-vectors of channels: After loop
- "ands x20, %x[n_valid_cells], #0x3\n"
+ "ands x21, %x[n_valid_cells], #0x3\n"
"beq 6f\n"
"5:" // 4-vectors of channels: Single input loop
- "ldr x23, [x19], #0x8\n"
- "subs x20, x20, #0x1\n"
- "ldr q3, [x23, x28]\n"
- "fadd v7.4s, v7.4s, v3.4s\n"
- "ldr q31, [x23, x27]\n"
- "ldr q27, [x23, x26]\n"
- "fadd v6.4s, v6.4s, v31.4s\n"
- "ldr q25, [x23, x25]\n"
- "fadd v5.4s, v5.4s, v27.4s\n"
- "fadd v4.4s, v4.4s, v25.4s\n"
+ "ldr x20, [x22], #0x8\n"
+ "ldr q16, [x20, x27]\n"
+ "subs x21, x21, #0x1\n"
+ "fadd v8.4s, v8.4s, v16.4s\n"
+ "ldr q17, [x20, x26]\n"
+ "ldr q16, [x20, x24]\n"
+ "fadd v7.4s, v7.4s, v17.4s\n"
+ "fadd v6.4s, v6.4s, v16.4s\n"
+ "ldr q16, [x20, x23]\n"
+ "fadd v5.4s, v5.4s, v16.4s\n"
"bgt 5b\n"
"6:" // 4-vectors of channels: Single input loop: End
- "fmul v7.4s, v7.4s, v8.4s\n"
- "str q7, [%x[outptr], x28]\n"
- "fmul v6.4s, v6.4s, v8.4s\n"
- "add x28, x28, #0x40\n"
- "fmul v5.4s, v5.4s, v8.4s\n"
- "str q6, [%x[outptr], x27]\n"
- "fmul v4.4s, v4.4s, v8.4s\n"
- "add x27, x27, #0x40\n"
- "str q5, [%x[outptr], x26]\n"
- "add x26, x26, #0x40\n"
"sub %x[n_channels], %x[n_channels], #0x10\n"
- "str q4, [%x[outptr], x25]\n"
- "add x25, x25, #0x40\n"
"cmp %x[n_channels], #0x10\n"
+ "fmul v8.4s, v8.4s, v9.4s\n"
+ "fmul v7.4s, v7.4s, v9.4s\n"
+ "fmul v6.4s, v6.4s, v9.4s\n"
+ "fmul v5.4s, v5.4s, v9.4s\n"
+ "str q8, [%x[outptr], x27]\n"
+ "add x27, x27, #0x40\n"
+ "str q7, [%x[outptr], x26]\n"
+ "add x26, x26, #0x40\n"
+ "str q6, [%x[outptr], x24]\n"
+ "add x24, x24, #0x40\n"
+ "str q5, [%x[outptr], x23]\n"
+ "add x23, x23, #0x40\n"
"bge 1b\n"
"cbz %x[n_channels], 25f\n"
"7:" // Single vector of channels
"cmp %x[n_channels], #0x4\n"
"blt 14f\n"
"8:" // Single vector of channels: Loop
- "movi v7.16b, #0x0\n"
- "mov x19, %x[inptrs]\n"
- "lsr x24, %x[n_valid_cells], #0x2\n"
- "cbz x24, 11f\n"
- "ldp x23, x22, [x19, #0x0]\n"
- "ldp x21, x20, [x19, #0x10]\n"
- "add x19, x19, #0x20\n"
- "subs x24, x24, #0x1\n"
- "ldr q3, [x23, x28]\n"
- "ldr q2, [x22, x28]\n"
- "ldr q1, [x21, x28]\n"
- "ldr q0, [x20, x28]\n"
+ "lsr x25, %x[n_valid_cells], #0x2\n"
+ "movi v8.16b, #0x0\n"
+ "mov x22, %x[inptrs]\n"
+ "cbz x25, 11f\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q4, [x21, x27]\n"
+ "subs x25, x25, #0x1\n"
+ "ldr q3, [x20, x27]\n"
+ "ldp x21, x20, [x22, #0x10]\n"
+ "add x22, x22, #0x20\n"
+ "ldr q28, [x21, x27]\n"
+ "ldr q22, [x20, x27]\n"
"beq 10f\n"
"9:" // Single vector of channels: Loop: 4 inputs loop
- "fadd v23.4s, v3.4s, v2.4s\n"
- "ldp x23, x22, [x19, #0x0]\n"
- "subs x24, x24, #0x1\n"
- "fadd v19.4s, v1.4s, v0.4s\n"
- "ldp x21, x20, [x19, #0x10]\n"
- "add x19, x19, #0x20\n"
- "fadd v19.4s, v23.4s, v19.4s\n"
- "ldr q3, [x23, x28]\n"
- "ldr q2, [x22, x28]\n"
- "fadd v7.4s, v7.4s, v19.4s\n"
- "ldr q1, [x21, x28]\n"
- "ldr q0, [x20, x28]\n"
+ "fadd v17.4s, v4.4s, v3.4s\n"
+ "fadd v16.4s, v28.4s, v22.4s\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q4, [x21, x27]\n"
+ "ldr q3, [x20, x27]\n"
+ "fadd v16.4s, v17.4s, v16.4s\n"
+ "ldp x21, x20, [x22, #0x10]\n"
+ "subs x25, x25, #0x1\n"
+ "ldr q28, [x21, x27]\n"
+ "ldr q22, [x20, x27]\n"
+ "fadd v8.4s, v8.4s, v16.4s\n"
+ "add x22, x22, #0x20\n"
"bgt 9b\n"
"10:" // Single vector of channels: Loop: 4 inputs tail
- "fadd v23.4s, v3.4s, v2.4s\n"
- "fadd v19.4s, v1.4s, v0.4s\n"
- "fadd v19.4s, v23.4s, v19.4s\n"
- "fadd v7.4s, v7.4s, v19.4s\n"
+ "fadd v17.4s, v4.4s, v3.4s\n"
+ "fadd v16.4s, v28.4s, v22.4s\n"
+ "fadd v16.4s, v17.4s, v16.4s\n"
+ "fadd v8.4s, v8.4s, v16.4s\n"
"11:" // Single vector of channels: Loop: After loop
- "ands x20, %x[n_valid_cells], #0x3\n"
+ "ands x21, %x[n_valid_cells], #0x3\n"
"beq 13f\n"
"12:" // Single vector of channels: Loop: Single input loop
- "ldr x23, [x19], #0x8\n"
- "subs x20, x20, #0x1\n"
- "ldr q3, [x23, x28]\n"
- "fadd v7.4s, v7.4s, v3.4s\n"
+ "ldr x20, [x22], #0x8\n"
+ "ldr q16, [x20, x27]\n"
+ "subs x21, x21, #0x1\n"
+ "fadd v8.4s, v8.4s, v16.4s\n"
"bgt 12b\n"
"13:" // Single vector of channels: Loop: Single input loop: End
- "fmul v7.4s, v7.4s, v8.4s\n"
- "str q7, [%x[outptr], x28]\n"
- "add x28, x28, #0x10\n"
"sub %x[n_channels], %x[n_channels], #0x4\n"
"cmp %x[n_channels], #0x4\n"
+ "fmul v8.4s, v8.4s, v9.4s\n"
+ "str q8, [%x[outptr], x27]\n"
+ "add x27, x27, #0x10\n"
"bge 8b\n"
"cbz %x[n_channels], 25f\n"
"14:" // Oddments
- "movi v7.16b, #0x0\n"
- "add %x[outptr], %x[outptr], x28\n"
- "mov x19, %x[inptrs]\n"
- "lsr x24, %x[n_valid_cells], #0x2\n"
- "cbz x24, 18f\n"
+ "lsr x25, %x[n_valid_cells], #0x2\n"
+ "add %x[outptr], %x[outptr], x27\n"
+ "movi v8.16b, #0x0\n"
+ "mov x24, %x[inptrs]\n"
+ "cbz x25, 18f\n"
"15:" // Oddments: 4 inputs loop
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x24, x24, #0x20\n"
+ "add x23, x23, x27\n"
+ "add x22, x22, x27\n"
+ "add x21, x21, x27\n"
+ "movi v4.16b, #0x0\n"
"movi v3.16b, #0x0\n"
- "ldp x23, x22, [x19, #0x0]\n"
- "add x23, x23, x28\n"
- "movi v2.16b, #0x0\n"
- "ldp x21, x20, [x19, #0x10]\n"
- "movi v1.16b, #0x0\n"
- "add x19, x19, #0x20\n"
- "movi v0.16b, #0x0\n"
- "add x22, x22, x28\n"
- "add x21, x21, x28\n"
- "add x20, x20, x28\n"
+ "add x20, x20, x27\n"
+ "movi v28.16b, #0x0\n"
+ "movi v22.16b, #0x0\n"
"tbz %x[n_channels], #1, 16f\n"
- "ldr d3, [x23], #0x8\n"
- "ldr d2, [x22], #0x8\n"
- "ldr d1, [x21], #0x8\n"
- "ldr d0, [x20], #0x8\n"
+ "ldr d4, [x23], #0x8\n"
+ "ldr d3, [x22], #0x8\n"
+ "ldr d28, [x21], #0x8\n"
+ "ldr d22, [x20], #0x8\n"
"tbz %x[n_channels], #0, 17f\n"
- "ld1 { v3.s }[2], [x23], #0x4\n"
- "ld1 { v2.s }[2], [x22], #0x4\n"
- "ld1 { v1.s }[2], [x21], #0x4\n"
- "ld1 { v0.s }[2], [x20], #0x4\n"
+ "ld1 { v4.s }[2], [x23], #0x4\n"
+ "ld1 { v3.s }[2], [x22], #0x4\n"
+ "ld1 { v28.s }[2], [x21], #0x4\n"
+ "ld1 { v22.s }[2], [x20], #0x4\n"
"b 17f\n"
"16:" // Oddments: 4 inputs loop: Load: Bit 1: Unset
"tbz %x[n_channels], #0, 17f\n"
- "ldr s3, [x23], #0x4\n"
- "ldr s2, [x22], #0x4\n"
- "ldr s1, [x21], #0x4\n"
- "ldr s0, [x20], #0x4\n"
+ "ldr s4, [x23], #0x4\n"
+ "ldr s3, [x22], #0x4\n"
+ "ldr s28, [x21], #0x4\n"
+ "ldr s22, [x20], #0x4\n"
"17:" // Oddments: 4 inputs loop: Load: Bit 1: End
- "fadd v23.4s, v3.4s, v2.4s\n"
- "subs x24, x24, #0x1\n"
- "fadd v19.4s, v1.4s, v0.4s\n"
- "fadd v19.4s, v23.4s, v19.4s\n"
- "fadd v7.4s, v7.4s, v19.4s\n"
+ "fadd v17.4s, v4.4s, v3.4s\n"
+ "fadd v16.4s, v28.4s, v22.4s\n"
+ "subs x25, x25, #0x1\n"
+ "fadd v16.4s, v17.4s, v16.4s\n"
+ "fadd v8.4s, v8.4s, v16.4s\n"
"bgt 15b\n"
"18:" // Oddments: After loop
- "ands x20, %x[n_valid_cells], #0x3\n"
+ "ands x21, %x[n_valid_cells], #0x3\n"
"beq 22f\n"
"19:" // Oddments: Single input loop
- "movi v3.16b, #0x0\n"
- "ldr x23, [x19], #0x8\n"
- "add x23, x23, x28\n"
+ "ldr x23, [x24], #0x8\n"
+ "add x23, x23, x27\n"
+ "movi v4.16b, #0x0\n"
"tbz %x[n_channels], #1, 20f\n"
- "ldr d3, [x23], #0x8\n"
+ "ldr d4, [x23], #0x8\n"
"tbz %x[n_channels], #0, 21f\n"
- "ld1 { v3.s }[2], [x23], #0x4\n"
+ "ld1 { v4.s }[2], [x23], #0x4\n"
"b 21f\n"
"20:" // Oddments: Single input loop: Load: Bit 1: Unset
"tbz %x[n_channels], #0, 21f\n"
- "ldr s3, [x23], #0x4\n"
+ "ldr s4, [x23], #0x4\n"
"21:" // Oddments: Single input loop: Load: Bit 1: End
- "fadd v7.4s, v7.4s, v3.4s\n"
- "subs x20, x20, #0x1\n"
+ "subs x21, x21, #0x1\n"
+ "fadd v8.4s, v8.4s, v4.4s\n"
"bgt 19b\n"
"22:" // Oddments: Single input loop: End
- "fmul v7.4s, v7.4s, v8.4s\n"
+ "fmul v8.4s, v8.4s, v9.4s\n"
"tbz %x[n_channels], #1, 23f\n"
- "st1 { v7.d }[0], [%x[outptr]], #0x8\n"
+ "st1 { v8.d }[0], [%x[outptr]], #0x8\n"
"tbz %x[n_channels], #0, 24f\n"
- "st1 { v7.s }[2], [%x[outptr]], #0x4\n"
+ "st1 { v8.s }[2], [%x[outptr]], #0x4\n"
"b 24f\n"
"23:" // Oddments: Store: Bit 1: Unset
"tbz %x[n_channels], #0, 24f\n"
- "st1 { v7.s }[0], [%x[outptr]], #0x4\n"
+ "st1 { v8.s }[0], [%x[outptr]], #0x4\n"
"24:" // Oddments: Store: Bit 1: End
-
"25:" // End
-
: [n_channels] "+&r" (n_channels), [outptr] "+&r" (outptr)
: [inptrs] "r" (inptrs), [n_valid_cells] "r" (n_valid_cells), [rescale_ptr] "r" (&rescale_value)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_max_2x2_s1_output2x2_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_max_2x2_s1_output2x2_depthfirst.hpp
index 9a22adf6f4..2f72b59d70 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_max_2x2_s1_output2x2_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_max_2x2_s1_output2x2_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,33 +24,28 @@
#pragma once
+#if defined(__aarch64__)
+
namespace arm_conv {
namespace pooling {
void a64_fp32_nhwc_max_2x2_s1_output2x2_depthfirst_impl(unsigned int, const float *const *const, float *const *const, bool, unsigned int, unsigned int, unsigned int, unsigned int);
-struct a64_fp32_nhwc_max_2x2_s1_output2x2_depthfirst
+struct a64_fp32_nhwc_max_2x2_s1_output2x2_depthfirst : public DepthfirstStrategy<float, float>
{
- typedef float operand_type;
- typedef float return_type;
-
- typedef void (*kern_type)(unsigned int, const float *const *const, float *const *const, bool, unsigned int, unsigned int, unsigned int, unsigned int);
-
- constexpr static PoolingType pooling_type(void) { return PoolingType::MAX; }
+ using Parent = DepthfirstStrategy<float, float>;
- constexpr static unsigned int pool_rows(void) { return 2; }
- constexpr static unsigned int pool_cols(void) { return 2; }
+ const static auto pooling_type = PoolingType::MAX;
+ const static auto pool_rows = 2u, pool_cols = 2u;
+ const static auto stride_rows = 1u, stride_cols = 1u;
- constexpr static unsigned int stride_rows(void) { return 1; }
- constexpr static unsigned int stride_cols(void) { return 1; }
+ a64_fp32_nhwc_max_2x2_s1_output2x2_depthfirst(const CPUInfo *)
+ : Parent(pool_rows, pool_cols, stride_rows, stride_cols, 2, 2) {}
- constexpr static unsigned int out_rows(void) { return 2; }
- constexpr static unsigned int out_cols(void) { return 2; }
-
- kern_type kernel = a64_fp32_nhwc_max_2x2_s1_output2x2_depthfirst_impl;
-
- a64_fp32_nhwc_max_2x2_s1_output2x2_depthfirst(const CPUInfo *) {}
+ Parent::KernelType get_kernel(void) const { return a64_fp32_nhwc_max_2x2_s1_output2x2_depthfirst_impl; }
};
} // namespace pooling
} // namespace arm_conv
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp
index ea7e2195d1..f4202de1ed 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -26,6 +26,8 @@
#include <cstddef>
#include <cstdint>
+#if defined(__aarch64__)
+
namespace arm_conv {
namespace pooling {
@@ -61,114 +63,115 @@ void a64_fp32_nhwc_max_2x2_s1_output2x2_depthfirst_impl(
pad_left, pad_top, pad_right, pad_bottom);
__asm__ __volatile__(
- "ldr x15, [%x[args], %[offsetof_n_channels]]\n"
- "mov x14, #0x0\n"
- "ldr x20, [%x[args], %[offsetof_outptrs]]\n"
- "mov x13, #0x0\n"
- "ldr x19, [%x[args], %[offsetof_inptrs]]\n"
- "cmp x15, #0x4\n"
- "ldp x12, x11, [x20, #0x0]\n"
- "ldp x10, x9, [x20, #0x10]\n"
- "ldp x28, x27, [x19, #0x0]\n"
- "ldp x26, x25, [x19, #0x10]\n"
- "ldp x24, x23, [x19, #0x20]\n"
- "ldp x22, x21, [x19, #0x30]\n"
- "ldr x20, [x19, #0x40]\n"
+ "ldr x16, [%x[args], %[offsetof_n_channels]]\n"
+ "ldr x21, [%x[args], %[offsetof_outptrs]]\n"
+ "cmp x16, #0x4\n"
+ "mov x15, #0x0\n"
+ "ldr x20, [%x[args], %[offsetof_inptrs]]\n"
+ "ldp x14, x13, [x21, #0x0]\n"
+ "mov x12, #0x0\n"
+ "ldp x11, x10, [x21, #0x10]\n"
+ "ldp x9, x28, [x20, #0x0]\n"
+ "ldp x27, x26, [x20, #0x10]\n"
+ "ldp x25, x24, [x20, #0x20]\n"
+ "ldp x23, x22, [x20, #0x30]\n"
+ "ldr x21, [x20, #0x40]\n"
"blt 3f\n"
- "ldr q30, [x27, x14]\n"
- "lsr x19, x15, #0x2\n"
- "ldr q29, [x24, x14]\n"
- "sub x15, x15, x19, LSL #2\n"
- "ldr q28, [x21, x14]\n"
- "subs x19, x19, #0x1\n"
- "ldr q27, [x25, x14]\n"
- "ldr q26, [x28, x14]\n"
- "ldr q25, [x23, x14]\n"
- "ldr q24, [x26, x14]\n"
- "ldr q23, [x22, x14]\n"
- "ldr q22, [x20, x14]\n"
- "add x14, x14, #0x10\n"
+ "ldr q30, [x28, x15]\n"
+ "ldr q29, [x25, x15]\n"
+ "lsr x20, x16, #0x2\n"
+ "sub x16, x16, x20, LSL #2\n"
+ "ldr q28, [x22, x15]\n"
+ "ldr q27, [x26, x15]\n"
+ "subs x20, x20, #0x1\n"
+ "ldr q26, [x9, x15]\n"
+ "ldr q25, [x27, x15]\n"
+ "ldr q24, [x24, x15]\n"
+ "ldr q23, [x23, x15]\n"
+ "ldr q22, [x21, x15]\n"
+ "add x15, x15, #0x10\n"
"beq 2f\n"
"1:" // Vector: Loop
"fmax v21.4s, v30.4s, v29.4s\n"
- "ldr q30, [x27, x14]\n"
- "subs x19, x19, #0x1\n"
+ "ldr q30, [x28, x15]\n"
"fmax v20.4s, v29.4s, v28.4s\n"
- "ldr q29, [x24, x14]\n"
+ "ldr q29, [x25, x15]\n"
+ "ldr q28, [x22, x15]\n"
"fmax v19.4s, v27.4s, v26.4s\n"
- "ldr q28, [x21, x14]\n"
+ "ldr q26, [x9, x15]\n"
"fmax v18.4s, v25.4s, v24.4s\n"
- "ldr q26, [x28, x14]\n"
- "fmax v17.4s, v23.4s, v27.4s\n"
- "ldr q27, [x25, x14]\n"
- "fmax v16.4s, v25.4s, v22.4s\n"
- "ldr q25, [x23, x14]\n"
+ "ldr q25, [x27, x15]\n"
+ "fmax v17.4s, v27.4s, v23.4s\n"
+ "ldr q27, [x26, x15]\n"
+ "fmax v16.4s, v24.4s, v22.4s\n"
+ "ldr q24, [x24, x15]\n"
+ "ldr q23, [x23, x15]\n"
+ "subs x20, x20, #0x1\n"
"fmax v19.4s, v21.4s, v19.4s\n"
- "ldr q24, [x26, x14]\n"
- "fmax v18.4s, v21.4s, v18.4s\n"
- "ldr q23, [x22, x14]\n"
- "fmax v17.4s, v20.4s, v17.4s\n"
- "ldr q22, [x20, x14]\n"
- "add x14, x14, #0x10\n"
+ "ldr q22, [x21, x15]\n"
+ "fmax v18.4s, v18.4s, v21.4s\n"
+ "fmax v17.4s, v17.4s, v20.4s\n"
+ "add x15, x15, #0x10\n"
"fmax v16.4s, v20.4s, v16.4s\n"
- "str q19, [x12, x13]\n"
- "str q18, [x11, x13]\n"
- "str q17, [x10, x13]\n"
- "str q16, [x9, x13]\n"
- "add x13, x13, #0x10\n"
+ "str q19, [x14, x12]\n"
+ "str q18, [x13, x12]\n"
+ "str q17, [x11, x12]\n"
+ "str q16, [x10, x12]\n"
+ "add x12, x12, #0x10\n"
"bgt 1b\n"
"2:" // Vector: Tail
"fmax v21.4s, v30.4s, v29.4s\n"
"fmax v20.4s, v29.4s, v28.4s\n"
- "fmax v19.4s, v27.4s, v26.4s\n"
+ "fmax v16.4s, v27.4s, v26.4s\n"
"fmax v18.4s, v25.4s, v24.4s\n"
- "fmax v17.4s, v23.4s, v27.4s\n"
- "fmax v16.4s, v25.4s, v22.4s\n"
- "fmax v19.4s, v21.4s, v19.4s\n"
- "str q19, [x12, x13]\n"
- "fmax v18.4s, v21.4s, v18.4s\n"
- "fmax v17.4s, v20.4s, v17.4s\n"
- "str q18, [x11, x13]\n"
- "fmax v16.4s, v20.4s, v16.4s\n"
- "str q17, [x10, x13]\n"
- "str q16, [x9, x13]\n"
- "add x13, x13, #0x10\n"
- "cbz x15, 4f\n"
+ "fmax v17.4s, v27.4s, v23.4s\n"
+ "fmax v19.4s, v24.4s, v22.4s\n"
+ "fmax v16.4s, v21.4s, v16.4s\n"
+ "fmax v18.4s, v18.4s, v21.4s\n"
+ "str q16, [x14, x12]\n"
+ "fmax v17.4s, v17.4s, v20.4s\n"
+ "fmax v16.4s, v20.4s, v19.4s\n"
+ "str q18, [x13, x12]\n"
+ "str q17, [x11, x12]\n"
+ "str q16, [x10, x12]\n"
+ "add x12, x12, #0x10\n"
+ "cbz x16, 4f\n"
"3:" // Oddments
- "ldr s30, [x27, x14]\n"
- "subs x15, x15, #0x1\n"
- "ldr s29, [x24, x14]\n"
- "fmax v21.4s, v30.4s, v29.4s\n"
- "ldr s28, [x21, x14]\n"
- "ldr s27, [x25, x14]\n"
- "fmax v20.4s, v29.4s, v28.4s\n"
- "ldr s26, [x28, x14]\n"
- "ldr s25, [x23, x14]\n"
- "fmax v19.4s, v27.4s, v26.4s\n"
- "ldr s24, [x26, x14]\n"
- "ldr s23, [x22, x14]\n"
- "fmax v19.4s, v21.4s, v19.4s\n"
- "ldr s22, [x20, x14]\n"
- "add x14, x14, #0x4\n"
- "fmax v18.4s, v25.4s, v24.4s\n"
- "str s19, [x12, x13]\n"
- "fmax v17.4s, v23.4s, v27.4s\n"
- "fmax v16.4s, v25.4s, v22.4s\n"
- "fmax v18.4s, v21.4s, v18.4s\n"
- "str s18, [x11, x13]\n"
- "fmax v17.4s, v20.4s, v17.4s\n"
- "fmax v16.4s, v20.4s, v16.4s\n"
- "str s17, [x10, x13]\n"
- "str s16, [x9, x13]\n"
- "add x13, x13, #0x4\n"
+ "ldr s16, [x28, x15]\n"
+ "ldr s17, [x25, x15]\n"
+ "fmax v23.4s, v16.4s, v17.4s\n"
+ "subs x16, x16, #0x1\n"
+ "ldr s16, [x22, x15]\n"
+ "ldr s22, [x26, x15]\n"
+ "fmax v21.4s, v17.4s, v16.4s\n"
+ "ldr s16, [x9, x15]\n"
+ "ldr s17, [x27, x15]\n"
+ "fmax v16.4s, v22.4s, v16.4s\n"
+ "fmax v20.4s, v23.4s, v16.4s\n"
+ "ldr s19, [x24, x15]\n"
+ "ldr s16, [x23, x15]\n"
+ "fmax v18.4s, v17.4s, v19.4s\n"
+ "fmax v17.4s, v22.4s, v16.4s\n"
+ "ldr s16, [x21, x15]\n"
+ "fmax v16.4s, v19.4s, v16.4s\n"
+ "add x15, x15, #0x4\n"
+ "fmax v18.4s, v18.4s, v23.4s\n"
+ "fmax v17.4s, v17.4s, v21.4s\n"
+ "fmax v16.4s, v21.4s, v16.4s\n"
+ "str s20, [x14, x12]\n"
+ "str s18, [x13, x12]\n"
+ "str s17, [x11, x12]\n"
+ "str s16, [x10, x12]\n"
+ "add x12, x12, #0x4\n"
"bgt 3b\n"
"4:" // End
-
:
: [args] "r" (&args), [offsetof_inptrs] "I" (offsetof(KernelArgs, inptrs)), [offsetof_n_channels] "I" (offsetof(KernelArgs, n_channels)), [offsetof_outptrs] "I" (offsetof(KernelArgs, outptrs))
- : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
} // namespace pooling
} // namespace arm_conv
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_max_generic_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_max_generic_depthfirst.hpp
index b20ffc20cf..7577b31d7d 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_max_generic_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_max_generic_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -33,19 +33,11 @@ namespace pooling {
void a64_fp32_nhwc_max_generic_depthfirst_impl(const uint64_t, const uint64_t n_valid_cells, uint64_t n_channels, const float *const *const inptrs, float *outptr);
-struct a64_fp32_nhwc_max_generic_depthfirst
+struct a64_fp32_nhwc_max_generic_depthfirst : IGenericDepthfirstStrategy<float, float>
{
- typedef float operand_type;
- typedef float return_type;
-
- typedef void (*kern_type)(const uint64_t, const uint64_t n_valid_cells, uint64_t n_channels, const float *const *const inptrs, float *outptr);
-
- constexpr static PoolingType pooling_type(void) { return PoolingType::MAX; }
-
-
- kern_type kernel = a64_fp32_nhwc_max_generic_depthfirst_impl;
-
+ using Parent = IGenericDepthfirstStrategy<float, float>;
a64_fp32_nhwc_max_generic_depthfirst(const CPUInfo *) {}
+ typename Parent::KernelType get_kernel(void) const override { return a64_fp32_nhwc_max_generic_depthfirst_impl; }
};
} // namespace pooling
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_max_generic_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_max_generic_depthfirst/generic.cpp
index e0acb7ac02..f4706635dc 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_max_generic_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_max_generic_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,6 +23,7 @@
*/
#include <cstdint>
+#include <cstddef>
#if defined(__aarch64__)
@@ -39,256 +40,254 @@ void a64_fp32_nhwc_max_generic_depthfirst_impl(
)
{
__asm__ __volatile__(
- "mov x28, #0x0\n"
- "mov x27, #0x10\n" // cntb _, ALL, #1
- "mov x26, #0x20\n" // cntb _, ALL, #2
- "mov x25, #0x30\n" // cntb _, ALL, #3
"cmp %x[n_channels], #0x10\n"
+ "mov x27, #0x0\n"
+ "mov x26, #0x10\n" // cntb _, ALL, #1
+ "mov x24, #0x20\n" // cntb _, ALL, #2
+ "mov x23, #0x30\n" // cntb _, ALL, #3
"blt 7f\n"
"1:" // 4-vectors of channels
"mov w20, #0xff800000\n"
+ "lsr x25, %x[n_valid_cells], #0x2\n"
+ "dup v8.4s, w20\n"
"dup v7.4s, w20\n"
- "mov x19, %x[inptrs]\n"
"dup v6.4s, w20\n"
- "lsr x24, %x[n_valid_cells], #0x2\n"
"dup v5.4s, w20\n"
- "dup v4.4s, w20\n"
- "cbz x24, 4f\n"
- "ldp x23, x22, [x19, #0x0]\n"
- "ldp x21, x20, [x19, #0x10]\n"
- "add x19, x19, #0x20\n"
- "subs x24, x24, #0x1\n"
- "ldr q3, [x23, x28]\n"
- "ldr q2, [x22, x28]\n"
- "ldr q1, [x21, x28]\n"
- "ldr q0, [x20, x28]\n"
- "ldr q31, [x23, x27]\n"
- "ldr q30, [x22, x27]\n"
- "ldr q29, [x21, x27]\n"
- "ldr q28, [x20, x27]\n"
- "ldr q27, [x23, x26]\n"
- "ldr q21, [x22, x26]\n"
- "ldr q26, [x21, x26]\n"
- "ldr q17, [x20, x26]\n"
- "ldr q25, [x23, x25]\n"
- "ldr q20, [x22, x25]\n"
- "ldr q24, [x21, x25]\n"
- "ldr q16, [x20, x25]\n"
+ "mov x22, %x[inptrs]\n"
+ "cbz x25, 4f\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q4, [x21, x27]\n"
+ "subs x25, x25, #0x1\n"
+ "ldr q3, [x20, x27]\n"
+ "ldr q2, [x21, x26]\n"
+ "ldr q1, [x20, x26]\n"
+ "ldr q0, [x21, x24]\n"
+ "ldr q31, [x20, x24]\n"
+ "ldr q30, [x21, x23]\n"
+ "ldr q29, [x20, x23]\n"
+ "ldp x21, x20, [x22, #0x10]\n"
+ "add x22, x22, #0x20\n"
+ "ldr q28, [x21, x27]\n"
+ "ldr q22, [x20, x27]\n"
+ "ldr q27, [x21, x26]\n"
+ "ldr q21, [x20, x26]\n"
+ "ldr q26, [x21, x24]\n"
+ "ldr q20, [x20, x24]\n"
+ "ldr q25, [x21, x23]\n"
+ "ldr q24, [x20, x23]\n"
"beq 3f\n"
"2:" // 4-vectors of channels: 4 inputs loop
- "fmax v23.4s, v3.4s, v2.4s\n"
- "ldp x23, x22, [x19, #0x0]\n"
- "subs x24, x24, #0x1\n"
- "fmax v19.4s, v1.4s, v0.4s\n"
- "ldp x21, x20, [x19, #0x10]\n"
- "add x19, x19, #0x20\n"
- "fmax v22.4s, v31.4s, v30.4s\n"
- "ldr q3, [x23, x28]\n"
- "fmax v18.4s, v29.4s, v28.4s\n"
- "fmax v21.4s, v27.4s, v21.4s\n"
- "ldr q2, [x22, x28]\n"
- "fmax v17.4s, v26.4s, v17.4s\n"
- "ldr q1, [x21, x28]\n"
- "fmax v20.4s, v25.4s, v20.4s\n"
- "ldr q0, [x20, x28]\n"
- "fmax v16.4s, v24.4s, v16.4s\n"
- "ldr q31, [x23, x27]\n"
+ "fmax v23.4s, v4.4s, v3.4s\n"
+ "fmax v19.4s, v28.4s, v22.4s\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q4, [x21, x27]\n"
+ "ldr q3, [x20, x27]\n"
+ "fmax v22.4s, v2.4s, v1.4s\n"
+ "ldr q2, [x21, x26]\n"
+ "fmax v18.4s, v27.4s, v21.4s\n"
+ "ldr q1, [x20, x26]\n"
+ "fmax v21.4s, v0.4s, v31.4s\n"
+ "ldr q0, [x21, x24]\n"
+ "fmax v17.4s, v26.4s, v20.4s\n"
+ "ldr q31, [x20, x24]\n"
+ "fmax v20.4s, v30.4s, v29.4s\n"
+ "ldr q30, [x21, x23]\n"
+ "fmax v16.4s, v25.4s, v24.4s\n"
+ "ldr q29, [x20, x23]\n"
"fmax v19.4s, v23.4s, v19.4s\n"
- "ldr q30, [x22, x27]\n"
"fmax v18.4s, v22.4s, v18.4s\n"
- "ldr q29, [x21, x27]\n"
+ "ldp x21, x20, [x22, #0x10]\n"
+ "ldr q28, [x21, x27]\n"
+ "ldr q22, [x20, x27]\n"
"fmax v17.4s, v21.4s, v17.4s\n"
- "ldr q28, [x20, x27]\n"
"fmax v16.4s, v20.4s, v16.4s\n"
- "ldr q27, [x23, x26]\n"
- "fmax v7.4s, v7.4s, v19.4s\n"
- "ldr q21, [x22, x26]\n"
- "fmax v6.4s, v6.4s, v18.4s\n"
- "ldr q26, [x21, x26]\n"
- "fmax v5.4s, v5.4s, v17.4s\n"
- "ldr q17, [x20, x26]\n"
- "fmax v4.4s, v4.4s, v16.4s\n"
- "ldr q25, [x23, x25]\n"
- "ldr q20, [x22, x25]\n"
- "ldr q24, [x21, x25]\n"
- "ldr q16, [x20, x25]\n"
+ "ldr q27, [x21, x26]\n"
+ "ldr q21, [x20, x26]\n"
+ "subs x25, x25, #0x1\n"
+ "fmax v8.4s, v8.4s, v19.4s\n"
+ "ldr q26, [x21, x24]\n"
+ "ldr q20, [x20, x24]\n"
+ "fmax v7.4s, v7.4s, v18.4s\n"
+ "fmax v6.4s, v6.4s, v17.4s\n"
+ "ldr q25, [x21, x23]\n"
+ "ldr q24, [x20, x23]\n"
+ "fmax v5.4s, v5.4s, v16.4s\n"
+ "add x22, x22, #0x20\n"
"bgt 2b\n"
"3:" // 4-vectors of channels: 4 inputs tail
- "fmax v23.4s, v3.4s, v2.4s\n"
- "fmax v19.4s, v1.4s, v0.4s\n"
- "fmax v22.4s, v31.4s, v30.4s\n"
- "fmax v18.4s, v29.4s, v28.4s\n"
- "fmax v21.4s, v27.4s, v21.4s\n"
- "fmax v17.4s, v26.4s, v17.4s\n"
- "fmax v20.4s, v25.4s, v20.4s\n"
- "fmax v16.4s, v24.4s, v16.4s\n"
+ "fmax v23.4s, v4.4s, v3.4s\n"
+ "fmax v19.4s, v28.4s, v22.4s\n"
+ "fmax v22.4s, v2.4s, v1.4s\n"
+ "fmax v18.4s, v27.4s, v21.4s\n"
+ "fmax v21.4s, v0.4s, v31.4s\n"
+ "fmax v17.4s, v26.4s, v20.4s\n"
+ "fmax v20.4s, v30.4s, v29.4s\n"
+ "fmax v16.4s, v25.4s, v24.4s\n"
"fmax v19.4s, v23.4s, v19.4s\n"
"fmax v18.4s, v22.4s, v18.4s\n"
"fmax v17.4s, v21.4s, v17.4s\n"
"fmax v16.4s, v20.4s, v16.4s\n"
- "fmax v7.4s, v7.4s, v19.4s\n"
- "fmax v6.4s, v6.4s, v18.4s\n"
- "fmax v5.4s, v5.4s, v17.4s\n"
- "fmax v4.4s, v4.4s, v16.4s\n"
+ "fmax v8.4s, v8.4s, v19.4s\n"
+ "fmax v7.4s, v7.4s, v18.4s\n"
+ "fmax v6.4s, v6.4s, v17.4s\n"
+ "fmax v5.4s, v5.4s, v16.4s\n"
"4:" // 4-vectors of channels: After loop
- "ands x20, %x[n_valid_cells], #0x3\n"
+ "ands x21, %x[n_valid_cells], #0x3\n"
"beq 6f\n"
"5:" // 4-vectors of channels: Single input loop
- "ldr x23, [x19], #0x8\n"
- "subs x20, x20, #0x1\n"
- "ldr q3, [x23, x28]\n"
- "fmax v7.4s, v7.4s, v3.4s\n"
- "ldr q31, [x23, x27]\n"
- "ldr q27, [x23, x26]\n"
- "fmax v6.4s, v6.4s, v31.4s\n"
- "ldr q25, [x23, x25]\n"
- "fmax v5.4s, v5.4s, v27.4s\n"
- "fmax v4.4s, v4.4s, v25.4s\n"
+ "ldr x20, [x22], #0x8\n"
+ "ldr q16, [x20, x27]\n"
+ "subs x21, x21, #0x1\n"
+ "fmax v8.4s, v8.4s, v16.4s\n"
+ "ldr q17, [x20, x26]\n"
+ "ldr q16, [x20, x24]\n"
+ "fmax v7.4s, v7.4s, v17.4s\n"
+ "fmax v6.4s, v6.4s, v16.4s\n"
+ "ldr q16, [x20, x23]\n"
+ "fmax v5.4s, v5.4s, v16.4s\n"
"bgt 5b\n"
"6:" // 4-vectors of channels: Single input loop: End
- "str q7, [%x[outptr], x28]\n"
- "add x28, x28, #0x40\n"
- "str q6, [%x[outptr], x27]\n"
- "add x27, x27, #0x40\n"
- "str q5, [%x[outptr], x26]\n"
- "add x26, x26, #0x40\n"
- "str q4, [%x[outptr], x25]\n"
- "add x25, x25, #0x40\n"
"sub %x[n_channels], %x[n_channels], #0x10\n"
"cmp %x[n_channels], #0x10\n"
+ "str q8, [%x[outptr], x27]\n"
+ "str q7, [%x[outptr], x26]\n"
+ "add x27, x27, #0x40\n"
+ "add x26, x26, #0x40\n"
+ "str q6, [%x[outptr], x24]\n"
+ "add x24, x24, #0x40\n"
+ "str q5, [%x[outptr], x23]\n"
+ "add x23, x23, #0x40\n"
"bge 1b\n"
"cbz %x[n_channels], 25f\n"
"7:" // Single vector of channels
"cmp %x[n_channels], #0x4\n"
"blt 14f\n"
"8:" // Single vector of channels: Loop
- "mov w19, #0xff800000\n"
- "dup v7.4s, w19\n"
- "mov x19, %x[inptrs]\n"
- "lsr x24, %x[n_valid_cells], #0x2\n"
- "cbz x24, 11f\n"
- "ldp x23, x22, [x19, #0x0]\n"
- "ldp x21, x20, [x19, #0x10]\n"
- "add x19, x19, #0x20\n"
- "subs x24, x24, #0x1\n"
- "ldr q3, [x23, x28]\n"
- "ldr q2, [x22, x28]\n"
- "ldr q1, [x21, x28]\n"
- "ldr q0, [x20, x28]\n"
+ "mov w20, #0xff800000\n"
+ "lsr x25, %x[n_valid_cells], #0x2\n"
+ "dup v8.4s, w20\n"
+ "mov x22, %x[inptrs]\n"
+ "cbz x25, 11f\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q4, [x21, x27]\n"
+ "subs x25, x25, #0x1\n"
+ "ldr q3, [x20, x27]\n"
+ "ldp x21, x20, [x22, #0x10]\n"
+ "add x22, x22, #0x20\n"
+ "ldr q28, [x21, x27]\n"
+ "ldr q22, [x20, x27]\n"
"beq 10f\n"
"9:" // Single vector of channels: Loop: 4 inputs loop
- "fmax v23.4s, v3.4s, v2.4s\n"
- "ldp x23, x22, [x19, #0x0]\n"
- "subs x24, x24, #0x1\n"
- "fmax v19.4s, v1.4s, v0.4s\n"
- "ldp x21, x20, [x19, #0x10]\n"
- "add x19, x19, #0x20\n"
- "fmax v19.4s, v23.4s, v19.4s\n"
- "ldr q3, [x23, x28]\n"
- "ldr q2, [x22, x28]\n"
- "fmax v7.4s, v7.4s, v19.4s\n"
- "ldr q1, [x21, x28]\n"
- "ldr q0, [x20, x28]\n"
+ "fmax v17.4s, v4.4s, v3.4s\n"
+ "fmax v16.4s, v28.4s, v22.4s\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q4, [x21, x27]\n"
+ "ldr q3, [x20, x27]\n"
+ "fmax v16.4s, v17.4s, v16.4s\n"
+ "ldp x21, x20, [x22, #0x10]\n"
+ "subs x25, x25, #0x1\n"
+ "ldr q28, [x21, x27]\n"
+ "ldr q22, [x20, x27]\n"
+ "fmax v8.4s, v8.4s, v16.4s\n"
+ "add x22, x22, #0x20\n"
"bgt 9b\n"
"10:" // Single vector of channels: Loop: 4 inputs tail
- "fmax v23.4s, v3.4s, v2.4s\n"
- "fmax v19.4s, v1.4s, v0.4s\n"
- "fmax v19.4s, v23.4s, v19.4s\n"
- "fmax v7.4s, v7.4s, v19.4s\n"
+ "fmax v17.4s, v4.4s, v3.4s\n"
+ "fmax v16.4s, v28.4s, v22.4s\n"
+ "fmax v16.4s, v17.4s, v16.4s\n"
+ "fmax v8.4s, v8.4s, v16.4s\n"
"11:" // Single vector of channels: Loop: After loop
- "ands x20, %x[n_valid_cells], #0x3\n"
+ "ands x21, %x[n_valid_cells], #0x3\n"
"beq 13f\n"
"12:" // Single vector of channels: Loop: Single input loop
- "ldr x23, [x19], #0x8\n"
- "subs x20, x20, #0x1\n"
- "ldr q3, [x23, x28]\n"
- "fmax v7.4s, v7.4s, v3.4s\n"
+ "ldr x20, [x22], #0x8\n"
+ "ldr q16, [x20, x27]\n"
+ "subs x21, x21, #0x1\n"
+ "fmax v8.4s, v8.4s, v16.4s\n"
"bgt 12b\n"
"13:" // Single vector of channels: Loop: Single input loop: End
- "str q7, [%x[outptr], x28]\n"
- "add x28, x28, #0x10\n"
"sub %x[n_channels], %x[n_channels], #0x4\n"
"cmp %x[n_channels], #0x4\n"
+ "str q8, [%x[outptr], x27]\n"
+ "add x27, x27, #0x10\n"
"bge 8b\n"
"cbz %x[n_channels], 25f\n"
"14:" // Oddments
- "add %x[outptr], %x[outptr], x28\n"
- "mov w19, #0xff800000\n"
- "dup v7.4s, w19\n"
- "mov x19, %x[inptrs]\n"
- "lsr x24, %x[n_valid_cells], #0x2\n"
- "cbz x24, 18f\n"
+ "mov w20, #0xff800000\n"
+ "lsr x25, %x[n_valid_cells], #0x2\n"
+ "dup v8.4s, w20\n"
+ "add %x[outptr], %x[outptr], x27\n"
+ "mov x24, %x[inptrs]\n"
+ "cbz x25, 18f\n"
"15:" // Oddments: 4 inputs loop
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x24, x24, #0x20\n"
+ "add x23, x23, x27\n"
+ "add x22, x22, x27\n"
+ "add x21, x21, x27\n"
+ "movi v4.16b, #0x0\n"
"movi v3.16b, #0x0\n"
- "ldp x23, x22, [x19, #0x0]\n"
- "add x23, x23, x28\n"
- "movi v2.16b, #0x0\n"
- "ldp x21, x20, [x19, #0x10]\n"
- "movi v1.16b, #0x0\n"
- "add x19, x19, #0x20\n"
- "movi v0.16b, #0x0\n"
- "add x22, x22, x28\n"
- "add x21, x21, x28\n"
- "add x20, x20, x28\n"
+ "add x20, x20, x27\n"
+ "movi v28.16b, #0x0\n"
+ "movi v22.16b, #0x0\n"
"tbz %x[n_channels], #1, 16f\n"
- "ldr d3, [x23], #0x8\n"
- "ldr d2, [x22], #0x8\n"
- "ldr d1, [x21], #0x8\n"
- "ldr d0, [x20], #0x8\n"
+ "ldr d4, [x23], #0x8\n"
+ "ldr d3, [x22], #0x8\n"
+ "ldr d28, [x21], #0x8\n"
+ "ldr d22, [x20], #0x8\n"
"tbz %x[n_channels], #0, 17f\n"
- "ld1 { v3.s }[2], [x23], #0x4\n"
- "ld1 { v2.s }[2], [x22], #0x4\n"
- "ld1 { v1.s }[2], [x21], #0x4\n"
- "ld1 { v0.s }[2], [x20], #0x4\n"
+ "ld1 { v4.s }[2], [x23], #0x4\n"
+ "ld1 { v3.s }[2], [x22], #0x4\n"
+ "ld1 { v28.s }[2], [x21], #0x4\n"
+ "ld1 { v22.s }[2], [x20], #0x4\n"
"b 17f\n"
"16:" // Oddments: 4 inputs loop: Load: Bit 1: Unset
"tbz %x[n_channels], #0, 17f\n"
- "ldr s3, [x23], #0x4\n"
- "ldr s2, [x22], #0x4\n"
- "ldr s1, [x21], #0x4\n"
- "ldr s0, [x20], #0x4\n"
+ "ldr s4, [x23], #0x4\n"
+ "ldr s3, [x22], #0x4\n"
+ "ldr s28, [x21], #0x4\n"
+ "ldr s22, [x20], #0x4\n"
"17:" // Oddments: 4 inputs loop: Load: Bit 1: End
- "fmax v23.4s, v3.4s, v2.4s\n"
- "subs x24, x24, #0x1\n"
- "fmax v19.4s, v1.4s, v0.4s\n"
- "fmax v19.4s, v23.4s, v19.4s\n"
- "fmax v7.4s, v7.4s, v19.4s\n"
+ "fmax v17.4s, v4.4s, v3.4s\n"
+ "fmax v16.4s, v28.4s, v22.4s\n"
+ "subs x25, x25, #0x1\n"
+ "fmax v16.4s, v17.4s, v16.4s\n"
+ "fmax v8.4s, v8.4s, v16.4s\n"
"bgt 15b\n"
"18:" // Oddments: After loop
- "ands x20, %x[n_valid_cells], #0x3\n"
+ "ands x21, %x[n_valid_cells], #0x3\n"
"beq 22f\n"
"19:" // Oddments: Single input loop
- "movi v3.16b, #0x0\n"
- "ldr x23, [x19], #0x8\n"
- "add x23, x23, x28\n"
+ "ldr x23, [x24], #0x8\n"
+ "add x23, x23, x27\n"
+ "movi v4.16b, #0x0\n"
"tbz %x[n_channels], #1, 20f\n"
- "ldr d3, [x23], #0x8\n"
+ "ldr d4, [x23], #0x8\n"
"tbz %x[n_channels], #0, 21f\n"
- "ld1 { v3.s }[2], [x23], #0x4\n"
+ "ld1 { v4.s }[2], [x23], #0x4\n"
"b 21f\n"
"20:" // Oddments: Single input loop: Load: Bit 1: Unset
"tbz %x[n_channels], #0, 21f\n"
- "ldr s3, [x23], #0x4\n"
+ "ldr s4, [x23], #0x4\n"
"21:" // Oddments: Single input loop: Load: Bit 1: End
- "fmax v7.4s, v7.4s, v3.4s\n"
- "subs x20, x20, #0x1\n"
+ "subs x21, x21, #0x1\n"
+ "fmax v8.4s, v8.4s, v4.4s\n"
"bgt 19b\n"
"22:" // Oddments: Single input loop: End
"tbz %x[n_channels], #1, 23f\n"
- "st1 { v7.d }[0], [%x[outptr]], #0x8\n"
+ "st1 { v8.d }[0], [%x[outptr]], #0x8\n"
"tbz %x[n_channels], #0, 24f\n"
- "st1 { v7.s }[2], [%x[outptr]], #0x4\n"
+ "st1 { v8.s }[2], [%x[outptr]], #0x4\n"
"b 24f\n"
"23:" // Oddments: Store: Bit 1: Unset
"tbz %x[n_channels], #0, 24f\n"
- "st1 { v7.s }[0], [%x[outptr]], #0x4\n"
+ "st1 { v8.s }[0], [%x[outptr]], #0x4\n"
"24:" // Oddments: Store: Bit 1: End
-
"25:" // End
-
: [n_channels] "+&r" (n_channels), [outptr] "+&r" (outptr)
: [inptrs] "r" (inptrs), [n_valid_cells] "r" (n_valid_cells)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8_nhwc_avg_generic_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8_nhwc_avg_generic_depthfirst.hpp
index df66ab7a2c..de94ec0ec3 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8_nhwc_avg_generic_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8_nhwc_avg_generic_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -33,19 +33,11 @@ namespace pooling {
void a64_s8_nhwc_avg_generic_depthfirst_impl(const uint64_t window_cells, const uint64_t n_valid_cells, uint64_t n_channels, const int8_t *const *const inptrs, int8_t *outptr);
-struct a64_s8_nhwc_avg_generic_depthfirst
+struct a64_s8_nhwc_avg_generic_depthfirst : IGenericDepthfirstStrategy<int8_t, int8_t>
{
- typedef int8_t operand_type;
- typedef int8_t return_type;
-
- typedef void (*kern_type)(const uint64_t window_cells, const uint64_t n_valid_cells, uint64_t n_channels, const int8_t *const *const inptrs, int8_t *outptr);
-
- constexpr static PoolingType pooling_type(void) { return PoolingType::AVERAGE; }
-
-
- kern_type kernel = a64_s8_nhwc_avg_generic_depthfirst_impl;
-
+ using Parent = IGenericDepthfirstStrategy<int8_t, int8_t>;
a64_s8_nhwc_avg_generic_depthfirst(const CPUInfo *) {}
+ typename Parent::KernelType get_kernel(void) const override { return a64_s8_nhwc_avg_generic_depthfirst_impl; }
};
} // namespace pooling
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8_nhwc_avg_generic_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8_nhwc_avg_generic_depthfirst/generic.cpp
index 405ae66755..5d082102b3 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8_nhwc_avg_generic_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8_nhwc_avg_generic_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,6 +23,7 @@
*/
#include <cstdint>
+#include <cstddef>
#include <cstring>
#include <cmath>
@@ -83,27 +84,28 @@ void a64_s8_nhwc_avg_generic_depthfirst_impl(
shift_value--;
f_rescale_value *= 2.0f;
}
- int64_t large_rescale_value = round(f_rescale_value * static_cast<float>(1ll << 31));
- if (large_rescale_value == (1ll << 31))
+
+ int64_t long_rescale_value = round(f_rescale_value * static_cast<float>(1ll << 31));
+ if (long_rescale_value == (1ll << 31))
{
shift_value++;
- large_rescale_value >>= 1;
+ long_rescale_value >>= 1;
}
- rescale_value = static_cast<int32_t>(large_rescale_value);
+ rescale_value = static_cast<int32_t>(long_rescale_value);
}
__asm__ __volatile__(
- "mov x26, #0x0\n"
- "mov x25, #0x10\n" // cntb _, ALL, #1
- "mov x24, #0x20\n" // cntb _, ALL, #2
- "mov x23, #0x30\n" // cntb _, ALL, #3
"cmp %x[n_channels], #0x40\n"
+ "mov x27, #0x0\n"
+ "mov x26, #0x10\n" // cntb _, ALL, #1
+ "mov x25, #0x20\n" // cntb _, ALL, #2
+ "mov x24, #0x30\n" // cntb _, ALL, #3
"blt 7f\n"
"1:" // 4-vectors of channels
+ "lsr x23, %x[n_valid_cells], #0x1\n"
"movi v15.4s, #0x0\n"
- "mov x19, %x[inptrs]\n"
"movi v14.4s, #0x0\n"
- "lsr x22, %x[n_valid_cells], #0x1\n"
+ "mov x22, %x[inptrs]\n"
"movi v13.4s, #0x0\n"
"movi v12.4s, #0x0\n"
"movi v11.4s, #0x0\n"
@@ -118,43 +120,43 @@ void a64_s8_nhwc_avg_generic_depthfirst_impl(
"movi v2.4s, #0x0\n"
"movi v1.4s, #0x0\n"
"movi v0.4s, #0x0\n"
- "cbz x22, 4f\n"
- "ldp x21, x20, [x19, #0x0]\n"
- "ldr q31, [x21, x26]\n"
- "add x19, x19, #0x10\n"
- "ldr q30, [x20, x26]\n"
- "subs x22, x22, #0x1\n"
- "ldr q29, [x21, x25]\n"
- "ldr q28, [x20, x25]\n"
- "ldr q27, [x21, x24]\n"
- "ldr q26, [x20, x24]\n"
- "ldr q25, [x21, x23]\n"
- "ldr q24, [x20, x23]\n"
+ "cbz x23, 4f\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q31, [x21, x27]\n"
+ "subs x23, x23, #0x1\n"
+ "add x22, x22, #0x10\n"
+ "ldr q30, [x20, x27]\n"
+ "ldr q29, [x21, x26]\n"
+ "ldr q28, [x20, x26]\n"
+ "ldr q27, [x21, x25]\n"
+ "ldr q26, [x20, x25]\n"
+ "ldr q25, [x21, x24]\n"
+ "ldr q24, [x20, x24]\n"
"beq 3f\n"
"2:" // 4-vectors of channels: 2 inputs loop
"saddl v23.8h, v31.8b, v30.8b\n"
- "ldp x21, x20, [x19, #0x0]\n"
- "add x19, x19, #0x10\n"
"saddl2 v22.8h, v31.16b, v30.16b\n"
- "ldr q31, [x21, x26]\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q31, [x21, x27]\n"
+ "ldr q30, [x20, x27]\n"
"saddl v21.8h, v29.8b, v28.8b\n"
- "subs x22, x22, #0x1\n"
"saddl2 v20.8h, v29.16b, v28.16b\n"
- "ldr q30, [x20, x26]\n"
+ "ldr q29, [x21, x26]\n"
+ "ldr q28, [x20, x26]\n"
"saddl v19.8h, v27.8b, v26.8b\n"
- "ldr q29, [x21, x25]\n"
"saddl2 v18.8h, v27.16b, v26.16b\n"
- "ldr q28, [x20, x25]\n"
+ "ldr q27, [x21, x25]\n"
+ "ldr q26, [x20, x25]\n"
"saddl v17.8h, v25.8b, v24.8b\n"
- "ldr q27, [x21, x24]\n"
"saddl2 v16.8h, v25.16b, v24.16b\n"
- "ldr q26, [x20, x24]\n"
+ "ldr q25, [x21, x24]\n"
+ "ldr q24, [x20, x24]\n"
+ "subs x23, x23, #0x1\n"
"saddw v15.4s, v15.4s, v23.4h\n"
- "ldr q25, [x21, x23]\n"
"saddw2 v14.4s, v14.4s, v23.8h\n"
- "ldr q24, [x20, x23]\n"
"saddw v13.4s, v13.4s, v22.4h\n"
"saddw2 v12.4s, v12.4s, v22.8h\n"
+ "add x22, x22, #0x10\n"
"saddw v11.4s, v11.4s, v21.4h\n"
"saddw2 v10.4s, v10.4s, v21.8h\n"
"saddw v9.4s, v9.4s, v20.4h\n"
@@ -194,23 +196,23 @@ void a64_s8_nhwc_avg_generic_depthfirst_impl(
"saddw v1.4s, v1.4s, v16.4h\n"
"saddw2 v0.4s, v0.4s, v16.8h\n"
"4:" // 4-vectors of channels: After loop
- "ands x20, %x[n_valid_cells], #0x1\n"
+ "ands x23, %x[n_valid_cells], #0x1\n"
"beq 6f\n"
"5:" // 4-vectors of channels: Single input loop
- "ldr x21, [x19], #0x8\n"
- "subs x20, x20, #0x1\n"
- "ldr q31, [x21, x26]\n"
- "sxtl v23.8h, v31.8b\n"
- "ldr q29, [x21, x25]\n"
- "sxtl2 v22.8h, v31.16b\n"
- "ldr q27, [x21, x24]\n"
- "ldr q25, [x21, x23]\n"
- "sxtl v21.8h, v29.8b\n"
- "sxtl2 v20.8h, v29.16b\n"
- "sxtl v19.8h, v27.8b\n"
- "sxtl2 v18.8h, v27.16b\n"
- "sxtl v17.8h, v25.8b\n"
- "sxtl2 v16.8h, v25.16b\n"
+ "ldr x20, [x22], #0x8\n"
+ "ldr q16, [x20, x27]\n"
+ "sxtl v23.8h, v16.8b\n"
+ "sxtl2 v22.8h, v16.16b\n"
+ "ldr q16, [x20, x26]\n"
+ "ldr q17, [x20, x25]\n"
+ "sxtl v21.8h, v16.8b\n"
+ "sxtl2 v20.8h, v16.16b\n"
+ "ldr q16, [x20, x24]\n"
+ "sxtl v19.8h, v17.8b\n"
+ "sxtl2 v18.8h, v17.16b\n"
+ "subs x23, x23, #0x1\n"
+ "sxtl v17.8h, v16.8b\n"
+ "sxtl2 v16.8h, v16.16b\n"
"saddw v15.4s, v15.4s, v23.4h\n"
"saddw2 v14.4s, v14.4s, v23.8h\n"
"saddw v13.4s, v13.4s, v22.4h\n"
@@ -229,195 +231,195 @@ void a64_s8_nhwc_avg_generic_depthfirst_impl(
"saddw2 v0.4s, v0.4s, v16.8h\n"
"bgt 5b\n"
"6:" // 4-vectors of channels: Single input loop: End
- "movi v19.4s, #0x7f\n"
- "ld1r { v18.4s }, [%x[rescale_ptr]]\n"
+ "ld1r { v17.4s }, [%x[rescale_ptr]]\n"
+ "ld1r { v16.4s }, [%x[shift_ptr]]\n"
+ "sqdmulh v15.4s, v15.4s, v17.4s\n"
+ "sqdmulh v14.4s, v14.4s, v17.4s\n"
+ "sqdmulh v13.4s, v13.4s, v17.4s\n"
+ "sqdmulh v12.4s, v12.4s, v17.4s\n"
"sub %x[n_channels], %x[n_channels], #0x40\n"
- "sqdmulh v15.4s, v15.4s, v18.4s\n"
- "ld1r { v17.4s }, [%x[shift_ptr]]\n"
- "not v16.16b, v19.16b\n"
- "sqdmulh v14.4s, v14.4s, v18.4s\n"
"cmp %x[n_channels], #0x40\n"
- "sqdmulh v13.4s, v13.4s, v18.4s\n"
- "sqdmulh v12.4s, v12.4s, v18.4s\n"
- "sqdmulh v11.4s, v11.4s, v18.4s\n"
- "sqdmulh v10.4s, v10.4s, v18.4s\n"
- "sqdmulh v9.4s, v9.4s, v18.4s\n"
- "srshl v15.4s, v15.4s, v17.4s\n"
- "srshl v14.4s, v14.4s, v17.4s\n"
- "srshl v13.4s, v13.4s, v17.4s\n"
- "srshl v12.4s, v12.4s, v17.4s\n"
- "srshl v11.4s, v11.4s, v17.4s\n"
- "srshl v10.4s, v10.4s, v17.4s\n"
- "srshl v9.4s, v9.4s, v17.4s\n"
- "sqdmulh v8.4s, v8.4s, v18.4s\n"
- "sqdmulh v7.4s, v7.4s, v18.4s\n"
- "sqdmulh v6.4s, v6.4s, v18.4s\n"
- "sqdmulh v5.4s, v5.4s, v18.4s\n"
- "srshl v8.4s, v8.4s, v17.4s\n"
- "srshl v7.4s, v7.4s, v17.4s\n"
- "srshl v6.4s, v6.4s, v17.4s\n"
- "srshl v5.4s, v5.4s, v17.4s\n"
- "sqdmulh v4.4s, v4.4s, v18.4s\n"
- "sqdmulh v3.4s, v3.4s, v18.4s\n"
- "sqdmulh v2.4s, v2.4s, v18.4s\n"
- "sqdmulh v1.4s, v1.4s, v18.4s\n"
- "srshl v4.4s, v4.4s, v17.4s\n"
- "srshl v3.4s, v3.4s, v17.4s\n"
- "srshl v2.4s, v2.4s, v17.4s\n"
- "srshl v1.4s, v1.4s, v17.4s\n"
- "sqdmulh v0.4s, v0.4s, v18.4s\n"
+ "sqdmulh v11.4s, v11.4s, v17.4s\n"
+ "sqdmulh v10.4s, v10.4s, v17.4s\n"
+ "sqdmulh v9.4s, v9.4s, v17.4s\n"
+ "sqdmulh v8.4s, v8.4s, v17.4s\n"
+ "sqdmulh v7.4s, v7.4s, v17.4s\n"
+ "sqdmulh v6.4s, v6.4s, v17.4s\n"
+ "sqdmulh v5.4s, v5.4s, v17.4s\n"
+ "sqdmulh v4.4s, v4.4s, v17.4s\n"
+ "sqdmulh v3.4s, v3.4s, v17.4s\n"
+ "sqdmulh v2.4s, v2.4s, v17.4s\n"
+ "sqdmulh v1.4s, v1.4s, v17.4s\n"
+ "sqdmulh v0.4s, v0.4s, v17.4s\n"
+ "movi v17.4s, #0x7f\n"
+ "srshl v15.4s, v15.4s, v16.4s\n"
+ "srshl v14.4s, v14.4s, v16.4s\n"
+ "srshl v13.4s, v13.4s, v16.4s\n"
+ "srshl v12.4s, v12.4s, v16.4s\n"
+ "srshl v11.4s, v11.4s, v16.4s\n"
+ "srshl v10.4s, v10.4s, v16.4s\n"
+ "srshl v9.4s, v9.4s, v16.4s\n"
+ "srshl v8.4s, v8.4s, v16.4s\n"
+ "srshl v7.4s, v7.4s, v16.4s\n"
+ "srshl v6.4s, v6.4s, v16.4s\n"
+ "srshl v5.4s, v5.4s, v16.4s\n"
+ "srshl v4.4s, v4.4s, v16.4s\n"
+ "srshl v3.4s, v3.4s, v16.4s\n"
+ "srshl v2.4s, v2.4s, v16.4s\n"
+ "srshl v1.4s, v1.4s, v16.4s\n"
+ "srshl v0.4s, v0.4s, v16.4s\n"
+ "not v16.16b, v17.16b\n"
"smax v15.4s, v15.4s, v16.4s\n"
"smax v14.4s, v14.4s, v16.4s\n"
"smax v13.4s, v13.4s, v16.4s\n"
- "srshl v0.4s, v0.4s, v17.4s\n"
- "smin v15.4s, v15.4s, v19.4s\n"
- "smin v14.4s, v14.4s, v19.4s\n"
- "smin v13.4s, v13.4s, v19.4s\n"
"smax v12.4s, v12.4s, v16.4s\n"
"smax v11.4s, v11.4s, v16.4s\n"
"smax v10.4s, v10.4s, v16.4s\n"
- "smin v12.4s, v12.4s, v19.4s\n"
- "smin v11.4s, v11.4s, v19.4s\n"
- "smin v10.4s, v10.4s, v19.4s\n"
"smax v9.4s, v9.4s, v16.4s\n"
"smax v8.4s, v8.4s, v16.4s\n"
"smax v7.4s, v7.4s, v16.4s\n"
- "smin v9.4s, v9.4s, v19.4s\n"
- "smin v8.4s, v8.4s, v19.4s\n"
- "smin v7.4s, v7.4s, v19.4s\n"
"smax v6.4s, v6.4s, v16.4s\n"
"smax v5.4s, v5.4s, v16.4s\n"
"smax v4.4s, v4.4s, v16.4s\n"
- "smin v6.4s, v6.4s, v19.4s\n"
- "smin v5.4s, v5.4s, v19.4s\n"
- "smin v4.4s, v4.4s, v19.4s\n"
"smax v3.4s, v3.4s, v16.4s\n"
"smax v2.4s, v2.4s, v16.4s\n"
"smax v1.4s, v1.4s, v16.4s\n"
- "smin v3.4s, v3.4s, v19.4s\n"
- "smin v2.4s, v2.4s, v19.4s\n"
- "smin v1.4s, v1.4s, v19.4s\n"
"smax v0.4s, v0.4s, v16.4s\n"
+ "smin v15.4s, v15.4s, v17.4s\n"
+ "smin v14.4s, v14.4s, v17.4s\n"
+ "smin v13.4s, v13.4s, v17.4s\n"
+ "smin v12.4s, v12.4s, v17.4s\n"
+ "smin v11.4s, v11.4s, v17.4s\n"
+ "smin v10.4s, v10.4s, v17.4s\n"
+ "smin v9.4s, v9.4s, v17.4s\n"
+ "smin v8.4s, v8.4s, v17.4s\n"
+ "smin v7.4s, v7.4s, v17.4s\n"
+ "smin v6.4s, v6.4s, v17.4s\n"
+ "smin v5.4s, v5.4s, v17.4s\n"
+ "smin v4.4s, v4.4s, v17.4s\n"
+ "smin v3.4s, v3.4s, v17.4s\n"
+ "smin v2.4s, v2.4s, v17.4s\n"
+ "smin v1.4s, v1.4s, v17.4s\n"
+ "smin v0.4s, v0.4s, v17.4s\n"
"uzp1 v23.16b, v15.16b, v14.16b\n"
"uzp1 v16.16b, v13.16b, v12.16b\n"
- "smin v0.4s, v0.4s, v19.4s\n"
"uzp1 v22.16b, v11.16b, v10.16b\n"
- "uzp1 v21.16b, v9.16b, v8.16b\n"
- "uzp1 v20.16b, v7.16b, v6.16b\n"
+ "uzp1 v18.16b, v9.16b, v8.16b\n"
+ "uzp1 v21.16b, v7.16b, v6.16b\n"
"uzp1 v17.16b, v5.16b, v4.16b\n"
- "uzp1 v19.16b, v3.16b, v2.16b\n"
- "uzp1 v18.16b, v1.16b, v0.16b\n"
+ "uzp1 v20.16b, v3.16b, v2.16b\n"
+ "uzp1 v19.16b, v1.16b, v0.16b\n"
"uzp1 v16.16b, v23.16b, v16.16b\n"
- "str q16, [%x[outptr], x26]\n"
- "uzp1 v16.16b, v22.16b, v21.16b\n"
+ "uzp1 v18.16b, v22.16b, v18.16b\n"
+ "str q16, [%x[outptr], x27]\n"
+ "add x27, x27, #0x40\n"
+ "uzp1 v17.16b, v21.16b, v17.16b\n"
+ "uzp1 v16.16b, v20.16b, v19.16b\n"
+ "str q18, [%x[outptr], x26]\n"
"add x26, x26, #0x40\n"
- "uzp1 v17.16b, v20.16b, v17.16b\n"
- "str q16, [%x[outptr], x25]\n"
- "uzp1 v16.16b, v19.16b, v18.16b\n"
+ "str q17, [%x[outptr], x25]\n"
"add x25, x25, #0x40\n"
- "str q17, [%x[outptr], x24]\n"
+ "str q16, [%x[outptr], x24]\n"
"add x24, x24, #0x40\n"
- "str q16, [%x[outptr], x23]\n"
- "add x23, x23, #0x40\n"
"bge 1b\n"
"cbz %x[n_channels], 43f\n"
"7:" // Single vector of channels
"cmp %x[n_channels], #0x10\n"
"blt 14f\n"
"8:" // Single vector of channels: Loop
+ "lsr x23, %x[n_valid_cells], #0x1\n"
"movi v15.4s, #0x0\n"
- "mov x19, %x[inptrs]\n"
"movi v14.4s, #0x0\n"
- "lsr x22, %x[n_valid_cells], #0x1\n"
+ "mov x22, %x[inptrs]\n"
"movi v13.4s, #0x0\n"
"movi v12.4s, #0x0\n"
- "cbz x22, 11f\n"
- "ldp x21, x20, [x19, #0x0]\n"
- "ldr q31, [x21, x26]\n"
- "add x19, x19, #0x10\n"
- "ldr q30, [x20, x26]\n"
- "subs x22, x22, #0x1\n"
+ "cbz x23, 11f\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q31, [x21, x27]\n"
+ "subs x23, x23, #0x1\n"
+ "add x22, x22, #0x10\n"
+ "ldr q30, [x20, x27]\n"
"beq 10f\n"
"9:" // Single vector of channels: Loop: 2 inputs loop
- "saddl v23.8h, v31.8b, v30.8b\n"
- "ldp x21, x20, [x19, #0x0]\n"
- "add x19, x19, #0x10\n"
- "saddl2 v22.8h, v31.16b, v30.16b\n"
- "ldr q31, [x21, x26]\n"
- "saddw v15.4s, v15.4s, v23.4h\n"
- "subs x22, x22, #0x1\n"
- "saddw2 v14.4s, v14.4s, v23.8h\n"
- "ldr q30, [x20, x26]\n"
- "saddw v13.4s, v13.4s, v22.4h\n"
- "saddw2 v12.4s, v12.4s, v22.8h\n"
+ "saddl v17.8h, v31.8b, v30.8b\n"
+ "saddl2 v16.8h, v31.16b, v30.16b\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q31, [x21, x27]\n"
+ "ldr q30, [x20, x27]\n"
+ "subs x23, x23, #0x1\n"
+ "saddw v15.4s, v15.4s, v17.4h\n"
+ "saddw2 v14.4s, v14.4s, v17.8h\n"
+ "saddw v13.4s, v13.4s, v16.4h\n"
+ "saddw2 v12.4s, v12.4s, v16.8h\n"
+ "add x22, x22, #0x10\n"
"bgt 9b\n"
"10:" // Single vector of channels: Loop: 2 inputs tail
- "saddl v23.8h, v31.8b, v30.8b\n"
- "saddl2 v22.8h, v31.16b, v30.16b\n"
- "saddw v15.4s, v15.4s, v23.4h\n"
- "saddw2 v14.4s, v14.4s, v23.8h\n"
- "saddw v13.4s, v13.4s, v22.4h\n"
- "saddw2 v12.4s, v12.4s, v22.8h\n"
+ "saddl v17.8h, v31.8b, v30.8b\n"
+ "saddl2 v16.8h, v31.16b, v30.16b\n"
+ "saddw v15.4s, v15.4s, v17.4h\n"
+ "saddw2 v14.4s, v14.4s, v17.8h\n"
+ "saddw v13.4s, v13.4s, v16.4h\n"
+ "saddw2 v12.4s, v12.4s, v16.8h\n"
"11:" // Single vector of channels: Loop: After loop
- "ands x20, %x[n_valid_cells], #0x1\n"
+ "ands x23, %x[n_valid_cells], #0x1\n"
"beq 13f\n"
"12:" // Single vector of channels: Loop: Single input loop
- "ldr x21, [x19], #0x8\n"
- "subs x20, x20, #0x1\n"
- "ldr q31, [x21, x26]\n"
- "sxtl v23.8h, v31.8b\n"
- "sxtl2 v22.8h, v31.16b\n"
- "saddw v15.4s, v15.4s, v23.4h\n"
- "saddw2 v14.4s, v14.4s, v23.8h\n"
- "saddw v13.4s, v13.4s, v22.4h\n"
- "saddw2 v12.4s, v12.4s, v22.8h\n"
+ "ldr x20, [x22], #0x8\n"
+ "ldr q16, [x20, x27]\n"
+ "sxtl v17.8h, v16.8b\n"
+ "sxtl2 v16.8h, v16.16b\n"
+ "subs x23, x23, #0x1\n"
+ "saddw v15.4s, v15.4s, v17.4h\n"
+ "saddw2 v14.4s, v14.4s, v17.8h\n"
+ "saddw v13.4s, v13.4s, v16.4h\n"
+ "saddw2 v12.4s, v12.4s, v16.8h\n"
"bgt 12b\n"
"13:" // Single vector of channels: Loop: Single input loop: End
- "movi v19.4s, #0x7f\n"
- "ld1r { v18.4s }, [%x[rescale_ptr]]\n"
+ "ld1r { v17.4s }, [%x[rescale_ptr]]\n"
+ "ld1r { v16.4s }, [%x[shift_ptr]]\n"
+ "sqdmulh v15.4s, v15.4s, v17.4s\n"
+ "sqdmulh v14.4s, v14.4s, v17.4s\n"
+ "sqdmulh v13.4s, v13.4s, v17.4s\n"
+ "sqdmulh v12.4s, v12.4s, v17.4s\n"
"sub %x[n_channels], %x[n_channels], #0x10\n"
- "sqdmulh v15.4s, v15.4s, v18.4s\n"
- "ld1r { v17.4s }, [%x[shift_ptr]]\n"
- "not v16.16b, v19.16b\n"
- "sqdmulh v14.4s, v14.4s, v18.4s\n"
"cmp %x[n_channels], #0x10\n"
- "sqdmulh v13.4s, v13.4s, v18.4s\n"
- "sqdmulh v12.4s, v12.4s, v18.4s\n"
- "srshl v15.4s, v15.4s, v17.4s\n"
- "srshl v14.4s, v14.4s, v17.4s\n"
- "srshl v13.4s, v13.4s, v17.4s\n"
- "srshl v12.4s, v12.4s, v17.4s\n"
+ "movi v17.4s, #0x7f\n"
+ "srshl v15.4s, v15.4s, v16.4s\n"
+ "srshl v14.4s, v14.4s, v16.4s\n"
+ "srshl v13.4s, v13.4s, v16.4s\n"
+ "srshl v12.4s, v12.4s, v16.4s\n"
+ "not v16.16b, v17.16b\n"
"smax v15.4s, v15.4s, v16.4s\n"
"smax v14.4s, v14.4s, v16.4s\n"
"smax v13.4s, v13.4s, v16.4s\n"
"smax v12.4s, v12.4s, v16.4s\n"
- "smin v15.4s, v15.4s, v19.4s\n"
- "smin v14.4s, v14.4s, v19.4s\n"
- "smin v13.4s, v13.4s, v19.4s\n"
- "smin v12.4s, v12.4s, v19.4s\n"
- "uzp1 v23.16b, v15.16b, v14.16b\n"
+ "smin v15.4s, v15.4s, v17.4s\n"
+ "smin v14.4s, v14.4s, v17.4s\n"
+ "smin v13.4s, v13.4s, v17.4s\n"
+ "smin v12.4s, v12.4s, v17.4s\n"
+ "uzp1 v17.16b, v15.16b, v14.16b\n"
"uzp1 v16.16b, v13.16b, v12.16b\n"
- "uzp1 v16.16b, v23.16b, v16.16b\n"
- "str q16, [%x[outptr], x26]\n"
- "add x26, x26, #0x10\n"
+ "uzp1 v16.16b, v17.16b, v16.16b\n"
+ "str q16, [%x[outptr], x27]\n"
+ "add x27, x27, #0x10\n"
"bge 8b\n"
"cbz %x[n_channels], 43f\n"
"14:" // Oddments
+ "lsr x23, %x[n_valid_cells], #0x1\n"
+ "add %x[outptr], %x[outptr], x27\n"
"movi v15.4s, #0x0\n"
- "add %x[outptr], %x[outptr], x26\n"
"movi v14.4s, #0x0\n"
- "mov x19, %x[inptrs]\n"
"movi v13.4s, #0x0\n"
- "lsr x22, %x[n_valid_cells], #0x1\n"
"movi v12.4s, #0x0\n"
- "cbz x22, 24f\n"
+ "mov x22, %x[inptrs]\n"
+ "cbz x23, 24f\n"
"15:" // Oddments: 2 inputs loop
+ "ldp x21, x20, [x22, #0x0]\n"
+ "add x22, x22, #0x10\n"
+ "add x21, x21, x27\n"
"movi v31.16b, #0x0\n"
- "ldp x21, x20, [x19, #0x0]\n"
- "add x19, x19, #0x10\n"
+ "add x20, x20, x27\n"
"movi v30.16b, #0x0\n"
- "add x21, x21, x26\n"
- "add x20, x20, x26\n"
"tbz %x[n_channels], #3, 19f\n"
"ldr d31, [x21], #0x8\n"
"ldr d30, [x20], #0x8\n"
@@ -478,21 +480,21 @@ void a64_s8_nhwc_avg_generic_depthfirst_impl(
"ldr b31, [x21], #0x1\n"
"ldr b30, [x20], #0x1\n"
"23:" // Oddments: 2 inputs loop: Load: Bit 3: End
- "saddl v23.8h, v31.8b, v30.8b\n"
- "subs x22, x22, #0x1\n"
- "saddl2 v22.8h, v31.16b, v30.16b\n"
- "saddw v15.4s, v15.4s, v23.4h\n"
- "saddw2 v14.4s, v14.4s, v23.8h\n"
- "saddw v13.4s, v13.4s, v22.4h\n"
- "saddw2 v12.4s, v12.4s, v22.8h\n"
+ "saddl v17.8h, v31.8b, v30.8b\n"
+ "saddl2 v16.8h, v31.16b, v30.16b\n"
+ "subs x23, x23, #0x1\n"
+ "saddw v15.4s, v15.4s, v17.4h\n"
+ "saddw2 v14.4s, v14.4s, v17.8h\n"
+ "saddw v13.4s, v13.4s, v16.4h\n"
+ "saddw2 v12.4s, v12.4s, v16.8h\n"
"bgt 15b\n"
"24:" // Oddments: After loop
- "ands x20, %x[n_valid_cells], #0x1\n"
+ "ands x23, %x[n_valid_cells], #0x1\n"
"beq 34f\n"
"25:" // Oddments: Single input loop
+ "ldr x21, [x22], #0x8\n"
+ "add x21, x21, x27\n"
"movi v31.16b, #0x0\n"
- "ldr x21, [x19], #0x8\n"
- "add x21, x21, x26\n"
"tbz %x[n_channels], #3, 29f\n"
"ldr d31, [x21], #0x8\n"
"tbz %x[n_channels], #2, 27f\n"
@@ -538,38 +540,38 @@ void a64_s8_nhwc_avg_generic_depthfirst_impl(
"tbz %x[n_channels], #0, 33f\n"
"ldr b31, [x21], #0x1\n"
"33:" // Oddments: Single input loop: Load: Bit 3: End
- "sxtl v23.8h, v31.8b\n"
- "subs x20, x20, #0x1\n"
- "sxtl2 v22.8h, v31.16b\n"
- "saddw v15.4s, v15.4s, v23.4h\n"
- "saddw2 v14.4s, v14.4s, v23.8h\n"
- "saddw v13.4s, v13.4s, v22.4h\n"
- "saddw2 v12.4s, v12.4s, v22.8h\n"
+ "sxtl v17.8h, v31.8b\n"
+ "sxtl2 v16.8h, v31.16b\n"
+ "subs x23, x23, #0x1\n"
+ "saddw v15.4s, v15.4s, v17.4h\n"
+ "saddw2 v14.4s, v14.4s, v17.8h\n"
+ "saddw v13.4s, v13.4s, v16.4h\n"
+ "saddw2 v12.4s, v12.4s, v16.8h\n"
"bgt 25b\n"
"34:" // Oddments: Single input loop: End
- "movi v19.4s, #0x7f\n"
- "ld1r { v18.4s }, [%x[rescale_ptr]]\n"
- "not v16.16b, v19.16b\n"
- "sqdmulh v15.4s, v15.4s, v18.4s\n"
- "ld1r { v17.4s }, [%x[shift_ptr]]\n"
- "sqdmulh v14.4s, v14.4s, v18.4s\n"
- "sqdmulh v13.4s, v13.4s, v18.4s\n"
- "sqdmulh v12.4s, v12.4s, v18.4s\n"
- "srshl v15.4s, v15.4s, v17.4s\n"
- "srshl v14.4s, v14.4s, v17.4s\n"
- "srshl v13.4s, v13.4s, v17.4s\n"
- "srshl v12.4s, v12.4s, v17.4s\n"
+ "ld1r { v17.4s }, [%x[rescale_ptr]]\n"
+ "ld1r { v16.4s }, [%x[shift_ptr]]\n"
+ "sqdmulh v15.4s, v15.4s, v17.4s\n"
+ "sqdmulh v14.4s, v14.4s, v17.4s\n"
+ "sqdmulh v13.4s, v13.4s, v17.4s\n"
+ "sqdmulh v12.4s, v12.4s, v17.4s\n"
+ "movi v17.4s, #0x7f\n"
+ "srshl v15.4s, v15.4s, v16.4s\n"
+ "srshl v14.4s, v14.4s, v16.4s\n"
+ "srshl v13.4s, v13.4s, v16.4s\n"
+ "srshl v12.4s, v12.4s, v16.4s\n"
+ "not v16.16b, v17.16b\n"
"smax v15.4s, v15.4s, v16.4s\n"
"smax v14.4s, v14.4s, v16.4s\n"
"smax v13.4s, v13.4s, v16.4s\n"
"smax v12.4s, v12.4s, v16.4s\n"
- "smin v15.4s, v15.4s, v19.4s\n"
- "smin v14.4s, v14.4s, v19.4s\n"
- "smin v13.4s, v13.4s, v19.4s\n"
- "smin v12.4s, v12.4s, v19.4s\n"
- "uzp1 v23.16b, v15.16b, v14.16b\n"
+ "smin v15.4s, v15.4s, v17.4s\n"
+ "smin v14.4s, v14.4s, v17.4s\n"
+ "smin v13.4s, v13.4s, v17.4s\n"
+ "smin v12.4s, v12.4s, v17.4s\n"
+ "uzp1 v17.16b, v15.16b, v14.16b\n"
"uzp1 v16.16b, v13.16b, v12.16b\n"
- "uzp1 v16.16b, v23.16b, v16.16b\n"
+ "uzp1 v16.16b, v17.16b, v16.16b\n"
"tbz %x[n_channels], #3, 38f\n"
"st1 { v16.d }[0], [%x[outptr]], #0x8\n"
"tbz %x[n_channels], #2, 36f\n"
@@ -615,12 +617,10 @@ void a64_s8_nhwc_avg_generic_depthfirst_impl(
"tbz %x[n_channels], #0, 42f\n"
"st1 { v16.b }[0], [%x[outptr]], #0x1\n"
"42:" // Oddments: Store: Bit 3: End
-
"43:" // End
-
: [n_channels] "+&r" (n_channels), [outptr] "+&r" (outptr)
: [inptrs] "r" (inptrs), [n_valid_cells] "r" (n_valid_cells), [rescale_ptr] "r" (&rescale_value), [shift_ptr] "r" (&shift_value)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8_nhwc_max_2x2_s1_output2x2_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8_nhwc_max_2x2_s1_output2x2_depthfirst.hpp
index 7829ecc0e9..f8f1134866 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8_nhwc_max_2x2_s1_output2x2_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8_nhwc_max_2x2_s1_output2x2_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,33 +24,28 @@
#pragma once
+#if defined(__aarch64__)
+
namespace arm_conv {
namespace pooling {
void a64_s8_nhwc_max_2x2_s1_output2x2_depthfirst_impl(unsigned int, const int8_t *const *const, int8_t *const *const, bool, unsigned int, unsigned int, unsigned int, unsigned int);
-struct a64_s8_nhwc_max_2x2_s1_output2x2_depthfirst
+struct a64_s8_nhwc_max_2x2_s1_output2x2_depthfirst : public DepthfirstStrategy<int8_t, int8_t>
{
- typedef int8_t operand_type;
- typedef int8_t return_type;
-
- typedef void (*kern_type)(unsigned int, const int8_t *const *const, int8_t *const *const, bool, unsigned int, unsigned int, unsigned int, unsigned int);
-
- constexpr static PoolingType pooling_type(void) { return PoolingType::MAX; }
+ using Parent = DepthfirstStrategy<int8_t, int8_t>;
- constexpr static unsigned int pool_rows(void) { return 2; }
- constexpr static unsigned int pool_cols(void) { return 2; }
+ const static auto pooling_type = PoolingType::MAX;
+ const static auto pool_rows = 2u, pool_cols = 2u;
+ const static auto stride_rows = 1u, stride_cols = 1u;
- constexpr static unsigned int stride_rows(void) { return 1; }
- constexpr static unsigned int stride_cols(void) { return 1; }
+ a64_s8_nhwc_max_2x2_s1_output2x2_depthfirst(const CPUInfo *)
+ : Parent(pool_rows, pool_cols, stride_rows, stride_cols, 2, 2) {}
- constexpr static unsigned int out_rows(void) { return 2; }
- constexpr static unsigned int out_cols(void) { return 2; }
-
- kern_type kernel = a64_s8_nhwc_max_2x2_s1_output2x2_depthfirst_impl;
-
- a64_s8_nhwc_max_2x2_s1_output2x2_depthfirst(const CPUInfo *) {}
+ Parent::KernelType get_kernel(void) const { return a64_s8_nhwc_max_2x2_s1_output2x2_depthfirst_impl; }
};
} // namespace pooling
} // namespace arm_conv
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp
index 298db96861..7e62ac1afc 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -26,6 +26,8 @@
#include <cstddef>
#include <cstdint>
+#if defined(__aarch64__)
+
namespace arm_conv {
namespace pooling {
@@ -61,114 +63,115 @@ void a64_s8_nhwc_max_2x2_s1_output2x2_depthfirst_impl(
pad_left, pad_top, pad_right, pad_bottom);
__asm__ __volatile__(
- "ldr x15, [%x[args], %[offsetof_n_channels]]\n"
- "mov x14, #0x0\n"
- "ldr x20, [%x[args], %[offsetof_outptrs]]\n"
- "mov x13, #0x0\n"
- "ldr x19, [%x[args], %[offsetof_inptrs]]\n"
- "cmp x15, #0x10\n"
- "ldp x12, x11, [x20, #0x0]\n"
- "ldp x10, x9, [x20, #0x10]\n"
- "ldp x28, x27, [x19, #0x0]\n"
- "ldp x26, x25, [x19, #0x10]\n"
- "ldp x24, x23, [x19, #0x20]\n"
- "ldp x22, x21, [x19, #0x30]\n"
- "ldr x20, [x19, #0x40]\n"
+ "ldr x16, [%x[args], %[offsetof_n_channels]]\n"
+ "ldr x21, [%x[args], %[offsetof_outptrs]]\n"
+ "cmp x16, #0x10\n"
+ "mov x15, #0x0\n"
+ "ldr x20, [%x[args], %[offsetof_inptrs]]\n"
+ "ldp x14, x13, [x21, #0x0]\n"
+ "mov x12, #0x0\n"
+ "ldp x11, x10, [x21, #0x10]\n"
+ "ldp x9, x28, [x20, #0x0]\n"
+ "ldp x27, x26, [x20, #0x10]\n"
+ "ldp x25, x24, [x20, #0x20]\n"
+ "ldp x23, x22, [x20, #0x30]\n"
+ "ldr x21, [x20, #0x40]\n"
"blt 3f\n"
- "ldr q30, [x27, x14]\n"
- "lsr x19, x15, #0x4\n"
- "ldr q29, [x24, x14]\n"
- "sub x15, x15, x19, LSL #4\n"
- "ldr q28, [x21, x14]\n"
- "subs x19, x19, #0x1\n"
- "ldr q27, [x25, x14]\n"
- "ldr q26, [x28, x14]\n"
- "ldr q25, [x23, x14]\n"
- "ldr q24, [x26, x14]\n"
- "ldr q23, [x22, x14]\n"
- "ldr q22, [x20, x14]\n"
- "add x14, x14, #0x10\n"
+ "ldr q30, [x28, x15]\n"
+ "ldr q29, [x25, x15]\n"
+ "lsr x20, x16, #0x4\n"
+ "sub x16, x16, x20, LSL #4\n"
+ "ldr q28, [x22, x15]\n"
+ "ldr q27, [x26, x15]\n"
+ "subs x20, x20, #0x1\n"
+ "ldr q26, [x9, x15]\n"
+ "ldr q25, [x27, x15]\n"
+ "ldr q24, [x24, x15]\n"
+ "ldr q23, [x23, x15]\n"
+ "ldr q22, [x21, x15]\n"
+ "add x15, x15, #0x10\n"
"beq 2f\n"
"1:" // Vector: Loop
"smax v21.16b, v30.16b, v29.16b\n"
- "ldr q30, [x27, x14]\n"
- "subs x19, x19, #0x1\n"
+ "ldr q30, [x28, x15]\n"
"smax v20.16b, v29.16b, v28.16b\n"
- "ldr q29, [x24, x14]\n"
+ "ldr q29, [x25, x15]\n"
+ "ldr q28, [x22, x15]\n"
"smax v19.16b, v27.16b, v26.16b\n"
- "ldr q28, [x21, x14]\n"
+ "ldr q26, [x9, x15]\n"
"smax v18.16b, v25.16b, v24.16b\n"
- "ldr q26, [x28, x14]\n"
- "smax v17.16b, v23.16b, v27.16b\n"
- "ldr q27, [x25, x14]\n"
- "smax v16.16b, v25.16b, v22.16b\n"
- "ldr q25, [x23, x14]\n"
+ "ldr q25, [x27, x15]\n"
+ "smax v17.16b, v27.16b, v23.16b\n"
+ "ldr q27, [x26, x15]\n"
+ "smax v16.16b, v24.16b, v22.16b\n"
+ "ldr q24, [x24, x15]\n"
+ "ldr q23, [x23, x15]\n"
+ "subs x20, x20, #0x1\n"
"smax v19.16b, v21.16b, v19.16b\n"
- "ldr q24, [x26, x14]\n"
- "smax v18.16b, v21.16b, v18.16b\n"
- "ldr q23, [x22, x14]\n"
- "smax v17.16b, v20.16b, v17.16b\n"
- "ldr q22, [x20, x14]\n"
- "add x14, x14, #0x10\n"
+ "ldr q22, [x21, x15]\n"
+ "smax v18.16b, v18.16b, v21.16b\n"
+ "smax v17.16b, v17.16b, v20.16b\n"
+ "add x15, x15, #0x10\n"
"smax v16.16b, v20.16b, v16.16b\n"
- "str q19, [x12, x13]\n"
- "str q18, [x11, x13]\n"
- "str q17, [x10, x13]\n"
- "str q16, [x9, x13]\n"
- "add x13, x13, #0x10\n"
+ "str q19, [x14, x12]\n"
+ "str q18, [x13, x12]\n"
+ "str q17, [x11, x12]\n"
+ "str q16, [x10, x12]\n"
+ "add x12, x12, #0x10\n"
"bgt 1b\n"
"2:" // Vector: Tail
"smax v21.16b, v30.16b, v29.16b\n"
"smax v20.16b, v29.16b, v28.16b\n"
- "smax v19.16b, v27.16b, v26.16b\n"
+ "smax v16.16b, v27.16b, v26.16b\n"
"smax v18.16b, v25.16b, v24.16b\n"
- "smax v17.16b, v23.16b, v27.16b\n"
- "smax v16.16b, v25.16b, v22.16b\n"
- "smax v19.16b, v21.16b, v19.16b\n"
- "str q19, [x12, x13]\n"
- "smax v18.16b, v21.16b, v18.16b\n"
- "smax v17.16b, v20.16b, v17.16b\n"
- "str q18, [x11, x13]\n"
- "smax v16.16b, v20.16b, v16.16b\n"
- "str q17, [x10, x13]\n"
- "str q16, [x9, x13]\n"
- "add x13, x13, #0x10\n"
- "cbz x15, 4f\n"
+ "smax v17.16b, v27.16b, v23.16b\n"
+ "smax v19.16b, v24.16b, v22.16b\n"
+ "smax v16.16b, v21.16b, v16.16b\n"
+ "smax v18.16b, v18.16b, v21.16b\n"
+ "str q16, [x14, x12]\n"
+ "smax v17.16b, v17.16b, v20.16b\n"
+ "smax v16.16b, v20.16b, v19.16b\n"
+ "str q18, [x13, x12]\n"
+ "str q17, [x11, x12]\n"
+ "str q16, [x10, x12]\n"
+ "add x12, x12, #0x10\n"
+ "cbz x16, 4f\n"
"3:" // Oddments
- "ldr b30, [x27, x14]\n"
- "subs x15, x15, #0x1\n"
- "ldr b29, [x24, x14]\n"
- "smax v21.16b, v30.16b, v29.16b\n"
- "ldr b28, [x21, x14]\n"
- "ldr b27, [x25, x14]\n"
- "smax v20.16b, v29.16b, v28.16b\n"
- "ldr b26, [x28, x14]\n"
- "ldr b25, [x23, x14]\n"
- "smax v19.16b, v27.16b, v26.16b\n"
- "ldr b24, [x26, x14]\n"
- "ldr b23, [x22, x14]\n"
- "smax v19.16b, v21.16b, v19.16b\n"
- "ldr b22, [x20, x14]\n"
- "add x14, x14, #0x1\n"
- "smax v18.16b, v25.16b, v24.16b\n"
- "str b19, [x12, x13]\n"
- "smax v17.16b, v23.16b, v27.16b\n"
- "smax v16.16b, v25.16b, v22.16b\n"
- "smax v18.16b, v21.16b, v18.16b\n"
- "str b18, [x11, x13]\n"
- "smax v17.16b, v20.16b, v17.16b\n"
- "smax v16.16b, v20.16b, v16.16b\n"
- "str b17, [x10, x13]\n"
- "str b16, [x9, x13]\n"
- "add x13, x13, #0x1\n"
+ "ldr b16, [x28, x15]\n"
+ "ldr b17, [x25, x15]\n"
+ "smax v23.16b, v16.16b, v17.16b\n"
+ "subs x16, x16, #0x1\n"
+ "ldr b16, [x22, x15]\n"
+ "ldr b22, [x26, x15]\n"
+ "smax v21.16b, v17.16b, v16.16b\n"
+ "ldr b16, [x9, x15]\n"
+ "ldr b17, [x27, x15]\n"
+ "smax v16.16b, v22.16b, v16.16b\n"
+ "smax v20.16b, v23.16b, v16.16b\n"
+ "ldr b19, [x24, x15]\n"
+ "ldr b16, [x23, x15]\n"
+ "smax v18.16b, v17.16b, v19.16b\n"
+ "smax v17.16b, v22.16b, v16.16b\n"
+ "ldr b16, [x21, x15]\n"
+ "smax v16.16b, v19.16b, v16.16b\n"
+ "add x15, x15, #0x1\n"
+ "smax v18.16b, v18.16b, v23.16b\n"
+ "smax v17.16b, v17.16b, v21.16b\n"
+ "smax v16.16b, v21.16b, v16.16b\n"
+ "str b20, [x14, x12]\n"
+ "str b18, [x13, x12]\n"
+ "str b17, [x11, x12]\n"
+ "str b16, [x10, x12]\n"
+ "add x12, x12, #0x1\n"
"bgt 3b\n"
"4:" // End
-
:
: [args] "r" (&args), [offsetof_inptrs] "I" (offsetof(KernelArgs, inptrs)), [offsetof_n_channels] "I" (offsetof(KernelArgs, n_channels)), [offsetof_outptrs] "I" (offsetof(KernelArgs, outptrs))
- : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
} // namespace pooling
} // namespace arm_conv
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8_nhwc_max_generic_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8_nhwc_max_generic_depthfirst.hpp
index 6c4cd1467f..ba6d52f570 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8_nhwc_max_generic_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8_nhwc_max_generic_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -33,19 +33,11 @@ namespace pooling {
void a64_s8_nhwc_max_generic_depthfirst_impl(const uint64_t, const uint64_t n_valid_cells, uint64_t n_channels, const int8_t *const *const inptrs, int8_t *outptr);
-struct a64_s8_nhwc_max_generic_depthfirst
+struct a64_s8_nhwc_max_generic_depthfirst : IGenericDepthfirstStrategy<int8_t, int8_t>
{
- typedef int8_t operand_type;
- typedef int8_t return_type;
-
- typedef void (*kern_type)(const uint64_t, const uint64_t n_valid_cells, uint64_t n_channels, const int8_t *const *const inptrs, int8_t *outptr);
-
- constexpr static PoolingType pooling_type(void) { return PoolingType::MAX; }
-
-
- kern_type kernel = a64_s8_nhwc_max_generic_depthfirst_impl;
-
+ using Parent = IGenericDepthfirstStrategy<int8_t, int8_t>;
a64_s8_nhwc_max_generic_depthfirst(const CPUInfo *) {}
+ typename Parent::KernelType get_kernel(void) const override { return a64_s8_nhwc_max_generic_depthfirst_impl; }
};
} // namespace pooling
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8_nhwc_max_generic_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8_nhwc_max_generic_depthfirst/generic.cpp
index 5e4c84d23e..411fd11460 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8_nhwc_max_generic_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8_nhwc_max_generic_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,6 +23,7 @@
*/
#include <cstdint>
+#include <cstddef>
#if defined(__aarch64__)
@@ -39,397 +40,395 @@ void a64_s8_nhwc_max_generic_depthfirst_impl(
)
{
__asm__ __volatile__(
- "mov x28, #0x0\n"
- "mov x27, #0x10\n" // cntb _, ALL, #1
- "mov x26, #0x20\n" // cntb _, ALL, #2
- "mov x25, #0x30\n" // cntb _, ALL, #3
"cmp %x[n_channels], #0x40\n"
+ "mov x27, #0x0\n"
+ "mov x26, #0x10\n" // cntb _, ALL, #1
+ "mov x24, #0x20\n" // cntb _, ALL, #2
+ "mov x23, #0x30\n" // cntb _, ALL, #3
"blt 7f\n"
"1:" // 4-vectors of channels
+ "lsr x25, %x[n_valid_cells], #0x2\n"
+ "movi v8.16b, #0x80\n"
"movi v7.16b, #0x80\n"
- "mov x19, %x[inptrs]\n"
+ "mov x22, %x[inptrs]\n"
"movi v6.16b, #0x80\n"
- "lsr x24, %x[n_valid_cells], #0x2\n"
"movi v5.16b, #0x80\n"
- "movi v4.16b, #0x80\n"
- "cbz x24, 4f\n"
- "ldp x23, x22, [x19, #0x0]\n"
- "ldp x21, x20, [x19, #0x10]\n"
- "add x19, x19, #0x20\n"
- "subs x24, x24, #0x1\n"
- "ldr q3, [x23, x28]\n"
- "ldr q2, [x22, x28]\n"
- "ldr q1, [x21, x28]\n"
- "ldr q0, [x20, x28]\n"
- "ldr q31, [x23, x27]\n"
- "ldr q30, [x22, x27]\n"
- "ldr q29, [x21, x27]\n"
- "ldr q28, [x20, x27]\n"
- "ldr q27, [x23, x26]\n"
- "ldr q21, [x22, x26]\n"
- "ldr q26, [x21, x26]\n"
- "ldr q17, [x20, x26]\n"
- "ldr q25, [x23, x25]\n"
- "ldr q20, [x22, x25]\n"
- "ldr q24, [x21, x25]\n"
- "ldr q16, [x20, x25]\n"
+ "cbz x25, 4f\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q4, [x21, x27]\n"
+ "subs x25, x25, #0x1\n"
+ "ldr q3, [x20, x27]\n"
+ "ldr q2, [x21, x26]\n"
+ "ldr q1, [x20, x26]\n"
+ "ldr q0, [x21, x24]\n"
+ "ldr q31, [x20, x24]\n"
+ "ldr q30, [x21, x23]\n"
+ "ldr q29, [x20, x23]\n"
+ "ldp x21, x20, [x22, #0x10]\n"
+ "add x22, x22, #0x20\n"
+ "ldr q28, [x21, x27]\n"
+ "ldr q22, [x20, x27]\n"
+ "ldr q27, [x21, x26]\n"
+ "ldr q21, [x20, x26]\n"
+ "ldr q26, [x21, x24]\n"
+ "ldr q20, [x20, x24]\n"
+ "ldr q25, [x21, x23]\n"
+ "ldr q24, [x20, x23]\n"
"beq 3f\n"
"2:" // 4-vectors of channels: 4 inputs loop
- "smax v23.16b, v3.16b, v2.16b\n"
- "ldp x23, x22, [x19, #0x0]\n"
- "subs x24, x24, #0x1\n"
- "smax v19.16b, v1.16b, v0.16b\n"
- "ldp x21, x20, [x19, #0x10]\n"
- "add x19, x19, #0x20\n"
- "smax v22.16b, v31.16b, v30.16b\n"
- "ldr q3, [x23, x28]\n"
- "smax v18.16b, v29.16b, v28.16b\n"
- "smax v21.16b, v27.16b, v21.16b\n"
- "ldr q2, [x22, x28]\n"
- "smax v17.16b, v26.16b, v17.16b\n"
- "ldr q1, [x21, x28]\n"
- "smax v20.16b, v25.16b, v20.16b\n"
- "ldr q0, [x20, x28]\n"
- "smax v16.16b, v24.16b, v16.16b\n"
- "ldr q31, [x23, x27]\n"
+ "smax v23.16b, v4.16b, v3.16b\n"
+ "smax v19.16b, v28.16b, v22.16b\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q4, [x21, x27]\n"
+ "ldr q3, [x20, x27]\n"
+ "smax v22.16b, v2.16b, v1.16b\n"
+ "ldr q2, [x21, x26]\n"
+ "smax v18.16b, v27.16b, v21.16b\n"
+ "ldr q1, [x20, x26]\n"
+ "smax v21.16b, v0.16b, v31.16b\n"
+ "ldr q0, [x21, x24]\n"
+ "smax v17.16b, v26.16b, v20.16b\n"
+ "ldr q31, [x20, x24]\n"
+ "smax v20.16b, v30.16b, v29.16b\n"
+ "ldr q30, [x21, x23]\n"
+ "smax v16.16b, v25.16b, v24.16b\n"
+ "ldr q29, [x20, x23]\n"
"smax v19.16b, v23.16b, v19.16b\n"
- "ldr q30, [x22, x27]\n"
"smax v18.16b, v22.16b, v18.16b\n"
- "ldr q29, [x21, x27]\n"
+ "ldp x21, x20, [x22, #0x10]\n"
+ "ldr q28, [x21, x27]\n"
+ "ldr q22, [x20, x27]\n"
"smax v17.16b, v21.16b, v17.16b\n"
- "ldr q28, [x20, x27]\n"
"smax v16.16b, v20.16b, v16.16b\n"
- "ldr q27, [x23, x26]\n"
- "smax v7.16b, v7.16b, v19.16b\n"
- "ldr q21, [x22, x26]\n"
- "smax v6.16b, v6.16b, v18.16b\n"
- "ldr q26, [x21, x26]\n"
- "smax v5.16b, v5.16b, v17.16b\n"
- "ldr q17, [x20, x26]\n"
- "smax v4.16b, v4.16b, v16.16b\n"
- "ldr q25, [x23, x25]\n"
- "ldr q20, [x22, x25]\n"
- "ldr q24, [x21, x25]\n"
- "ldr q16, [x20, x25]\n"
+ "ldr q27, [x21, x26]\n"
+ "ldr q21, [x20, x26]\n"
+ "subs x25, x25, #0x1\n"
+ "smax v8.16b, v8.16b, v19.16b\n"
+ "ldr q26, [x21, x24]\n"
+ "ldr q20, [x20, x24]\n"
+ "smax v7.16b, v7.16b, v18.16b\n"
+ "smax v6.16b, v6.16b, v17.16b\n"
+ "ldr q25, [x21, x23]\n"
+ "ldr q24, [x20, x23]\n"
+ "smax v5.16b, v5.16b, v16.16b\n"
+ "add x22, x22, #0x20\n"
"bgt 2b\n"
"3:" // 4-vectors of channels: 4 inputs tail
- "smax v23.16b, v3.16b, v2.16b\n"
- "smax v19.16b, v1.16b, v0.16b\n"
- "smax v22.16b, v31.16b, v30.16b\n"
- "smax v18.16b, v29.16b, v28.16b\n"
- "smax v21.16b, v27.16b, v21.16b\n"
- "smax v17.16b, v26.16b, v17.16b\n"
- "smax v20.16b, v25.16b, v20.16b\n"
- "smax v16.16b, v24.16b, v16.16b\n"
+ "smax v23.16b, v4.16b, v3.16b\n"
+ "smax v19.16b, v28.16b, v22.16b\n"
+ "smax v22.16b, v2.16b, v1.16b\n"
+ "smax v18.16b, v27.16b, v21.16b\n"
+ "smax v21.16b, v0.16b, v31.16b\n"
+ "smax v17.16b, v26.16b, v20.16b\n"
+ "smax v20.16b, v30.16b, v29.16b\n"
+ "smax v16.16b, v25.16b, v24.16b\n"
"smax v19.16b, v23.16b, v19.16b\n"
"smax v18.16b, v22.16b, v18.16b\n"
"smax v17.16b, v21.16b, v17.16b\n"
"smax v16.16b, v20.16b, v16.16b\n"
- "smax v7.16b, v7.16b, v19.16b\n"
- "smax v6.16b, v6.16b, v18.16b\n"
- "smax v5.16b, v5.16b, v17.16b\n"
- "smax v4.16b, v4.16b, v16.16b\n"
+ "smax v8.16b, v8.16b, v19.16b\n"
+ "smax v7.16b, v7.16b, v18.16b\n"
+ "smax v6.16b, v6.16b, v17.16b\n"
+ "smax v5.16b, v5.16b, v16.16b\n"
"4:" // 4-vectors of channels: After loop
- "ands x20, %x[n_valid_cells], #0x3\n"
+ "ands x21, %x[n_valid_cells], #0x3\n"
"beq 6f\n"
"5:" // 4-vectors of channels: Single input loop
- "ldr x23, [x19], #0x8\n"
- "subs x20, x20, #0x1\n"
- "ldr q3, [x23, x28]\n"
- "smax v7.16b, v7.16b, v3.16b\n"
- "ldr q31, [x23, x27]\n"
- "ldr q27, [x23, x26]\n"
- "smax v6.16b, v6.16b, v31.16b\n"
- "ldr q25, [x23, x25]\n"
- "smax v5.16b, v5.16b, v27.16b\n"
- "smax v4.16b, v4.16b, v25.16b\n"
+ "ldr x20, [x22], #0x8\n"
+ "ldr q16, [x20, x27]\n"
+ "subs x21, x21, #0x1\n"
+ "smax v8.16b, v8.16b, v16.16b\n"
+ "ldr q17, [x20, x26]\n"
+ "ldr q16, [x20, x24]\n"
+ "smax v7.16b, v7.16b, v17.16b\n"
+ "smax v6.16b, v6.16b, v16.16b\n"
+ "ldr q16, [x20, x23]\n"
+ "smax v5.16b, v5.16b, v16.16b\n"
"bgt 5b\n"
"6:" // 4-vectors of channels: Single input loop: End
- "str q7, [%x[outptr], x28]\n"
- "add x28, x28, #0x40\n"
- "str q6, [%x[outptr], x27]\n"
- "add x27, x27, #0x40\n"
- "str q5, [%x[outptr], x26]\n"
- "add x26, x26, #0x40\n"
- "str q4, [%x[outptr], x25]\n"
- "add x25, x25, #0x40\n"
"sub %x[n_channels], %x[n_channels], #0x40\n"
"cmp %x[n_channels], #0x40\n"
+ "str q8, [%x[outptr], x27]\n"
+ "str q7, [%x[outptr], x26]\n"
+ "add x27, x27, #0x40\n"
+ "add x26, x26, #0x40\n"
+ "str q6, [%x[outptr], x24]\n"
+ "add x24, x24, #0x40\n"
+ "str q5, [%x[outptr], x23]\n"
+ "add x23, x23, #0x40\n"
"bge 1b\n"
"cbz %x[n_channels], 43f\n"
"7:" // Single vector of channels
"cmp %x[n_channels], #0x10\n"
"blt 14f\n"
"8:" // Single vector of channels: Loop
- "movi v7.16b, #0x80\n"
- "mov x19, %x[inptrs]\n"
- "lsr x24, %x[n_valid_cells], #0x2\n"
- "cbz x24, 11f\n"
- "ldp x23, x22, [x19, #0x0]\n"
- "ldp x21, x20, [x19, #0x10]\n"
- "add x19, x19, #0x20\n"
- "subs x24, x24, #0x1\n"
- "ldr q3, [x23, x28]\n"
- "ldr q2, [x22, x28]\n"
- "ldr q1, [x21, x28]\n"
- "ldr q0, [x20, x28]\n"
+ "lsr x25, %x[n_valid_cells], #0x2\n"
+ "movi v8.16b, #0x80\n"
+ "mov x22, %x[inptrs]\n"
+ "cbz x25, 11f\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q4, [x21, x27]\n"
+ "subs x25, x25, #0x1\n"
+ "ldr q3, [x20, x27]\n"
+ "ldp x21, x20, [x22, #0x10]\n"
+ "add x22, x22, #0x20\n"
+ "ldr q28, [x21, x27]\n"
+ "ldr q22, [x20, x27]\n"
"beq 10f\n"
"9:" // Single vector of channels: Loop: 4 inputs loop
- "smax v23.16b, v3.16b, v2.16b\n"
- "ldp x23, x22, [x19, #0x0]\n"
- "subs x24, x24, #0x1\n"
- "smax v19.16b, v1.16b, v0.16b\n"
- "ldp x21, x20, [x19, #0x10]\n"
- "add x19, x19, #0x20\n"
- "smax v19.16b, v23.16b, v19.16b\n"
- "ldr q3, [x23, x28]\n"
- "ldr q2, [x22, x28]\n"
- "smax v7.16b, v7.16b, v19.16b\n"
- "ldr q1, [x21, x28]\n"
- "ldr q0, [x20, x28]\n"
+ "smax v17.16b, v4.16b, v3.16b\n"
+ "smax v16.16b, v28.16b, v22.16b\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q4, [x21, x27]\n"
+ "ldr q3, [x20, x27]\n"
+ "smax v16.16b, v17.16b, v16.16b\n"
+ "ldp x21, x20, [x22, #0x10]\n"
+ "subs x25, x25, #0x1\n"
+ "ldr q28, [x21, x27]\n"
+ "ldr q22, [x20, x27]\n"
+ "smax v8.16b, v8.16b, v16.16b\n"
+ "add x22, x22, #0x20\n"
"bgt 9b\n"
"10:" // Single vector of channels: Loop: 4 inputs tail
- "smax v23.16b, v3.16b, v2.16b\n"
- "smax v19.16b, v1.16b, v0.16b\n"
- "smax v19.16b, v23.16b, v19.16b\n"
- "smax v7.16b, v7.16b, v19.16b\n"
+ "smax v17.16b, v4.16b, v3.16b\n"
+ "smax v16.16b, v28.16b, v22.16b\n"
+ "smax v16.16b, v17.16b, v16.16b\n"
+ "smax v8.16b, v8.16b, v16.16b\n"
"11:" // Single vector of channels: Loop: After loop
- "ands x20, %x[n_valid_cells], #0x3\n"
+ "ands x21, %x[n_valid_cells], #0x3\n"
"beq 13f\n"
"12:" // Single vector of channels: Loop: Single input loop
- "ldr x23, [x19], #0x8\n"
- "subs x20, x20, #0x1\n"
- "ldr q3, [x23, x28]\n"
- "smax v7.16b, v7.16b, v3.16b\n"
+ "ldr x20, [x22], #0x8\n"
+ "ldr q16, [x20, x27]\n"
+ "subs x21, x21, #0x1\n"
+ "smax v8.16b, v8.16b, v16.16b\n"
"bgt 12b\n"
"13:" // Single vector of channels: Loop: Single input loop: End
- "str q7, [%x[outptr], x28]\n"
- "add x28, x28, #0x10\n"
"sub %x[n_channels], %x[n_channels], #0x10\n"
"cmp %x[n_channels], #0x10\n"
+ "str q8, [%x[outptr], x27]\n"
+ "add x27, x27, #0x10\n"
"bge 8b\n"
"cbz %x[n_channels], 43f\n"
"14:" // Oddments
- "movi v7.16b, #0x80\n"
- "add %x[outptr], %x[outptr], x28\n"
- "mov x19, %x[inptrs]\n"
- "lsr x24, %x[n_valid_cells], #0x2\n"
- "cbz x24, 24f\n"
+ "lsr x25, %x[n_valid_cells], #0x2\n"
+ "add %x[outptr], %x[outptr], x27\n"
+ "movi v8.16b, #0x80\n"
+ "mov x24, %x[inptrs]\n"
+ "cbz x25, 24f\n"
"15:" // Oddments: 4 inputs loop
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x24, x24, #0x20\n"
+ "add x23, x23, x27\n"
+ "add x22, x22, x27\n"
+ "add x21, x21, x27\n"
+ "movi v4.16b, #0x0\n"
"movi v3.16b, #0x0\n"
- "ldp x23, x22, [x19, #0x0]\n"
- "add x23, x23, x28\n"
- "movi v2.16b, #0x0\n"
- "ldp x21, x20, [x19, #0x10]\n"
- "movi v1.16b, #0x0\n"
- "add x19, x19, #0x20\n"
- "movi v0.16b, #0x0\n"
- "add x22, x22, x28\n"
- "add x21, x21, x28\n"
- "add x20, x20, x28\n"
+ "add x20, x20, x27\n"
+ "movi v28.16b, #0x0\n"
+ "movi v22.16b, #0x0\n"
"tbz %x[n_channels], #3, 19f\n"
- "ldr d3, [x23], #0x8\n"
- "ldr d2, [x22], #0x8\n"
- "ldr d1, [x21], #0x8\n"
- "ldr d0, [x20], #0x8\n"
+ "ldr d4, [x23], #0x8\n"
+ "ldr d3, [x22], #0x8\n"
+ "ldr d28, [x21], #0x8\n"
+ "ldr d22, [x20], #0x8\n"
"tbz %x[n_channels], #2, 17f\n"
- "ld1 { v3.s }[2], [x23], #0x4\n"
- "ld1 { v2.s }[2], [x22], #0x4\n"
- "ld1 { v1.s }[2], [x21], #0x4\n"
- "ld1 { v0.s }[2], [x20], #0x4\n"
+ "ld1 { v4.s }[2], [x23], #0x4\n"
+ "ld1 { v3.s }[2], [x22], #0x4\n"
+ "ld1 { v28.s }[2], [x21], #0x4\n"
+ "ld1 { v22.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #1, 16f\n"
- "ld1 { v3.h }[6], [x23], #0x2\n"
- "ld1 { v2.h }[6], [x22], #0x2\n"
- "ld1 { v1.h }[6], [x21], #0x2\n"
- "ld1 { v0.h }[6], [x20], #0x2\n"
+ "ld1 { v4.h }[6], [x23], #0x2\n"
+ "ld1 { v3.h }[6], [x22], #0x2\n"
+ "ld1 { v28.h }[6], [x21], #0x2\n"
+ "ld1 { v22.h }[6], [x20], #0x2\n"
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v3.b }[14], [x23], #0x1\n"
- "ld1 { v2.b }[14], [x22], #0x1\n"
- "ld1 { v1.b }[14], [x21], #0x1\n"
- "ld1 { v0.b }[14], [x20], #0x1\n"
+ "ld1 { v4.b }[14], [x23], #0x1\n"
+ "ld1 { v3.b }[14], [x22], #0x1\n"
+ "ld1 { v28.b }[14], [x21], #0x1\n"
+ "ld1 { v22.b }[14], [x20], #0x1\n"
"b 23f\n"
"16:" // Oddments: 4 inputs loop: Load: Bit 3: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v3.b }[12], [x23], #0x1\n"
- "ld1 { v2.b }[12], [x22], #0x1\n"
- "ld1 { v1.b }[12], [x21], #0x1\n"
- "ld1 { v0.b }[12], [x20], #0x1\n"
+ "ld1 { v4.b }[12], [x23], #0x1\n"
+ "ld1 { v3.b }[12], [x22], #0x1\n"
+ "ld1 { v28.b }[12], [x21], #0x1\n"
+ "ld1 { v22.b }[12], [x20], #0x1\n"
"b 23f\n"
"17:" // Oddments: 4 inputs loop: Load: Bit 3: Bit 2: Unset
"tbz %x[n_channels], #1, 18f\n"
- "ld1 { v3.h }[4], [x23], #0x2\n"
- "ld1 { v2.h }[4], [x22], #0x2\n"
- "ld1 { v1.h }[4], [x21], #0x2\n"
- "ld1 { v0.h }[4], [x20], #0x2\n"
+ "ld1 { v4.h }[4], [x23], #0x2\n"
+ "ld1 { v3.h }[4], [x22], #0x2\n"
+ "ld1 { v28.h }[4], [x21], #0x2\n"
+ "ld1 { v22.h }[4], [x20], #0x2\n"
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v3.b }[10], [x23], #0x1\n"
- "ld1 { v2.b }[10], [x22], #0x1\n"
- "ld1 { v1.b }[10], [x21], #0x1\n"
- "ld1 { v0.b }[10], [x20], #0x1\n"
+ "ld1 { v4.b }[10], [x23], #0x1\n"
+ "ld1 { v3.b }[10], [x22], #0x1\n"
+ "ld1 { v28.b }[10], [x21], #0x1\n"
+ "ld1 { v22.b }[10], [x20], #0x1\n"
"b 23f\n"
"18:" // Oddments: 4 inputs loop: Load: Bit 3: Bit 2: Unset: Bit 1: Unset
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v3.b }[8], [x23], #0x1\n"
- "ld1 { v2.b }[8], [x22], #0x1\n"
- "ld1 { v1.b }[8], [x21], #0x1\n"
- "ld1 { v0.b }[8], [x20], #0x1\n"
+ "ld1 { v4.b }[8], [x23], #0x1\n"
+ "ld1 { v3.b }[8], [x22], #0x1\n"
+ "ld1 { v28.b }[8], [x21], #0x1\n"
+ "ld1 { v22.b }[8], [x20], #0x1\n"
"b 23f\n"
"19:" // Oddments: 4 inputs loop: Load: Bit 3: Unset
"tbz %x[n_channels], #2, 21f\n"
- "ldr s3, [x23], #0x4\n"
- "ldr s2, [x22], #0x4\n"
- "ldr s1, [x21], #0x4\n"
- "ldr s0, [x20], #0x4\n"
+ "ldr s4, [x23], #0x4\n"
+ "ldr s3, [x22], #0x4\n"
+ "ldr s28, [x21], #0x4\n"
+ "ldr s22, [x20], #0x4\n"
"tbz %x[n_channels], #1, 20f\n"
- "ld1 { v3.h }[2], [x23], #0x2\n"
- "ld1 { v2.h }[2], [x22], #0x2\n"
- "ld1 { v1.h }[2], [x21], #0x2\n"
- "ld1 { v0.h }[2], [x20], #0x2\n"
+ "ld1 { v4.h }[2], [x23], #0x2\n"
+ "ld1 { v3.h }[2], [x22], #0x2\n"
+ "ld1 { v28.h }[2], [x21], #0x2\n"
+ "ld1 { v22.h }[2], [x20], #0x2\n"
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v3.b }[6], [x23], #0x1\n"
- "ld1 { v2.b }[6], [x22], #0x1\n"
- "ld1 { v1.b }[6], [x21], #0x1\n"
- "ld1 { v0.b }[6], [x20], #0x1\n"
+ "ld1 { v4.b }[6], [x23], #0x1\n"
+ "ld1 { v3.b }[6], [x22], #0x1\n"
+ "ld1 { v28.b }[6], [x21], #0x1\n"
+ "ld1 { v22.b }[6], [x20], #0x1\n"
"b 23f\n"
"20:" // Oddments: 4 inputs loop: Load: Bit 3: Unset: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v3.b }[4], [x23], #0x1\n"
- "ld1 { v2.b }[4], [x22], #0x1\n"
- "ld1 { v1.b }[4], [x21], #0x1\n"
- "ld1 { v0.b }[4], [x20], #0x1\n"
+ "ld1 { v4.b }[4], [x23], #0x1\n"
+ "ld1 { v3.b }[4], [x22], #0x1\n"
+ "ld1 { v28.b }[4], [x21], #0x1\n"
+ "ld1 { v22.b }[4], [x20], #0x1\n"
"b 23f\n"
"21:" // Oddments: 4 inputs loop: Load: Bit 3: Unset: Bit 2: Unset
"tbz %x[n_channels], #1, 22f\n"
- "ldr h3, [x23], #0x2\n"
- "ldr h2, [x22], #0x2\n"
- "ldr h1, [x21], #0x2\n"
- "ldr h0, [x20], #0x2\n"
+ "ldr h4, [x23], #0x2\n"
+ "ldr h3, [x22], #0x2\n"
+ "ldr h28, [x21], #0x2\n"
+ "ldr h22, [x20], #0x2\n"
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v3.b }[2], [x23], #0x1\n"
- "ld1 { v2.b }[2], [x22], #0x1\n"
- "ld1 { v1.b }[2], [x21], #0x1\n"
- "ld1 { v0.b }[2], [x20], #0x1\n"
+ "ld1 { v4.b }[2], [x23], #0x1\n"
+ "ld1 { v3.b }[2], [x22], #0x1\n"
+ "ld1 { v28.b }[2], [x21], #0x1\n"
+ "ld1 { v22.b }[2], [x20], #0x1\n"
"b 23f\n"
"22:" // Oddments: 4 inputs loop: Load: Bit 3: Unset: Bit 2: Unset: Bit 1: Unset
"tbz %x[n_channels], #0, 23f\n"
- "ldr b3, [x23], #0x1\n"
- "ldr b2, [x22], #0x1\n"
- "ldr b1, [x21], #0x1\n"
- "ldr b0, [x20], #0x1\n"
+ "ldr b4, [x23], #0x1\n"
+ "ldr b3, [x22], #0x1\n"
+ "ldr b28, [x21], #0x1\n"
+ "ldr b22, [x20], #0x1\n"
"23:" // Oddments: 4 inputs loop: Load: Bit 3: End
- "smax v23.16b, v3.16b, v2.16b\n"
- "subs x24, x24, #0x1\n"
- "smax v19.16b, v1.16b, v0.16b\n"
- "smax v19.16b, v23.16b, v19.16b\n"
- "smax v7.16b, v7.16b, v19.16b\n"
+ "smax v17.16b, v4.16b, v3.16b\n"
+ "smax v16.16b, v28.16b, v22.16b\n"
+ "subs x25, x25, #0x1\n"
+ "smax v16.16b, v17.16b, v16.16b\n"
+ "smax v8.16b, v8.16b, v16.16b\n"
"bgt 15b\n"
"24:" // Oddments: After loop
- "ands x20, %x[n_valid_cells], #0x3\n"
+ "ands x21, %x[n_valid_cells], #0x3\n"
"beq 34f\n"
"25:" // Oddments: Single input loop
- "movi v3.16b, #0x0\n"
- "ldr x23, [x19], #0x8\n"
- "add x23, x23, x28\n"
+ "ldr x23, [x24], #0x8\n"
+ "add x23, x23, x27\n"
+ "movi v4.16b, #0x0\n"
"tbz %x[n_channels], #3, 29f\n"
- "ldr d3, [x23], #0x8\n"
+ "ldr d4, [x23], #0x8\n"
"tbz %x[n_channels], #2, 27f\n"
- "ld1 { v3.s }[2], [x23], #0x4\n"
+ "ld1 { v4.s }[2], [x23], #0x4\n"
"tbz %x[n_channels], #1, 26f\n"
- "ld1 { v3.h }[6], [x23], #0x2\n"
+ "ld1 { v4.h }[6], [x23], #0x2\n"
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v3.b }[14], [x23], #0x1\n"
+ "ld1 { v4.b }[14], [x23], #0x1\n"
"b 33f\n"
"26:" // Oddments: Single input loop: Load: Bit 3: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v3.b }[12], [x23], #0x1\n"
+ "ld1 { v4.b }[12], [x23], #0x1\n"
"b 33f\n"
"27:" // Oddments: Single input loop: Load: Bit 3: Bit 2: Unset
"tbz %x[n_channels], #1, 28f\n"
- "ld1 { v3.h }[4], [x23], #0x2\n"
+ "ld1 { v4.h }[4], [x23], #0x2\n"
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v3.b }[10], [x23], #0x1\n"
+ "ld1 { v4.b }[10], [x23], #0x1\n"
"b 33f\n"
"28:" // Oddments: Single input loop: Load: Bit 3: Bit 2: Unset: Bit 1: Unset
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v3.b }[8], [x23], #0x1\n"
+ "ld1 { v4.b }[8], [x23], #0x1\n"
"b 33f\n"
"29:" // Oddments: Single input loop: Load: Bit 3: Unset
"tbz %x[n_channels], #2, 31f\n"
- "ldr s3, [x23], #0x4\n"
+ "ldr s4, [x23], #0x4\n"
"tbz %x[n_channels], #1, 30f\n"
- "ld1 { v3.h }[2], [x23], #0x2\n"
+ "ld1 { v4.h }[2], [x23], #0x2\n"
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v3.b }[6], [x23], #0x1\n"
+ "ld1 { v4.b }[6], [x23], #0x1\n"
"b 33f\n"
"30:" // Oddments: Single input loop: Load: Bit 3: Unset: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v3.b }[4], [x23], #0x1\n"
+ "ld1 { v4.b }[4], [x23], #0x1\n"
"b 33f\n"
"31:" // Oddments: Single input loop: Load: Bit 3: Unset: Bit 2: Unset
"tbz %x[n_channels], #1, 32f\n"
- "ldr h3, [x23], #0x2\n"
+ "ldr h4, [x23], #0x2\n"
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v3.b }[2], [x23], #0x1\n"
+ "ld1 { v4.b }[2], [x23], #0x1\n"
"b 33f\n"
"32:" // Oddments: Single input loop: Load: Bit 3: Unset: Bit 2: Unset: Bit 1: Unset
"tbz %x[n_channels], #0, 33f\n"
- "ldr b3, [x23], #0x1\n"
+ "ldr b4, [x23], #0x1\n"
"33:" // Oddments: Single input loop: Load: Bit 3: End
- "smax v7.16b, v7.16b, v3.16b\n"
- "subs x20, x20, #0x1\n"
+ "subs x21, x21, #0x1\n"
+ "smax v8.16b, v8.16b, v4.16b\n"
"bgt 25b\n"
"34:" // Oddments: Single input loop: End
"tbz %x[n_channels], #3, 38f\n"
- "st1 { v7.d }[0], [%x[outptr]], #0x8\n"
+ "st1 { v8.d }[0], [%x[outptr]], #0x8\n"
"tbz %x[n_channels], #2, 36f\n"
- "st1 { v7.s }[2], [%x[outptr]], #0x4\n"
+ "st1 { v8.s }[2], [%x[outptr]], #0x4\n"
"tbz %x[n_channels], #1, 35f\n"
- "st1 { v7.h }[6], [%x[outptr]], #0x2\n"
+ "st1 { v8.h }[6], [%x[outptr]], #0x2\n"
"tbz %x[n_channels], #0, 42f\n"
- "st1 { v7.b }[14], [%x[outptr]], #0x1\n"
+ "st1 { v8.b }[14], [%x[outptr]], #0x1\n"
"b 42f\n"
"35:" // Oddments: Store: Bit 3: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 42f\n"
- "st1 { v7.b }[12], [%x[outptr]], #0x1\n"
+ "st1 { v8.b }[12], [%x[outptr]], #0x1\n"
"b 42f\n"
"36:" // Oddments: Store: Bit 3: Bit 2: Unset
"tbz %x[n_channels], #1, 37f\n"
- "st1 { v7.h }[4], [%x[outptr]], #0x2\n"
+ "st1 { v8.h }[4], [%x[outptr]], #0x2\n"
"tbz %x[n_channels], #0, 42f\n"
- "st1 { v7.b }[10], [%x[outptr]], #0x1\n"
+ "st1 { v8.b }[10], [%x[outptr]], #0x1\n"
"b 42f\n"
"37:" // Oddments: Store: Bit 3: Bit 2: Unset: Bit 1: Unset
"tbz %x[n_channels], #0, 42f\n"
- "st1 { v7.b }[8], [%x[outptr]], #0x1\n"
+ "st1 { v8.b }[8], [%x[outptr]], #0x1\n"
"b 42f\n"
"38:" // Oddments: Store: Bit 3: Unset
"tbz %x[n_channels], #2, 40f\n"
- "st1 { v7.s }[0], [%x[outptr]], #0x4\n"
+ "st1 { v8.s }[0], [%x[outptr]], #0x4\n"
"tbz %x[n_channels], #1, 39f\n"
- "st1 { v7.h }[2], [%x[outptr]], #0x2\n"
+ "st1 { v8.h }[2], [%x[outptr]], #0x2\n"
"tbz %x[n_channels], #0, 42f\n"
- "st1 { v7.b }[6], [%x[outptr]], #0x1\n"
+ "st1 { v8.b }[6], [%x[outptr]], #0x1\n"
"b 42f\n"
"39:" // Oddments: Store: Bit 3: Unset: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 42f\n"
- "st1 { v7.b }[4], [%x[outptr]], #0x1\n"
+ "st1 { v8.b }[4], [%x[outptr]], #0x1\n"
"b 42f\n"
"40:" // Oddments: Store: Bit 3: Unset: Bit 2: Unset
"tbz %x[n_channels], #1, 41f\n"
- "st1 { v7.h }[0], [%x[outptr]], #0x2\n"
+ "st1 { v8.h }[0], [%x[outptr]], #0x2\n"
"tbz %x[n_channels], #0, 42f\n"
- "st1 { v7.b }[2], [%x[outptr]], #0x1\n"
+ "st1 { v8.b }[2], [%x[outptr]], #0x1\n"
"b 42f\n"
"41:" // Oddments: Store: Bit 3: Unset: Bit 2: Unset: Bit 1: Unset
"tbz %x[n_channels], #0, 42f\n"
- "st1 { v7.b }[0], [%x[outptr]], #0x1\n"
+ "st1 { v8.b }[0], [%x[outptr]], #0x1\n"
"42:" // Oddments: Store: Bit 3: End
-
"43:" // End
-
: [n_channels] "+&r" (n_channels), [outptr] "+&r" (outptr)
: [inptrs] "r" (inptrs), [n_valid_cells] "r" (n_valid_cells)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8q_nhwc_avg_generic_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8q_nhwc_avg_generic_depthfirst.hpp
index a50e99a009..d5d7313a90 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8q_nhwc_avg_generic_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8q_nhwc_avg_generic_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -33,19 +33,11 @@ namespace pooling {
void a64_s8q_nhwc_avg_generic_depthfirst_impl(const uint64_t window_cells, const uint64_t n_valid_cells, uint64_t n_channels, const int8_t *const *const inptrs, int8_t *outptr, const Requantize32 &qp);
-struct a64_s8q_nhwc_avg_generic_depthfirst
+struct a64_s8q_nhwc_avg_generic_depthfirst : IGenericDepthfirstStrategy<int8_t, int8_t, Requantize32>
{
- typedef int8_t operand_type;
- typedef int8_t return_type;
-
- typedef void (*kern_type)(const uint64_t window_cells, const uint64_t n_valid_cells, uint64_t n_channels, const int8_t *const *const inptrs, int8_t *outptr, const Requantize32 &qp);
-
- constexpr static PoolingType pooling_type(void) { return PoolingType::AVERAGE; }
-
-
- kern_type kernel = a64_s8q_nhwc_avg_generic_depthfirst_impl;
-
+ using Parent = IGenericDepthfirstStrategy<int8_t, int8_t, Requantize32>;
a64_s8q_nhwc_avg_generic_depthfirst(const CPUInfo *) {}
+ typename Parent::KernelType get_kernel(void) const override { return a64_s8q_nhwc_avg_generic_depthfirst_impl; }
};
} // namespace pooling
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8q_nhwc_avg_generic_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8q_nhwc_avg_generic_depthfirst/generic.cpp
index f288a4119c..019f402911 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8q_nhwc_avg_generic_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8q_nhwc_avg_generic_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,6 +24,7 @@
#include "pooling.hpp"
#include <cstdint>
+#include <cstddef>
#include <cstring>
#include <cmath>
@@ -86,12 +87,13 @@ void a64_s8q_nhwc_avg_generic_depthfirst_impl(
f_rescale_value *= 2.0f;
}
- rescale_value = static_cast<int32_t>(round(f_rescale_value * static_cast<float>(1ll << 31)));
- if (static_cast<int64_t>(rescale_value) == (1ll << 31))
+ int64_t long_rescale_value = round(f_rescale_value * static_cast<float>(1ll << 31));
+ if (long_rescale_value == (1ll << 31))
{
shift_value++;
- rescale_value >>= 1;
+ long_rescale_value >>= 1;
}
+ rescale_value = static_cast<int32_t>(long_rescale_value);
}
// Combine together the rescale value for the requantization and the scaling
@@ -112,17 +114,17 @@ void a64_s8q_nhwc_avg_generic_depthfirst_impl(
);
__asm__ __volatile__(
- "mov x26, #0x0\n"
- "mov x25, #0x10\n" // cntb _, ALL, #1
- "mov x24, #0x20\n" // cntb _, ALL, #2
- "mov x23, #0x30\n" // cntb _, ALL, #3
"cmp %x[n_channels], #0x40\n"
+ "mov x27, #0x0\n"
+ "mov x26, #0x10\n" // cntb _, ALL, #1
+ "mov x25, #0x20\n" // cntb _, ALL, #2
+ "mov x24, #0x30\n" // cntb _, ALL, #3
"blt 7f\n"
"1:" // 4-vectors of channels
+ "lsr x23, %x[n_valid_cells], #0x1\n"
"movi v15.4s, #0x0\n"
- "mov x19, %x[inptrs]\n"
"movi v14.4s, #0x0\n"
- "lsr x22, %x[n_valid_cells], #0x1\n"
+ "mov x22, %x[inptrs]\n"
"movi v13.4s, #0x0\n"
"movi v12.4s, #0x0\n"
"movi v11.4s, #0x0\n"
@@ -137,43 +139,43 @@ void a64_s8q_nhwc_avg_generic_depthfirst_impl(
"movi v2.4s, #0x0\n"
"movi v1.4s, #0x0\n"
"movi v0.4s, #0x0\n"
- "cbz x22, 4f\n"
- "ldp x21, x20, [x19, #0x0]\n"
- "ldr q31, [x21, x26]\n"
- "add x19, x19, #0x10\n"
- "ldr q30, [x20, x26]\n"
- "subs x22, x22, #0x1\n"
- "ldr q29, [x21, x25]\n"
- "ldr q28, [x20, x25]\n"
- "ldr q27, [x21, x24]\n"
- "ldr q26, [x20, x24]\n"
- "ldr q25, [x21, x23]\n"
- "ldr q24, [x20, x23]\n"
+ "cbz x23, 4f\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q31, [x21, x27]\n"
+ "subs x23, x23, #0x1\n"
+ "add x22, x22, #0x10\n"
+ "ldr q30, [x20, x27]\n"
+ "ldr q29, [x21, x26]\n"
+ "ldr q28, [x20, x26]\n"
+ "ldr q27, [x21, x25]\n"
+ "ldr q26, [x20, x25]\n"
+ "ldr q25, [x21, x24]\n"
+ "ldr q24, [x20, x24]\n"
"beq 3f\n"
"2:" // 4-vectors of channels: 2 inputs loop
"saddl v23.8h, v31.8b, v30.8b\n"
- "ldp x21, x20, [x19, #0x0]\n"
- "add x19, x19, #0x10\n"
"saddl2 v22.8h, v31.16b, v30.16b\n"
- "ldr q31, [x21, x26]\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q31, [x21, x27]\n"
+ "ldr q30, [x20, x27]\n"
"saddl v21.8h, v29.8b, v28.8b\n"
- "subs x22, x22, #0x1\n"
"saddl2 v20.8h, v29.16b, v28.16b\n"
- "ldr q30, [x20, x26]\n"
+ "ldr q29, [x21, x26]\n"
+ "ldr q28, [x20, x26]\n"
"saddl v19.8h, v27.8b, v26.8b\n"
- "ldr q29, [x21, x25]\n"
"saddl2 v18.8h, v27.16b, v26.16b\n"
- "ldr q28, [x20, x25]\n"
+ "ldr q27, [x21, x25]\n"
+ "ldr q26, [x20, x25]\n"
"saddl v17.8h, v25.8b, v24.8b\n"
- "ldr q27, [x21, x24]\n"
"saddl2 v16.8h, v25.16b, v24.16b\n"
- "ldr q26, [x20, x24]\n"
+ "ldr q25, [x21, x24]\n"
+ "ldr q24, [x20, x24]\n"
+ "subs x23, x23, #0x1\n"
"saddw v15.4s, v15.4s, v23.4h\n"
- "ldr q25, [x21, x23]\n"
"saddw2 v14.4s, v14.4s, v23.8h\n"
- "ldr q24, [x20, x23]\n"
"saddw v13.4s, v13.4s, v22.4h\n"
"saddw2 v12.4s, v12.4s, v22.8h\n"
+ "add x22, x22, #0x10\n"
"saddw v11.4s, v11.4s, v21.4h\n"
"saddw2 v10.4s, v10.4s, v21.8h\n"
"saddw v9.4s, v9.4s, v20.4h\n"
@@ -213,23 +215,23 @@ void a64_s8q_nhwc_avg_generic_depthfirst_impl(
"saddw v1.4s, v1.4s, v16.4h\n"
"saddw2 v0.4s, v0.4s, v16.8h\n"
"4:" // 4-vectors of channels: After loop
- "ands x20, %x[n_valid_cells], #0x1\n"
+ "ands x23, %x[n_valid_cells], #0x1\n"
"beq 6f\n"
"5:" // 4-vectors of channels: Single input loop
- "ldr x21, [x19], #0x8\n"
- "subs x20, x20, #0x1\n"
- "ldr q31, [x21, x26]\n"
- "sxtl v23.8h, v31.8b\n"
- "ldr q29, [x21, x25]\n"
- "sxtl2 v22.8h, v31.16b\n"
- "ldr q27, [x21, x24]\n"
- "ldr q25, [x21, x23]\n"
- "sxtl v21.8h, v29.8b\n"
- "sxtl2 v20.8h, v29.16b\n"
- "sxtl v19.8h, v27.8b\n"
- "sxtl2 v18.8h, v27.16b\n"
- "sxtl v17.8h, v25.8b\n"
- "sxtl2 v16.8h, v25.16b\n"
+ "ldr x20, [x22], #0x8\n"
+ "ldr q16, [x20, x27]\n"
+ "sxtl v23.8h, v16.8b\n"
+ "sxtl2 v22.8h, v16.16b\n"
+ "ldr q16, [x20, x26]\n"
+ "ldr q17, [x20, x25]\n"
+ "sxtl v21.8h, v16.8b\n"
+ "sxtl2 v20.8h, v16.16b\n"
+ "ldr q16, [x20, x24]\n"
+ "sxtl v19.8h, v17.8b\n"
+ "sxtl2 v18.8h, v17.16b\n"
+ "subs x23, x23, #0x1\n"
+ "sxtl v17.8h, v16.8b\n"
+ "sxtl2 v16.8h, v16.16b\n"
"saddw v15.4s, v15.4s, v23.4h\n"
"saddw2 v14.4s, v14.4s, v23.8h\n"
"saddw v13.4s, v13.4s, v22.4h\n"
@@ -248,217 +250,217 @@ void a64_s8q_nhwc_avg_generic_depthfirst_impl(
"saddw2 v0.4s, v0.4s, v16.8h\n"
"bgt 5b\n"
"6:" // 4-vectors of channels: Single input loop: End
- "movi v20.4s, #0x7f\n"
- "ld1r { v19.4s }, [%x[combined_rescale_value]]\n"
- "sub %x[n_channels], %x[n_channels], #0x40\n"
"ld1r { v18.4s }, [%x[left_shift]]\n"
+ "ld1r { v17.4s }, [%x[combined_rescale_value]]\n"
"srshl v15.4s, v15.4s, v18.4s\n"
- "ld1r { v17.4s }, [%x[right_shift]]\n"
- "not v16.16b, v20.16b\n"
"srshl v14.4s, v14.4s, v18.4s\n"
- "cmp %x[n_channels], #0x40\n"
+ "ld1r { v16.4s }, [%x[right_shift]]\n"
"srshl v13.4s, v13.4s, v18.4s\n"
"srshl v12.4s, v12.4s, v18.4s\n"
+ "sub %x[n_channels], %x[n_channels], #0x40\n"
"srshl v11.4s, v11.4s, v18.4s\n"
- "sqrdmulh v15.4s, v15.4s, v19.4s\n"
- "sqrdmulh v14.4s, v14.4s, v19.4s\n"
- "sqrdmulh v13.4s, v13.4s, v19.4s\n"
- "sqrdmulh v12.4s, v12.4s, v19.4s\n"
- "srshl v15.4s, v15.4s, v17.4s\n"
- "srshl v14.4s, v14.4s, v17.4s\n"
- "srshl v13.4s, v13.4s, v17.4s\n"
- "srshl v12.4s, v12.4s, v17.4s\n"
- "sqrdmulh v11.4s, v11.4s, v19.4s\n"
"srshl v10.4s, v10.4s, v18.4s\n"
+ "cmp %x[n_channels], #0x40\n"
"srshl v9.4s, v9.4s, v18.4s\n"
"srshl v8.4s, v8.4s, v18.4s\n"
- "srshl v11.4s, v11.4s, v17.4s\n"
- "sqrdmulh v10.4s, v10.4s, v19.4s\n"
- "sqrdmulh v9.4s, v9.4s, v19.4s\n"
- "sqrdmulh v8.4s, v8.4s, v19.4s\n"
"srshl v7.4s, v7.4s, v18.4s\n"
- "srshl v10.4s, v10.4s, v17.4s\n"
- "srshl v9.4s, v9.4s, v17.4s\n"
- "srshl v8.4s, v8.4s, v17.4s\n"
- "sqrdmulh v7.4s, v7.4s, v19.4s\n"
"srshl v6.4s, v6.4s, v18.4s\n"
"srshl v5.4s, v5.4s, v18.4s\n"
"srshl v4.4s, v4.4s, v18.4s\n"
- "srshl v7.4s, v7.4s, v17.4s\n"
- "sqrdmulh v6.4s, v6.4s, v19.4s\n"
- "sqrdmulh v5.4s, v5.4s, v19.4s\n"
- "sqrdmulh v4.4s, v4.4s, v19.4s\n"
"srshl v3.4s, v3.4s, v18.4s\n"
- "srshl v6.4s, v6.4s, v17.4s\n"
- "srshl v5.4s, v5.4s, v17.4s\n"
- "srshl v4.4s, v4.4s, v17.4s\n"
- "sqrdmulh v3.4s, v3.4s, v19.4s\n"
"srshl v2.4s, v2.4s, v18.4s\n"
"srshl v1.4s, v1.4s, v18.4s\n"
"srshl v0.4s, v0.4s, v18.4s\n"
- "srshl v3.4s, v3.4s, v17.4s\n"
- "sqrdmulh v2.4s, v2.4s, v19.4s\n"
- "sqrdmulh v1.4s, v1.4s, v19.4s\n"
- "sqrdmulh v0.4s, v0.4s, v19.4s\n"
+ "sqrdmulh v15.4s, v15.4s, v17.4s\n"
+ "sqrdmulh v14.4s, v14.4s, v17.4s\n"
+ "sqrdmulh v13.4s, v13.4s, v17.4s\n"
+ "sqrdmulh v12.4s, v12.4s, v17.4s\n"
+ "sqrdmulh v11.4s, v11.4s, v17.4s\n"
+ "sqrdmulh v10.4s, v10.4s, v17.4s\n"
+ "sqrdmulh v9.4s, v9.4s, v17.4s\n"
+ "sqrdmulh v8.4s, v8.4s, v17.4s\n"
+ "sqrdmulh v7.4s, v7.4s, v17.4s\n"
+ "sqrdmulh v6.4s, v6.4s, v17.4s\n"
+ "sqrdmulh v5.4s, v5.4s, v17.4s\n"
+ "sqrdmulh v4.4s, v4.4s, v17.4s\n"
+ "sqrdmulh v3.4s, v3.4s, v17.4s\n"
+ "sqrdmulh v2.4s, v2.4s, v17.4s\n"
+ "sqrdmulh v1.4s, v1.4s, v17.4s\n"
+ "sqrdmulh v0.4s, v0.4s, v17.4s\n"
+ "movi v17.4s, #0x7f\n"
+ "srshl v15.4s, v15.4s, v16.4s\n"
+ "srshl v14.4s, v14.4s, v16.4s\n"
+ "srshl v13.4s, v13.4s, v16.4s\n"
+ "srshl v12.4s, v12.4s, v16.4s\n"
+ "srshl v11.4s, v11.4s, v16.4s\n"
+ "srshl v10.4s, v10.4s, v16.4s\n"
+ "srshl v9.4s, v9.4s, v16.4s\n"
+ "srshl v8.4s, v8.4s, v16.4s\n"
+ "srshl v7.4s, v7.4s, v16.4s\n"
+ "srshl v6.4s, v6.4s, v16.4s\n"
+ "srshl v5.4s, v5.4s, v16.4s\n"
+ "srshl v4.4s, v4.4s, v16.4s\n"
+ "srshl v3.4s, v3.4s, v16.4s\n"
+ "srshl v2.4s, v2.4s, v16.4s\n"
+ "srshl v1.4s, v1.4s, v16.4s\n"
+ "srshl v0.4s, v0.4s, v16.4s\n"
+ "not v16.16b, v17.16b\n"
"smax v15.4s, v15.4s, v16.4s\n"
- "srshl v2.4s, v2.4s, v17.4s\n"
- "srshl v1.4s, v1.4s, v17.4s\n"
- "srshl v0.4s, v0.4s, v17.4s\n"
- "smin v15.4s, v15.4s, v20.4s\n"
"smax v14.4s, v14.4s, v16.4s\n"
"smax v13.4s, v13.4s, v16.4s\n"
"smax v12.4s, v12.4s, v16.4s\n"
- "smin v14.4s, v14.4s, v20.4s\n"
- "smin v13.4s, v13.4s, v20.4s\n"
- "smin v12.4s, v12.4s, v20.4s\n"
"smax v11.4s, v11.4s, v16.4s\n"
"smax v10.4s, v10.4s, v16.4s\n"
"smax v9.4s, v9.4s, v16.4s\n"
- "smin v11.4s, v11.4s, v20.4s\n"
- "smin v10.4s, v10.4s, v20.4s\n"
- "smin v9.4s, v9.4s, v20.4s\n"
"smax v8.4s, v8.4s, v16.4s\n"
"smax v7.4s, v7.4s, v16.4s\n"
"smax v6.4s, v6.4s, v16.4s\n"
- "smin v8.4s, v8.4s, v20.4s\n"
- "smin v7.4s, v7.4s, v20.4s\n"
- "smin v6.4s, v6.4s, v20.4s\n"
"smax v5.4s, v5.4s, v16.4s\n"
"smax v4.4s, v4.4s, v16.4s\n"
"smax v3.4s, v3.4s, v16.4s\n"
- "smin v5.4s, v5.4s, v20.4s\n"
- "smin v4.4s, v4.4s, v20.4s\n"
- "smin v3.4s, v3.4s, v20.4s\n"
"smax v2.4s, v2.4s, v16.4s\n"
"smax v1.4s, v1.4s, v16.4s\n"
"smax v0.4s, v0.4s, v16.4s\n"
- "smin v2.4s, v2.4s, v20.4s\n"
- "smin v1.4s, v1.4s, v20.4s\n"
- "smin v0.4s, v0.4s, v20.4s\n"
+ "smin v15.4s, v15.4s, v17.4s\n"
+ "smin v14.4s, v14.4s, v17.4s\n"
+ "smin v13.4s, v13.4s, v17.4s\n"
+ "smin v12.4s, v12.4s, v17.4s\n"
+ "smin v11.4s, v11.4s, v17.4s\n"
+ "smin v10.4s, v10.4s, v17.4s\n"
+ "smin v9.4s, v9.4s, v17.4s\n"
+ "smin v8.4s, v8.4s, v17.4s\n"
+ "smin v7.4s, v7.4s, v17.4s\n"
+ "smin v6.4s, v6.4s, v17.4s\n"
+ "smin v5.4s, v5.4s, v17.4s\n"
+ "smin v4.4s, v4.4s, v17.4s\n"
+ "smin v3.4s, v3.4s, v17.4s\n"
+ "smin v2.4s, v2.4s, v17.4s\n"
+ "smin v1.4s, v1.4s, v17.4s\n"
+ "smin v0.4s, v0.4s, v17.4s\n"
"uzp1 v23.16b, v15.16b, v14.16b\n"
"uzp1 v16.16b, v13.16b, v12.16b\n"
"uzp1 v22.16b, v11.16b, v10.16b\n"
- "uzp1 v21.16b, v9.16b, v8.16b\n"
- "uzp1 v20.16b, v7.16b, v6.16b\n"
+ "uzp1 v18.16b, v9.16b, v8.16b\n"
+ "uzp1 v21.16b, v7.16b, v6.16b\n"
"uzp1 v17.16b, v5.16b, v4.16b\n"
- "uzp1 v19.16b, v3.16b, v2.16b\n"
- "uzp1 v18.16b, v1.16b, v0.16b\n"
+ "uzp1 v20.16b, v3.16b, v2.16b\n"
+ "uzp1 v19.16b, v1.16b, v0.16b\n"
"uzp1 v16.16b, v23.16b, v16.16b\n"
- "str q16, [%x[outptr], x26]\n"
- "uzp1 v16.16b, v22.16b, v21.16b\n"
+ "uzp1 v18.16b, v22.16b, v18.16b\n"
+ "str q16, [%x[outptr], x27]\n"
+ "add x27, x27, #0x40\n"
+ "uzp1 v17.16b, v21.16b, v17.16b\n"
+ "uzp1 v16.16b, v20.16b, v19.16b\n"
+ "str q18, [%x[outptr], x26]\n"
"add x26, x26, #0x40\n"
- "uzp1 v17.16b, v20.16b, v17.16b\n"
- "str q16, [%x[outptr], x25]\n"
- "uzp1 v16.16b, v19.16b, v18.16b\n"
+ "str q17, [%x[outptr], x25]\n"
"add x25, x25, #0x40\n"
- "str q17, [%x[outptr], x24]\n"
+ "str q16, [%x[outptr], x24]\n"
"add x24, x24, #0x40\n"
- "str q16, [%x[outptr], x23]\n"
- "add x23, x23, #0x40\n"
"bge 1b\n"
"cbz %x[n_channels], 43f\n"
"7:" // Single vector of channels
"cmp %x[n_channels], #0x10\n"
"blt 14f\n"
"8:" // Single vector of channels: Loop
+ "lsr x23, %x[n_valid_cells], #0x1\n"
"movi v15.4s, #0x0\n"
- "mov x19, %x[inptrs]\n"
"movi v14.4s, #0x0\n"
- "lsr x22, %x[n_valid_cells], #0x1\n"
+ "mov x22, %x[inptrs]\n"
"movi v13.4s, #0x0\n"
"movi v12.4s, #0x0\n"
- "cbz x22, 11f\n"
- "ldp x21, x20, [x19, #0x0]\n"
- "ldr q31, [x21, x26]\n"
- "add x19, x19, #0x10\n"
- "ldr q30, [x20, x26]\n"
- "subs x22, x22, #0x1\n"
+ "cbz x23, 11f\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q31, [x21, x27]\n"
+ "subs x23, x23, #0x1\n"
+ "add x22, x22, #0x10\n"
+ "ldr q30, [x20, x27]\n"
"beq 10f\n"
"9:" // Single vector of channels: Loop: 2 inputs loop
- "saddl v23.8h, v31.8b, v30.8b\n"
- "ldp x21, x20, [x19, #0x0]\n"
- "add x19, x19, #0x10\n"
- "saddl2 v22.8h, v31.16b, v30.16b\n"
- "ldr q31, [x21, x26]\n"
- "saddw v15.4s, v15.4s, v23.4h\n"
- "subs x22, x22, #0x1\n"
- "saddw2 v14.4s, v14.4s, v23.8h\n"
- "ldr q30, [x20, x26]\n"
- "saddw v13.4s, v13.4s, v22.4h\n"
- "saddw2 v12.4s, v12.4s, v22.8h\n"
+ "saddl v17.8h, v31.8b, v30.8b\n"
+ "saddl2 v16.8h, v31.16b, v30.16b\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q31, [x21, x27]\n"
+ "ldr q30, [x20, x27]\n"
+ "subs x23, x23, #0x1\n"
+ "saddw v15.4s, v15.4s, v17.4h\n"
+ "saddw2 v14.4s, v14.4s, v17.8h\n"
+ "saddw v13.4s, v13.4s, v16.4h\n"
+ "saddw2 v12.4s, v12.4s, v16.8h\n"
+ "add x22, x22, #0x10\n"
"bgt 9b\n"
"10:" // Single vector of channels: Loop: 2 inputs tail
- "saddl v23.8h, v31.8b, v30.8b\n"
- "saddl2 v22.8h, v31.16b, v30.16b\n"
- "saddw v15.4s, v15.4s, v23.4h\n"
- "saddw2 v14.4s, v14.4s, v23.8h\n"
- "saddw v13.4s, v13.4s, v22.4h\n"
- "saddw2 v12.4s, v12.4s, v22.8h\n"
+ "saddl v17.8h, v31.8b, v30.8b\n"
+ "saddl2 v16.8h, v31.16b, v30.16b\n"
+ "saddw v15.4s, v15.4s, v17.4h\n"
+ "saddw2 v14.4s, v14.4s, v17.8h\n"
+ "saddw v13.4s, v13.4s, v16.4h\n"
+ "saddw2 v12.4s, v12.4s, v16.8h\n"
"11:" // Single vector of channels: Loop: After loop
- "ands x20, %x[n_valid_cells], #0x1\n"
+ "ands x23, %x[n_valid_cells], #0x1\n"
"beq 13f\n"
"12:" // Single vector of channels: Loop: Single input loop
- "ldr x21, [x19], #0x8\n"
- "subs x20, x20, #0x1\n"
- "ldr q31, [x21, x26]\n"
- "sxtl v23.8h, v31.8b\n"
- "sxtl2 v22.8h, v31.16b\n"
- "saddw v15.4s, v15.4s, v23.4h\n"
- "saddw2 v14.4s, v14.4s, v23.8h\n"
- "saddw v13.4s, v13.4s, v22.4h\n"
- "saddw2 v12.4s, v12.4s, v22.8h\n"
+ "ldr x20, [x22], #0x8\n"
+ "ldr q16, [x20, x27]\n"
+ "sxtl v17.8h, v16.8b\n"
+ "sxtl2 v16.8h, v16.16b\n"
+ "subs x23, x23, #0x1\n"
+ "saddw v15.4s, v15.4s, v17.4h\n"
+ "saddw2 v14.4s, v14.4s, v17.8h\n"
+ "saddw v13.4s, v13.4s, v16.4h\n"
+ "saddw2 v12.4s, v12.4s, v16.8h\n"
"bgt 12b\n"
"13:" // Single vector of channels: Loop: Single input loop: End
- "movi v20.4s, #0x7f\n"
- "ld1r { v19.4s }, [%x[combined_rescale_value]]\n"
- "sub %x[n_channels], %x[n_channels], #0x10\n"
"ld1r { v18.4s }, [%x[left_shift]]\n"
+ "ld1r { v17.4s }, [%x[combined_rescale_value]]\n"
"srshl v15.4s, v15.4s, v18.4s\n"
- "ld1r { v17.4s }, [%x[right_shift]]\n"
- "not v16.16b, v20.16b\n"
"srshl v14.4s, v14.4s, v18.4s\n"
- "cmp %x[n_channels], #0x10\n"
+ "ld1r { v16.4s }, [%x[right_shift]]\n"
"srshl v13.4s, v13.4s, v18.4s\n"
"srshl v12.4s, v12.4s, v18.4s\n"
- "sqrdmulh v15.4s, v15.4s, v19.4s\n"
- "sqrdmulh v14.4s, v14.4s, v19.4s\n"
- "sqrdmulh v13.4s, v13.4s, v19.4s\n"
- "sqrdmulh v12.4s, v12.4s, v19.4s\n"
- "srshl v15.4s, v15.4s, v17.4s\n"
- "srshl v14.4s, v14.4s, v17.4s\n"
- "srshl v13.4s, v13.4s, v17.4s\n"
- "srshl v12.4s, v12.4s, v17.4s\n"
+ "sub %x[n_channels], %x[n_channels], #0x10\n"
+ "sqrdmulh v15.4s, v15.4s, v17.4s\n"
+ "sqrdmulh v14.4s, v14.4s, v17.4s\n"
+ "cmp %x[n_channels], #0x10\n"
+ "sqrdmulh v13.4s, v13.4s, v17.4s\n"
+ "sqrdmulh v12.4s, v12.4s, v17.4s\n"
+ "movi v17.4s, #0x7f\n"
+ "srshl v15.4s, v15.4s, v16.4s\n"
+ "srshl v14.4s, v14.4s, v16.4s\n"
+ "srshl v13.4s, v13.4s, v16.4s\n"
+ "srshl v12.4s, v12.4s, v16.4s\n"
+ "not v16.16b, v17.16b\n"
"smax v15.4s, v15.4s, v16.4s\n"
"smax v14.4s, v14.4s, v16.4s\n"
"smax v13.4s, v13.4s, v16.4s\n"
"smax v12.4s, v12.4s, v16.4s\n"
- "smin v15.4s, v15.4s, v20.4s\n"
- "smin v14.4s, v14.4s, v20.4s\n"
- "smin v13.4s, v13.4s, v20.4s\n"
- "smin v12.4s, v12.4s, v20.4s\n"
- "uzp1 v23.16b, v15.16b, v14.16b\n"
+ "smin v15.4s, v15.4s, v17.4s\n"
+ "smin v14.4s, v14.4s, v17.4s\n"
+ "smin v13.4s, v13.4s, v17.4s\n"
+ "smin v12.4s, v12.4s, v17.4s\n"
+ "uzp1 v17.16b, v15.16b, v14.16b\n"
"uzp1 v16.16b, v13.16b, v12.16b\n"
- "uzp1 v16.16b, v23.16b, v16.16b\n"
- "str q16, [%x[outptr], x26]\n"
- "add x26, x26, #0x10\n"
+ "uzp1 v16.16b, v17.16b, v16.16b\n"
+ "str q16, [%x[outptr], x27]\n"
+ "add x27, x27, #0x10\n"
"bge 8b\n"
"cbz %x[n_channels], 43f\n"
"14:" // Oddments
+ "lsr x23, %x[n_valid_cells], #0x1\n"
+ "add %x[outptr], %x[outptr], x27\n"
"movi v15.4s, #0x0\n"
- "add %x[outptr], %x[outptr], x26\n"
"movi v14.4s, #0x0\n"
- "mov x19, %x[inptrs]\n"
"movi v13.4s, #0x0\n"
- "lsr x22, %x[n_valid_cells], #0x1\n"
"movi v12.4s, #0x0\n"
- "cbz x22, 24f\n"
+ "mov x22, %x[inptrs]\n"
+ "cbz x23, 24f\n"
"15:" // Oddments: 2 inputs loop
+ "ldp x21, x20, [x22, #0x0]\n"
+ "add x22, x22, #0x10\n"
+ "add x21, x21, x27\n"
"movi v31.16b, #0x0\n"
- "ldp x21, x20, [x19, #0x0]\n"
- "add x19, x19, #0x10\n"
+ "add x20, x20, x27\n"
"movi v30.16b, #0x0\n"
- "add x21, x21, x26\n"
- "add x20, x20, x26\n"
"tbz %x[n_channels], #3, 19f\n"
"ldr d31, [x21], #0x8\n"
"ldr d30, [x20], #0x8\n"
@@ -519,21 +521,21 @@ void a64_s8q_nhwc_avg_generic_depthfirst_impl(
"ldr b31, [x21], #0x1\n"
"ldr b30, [x20], #0x1\n"
"23:" // Oddments: 2 inputs loop: Load: Bit 3: End
- "saddl v23.8h, v31.8b, v30.8b\n"
- "subs x22, x22, #0x1\n"
- "saddl2 v22.8h, v31.16b, v30.16b\n"
- "saddw v15.4s, v15.4s, v23.4h\n"
- "saddw2 v14.4s, v14.4s, v23.8h\n"
- "saddw v13.4s, v13.4s, v22.4h\n"
- "saddw2 v12.4s, v12.4s, v22.8h\n"
+ "saddl v17.8h, v31.8b, v30.8b\n"
+ "saddl2 v16.8h, v31.16b, v30.16b\n"
+ "subs x23, x23, #0x1\n"
+ "saddw v15.4s, v15.4s, v17.4h\n"
+ "saddw2 v14.4s, v14.4s, v17.8h\n"
+ "saddw v13.4s, v13.4s, v16.4h\n"
+ "saddw2 v12.4s, v12.4s, v16.8h\n"
"bgt 15b\n"
"24:" // Oddments: After loop
- "ands x20, %x[n_valid_cells], #0x1\n"
+ "ands x23, %x[n_valid_cells], #0x1\n"
"beq 34f\n"
"25:" // Oddments: Single input loop
+ "ldr x21, [x22], #0x8\n"
+ "add x21, x21, x27\n"
"movi v31.16b, #0x0\n"
- "ldr x21, [x19], #0x8\n"
- "add x21, x21, x26\n"
"tbz %x[n_channels], #3, 29f\n"
"ldr d31, [x21], #0x8\n"
"tbz %x[n_channels], #2, 27f\n"
@@ -579,43 +581,43 @@ void a64_s8q_nhwc_avg_generic_depthfirst_impl(
"tbz %x[n_channels], #0, 33f\n"
"ldr b31, [x21], #0x1\n"
"33:" // Oddments: Single input loop: Load: Bit 3: End
- "sxtl v23.8h, v31.8b\n"
- "subs x20, x20, #0x1\n"
- "sxtl2 v22.8h, v31.16b\n"
- "saddw v15.4s, v15.4s, v23.4h\n"
- "saddw2 v14.4s, v14.4s, v23.8h\n"
- "saddw v13.4s, v13.4s, v22.4h\n"
- "saddw2 v12.4s, v12.4s, v22.8h\n"
+ "sxtl v17.8h, v31.8b\n"
+ "sxtl2 v16.8h, v31.16b\n"
+ "subs x23, x23, #0x1\n"
+ "saddw v15.4s, v15.4s, v17.4h\n"
+ "saddw2 v14.4s, v14.4s, v17.8h\n"
+ "saddw v13.4s, v13.4s, v16.4h\n"
+ "saddw2 v12.4s, v12.4s, v16.8h\n"
"bgt 25b\n"
"34:" // Oddments: Single input loop: End
- "movi v20.4s, #0x7f\n"
- "ld1r { v19.4s }, [%x[combined_rescale_value]]\n"
- "not v16.16b, v20.16b\n"
"ld1r { v18.4s }, [%x[left_shift]]\n"
+ "ld1r { v17.4s }, [%x[combined_rescale_value]]\n"
"srshl v15.4s, v15.4s, v18.4s\n"
- "ld1r { v17.4s }, [%x[right_shift]]\n"
"srshl v14.4s, v14.4s, v18.4s\n"
+ "ld1r { v16.4s }, [%x[right_shift]]\n"
"srshl v13.4s, v13.4s, v18.4s\n"
"srshl v12.4s, v12.4s, v18.4s\n"
- "sqrdmulh v15.4s, v15.4s, v19.4s\n"
- "sqrdmulh v14.4s, v14.4s, v19.4s\n"
- "sqrdmulh v13.4s, v13.4s, v19.4s\n"
- "sqrdmulh v12.4s, v12.4s, v19.4s\n"
- "srshl v15.4s, v15.4s, v17.4s\n"
- "srshl v14.4s, v14.4s, v17.4s\n"
- "srshl v13.4s, v13.4s, v17.4s\n"
- "srshl v12.4s, v12.4s, v17.4s\n"
+ "sqrdmulh v15.4s, v15.4s, v17.4s\n"
+ "sqrdmulh v14.4s, v14.4s, v17.4s\n"
+ "sqrdmulh v13.4s, v13.4s, v17.4s\n"
+ "sqrdmulh v12.4s, v12.4s, v17.4s\n"
+ "movi v17.4s, #0x7f\n"
+ "srshl v15.4s, v15.4s, v16.4s\n"
+ "srshl v14.4s, v14.4s, v16.4s\n"
+ "srshl v13.4s, v13.4s, v16.4s\n"
+ "srshl v12.4s, v12.4s, v16.4s\n"
+ "not v16.16b, v17.16b\n"
"smax v15.4s, v15.4s, v16.4s\n"
"smax v14.4s, v14.4s, v16.4s\n"
"smax v13.4s, v13.4s, v16.4s\n"
"smax v12.4s, v12.4s, v16.4s\n"
- "smin v15.4s, v15.4s, v20.4s\n"
- "smin v14.4s, v14.4s, v20.4s\n"
- "smin v13.4s, v13.4s, v20.4s\n"
- "smin v12.4s, v12.4s, v20.4s\n"
- "uzp1 v23.16b, v15.16b, v14.16b\n"
+ "smin v15.4s, v15.4s, v17.4s\n"
+ "smin v14.4s, v14.4s, v17.4s\n"
+ "smin v13.4s, v13.4s, v17.4s\n"
+ "smin v12.4s, v12.4s, v17.4s\n"
+ "uzp1 v17.16b, v15.16b, v14.16b\n"
"uzp1 v16.16b, v13.16b, v12.16b\n"
- "uzp1 v16.16b, v23.16b, v16.16b\n"
+ "uzp1 v16.16b, v17.16b, v16.16b\n"
"tbz %x[n_channels], #3, 38f\n"
"st1 { v16.d }[0], [%x[outptr]], #0x8\n"
"tbz %x[n_channels], #2, 36f\n"
@@ -661,12 +663,10 @@ void a64_s8q_nhwc_avg_generic_depthfirst_impl(
"tbz %x[n_channels], #0, 42f\n"
"st1 { v16.b }[0], [%x[outptr]], #0x1\n"
"42:" // Oddments: Store: Bit 3: End
-
"43:" // End
-
: [n_channels] "+&r" (n_channels), [outptr] "+&r" (outptr)
: [combined_rescale_value] "r" (&combined_rescale_value), [inptrs] "r" (inptrs), [left_shift] "r" (&left_shift), [n_valid_cells] "r" (n_valid_cells), [right_shift] "r" (&right_shift)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8q_nhwc_max_generic_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8q_nhwc_max_generic_depthfirst.hpp
index ea7f7f89fe..68e7a98d0a 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8q_nhwc_max_generic_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8q_nhwc_max_generic_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -33,19 +33,11 @@ namespace pooling {
void a64_s8q_nhwc_max_generic_depthfirst_impl(const uint64_t, const uint64_t n_valid_cells, uint64_t n_channels, const int8_t *const *const inptrs, int8_t *outptr, const Requantize32 &qp);
-struct a64_s8q_nhwc_max_generic_depthfirst
+struct a64_s8q_nhwc_max_generic_depthfirst : IGenericDepthfirstStrategy<int8_t, int8_t, Requantize32>
{
- typedef int8_t operand_type;
- typedef int8_t return_type;
-
- typedef void (*kern_type)(const uint64_t, const uint64_t n_valid_cells, uint64_t n_channels, const int8_t *const *const inptrs, int8_t *outptr, const Requantize32 &qp);
-
- constexpr static PoolingType pooling_type(void) { return PoolingType::MAX; }
-
-
- kern_type kernel = a64_s8q_nhwc_max_generic_depthfirst_impl;
-
+ using Parent = IGenericDepthfirstStrategy<int8_t, int8_t, Requantize32>;
a64_s8q_nhwc_max_generic_depthfirst(const CPUInfo *) {}
+ typename Parent::KernelType get_kernel(void) const override { return a64_s8q_nhwc_max_generic_depthfirst_impl; }
};
} // namespace pooling
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8q_nhwc_max_generic_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8q_nhwc_max_generic_depthfirst/generic.cpp
index a077121991..f7b8dc761c 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8q_nhwc_max_generic_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8q_nhwc_max_generic_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,8 +23,8 @@
*/
#include "pooling.hpp"
-#include <cstddef>
#include <cstdint>
+#include <cstddef>
#if defined(__aarch64__)
@@ -42,88 +42,88 @@ void a64_s8q_nhwc_max_generic_depthfirst_impl(
)
{
__asm__ __volatile__(
- "mov x28, #0x0\n"
- "mov x27, #0x10\n" // cntb _, ALL, #1
- "mov x26, #0x20\n" // cntb _, ALL, #2
- "mov x25, #0x30\n" // cntb _, ALL, #3
"cmp %x[n_channels], #0x40\n"
+ "mov x27, #0x0\n"
+ "mov x26, #0x10\n" // cntb _, ALL, #1
+ "mov x24, #0x20\n" // cntb _, ALL, #2
+ "mov x23, #0x30\n" // cntb _, ALL, #3
"blt 7f\n"
"1:" // 4-vectors of channels
+ "lsr x25, %x[n_valid_cells], #0x2\n"
"movi v8.16b, #0x80\n"
- "mov x19, %x[inptrs]\n"
"movi v7.16b, #0x80\n"
- "lsr x24, %x[n_valid_cells], #0x2\n"
+ "mov x22, %x[inptrs]\n"
"movi v6.16b, #0x80\n"
"movi v5.16b, #0x80\n"
- "cbz x24, 4f\n"
- "ldp x23, x22, [x19, #0x0]\n"
- "ldp x21, x20, [x19, #0x10]\n"
- "add x19, x19, #0x20\n"
- "subs x24, x24, #0x1\n"
- "ldr q3, [x23, x28]\n"
- "ldr q2, [x22, x28]\n"
- "ldr q1, [x21, x28]\n"
- "ldr q0, [x20, x28]\n"
- "ldr q31, [x23, x27]\n"
- "ldr q30, [x22, x27]\n"
- "ldr q29, [x21, x27]\n"
- "ldr q28, [x20, x27]\n"
- "ldr q27, [x23, x26]\n"
- "ldr q21, [x22, x26]\n"
- "ldr q26, [x21, x26]\n"
- "ldr q17, [x20, x26]\n"
- "ldr q25, [x23, x25]\n"
- "ldr q20, [x22, x25]\n"
- "ldr q24, [x21, x25]\n"
- "ldr q16, [x20, x25]\n"
+ "cbz x25, 4f\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q4, [x21, x27]\n"
+ "subs x25, x25, #0x1\n"
+ "ldr q3, [x20, x27]\n"
+ "ldr q2, [x21, x26]\n"
+ "ldr q1, [x20, x26]\n"
+ "ldr q0, [x21, x24]\n"
+ "ldr q31, [x20, x24]\n"
+ "ldr q30, [x21, x23]\n"
+ "ldr q29, [x20, x23]\n"
+ "ldp x21, x20, [x22, #0x10]\n"
+ "add x22, x22, #0x20\n"
+ "ldr q28, [x21, x27]\n"
+ "ldr q22, [x20, x27]\n"
+ "ldr q27, [x21, x26]\n"
+ "ldr q21, [x20, x26]\n"
+ "ldr q26, [x21, x24]\n"
+ "ldr q20, [x20, x24]\n"
+ "ldr q25, [x21, x23]\n"
+ "ldr q24, [x20, x23]\n"
"beq 3f\n"
"2:" // 4-vectors of channels: 4 inputs loop
- "smax v23.16b, v3.16b, v2.16b\n"
- "ldp x23, x22, [x19, #0x0]\n"
- "subs x24, x24, #0x1\n"
- "smax v19.16b, v1.16b, v0.16b\n"
- "ldp x21, x20, [x19, #0x10]\n"
- "add x19, x19, #0x20\n"
- "smax v22.16b, v31.16b, v30.16b\n"
- "ldr q3, [x23, x28]\n"
- "smax v18.16b, v29.16b, v28.16b\n"
- "smax v21.16b, v27.16b, v21.16b\n"
- "ldr q2, [x22, x28]\n"
- "smax v17.16b, v26.16b, v17.16b\n"
- "ldr q1, [x21, x28]\n"
- "smax v20.16b, v25.16b, v20.16b\n"
- "ldr q0, [x20, x28]\n"
- "smax v16.16b, v24.16b, v16.16b\n"
- "ldr q31, [x23, x27]\n"
+ "smax v23.16b, v4.16b, v3.16b\n"
+ "smax v19.16b, v28.16b, v22.16b\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q4, [x21, x27]\n"
+ "ldr q3, [x20, x27]\n"
+ "smax v22.16b, v2.16b, v1.16b\n"
+ "ldr q2, [x21, x26]\n"
+ "smax v18.16b, v27.16b, v21.16b\n"
+ "ldr q1, [x20, x26]\n"
+ "smax v21.16b, v0.16b, v31.16b\n"
+ "ldr q0, [x21, x24]\n"
+ "smax v17.16b, v26.16b, v20.16b\n"
+ "ldr q31, [x20, x24]\n"
+ "smax v20.16b, v30.16b, v29.16b\n"
+ "ldr q30, [x21, x23]\n"
+ "smax v16.16b, v25.16b, v24.16b\n"
+ "ldr q29, [x20, x23]\n"
"smax v19.16b, v23.16b, v19.16b\n"
- "ldr q30, [x22, x27]\n"
"smax v18.16b, v22.16b, v18.16b\n"
- "ldr q29, [x21, x27]\n"
+ "ldp x21, x20, [x22, #0x10]\n"
+ "ldr q28, [x21, x27]\n"
+ "ldr q22, [x20, x27]\n"
"smax v17.16b, v21.16b, v17.16b\n"
- "ldr q28, [x20, x27]\n"
"smax v16.16b, v20.16b, v16.16b\n"
- "ldr q27, [x23, x26]\n"
+ "ldr q27, [x21, x26]\n"
+ "ldr q21, [x20, x26]\n"
+ "subs x25, x25, #0x1\n"
"smax v8.16b, v8.16b, v19.16b\n"
- "ldr q21, [x22, x26]\n"
+ "ldr q26, [x21, x24]\n"
+ "ldr q20, [x20, x24]\n"
"smax v7.16b, v7.16b, v18.16b\n"
- "ldr q26, [x21, x26]\n"
"smax v6.16b, v6.16b, v17.16b\n"
- "ldr q17, [x20, x26]\n"
+ "ldr q25, [x21, x23]\n"
+ "ldr q24, [x20, x23]\n"
"smax v5.16b, v5.16b, v16.16b\n"
- "ldr q25, [x23, x25]\n"
- "ldr q20, [x22, x25]\n"
- "ldr q24, [x21, x25]\n"
- "ldr q16, [x20, x25]\n"
+ "add x22, x22, #0x20\n"
"bgt 2b\n"
"3:" // 4-vectors of channels: 4 inputs tail
- "smax v23.16b, v3.16b, v2.16b\n"
- "smax v19.16b, v1.16b, v0.16b\n"
- "smax v22.16b, v31.16b, v30.16b\n"
- "smax v18.16b, v29.16b, v28.16b\n"
- "smax v21.16b, v27.16b, v21.16b\n"
- "smax v17.16b, v26.16b, v17.16b\n"
- "smax v20.16b, v25.16b, v20.16b\n"
- "smax v16.16b, v24.16b, v16.16b\n"
+ "smax v23.16b, v4.16b, v3.16b\n"
+ "smax v19.16b, v28.16b, v22.16b\n"
+ "smax v22.16b, v2.16b, v1.16b\n"
+ "smax v18.16b, v27.16b, v21.16b\n"
+ "smax v21.16b, v0.16b, v31.16b\n"
+ "smax v17.16b, v26.16b, v20.16b\n"
+ "smax v20.16b, v30.16b, v29.16b\n"
+ "smax v16.16b, v25.16b, v24.16b\n"
"smax v19.16b, v23.16b, v19.16b\n"
"smax v18.16b, v22.16b, v18.16b\n"
"smax v17.16b, v21.16b, v17.16b\n"
@@ -133,453 +133,453 @@ void a64_s8q_nhwc_max_generic_depthfirst_impl(
"smax v6.16b, v6.16b, v17.16b\n"
"smax v5.16b, v5.16b, v16.16b\n"
"4:" // 4-vectors of channels: After loop
- "ands x20, %x[n_valid_cells], #0x3\n"
+ "ands x21, %x[n_valid_cells], #0x3\n"
"beq 6f\n"
"5:" // 4-vectors of channels: Single input loop
- "ldr x23, [x19], #0x8\n"
- "subs x20, x20, #0x1\n"
- "ldr q3, [x23, x28]\n"
- "smax v8.16b, v8.16b, v3.16b\n"
- "ldr q31, [x23, x27]\n"
- "ldr q27, [x23, x26]\n"
- "smax v7.16b, v7.16b, v31.16b\n"
- "ldr q25, [x23, x25]\n"
- "smax v6.16b, v6.16b, v27.16b\n"
- "smax v5.16b, v5.16b, v25.16b\n"
+ "ldr x20, [x22], #0x8\n"
+ "ldr q16, [x20, x27]\n"
+ "subs x21, x21, #0x1\n"
+ "smax v8.16b, v8.16b, v16.16b\n"
+ "ldr q17, [x20, x26]\n"
+ "ldr q16, [x20, x24]\n"
+ "smax v7.16b, v7.16b, v17.16b\n"
+ "smax v6.16b, v6.16b, v16.16b\n"
+ "ldr q16, [x20, x23]\n"
+ "smax v5.16b, v5.16b, v16.16b\n"
"bgt 5b\n"
"6:" // 4-vectors of channels: Single input loop: End
"sxtl v23.8h, v8.8b\n"
- "add x19, %x[quant_params], %[offsetof_qp_per_layer_mul]\n"
- "ld1r { v4.4s }, [x19]\n"
"sxtl2 v22.8h, v8.16b\n"
- "add x19, %x[quant_params], %[offsetof_qp_per_layer_left_shift]\n"
+ "add x20, %x[quant_params], %[offsetof_qp_per_layer_left_shift]\n"
+ "ld1r { v4.4s }, [x20]\n"
"sxtl v21.8h, v7.8b\n"
- "ld1r { v3.4s }, [x19]\n"
- "add x19, %x[quant_params], %[offsetof_qp_per_layer_right_shift]\n"
- "sxtl2 v20.8h, v7.16b\n"
- "ld1r { v2.4s }, [x19]\n"
- "sub %x[n_channels], %x[n_channels], #0x40\n"
- "sxtl v19.8h, v6.8b\n"
- "cmp %x[n_channels], #0x40\n"
- "sxtl2 v18.8h, v6.16b\n"
+ "sxtl2 v18.8h, v7.16b\n"
+ "add x20, %x[quant_params], %[offsetof_qp_per_layer_mul]\n"
+ "ld1r { v3.4s }, [x20]\n"
+ "sxtl v20.8h, v6.8b\n"
+ "sxtl2 v19.8h, v6.16b\n"
+ "add x20, %x[quant_params], %[offsetof_qp_per_layer_right_shift]\n"
+ "ld1r { v2.4s }, [x20]\n"
"sxtl v17.8h, v5.8b\n"
"sxtl2 v16.8h, v5.16b\n"
+ "sub %x[n_channels], %x[n_channels], #0x40\n"
+ "cmp %x[n_channels], #0x40\n"
"sxtl v1.4s, v23.4h\n"
"sxtl2 v23.4s, v23.8h\n"
"sxtl v0.4s, v22.4h\n"
"sxtl2 v31.4s, v22.8h\n"
"sxtl v30.4s, v21.4h\n"
"sxtl2 v22.4s, v21.8h\n"
- "sxtl v29.4s, v20.4h\n"
+ "sxtl v29.4s, v18.4h\n"
+ "sxtl2 v18.4s, v18.8h\n"
+ "sxtl v28.4s, v20.4h\n"
"sxtl2 v21.4s, v20.8h\n"
- "sxtl v28.4s, v19.4h\n"
- "sxtl2 v20.4s, v19.8h\n"
- "sxtl v27.4s, v18.4h\n"
- "sxtl2 v26.4s, v18.8h\n"
+ "sxtl v27.4s, v19.4h\n"
+ "sxtl2 v26.4s, v19.8h\n"
"sxtl v25.4s, v17.4h\n"
- "sxtl2 v19.4s, v17.8h\n"
+ "sxtl2 v20.4s, v17.8h\n"
"sxtl v24.4s, v16.4h\n"
- "sxtl2 v18.4s, v16.8h\n"
- "srshl v1.4s, v1.4s, v3.4s\n"
- "srshl v23.4s, v23.4s, v3.4s\n"
- "srshl v0.4s, v0.4s, v3.4s\n"
- "srshl v31.4s, v31.4s, v3.4s\n"
- "sqrdmulh v1.4s, v1.4s, v4.4s\n"
- "sqrdmulh v23.4s, v23.4s, v4.4s\n"
- "sqrdmulh v0.4s, v0.4s, v4.4s\n"
- "sqrdmulh v31.4s, v31.4s, v4.4s\n"
+ "sxtl2 v19.4s, v16.8h\n"
+ "srshl v1.4s, v1.4s, v4.4s\n"
+ "srshl v23.4s, v23.4s, v4.4s\n"
+ "srshl v0.4s, v0.4s, v4.4s\n"
+ "srshl v31.4s, v31.4s, v4.4s\n"
+ "srshl v30.4s, v30.4s, v4.4s\n"
+ "srshl v22.4s, v22.4s, v4.4s\n"
+ "srshl v29.4s, v29.4s, v4.4s\n"
+ "srshl v18.4s, v18.4s, v4.4s\n"
+ "srshl v28.4s, v28.4s, v4.4s\n"
+ "srshl v21.4s, v21.4s, v4.4s\n"
+ "srshl v27.4s, v27.4s, v4.4s\n"
+ "srshl v26.4s, v26.4s, v4.4s\n"
+ "srshl v25.4s, v25.4s, v4.4s\n"
+ "srshl v20.4s, v20.4s, v4.4s\n"
+ "srshl v24.4s, v24.4s, v4.4s\n"
+ "srshl v19.4s, v19.4s, v4.4s\n"
+ "sqrdmulh v1.4s, v1.4s, v3.4s\n"
+ "sqrdmulh v23.4s, v23.4s, v3.4s\n"
+ "sqrdmulh v0.4s, v0.4s, v3.4s\n"
+ "sqrdmulh v31.4s, v31.4s, v3.4s\n"
+ "sqrdmulh v30.4s, v30.4s, v3.4s\n"
+ "sqrdmulh v22.4s, v22.4s, v3.4s\n"
+ "sqrdmulh v29.4s, v29.4s, v3.4s\n"
+ "sqrdmulh v18.4s, v18.4s, v3.4s\n"
+ "sqrdmulh v28.4s, v28.4s, v3.4s\n"
+ "sqrdmulh v21.4s, v21.4s, v3.4s\n"
+ "sqrdmulh v27.4s, v27.4s, v3.4s\n"
+ "sqrdmulh v26.4s, v26.4s, v3.4s\n"
+ "sqrdmulh v25.4s, v25.4s, v3.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v3.4s\n"
+ "sqrdmulh v24.4s, v24.4s, v3.4s\n"
+ "sqrdmulh v19.4s, v19.4s, v3.4s\n"
+ "movi v17.4s, #0x7f\n"
"srshl v1.4s, v1.4s, v2.4s\n"
"srshl v23.4s, v23.4s, v2.4s\n"
"srshl v0.4s, v0.4s, v2.4s\n"
"srshl v31.4s, v31.4s, v2.4s\n"
- "srshl v30.4s, v30.4s, v3.4s\n"
- "srshl v22.4s, v22.4s, v3.4s\n"
- "srshl v29.4s, v29.4s, v3.4s\n"
- "srshl v21.4s, v21.4s, v3.4s\n"
- "sqrdmulh v30.4s, v30.4s, v4.4s\n"
- "sqrdmulh v22.4s, v22.4s, v4.4s\n"
- "sqrdmulh v29.4s, v29.4s, v4.4s\n"
- "sqrdmulh v21.4s, v21.4s, v4.4s\n"
"srshl v30.4s, v30.4s, v2.4s\n"
"srshl v22.4s, v22.4s, v2.4s\n"
"srshl v29.4s, v29.4s, v2.4s\n"
- "srshl v21.4s, v21.4s, v2.4s\n"
- "srshl v28.4s, v28.4s, v3.4s\n"
- "srshl v20.4s, v20.4s, v3.4s\n"
- "srshl v27.4s, v27.4s, v3.4s\n"
- "srshl v26.4s, v26.4s, v3.4s\n"
- "sqrdmulh v28.4s, v28.4s, v4.4s\n"
- "sqrdmulh v20.4s, v20.4s, v4.4s\n"
- "sqrdmulh v27.4s, v27.4s, v4.4s\n"
- "sqrdmulh v26.4s, v26.4s, v4.4s\n"
+ "srshl v18.4s, v18.4s, v2.4s\n"
"srshl v28.4s, v28.4s, v2.4s\n"
- "srshl v20.4s, v20.4s, v2.4s\n"
+ "srshl v21.4s, v21.4s, v2.4s\n"
"srshl v27.4s, v27.4s, v2.4s\n"
"srshl v26.4s, v26.4s, v2.4s\n"
- "srshl v25.4s, v25.4s, v3.4s\n"
- "srshl v19.4s, v19.4s, v3.4s\n"
- "srshl v24.4s, v24.4s, v3.4s\n"
- "srshl v18.4s, v18.4s, v3.4s\n"
- "sqrdmulh v25.4s, v25.4s, v4.4s\n"
- "sqrdmulh v19.4s, v19.4s, v4.4s\n"
- "sqrdmulh v24.4s, v24.4s, v4.4s\n"
- "sqrdmulh v18.4s, v18.4s, v4.4s\n"
"srshl v25.4s, v25.4s, v2.4s\n"
- "srshl v19.4s, v19.4s, v2.4s\n"
+ "srshl v20.4s, v20.4s, v2.4s\n"
"srshl v24.4s, v24.4s, v2.4s\n"
- "srshl v18.4s, v18.4s, v2.4s\n"
- "movi v17.4s, #0x7f\n"
+ "srshl v19.4s, v19.4s, v2.4s\n"
"not v16.16b, v17.16b\n"
"smax v1.4s, v1.4s, v16.4s\n"
"smax v23.4s, v23.4s, v16.4s\n"
"smax v0.4s, v0.4s, v16.4s\n"
"smax v31.4s, v31.4s, v16.4s\n"
+ "smax v30.4s, v30.4s, v16.4s\n"
+ "smax v22.4s, v22.4s, v16.4s\n"
+ "smax v29.4s, v29.4s, v16.4s\n"
+ "smax v18.4s, v18.4s, v16.4s\n"
+ "smax v28.4s, v28.4s, v16.4s\n"
+ "smax v21.4s, v21.4s, v16.4s\n"
+ "smax v27.4s, v27.4s, v16.4s\n"
+ "smax v26.4s, v26.4s, v16.4s\n"
+ "smax v25.4s, v25.4s, v16.4s\n"
+ "smax v20.4s, v20.4s, v16.4s\n"
+ "smax v24.4s, v24.4s, v16.4s\n"
+ "smax v19.4s, v19.4s, v16.4s\n"
"smin v1.4s, v1.4s, v17.4s\n"
"smin v23.4s, v23.4s, v17.4s\n"
"smin v0.4s, v0.4s, v17.4s\n"
"smin v31.4s, v31.4s, v17.4s\n"
- "smax v30.4s, v30.4s, v16.4s\n"
- "smax v22.4s, v22.4s, v16.4s\n"
- "smax v29.4s, v29.4s, v16.4s\n"
"smin v30.4s, v30.4s, v17.4s\n"
"smin v22.4s, v22.4s, v17.4s\n"
"smin v29.4s, v29.4s, v17.4s\n"
- "smax v21.4s, v21.4s, v16.4s\n"
- "smax v28.4s, v28.4s, v16.4s\n"
- "smax v20.4s, v20.4s, v16.4s\n"
- "smin v21.4s, v21.4s, v17.4s\n"
+ "smin v18.4s, v18.4s, v17.4s\n"
"smin v28.4s, v28.4s, v17.4s\n"
- "smin v20.4s, v20.4s, v17.4s\n"
- "smax v27.4s, v27.4s, v16.4s\n"
- "smax v26.4s, v26.4s, v16.4s\n"
- "smax v25.4s, v25.4s, v16.4s\n"
+ "smin v21.4s, v21.4s, v17.4s\n"
"smin v27.4s, v27.4s, v17.4s\n"
"smin v26.4s, v26.4s, v17.4s\n"
"smin v25.4s, v25.4s, v17.4s\n"
- "smax v19.4s, v19.4s, v16.4s\n"
- "smax v24.4s, v24.4s, v16.4s\n"
- "smax v18.4s, v18.4s, v16.4s\n"
- "smin v19.4s, v19.4s, v17.4s\n"
+ "smin v20.4s, v20.4s, v17.4s\n"
"smin v24.4s, v24.4s, v17.4s\n"
- "smin v18.4s, v18.4s, v17.4s\n"
+ "smin v19.4s, v19.4s, v17.4s\n"
"uzp1 v23.16b, v1.16b, v23.16b\n"
"uzp1 v16.16b, v0.16b, v31.16b\n"
"uzp1 v22.16b, v30.16b, v22.16b\n"
- "uzp1 v21.16b, v29.16b, v21.16b\n"
- "uzp1 v20.16b, v28.16b, v20.16b\n"
+ "uzp1 v18.16b, v29.16b, v18.16b\n"
+ "uzp1 v21.16b, v28.16b, v21.16b\n"
"uzp1 v17.16b, v27.16b, v26.16b\n"
- "uzp1 v19.16b, v25.16b, v19.16b\n"
- "uzp1 v18.16b, v24.16b, v18.16b\n"
+ "uzp1 v20.16b, v25.16b, v20.16b\n"
+ "uzp1 v19.16b, v24.16b, v19.16b\n"
"uzp1 v16.16b, v23.16b, v16.16b\n"
- "str q16, [%x[outptr], x28]\n"
- "uzp1 v16.16b, v22.16b, v21.16b\n"
- "add x28, x28, #0x40\n"
- "uzp1 v17.16b, v20.16b, v17.16b\n"
+ "uzp1 v18.16b, v22.16b, v18.16b\n"
"str q16, [%x[outptr], x27]\n"
- "uzp1 v16.16b, v19.16b, v18.16b\n"
"add x27, x27, #0x40\n"
- "str q17, [%x[outptr], x26]\n"
+ "uzp1 v17.16b, v21.16b, v17.16b\n"
+ "uzp1 v16.16b, v20.16b, v19.16b\n"
+ "str q18, [%x[outptr], x26]\n"
"add x26, x26, #0x40\n"
- "str q16, [%x[outptr], x25]\n"
- "add x25, x25, #0x40\n"
+ "str q17, [%x[outptr], x24]\n"
+ "add x24, x24, #0x40\n"
+ "str q16, [%x[outptr], x23]\n"
+ "add x23, x23, #0x40\n"
"bge 1b\n"
"cbz %x[n_channels], 43f\n"
"7:" // Single vector of channels
"cmp %x[n_channels], #0x10\n"
"blt 14f\n"
"8:" // Single vector of channels: Loop
+ "lsr x25, %x[n_valid_cells], #0x2\n"
"movi v8.16b, #0x80\n"
- "mov x19, %x[inptrs]\n"
- "lsr x24, %x[n_valid_cells], #0x2\n"
- "cbz x24, 11f\n"
- "ldp x23, x22, [x19, #0x0]\n"
- "ldp x21, x20, [x19, #0x10]\n"
- "add x19, x19, #0x20\n"
- "subs x24, x24, #0x1\n"
- "ldr q3, [x23, x28]\n"
- "ldr q2, [x22, x28]\n"
- "ldr q1, [x21, x28]\n"
- "ldr q0, [x20, x28]\n"
+ "mov x22, %x[inptrs]\n"
+ "cbz x25, 11f\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q4, [x21, x27]\n"
+ "subs x25, x25, #0x1\n"
+ "ldr q3, [x20, x27]\n"
+ "ldp x21, x20, [x22, #0x10]\n"
+ "add x22, x22, #0x20\n"
+ "ldr q28, [x21, x27]\n"
+ "ldr q22, [x20, x27]\n"
"beq 10f\n"
"9:" // Single vector of channels: Loop: 4 inputs loop
- "smax v23.16b, v3.16b, v2.16b\n"
- "ldp x23, x22, [x19, #0x0]\n"
- "subs x24, x24, #0x1\n"
- "smax v19.16b, v1.16b, v0.16b\n"
- "ldp x21, x20, [x19, #0x10]\n"
- "add x19, x19, #0x20\n"
- "smax v19.16b, v23.16b, v19.16b\n"
- "ldr q3, [x23, x28]\n"
- "ldr q2, [x22, x28]\n"
- "smax v8.16b, v8.16b, v19.16b\n"
- "ldr q1, [x21, x28]\n"
- "ldr q0, [x20, x28]\n"
+ "smax v17.16b, v4.16b, v3.16b\n"
+ "smax v16.16b, v28.16b, v22.16b\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q4, [x21, x27]\n"
+ "ldr q3, [x20, x27]\n"
+ "smax v16.16b, v17.16b, v16.16b\n"
+ "ldp x21, x20, [x22, #0x10]\n"
+ "subs x25, x25, #0x1\n"
+ "ldr q28, [x21, x27]\n"
+ "ldr q22, [x20, x27]\n"
+ "smax v8.16b, v8.16b, v16.16b\n"
+ "add x22, x22, #0x20\n"
"bgt 9b\n"
"10:" // Single vector of channels: Loop: 4 inputs tail
- "smax v23.16b, v3.16b, v2.16b\n"
- "smax v19.16b, v1.16b, v0.16b\n"
- "smax v19.16b, v23.16b, v19.16b\n"
- "smax v8.16b, v8.16b, v19.16b\n"
+ "smax v17.16b, v4.16b, v3.16b\n"
+ "smax v16.16b, v28.16b, v22.16b\n"
+ "smax v16.16b, v17.16b, v16.16b\n"
+ "smax v8.16b, v8.16b, v16.16b\n"
"11:" // Single vector of channels: Loop: After loop
- "ands x20, %x[n_valid_cells], #0x3\n"
+ "ands x21, %x[n_valid_cells], #0x3\n"
"beq 13f\n"
"12:" // Single vector of channels: Loop: Single input loop
- "ldr x23, [x19], #0x8\n"
- "subs x20, x20, #0x1\n"
- "ldr q3, [x23, x28]\n"
- "smax v8.16b, v8.16b, v3.16b\n"
+ "ldr x20, [x22], #0x8\n"
+ "ldr q16, [x20, x27]\n"
+ "subs x21, x21, #0x1\n"
+ "smax v8.16b, v8.16b, v16.16b\n"
"bgt 12b\n"
"13:" // Single vector of channels: Loop: Single input loop: End
- "sxtl v23.8h, v8.8b\n"
- "add x19, %x[quant_params], %[offsetof_qp_per_layer_mul]\n"
- "ld1r { v4.4s }, [x19]\n"
- "sxtl2 v22.8h, v8.16b\n"
- "add x19, %x[quant_params], %[offsetof_qp_per_layer_left_shift]\n"
- "movi v17.4s, #0x7f\n"
- "ld1r { v3.4s }, [x19]\n"
- "add x19, %x[quant_params], %[offsetof_qp_per_layer_right_shift]\n"
- "sxtl v1.4s, v23.4h\n"
- "ld1r { v2.4s }, [x19]\n"
- "not v16.16b, v17.16b\n"
- "sxtl2 v23.4s, v23.8h\n"
+ "sxtl v17.8h, v8.8b\n"
+ "sxtl2 v16.8h, v8.16b\n"
+ "add x20, %x[quant_params], %[offsetof_qp_per_layer_left_shift]\n"
+ "ld1r { v22.4s }, [x20]\n"
+ "sxtl v21.4s, v17.4h\n"
+ "sxtl2 v20.4s, v17.8h\n"
+ "add x20, %x[quant_params], %[offsetof_qp_per_layer_mul]\n"
+ "ld1r { v17.4s }, [x20]\n"
+ "sxtl v19.4s, v16.4h\n"
+ "sxtl2 v18.4s, v16.8h\n"
+ "add x20, %x[quant_params], %[offsetof_qp_per_layer_right_shift]\n"
+ "ld1r { v16.4s }, [x20]\n"
+ "srshl v21.4s, v21.4s, v22.4s\n"
+ "srshl v20.4s, v20.4s, v22.4s\n"
"sub %x[n_channels], %x[n_channels], #0x10\n"
- "sxtl v0.4s, v22.4h\n"
"cmp %x[n_channels], #0x10\n"
- "sxtl2 v31.4s, v22.8h\n"
- "srshl v1.4s, v1.4s, v3.4s\n"
- "srshl v23.4s, v23.4s, v3.4s\n"
- "srshl v0.4s, v0.4s, v3.4s\n"
- "srshl v31.4s, v31.4s, v3.4s\n"
- "sqrdmulh v1.4s, v1.4s, v4.4s\n"
- "sqrdmulh v23.4s, v23.4s, v4.4s\n"
- "sqrdmulh v0.4s, v0.4s, v4.4s\n"
- "sqrdmulh v31.4s, v31.4s, v4.4s\n"
- "srshl v1.4s, v1.4s, v2.4s\n"
- "srshl v23.4s, v23.4s, v2.4s\n"
- "srshl v0.4s, v0.4s, v2.4s\n"
- "srshl v31.4s, v31.4s, v2.4s\n"
- "smax v1.4s, v1.4s, v16.4s\n"
- "smax v23.4s, v23.4s, v16.4s\n"
- "smax v0.4s, v0.4s, v16.4s\n"
- "smax v31.4s, v31.4s, v16.4s\n"
- "smin v1.4s, v1.4s, v17.4s\n"
- "smin v23.4s, v23.4s, v17.4s\n"
- "smin v0.4s, v0.4s, v17.4s\n"
- "smin v31.4s, v31.4s, v17.4s\n"
- "uzp1 v23.16b, v1.16b, v23.16b\n"
- "uzp1 v16.16b, v0.16b, v31.16b\n"
- "uzp1 v16.16b, v23.16b, v16.16b\n"
- "str q16, [%x[outptr], x28]\n"
- "add x28, x28, #0x10\n"
+ "srshl v19.4s, v19.4s, v22.4s\n"
+ "srshl v18.4s, v18.4s, v22.4s\n"
+ "sqrdmulh v21.4s, v21.4s, v17.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v17.4s\n"
+ "sqrdmulh v19.4s, v19.4s, v17.4s\n"
+ "sqrdmulh v18.4s, v18.4s, v17.4s\n"
+ "movi v17.4s, #0x7f\n"
+ "srshl v21.4s, v21.4s, v16.4s\n"
+ "srshl v20.4s, v20.4s, v16.4s\n"
+ "srshl v19.4s, v19.4s, v16.4s\n"
+ "srshl v18.4s, v18.4s, v16.4s\n"
+ "not v16.16b, v17.16b\n"
+ "smax v21.4s, v21.4s, v16.4s\n"
+ "smax v20.4s, v20.4s, v16.4s\n"
+ "smax v19.4s, v19.4s, v16.4s\n"
+ "smax v18.4s, v18.4s, v16.4s\n"
+ "smin v21.4s, v21.4s, v17.4s\n"
+ "smin v20.4s, v20.4s, v17.4s\n"
+ "smin v19.4s, v19.4s, v17.4s\n"
+ "smin v18.4s, v18.4s, v17.4s\n"
+ "uzp1 v17.16b, v21.16b, v20.16b\n"
+ "uzp1 v16.16b, v19.16b, v18.16b\n"
+ "uzp1 v16.16b, v17.16b, v16.16b\n"
+ "str q16, [%x[outptr], x27]\n"
+ "add x27, x27, #0x10\n"
"bge 8b\n"
"cbz %x[n_channels], 43f\n"
"14:" // Oddments
+ "lsr x25, %x[n_valid_cells], #0x2\n"
+ "add %x[outptr], %x[outptr], x27\n"
"movi v8.16b, #0x80\n"
- "add %x[outptr], %x[outptr], x28\n"
- "mov x19, %x[inptrs]\n"
- "lsr x24, %x[n_valid_cells], #0x2\n"
- "cbz x24, 24f\n"
+ "mov x24, %x[inptrs]\n"
+ "cbz x25, 24f\n"
"15:" // Oddments: 4 inputs loop
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x24, x24, #0x20\n"
+ "add x23, x23, x27\n"
+ "add x22, x22, x27\n"
+ "add x21, x21, x27\n"
+ "movi v4.16b, #0x0\n"
"movi v3.16b, #0x0\n"
- "ldp x23, x22, [x19, #0x0]\n"
- "add x23, x23, x28\n"
- "movi v2.16b, #0x0\n"
- "ldp x21, x20, [x19, #0x10]\n"
- "movi v1.16b, #0x0\n"
- "add x19, x19, #0x20\n"
- "movi v0.16b, #0x0\n"
- "add x22, x22, x28\n"
- "add x21, x21, x28\n"
- "add x20, x20, x28\n"
+ "add x20, x20, x27\n"
+ "movi v28.16b, #0x0\n"
+ "movi v22.16b, #0x0\n"
"tbz %x[n_channels], #3, 19f\n"
- "ldr d3, [x23], #0x8\n"
- "ldr d2, [x22], #0x8\n"
- "ldr d1, [x21], #0x8\n"
- "ldr d0, [x20], #0x8\n"
+ "ldr d4, [x23], #0x8\n"
+ "ldr d3, [x22], #0x8\n"
+ "ldr d28, [x21], #0x8\n"
+ "ldr d22, [x20], #0x8\n"
"tbz %x[n_channels], #2, 17f\n"
- "ld1 { v3.s }[2], [x23], #0x4\n"
- "ld1 { v2.s }[2], [x22], #0x4\n"
- "ld1 { v1.s }[2], [x21], #0x4\n"
- "ld1 { v0.s }[2], [x20], #0x4\n"
+ "ld1 { v4.s }[2], [x23], #0x4\n"
+ "ld1 { v3.s }[2], [x22], #0x4\n"
+ "ld1 { v28.s }[2], [x21], #0x4\n"
+ "ld1 { v22.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #1, 16f\n"
- "ld1 { v3.h }[6], [x23], #0x2\n"
- "ld1 { v2.h }[6], [x22], #0x2\n"
- "ld1 { v1.h }[6], [x21], #0x2\n"
- "ld1 { v0.h }[6], [x20], #0x2\n"
+ "ld1 { v4.h }[6], [x23], #0x2\n"
+ "ld1 { v3.h }[6], [x22], #0x2\n"
+ "ld1 { v28.h }[6], [x21], #0x2\n"
+ "ld1 { v22.h }[6], [x20], #0x2\n"
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v3.b }[14], [x23], #0x1\n"
- "ld1 { v2.b }[14], [x22], #0x1\n"
- "ld1 { v1.b }[14], [x21], #0x1\n"
- "ld1 { v0.b }[14], [x20], #0x1\n"
+ "ld1 { v4.b }[14], [x23], #0x1\n"
+ "ld1 { v3.b }[14], [x22], #0x1\n"
+ "ld1 { v28.b }[14], [x21], #0x1\n"
+ "ld1 { v22.b }[14], [x20], #0x1\n"
"b 23f\n"
"16:" // Oddments: 4 inputs loop: Load: Bit 3: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v3.b }[12], [x23], #0x1\n"
- "ld1 { v2.b }[12], [x22], #0x1\n"
- "ld1 { v1.b }[12], [x21], #0x1\n"
- "ld1 { v0.b }[12], [x20], #0x1\n"
+ "ld1 { v4.b }[12], [x23], #0x1\n"
+ "ld1 { v3.b }[12], [x22], #0x1\n"
+ "ld1 { v28.b }[12], [x21], #0x1\n"
+ "ld1 { v22.b }[12], [x20], #0x1\n"
"b 23f\n"
"17:" // Oddments: 4 inputs loop: Load: Bit 3: Bit 2: Unset
"tbz %x[n_channels], #1, 18f\n"
- "ld1 { v3.h }[4], [x23], #0x2\n"
- "ld1 { v2.h }[4], [x22], #0x2\n"
- "ld1 { v1.h }[4], [x21], #0x2\n"
- "ld1 { v0.h }[4], [x20], #0x2\n"
+ "ld1 { v4.h }[4], [x23], #0x2\n"
+ "ld1 { v3.h }[4], [x22], #0x2\n"
+ "ld1 { v28.h }[4], [x21], #0x2\n"
+ "ld1 { v22.h }[4], [x20], #0x2\n"
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v3.b }[10], [x23], #0x1\n"
- "ld1 { v2.b }[10], [x22], #0x1\n"
- "ld1 { v1.b }[10], [x21], #0x1\n"
- "ld1 { v0.b }[10], [x20], #0x1\n"
+ "ld1 { v4.b }[10], [x23], #0x1\n"
+ "ld1 { v3.b }[10], [x22], #0x1\n"
+ "ld1 { v28.b }[10], [x21], #0x1\n"
+ "ld1 { v22.b }[10], [x20], #0x1\n"
"b 23f\n"
"18:" // Oddments: 4 inputs loop: Load: Bit 3: Bit 2: Unset: Bit 1: Unset
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v3.b }[8], [x23], #0x1\n"
- "ld1 { v2.b }[8], [x22], #0x1\n"
- "ld1 { v1.b }[8], [x21], #0x1\n"
- "ld1 { v0.b }[8], [x20], #0x1\n"
+ "ld1 { v4.b }[8], [x23], #0x1\n"
+ "ld1 { v3.b }[8], [x22], #0x1\n"
+ "ld1 { v28.b }[8], [x21], #0x1\n"
+ "ld1 { v22.b }[8], [x20], #0x1\n"
"b 23f\n"
"19:" // Oddments: 4 inputs loop: Load: Bit 3: Unset
"tbz %x[n_channels], #2, 21f\n"
- "ldr s3, [x23], #0x4\n"
- "ldr s2, [x22], #0x4\n"
- "ldr s1, [x21], #0x4\n"
- "ldr s0, [x20], #0x4\n"
+ "ldr s4, [x23], #0x4\n"
+ "ldr s3, [x22], #0x4\n"
+ "ldr s28, [x21], #0x4\n"
+ "ldr s22, [x20], #0x4\n"
"tbz %x[n_channels], #1, 20f\n"
- "ld1 { v3.h }[2], [x23], #0x2\n"
- "ld1 { v2.h }[2], [x22], #0x2\n"
- "ld1 { v1.h }[2], [x21], #0x2\n"
- "ld1 { v0.h }[2], [x20], #0x2\n"
+ "ld1 { v4.h }[2], [x23], #0x2\n"
+ "ld1 { v3.h }[2], [x22], #0x2\n"
+ "ld1 { v28.h }[2], [x21], #0x2\n"
+ "ld1 { v22.h }[2], [x20], #0x2\n"
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v3.b }[6], [x23], #0x1\n"
- "ld1 { v2.b }[6], [x22], #0x1\n"
- "ld1 { v1.b }[6], [x21], #0x1\n"
- "ld1 { v0.b }[6], [x20], #0x1\n"
+ "ld1 { v4.b }[6], [x23], #0x1\n"
+ "ld1 { v3.b }[6], [x22], #0x1\n"
+ "ld1 { v28.b }[6], [x21], #0x1\n"
+ "ld1 { v22.b }[6], [x20], #0x1\n"
"b 23f\n"
"20:" // Oddments: 4 inputs loop: Load: Bit 3: Unset: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v3.b }[4], [x23], #0x1\n"
- "ld1 { v2.b }[4], [x22], #0x1\n"
- "ld1 { v1.b }[4], [x21], #0x1\n"
- "ld1 { v0.b }[4], [x20], #0x1\n"
+ "ld1 { v4.b }[4], [x23], #0x1\n"
+ "ld1 { v3.b }[4], [x22], #0x1\n"
+ "ld1 { v28.b }[4], [x21], #0x1\n"
+ "ld1 { v22.b }[4], [x20], #0x1\n"
"b 23f\n"
"21:" // Oddments: 4 inputs loop: Load: Bit 3: Unset: Bit 2: Unset
"tbz %x[n_channels], #1, 22f\n"
- "ldr h3, [x23], #0x2\n"
- "ldr h2, [x22], #0x2\n"
- "ldr h1, [x21], #0x2\n"
- "ldr h0, [x20], #0x2\n"
+ "ldr h4, [x23], #0x2\n"
+ "ldr h3, [x22], #0x2\n"
+ "ldr h28, [x21], #0x2\n"
+ "ldr h22, [x20], #0x2\n"
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v3.b }[2], [x23], #0x1\n"
- "ld1 { v2.b }[2], [x22], #0x1\n"
- "ld1 { v1.b }[2], [x21], #0x1\n"
- "ld1 { v0.b }[2], [x20], #0x1\n"
+ "ld1 { v4.b }[2], [x23], #0x1\n"
+ "ld1 { v3.b }[2], [x22], #0x1\n"
+ "ld1 { v28.b }[2], [x21], #0x1\n"
+ "ld1 { v22.b }[2], [x20], #0x1\n"
"b 23f\n"
"22:" // Oddments: 4 inputs loop: Load: Bit 3: Unset: Bit 2: Unset: Bit 1: Unset
"tbz %x[n_channels], #0, 23f\n"
- "ldr b3, [x23], #0x1\n"
- "ldr b2, [x22], #0x1\n"
- "ldr b1, [x21], #0x1\n"
- "ldr b0, [x20], #0x1\n"
+ "ldr b4, [x23], #0x1\n"
+ "ldr b3, [x22], #0x1\n"
+ "ldr b28, [x21], #0x1\n"
+ "ldr b22, [x20], #0x1\n"
"23:" // Oddments: 4 inputs loop: Load: Bit 3: End
- "smax v23.16b, v3.16b, v2.16b\n"
- "subs x24, x24, #0x1\n"
- "smax v19.16b, v1.16b, v0.16b\n"
- "smax v19.16b, v23.16b, v19.16b\n"
- "smax v8.16b, v8.16b, v19.16b\n"
+ "smax v17.16b, v4.16b, v3.16b\n"
+ "smax v16.16b, v28.16b, v22.16b\n"
+ "subs x25, x25, #0x1\n"
+ "smax v16.16b, v17.16b, v16.16b\n"
+ "smax v8.16b, v8.16b, v16.16b\n"
"bgt 15b\n"
"24:" // Oddments: After loop
- "ands x20, %x[n_valid_cells], #0x3\n"
+ "ands x21, %x[n_valid_cells], #0x3\n"
"beq 34f\n"
"25:" // Oddments: Single input loop
- "movi v3.16b, #0x0\n"
- "ldr x23, [x19], #0x8\n"
- "add x23, x23, x28\n"
+ "ldr x23, [x24], #0x8\n"
+ "add x23, x23, x27\n"
+ "movi v4.16b, #0x0\n"
"tbz %x[n_channels], #3, 29f\n"
- "ldr d3, [x23], #0x8\n"
+ "ldr d4, [x23], #0x8\n"
"tbz %x[n_channels], #2, 27f\n"
- "ld1 { v3.s }[2], [x23], #0x4\n"
+ "ld1 { v4.s }[2], [x23], #0x4\n"
"tbz %x[n_channels], #1, 26f\n"
- "ld1 { v3.h }[6], [x23], #0x2\n"
+ "ld1 { v4.h }[6], [x23], #0x2\n"
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v3.b }[14], [x23], #0x1\n"
+ "ld1 { v4.b }[14], [x23], #0x1\n"
"b 33f\n"
"26:" // Oddments: Single input loop: Load: Bit 3: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v3.b }[12], [x23], #0x1\n"
+ "ld1 { v4.b }[12], [x23], #0x1\n"
"b 33f\n"
"27:" // Oddments: Single input loop: Load: Bit 3: Bit 2: Unset
"tbz %x[n_channels], #1, 28f\n"
- "ld1 { v3.h }[4], [x23], #0x2\n"
+ "ld1 { v4.h }[4], [x23], #0x2\n"
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v3.b }[10], [x23], #0x1\n"
+ "ld1 { v4.b }[10], [x23], #0x1\n"
"b 33f\n"
"28:" // Oddments: Single input loop: Load: Bit 3: Bit 2: Unset: Bit 1: Unset
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v3.b }[8], [x23], #0x1\n"
+ "ld1 { v4.b }[8], [x23], #0x1\n"
"b 33f\n"
"29:" // Oddments: Single input loop: Load: Bit 3: Unset
"tbz %x[n_channels], #2, 31f\n"
- "ldr s3, [x23], #0x4\n"
+ "ldr s4, [x23], #0x4\n"
"tbz %x[n_channels], #1, 30f\n"
- "ld1 { v3.h }[2], [x23], #0x2\n"
+ "ld1 { v4.h }[2], [x23], #0x2\n"
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v3.b }[6], [x23], #0x1\n"
+ "ld1 { v4.b }[6], [x23], #0x1\n"
"b 33f\n"
"30:" // Oddments: Single input loop: Load: Bit 3: Unset: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v3.b }[4], [x23], #0x1\n"
+ "ld1 { v4.b }[4], [x23], #0x1\n"
"b 33f\n"
"31:" // Oddments: Single input loop: Load: Bit 3: Unset: Bit 2: Unset
"tbz %x[n_channels], #1, 32f\n"
- "ldr h3, [x23], #0x2\n"
+ "ldr h4, [x23], #0x2\n"
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v3.b }[2], [x23], #0x1\n"
+ "ld1 { v4.b }[2], [x23], #0x1\n"
"b 33f\n"
"32:" // Oddments: Single input loop: Load: Bit 3: Unset: Bit 2: Unset: Bit 1: Unset
"tbz %x[n_channels], #0, 33f\n"
- "ldr b3, [x23], #0x1\n"
+ "ldr b4, [x23], #0x1\n"
"33:" // Oddments: Single input loop: Load: Bit 3: End
- "smax v8.16b, v8.16b, v3.16b\n"
- "subs x20, x20, #0x1\n"
+ "subs x21, x21, #0x1\n"
+ "smax v8.16b, v8.16b, v4.16b\n"
"bgt 25b\n"
"34:" // Oddments: Single input loop: End
- "sxtl v23.8h, v8.8b\n"
- "add x19, %x[quant_params], %[offsetof_qp_per_layer_mul]\n"
- "ld1r { v4.4s }, [x19]\n"
- "sxtl2 v22.8h, v8.16b\n"
- "add x19, %x[quant_params], %[offsetof_qp_per_layer_left_shift]\n"
+ "sxtl v17.8h, v8.8b\n"
+ "sxtl2 v16.8h, v8.16b\n"
+ "add x20, %x[quant_params], %[offsetof_qp_per_layer_left_shift]\n"
+ "ld1r { v22.4s }, [x20]\n"
+ "sxtl v21.4s, v17.4h\n"
+ "sxtl2 v20.4s, v17.8h\n"
+ "add x20, %x[quant_params], %[offsetof_qp_per_layer_mul]\n"
+ "ld1r { v17.4s }, [x20]\n"
+ "sxtl v19.4s, v16.4h\n"
+ "sxtl2 v18.4s, v16.8h\n"
+ "add x20, %x[quant_params], %[offsetof_qp_per_layer_right_shift]\n"
+ "ld1r { v16.4s }, [x20]\n"
+ "srshl v21.4s, v21.4s, v22.4s\n"
+ "srshl v20.4s, v20.4s, v22.4s\n"
+ "srshl v19.4s, v19.4s, v22.4s\n"
+ "srshl v18.4s, v18.4s, v22.4s\n"
+ "sqrdmulh v21.4s, v21.4s, v17.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v17.4s\n"
+ "sqrdmulh v19.4s, v19.4s, v17.4s\n"
+ "sqrdmulh v18.4s, v18.4s, v17.4s\n"
"movi v17.4s, #0x7f\n"
- "ld1r { v3.4s }, [x19]\n"
- "add x19, %x[quant_params], %[offsetof_qp_per_layer_right_shift]\n"
- "sxtl v1.4s, v23.4h\n"
- "ld1r { v2.4s }, [x19]\n"
+ "srshl v21.4s, v21.4s, v16.4s\n"
+ "srshl v20.4s, v20.4s, v16.4s\n"
+ "srshl v19.4s, v19.4s, v16.4s\n"
+ "srshl v18.4s, v18.4s, v16.4s\n"
"not v16.16b, v17.16b\n"
- "sxtl2 v23.4s, v23.8h\n"
- "sxtl v0.4s, v22.4h\n"
- "sxtl2 v31.4s, v22.8h\n"
- "srshl v1.4s, v1.4s, v3.4s\n"
- "srshl v23.4s, v23.4s, v3.4s\n"
- "srshl v0.4s, v0.4s, v3.4s\n"
- "srshl v31.4s, v31.4s, v3.4s\n"
- "sqrdmulh v1.4s, v1.4s, v4.4s\n"
- "sqrdmulh v23.4s, v23.4s, v4.4s\n"
- "sqrdmulh v0.4s, v0.4s, v4.4s\n"
- "sqrdmulh v31.4s, v31.4s, v4.4s\n"
- "srshl v1.4s, v1.4s, v2.4s\n"
- "srshl v23.4s, v23.4s, v2.4s\n"
- "srshl v0.4s, v0.4s, v2.4s\n"
- "srshl v31.4s, v31.4s, v2.4s\n"
- "smax v1.4s, v1.4s, v16.4s\n"
- "smax v23.4s, v23.4s, v16.4s\n"
- "smax v0.4s, v0.4s, v16.4s\n"
- "smax v31.4s, v31.4s, v16.4s\n"
- "smin v1.4s, v1.4s, v17.4s\n"
- "smin v23.4s, v23.4s, v17.4s\n"
- "smin v0.4s, v0.4s, v17.4s\n"
- "smin v31.4s, v31.4s, v17.4s\n"
- "uzp1 v23.16b, v1.16b, v23.16b\n"
- "uzp1 v16.16b, v0.16b, v31.16b\n"
- "uzp1 v16.16b, v23.16b, v16.16b\n"
+ "smax v21.4s, v21.4s, v16.4s\n"
+ "smax v20.4s, v20.4s, v16.4s\n"
+ "smax v19.4s, v19.4s, v16.4s\n"
+ "smax v18.4s, v18.4s, v16.4s\n"
+ "smin v21.4s, v21.4s, v17.4s\n"
+ "smin v20.4s, v20.4s, v17.4s\n"
+ "smin v19.4s, v19.4s, v17.4s\n"
+ "smin v18.4s, v18.4s, v17.4s\n"
+ "uzp1 v17.16b, v21.16b, v20.16b\n"
+ "uzp1 v16.16b, v19.16b, v18.16b\n"
+ "uzp1 v16.16b, v17.16b, v16.16b\n"
"tbz %x[n_channels], #3, 38f\n"
"st1 { v16.d }[0], [%x[outptr]], #0x8\n"
"tbz %x[n_channels], #2, 36f\n"
@@ -625,12 +625,10 @@ void a64_s8q_nhwc_max_generic_depthfirst_impl(
"tbz %x[n_channels], #0, 42f\n"
"st1 { v16.b }[0], [%x[outptr]], #0x1\n"
"42:" // Oddments: Store: Bit 3: End
-
"43:" // End
-
: [n_channels] "+&r" (n_channels), [outptr] "+&r" (outptr)
: [inptrs] "r" (inptrs), [n_valid_cells] "r" (n_valid_cells), [offsetof_qp_per_layer_left_shift] "I" (offsetof(Requantize32, per_layer_left_shift)), [offsetof_qp_per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [offsetof_qp_per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [quant_params] "r" (&qp)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8_nhwc_avg_generic_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8_nhwc_avg_generic_depthfirst.hpp
index 230952452b..97818595e8 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8_nhwc_avg_generic_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8_nhwc_avg_generic_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -33,19 +33,11 @@ namespace pooling {
void a64_u8_nhwc_avg_generic_depthfirst_impl(const uint64_t window_cells, const uint64_t n_valid_cells, uint64_t n_channels, const uint8_t *const *const inptrs, uint8_t *outptr);
-struct a64_u8_nhwc_avg_generic_depthfirst
+struct a64_u8_nhwc_avg_generic_depthfirst : IGenericDepthfirstStrategy<uint8_t, uint8_t>
{
- typedef uint8_t operand_type;
- typedef uint8_t return_type;
-
- typedef void (*kern_type)(const uint64_t window_cells, const uint64_t n_valid_cells, uint64_t n_channels, const uint8_t *const *const inptrs, uint8_t *outptr);
-
- constexpr static PoolingType pooling_type(void) { return PoolingType::AVERAGE; }
-
-
- kern_type kernel = a64_u8_nhwc_avg_generic_depthfirst_impl;
-
+ using Parent = IGenericDepthfirstStrategy<uint8_t, uint8_t>;
a64_u8_nhwc_avg_generic_depthfirst(const CPUInfo *) {}
+ typename Parent::KernelType get_kernel(void) const override { return a64_u8_nhwc_avg_generic_depthfirst_impl; }
};
} // namespace pooling
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8_nhwc_avg_generic_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8_nhwc_avg_generic_depthfirst/generic.cpp
index 2c8a29248d..f8984c451c 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8_nhwc_avg_generic_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8_nhwc_avg_generic_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,6 +23,7 @@
*/
#include <cstdint>
+#include <cstddef>
#include <cstring>
#include <cmath>
@@ -84,26 +85,27 @@ void a64_u8_nhwc_avg_generic_depthfirst_impl(
f_rescale_value *= 2.0f;
}
- rescale_value = static_cast<int32_t>(round(f_rescale_value * static_cast<float>(1ll << 31)));
- if (static_cast<int64_t>(rescale_value) == (1ll << 31))
+ int64_t long_rescale_value = round(f_rescale_value * static_cast<float>(1ll << 31));
+ if (long_rescale_value == (1ll << 31))
{
shift_value++;
- rescale_value >>= 1;
+ long_rescale_value >>= 1;
}
+ rescale_value = static_cast<int32_t>(long_rescale_value);
}
__asm__ __volatile__(
- "mov x26, #0x0\n"
- "mov x25, #0x10\n" // cntb _, ALL, #1
- "mov x24, #0x20\n" // cntb _, ALL, #2
- "mov x23, #0x30\n" // cntb _, ALL, #3
"cmp %x[n_channels], #0x40\n"
+ "mov x27, #0x0\n"
+ "mov x26, #0x10\n" // cntb _, ALL, #1
+ "mov x25, #0x20\n" // cntb _, ALL, #2
+ "mov x24, #0x30\n" // cntb _, ALL, #3
"blt 7f\n"
"1:" // 4-vectors of channels
+ "lsr x23, %x[n_valid_cells], #0x1\n"
"movi v15.4s, #0x0\n"
- "mov x19, %x[inptrs]\n"
"movi v14.4s, #0x0\n"
- "lsr x22, %x[n_valid_cells], #0x1\n"
+ "mov x22, %x[inptrs]\n"
"movi v13.4s, #0x0\n"
"movi v12.4s, #0x0\n"
"movi v11.4s, #0x0\n"
@@ -118,43 +120,43 @@ void a64_u8_nhwc_avg_generic_depthfirst_impl(
"movi v2.4s, #0x0\n"
"movi v1.4s, #0x0\n"
"movi v0.4s, #0x0\n"
- "cbz x22, 4f\n"
- "ldp x21, x20, [x19, #0x0]\n"
- "ldr q31, [x21, x26]\n"
- "add x19, x19, #0x10\n"
- "ldr q30, [x20, x26]\n"
- "subs x22, x22, #0x1\n"
- "ldr q29, [x21, x25]\n"
- "ldr q28, [x20, x25]\n"
- "ldr q27, [x21, x24]\n"
- "ldr q26, [x20, x24]\n"
- "ldr q25, [x21, x23]\n"
- "ldr q24, [x20, x23]\n"
+ "cbz x23, 4f\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q31, [x21, x27]\n"
+ "subs x23, x23, #0x1\n"
+ "add x22, x22, #0x10\n"
+ "ldr q30, [x20, x27]\n"
+ "ldr q29, [x21, x26]\n"
+ "ldr q28, [x20, x26]\n"
+ "ldr q27, [x21, x25]\n"
+ "ldr q26, [x20, x25]\n"
+ "ldr q25, [x21, x24]\n"
+ "ldr q24, [x20, x24]\n"
"beq 3f\n"
"2:" // 4-vectors of channels: 2 inputs loop
"uaddl v23.8h, v31.8b, v30.8b\n"
- "ldp x21, x20, [x19, #0x0]\n"
- "add x19, x19, #0x10\n"
"uaddl2 v22.8h, v31.16b, v30.16b\n"
- "ldr q31, [x21, x26]\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q31, [x21, x27]\n"
+ "ldr q30, [x20, x27]\n"
"uaddl v21.8h, v29.8b, v28.8b\n"
- "subs x22, x22, #0x1\n"
"uaddl2 v20.8h, v29.16b, v28.16b\n"
- "ldr q30, [x20, x26]\n"
+ "ldr q29, [x21, x26]\n"
+ "ldr q28, [x20, x26]\n"
"uaddl v19.8h, v27.8b, v26.8b\n"
- "ldr q29, [x21, x25]\n"
"uaddl2 v18.8h, v27.16b, v26.16b\n"
- "ldr q28, [x20, x25]\n"
+ "ldr q27, [x21, x25]\n"
+ "ldr q26, [x20, x25]\n"
"uaddl v17.8h, v25.8b, v24.8b\n"
- "ldr q27, [x21, x24]\n"
"uaddl2 v16.8h, v25.16b, v24.16b\n"
- "ldr q26, [x20, x24]\n"
+ "ldr q25, [x21, x24]\n"
+ "ldr q24, [x20, x24]\n"
+ "subs x23, x23, #0x1\n"
"uaddw v15.4s, v15.4s, v23.4h\n"
- "ldr q25, [x21, x23]\n"
"uaddw2 v14.4s, v14.4s, v23.8h\n"
- "ldr q24, [x20, x23]\n"
"uaddw v13.4s, v13.4s, v22.4h\n"
"uaddw2 v12.4s, v12.4s, v22.8h\n"
+ "add x22, x22, #0x10\n"
"uaddw v11.4s, v11.4s, v21.4h\n"
"uaddw2 v10.4s, v10.4s, v21.8h\n"
"uaddw v9.4s, v9.4s, v20.4h\n"
@@ -194,23 +196,23 @@ void a64_u8_nhwc_avg_generic_depthfirst_impl(
"uaddw v1.4s, v1.4s, v16.4h\n"
"uaddw2 v0.4s, v0.4s, v16.8h\n"
"4:" // 4-vectors of channels: After loop
- "ands x20, %x[n_valid_cells], #0x1\n"
+ "ands x23, %x[n_valid_cells], #0x1\n"
"beq 6f\n"
"5:" // 4-vectors of channels: Single input loop
- "ldr x21, [x19], #0x8\n"
- "subs x20, x20, #0x1\n"
- "ldr q31, [x21, x26]\n"
- "uxtl v23.8h, v31.8b\n"
- "ldr q29, [x21, x25]\n"
- "uxtl2 v22.8h, v31.16b\n"
- "ldr q27, [x21, x24]\n"
- "ldr q25, [x21, x23]\n"
- "uxtl v21.8h, v29.8b\n"
- "uxtl2 v20.8h, v29.16b\n"
- "uxtl v19.8h, v27.8b\n"
- "uxtl2 v18.8h, v27.16b\n"
- "uxtl v17.8h, v25.8b\n"
- "uxtl2 v16.8h, v25.16b\n"
+ "ldr x20, [x22], #0x8\n"
+ "ldr q16, [x20, x27]\n"
+ "uxtl v23.8h, v16.8b\n"
+ "uxtl2 v22.8h, v16.16b\n"
+ "ldr q16, [x20, x26]\n"
+ "ldr q17, [x20, x25]\n"
+ "uxtl v21.8h, v16.8b\n"
+ "uxtl2 v20.8h, v16.16b\n"
+ "ldr q16, [x20, x24]\n"
+ "uxtl v19.8h, v17.8b\n"
+ "uxtl2 v18.8h, v17.16b\n"
+ "subs x23, x23, #0x1\n"
+ "uxtl v17.8h, v16.8b\n"
+ "uxtl2 v16.8h, v16.16b\n"
"uaddw v15.4s, v15.4s, v23.4h\n"
"uaddw2 v14.4s, v14.4s, v23.8h\n"
"uaddw v13.4s, v13.4s, v22.4h\n"
@@ -229,195 +231,195 @@ void a64_u8_nhwc_avg_generic_depthfirst_impl(
"uaddw2 v0.4s, v0.4s, v16.8h\n"
"bgt 5b\n"
"6:" // 4-vectors of channels: Single input loop: End
- "movi v19.4s, #0x0\n"
- "ld1r { v18.4s }, [%x[rescale_ptr]]\n"
- "sub %x[n_channels], %x[n_channels], #0x40\n"
- "movi v17.4s, #0xff\n"
+ "ld1r { v17.4s }, [%x[rescale_ptr]]\n"
"ld1r { v16.4s }, [%x[shift_ptr]]\n"
+ "sqdmulh v15.4s, v15.4s, v17.4s\n"
+ "sqdmulh v14.4s, v14.4s, v17.4s\n"
+ "sqdmulh v13.4s, v13.4s, v17.4s\n"
+ "sqdmulh v12.4s, v12.4s, v17.4s\n"
+ "sub %x[n_channels], %x[n_channels], #0x40\n"
"cmp %x[n_channels], #0x40\n"
- "sqdmulh v15.4s, v15.4s, v18.4s\n"
- "sqdmulh v14.4s, v14.4s, v18.4s\n"
- "sqdmulh v13.4s, v13.4s, v18.4s\n"
- "sqdmulh v12.4s, v12.4s, v18.4s\n"
- "sqdmulh v11.4s, v11.4s, v18.4s\n"
+ "sqdmulh v11.4s, v11.4s, v17.4s\n"
+ "sqdmulh v10.4s, v10.4s, v17.4s\n"
+ "sqdmulh v9.4s, v9.4s, v17.4s\n"
+ "sqdmulh v8.4s, v8.4s, v17.4s\n"
+ "sqdmulh v7.4s, v7.4s, v17.4s\n"
+ "sqdmulh v6.4s, v6.4s, v17.4s\n"
+ "sqdmulh v5.4s, v5.4s, v17.4s\n"
+ "sqdmulh v4.4s, v4.4s, v17.4s\n"
+ "sqdmulh v3.4s, v3.4s, v17.4s\n"
+ "sqdmulh v2.4s, v2.4s, v17.4s\n"
+ "sqdmulh v1.4s, v1.4s, v17.4s\n"
+ "sqdmulh v0.4s, v0.4s, v17.4s\n"
"srshl v15.4s, v15.4s, v16.4s\n"
"srshl v14.4s, v14.4s, v16.4s\n"
"srshl v13.4s, v13.4s, v16.4s\n"
"srshl v12.4s, v12.4s, v16.4s\n"
"srshl v11.4s, v11.4s, v16.4s\n"
- "sqdmulh v10.4s, v10.4s, v18.4s\n"
- "sqdmulh v9.4s, v9.4s, v18.4s\n"
- "sqdmulh v8.4s, v8.4s, v18.4s\n"
- "sqdmulh v7.4s, v7.4s, v18.4s\n"
"srshl v10.4s, v10.4s, v16.4s\n"
"srshl v9.4s, v9.4s, v16.4s\n"
"srshl v8.4s, v8.4s, v16.4s\n"
"srshl v7.4s, v7.4s, v16.4s\n"
- "sqdmulh v6.4s, v6.4s, v18.4s\n"
- "sqdmulh v5.4s, v5.4s, v18.4s\n"
- "sqdmulh v4.4s, v4.4s, v18.4s\n"
- "sqdmulh v3.4s, v3.4s, v18.4s\n"
"srshl v6.4s, v6.4s, v16.4s\n"
"srshl v5.4s, v5.4s, v16.4s\n"
"srshl v4.4s, v4.4s, v16.4s\n"
"srshl v3.4s, v3.4s, v16.4s\n"
- "sqdmulh v2.4s, v2.4s, v18.4s\n"
- "sqdmulh v1.4s, v1.4s, v18.4s\n"
- "sqdmulh v0.4s, v0.4s, v18.4s\n"
- "smax v15.4s, v15.4s, v19.4s\n"
"srshl v2.4s, v2.4s, v16.4s\n"
"srshl v1.4s, v1.4s, v16.4s\n"
"srshl v0.4s, v0.4s, v16.4s\n"
- "smin v15.4s, v15.4s, v17.4s\n"
- "smax v14.4s, v14.4s, v19.4s\n"
- "smax v13.4s, v13.4s, v19.4s\n"
- "smax v12.4s, v12.4s, v19.4s\n"
- "smin v14.4s, v14.4s, v17.4s\n"
- "smin v13.4s, v13.4s, v17.4s\n"
- "smin v12.4s, v12.4s, v17.4s\n"
- "smax v11.4s, v11.4s, v19.4s\n"
- "smax v10.4s, v10.4s, v19.4s\n"
- "smax v9.4s, v9.4s, v19.4s\n"
- "smin v11.4s, v11.4s, v17.4s\n"
- "smin v10.4s, v10.4s, v17.4s\n"
- "smin v9.4s, v9.4s, v17.4s\n"
- "smax v8.4s, v8.4s, v19.4s\n"
- "smax v7.4s, v7.4s, v19.4s\n"
- "smax v6.4s, v6.4s, v19.4s\n"
- "smin v8.4s, v8.4s, v17.4s\n"
- "smin v7.4s, v7.4s, v17.4s\n"
- "smin v6.4s, v6.4s, v17.4s\n"
- "smax v5.4s, v5.4s, v19.4s\n"
- "smax v4.4s, v4.4s, v19.4s\n"
- "smax v3.4s, v3.4s, v19.4s\n"
- "smin v5.4s, v5.4s, v17.4s\n"
- "smin v4.4s, v4.4s, v17.4s\n"
- "smin v3.4s, v3.4s, v17.4s\n"
- "smax v2.4s, v2.4s, v19.4s\n"
- "smax v1.4s, v1.4s, v19.4s\n"
- "smax v0.4s, v0.4s, v19.4s\n"
- "smin v2.4s, v2.4s, v17.4s\n"
- "smin v1.4s, v1.4s, v17.4s\n"
- "smin v0.4s, v0.4s, v17.4s\n"
+ "movi v16.4s, #0x0\n"
+ "smax v15.4s, v15.4s, v16.4s\n"
+ "smax v14.4s, v14.4s, v16.4s\n"
+ "smax v13.4s, v13.4s, v16.4s\n"
+ "smax v12.4s, v12.4s, v16.4s\n"
+ "smax v11.4s, v11.4s, v16.4s\n"
+ "smax v10.4s, v10.4s, v16.4s\n"
+ "smax v9.4s, v9.4s, v16.4s\n"
+ "smax v8.4s, v8.4s, v16.4s\n"
+ "smax v7.4s, v7.4s, v16.4s\n"
+ "smax v6.4s, v6.4s, v16.4s\n"
+ "smax v5.4s, v5.4s, v16.4s\n"
+ "smax v4.4s, v4.4s, v16.4s\n"
+ "smax v3.4s, v3.4s, v16.4s\n"
+ "smax v2.4s, v2.4s, v16.4s\n"
+ "smax v1.4s, v1.4s, v16.4s\n"
+ "smax v0.4s, v0.4s, v16.4s\n"
+ "movi v16.4s, #0xff\n"
+ "smin v15.4s, v15.4s, v16.4s\n"
+ "smin v14.4s, v14.4s, v16.4s\n"
+ "smin v13.4s, v13.4s, v16.4s\n"
+ "smin v12.4s, v12.4s, v16.4s\n"
+ "smin v11.4s, v11.4s, v16.4s\n"
+ "smin v10.4s, v10.4s, v16.4s\n"
+ "smin v9.4s, v9.4s, v16.4s\n"
+ "smin v8.4s, v8.4s, v16.4s\n"
+ "smin v7.4s, v7.4s, v16.4s\n"
+ "smin v6.4s, v6.4s, v16.4s\n"
+ "smin v5.4s, v5.4s, v16.4s\n"
+ "smin v4.4s, v4.4s, v16.4s\n"
+ "smin v3.4s, v3.4s, v16.4s\n"
+ "smin v2.4s, v2.4s, v16.4s\n"
+ "smin v1.4s, v1.4s, v16.4s\n"
+ "smin v0.4s, v0.4s, v16.4s\n"
"uzp1 v23.16b, v15.16b, v14.16b\n"
"uzp1 v16.16b, v13.16b, v12.16b\n"
"uzp1 v22.16b, v11.16b, v10.16b\n"
- "uzp1 v21.16b, v9.16b, v8.16b\n"
- "uzp1 v20.16b, v7.16b, v6.16b\n"
+ "uzp1 v18.16b, v9.16b, v8.16b\n"
+ "uzp1 v21.16b, v7.16b, v6.16b\n"
"uzp1 v17.16b, v5.16b, v4.16b\n"
- "uzp1 v19.16b, v3.16b, v2.16b\n"
- "uzp1 v18.16b, v1.16b, v0.16b\n"
+ "uzp1 v20.16b, v3.16b, v2.16b\n"
+ "uzp1 v19.16b, v1.16b, v0.16b\n"
"uzp1 v16.16b, v23.16b, v16.16b\n"
- "str q16, [%x[outptr], x26]\n"
- "uzp1 v16.16b, v22.16b, v21.16b\n"
+ "uzp1 v18.16b, v22.16b, v18.16b\n"
+ "str q16, [%x[outptr], x27]\n"
+ "add x27, x27, #0x40\n"
+ "uzp1 v17.16b, v21.16b, v17.16b\n"
+ "uzp1 v16.16b, v20.16b, v19.16b\n"
+ "str q18, [%x[outptr], x26]\n"
"add x26, x26, #0x40\n"
- "uzp1 v17.16b, v20.16b, v17.16b\n"
- "str q16, [%x[outptr], x25]\n"
- "uzp1 v16.16b, v19.16b, v18.16b\n"
+ "str q17, [%x[outptr], x25]\n"
"add x25, x25, #0x40\n"
- "str q17, [%x[outptr], x24]\n"
+ "str q16, [%x[outptr], x24]\n"
"add x24, x24, #0x40\n"
- "str q16, [%x[outptr], x23]\n"
- "add x23, x23, #0x40\n"
"bge 1b\n"
"cbz %x[n_channels], 43f\n"
"7:" // Single vector of channels
"cmp %x[n_channels], #0x10\n"
"blt 14f\n"
"8:" // Single vector of channels: Loop
+ "lsr x23, %x[n_valid_cells], #0x1\n"
"movi v15.4s, #0x0\n"
- "mov x19, %x[inptrs]\n"
"movi v14.4s, #0x0\n"
- "lsr x22, %x[n_valid_cells], #0x1\n"
+ "mov x22, %x[inptrs]\n"
"movi v13.4s, #0x0\n"
"movi v12.4s, #0x0\n"
- "cbz x22, 11f\n"
- "ldp x21, x20, [x19, #0x0]\n"
- "ldr q31, [x21, x26]\n"
- "add x19, x19, #0x10\n"
- "ldr q30, [x20, x26]\n"
- "subs x22, x22, #0x1\n"
+ "cbz x23, 11f\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q31, [x21, x27]\n"
+ "subs x23, x23, #0x1\n"
+ "add x22, x22, #0x10\n"
+ "ldr q30, [x20, x27]\n"
"beq 10f\n"
"9:" // Single vector of channels: Loop: 2 inputs loop
- "uaddl v23.8h, v31.8b, v30.8b\n"
- "ldp x21, x20, [x19, #0x0]\n"
- "add x19, x19, #0x10\n"
- "uaddl2 v22.8h, v31.16b, v30.16b\n"
- "ldr q31, [x21, x26]\n"
- "uaddw v15.4s, v15.4s, v23.4h\n"
- "subs x22, x22, #0x1\n"
- "uaddw2 v14.4s, v14.4s, v23.8h\n"
- "ldr q30, [x20, x26]\n"
- "uaddw v13.4s, v13.4s, v22.4h\n"
- "uaddw2 v12.4s, v12.4s, v22.8h\n"
+ "uaddl v17.8h, v31.8b, v30.8b\n"
+ "uaddl2 v16.8h, v31.16b, v30.16b\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q31, [x21, x27]\n"
+ "ldr q30, [x20, x27]\n"
+ "subs x23, x23, #0x1\n"
+ "uaddw v15.4s, v15.4s, v17.4h\n"
+ "uaddw2 v14.4s, v14.4s, v17.8h\n"
+ "uaddw v13.4s, v13.4s, v16.4h\n"
+ "uaddw2 v12.4s, v12.4s, v16.8h\n"
+ "add x22, x22, #0x10\n"
"bgt 9b\n"
"10:" // Single vector of channels: Loop: 2 inputs tail
- "uaddl v23.8h, v31.8b, v30.8b\n"
- "uaddl2 v22.8h, v31.16b, v30.16b\n"
- "uaddw v15.4s, v15.4s, v23.4h\n"
- "uaddw2 v14.4s, v14.4s, v23.8h\n"
- "uaddw v13.4s, v13.4s, v22.4h\n"
- "uaddw2 v12.4s, v12.4s, v22.8h\n"
+ "uaddl v17.8h, v31.8b, v30.8b\n"
+ "uaddl2 v16.8h, v31.16b, v30.16b\n"
+ "uaddw v15.4s, v15.4s, v17.4h\n"
+ "uaddw2 v14.4s, v14.4s, v17.8h\n"
+ "uaddw v13.4s, v13.4s, v16.4h\n"
+ "uaddw2 v12.4s, v12.4s, v16.8h\n"
"11:" // Single vector of channels: Loop: After loop
- "ands x20, %x[n_valid_cells], #0x1\n"
+ "ands x23, %x[n_valid_cells], #0x1\n"
"beq 13f\n"
"12:" // Single vector of channels: Loop: Single input loop
- "ldr x21, [x19], #0x8\n"
- "subs x20, x20, #0x1\n"
- "ldr q31, [x21, x26]\n"
- "uxtl v23.8h, v31.8b\n"
- "uxtl2 v22.8h, v31.16b\n"
- "uaddw v15.4s, v15.4s, v23.4h\n"
- "uaddw2 v14.4s, v14.4s, v23.8h\n"
- "uaddw v13.4s, v13.4s, v22.4h\n"
- "uaddw2 v12.4s, v12.4s, v22.8h\n"
+ "ldr x20, [x22], #0x8\n"
+ "ldr q16, [x20, x27]\n"
+ "uxtl v17.8h, v16.8b\n"
+ "uxtl2 v16.8h, v16.16b\n"
+ "subs x23, x23, #0x1\n"
+ "uaddw v15.4s, v15.4s, v17.4h\n"
+ "uaddw2 v14.4s, v14.4s, v17.8h\n"
+ "uaddw v13.4s, v13.4s, v16.4h\n"
+ "uaddw2 v12.4s, v12.4s, v16.8h\n"
"bgt 12b\n"
"13:" // Single vector of channels: Loop: Single input loop: End
- "movi v19.4s, #0x0\n"
- "ld1r { v18.4s }, [%x[rescale_ptr]]\n"
- "sub %x[n_channels], %x[n_channels], #0x10\n"
- "movi v17.4s, #0xff\n"
+ "ld1r { v17.4s }, [%x[rescale_ptr]]\n"
"ld1r { v16.4s }, [%x[shift_ptr]]\n"
+ "sqdmulh v15.4s, v15.4s, v17.4s\n"
+ "sqdmulh v14.4s, v14.4s, v17.4s\n"
+ "sqdmulh v13.4s, v13.4s, v17.4s\n"
+ "sqdmulh v12.4s, v12.4s, v17.4s\n"
+ "sub %x[n_channels], %x[n_channels], #0x10\n"
"cmp %x[n_channels], #0x10\n"
- "sqdmulh v15.4s, v15.4s, v18.4s\n"
- "sqdmulh v14.4s, v14.4s, v18.4s\n"
- "sqdmulh v13.4s, v13.4s, v18.4s\n"
- "sqdmulh v12.4s, v12.4s, v18.4s\n"
"srshl v15.4s, v15.4s, v16.4s\n"
"srshl v14.4s, v14.4s, v16.4s\n"
"srshl v13.4s, v13.4s, v16.4s\n"
"srshl v12.4s, v12.4s, v16.4s\n"
- "smax v15.4s, v15.4s, v19.4s\n"
- "smax v14.4s, v14.4s, v19.4s\n"
- "smax v13.4s, v13.4s, v19.4s\n"
- "smax v12.4s, v12.4s, v19.4s\n"
- "smin v15.4s, v15.4s, v17.4s\n"
- "smin v14.4s, v14.4s, v17.4s\n"
- "smin v13.4s, v13.4s, v17.4s\n"
- "smin v12.4s, v12.4s, v17.4s\n"
- "uzp1 v23.16b, v15.16b, v14.16b\n"
+ "movi v16.4s, #0x0\n"
+ "smax v15.4s, v15.4s, v16.4s\n"
+ "smax v14.4s, v14.4s, v16.4s\n"
+ "smax v13.4s, v13.4s, v16.4s\n"
+ "smax v12.4s, v12.4s, v16.4s\n"
+ "movi v16.4s, #0xff\n"
+ "smin v15.4s, v15.4s, v16.4s\n"
+ "smin v14.4s, v14.4s, v16.4s\n"
+ "smin v13.4s, v13.4s, v16.4s\n"
+ "smin v12.4s, v12.4s, v16.4s\n"
+ "uzp1 v17.16b, v15.16b, v14.16b\n"
"uzp1 v16.16b, v13.16b, v12.16b\n"
- "uzp1 v16.16b, v23.16b, v16.16b\n"
- "str q16, [%x[outptr], x26]\n"
- "add x26, x26, #0x10\n"
+ "uzp1 v16.16b, v17.16b, v16.16b\n"
+ "str q16, [%x[outptr], x27]\n"
+ "add x27, x27, #0x10\n"
"bge 8b\n"
"cbz %x[n_channels], 43f\n"
"14:" // Oddments
+ "lsr x23, %x[n_valid_cells], #0x1\n"
+ "add %x[outptr], %x[outptr], x27\n"
"movi v15.4s, #0x0\n"
- "add %x[outptr], %x[outptr], x26\n"
"movi v14.4s, #0x0\n"
- "mov x19, %x[inptrs]\n"
"movi v13.4s, #0x0\n"
- "lsr x22, %x[n_valid_cells], #0x1\n"
"movi v12.4s, #0x0\n"
- "cbz x22, 24f\n"
+ "mov x22, %x[inptrs]\n"
+ "cbz x23, 24f\n"
"15:" // Oddments: 2 inputs loop
+ "ldp x21, x20, [x22, #0x0]\n"
+ "add x22, x22, #0x10\n"
+ "add x21, x21, x27\n"
"movi v31.16b, #0x0\n"
- "ldp x21, x20, [x19, #0x0]\n"
- "add x19, x19, #0x10\n"
+ "add x20, x20, x27\n"
"movi v30.16b, #0x0\n"
- "add x21, x21, x26\n"
- "add x20, x20, x26\n"
"tbz %x[n_channels], #3, 19f\n"
"ldr d31, [x21], #0x8\n"
"ldr d30, [x20], #0x8\n"
@@ -478,21 +480,21 @@ void a64_u8_nhwc_avg_generic_depthfirst_impl(
"ldr b31, [x21], #0x1\n"
"ldr b30, [x20], #0x1\n"
"23:" // Oddments: 2 inputs loop: Load: Bit 3: End
- "uaddl v23.8h, v31.8b, v30.8b\n"
- "subs x22, x22, #0x1\n"
- "uaddl2 v22.8h, v31.16b, v30.16b\n"
- "uaddw v15.4s, v15.4s, v23.4h\n"
- "uaddw2 v14.4s, v14.4s, v23.8h\n"
- "uaddw v13.4s, v13.4s, v22.4h\n"
- "uaddw2 v12.4s, v12.4s, v22.8h\n"
+ "uaddl v17.8h, v31.8b, v30.8b\n"
+ "uaddl2 v16.8h, v31.16b, v30.16b\n"
+ "subs x23, x23, #0x1\n"
+ "uaddw v15.4s, v15.4s, v17.4h\n"
+ "uaddw2 v14.4s, v14.4s, v17.8h\n"
+ "uaddw v13.4s, v13.4s, v16.4h\n"
+ "uaddw2 v12.4s, v12.4s, v16.8h\n"
"bgt 15b\n"
"24:" // Oddments: After loop
- "ands x20, %x[n_valid_cells], #0x1\n"
+ "ands x23, %x[n_valid_cells], #0x1\n"
"beq 34f\n"
"25:" // Oddments: Single input loop
+ "ldr x21, [x22], #0x8\n"
+ "add x21, x21, x27\n"
"movi v31.16b, #0x0\n"
- "ldr x21, [x19], #0x8\n"
- "add x21, x21, x26\n"
"tbz %x[n_channels], #3, 29f\n"
"ldr d31, [x21], #0x8\n"
"tbz %x[n_channels], #2, 27f\n"
@@ -538,38 +540,38 @@ void a64_u8_nhwc_avg_generic_depthfirst_impl(
"tbz %x[n_channels], #0, 33f\n"
"ldr b31, [x21], #0x1\n"
"33:" // Oddments: Single input loop: Load: Bit 3: End
- "uxtl v23.8h, v31.8b\n"
- "subs x20, x20, #0x1\n"
- "uxtl2 v22.8h, v31.16b\n"
- "uaddw v15.4s, v15.4s, v23.4h\n"
- "uaddw2 v14.4s, v14.4s, v23.8h\n"
- "uaddw v13.4s, v13.4s, v22.4h\n"
- "uaddw2 v12.4s, v12.4s, v22.8h\n"
+ "uxtl v17.8h, v31.8b\n"
+ "uxtl2 v16.8h, v31.16b\n"
+ "subs x23, x23, #0x1\n"
+ "uaddw v15.4s, v15.4s, v17.4h\n"
+ "uaddw2 v14.4s, v14.4s, v17.8h\n"
+ "uaddw v13.4s, v13.4s, v16.4h\n"
+ "uaddw2 v12.4s, v12.4s, v16.8h\n"
"bgt 25b\n"
"34:" // Oddments: Single input loop: End
- "movi v19.4s, #0x0\n"
- "ld1r { v18.4s }, [%x[rescale_ptr]]\n"
- "movi v17.4s, #0xff\n"
+ "ld1r { v17.4s }, [%x[rescale_ptr]]\n"
"ld1r { v16.4s }, [%x[shift_ptr]]\n"
- "sqdmulh v15.4s, v15.4s, v18.4s\n"
- "sqdmulh v14.4s, v14.4s, v18.4s\n"
- "sqdmulh v13.4s, v13.4s, v18.4s\n"
- "sqdmulh v12.4s, v12.4s, v18.4s\n"
+ "sqdmulh v15.4s, v15.4s, v17.4s\n"
+ "sqdmulh v14.4s, v14.4s, v17.4s\n"
+ "sqdmulh v13.4s, v13.4s, v17.4s\n"
+ "sqdmulh v12.4s, v12.4s, v17.4s\n"
"srshl v15.4s, v15.4s, v16.4s\n"
"srshl v14.4s, v14.4s, v16.4s\n"
"srshl v13.4s, v13.4s, v16.4s\n"
"srshl v12.4s, v12.4s, v16.4s\n"
- "smax v15.4s, v15.4s, v19.4s\n"
- "smax v14.4s, v14.4s, v19.4s\n"
- "smax v13.4s, v13.4s, v19.4s\n"
- "smax v12.4s, v12.4s, v19.4s\n"
- "smin v15.4s, v15.4s, v17.4s\n"
- "smin v14.4s, v14.4s, v17.4s\n"
- "smin v13.4s, v13.4s, v17.4s\n"
- "smin v12.4s, v12.4s, v17.4s\n"
- "uzp1 v23.16b, v15.16b, v14.16b\n"
+ "movi v16.4s, #0x0\n"
+ "smax v15.4s, v15.4s, v16.4s\n"
+ "smax v14.4s, v14.4s, v16.4s\n"
+ "smax v13.4s, v13.4s, v16.4s\n"
+ "smax v12.4s, v12.4s, v16.4s\n"
+ "movi v16.4s, #0xff\n"
+ "smin v15.4s, v15.4s, v16.4s\n"
+ "smin v14.4s, v14.4s, v16.4s\n"
+ "smin v13.4s, v13.4s, v16.4s\n"
+ "smin v12.4s, v12.4s, v16.4s\n"
+ "uzp1 v17.16b, v15.16b, v14.16b\n"
"uzp1 v16.16b, v13.16b, v12.16b\n"
- "uzp1 v16.16b, v23.16b, v16.16b\n"
+ "uzp1 v16.16b, v17.16b, v16.16b\n"
"tbz %x[n_channels], #3, 38f\n"
"st1 { v16.d }[0], [%x[outptr]], #0x8\n"
"tbz %x[n_channels], #2, 36f\n"
@@ -615,12 +617,10 @@ void a64_u8_nhwc_avg_generic_depthfirst_impl(
"tbz %x[n_channels], #0, 42f\n"
"st1 { v16.b }[0], [%x[outptr]], #0x1\n"
"42:" // Oddments: Store: Bit 3: End
-
"43:" // End
-
: [n_channels] "+&r" (n_channels), [outptr] "+&r" (outptr)
: [inptrs] "r" (inptrs), [n_valid_cells] "r" (n_valid_cells), [rescale_ptr] "r" (&rescale_value), [shift_ptr] "r" (&shift_value)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8_nhwc_max_2x2_s1_output2x2_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8_nhwc_max_2x2_s1_output2x2_depthfirst.hpp
index 0103de812d..9d160bf8f8 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8_nhwc_max_2x2_s1_output2x2_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8_nhwc_max_2x2_s1_output2x2_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,33 +24,28 @@
#pragma once
+#if defined(__aarch64__)
+
namespace arm_conv {
namespace pooling {
void a64_u8_nhwc_max_2x2_s1_output2x2_depthfirst_impl(unsigned int, const uint8_t *const *const, uint8_t *const *const, bool, unsigned int, unsigned int, unsigned int, unsigned int);
-struct a64_u8_nhwc_max_2x2_s1_output2x2_depthfirst
+struct a64_u8_nhwc_max_2x2_s1_output2x2_depthfirst : public DepthfirstStrategy<uint8_t, uint8_t>
{
- typedef uint8_t operand_type;
- typedef uint8_t return_type;
-
- typedef void (*kern_type)(unsigned int, const uint8_t *const *const, uint8_t *const *const, bool, unsigned int, unsigned int, unsigned int, unsigned int);
-
- constexpr static PoolingType pooling_type(void) { return PoolingType::MAX; }
+ using Parent = DepthfirstStrategy<uint8_t, uint8_t>;
- constexpr static unsigned int pool_rows(void) { return 2; }
- constexpr static unsigned int pool_cols(void) { return 2; }
+ const static auto pooling_type = PoolingType::MAX;
+ const static auto pool_rows = 2u, pool_cols = 2u;
+ const static auto stride_rows = 1u, stride_cols = 1u;
- constexpr static unsigned int stride_rows(void) { return 1; }
- constexpr static unsigned int stride_cols(void) { return 1; }
+ a64_u8_nhwc_max_2x2_s1_output2x2_depthfirst(const CPUInfo *)
+ : Parent(pool_rows, pool_cols, stride_rows, stride_cols, 2, 2) {}
- constexpr static unsigned int out_rows(void) { return 2; }
- constexpr static unsigned int out_cols(void) { return 2; }
-
- kern_type kernel = a64_u8_nhwc_max_2x2_s1_output2x2_depthfirst_impl;
-
- a64_u8_nhwc_max_2x2_s1_output2x2_depthfirst(const CPUInfo *) {}
+ Parent::KernelType get_kernel(void) const { return a64_u8_nhwc_max_2x2_s1_output2x2_depthfirst_impl; }
};
} // namespace pooling
} // namespace arm_conv
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp
index 02c43ccaba..66cdb7f849 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -26,6 +26,8 @@
#include <cstddef>
#include <cstdint>
+#if defined(__aarch64__)
+
namespace arm_conv {
namespace pooling {
@@ -61,114 +63,115 @@ void a64_u8_nhwc_max_2x2_s1_output2x2_depthfirst_impl(
pad_left, pad_top, pad_right, pad_bottom);
__asm__ __volatile__(
- "ldr x15, [%x[args], %[offsetof_n_channels]]\n"
- "mov x14, #0x0\n"
- "ldr x20, [%x[args], %[offsetof_outptrs]]\n"
- "mov x13, #0x0\n"
- "ldr x19, [%x[args], %[offsetof_inptrs]]\n"
- "cmp x15, #0x10\n"
- "ldp x12, x11, [x20, #0x0]\n"
- "ldp x10, x9, [x20, #0x10]\n"
- "ldp x28, x27, [x19, #0x0]\n"
- "ldp x26, x25, [x19, #0x10]\n"
- "ldp x24, x23, [x19, #0x20]\n"
- "ldp x22, x21, [x19, #0x30]\n"
- "ldr x20, [x19, #0x40]\n"
+ "ldr x16, [%x[args], %[offsetof_n_channels]]\n"
+ "ldr x21, [%x[args], %[offsetof_outptrs]]\n"
+ "cmp x16, #0x10\n"
+ "mov x15, #0x0\n"
+ "ldr x20, [%x[args], %[offsetof_inptrs]]\n"
+ "ldp x14, x13, [x21, #0x0]\n"
+ "mov x12, #0x0\n"
+ "ldp x11, x10, [x21, #0x10]\n"
+ "ldp x9, x28, [x20, #0x0]\n"
+ "ldp x27, x26, [x20, #0x10]\n"
+ "ldp x25, x24, [x20, #0x20]\n"
+ "ldp x23, x22, [x20, #0x30]\n"
+ "ldr x21, [x20, #0x40]\n"
"blt 3f\n"
- "ldr q30, [x27, x14]\n"
- "lsr x19, x15, #0x4\n"
- "ldr q29, [x24, x14]\n"
- "sub x15, x15, x19, LSL #4\n"
- "ldr q28, [x21, x14]\n"
- "subs x19, x19, #0x1\n"
- "ldr q27, [x25, x14]\n"
- "ldr q26, [x28, x14]\n"
- "ldr q25, [x23, x14]\n"
- "ldr q24, [x26, x14]\n"
- "ldr q23, [x22, x14]\n"
- "ldr q22, [x20, x14]\n"
- "add x14, x14, #0x10\n"
+ "ldr q30, [x28, x15]\n"
+ "ldr q29, [x25, x15]\n"
+ "lsr x20, x16, #0x4\n"
+ "sub x16, x16, x20, LSL #4\n"
+ "ldr q28, [x22, x15]\n"
+ "ldr q27, [x26, x15]\n"
+ "subs x20, x20, #0x1\n"
+ "ldr q26, [x9, x15]\n"
+ "ldr q25, [x27, x15]\n"
+ "ldr q24, [x24, x15]\n"
+ "ldr q23, [x23, x15]\n"
+ "ldr q22, [x21, x15]\n"
+ "add x15, x15, #0x10\n"
"beq 2f\n"
"1:" // Vector: Loop
"umax v21.16b, v30.16b, v29.16b\n"
- "ldr q30, [x27, x14]\n"
- "subs x19, x19, #0x1\n"
+ "ldr q30, [x28, x15]\n"
"umax v20.16b, v29.16b, v28.16b\n"
- "ldr q29, [x24, x14]\n"
+ "ldr q29, [x25, x15]\n"
+ "ldr q28, [x22, x15]\n"
"umax v19.16b, v27.16b, v26.16b\n"
- "ldr q28, [x21, x14]\n"
+ "ldr q26, [x9, x15]\n"
"umax v18.16b, v25.16b, v24.16b\n"
- "ldr q26, [x28, x14]\n"
- "umax v17.16b, v23.16b, v27.16b\n"
- "ldr q27, [x25, x14]\n"
- "umax v16.16b, v25.16b, v22.16b\n"
- "ldr q25, [x23, x14]\n"
+ "ldr q25, [x27, x15]\n"
+ "umax v17.16b, v27.16b, v23.16b\n"
+ "ldr q27, [x26, x15]\n"
+ "umax v16.16b, v24.16b, v22.16b\n"
+ "ldr q24, [x24, x15]\n"
+ "ldr q23, [x23, x15]\n"
+ "subs x20, x20, #0x1\n"
"umax v19.16b, v21.16b, v19.16b\n"
- "ldr q24, [x26, x14]\n"
- "umax v18.16b, v21.16b, v18.16b\n"
- "ldr q23, [x22, x14]\n"
- "umax v17.16b, v20.16b, v17.16b\n"
- "ldr q22, [x20, x14]\n"
- "add x14, x14, #0x10\n"
+ "ldr q22, [x21, x15]\n"
+ "umax v18.16b, v18.16b, v21.16b\n"
+ "umax v17.16b, v17.16b, v20.16b\n"
+ "add x15, x15, #0x10\n"
"umax v16.16b, v20.16b, v16.16b\n"
- "str q19, [x12, x13]\n"
- "str q18, [x11, x13]\n"
- "str q17, [x10, x13]\n"
- "str q16, [x9, x13]\n"
- "add x13, x13, #0x10\n"
+ "str q19, [x14, x12]\n"
+ "str q18, [x13, x12]\n"
+ "str q17, [x11, x12]\n"
+ "str q16, [x10, x12]\n"
+ "add x12, x12, #0x10\n"
"bgt 1b\n"
"2:" // Vector: Tail
"umax v21.16b, v30.16b, v29.16b\n"
"umax v20.16b, v29.16b, v28.16b\n"
- "umax v19.16b, v27.16b, v26.16b\n"
+ "umax v16.16b, v27.16b, v26.16b\n"
"umax v18.16b, v25.16b, v24.16b\n"
- "umax v17.16b, v23.16b, v27.16b\n"
- "umax v16.16b, v25.16b, v22.16b\n"
- "umax v19.16b, v21.16b, v19.16b\n"
- "str q19, [x12, x13]\n"
- "umax v18.16b, v21.16b, v18.16b\n"
- "umax v17.16b, v20.16b, v17.16b\n"
- "str q18, [x11, x13]\n"
- "umax v16.16b, v20.16b, v16.16b\n"
- "str q17, [x10, x13]\n"
- "str q16, [x9, x13]\n"
- "add x13, x13, #0x10\n"
- "cbz x15, 4f\n"
+ "umax v17.16b, v27.16b, v23.16b\n"
+ "umax v19.16b, v24.16b, v22.16b\n"
+ "umax v16.16b, v21.16b, v16.16b\n"
+ "umax v18.16b, v18.16b, v21.16b\n"
+ "str q16, [x14, x12]\n"
+ "umax v17.16b, v17.16b, v20.16b\n"
+ "umax v16.16b, v20.16b, v19.16b\n"
+ "str q18, [x13, x12]\n"
+ "str q17, [x11, x12]\n"
+ "str q16, [x10, x12]\n"
+ "add x12, x12, #0x10\n"
+ "cbz x16, 4f\n"
"3:" // Oddments
- "ldr b30, [x27, x14]\n"
- "subs x15, x15, #0x1\n"
- "ldr b29, [x24, x14]\n"
- "umax v21.16b, v30.16b, v29.16b\n"
- "ldr b28, [x21, x14]\n"
- "ldr b27, [x25, x14]\n"
- "umax v20.16b, v29.16b, v28.16b\n"
- "ldr b26, [x28, x14]\n"
- "ldr b25, [x23, x14]\n"
- "umax v19.16b, v27.16b, v26.16b\n"
- "ldr b24, [x26, x14]\n"
- "ldr b23, [x22, x14]\n"
- "umax v19.16b, v21.16b, v19.16b\n"
- "ldr b22, [x20, x14]\n"
- "add x14, x14, #0x1\n"
- "umax v18.16b, v25.16b, v24.16b\n"
- "str b19, [x12, x13]\n"
- "umax v17.16b, v23.16b, v27.16b\n"
- "umax v16.16b, v25.16b, v22.16b\n"
- "umax v18.16b, v21.16b, v18.16b\n"
- "str b18, [x11, x13]\n"
- "umax v17.16b, v20.16b, v17.16b\n"
- "umax v16.16b, v20.16b, v16.16b\n"
- "str b17, [x10, x13]\n"
- "str b16, [x9, x13]\n"
- "add x13, x13, #0x1\n"
+ "ldr b16, [x28, x15]\n"
+ "ldr b17, [x25, x15]\n"
+ "umax v23.16b, v16.16b, v17.16b\n"
+ "subs x16, x16, #0x1\n"
+ "ldr b16, [x22, x15]\n"
+ "ldr b22, [x26, x15]\n"
+ "umax v21.16b, v17.16b, v16.16b\n"
+ "ldr b16, [x9, x15]\n"
+ "ldr b17, [x27, x15]\n"
+ "umax v16.16b, v22.16b, v16.16b\n"
+ "umax v20.16b, v23.16b, v16.16b\n"
+ "ldr b19, [x24, x15]\n"
+ "ldr b16, [x23, x15]\n"
+ "umax v18.16b, v17.16b, v19.16b\n"
+ "umax v17.16b, v22.16b, v16.16b\n"
+ "ldr b16, [x21, x15]\n"
+ "umax v16.16b, v19.16b, v16.16b\n"
+ "add x15, x15, #0x1\n"
+ "umax v18.16b, v18.16b, v23.16b\n"
+ "umax v17.16b, v17.16b, v21.16b\n"
+ "umax v16.16b, v21.16b, v16.16b\n"
+ "str b20, [x14, x12]\n"
+ "str b18, [x13, x12]\n"
+ "str b17, [x11, x12]\n"
+ "str b16, [x10, x12]\n"
+ "add x12, x12, #0x1\n"
"bgt 3b\n"
"4:" // End
-
:
: [args] "r" (&args), [offsetof_inptrs] "I" (offsetof(KernelArgs, inptrs)), [offsetof_n_channels] "I" (offsetof(KernelArgs, n_channels)), [offsetof_outptrs] "I" (offsetof(KernelArgs, outptrs))
- : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
} // namespace pooling
} // namespace arm_conv
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8_nhwc_max_generic_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8_nhwc_max_generic_depthfirst.hpp
index 391af31d03..7d528ccc65 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8_nhwc_max_generic_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8_nhwc_max_generic_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -33,19 +33,11 @@ namespace pooling {
void a64_u8_nhwc_max_generic_depthfirst_impl(const uint64_t, const uint64_t n_valid_cells, uint64_t n_channels, const uint8_t *const *const inptrs, uint8_t *outptr);
-struct a64_u8_nhwc_max_generic_depthfirst
+struct a64_u8_nhwc_max_generic_depthfirst : IGenericDepthfirstStrategy<uint8_t, uint8_t>
{
- typedef uint8_t operand_type;
- typedef uint8_t return_type;
-
- typedef void (*kern_type)(const uint64_t, const uint64_t n_valid_cells, uint64_t n_channels, const uint8_t *const *const inptrs, uint8_t *outptr);
-
- constexpr static PoolingType pooling_type(void) { return PoolingType::MAX; }
-
-
- kern_type kernel = a64_u8_nhwc_max_generic_depthfirst_impl;
-
+ using Parent = IGenericDepthfirstStrategy<uint8_t, uint8_t>;
a64_u8_nhwc_max_generic_depthfirst(const CPUInfo *) {}
+ typename Parent::KernelType get_kernel(void) const override { return a64_u8_nhwc_max_generic_depthfirst_impl; }
};
} // namespace pooling
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8_nhwc_max_generic_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8_nhwc_max_generic_depthfirst/generic.cpp
index f9bbfd8b90..2ceef125ca 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8_nhwc_max_generic_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8_nhwc_max_generic_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,6 +23,7 @@
*/
#include <cstdint>
+#include <cstddef>
#if defined(__aarch64__)
@@ -39,397 +40,395 @@ void a64_u8_nhwc_max_generic_depthfirst_impl(
)
{
__asm__ __volatile__(
- "mov x28, #0x0\n"
- "mov x27, #0x10\n" // cntb _, ALL, #1
- "mov x26, #0x20\n" // cntb _, ALL, #2
- "mov x25, #0x30\n" // cntb _, ALL, #3
"cmp %x[n_channels], #0x40\n"
+ "mov x27, #0x0\n"
+ "mov x26, #0x10\n" // cntb _, ALL, #1
+ "mov x24, #0x20\n" // cntb _, ALL, #2
+ "mov x23, #0x30\n" // cntb _, ALL, #3
"blt 7f\n"
"1:" // 4-vectors of channels
+ "lsr x25, %x[n_valid_cells], #0x2\n"
+ "movi v8.16b, #0x0\n"
"movi v7.16b, #0x0\n"
- "mov x19, %x[inptrs]\n"
+ "mov x22, %x[inptrs]\n"
"movi v6.16b, #0x0\n"
- "lsr x24, %x[n_valid_cells], #0x2\n"
"movi v5.16b, #0x0\n"
- "movi v4.16b, #0x0\n"
- "cbz x24, 4f\n"
- "ldp x23, x22, [x19, #0x0]\n"
- "ldp x21, x20, [x19, #0x10]\n"
- "add x19, x19, #0x20\n"
- "subs x24, x24, #0x1\n"
- "ldr q3, [x23, x28]\n"
- "ldr q2, [x22, x28]\n"
- "ldr q1, [x21, x28]\n"
- "ldr q0, [x20, x28]\n"
- "ldr q31, [x23, x27]\n"
- "ldr q30, [x22, x27]\n"
- "ldr q29, [x21, x27]\n"
- "ldr q28, [x20, x27]\n"
- "ldr q27, [x23, x26]\n"
- "ldr q21, [x22, x26]\n"
- "ldr q26, [x21, x26]\n"
- "ldr q17, [x20, x26]\n"
- "ldr q25, [x23, x25]\n"
- "ldr q20, [x22, x25]\n"
- "ldr q24, [x21, x25]\n"
- "ldr q16, [x20, x25]\n"
+ "cbz x25, 4f\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q4, [x21, x27]\n"
+ "subs x25, x25, #0x1\n"
+ "ldr q3, [x20, x27]\n"
+ "ldr q2, [x21, x26]\n"
+ "ldr q1, [x20, x26]\n"
+ "ldr q0, [x21, x24]\n"
+ "ldr q31, [x20, x24]\n"
+ "ldr q30, [x21, x23]\n"
+ "ldr q29, [x20, x23]\n"
+ "ldp x21, x20, [x22, #0x10]\n"
+ "add x22, x22, #0x20\n"
+ "ldr q28, [x21, x27]\n"
+ "ldr q22, [x20, x27]\n"
+ "ldr q27, [x21, x26]\n"
+ "ldr q21, [x20, x26]\n"
+ "ldr q26, [x21, x24]\n"
+ "ldr q20, [x20, x24]\n"
+ "ldr q25, [x21, x23]\n"
+ "ldr q24, [x20, x23]\n"
"beq 3f\n"
"2:" // 4-vectors of channels: 4 inputs loop
- "umax v23.16b, v3.16b, v2.16b\n"
- "ldp x23, x22, [x19, #0x0]\n"
- "subs x24, x24, #0x1\n"
- "umax v19.16b, v1.16b, v0.16b\n"
- "ldp x21, x20, [x19, #0x10]\n"
- "add x19, x19, #0x20\n"
- "umax v22.16b, v31.16b, v30.16b\n"
- "ldr q3, [x23, x28]\n"
- "umax v18.16b, v29.16b, v28.16b\n"
- "umax v21.16b, v27.16b, v21.16b\n"
- "ldr q2, [x22, x28]\n"
- "umax v17.16b, v26.16b, v17.16b\n"
- "ldr q1, [x21, x28]\n"
- "umax v20.16b, v25.16b, v20.16b\n"
- "ldr q0, [x20, x28]\n"
- "umax v16.16b, v24.16b, v16.16b\n"
- "ldr q31, [x23, x27]\n"
+ "umax v23.16b, v4.16b, v3.16b\n"
+ "umax v19.16b, v28.16b, v22.16b\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q4, [x21, x27]\n"
+ "ldr q3, [x20, x27]\n"
+ "umax v22.16b, v2.16b, v1.16b\n"
+ "ldr q2, [x21, x26]\n"
+ "umax v18.16b, v27.16b, v21.16b\n"
+ "ldr q1, [x20, x26]\n"
+ "umax v21.16b, v0.16b, v31.16b\n"
+ "ldr q0, [x21, x24]\n"
+ "umax v17.16b, v26.16b, v20.16b\n"
+ "ldr q31, [x20, x24]\n"
+ "umax v20.16b, v30.16b, v29.16b\n"
+ "ldr q30, [x21, x23]\n"
+ "umax v16.16b, v25.16b, v24.16b\n"
+ "ldr q29, [x20, x23]\n"
"umax v19.16b, v23.16b, v19.16b\n"
- "ldr q30, [x22, x27]\n"
"umax v18.16b, v22.16b, v18.16b\n"
- "ldr q29, [x21, x27]\n"
+ "ldp x21, x20, [x22, #0x10]\n"
+ "ldr q28, [x21, x27]\n"
+ "ldr q22, [x20, x27]\n"
"umax v17.16b, v21.16b, v17.16b\n"
- "ldr q28, [x20, x27]\n"
"umax v16.16b, v20.16b, v16.16b\n"
- "ldr q27, [x23, x26]\n"
- "umax v7.16b, v7.16b, v19.16b\n"
- "ldr q21, [x22, x26]\n"
- "umax v6.16b, v6.16b, v18.16b\n"
- "ldr q26, [x21, x26]\n"
- "umax v5.16b, v5.16b, v17.16b\n"
- "ldr q17, [x20, x26]\n"
- "umax v4.16b, v4.16b, v16.16b\n"
- "ldr q25, [x23, x25]\n"
- "ldr q20, [x22, x25]\n"
- "ldr q24, [x21, x25]\n"
- "ldr q16, [x20, x25]\n"
+ "ldr q27, [x21, x26]\n"
+ "ldr q21, [x20, x26]\n"
+ "subs x25, x25, #0x1\n"
+ "umax v8.16b, v8.16b, v19.16b\n"
+ "ldr q26, [x21, x24]\n"
+ "ldr q20, [x20, x24]\n"
+ "umax v7.16b, v7.16b, v18.16b\n"
+ "umax v6.16b, v6.16b, v17.16b\n"
+ "ldr q25, [x21, x23]\n"
+ "ldr q24, [x20, x23]\n"
+ "umax v5.16b, v5.16b, v16.16b\n"
+ "add x22, x22, #0x20\n"
"bgt 2b\n"
"3:" // 4-vectors of channels: 4 inputs tail
- "umax v23.16b, v3.16b, v2.16b\n"
- "umax v19.16b, v1.16b, v0.16b\n"
- "umax v22.16b, v31.16b, v30.16b\n"
- "umax v18.16b, v29.16b, v28.16b\n"
- "umax v21.16b, v27.16b, v21.16b\n"
- "umax v17.16b, v26.16b, v17.16b\n"
- "umax v20.16b, v25.16b, v20.16b\n"
- "umax v16.16b, v24.16b, v16.16b\n"
+ "umax v23.16b, v4.16b, v3.16b\n"
+ "umax v19.16b, v28.16b, v22.16b\n"
+ "umax v22.16b, v2.16b, v1.16b\n"
+ "umax v18.16b, v27.16b, v21.16b\n"
+ "umax v21.16b, v0.16b, v31.16b\n"
+ "umax v17.16b, v26.16b, v20.16b\n"
+ "umax v20.16b, v30.16b, v29.16b\n"
+ "umax v16.16b, v25.16b, v24.16b\n"
"umax v19.16b, v23.16b, v19.16b\n"
"umax v18.16b, v22.16b, v18.16b\n"
"umax v17.16b, v21.16b, v17.16b\n"
"umax v16.16b, v20.16b, v16.16b\n"
- "umax v7.16b, v7.16b, v19.16b\n"
- "umax v6.16b, v6.16b, v18.16b\n"
- "umax v5.16b, v5.16b, v17.16b\n"
- "umax v4.16b, v4.16b, v16.16b\n"
+ "umax v8.16b, v8.16b, v19.16b\n"
+ "umax v7.16b, v7.16b, v18.16b\n"
+ "umax v6.16b, v6.16b, v17.16b\n"
+ "umax v5.16b, v5.16b, v16.16b\n"
"4:" // 4-vectors of channels: After loop
- "ands x20, %x[n_valid_cells], #0x3\n"
+ "ands x21, %x[n_valid_cells], #0x3\n"
"beq 6f\n"
"5:" // 4-vectors of channels: Single input loop
- "ldr x23, [x19], #0x8\n"
- "subs x20, x20, #0x1\n"
- "ldr q3, [x23, x28]\n"
- "umax v7.16b, v7.16b, v3.16b\n"
- "ldr q31, [x23, x27]\n"
- "ldr q27, [x23, x26]\n"
- "umax v6.16b, v6.16b, v31.16b\n"
- "ldr q25, [x23, x25]\n"
- "umax v5.16b, v5.16b, v27.16b\n"
- "umax v4.16b, v4.16b, v25.16b\n"
+ "ldr x20, [x22], #0x8\n"
+ "ldr q16, [x20, x27]\n"
+ "subs x21, x21, #0x1\n"
+ "umax v8.16b, v8.16b, v16.16b\n"
+ "ldr q17, [x20, x26]\n"
+ "ldr q16, [x20, x24]\n"
+ "umax v7.16b, v7.16b, v17.16b\n"
+ "umax v6.16b, v6.16b, v16.16b\n"
+ "ldr q16, [x20, x23]\n"
+ "umax v5.16b, v5.16b, v16.16b\n"
"bgt 5b\n"
"6:" // 4-vectors of channels: Single input loop: End
- "str q7, [%x[outptr], x28]\n"
- "add x28, x28, #0x40\n"
- "str q6, [%x[outptr], x27]\n"
- "add x27, x27, #0x40\n"
- "str q5, [%x[outptr], x26]\n"
- "add x26, x26, #0x40\n"
- "str q4, [%x[outptr], x25]\n"
- "add x25, x25, #0x40\n"
"sub %x[n_channels], %x[n_channels], #0x40\n"
"cmp %x[n_channels], #0x40\n"
+ "str q8, [%x[outptr], x27]\n"
+ "str q7, [%x[outptr], x26]\n"
+ "add x27, x27, #0x40\n"
+ "add x26, x26, #0x40\n"
+ "str q6, [%x[outptr], x24]\n"
+ "add x24, x24, #0x40\n"
+ "str q5, [%x[outptr], x23]\n"
+ "add x23, x23, #0x40\n"
"bge 1b\n"
"cbz %x[n_channels], 43f\n"
"7:" // Single vector of channels
"cmp %x[n_channels], #0x10\n"
"blt 14f\n"
"8:" // Single vector of channels: Loop
- "movi v7.16b, #0x0\n"
- "mov x19, %x[inptrs]\n"
- "lsr x24, %x[n_valid_cells], #0x2\n"
- "cbz x24, 11f\n"
- "ldp x23, x22, [x19, #0x0]\n"
- "ldp x21, x20, [x19, #0x10]\n"
- "add x19, x19, #0x20\n"
- "subs x24, x24, #0x1\n"
- "ldr q3, [x23, x28]\n"
- "ldr q2, [x22, x28]\n"
- "ldr q1, [x21, x28]\n"
- "ldr q0, [x20, x28]\n"
+ "lsr x25, %x[n_valid_cells], #0x2\n"
+ "movi v8.16b, #0x0\n"
+ "mov x22, %x[inptrs]\n"
+ "cbz x25, 11f\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q4, [x21, x27]\n"
+ "subs x25, x25, #0x1\n"
+ "ldr q3, [x20, x27]\n"
+ "ldp x21, x20, [x22, #0x10]\n"
+ "add x22, x22, #0x20\n"
+ "ldr q28, [x21, x27]\n"
+ "ldr q22, [x20, x27]\n"
"beq 10f\n"
"9:" // Single vector of channels: Loop: 4 inputs loop
- "umax v23.16b, v3.16b, v2.16b\n"
- "ldp x23, x22, [x19, #0x0]\n"
- "subs x24, x24, #0x1\n"
- "umax v19.16b, v1.16b, v0.16b\n"
- "ldp x21, x20, [x19, #0x10]\n"
- "add x19, x19, #0x20\n"
- "umax v19.16b, v23.16b, v19.16b\n"
- "ldr q3, [x23, x28]\n"
- "ldr q2, [x22, x28]\n"
- "umax v7.16b, v7.16b, v19.16b\n"
- "ldr q1, [x21, x28]\n"
- "ldr q0, [x20, x28]\n"
+ "umax v17.16b, v4.16b, v3.16b\n"
+ "umax v16.16b, v28.16b, v22.16b\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q4, [x21, x27]\n"
+ "ldr q3, [x20, x27]\n"
+ "umax v16.16b, v17.16b, v16.16b\n"
+ "ldp x21, x20, [x22, #0x10]\n"
+ "subs x25, x25, #0x1\n"
+ "ldr q28, [x21, x27]\n"
+ "ldr q22, [x20, x27]\n"
+ "umax v8.16b, v8.16b, v16.16b\n"
+ "add x22, x22, #0x20\n"
"bgt 9b\n"
"10:" // Single vector of channels: Loop: 4 inputs tail
- "umax v23.16b, v3.16b, v2.16b\n"
- "umax v19.16b, v1.16b, v0.16b\n"
- "umax v19.16b, v23.16b, v19.16b\n"
- "umax v7.16b, v7.16b, v19.16b\n"
+ "umax v17.16b, v4.16b, v3.16b\n"
+ "umax v16.16b, v28.16b, v22.16b\n"
+ "umax v16.16b, v17.16b, v16.16b\n"
+ "umax v8.16b, v8.16b, v16.16b\n"
"11:" // Single vector of channels: Loop: After loop
- "ands x20, %x[n_valid_cells], #0x3\n"
+ "ands x21, %x[n_valid_cells], #0x3\n"
"beq 13f\n"
"12:" // Single vector of channels: Loop: Single input loop
- "ldr x23, [x19], #0x8\n"
- "subs x20, x20, #0x1\n"
- "ldr q3, [x23, x28]\n"
- "umax v7.16b, v7.16b, v3.16b\n"
+ "ldr x20, [x22], #0x8\n"
+ "ldr q16, [x20, x27]\n"
+ "subs x21, x21, #0x1\n"
+ "umax v8.16b, v8.16b, v16.16b\n"
"bgt 12b\n"
"13:" // Single vector of channels: Loop: Single input loop: End
- "str q7, [%x[outptr], x28]\n"
- "add x28, x28, #0x10\n"
"sub %x[n_channels], %x[n_channels], #0x10\n"
"cmp %x[n_channels], #0x10\n"
+ "str q8, [%x[outptr], x27]\n"
+ "add x27, x27, #0x10\n"
"bge 8b\n"
"cbz %x[n_channels], 43f\n"
"14:" // Oddments
- "movi v7.16b, #0x0\n"
- "add %x[outptr], %x[outptr], x28\n"
- "mov x19, %x[inptrs]\n"
- "lsr x24, %x[n_valid_cells], #0x2\n"
- "cbz x24, 24f\n"
+ "lsr x25, %x[n_valid_cells], #0x2\n"
+ "add %x[outptr], %x[outptr], x27\n"
+ "movi v8.16b, #0x0\n"
+ "mov x24, %x[inptrs]\n"
+ "cbz x25, 24f\n"
"15:" // Oddments: 4 inputs loop
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x24, x24, #0x20\n"
+ "add x23, x23, x27\n"
+ "add x22, x22, x27\n"
+ "add x21, x21, x27\n"
+ "movi v4.16b, #0x0\n"
"movi v3.16b, #0x0\n"
- "ldp x23, x22, [x19, #0x0]\n"
- "add x23, x23, x28\n"
- "movi v2.16b, #0x0\n"
- "ldp x21, x20, [x19, #0x10]\n"
- "movi v1.16b, #0x0\n"
- "add x19, x19, #0x20\n"
- "movi v0.16b, #0x0\n"
- "add x22, x22, x28\n"
- "add x21, x21, x28\n"
- "add x20, x20, x28\n"
+ "add x20, x20, x27\n"
+ "movi v28.16b, #0x0\n"
+ "movi v22.16b, #0x0\n"
"tbz %x[n_channels], #3, 19f\n"
- "ldr d3, [x23], #0x8\n"
- "ldr d2, [x22], #0x8\n"
- "ldr d1, [x21], #0x8\n"
- "ldr d0, [x20], #0x8\n"
+ "ldr d4, [x23], #0x8\n"
+ "ldr d3, [x22], #0x8\n"
+ "ldr d28, [x21], #0x8\n"
+ "ldr d22, [x20], #0x8\n"
"tbz %x[n_channels], #2, 17f\n"
- "ld1 { v3.s }[2], [x23], #0x4\n"
- "ld1 { v2.s }[2], [x22], #0x4\n"
- "ld1 { v1.s }[2], [x21], #0x4\n"
- "ld1 { v0.s }[2], [x20], #0x4\n"
+ "ld1 { v4.s }[2], [x23], #0x4\n"
+ "ld1 { v3.s }[2], [x22], #0x4\n"
+ "ld1 { v28.s }[2], [x21], #0x4\n"
+ "ld1 { v22.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #1, 16f\n"
- "ld1 { v3.h }[6], [x23], #0x2\n"
- "ld1 { v2.h }[6], [x22], #0x2\n"
- "ld1 { v1.h }[6], [x21], #0x2\n"
- "ld1 { v0.h }[6], [x20], #0x2\n"
+ "ld1 { v4.h }[6], [x23], #0x2\n"
+ "ld1 { v3.h }[6], [x22], #0x2\n"
+ "ld1 { v28.h }[6], [x21], #0x2\n"
+ "ld1 { v22.h }[6], [x20], #0x2\n"
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v3.b }[14], [x23], #0x1\n"
- "ld1 { v2.b }[14], [x22], #0x1\n"
- "ld1 { v1.b }[14], [x21], #0x1\n"
- "ld1 { v0.b }[14], [x20], #0x1\n"
+ "ld1 { v4.b }[14], [x23], #0x1\n"
+ "ld1 { v3.b }[14], [x22], #0x1\n"
+ "ld1 { v28.b }[14], [x21], #0x1\n"
+ "ld1 { v22.b }[14], [x20], #0x1\n"
"b 23f\n"
"16:" // Oddments: 4 inputs loop: Load: Bit 3: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v3.b }[12], [x23], #0x1\n"
- "ld1 { v2.b }[12], [x22], #0x1\n"
- "ld1 { v1.b }[12], [x21], #0x1\n"
- "ld1 { v0.b }[12], [x20], #0x1\n"
+ "ld1 { v4.b }[12], [x23], #0x1\n"
+ "ld1 { v3.b }[12], [x22], #0x1\n"
+ "ld1 { v28.b }[12], [x21], #0x1\n"
+ "ld1 { v22.b }[12], [x20], #0x1\n"
"b 23f\n"
"17:" // Oddments: 4 inputs loop: Load: Bit 3: Bit 2: Unset
"tbz %x[n_channels], #1, 18f\n"
- "ld1 { v3.h }[4], [x23], #0x2\n"
- "ld1 { v2.h }[4], [x22], #0x2\n"
- "ld1 { v1.h }[4], [x21], #0x2\n"
- "ld1 { v0.h }[4], [x20], #0x2\n"
+ "ld1 { v4.h }[4], [x23], #0x2\n"
+ "ld1 { v3.h }[4], [x22], #0x2\n"
+ "ld1 { v28.h }[4], [x21], #0x2\n"
+ "ld1 { v22.h }[4], [x20], #0x2\n"
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v3.b }[10], [x23], #0x1\n"
- "ld1 { v2.b }[10], [x22], #0x1\n"
- "ld1 { v1.b }[10], [x21], #0x1\n"
- "ld1 { v0.b }[10], [x20], #0x1\n"
+ "ld1 { v4.b }[10], [x23], #0x1\n"
+ "ld1 { v3.b }[10], [x22], #0x1\n"
+ "ld1 { v28.b }[10], [x21], #0x1\n"
+ "ld1 { v22.b }[10], [x20], #0x1\n"
"b 23f\n"
"18:" // Oddments: 4 inputs loop: Load: Bit 3: Bit 2: Unset: Bit 1: Unset
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v3.b }[8], [x23], #0x1\n"
- "ld1 { v2.b }[8], [x22], #0x1\n"
- "ld1 { v1.b }[8], [x21], #0x1\n"
- "ld1 { v0.b }[8], [x20], #0x1\n"
+ "ld1 { v4.b }[8], [x23], #0x1\n"
+ "ld1 { v3.b }[8], [x22], #0x1\n"
+ "ld1 { v28.b }[8], [x21], #0x1\n"
+ "ld1 { v22.b }[8], [x20], #0x1\n"
"b 23f\n"
"19:" // Oddments: 4 inputs loop: Load: Bit 3: Unset
"tbz %x[n_channels], #2, 21f\n"
- "ldr s3, [x23], #0x4\n"
- "ldr s2, [x22], #0x4\n"
- "ldr s1, [x21], #0x4\n"
- "ldr s0, [x20], #0x4\n"
+ "ldr s4, [x23], #0x4\n"
+ "ldr s3, [x22], #0x4\n"
+ "ldr s28, [x21], #0x4\n"
+ "ldr s22, [x20], #0x4\n"
"tbz %x[n_channels], #1, 20f\n"
- "ld1 { v3.h }[2], [x23], #0x2\n"
- "ld1 { v2.h }[2], [x22], #0x2\n"
- "ld1 { v1.h }[2], [x21], #0x2\n"
- "ld1 { v0.h }[2], [x20], #0x2\n"
+ "ld1 { v4.h }[2], [x23], #0x2\n"
+ "ld1 { v3.h }[2], [x22], #0x2\n"
+ "ld1 { v28.h }[2], [x21], #0x2\n"
+ "ld1 { v22.h }[2], [x20], #0x2\n"
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v3.b }[6], [x23], #0x1\n"
- "ld1 { v2.b }[6], [x22], #0x1\n"
- "ld1 { v1.b }[6], [x21], #0x1\n"
- "ld1 { v0.b }[6], [x20], #0x1\n"
+ "ld1 { v4.b }[6], [x23], #0x1\n"
+ "ld1 { v3.b }[6], [x22], #0x1\n"
+ "ld1 { v28.b }[6], [x21], #0x1\n"
+ "ld1 { v22.b }[6], [x20], #0x1\n"
"b 23f\n"
"20:" // Oddments: 4 inputs loop: Load: Bit 3: Unset: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v3.b }[4], [x23], #0x1\n"
- "ld1 { v2.b }[4], [x22], #0x1\n"
- "ld1 { v1.b }[4], [x21], #0x1\n"
- "ld1 { v0.b }[4], [x20], #0x1\n"
+ "ld1 { v4.b }[4], [x23], #0x1\n"
+ "ld1 { v3.b }[4], [x22], #0x1\n"
+ "ld1 { v28.b }[4], [x21], #0x1\n"
+ "ld1 { v22.b }[4], [x20], #0x1\n"
"b 23f\n"
"21:" // Oddments: 4 inputs loop: Load: Bit 3: Unset: Bit 2: Unset
"tbz %x[n_channels], #1, 22f\n"
- "ldr h3, [x23], #0x2\n"
- "ldr h2, [x22], #0x2\n"
- "ldr h1, [x21], #0x2\n"
- "ldr h0, [x20], #0x2\n"
+ "ldr h4, [x23], #0x2\n"
+ "ldr h3, [x22], #0x2\n"
+ "ldr h28, [x21], #0x2\n"
+ "ldr h22, [x20], #0x2\n"
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v3.b }[2], [x23], #0x1\n"
- "ld1 { v2.b }[2], [x22], #0x1\n"
- "ld1 { v1.b }[2], [x21], #0x1\n"
- "ld1 { v0.b }[2], [x20], #0x1\n"
+ "ld1 { v4.b }[2], [x23], #0x1\n"
+ "ld1 { v3.b }[2], [x22], #0x1\n"
+ "ld1 { v28.b }[2], [x21], #0x1\n"
+ "ld1 { v22.b }[2], [x20], #0x1\n"
"b 23f\n"
"22:" // Oddments: 4 inputs loop: Load: Bit 3: Unset: Bit 2: Unset: Bit 1: Unset
"tbz %x[n_channels], #0, 23f\n"
- "ldr b3, [x23], #0x1\n"
- "ldr b2, [x22], #0x1\n"
- "ldr b1, [x21], #0x1\n"
- "ldr b0, [x20], #0x1\n"
+ "ldr b4, [x23], #0x1\n"
+ "ldr b3, [x22], #0x1\n"
+ "ldr b28, [x21], #0x1\n"
+ "ldr b22, [x20], #0x1\n"
"23:" // Oddments: 4 inputs loop: Load: Bit 3: End
- "umax v23.16b, v3.16b, v2.16b\n"
- "subs x24, x24, #0x1\n"
- "umax v19.16b, v1.16b, v0.16b\n"
- "umax v19.16b, v23.16b, v19.16b\n"
- "umax v7.16b, v7.16b, v19.16b\n"
+ "umax v17.16b, v4.16b, v3.16b\n"
+ "umax v16.16b, v28.16b, v22.16b\n"
+ "subs x25, x25, #0x1\n"
+ "umax v16.16b, v17.16b, v16.16b\n"
+ "umax v8.16b, v8.16b, v16.16b\n"
"bgt 15b\n"
"24:" // Oddments: After loop
- "ands x20, %x[n_valid_cells], #0x3\n"
+ "ands x21, %x[n_valid_cells], #0x3\n"
"beq 34f\n"
"25:" // Oddments: Single input loop
- "movi v3.16b, #0x0\n"
- "ldr x23, [x19], #0x8\n"
- "add x23, x23, x28\n"
+ "ldr x23, [x24], #0x8\n"
+ "add x23, x23, x27\n"
+ "movi v4.16b, #0x0\n"
"tbz %x[n_channels], #3, 29f\n"
- "ldr d3, [x23], #0x8\n"
+ "ldr d4, [x23], #0x8\n"
"tbz %x[n_channels], #2, 27f\n"
- "ld1 { v3.s }[2], [x23], #0x4\n"
+ "ld1 { v4.s }[2], [x23], #0x4\n"
"tbz %x[n_channels], #1, 26f\n"
- "ld1 { v3.h }[6], [x23], #0x2\n"
+ "ld1 { v4.h }[6], [x23], #0x2\n"
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v3.b }[14], [x23], #0x1\n"
+ "ld1 { v4.b }[14], [x23], #0x1\n"
"b 33f\n"
"26:" // Oddments: Single input loop: Load: Bit 3: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v3.b }[12], [x23], #0x1\n"
+ "ld1 { v4.b }[12], [x23], #0x1\n"
"b 33f\n"
"27:" // Oddments: Single input loop: Load: Bit 3: Bit 2: Unset
"tbz %x[n_channels], #1, 28f\n"
- "ld1 { v3.h }[4], [x23], #0x2\n"
+ "ld1 { v4.h }[4], [x23], #0x2\n"
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v3.b }[10], [x23], #0x1\n"
+ "ld1 { v4.b }[10], [x23], #0x1\n"
"b 33f\n"
"28:" // Oddments: Single input loop: Load: Bit 3: Bit 2: Unset: Bit 1: Unset
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v3.b }[8], [x23], #0x1\n"
+ "ld1 { v4.b }[8], [x23], #0x1\n"
"b 33f\n"
"29:" // Oddments: Single input loop: Load: Bit 3: Unset
"tbz %x[n_channels], #2, 31f\n"
- "ldr s3, [x23], #0x4\n"
+ "ldr s4, [x23], #0x4\n"
"tbz %x[n_channels], #1, 30f\n"
- "ld1 { v3.h }[2], [x23], #0x2\n"
+ "ld1 { v4.h }[2], [x23], #0x2\n"
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v3.b }[6], [x23], #0x1\n"
+ "ld1 { v4.b }[6], [x23], #0x1\n"
"b 33f\n"
"30:" // Oddments: Single input loop: Load: Bit 3: Unset: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v3.b }[4], [x23], #0x1\n"
+ "ld1 { v4.b }[4], [x23], #0x1\n"
"b 33f\n"
"31:" // Oddments: Single input loop: Load: Bit 3: Unset: Bit 2: Unset
"tbz %x[n_channels], #1, 32f\n"
- "ldr h3, [x23], #0x2\n"
+ "ldr h4, [x23], #0x2\n"
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v3.b }[2], [x23], #0x1\n"
+ "ld1 { v4.b }[2], [x23], #0x1\n"
"b 33f\n"
"32:" // Oddments: Single input loop: Load: Bit 3: Unset: Bit 2: Unset: Bit 1: Unset
"tbz %x[n_channels], #0, 33f\n"
- "ldr b3, [x23], #0x1\n"
+ "ldr b4, [x23], #0x1\n"
"33:" // Oddments: Single input loop: Load: Bit 3: End
- "umax v7.16b, v7.16b, v3.16b\n"
- "subs x20, x20, #0x1\n"
+ "subs x21, x21, #0x1\n"
+ "umax v8.16b, v8.16b, v4.16b\n"
"bgt 25b\n"
"34:" // Oddments: Single input loop: End
"tbz %x[n_channels], #3, 38f\n"
- "st1 { v7.d }[0], [%x[outptr]], #0x8\n"
+ "st1 { v8.d }[0], [%x[outptr]], #0x8\n"
"tbz %x[n_channels], #2, 36f\n"
- "st1 { v7.s }[2], [%x[outptr]], #0x4\n"
+ "st1 { v8.s }[2], [%x[outptr]], #0x4\n"
"tbz %x[n_channels], #1, 35f\n"
- "st1 { v7.h }[6], [%x[outptr]], #0x2\n"
+ "st1 { v8.h }[6], [%x[outptr]], #0x2\n"
"tbz %x[n_channels], #0, 42f\n"
- "st1 { v7.b }[14], [%x[outptr]], #0x1\n"
+ "st1 { v8.b }[14], [%x[outptr]], #0x1\n"
"b 42f\n"
"35:" // Oddments: Store: Bit 3: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 42f\n"
- "st1 { v7.b }[12], [%x[outptr]], #0x1\n"
+ "st1 { v8.b }[12], [%x[outptr]], #0x1\n"
"b 42f\n"
"36:" // Oddments: Store: Bit 3: Bit 2: Unset
"tbz %x[n_channels], #1, 37f\n"
- "st1 { v7.h }[4], [%x[outptr]], #0x2\n"
+ "st1 { v8.h }[4], [%x[outptr]], #0x2\n"
"tbz %x[n_channels], #0, 42f\n"
- "st1 { v7.b }[10], [%x[outptr]], #0x1\n"
+ "st1 { v8.b }[10], [%x[outptr]], #0x1\n"
"b 42f\n"
"37:" // Oddments: Store: Bit 3: Bit 2: Unset: Bit 1: Unset
"tbz %x[n_channels], #0, 42f\n"
- "st1 { v7.b }[8], [%x[outptr]], #0x1\n"
+ "st1 { v8.b }[8], [%x[outptr]], #0x1\n"
"b 42f\n"
"38:" // Oddments: Store: Bit 3: Unset
"tbz %x[n_channels], #2, 40f\n"
- "st1 { v7.s }[0], [%x[outptr]], #0x4\n"
+ "st1 { v8.s }[0], [%x[outptr]], #0x4\n"
"tbz %x[n_channels], #1, 39f\n"
- "st1 { v7.h }[2], [%x[outptr]], #0x2\n"
+ "st1 { v8.h }[2], [%x[outptr]], #0x2\n"
"tbz %x[n_channels], #0, 42f\n"
- "st1 { v7.b }[6], [%x[outptr]], #0x1\n"
+ "st1 { v8.b }[6], [%x[outptr]], #0x1\n"
"b 42f\n"
"39:" // Oddments: Store: Bit 3: Unset: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 42f\n"
- "st1 { v7.b }[4], [%x[outptr]], #0x1\n"
+ "st1 { v8.b }[4], [%x[outptr]], #0x1\n"
"b 42f\n"
"40:" // Oddments: Store: Bit 3: Unset: Bit 2: Unset
"tbz %x[n_channels], #1, 41f\n"
- "st1 { v7.h }[0], [%x[outptr]], #0x2\n"
+ "st1 { v8.h }[0], [%x[outptr]], #0x2\n"
"tbz %x[n_channels], #0, 42f\n"
- "st1 { v7.b }[2], [%x[outptr]], #0x1\n"
+ "st1 { v8.b }[2], [%x[outptr]], #0x1\n"
"b 42f\n"
"41:" // Oddments: Store: Bit 3: Unset: Bit 2: Unset: Bit 1: Unset
"tbz %x[n_channels], #0, 42f\n"
- "st1 { v7.b }[0], [%x[outptr]], #0x1\n"
+ "st1 { v8.b }[0], [%x[outptr]], #0x1\n"
"42:" // Oddments: Store: Bit 3: End
-
"43:" // End
-
: [n_channels] "+&r" (n_channels), [outptr] "+&r" (outptr)
: [inptrs] "r" (inptrs), [n_valid_cells] "r" (n_valid_cells)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8q_nhwc_avg_generic_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8q_nhwc_avg_generic_depthfirst.hpp
index d46658f080..daf836f5d6 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8q_nhwc_avg_generic_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8q_nhwc_avg_generic_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -33,19 +33,11 @@ namespace pooling {
void a64_u8q_nhwc_avg_generic_depthfirst_impl(const uint64_t window_cells, const uint64_t n_valid_cells, uint64_t n_channels, const uint8_t *const *const inptrs, uint8_t *outptr, const Requantize32 &qp);
-struct a64_u8q_nhwc_avg_generic_depthfirst
+struct a64_u8q_nhwc_avg_generic_depthfirst : IGenericDepthfirstStrategy<uint8_t, uint8_t, Requantize32>
{
- typedef uint8_t operand_type;
- typedef uint8_t return_type;
-
- typedef void (*kern_type)(const uint64_t window_cells, const uint64_t n_valid_cells, uint64_t n_channels, const uint8_t *const *const inptrs, uint8_t *outptr, const Requantize32 &qp);
-
- constexpr static PoolingType pooling_type(void) { return PoolingType::AVERAGE; }
-
-
- kern_type kernel = a64_u8q_nhwc_avg_generic_depthfirst_impl;
-
+ using Parent = IGenericDepthfirstStrategy<uint8_t, uint8_t, Requantize32>;
a64_u8q_nhwc_avg_generic_depthfirst(const CPUInfo *) {}
+ typename Parent::KernelType get_kernel(void) const override { return a64_u8q_nhwc_avg_generic_depthfirst_impl; }
};
} // namespace pooling
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8q_nhwc_avg_generic_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8q_nhwc_avg_generic_depthfirst/generic.cpp
index a57fe6df68..31a3489e5c 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8q_nhwc_avg_generic_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8q_nhwc_avg_generic_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,8 +23,8 @@
*/
#include "pooling.hpp"
-#include <cstddef>
#include <cstdint>
+#include <cstddef>
#include <cstring>
#include <cmath>
@@ -87,12 +87,13 @@ void a64_u8q_nhwc_avg_generic_depthfirst_impl(
f_rescale_value *= 2.0f;
}
- rescale_value = static_cast<int32_t>(round(f_rescale_value * static_cast<float>(1ll << 31)));
- if (static_cast<int64_t>(rescale_value) == (1ll << 31))
+ int64_t long_rescale_value = round(f_rescale_value * static_cast<float>(1ll << 31));
+ if (long_rescale_value == (1ll << 31))
{
shift_value++;
- rescale_value >>= 1;
+ long_rescale_value >>= 1;
}
+ rescale_value = static_cast<int32_t>(long_rescale_value);
}
@@ -118,20 +119,20 @@ void a64_u8q_nhwc_avg_generic_depthfirst_impl(
);
__asm__ __volatile__(
- "mov x26, #0x0\n"
- "mov x25, #0x10\n" // cntb _, ALL, #1
- "mov x24, #0x20\n" // cntb _, ALL, #2
- "mov x23, #0x30\n" // cntb _, ALL, #3
"cmp %x[n_channels], #0x40\n"
+ "mov x27, #0x0\n"
+ "mov x26, #0x10\n" // cntb _, ALL, #1
+ "mov x25, #0x20\n" // cntb _, ALL, #2
+ "mov x24, #0x30\n" // cntb _, ALL, #3
"blt 7f\n"
"1:" // 4-vectors of channels
"ld1r { v15.4s }, [%x[accumulator_init]]\n"
+ "lsr x23, %x[n_valid_cells], #0x1\n"
"mov v14.16b, v15.16b\n"
- "mov x19, %x[inptrs]\n"
"mov v13.16b, v15.16b\n"
- "lsr x22, %x[n_valid_cells], #0x1\n"
"mov v12.16b, v15.16b\n"
"mov v11.16b, v15.16b\n"
+ "mov x22, %x[inptrs]\n"
"mov v10.16b, v15.16b\n"
"mov v9.16b, v15.16b\n"
"mov v8.16b, v15.16b\n"
@@ -143,43 +144,43 @@ void a64_u8q_nhwc_avg_generic_depthfirst_impl(
"mov v2.16b, v15.16b\n"
"mov v1.16b, v15.16b\n"
"mov v0.16b, v15.16b\n"
- "cbz x22, 4f\n"
- "ldp x21, x20, [x19, #0x0]\n"
- "ldr q31, [x21, x26]\n"
- "add x19, x19, #0x10\n"
- "ldr q30, [x20, x26]\n"
- "subs x22, x22, #0x1\n"
- "ldr q29, [x21, x25]\n"
- "ldr q28, [x20, x25]\n"
- "ldr q27, [x21, x24]\n"
- "ldr q26, [x20, x24]\n"
- "ldr q25, [x21, x23]\n"
- "ldr q24, [x20, x23]\n"
+ "cbz x23, 4f\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q31, [x21, x27]\n"
+ "subs x23, x23, #0x1\n"
+ "add x22, x22, #0x10\n"
+ "ldr q30, [x20, x27]\n"
+ "ldr q29, [x21, x26]\n"
+ "ldr q28, [x20, x26]\n"
+ "ldr q27, [x21, x25]\n"
+ "ldr q26, [x20, x25]\n"
+ "ldr q25, [x21, x24]\n"
+ "ldr q24, [x20, x24]\n"
"beq 3f\n"
"2:" // 4-vectors of channels: 2 inputs loop
"uaddl v23.8h, v31.8b, v30.8b\n"
- "ldp x21, x20, [x19, #0x0]\n"
- "add x19, x19, #0x10\n"
"uaddl2 v22.8h, v31.16b, v30.16b\n"
- "ldr q31, [x21, x26]\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q31, [x21, x27]\n"
+ "ldr q30, [x20, x27]\n"
"uaddl v21.8h, v29.8b, v28.8b\n"
- "subs x22, x22, #0x1\n"
"uaddl2 v20.8h, v29.16b, v28.16b\n"
- "ldr q30, [x20, x26]\n"
+ "ldr q29, [x21, x26]\n"
+ "ldr q28, [x20, x26]\n"
"uaddl v19.8h, v27.8b, v26.8b\n"
- "ldr q29, [x21, x25]\n"
"uaddl2 v18.8h, v27.16b, v26.16b\n"
- "ldr q28, [x20, x25]\n"
+ "ldr q27, [x21, x25]\n"
+ "ldr q26, [x20, x25]\n"
"uaddl v17.8h, v25.8b, v24.8b\n"
- "ldr q27, [x21, x24]\n"
"uaddl2 v16.8h, v25.16b, v24.16b\n"
- "ldr q26, [x20, x24]\n"
+ "ldr q25, [x21, x24]\n"
+ "ldr q24, [x20, x24]\n"
+ "subs x23, x23, #0x1\n"
"uaddw v15.4s, v15.4s, v23.4h\n"
- "ldr q25, [x21, x23]\n"
"uaddw2 v14.4s, v14.4s, v23.8h\n"
- "ldr q24, [x20, x23]\n"
"uaddw v13.4s, v13.4s, v22.4h\n"
"uaddw2 v12.4s, v12.4s, v22.8h\n"
+ "add x22, x22, #0x10\n"
"uaddw v11.4s, v11.4s, v21.4h\n"
"uaddw2 v10.4s, v10.4s, v21.8h\n"
"uaddw v9.4s, v9.4s, v20.4h\n"
@@ -219,23 +220,23 @@ void a64_u8q_nhwc_avg_generic_depthfirst_impl(
"uaddw v1.4s, v1.4s, v16.4h\n"
"uaddw2 v0.4s, v0.4s, v16.8h\n"
"4:" // 4-vectors of channels: After loop
- "ands x20, %x[n_valid_cells], #0x1\n"
+ "ands x23, %x[n_valid_cells], #0x1\n"
"beq 6f\n"
"5:" // 4-vectors of channels: Single input loop
- "ldr x21, [x19], #0x8\n"
- "subs x20, x20, #0x1\n"
- "ldr q31, [x21, x26]\n"
- "uxtl v23.8h, v31.8b\n"
- "ldr q29, [x21, x25]\n"
- "uxtl2 v22.8h, v31.16b\n"
- "ldr q27, [x21, x24]\n"
- "ldr q25, [x21, x23]\n"
- "uxtl v21.8h, v29.8b\n"
- "uxtl2 v20.8h, v29.16b\n"
- "uxtl v19.8h, v27.8b\n"
- "uxtl2 v18.8h, v27.16b\n"
- "uxtl v17.8h, v25.8b\n"
- "uxtl2 v16.8h, v25.16b\n"
+ "ldr x20, [x22], #0x8\n"
+ "ldr q16, [x20, x27]\n"
+ "uxtl v23.8h, v16.8b\n"
+ "uxtl2 v22.8h, v16.16b\n"
+ "ldr q16, [x20, x26]\n"
+ "ldr q17, [x20, x25]\n"
+ "uxtl v21.8h, v16.8b\n"
+ "uxtl2 v20.8h, v16.16b\n"
+ "ldr q16, [x20, x24]\n"
+ "uxtl v19.8h, v17.8b\n"
+ "uxtl2 v18.8h, v17.16b\n"
+ "subs x23, x23, #0x1\n"
+ "uxtl v17.8h, v16.8b\n"
+ "uxtl2 v16.8h, v16.16b\n"
"uaddw v15.4s, v15.4s, v23.4h\n"
"uaddw2 v14.4s, v14.4s, v23.8h\n"
"uaddw v13.4s, v13.4s, v22.4h\n"
@@ -254,64 +255,62 @@ void a64_u8q_nhwc_avg_generic_depthfirst_impl(
"uaddw2 v0.4s, v0.4s, v16.8h\n"
"bgt 5b\n"
"6:" // 4-vectors of channels: Single input loop: End
- "movi v21.4s, #0x0\n"
- "ld1r { v20.4s }, [%x[combined_rescale_value]]\n"
- "add x19, %x[quant_params], %[offsetof_qp_output_offset]\n"
- "movi v19.4s, #0xff\n"
- "ld1r { v18.4s }, [%x[left_shift]]\n"
- "sub %x[n_channels], %x[n_channels], #0x40\n"
- "srshl v15.4s, v15.4s, v18.4s\n"
+ "ld1r { v19.4s }, [%x[left_shift]]\n"
+ "ld1r { v18.4s }, [%x[combined_rescale_value]]\n"
+ "srshl v15.4s, v15.4s, v19.4s\n"
+ "srshl v14.4s, v14.4s, v19.4s\n"
"ld1r { v17.4s }, [%x[right_shift]]\n"
+ "srshl v13.4s, v13.4s, v19.4s\n"
+ "srshl v12.4s, v12.4s, v19.4s\n"
+ "add x20, %x[quant_params], %[offsetof_qp_output_offset]\n"
+ "ld1r { v16.4s }, [x20]\n"
+ "srshl v11.4s, v11.4s, v19.4s\n"
+ "srshl v10.4s, v10.4s, v19.4s\n"
+ "sub %x[n_channels], %x[n_channels], #0x40\n"
+ "srshl v9.4s, v9.4s, v19.4s\n"
+ "srshl v8.4s, v8.4s, v19.4s\n"
"cmp %x[n_channels], #0x40\n"
- "srshl v14.4s, v14.4s, v18.4s\n"
- "ld1r { v16.4s }, [x19]\n"
- "srshl v13.4s, v13.4s, v18.4s\n"
- "srshl v12.4s, v12.4s, v18.4s\n"
- "srshl v11.4s, v11.4s, v18.4s\n"
- "sqrdmulh v15.4s, v15.4s, v20.4s\n"
- "sqrdmulh v14.4s, v14.4s, v20.4s\n"
- "sqrdmulh v13.4s, v13.4s, v20.4s\n"
- "sqrdmulh v12.4s, v12.4s, v20.4s\n"
+ "srshl v7.4s, v7.4s, v19.4s\n"
+ "srshl v6.4s, v6.4s, v19.4s\n"
+ "srshl v5.4s, v5.4s, v19.4s\n"
+ "srshl v4.4s, v4.4s, v19.4s\n"
+ "srshl v3.4s, v3.4s, v19.4s\n"
+ "srshl v2.4s, v2.4s, v19.4s\n"
+ "srshl v1.4s, v1.4s, v19.4s\n"
+ "srshl v0.4s, v0.4s, v19.4s\n"
+ "sqrdmulh v15.4s, v15.4s, v18.4s\n"
+ "sqrdmulh v14.4s, v14.4s, v18.4s\n"
+ "sqrdmulh v13.4s, v13.4s, v18.4s\n"
+ "sqrdmulh v12.4s, v12.4s, v18.4s\n"
+ "sqrdmulh v11.4s, v11.4s, v18.4s\n"
+ "sqrdmulh v10.4s, v10.4s, v18.4s\n"
+ "sqrdmulh v9.4s, v9.4s, v18.4s\n"
+ "sqrdmulh v8.4s, v8.4s, v18.4s\n"
+ "sqrdmulh v7.4s, v7.4s, v18.4s\n"
+ "sqrdmulh v6.4s, v6.4s, v18.4s\n"
+ "sqrdmulh v5.4s, v5.4s, v18.4s\n"
+ "sqrdmulh v4.4s, v4.4s, v18.4s\n"
+ "sqrdmulh v3.4s, v3.4s, v18.4s\n"
+ "sqrdmulh v2.4s, v2.4s, v18.4s\n"
+ "sqrdmulh v1.4s, v1.4s, v18.4s\n"
+ "sqrdmulh v0.4s, v0.4s, v18.4s\n"
"srshl v15.4s, v15.4s, v17.4s\n"
"srshl v14.4s, v14.4s, v17.4s\n"
"srshl v13.4s, v13.4s, v17.4s\n"
"srshl v12.4s, v12.4s, v17.4s\n"
- "sqrdmulh v11.4s, v11.4s, v20.4s\n"
- "srshl v10.4s, v10.4s, v18.4s\n"
- "srshl v9.4s, v9.4s, v18.4s\n"
- "srshl v8.4s, v8.4s, v18.4s\n"
"srshl v11.4s, v11.4s, v17.4s\n"
- "sqrdmulh v10.4s, v10.4s, v20.4s\n"
- "sqrdmulh v9.4s, v9.4s, v20.4s\n"
- "sqrdmulh v8.4s, v8.4s, v20.4s\n"
- "srshl v7.4s, v7.4s, v18.4s\n"
"srshl v10.4s, v10.4s, v17.4s\n"
"srshl v9.4s, v9.4s, v17.4s\n"
"srshl v8.4s, v8.4s, v17.4s\n"
- "sqrdmulh v7.4s, v7.4s, v20.4s\n"
- "srshl v6.4s, v6.4s, v18.4s\n"
- "srshl v5.4s, v5.4s, v18.4s\n"
- "srshl v4.4s, v4.4s, v18.4s\n"
"srshl v7.4s, v7.4s, v17.4s\n"
- "sqrdmulh v6.4s, v6.4s, v20.4s\n"
- "sqrdmulh v5.4s, v5.4s, v20.4s\n"
- "sqrdmulh v4.4s, v4.4s, v20.4s\n"
- "srshl v3.4s, v3.4s, v18.4s\n"
"srshl v6.4s, v6.4s, v17.4s\n"
"srshl v5.4s, v5.4s, v17.4s\n"
"srshl v4.4s, v4.4s, v17.4s\n"
- "sqrdmulh v3.4s, v3.4s, v20.4s\n"
- "srshl v2.4s, v2.4s, v18.4s\n"
- "srshl v1.4s, v1.4s, v18.4s\n"
- "srshl v0.4s, v0.4s, v18.4s\n"
"srshl v3.4s, v3.4s, v17.4s\n"
- "sqrdmulh v2.4s, v2.4s, v20.4s\n"
- "sqrdmulh v1.4s, v1.4s, v20.4s\n"
- "sqrdmulh v0.4s, v0.4s, v20.4s\n"
- "add v15.4s, v15.4s, v16.4s\n"
"srshl v2.4s, v2.4s, v17.4s\n"
"srshl v1.4s, v1.4s, v17.4s\n"
"srshl v0.4s, v0.4s, v17.4s\n"
+ "add v15.4s, v15.4s, v16.4s\n"
"add v14.4s, v14.4s, v16.4s\n"
"add v13.4s, v13.4s, v16.4s\n"
"add v12.4s, v12.4s, v16.4s\n"
@@ -327,58 +326,60 @@ void a64_u8q_nhwc_avg_generic_depthfirst_impl(
"add v2.4s, v2.4s, v16.4s\n"
"add v1.4s, v1.4s, v16.4s\n"
"add v0.4s, v0.4s, v16.4s\n"
- "smax v15.4s, v15.4s, v21.4s\n"
- "smax v14.4s, v14.4s, v21.4s\n"
- "smax v13.4s, v13.4s, v21.4s\n"
- "smin v15.4s, v15.4s, v19.4s\n"
- "smin v14.4s, v14.4s, v19.4s\n"
- "smin v13.4s, v13.4s, v19.4s\n"
- "smax v12.4s, v12.4s, v21.4s\n"
- "smax v11.4s, v11.4s, v21.4s\n"
- "smax v10.4s, v10.4s, v21.4s\n"
- "smin v12.4s, v12.4s, v19.4s\n"
- "smin v11.4s, v11.4s, v19.4s\n"
- "smin v10.4s, v10.4s, v19.4s\n"
- "smax v9.4s, v9.4s, v21.4s\n"
- "smax v8.4s, v8.4s, v21.4s\n"
- "smax v7.4s, v7.4s, v21.4s\n"
- "smin v9.4s, v9.4s, v19.4s\n"
- "smin v8.4s, v8.4s, v19.4s\n"
- "smin v7.4s, v7.4s, v19.4s\n"
- "smax v6.4s, v6.4s, v21.4s\n"
- "smax v5.4s, v5.4s, v21.4s\n"
- "smax v4.4s, v4.4s, v21.4s\n"
- "smin v6.4s, v6.4s, v19.4s\n"
- "smin v5.4s, v5.4s, v19.4s\n"
- "smin v4.4s, v4.4s, v19.4s\n"
- "smax v3.4s, v3.4s, v21.4s\n"
- "smax v2.4s, v2.4s, v21.4s\n"
- "smax v1.4s, v1.4s, v21.4s\n"
- "smin v3.4s, v3.4s, v19.4s\n"
- "smin v2.4s, v2.4s, v19.4s\n"
- "smin v1.4s, v1.4s, v19.4s\n"
- "smax v0.4s, v0.4s, v21.4s\n"
+ "movi v16.4s, #0x0\n"
+ "smax v15.4s, v15.4s, v16.4s\n"
+ "smax v14.4s, v14.4s, v16.4s\n"
+ "smax v13.4s, v13.4s, v16.4s\n"
+ "smax v12.4s, v12.4s, v16.4s\n"
+ "smax v11.4s, v11.4s, v16.4s\n"
+ "smax v10.4s, v10.4s, v16.4s\n"
+ "smax v9.4s, v9.4s, v16.4s\n"
+ "smax v8.4s, v8.4s, v16.4s\n"
+ "smax v7.4s, v7.4s, v16.4s\n"
+ "smax v6.4s, v6.4s, v16.4s\n"
+ "smax v5.4s, v5.4s, v16.4s\n"
+ "smax v4.4s, v4.4s, v16.4s\n"
+ "smax v3.4s, v3.4s, v16.4s\n"
+ "smax v2.4s, v2.4s, v16.4s\n"
+ "smax v1.4s, v1.4s, v16.4s\n"
+ "smax v0.4s, v0.4s, v16.4s\n"
+ "movi v16.4s, #0xff\n"
+ "smin v15.4s, v15.4s, v16.4s\n"
+ "smin v14.4s, v14.4s, v16.4s\n"
+ "smin v13.4s, v13.4s, v16.4s\n"
+ "smin v12.4s, v12.4s, v16.4s\n"
+ "smin v11.4s, v11.4s, v16.4s\n"
+ "smin v10.4s, v10.4s, v16.4s\n"
+ "smin v9.4s, v9.4s, v16.4s\n"
+ "smin v8.4s, v8.4s, v16.4s\n"
+ "smin v7.4s, v7.4s, v16.4s\n"
+ "smin v6.4s, v6.4s, v16.4s\n"
+ "smin v5.4s, v5.4s, v16.4s\n"
+ "smin v4.4s, v4.4s, v16.4s\n"
+ "smin v3.4s, v3.4s, v16.4s\n"
+ "smin v2.4s, v2.4s, v16.4s\n"
+ "smin v1.4s, v1.4s, v16.4s\n"
+ "smin v0.4s, v0.4s, v16.4s\n"
"uzp1 v23.16b, v15.16b, v14.16b\n"
"uzp1 v16.16b, v13.16b, v12.16b\n"
- "smin v0.4s, v0.4s, v19.4s\n"
"uzp1 v22.16b, v11.16b, v10.16b\n"
- "uzp1 v21.16b, v9.16b, v8.16b\n"
- "uzp1 v20.16b, v7.16b, v6.16b\n"
+ "uzp1 v18.16b, v9.16b, v8.16b\n"
+ "uzp1 v21.16b, v7.16b, v6.16b\n"
"uzp1 v17.16b, v5.16b, v4.16b\n"
- "uzp1 v19.16b, v3.16b, v2.16b\n"
- "uzp1 v18.16b, v1.16b, v0.16b\n"
+ "uzp1 v20.16b, v3.16b, v2.16b\n"
+ "uzp1 v19.16b, v1.16b, v0.16b\n"
"uzp1 v16.16b, v23.16b, v16.16b\n"
- "str q16, [%x[outptr], x26]\n"
- "uzp1 v16.16b, v22.16b, v21.16b\n"
+ "uzp1 v18.16b, v22.16b, v18.16b\n"
+ "str q16, [%x[outptr], x27]\n"
+ "add x27, x27, #0x40\n"
+ "uzp1 v17.16b, v21.16b, v17.16b\n"
+ "uzp1 v16.16b, v20.16b, v19.16b\n"
+ "str q18, [%x[outptr], x26]\n"
"add x26, x26, #0x40\n"
- "uzp1 v17.16b, v20.16b, v17.16b\n"
- "str q16, [%x[outptr], x25]\n"
- "uzp1 v16.16b, v19.16b, v18.16b\n"
+ "str q17, [%x[outptr], x25]\n"
"add x25, x25, #0x40\n"
- "str q17, [%x[outptr], x24]\n"
+ "str q16, [%x[outptr], x24]\n"
"add x24, x24, #0x40\n"
- "str q16, [%x[outptr], x23]\n"
- "add x23, x23, #0x40\n"
"bge 1b\n"
"cbz %x[n_channels], 43f\n"
"7:" // Single vector of channels
@@ -386,70 +387,68 @@ void a64_u8q_nhwc_avg_generic_depthfirst_impl(
"blt 14f\n"
"8:" // Single vector of channels: Loop
"ld1r { v15.4s }, [%x[accumulator_init]]\n"
+ "lsr x23, %x[n_valid_cells], #0x1\n"
"mov v14.16b, v15.16b\n"
- "mov x19, %x[inptrs]\n"
"mov v13.16b, v15.16b\n"
- "lsr x22, %x[n_valid_cells], #0x1\n"
"mov v12.16b, v15.16b\n"
- "cbz x22, 11f\n"
- "ldp x21, x20, [x19, #0x0]\n"
- "ldr q31, [x21, x26]\n"
- "add x19, x19, #0x10\n"
- "ldr q30, [x20, x26]\n"
- "subs x22, x22, #0x1\n"
+ "mov x22, %x[inptrs]\n"
+ "cbz x23, 11f\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q31, [x21, x27]\n"
+ "subs x23, x23, #0x1\n"
+ "add x22, x22, #0x10\n"
+ "ldr q30, [x20, x27]\n"
"beq 10f\n"
"9:" // Single vector of channels: Loop: 2 inputs loop
- "uaddl v23.8h, v31.8b, v30.8b\n"
- "ldp x21, x20, [x19, #0x0]\n"
- "add x19, x19, #0x10\n"
- "uaddl2 v22.8h, v31.16b, v30.16b\n"
- "ldr q31, [x21, x26]\n"
- "uaddw v15.4s, v15.4s, v23.4h\n"
- "subs x22, x22, #0x1\n"
- "uaddw2 v14.4s, v14.4s, v23.8h\n"
- "ldr q30, [x20, x26]\n"
- "uaddw v13.4s, v13.4s, v22.4h\n"
- "uaddw2 v12.4s, v12.4s, v22.8h\n"
+ "uaddl v17.8h, v31.8b, v30.8b\n"
+ "uaddl2 v16.8h, v31.16b, v30.16b\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q31, [x21, x27]\n"
+ "ldr q30, [x20, x27]\n"
+ "subs x23, x23, #0x1\n"
+ "uaddw v15.4s, v15.4s, v17.4h\n"
+ "uaddw2 v14.4s, v14.4s, v17.8h\n"
+ "uaddw v13.4s, v13.4s, v16.4h\n"
+ "uaddw2 v12.4s, v12.4s, v16.8h\n"
+ "add x22, x22, #0x10\n"
"bgt 9b\n"
"10:" // Single vector of channels: Loop: 2 inputs tail
- "uaddl v23.8h, v31.8b, v30.8b\n"
- "uaddl2 v22.8h, v31.16b, v30.16b\n"
- "uaddw v15.4s, v15.4s, v23.4h\n"
- "uaddw2 v14.4s, v14.4s, v23.8h\n"
- "uaddw v13.4s, v13.4s, v22.4h\n"
- "uaddw2 v12.4s, v12.4s, v22.8h\n"
+ "uaddl v17.8h, v31.8b, v30.8b\n"
+ "uaddl2 v16.8h, v31.16b, v30.16b\n"
+ "uaddw v15.4s, v15.4s, v17.4h\n"
+ "uaddw2 v14.4s, v14.4s, v17.8h\n"
+ "uaddw v13.4s, v13.4s, v16.4h\n"
+ "uaddw2 v12.4s, v12.4s, v16.8h\n"
"11:" // Single vector of channels: Loop: After loop
- "ands x20, %x[n_valid_cells], #0x1\n"
+ "ands x23, %x[n_valid_cells], #0x1\n"
"beq 13f\n"
"12:" // Single vector of channels: Loop: Single input loop
- "ldr x21, [x19], #0x8\n"
- "subs x20, x20, #0x1\n"
- "ldr q31, [x21, x26]\n"
- "uxtl v23.8h, v31.8b\n"
- "uxtl2 v22.8h, v31.16b\n"
- "uaddw v15.4s, v15.4s, v23.4h\n"
- "uaddw2 v14.4s, v14.4s, v23.8h\n"
- "uaddw v13.4s, v13.4s, v22.4h\n"
- "uaddw2 v12.4s, v12.4s, v22.8h\n"
+ "ldr x20, [x22], #0x8\n"
+ "ldr q16, [x20, x27]\n"
+ "uxtl v17.8h, v16.8b\n"
+ "uxtl2 v16.8h, v16.16b\n"
+ "subs x23, x23, #0x1\n"
+ "uaddw v15.4s, v15.4s, v17.4h\n"
+ "uaddw2 v14.4s, v14.4s, v17.8h\n"
+ "uaddw v13.4s, v13.4s, v16.4h\n"
+ "uaddw2 v12.4s, v12.4s, v16.8h\n"
"bgt 12b\n"
"13:" // Single vector of channels: Loop: Single input loop: End
- "movi v21.4s, #0x0\n"
- "ld1r { v20.4s }, [%x[combined_rescale_value]]\n"
- "add x19, %x[quant_params], %[offsetof_qp_output_offset]\n"
- "movi v19.4s, #0xff\n"
- "ld1r { v18.4s }, [%x[left_shift]]\n"
- "sub %x[n_channels], %x[n_channels], #0x10\n"
- "srshl v15.4s, v15.4s, v18.4s\n"
+ "ld1r { v16.4s }, [%x[left_shift]]\n"
+ "ld1r { v18.4s }, [%x[combined_rescale_value]]\n"
+ "srshl v15.4s, v15.4s, v16.4s\n"
+ "srshl v14.4s, v14.4s, v16.4s\n"
"ld1r { v17.4s }, [%x[right_shift]]\n"
+ "srshl v13.4s, v13.4s, v16.4s\n"
+ "srshl v12.4s, v12.4s, v16.4s\n"
+ "add x20, %x[quant_params], %[offsetof_qp_output_offset]\n"
+ "ld1r { v16.4s }, [x20]\n"
+ "sqrdmulh v15.4s, v15.4s, v18.4s\n"
+ "sqrdmulh v14.4s, v14.4s, v18.4s\n"
+ "sub %x[n_channels], %x[n_channels], #0x10\n"
+ "sqrdmulh v13.4s, v13.4s, v18.4s\n"
+ "sqrdmulh v12.4s, v12.4s, v18.4s\n"
"cmp %x[n_channels], #0x10\n"
- "srshl v14.4s, v14.4s, v18.4s\n"
- "ld1r { v16.4s }, [x19]\n"
- "srshl v13.4s, v13.4s, v18.4s\n"
- "srshl v12.4s, v12.4s, v18.4s\n"
- "sqrdmulh v15.4s, v15.4s, v20.4s\n"
- "sqrdmulh v14.4s, v14.4s, v20.4s\n"
- "sqrdmulh v13.4s, v13.4s, v20.4s\n"
- "sqrdmulh v12.4s, v12.4s, v20.4s\n"
"srshl v15.4s, v15.4s, v17.4s\n"
"srshl v14.4s, v14.4s, v17.4s\n"
"srshl v13.4s, v13.4s, v17.4s\n"
@@ -458,37 +457,39 @@ void a64_u8q_nhwc_avg_generic_depthfirst_impl(
"add v14.4s, v14.4s, v16.4s\n"
"add v13.4s, v13.4s, v16.4s\n"
"add v12.4s, v12.4s, v16.4s\n"
- "smax v15.4s, v15.4s, v21.4s\n"
- "smax v14.4s, v14.4s, v21.4s\n"
- "smax v13.4s, v13.4s, v21.4s\n"
- "smin v15.4s, v15.4s, v19.4s\n"
- "smin v14.4s, v14.4s, v19.4s\n"
- "smin v13.4s, v13.4s, v19.4s\n"
- "smax v12.4s, v12.4s, v21.4s\n"
- "uzp1 v23.16b, v15.16b, v14.16b\n"
- "smin v12.4s, v12.4s, v19.4s\n"
+ "movi v16.4s, #0x0\n"
+ "smax v15.4s, v15.4s, v16.4s\n"
+ "smax v14.4s, v14.4s, v16.4s\n"
+ "smax v13.4s, v13.4s, v16.4s\n"
+ "smax v12.4s, v12.4s, v16.4s\n"
+ "movi v16.4s, #0xff\n"
+ "smin v15.4s, v15.4s, v16.4s\n"
+ "smin v14.4s, v14.4s, v16.4s\n"
+ "smin v13.4s, v13.4s, v16.4s\n"
+ "smin v12.4s, v12.4s, v16.4s\n"
+ "uzp1 v17.16b, v15.16b, v14.16b\n"
"uzp1 v16.16b, v13.16b, v12.16b\n"
- "uzp1 v16.16b, v23.16b, v16.16b\n"
- "str q16, [%x[outptr], x26]\n"
- "add x26, x26, #0x10\n"
+ "uzp1 v16.16b, v17.16b, v16.16b\n"
+ "str q16, [%x[outptr], x27]\n"
+ "add x27, x27, #0x10\n"
"bge 8b\n"
"cbz %x[n_channels], 43f\n"
"14:" // Oddments
"ld1r { v15.4s }, [%x[accumulator_init]]\n"
+ "lsr x23, %x[n_valid_cells], #0x1\n"
+ "add %x[outptr], %x[outptr], x27\n"
"mov v14.16b, v15.16b\n"
- "add %x[outptr], %x[outptr], x26\n"
"mov v13.16b, v15.16b\n"
- "mov x19, %x[inptrs]\n"
"mov v12.16b, v15.16b\n"
- "lsr x22, %x[n_valid_cells], #0x1\n"
- "cbz x22, 24f\n"
+ "mov x22, %x[inptrs]\n"
+ "cbz x23, 24f\n"
"15:" // Oddments: 2 inputs loop
+ "ldp x21, x20, [x22, #0x0]\n"
+ "add x22, x22, #0x10\n"
+ "add x21, x21, x27\n"
"movi v31.16b, #0x0\n"
- "ldp x21, x20, [x19, #0x0]\n"
- "add x19, x19, #0x10\n"
+ "add x20, x20, x27\n"
"movi v30.16b, #0x0\n"
- "add x21, x21, x26\n"
- "add x20, x20, x26\n"
"tbz %x[n_channels], #3, 19f\n"
"ldr d31, [x21], #0x8\n"
"ldr d30, [x20], #0x8\n"
@@ -549,21 +550,21 @@ void a64_u8q_nhwc_avg_generic_depthfirst_impl(
"ldr b31, [x21], #0x1\n"
"ldr b30, [x20], #0x1\n"
"23:" // Oddments: 2 inputs loop: Load: Bit 3: End
- "uaddl v23.8h, v31.8b, v30.8b\n"
- "subs x22, x22, #0x1\n"
- "uaddl2 v22.8h, v31.16b, v30.16b\n"
- "uaddw v15.4s, v15.4s, v23.4h\n"
- "uaddw2 v14.4s, v14.4s, v23.8h\n"
- "uaddw v13.4s, v13.4s, v22.4h\n"
- "uaddw2 v12.4s, v12.4s, v22.8h\n"
+ "uaddl v17.8h, v31.8b, v30.8b\n"
+ "uaddl2 v16.8h, v31.16b, v30.16b\n"
+ "subs x23, x23, #0x1\n"
+ "uaddw v15.4s, v15.4s, v17.4h\n"
+ "uaddw2 v14.4s, v14.4s, v17.8h\n"
+ "uaddw v13.4s, v13.4s, v16.4h\n"
+ "uaddw2 v12.4s, v12.4s, v16.8h\n"
"bgt 15b\n"
"24:" // Oddments: After loop
- "ands x20, %x[n_valid_cells], #0x1\n"
+ "ands x23, %x[n_valid_cells], #0x1\n"
"beq 34f\n"
"25:" // Oddments: Single input loop
+ "ldr x21, [x22], #0x8\n"
+ "add x21, x21, x27\n"
"movi v31.16b, #0x0\n"
- "ldr x21, [x19], #0x8\n"
- "add x21, x21, x26\n"
"tbz %x[n_channels], #3, 29f\n"
"ldr d31, [x21], #0x8\n"
"tbz %x[n_channels], #2, 27f\n"
@@ -609,30 +610,28 @@ void a64_u8q_nhwc_avg_generic_depthfirst_impl(
"tbz %x[n_channels], #0, 33f\n"
"ldr b31, [x21], #0x1\n"
"33:" // Oddments: Single input loop: Load: Bit 3: End
- "uxtl v23.8h, v31.8b\n"
- "subs x20, x20, #0x1\n"
- "uxtl2 v22.8h, v31.16b\n"
- "uaddw v15.4s, v15.4s, v23.4h\n"
- "uaddw2 v14.4s, v14.4s, v23.8h\n"
- "uaddw v13.4s, v13.4s, v22.4h\n"
- "uaddw2 v12.4s, v12.4s, v22.8h\n"
+ "uxtl v17.8h, v31.8b\n"
+ "uxtl2 v16.8h, v31.16b\n"
+ "subs x23, x23, #0x1\n"
+ "uaddw v15.4s, v15.4s, v17.4h\n"
+ "uaddw2 v14.4s, v14.4s, v17.8h\n"
+ "uaddw v13.4s, v13.4s, v16.4h\n"
+ "uaddw2 v12.4s, v12.4s, v16.8h\n"
"bgt 25b\n"
"34:" // Oddments: Single input loop: End
- "movi v21.4s, #0x0\n"
- "ld1r { v20.4s }, [%x[combined_rescale_value]]\n"
- "add x19, %x[quant_params], %[offsetof_qp_output_offset]\n"
- "movi v19.4s, #0xff\n"
- "ld1r { v18.4s }, [%x[left_shift]]\n"
+ "ld1r { v16.4s }, [%x[left_shift]]\n"
+ "ld1r { v18.4s }, [%x[combined_rescale_value]]\n"
+ "srshl v15.4s, v15.4s, v16.4s\n"
+ "srshl v14.4s, v14.4s, v16.4s\n"
"ld1r { v17.4s }, [%x[right_shift]]\n"
- "srshl v15.4s, v15.4s, v18.4s\n"
- "ld1r { v16.4s }, [x19]\n"
- "srshl v14.4s, v14.4s, v18.4s\n"
- "srshl v13.4s, v13.4s, v18.4s\n"
- "srshl v12.4s, v12.4s, v18.4s\n"
- "sqrdmulh v15.4s, v15.4s, v20.4s\n"
- "sqrdmulh v14.4s, v14.4s, v20.4s\n"
- "sqrdmulh v13.4s, v13.4s, v20.4s\n"
- "sqrdmulh v12.4s, v12.4s, v20.4s\n"
+ "srshl v13.4s, v13.4s, v16.4s\n"
+ "srshl v12.4s, v12.4s, v16.4s\n"
+ "add x20, %x[quant_params], %[offsetof_qp_output_offset]\n"
+ "ld1r { v16.4s }, [x20]\n"
+ "sqrdmulh v15.4s, v15.4s, v18.4s\n"
+ "sqrdmulh v14.4s, v14.4s, v18.4s\n"
+ "sqrdmulh v13.4s, v13.4s, v18.4s\n"
+ "sqrdmulh v12.4s, v12.4s, v18.4s\n"
"srshl v15.4s, v15.4s, v17.4s\n"
"srshl v14.4s, v14.4s, v17.4s\n"
"srshl v13.4s, v13.4s, v17.4s\n"
@@ -641,17 +640,19 @@ void a64_u8q_nhwc_avg_generic_depthfirst_impl(
"add v14.4s, v14.4s, v16.4s\n"
"add v13.4s, v13.4s, v16.4s\n"
"add v12.4s, v12.4s, v16.4s\n"
- "smax v15.4s, v15.4s, v21.4s\n"
- "smax v14.4s, v14.4s, v21.4s\n"
- "smax v13.4s, v13.4s, v21.4s\n"
- "smin v15.4s, v15.4s, v19.4s\n"
- "smin v14.4s, v14.4s, v19.4s\n"
- "smin v13.4s, v13.4s, v19.4s\n"
- "smax v12.4s, v12.4s, v21.4s\n"
- "uzp1 v23.16b, v15.16b, v14.16b\n"
- "smin v12.4s, v12.4s, v19.4s\n"
+ "movi v16.4s, #0x0\n"
+ "smax v15.4s, v15.4s, v16.4s\n"
+ "smax v14.4s, v14.4s, v16.4s\n"
+ "smax v13.4s, v13.4s, v16.4s\n"
+ "smax v12.4s, v12.4s, v16.4s\n"
+ "movi v16.4s, #0xff\n"
+ "smin v15.4s, v15.4s, v16.4s\n"
+ "smin v14.4s, v14.4s, v16.4s\n"
+ "smin v13.4s, v13.4s, v16.4s\n"
+ "smin v12.4s, v12.4s, v16.4s\n"
+ "uzp1 v17.16b, v15.16b, v14.16b\n"
"uzp1 v16.16b, v13.16b, v12.16b\n"
- "uzp1 v16.16b, v23.16b, v16.16b\n"
+ "uzp1 v16.16b, v17.16b, v16.16b\n"
"tbz %x[n_channels], #3, 38f\n"
"st1 { v16.d }[0], [%x[outptr]], #0x8\n"
"tbz %x[n_channels], #2, 36f\n"
@@ -697,12 +698,10 @@ void a64_u8q_nhwc_avg_generic_depthfirst_impl(
"tbz %x[n_channels], #0, 42f\n"
"st1 { v16.b }[0], [%x[outptr]], #0x1\n"
"42:" // Oddments: Store: Bit 3: End
-
"43:" // End
-
: [n_channels] "+&r" (n_channels), [outptr] "+&r" (outptr)
: [accumulator_init] "r" (&accumulator_init), [combined_rescale_value] "r" (&combined_rescale_value), [inptrs] "r" (inptrs), [left_shift] "r" (&left_shift), [n_valid_cells] "r" (n_valid_cells), [offsetof_qp_output_offset] "I" (offsetof(Requantize32, output_offset)), [quant_params] "r" (&qp), [right_shift] "r" (&right_shift)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8q_nhwc_max_generic_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8q_nhwc_max_generic_depthfirst.hpp
index 1b97b458c0..fa9600f83d 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8q_nhwc_max_generic_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8q_nhwc_max_generic_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -33,19 +33,11 @@ namespace pooling {
void a64_u8q_nhwc_max_generic_depthfirst_impl(const uint64_t, const uint64_t n_valid_cells, uint64_t n_channels, const uint8_t *const *const inptrs, uint8_t *outptr, const Requantize32 &qp);
-struct a64_u8q_nhwc_max_generic_depthfirst
+struct a64_u8q_nhwc_max_generic_depthfirst : IGenericDepthfirstStrategy<uint8_t, uint8_t, Requantize32>
{
- typedef uint8_t operand_type;
- typedef uint8_t return_type;
-
- typedef void (*kern_type)(const uint64_t, const uint64_t n_valid_cells, uint64_t n_channels, const uint8_t *const *const inptrs, uint8_t *outptr, const Requantize32 &qp);
-
- constexpr static PoolingType pooling_type(void) { return PoolingType::MAX; }
-
-
- kern_type kernel = a64_u8q_nhwc_max_generic_depthfirst_impl;
-
+ using Parent = IGenericDepthfirstStrategy<uint8_t, uint8_t, Requantize32>;
a64_u8q_nhwc_max_generic_depthfirst(const CPUInfo *) {}
+ typename Parent::KernelType get_kernel(void) const override { return a64_u8q_nhwc_max_generic_depthfirst_impl; }
};
} // namespace pooling
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8q_nhwc_max_generic_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8q_nhwc_max_generic_depthfirst/generic.cpp
index 0d196e097e..f4927c5536 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8q_nhwc_max_generic_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8q_nhwc_max_generic_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,8 +23,8 @@
*/
#include "pooling.hpp"
-#include <cstddef>
#include <cstdint>
+#include <cstddef>
#if defined(__aarch64__)
@@ -42,583 +42,583 @@ void a64_u8q_nhwc_max_generic_depthfirst_impl(
)
{
__asm__ __volatile__(
- "mov x28, #0x0\n"
- "mov x27, #0x10\n" // cntb _, ALL, #1
- "mov x26, #0x20\n" // cntb _, ALL, #2
- "mov x25, #0x30\n" // cntb _, ALL, #3
"cmp %x[n_channels], #0x40\n"
+ "mov x27, #0x0\n"
+ "mov x26, #0x10\n" // cntb _, ALL, #1
+ "mov x24, #0x20\n" // cntb _, ALL, #2
+ "mov x23, #0x30\n" // cntb _, ALL, #3
"blt 7f\n"
"1:" // 4-vectors of channels
- "movi v4.16b, #0x0\n"
- "mov x19, %x[inptrs]\n"
+ "lsr x25, %x[n_valid_cells], #0x2\n"
"movi v8.16b, #0x0\n"
- "lsr x24, %x[n_valid_cells], #0x2\n"
"movi v7.16b, #0x0\n"
+ "mov x22, %x[inptrs]\n"
"movi v6.16b, #0x0\n"
- "cbz x24, 4f\n"
- "ldp x23, x22, [x19, #0x0]\n"
- "ldp x21, x20, [x19, #0x10]\n"
- "add x19, x19, #0x20\n"
- "subs x24, x24, #0x1\n"
- "ldr q3, [x23, x28]\n"
- "ldr q2, [x22, x28]\n"
- "ldr q1, [x21, x28]\n"
- "ldr q0, [x20, x28]\n"
- "ldr q31, [x23, x27]\n"
- "ldr q30, [x22, x27]\n"
- "ldr q29, [x21, x27]\n"
- "ldr q28, [x20, x27]\n"
- "ldr q27, [x23, x26]\n"
- "ldr q21, [x22, x26]\n"
- "ldr q26, [x21, x26]\n"
- "ldr q17, [x20, x26]\n"
- "ldr q25, [x23, x25]\n"
- "ldr q20, [x22, x25]\n"
- "ldr q24, [x21, x25]\n"
- "ldr q16, [x20, x25]\n"
+ "movi v5.16b, #0x0\n"
+ "cbz x25, 4f\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q4, [x21, x27]\n"
+ "subs x25, x25, #0x1\n"
+ "ldr q3, [x20, x27]\n"
+ "ldr q2, [x21, x26]\n"
+ "ldr q1, [x20, x26]\n"
+ "ldr q0, [x21, x24]\n"
+ "ldr q31, [x20, x24]\n"
+ "ldr q30, [x21, x23]\n"
+ "ldr q29, [x20, x23]\n"
+ "ldp x21, x20, [x22, #0x10]\n"
+ "add x22, x22, #0x20\n"
+ "ldr q28, [x21, x27]\n"
+ "ldr q22, [x20, x27]\n"
+ "ldr q27, [x21, x26]\n"
+ "ldr q21, [x20, x26]\n"
+ "ldr q26, [x21, x24]\n"
+ "ldr q20, [x20, x24]\n"
+ "ldr q25, [x21, x23]\n"
+ "ldr q24, [x20, x23]\n"
"beq 3f\n"
"2:" // 4-vectors of channels: 4 inputs loop
- "umax v23.16b, v3.16b, v2.16b\n"
- "ldp x23, x22, [x19, #0x0]\n"
- "subs x24, x24, #0x1\n"
- "umax v19.16b, v1.16b, v0.16b\n"
- "ldp x21, x20, [x19, #0x10]\n"
- "add x19, x19, #0x20\n"
- "umax v22.16b, v31.16b, v30.16b\n"
- "ldr q3, [x23, x28]\n"
- "umax v18.16b, v29.16b, v28.16b\n"
- "umax v21.16b, v27.16b, v21.16b\n"
- "ldr q2, [x22, x28]\n"
- "umax v17.16b, v26.16b, v17.16b\n"
- "ldr q1, [x21, x28]\n"
- "umax v20.16b, v25.16b, v20.16b\n"
- "ldr q0, [x20, x28]\n"
- "umax v16.16b, v24.16b, v16.16b\n"
- "ldr q31, [x23, x27]\n"
+ "umax v23.16b, v4.16b, v3.16b\n"
+ "umax v19.16b, v28.16b, v22.16b\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q4, [x21, x27]\n"
+ "ldr q3, [x20, x27]\n"
+ "umax v22.16b, v2.16b, v1.16b\n"
+ "ldr q2, [x21, x26]\n"
+ "umax v18.16b, v27.16b, v21.16b\n"
+ "ldr q1, [x20, x26]\n"
+ "umax v21.16b, v0.16b, v31.16b\n"
+ "ldr q0, [x21, x24]\n"
+ "umax v17.16b, v26.16b, v20.16b\n"
+ "ldr q31, [x20, x24]\n"
+ "umax v20.16b, v30.16b, v29.16b\n"
+ "ldr q30, [x21, x23]\n"
+ "umax v16.16b, v25.16b, v24.16b\n"
+ "ldr q29, [x20, x23]\n"
"umax v19.16b, v23.16b, v19.16b\n"
- "ldr q30, [x22, x27]\n"
"umax v18.16b, v22.16b, v18.16b\n"
- "ldr q29, [x21, x27]\n"
+ "ldp x21, x20, [x22, #0x10]\n"
+ "ldr q28, [x21, x27]\n"
+ "ldr q22, [x20, x27]\n"
"umax v17.16b, v21.16b, v17.16b\n"
- "ldr q28, [x20, x27]\n"
"umax v16.16b, v20.16b, v16.16b\n"
- "ldr q27, [x23, x26]\n"
- "umax v4.16b, v4.16b, v19.16b\n"
- "ldr q21, [x22, x26]\n"
- "umax v8.16b, v8.16b, v18.16b\n"
- "ldr q26, [x21, x26]\n"
- "umax v7.16b, v7.16b, v17.16b\n"
- "ldr q17, [x20, x26]\n"
- "umax v6.16b, v6.16b, v16.16b\n"
- "ldr q25, [x23, x25]\n"
- "ldr q20, [x22, x25]\n"
- "ldr q24, [x21, x25]\n"
- "ldr q16, [x20, x25]\n"
+ "ldr q27, [x21, x26]\n"
+ "ldr q21, [x20, x26]\n"
+ "subs x25, x25, #0x1\n"
+ "umax v8.16b, v8.16b, v19.16b\n"
+ "ldr q26, [x21, x24]\n"
+ "ldr q20, [x20, x24]\n"
+ "umax v7.16b, v7.16b, v18.16b\n"
+ "umax v6.16b, v6.16b, v17.16b\n"
+ "ldr q25, [x21, x23]\n"
+ "ldr q24, [x20, x23]\n"
+ "umax v5.16b, v5.16b, v16.16b\n"
+ "add x22, x22, #0x20\n"
"bgt 2b\n"
"3:" // 4-vectors of channels: 4 inputs tail
- "umax v23.16b, v3.16b, v2.16b\n"
- "umax v19.16b, v1.16b, v0.16b\n"
- "umax v22.16b, v31.16b, v30.16b\n"
- "umax v18.16b, v29.16b, v28.16b\n"
- "umax v21.16b, v27.16b, v21.16b\n"
- "umax v17.16b, v26.16b, v17.16b\n"
- "umax v20.16b, v25.16b, v20.16b\n"
- "umax v16.16b, v24.16b, v16.16b\n"
+ "umax v23.16b, v4.16b, v3.16b\n"
+ "umax v19.16b, v28.16b, v22.16b\n"
+ "umax v22.16b, v2.16b, v1.16b\n"
+ "umax v18.16b, v27.16b, v21.16b\n"
+ "umax v21.16b, v0.16b, v31.16b\n"
+ "umax v17.16b, v26.16b, v20.16b\n"
+ "umax v20.16b, v30.16b, v29.16b\n"
+ "umax v16.16b, v25.16b, v24.16b\n"
"umax v19.16b, v23.16b, v19.16b\n"
"umax v18.16b, v22.16b, v18.16b\n"
"umax v17.16b, v21.16b, v17.16b\n"
"umax v16.16b, v20.16b, v16.16b\n"
- "umax v4.16b, v4.16b, v19.16b\n"
- "umax v8.16b, v8.16b, v18.16b\n"
- "umax v7.16b, v7.16b, v17.16b\n"
- "umax v6.16b, v6.16b, v16.16b\n"
+ "umax v8.16b, v8.16b, v19.16b\n"
+ "umax v7.16b, v7.16b, v18.16b\n"
+ "umax v6.16b, v6.16b, v17.16b\n"
+ "umax v5.16b, v5.16b, v16.16b\n"
"4:" // 4-vectors of channels: After loop
- "ands x20, %x[n_valid_cells], #0x3\n"
+ "ands x21, %x[n_valid_cells], #0x3\n"
"beq 6f\n"
"5:" // 4-vectors of channels: Single input loop
- "ldr x23, [x19], #0x8\n"
- "subs x20, x20, #0x1\n"
- "ldr q3, [x23, x28]\n"
- "umax v4.16b, v4.16b, v3.16b\n"
- "ldr q31, [x23, x27]\n"
- "ldr q27, [x23, x26]\n"
- "umax v8.16b, v8.16b, v31.16b\n"
- "ldr q25, [x23, x25]\n"
- "umax v7.16b, v7.16b, v27.16b\n"
- "umax v6.16b, v6.16b, v25.16b\n"
+ "ldr x20, [x22], #0x8\n"
+ "ldr q16, [x20, x27]\n"
+ "subs x21, x21, #0x1\n"
+ "umax v8.16b, v8.16b, v16.16b\n"
+ "ldr q17, [x20, x26]\n"
+ "ldr q16, [x20, x24]\n"
+ "umax v7.16b, v7.16b, v17.16b\n"
+ "umax v6.16b, v6.16b, v16.16b\n"
+ "ldr q16, [x20, x23]\n"
+ "umax v5.16b, v5.16b, v16.16b\n"
"bgt 5b\n"
"6:" // 4-vectors of channels: Single input loop: End
- "uxtl v17.8h, v4.8b\n"
- "add x19, %x[quant_params], %[offsetof_qp_input_offset]\n"
- "ld1r { v5.4s }, [x19]\n"
- "uxtl2 v16.8h, v4.16b\n"
- "add x19, %x[quant_params], %[offsetof_qp_per_layer_mul]\n"
- "uxtl v21.8h, v8.8b\n"
- "ld1r { v4.4s }, [x19]\n"
- "add x19, %x[quant_params], %[offsetof_qp_per_layer_left_shift]\n"
- "uxtl2 v20.8h, v8.16b\n"
- "ld1r { v3.4s }, [x19]\n"
- "add x19, %x[quant_params], %[offsetof_qp_per_layer_right_shift]\n"
- "uxtl v19.8h, v7.8b\n"
- "ld1r { v2.4s }, [x19]\n"
- "add x19, %x[quant_params], %[offsetof_qp_output_offset]\n"
- "uxtl2 v24.8h, v7.16b\n"
- "ld1r { v1.4s }, [x19]\n"
+ "add x20, %x[quant_params], %[offsetof_qp_input_offset]\n"
+ "ld1r { v4.4s }, [x20]\n"
+ "uxtl v23.8h, v8.8b\n"
+ "uxtl2 v24.8h, v8.16b\n"
+ "uxtl v22.8h, v7.8b\n"
+ "uxtl2 v21.8h, v7.16b\n"
+ "add x20, %x[quant_params], %[offsetof_qp_per_layer_left_shift]\n"
+ "ld1r { v3.4s }, [x20]\n"
+ "uxtl v20.8h, v6.8b\n"
+ "uxtl2 v17.8h, v6.16b\n"
+ "add x20, %x[quant_params], %[offsetof_qp_per_layer_mul]\n"
+ "ld1r { v2.4s }, [x20]\n"
+ "uxtl v19.8h, v5.8b\n"
+ "uxtl2 v18.8h, v5.16b\n"
+ "add x20, %x[quant_params], %[offsetof_qp_per_layer_right_shift]\n"
+ "ld1r { v1.4s }, [x20]\n"
+ "neg v4.4s, v4.4s\n"
+ "saddw v0.4s, v4.4s, v23.4h\n"
+ "add x20, %x[quant_params], %[offsetof_qp_output_offset]\n"
+ "ld1r { v16.4s }, [x20]\n"
+ "saddw2 v23.4s, v4.4s, v23.8h\n"
+ "saddw v31.4s, v4.4s, v24.4h\n"
"sub %x[n_channels], %x[n_channels], #0x40\n"
- "uxtl v0.8h, v6.8b\n"
"cmp %x[n_channels], #0x40\n"
- "uxtl2 v31.8h, v6.16b\n"
- "neg v5.4s, v5.4s\n"
- "movi v30.4s, #0x0\n"
- "movi v29.4s, #0xff\n"
- "saddw v23.4s, v5.4s, v17.4h\n"
- "saddw2 v18.4s, v5.4s, v17.8h\n"
- "saddw v17.4s, v5.4s, v16.4h\n"
- "saddw2 v16.4s, v5.4s, v16.8h\n"
- "saddw v22.4s, v5.4s, v21.4h\n"
- "saddw2 v21.4s, v5.4s, v21.8h\n"
- "saddw v28.4s, v5.4s, v20.4h\n"
- "saddw2 v20.4s, v5.4s, v20.8h\n"
- "saddw v27.4s, v5.4s, v19.4h\n"
- "saddw2 v19.4s, v5.4s, v19.8h\n"
+ "saddw2 v30.4s, v4.4s, v24.8h\n"
+ "saddw v29.4s, v4.4s, v22.4h\n"
+ "saddw2 v22.4s, v4.4s, v22.8h\n"
+ "saddw v28.4s, v4.4s, v21.4h\n"
+ "saddw2 v21.4s, v4.4s, v21.8h\n"
+ "saddw v27.4s, v4.4s, v20.4h\n"
+ "saddw2 v20.4s, v4.4s, v20.8h\n"
+ "saddw v26.4s, v4.4s, v17.4h\n"
+ "saddw2 v17.4s, v4.4s, v17.8h\n"
+ "saddw v25.4s, v4.4s, v19.4h\n"
+ "saddw2 v19.4s, v4.4s, v19.8h\n"
+ "saddw v24.4s, v4.4s, v18.4h\n"
+ "saddw2 v18.4s, v4.4s, v18.8h\n"
+ "srshl v0.4s, v0.4s, v3.4s\n"
"srshl v23.4s, v23.4s, v3.4s\n"
- "srshl v18.4s, v18.4s, v3.4s\n"
- "srshl v17.4s, v17.4s, v3.4s\n"
- "srshl v16.4s, v16.4s, v3.4s\n"
- "sqrdmulh v23.4s, v23.4s, v4.4s\n"
- "sqrdmulh v18.4s, v18.4s, v4.4s\n"
- "sqrdmulh v17.4s, v17.4s, v4.4s\n"
- "sqrdmulh v16.4s, v16.4s, v4.4s\n"
- "srshl v23.4s, v23.4s, v2.4s\n"
- "srshl v18.4s, v18.4s, v2.4s\n"
- "srshl v17.4s, v17.4s, v2.4s\n"
- "srshl v16.4s, v16.4s, v2.4s\n"
+ "srshl v31.4s, v31.4s, v3.4s\n"
+ "srshl v30.4s, v30.4s, v3.4s\n"
+ "srshl v29.4s, v29.4s, v3.4s\n"
"srshl v22.4s, v22.4s, v3.4s\n"
- "srshl v21.4s, v21.4s, v3.4s\n"
"srshl v28.4s, v28.4s, v3.4s\n"
- "srshl v20.4s, v20.4s, v3.4s\n"
- "sqrdmulh v22.4s, v22.4s, v4.4s\n"
- "sqrdmulh v21.4s, v21.4s, v4.4s\n"
- "sqrdmulh v28.4s, v28.4s, v4.4s\n"
- "sqrdmulh v20.4s, v20.4s, v4.4s\n"
- "srshl v22.4s, v22.4s, v2.4s\n"
- "srshl v21.4s, v21.4s, v2.4s\n"
- "srshl v28.4s, v28.4s, v2.4s\n"
- "srshl v20.4s, v20.4s, v2.4s\n"
+ "srshl v21.4s, v21.4s, v3.4s\n"
"srshl v27.4s, v27.4s, v3.4s\n"
- "srshl v19.4s, v19.4s, v3.4s\n"
- "add v23.4s, v23.4s, v1.4s\n"
- "add v18.4s, v18.4s, v1.4s\n"
- "sqrdmulh v27.4s, v27.4s, v4.4s\n"
- "sqrdmulh v19.4s, v19.4s, v4.4s\n"
- "add v17.4s, v17.4s, v1.4s\n"
- "add v16.4s, v16.4s, v1.4s\n"
- "srshl v27.4s, v27.4s, v2.4s\n"
- "srshl v19.4s, v19.4s, v2.4s\n"
- "add v22.4s, v22.4s, v1.4s\n"
- "add v21.4s, v21.4s, v1.4s\n"
- "add v28.4s, v28.4s, v1.4s\n"
- "add v20.4s, v20.4s, v1.4s\n"
- "add v27.4s, v27.4s, v1.4s\n"
- "add v19.4s, v19.4s, v1.4s\n"
- "smax v23.4s, v23.4s, v30.4s\n"
- "smax v18.4s, v18.4s, v30.4s\n"
- "smax v17.4s, v17.4s, v30.4s\n"
- "smin v23.4s, v23.4s, v29.4s\n"
- "smin v18.4s, v18.4s, v29.4s\n"
- "smin v17.4s, v17.4s, v29.4s\n"
- "smax v16.4s, v16.4s, v30.4s\n"
- "smax v22.4s, v22.4s, v30.4s\n"
- "smax v21.4s, v21.4s, v30.4s\n"
- "smin v16.4s, v16.4s, v29.4s\n"
- "smin v22.4s, v22.4s, v29.4s\n"
- "smin v21.4s, v21.4s, v29.4s\n"
- "smax v28.4s, v28.4s, v30.4s\n"
- "smax v20.4s, v20.4s, v30.4s\n"
- "smax v27.4s, v27.4s, v30.4s\n"
- "smin v28.4s, v28.4s, v29.4s\n"
- "smin v20.4s, v20.4s, v29.4s\n"
- "smin v27.4s, v27.4s, v29.4s\n"
- "smax v19.4s, v19.4s, v30.4s\n"
- "uzp1 v26.16b, v23.16b, v18.16b\n"
- "saddw v25.4s, v5.4s, v24.4h\n"
- "saddw2 v18.4s, v5.4s, v24.8h\n"
- "smin v19.4s, v19.4s, v29.4s\n"
+ "srshl v20.4s, v20.4s, v3.4s\n"
+ "srshl v26.4s, v26.4s, v3.4s\n"
+ "srshl v17.4s, v17.4s, v3.4s\n"
"srshl v25.4s, v25.4s, v3.4s\n"
+ "srshl v19.4s, v19.4s, v3.4s\n"
+ "srshl v24.4s, v24.4s, v3.4s\n"
"srshl v18.4s, v18.4s, v3.4s\n"
- "uzp1 v24.16b, v17.16b, v16.16b\n"
- "saddw v17.4s, v5.4s, v0.4h\n"
- "saddw2 v16.4s, v5.4s, v0.8h\n"
- "sqrdmulh v25.4s, v25.4s, v4.4s\n"
- "sqrdmulh v18.4s, v18.4s, v4.4s\n"
- "srshl v17.4s, v17.4s, v3.4s\n"
- "srshl v16.4s, v16.4s, v3.4s\n"
- "srshl v25.4s, v25.4s, v2.4s\n"
- "srshl v18.4s, v18.4s, v2.4s\n"
- "sqrdmulh v17.4s, v17.4s, v4.4s\n"
- "sqrdmulh v16.4s, v16.4s, v4.4s\n"
- "add v25.4s, v25.4s, v1.4s\n"
- "add v18.4s, v18.4s, v1.4s\n"
- "srshl v17.4s, v17.4s, v2.4s\n"
- "srshl v16.4s, v16.4s, v2.4s\n"
- "smax v25.4s, v25.4s, v30.4s\n"
- "smax v18.4s, v18.4s, v30.4s\n"
- "add v17.4s, v17.4s, v1.4s\n"
- "add v16.4s, v16.4s, v1.4s\n"
- "smin v25.4s, v25.4s, v29.4s\n"
- "smin v18.4s, v18.4s, v29.4s\n"
- "smax v17.4s, v17.4s, v30.4s\n"
- "smax v16.4s, v16.4s, v30.4s\n"
- "uzp1 v23.16b, v22.16b, v21.16b\n"
- "saddw v22.4s, v5.4s, v31.4h\n"
- "saddw2 v21.4s, v5.4s, v31.8h\n"
- "smin v17.4s, v17.4s, v29.4s\n"
- "srshl v22.4s, v22.4s, v3.4s\n"
- "srshl v21.4s, v21.4s, v3.4s\n"
- "smin v16.4s, v16.4s, v29.4s\n"
- "uzp1 v20.16b, v28.16b, v20.16b\n"
- "sqrdmulh v22.4s, v22.4s, v4.4s\n"
- "sqrdmulh v21.4s, v21.4s, v4.4s\n"
- "uzp1 v19.16b, v27.16b, v19.16b\n"
- "uzp1 v18.16b, v25.16b, v18.16b\n"
- "srshl v22.4s, v22.4s, v2.4s\n"
- "srshl v21.4s, v21.4s, v2.4s\n"
- "uzp1 v17.16b, v17.16b, v16.16b\n"
- "uzp1 v16.16b, v26.16b, v24.16b\n"
- "str q16, [%x[outptr], x28]\n"
- "add v22.4s, v22.4s, v1.4s\n"
- "add x28, x28, #0x40\n"
- "add v21.4s, v21.4s, v1.4s\n"
- "uzp1 v16.16b, v23.16b, v20.16b\n"
+ "sqrdmulh v0.4s, v0.4s, v2.4s\n"
+ "sqrdmulh v23.4s, v23.4s, v2.4s\n"
+ "sqrdmulh v31.4s, v31.4s, v2.4s\n"
+ "sqrdmulh v30.4s, v30.4s, v2.4s\n"
+ "sqrdmulh v29.4s, v29.4s, v2.4s\n"
+ "sqrdmulh v22.4s, v22.4s, v2.4s\n"
+ "sqrdmulh v28.4s, v28.4s, v2.4s\n"
+ "sqrdmulh v21.4s, v21.4s, v2.4s\n"
+ "sqrdmulh v27.4s, v27.4s, v2.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v2.4s\n"
+ "sqrdmulh v26.4s, v26.4s, v2.4s\n"
+ "sqrdmulh v17.4s, v17.4s, v2.4s\n"
+ "sqrdmulh v25.4s, v25.4s, v2.4s\n"
+ "sqrdmulh v19.4s, v19.4s, v2.4s\n"
+ "sqrdmulh v24.4s, v24.4s, v2.4s\n"
+ "sqrdmulh v18.4s, v18.4s, v2.4s\n"
+ "srshl v0.4s, v0.4s, v1.4s\n"
+ "srshl v23.4s, v23.4s, v1.4s\n"
+ "srshl v31.4s, v31.4s, v1.4s\n"
+ "srshl v30.4s, v30.4s, v1.4s\n"
+ "srshl v29.4s, v29.4s, v1.4s\n"
+ "srshl v22.4s, v22.4s, v1.4s\n"
+ "srshl v28.4s, v28.4s, v1.4s\n"
+ "srshl v21.4s, v21.4s, v1.4s\n"
+ "srshl v27.4s, v27.4s, v1.4s\n"
+ "srshl v20.4s, v20.4s, v1.4s\n"
+ "srshl v26.4s, v26.4s, v1.4s\n"
+ "srshl v17.4s, v17.4s, v1.4s\n"
+ "srshl v25.4s, v25.4s, v1.4s\n"
+ "srshl v19.4s, v19.4s, v1.4s\n"
+ "srshl v24.4s, v24.4s, v1.4s\n"
+ "srshl v18.4s, v18.4s, v1.4s\n"
+ "add v0.4s, v0.4s, v16.4s\n"
+ "add v23.4s, v23.4s, v16.4s\n"
+ "add v31.4s, v31.4s, v16.4s\n"
+ "add v30.4s, v30.4s, v16.4s\n"
+ "add v29.4s, v29.4s, v16.4s\n"
+ "add v22.4s, v22.4s, v16.4s\n"
+ "add v28.4s, v28.4s, v16.4s\n"
+ "add v21.4s, v21.4s, v16.4s\n"
+ "add v27.4s, v27.4s, v16.4s\n"
+ "add v20.4s, v20.4s, v16.4s\n"
+ "add v26.4s, v26.4s, v16.4s\n"
+ "add v17.4s, v17.4s, v16.4s\n"
+ "add v25.4s, v25.4s, v16.4s\n"
+ "add v19.4s, v19.4s, v16.4s\n"
+ "add v24.4s, v24.4s, v16.4s\n"
+ "add v18.4s, v18.4s, v16.4s\n"
+ "movi v16.4s, #0x0\n"
+ "smax v0.4s, v0.4s, v16.4s\n"
+ "smax v23.4s, v23.4s, v16.4s\n"
+ "smax v31.4s, v31.4s, v16.4s\n"
+ "smax v30.4s, v30.4s, v16.4s\n"
+ "smax v29.4s, v29.4s, v16.4s\n"
+ "smax v22.4s, v22.4s, v16.4s\n"
+ "smax v28.4s, v28.4s, v16.4s\n"
+ "smax v21.4s, v21.4s, v16.4s\n"
+ "smax v27.4s, v27.4s, v16.4s\n"
+ "smax v20.4s, v20.4s, v16.4s\n"
+ "smax v26.4s, v26.4s, v16.4s\n"
+ "smax v17.4s, v17.4s, v16.4s\n"
+ "smax v25.4s, v25.4s, v16.4s\n"
+ "smax v19.4s, v19.4s, v16.4s\n"
+ "smax v24.4s, v24.4s, v16.4s\n"
+ "smax v18.4s, v18.4s, v16.4s\n"
+ "movi v16.4s, #0xff\n"
+ "smin v0.4s, v0.4s, v16.4s\n"
+ "smin v23.4s, v23.4s, v16.4s\n"
+ "smin v31.4s, v31.4s, v16.4s\n"
+ "smin v30.4s, v30.4s, v16.4s\n"
+ "smin v29.4s, v29.4s, v16.4s\n"
+ "smin v22.4s, v22.4s, v16.4s\n"
+ "smin v28.4s, v28.4s, v16.4s\n"
+ "smin v21.4s, v21.4s, v16.4s\n"
+ "smin v27.4s, v27.4s, v16.4s\n"
+ "smin v20.4s, v20.4s, v16.4s\n"
+ "smin v26.4s, v26.4s, v16.4s\n"
+ "smin v17.4s, v17.4s, v16.4s\n"
+ "smin v25.4s, v25.4s, v16.4s\n"
+ "smin v19.4s, v19.4s, v16.4s\n"
+ "smin v24.4s, v24.4s, v16.4s\n"
+ "smin v18.4s, v18.4s, v16.4s\n"
+ "uzp1 v23.16b, v0.16b, v23.16b\n"
+ "uzp1 v16.16b, v31.16b, v30.16b\n"
+ "uzp1 v22.16b, v29.16b, v22.16b\n"
+ "uzp1 v21.16b, v28.16b, v21.16b\n"
+ "uzp1 v20.16b, v27.16b, v20.16b\n"
+ "uzp1 v17.16b, v26.16b, v17.16b\n"
+ "uzp1 v19.16b, v25.16b, v19.16b\n"
+ "uzp1 v18.16b, v24.16b, v18.16b\n"
+ "uzp1 v16.16b, v23.16b, v16.16b\n"
"str q16, [%x[outptr], x27]\n"
- "smax v22.4s, v22.4s, v30.4s\n"
"add x27, x27, #0x40\n"
- "smax v21.4s, v21.4s, v30.4s\n"
- "uzp1 v16.16b, v19.16b, v18.16b\n"
+ "uzp1 v16.16b, v22.16b, v21.16b\n"
+ "uzp1 v17.16b, v20.16b, v17.16b\n"
"str q16, [%x[outptr], x26]\n"
- "smin v22.4s, v22.4s, v29.4s\n"
"add x26, x26, #0x40\n"
- "smin v21.4s, v21.4s, v29.4s\n"
- "uzp1 v16.16b, v22.16b, v21.16b\n"
- "uzp1 v16.16b, v17.16b, v16.16b\n"
- "str q16, [%x[outptr], x25]\n"
- "add x25, x25, #0x40\n"
+ "uzp1 v16.16b, v19.16b, v18.16b\n"
+ "str q17, [%x[outptr], x24]\n"
+ "add x24, x24, #0x40\n"
+ "str q16, [%x[outptr], x23]\n"
+ "add x23, x23, #0x40\n"
"bge 1b\n"
"cbz %x[n_channels], 43f\n"
"7:" // Single vector of channels
"cmp %x[n_channels], #0x10\n"
"blt 14f\n"
"8:" // Single vector of channels: Loop
- "movi v4.16b, #0x0\n"
- "mov x19, %x[inptrs]\n"
- "lsr x24, %x[n_valid_cells], #0x2\n"
- "cbz x24, 11f\n"
- "ldp x23, x22, [x19, #0x0]\n"
- "ldp x21, x20, [x19, #0x10]\n"
- "add x19, x19, #0x20\n"
- "subs x24, x24, #0x1\n"
- "ldr q3, [x23, x28]\n"
- "ldr q2, [x22, x28]\n"
- "ldr q1, [x21, x28]\n"
- "ldr q0, [x20, x28]\n"
+ "lsr x25, %x[n_valid_cells], #0x2\n"
+ "movi v8.16b, #0x0\n"
+ "mov x22, %x[inptrs]\n"
+ "cbz x25, 11f\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q4, [x21, x27]\n"
+ "subs x25, x25, #0x1\n"
+ "ldr q3, [x20, x27]\n"
+ "ldp x21, x20, [x22, #0x10]\n"
+ "add x22, x22, #0x20\n"
+ "ldr q28, [x21, x27]\n"
+ "ldr q22, [x20, x27]\n"
"beq 10f\n"
"9:" // Single vector of channels: Loop: 4 inputs loop
- "umax v23.16b, v3.16b, v2.16b\n"
- "ldp x23, x22, [x19, #0x0]\n"
- "subs x24, x24, #0x1\n"
- "umax v19.16b, v1.16b, v0.16b\n"
- "ldp x21, x20, [x19, #0x10]\n"
- "add x19, x19, #0x20\n"
- "umax v19.16b, v23.16b, v19.16b\n"
- "ldr q3, [x23, x28]\n"
- "ldr q2, [x22, x28]\n"
- "umax v4.16b, v4.16b, v19.16b\n"
- "ldr q1, [x21, x28]\n"
- "ldr q0, [x20, x28]\n"
+ "umax v17.16b, v4.16b, v3.16b\n"
+ "umax v16.16b, v28.16b, v22.16b\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q4, [x21, x27]\n"
+ "ldr q3, [x20, x27]\n"
+ "umax v16.16b, v17.16b, v16.16b\n"
+ "ldp x21, x20, [x22, #0x10]\n"
+ "subs x25, x25, #0x1\n"
+ "ldr q28, [x21, x27]\n"
+ "ldr q22, [x20, x27]\n"
+ "umax v8.16b, v8.16b, v16.16b\n"
+ "add x22, x22, #0x20\n"
"bgt 9b\n"
"10:" // Single vector of channels: Loop: 4 inputs tail
- "umax v23.16b, v3.16b, v2.16b\n"
- "umax v19.16b, v1.16b, v0.16b\n"
- "umax v19.16b, v23.16b, v19.16b\n"
- "umax v4.16b, v4.16b, v19.16b\n"
+ "umax v17.16b, v4.16b, v3.16b\n"
+ "umax v16.16b, v28.16b, v22.16b\n"
+ "umax v16.16b, v17.16b, v16.16b\n"
+ "umax v8.16b, v8.16b, v16.16b\n"
"11:" // Single vector of channels: Loop: After loop
- "ands x20, %x[n_valid_cells], #0x3\n"
+ "ands x21, %x[n_valid_cells], #0x3\n"
"beq 13f\n"
"12:" // Single vector of channels: Loop: Single input loop
- "ldr x23, [x19], #0x8\n"
- "subs x20, x20, #0x1\n"
- "ldr q3, [x23, x28]\n"
- "umax v4.16b, v4.16b, v3.16b\n"
+ "ldr x20, [x22], #0x8\n"
+ "ldr q16, [x20, x27]\n"
+ "subs x21, x21, #0x1\n"
+ "umax v8.16b, v8.16b, v16.16b\n"
"bgt 12b\n"
"13:" // Single vector of channels: Loop: Single input loop: End
- "uxtl v17.8h, v4.8b\n"
- "add x19, %x[quant_params], %[offsetof_qp_input_offset]\n"
- "ld1r { v5.4s }, [x19]\n"
- "uxtl2 v16.8h, v4.16b\n"
- "add x19, %x[quant_params], %[offsetof_qp_per_layer_mul]\n"
- "movi v30.4s, #0x0\n"
- "ld1r { v4.4s }, [x19]\n"
- "add x19, %x[quant_params], %[offsetof_qp_per_layer_left_shift]\n"
- "movi v29.4s, #0xff\n"
- "ld1r { v3.4s }, [x19]\n"
- "add x19, %x[quant_params], %[offsetof_qp_per_layer_right_shift]\n"
- "neg v5.4s, v5.4s\n"
- "ld1r { v2.4s }, [x19]\n"
- "add x19, %x[quant_params], %[offsetof_qp_output_offset]\n"
- "saddw v23.4s, v5.4s, v17.4h\n"
- "ld1r { v1.4s }, [x19]\n"
+ "add x20, %x[quant_params], %[offsetof_qp_input_offset]\n"
+ "ld1r { v18.4s }, [x20]\n"
+ "uxtl v17.8h, v8.8b\n"
+ "uxtl2 v16.8h, v8.16b\n"
+ "neg v18.4s, v18.4s\n"
+ "add x20, %x[quant_params], %[offsetof_qp_per_layer_left_shift]\n"
+ "ld1r { v23.4s }, [x20]\n"
+ "saddw v22.4s, v18.4s, v17.4h\n"
+ "saddw2 v21.4s, v18.4s, v17.8h\n"
+ "saddw v20.4s, v18.4s, v16.4h\n"
+ "add x20, %x[quant_params], %[offsetof_qp_per_layer_mul]\n"
+ "ld1r { v19.4s }, [x20]\n"
+ "saddw2 v18.4s, v18.4s, v16.8h\n"
+ "srshl v22.4s, v22.4s, v23.4s\n"
+ "add x20, %x[quant_params], %[offsetof_qp_per_layer_right_shift]\n"
+ "ld1r { v17.4s }, [x20]\n"
+ "srshl v21.4s, v21.4s, v23.4s\n"
+ "srshl v20.4s, v20.4s, v23.4s\n"
+ "add x20, %x[quant_params], %[offsetof_qp_output_offset]\n"
+ "ld1r { v16.4s }, [x20]\n"
+ "srshl v18.4s, v18.4s, v23.4s\n"
+ "sqrdmulh v22.4s, v22.4s, v19.4s\n"
"sub %x[n_channels], %x[n_channels], #0x10\n"
- "saddw2 v18.4s, v5.4s, v17.8h\n"
"cmp %x[n_channels], #0x10\n"
- "saddw v17.4s, v5.4s, v16.4h\n"
- "saddw2 v16.4s, v5.4s, v16.8h\n"
- "srshl v23.4s, v23.4s, v3.4s\n"
- "srshl v18.4s, v18.4s, v3.4s\n"
- "srshl v17.4s, v17.4s, v3.4s\n"
- "srshl v16.4s, v16.4s, v3.4s\n"
- "sqrdmulh v23.4s, v23.4s, v4.4s\n"
- "sqrdmulh v18.4s, v18.4s, v4.4s\n"
- "sqrdmulh v17.4s, v17.4s, v4.4s\n"
- "sqrdmulh v16.4s, v16.4s, v4.4s\n"
- "srshl v23.4s, v23.4s, v2.4s\n"
- "srshl v18.4s, v18.4s, v2.4s\n"
- "srshl v17.4s, v17.4s, v2.4s\n"
- "srshl v16.4s, v16.4s, v2.4s\n"
- "add v23.4s, v23.4s, v1.4s\n"
- "add v18.4s, v18.4s, v1.4s\n"
- "add v17.4s, v17.4s, v1.4s\n"
- "add v16.4s, v16.4s, v1.4s\n"
- "smax v23.4s, v23.4s, v30.4s\n"
- "smax v18.4s, v18.4s, v30.4s\n"
- "smax v17.4s, v17.4s, v30.4s\n"
- "smin v23.4s, v23.4s, v29.4s\n"
- "smin v18.4s, v18.4s, v29.4s\n"
- "smin v17.4s, v17.4s, v29.4s\n"
- "smax v16.4s, v16.4s, v30.4s\n"
- "uzp1 v26.16b, v23.16b, v18.16b\n"
- "smin v16.4s, v16.4s, v29.4s\n"
- "uzp1 v24.16b, v17.16b, v16.16b\n"
- "uzp1 v16.16b, v26.16b, v24.16b\n"
- "str q16, [%x[outptr], x28]\n"
- "add x28, x28, #0x10\n"
+ "sqrdmulh v21.4s, v21.4s, v19.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v19.4s\n"
+ "sqrdmulh v18.4s, v18.4s, v19.4s\n"
+ "srshl v22.4s, v22.4s, v17.4s\n"
+ "srshl v21.4s, v21.4s, v17.4s\n"
+ "srshl v20.4s, v20.4s, v17.4s\n"
+ "srshl v18.4s, v18.4s, v17.4s\n"
+ "add v22.4s, v22.4s, v16.4s\n"
+ "add v21.4s, v21.4s, v16.4s\n"
+ "add v20.4s, v20.4s, v16.4s\n"
+ "add v18.4s, v18.4s, v16.4s\n"
+ "movi v16.4s, #0x0\n"
+ "smax v22.4s, v22.4s, v16.4s\n"
+ "smax v21.4s, v21.4s, v16.4s\n"
+ "smax v20.4s, v20.4s, v16.4s\n"
+ "smax v18.4s, v18.4s, v16.4s\n"
+ "movi v16.4s, #0xff\n"
+ "smin v22.4s, v22.4s, v16.4s\n"
+ "smin v21.4s, v21.4s, v16.4s\n"
+ "smin v20.4s, v20.4s, v16.4s\n"
+ "smin v18.4s, v18.4s, v16.4s\n"
+ "uzp1 v17.16b, v22.16b, v21.16b\n"
+ "uzp1 v16.16b, v20.16b, v18.16b\n"
+ "uzp1 v16.16b, v17.16b, v16.16b\n"
+ "str q16, [%x[outptr], x27]\n"
+ "add x27, x27, #0x10\n"
"bge 8b\n"
"cbz %x[n_channels], 43f\n"
"14:" // Oddments
- "movi v4.16b, #0x0\n"
- "add %x[outptr], %x[outptr], x28\n"
- "mov x19, %x[inptrs]\n"
- "lsr x24, %x[n_valid_cells], #0x2\n"
- "cbz x24, 24f\n"
+ "lsr x25, %x[n_valid_cells], #0x2\n"
+ "add %x[outptr], %x[outptr], x27\n"
+ "movi v8.16b, #0x0\n"
+ "mov x24, %x[inptrs]\n"
+ "cbz x25, 24f\n"
"15:" // Oddments: 4 inputs loop
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x24, x24, #0x20\n"
+ "add x23, x23, x27\n"
+ "add x22, x22, x27\n"
+ "add x21, x21, x27\n"
+ "movi v4.16b, #0x0\n"
"movi v3.16b, #0x0\n"
- "ldp x23, x22, [x19, #0x0]\n"
- "add x23, x23, x28\n"
- "movi v2.16b, #0x0\n"
- "ldp x21, x20, [x19, #0x10]\n"
- "movi v1.16b, #0x0\n"
- "add x19, x19, #0x20\n"
- "movi v0.16b, #0x0\n"
- "add x22, x22, x28\n"
- "add x21, x21, x28\n"
- "add x20, x20, x28\n"
+ "add x20, x20, x27\n"
+ "movi v28.16b, #0x0\n"
+ "movi v22.16b, #0x0\n"
"tbz %x[n_channels], #3, 19f\n"
- "ldr d3, [x23], #0x8\n"
- "ldr d2, [x22], #0x8\n"
- "ldr d1, [x21], #0x8\n"
- "ldr d0, [x20], #0x8\n"
+ "ldr d4, [x23], #0x8\n"
+ "ldr d3, [x22], #0x8\n"
+ "ldr d28, [x21], #0x8\n"
+ "ldr d22, [x20], #0x8\n"
"tbz %x[n_channels], #2, 17f\n"
- "ld1 { v3.s }[2], [x23], #0x4\n"
- "ld1 { v2.s }[2], [x22], #0x4\n"
- "ld1 { v1.s }[2], [x21], #0x4\n"
- "ld1 { v0.s }[2], [x20], #0x4\n"
+ "ld1 { v4.s }[2], [x23], #0x4\n"
+ "ld1 { v3.s }[2], [x22], #0x4\n"
+ "ld1 { v28.s }[2], [x21], #0x4\n"
+ "ld1 { v22.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #1, 16f\n"
- "ld1 { v3.h }[6], [x23], #0x2\n"
- "ld1 { v2.h }[6], [x22], #0x2\n"
- "ld1 { v1.h }[6], [x21], #0x2\n"
- "ld1 { v0.h }[6], [x20], #0x2\n"
+ "ld1 { v4.h }[6], [x23], #0x2\n"
+ "ld1 { v3.h }[6], [x22], #0x2\n"
+ "ld1 { v28.h }[6], [x21], #0x2\n"
+ "ld1 { v22.h }[6], [x20], #0x2\n"
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v3.b }[14], [x23], #0x1\n"
- "ld1 { v2.b }[14], [x22], #0x1\n"
- "ld1 { v1.b }[14], [x21], #0x1\n"
- "ld1 { v0.b }[14], [x20], #0x1\n"
+ "ld1 { v4.b }[14], [x23], #0x1\n"
+ "ld1 { v3.b }[14], [x22], #0x1\n"
+ "ld1 { v28.b }[14], [x21], #0x1\n"
+ "ld1 { v22.b }[14], [x20], #0x1\n"
"b 23f\n"
"16:" // Oddments: 4 inputs loop: Load: Bit 3: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v3.b }[12], [x23], #0x1\n"
- "ld1 { v2.b }[12], [x22], #0x1\n"
- "ld1 { v1.b }[12], [x21], #0x1\n"
- "ld1 { v0.b }[12], [x20], #0x1\n"
+ "ld1 { v4.b }[12], [x23], #0x1\n"
+ "ld1 { v3.b }[12], [x22], #0x1\n"
+ "ld1 { v28.b }[12], [x21], #0x1\n"
+ "ld1 { v22.b }[12], [x20], #0x1\n"
"b 23f\n"
"17:" // Oddments: 4 inputs loop: Load: Bit 3: Bit 2: Unset
"tbz %x[n_channels], #1, 18f\n"
- "ld1 { v3.h }[4], [x23], #0x2\n"
- "ld1 { v2.h }[4], [x22], #0x2\n"
- "ld1 { v1.h }[4], [x21], #0x2\n"
- "ld1 { v0.h }[4], [x20], #0x2\n"
+ "ld1 { v4.h }[4], [x23], #0x2\n"
+ "ld1 { v3.h }[4], [x22], #0x2\n"
+ "ld1 { v28.h }[4], [x21], #0x2\n"
+ "ld1 { v22.h }[4], [x20], #0x2\n"
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v3.b }[10], [x23], #0x1\n"
- "ld1 { v2.b }[10], [x22], #0x1\n"
- "ld1 { v1.b }[10], [x21], #0x1\n"
- "ld1 { v0.b }[10], [x20], #0x1\n"
+ "ld1 { v4.b }[10], [x23], #0x1\n"
+ "ld1 { v3.b }[10], [x22], #0x1\n"
+ "ld1 { v28.b }[10], [x21], #0x1\n"
+ "ld1 { v22.b }[10], [x20], #0x1\n"
"b 23f\n"
"18:" // Oddments: 4 inputs loop: Load: Bit 3: Bit 2: Unset: Bit 1: Unset
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v3.b }[8], [x23], #0x1\n"
- "ld1 { v2.b }[8], [x22], #0x1\n"
- "ld1 { v1.b }[8], [x21], #0x1\n"
- "ld1 { v0.b }[8], [x20], #0x1\n"
+ "ld1 { v4.b }[8], [x23], #0x1\n"
+ "ld1 { v3.b }[8], [x22], #0x1\n"
+ "ld1 { v28.b }[8], [x21], #0x1\n"
+ "ld1 { v22.b }[8], [x20], #0x1\n"
"b 23f\n"
"19:" // Oddments: 4 inputs loop: Load: Bit 3: Unset
"tbz %x[n_channels], #2, 21f\n"
- "ldr s3, [x23], #0x4\n"
- "ldr s2, [x22], #0x4\n"
- "ldr s1, [x21], #0x4\n"
- "ldr s0, [x20], #0x4\n"
+ "ldr s4, [x23], #0x4\n"
+ "ldr s3, [x22], #0x4\n"
+ "ldr s28, [x21], #0x4\n"
+ "ldr s22, [x20], #0x4\n"
"tbz %x[n_channels], #1, 20f\n"
- "ld1 { v3.h }[2], [x23], #0x2\n"
- "ld1 { v2.h }[2], [x22], #0x2\n"
- "ld1 { v1.h }[2], [x21], #0x2\n"
- "ld1 { v0.h }[2], [x20], #0x2\n"
+ "ld1 { v4.h }[2], [x23], #0x2\n"
+ "ld1 { v3.h }[2], [x22], #0x2\n"
+ "ld1 { v28.h }[2], [x21], #0x2\n"
+ "ld1 { v22.h }[2], [x20], #0x2\n"
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v3.b }[6], [x23], #0x1\n"
- "ld1 { v2.b }[6], [x22], #0x1\n"
- "ld1 { v1.b }[6], [x21], #0x1\n"
- "ld1 { v0.b }[6], [x20], #0x1\n"
+ "ld1 { v4.b }[6], [x23], #0x1\n"
+ "ld1 { v3.b }[6], [x22], #0x1\n"
+ "ld1 { v28.b }[6], [x21], #0x1\n"
+ "ld1 { v22.b }[6], [x20], #0x1\n"
"b 23f\n"
"20:" // Oddments: 4 inputs loop: Load: Bit 3: Unset: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v3.b }[4], [x23], #0x1\n"
- "ld1 { v2.b }[4], [x22], #0x1\n"
- "ld1 { v1.b }[4], [x21], #0x1\n"
- "ld1 { v0.b }[4], [x20], #0x1\n"
+ "ld1 { v4.b }[4], [x23], #0x1\n"
+ "ld1 { v3.b }[4], [x22], #0x1\n"
+ "ld1 { v28.b }[4], [x21], #0x1\n"
+ "ld1 { v22.b }[4], [x20], #0x1\n"
"b 23f\n"
"21:" // Oddments: 4 inputs loop: Load: Bit 3: Unset: Bit 2: Unset
"tbz %x[n_channels], #1, 22f\n"
- "ldr h3, [x23], #0x2\n"
- "ldr h2, [x22], #0x2\n"
- "ldr h1, [x21], #0x2\n"
- "ldr h0, [x20], #0x2\n"
+ "ldr h4, [x23], #0x2\n"
+ "ldr h3, [x22], #0x2\n"
+ "ldr h28, [x21], #0x2\n"
+ "ldr h22, [x20], #0x2\n"
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v3.b }[2], [x23], #0x1\n"
- "ld1 { v2.b }[2], [x22], #0x1\n"
- "ld1 { v1.b }[2], [x21], #0x1\n"
- "ld1 { v0.b }[2], [x20], #0x1\n"
+ "ld1 { v4.b }[2], [x23], #0x1\n"
+ "ld1 { v3.b }[2], [x22], #0x1\n"
+ "ld1 { v28.b }[2], [x21], #0x1\n"
+ "ld1 { v22.b }[2], [x20], #0x1\n"
"b 23f\n"
"22:" // Oddments: 4 inputs loop: Load: Bit 3: Unset: Bit 2: Unset: Bit 1: Unset
"tbz %x[n_channels], #0, 23f\n"
- "ldr b3, [x23], #0x1\n"
- "ldr b2, [x22], #0x1\n"
- "ldr b1, [x21], #0x1\n"
- "ldr b0, [x20], #0x1\n"
+ "ldr b4, [x23], #0x1\n"
+ "ldr b3, [x22], #0x1\n"
+ "ldr b28, [x21], #0x1\n"
+ "ldr b22, [x20], #0x1\n"
"23:" // Oddments: 4 inputs loop: Load: Bit 3: End
- "umax v23.16b, v3.16b, v2.16b\n"
- "subs x24, x24, #0x1\n"
- "umax v19.16b, v1.16b, v0.16b\n"
- "umax v19.16b, v23.16b, v19.16b\n"
- "umax v4.16b, v4.16b, v19.16b\n"
+ "umax v17.16b, v4.16b, v3.16b\n"
+ "umax v16.16b, v28.16b, v22.16b\n"
+ "subs x25, x25, #0x1\n"
+ "umax v16.16b, v17.16b, v16.16b\n"
+ "umax v8.16b, v8.16b, v16.16b\n"
"bgt 15b\n"
"24:" // Oddments: After loop
- "ands x20, %x[n_valid_cells], #0x3\n"
+ "ands x21, %x[n_valid_cells], #0x3\n"
"beq 34f\n"
"25:" // Oddments: Single input loop
- "movi v3.16b, #0x0\n"
- "ldr x23, [x19], #0x8\n"
- "add x23, x23, x28\n"
+ "ldr x23, [x24], #0x8\n"
+ "add x23, x23, x27\n"
+ "movi v4.16b, #0x0\n"
"tbz %x[n_channels], #3, 29f\n"
- "ldr d3, [x23], #0x8\n"
+ "ldr d4, [x23], #0x8\n"
"tbz %x[n_channels], #2, 27f\n"
- "ld1 { v3.s }[2], [x23], #0x4\n"
+ "ld1 { v4.s }[2], [x23], #0x4\n"
"tbz %x[n_channels], #1, 26f\n"
- "ld1 { v3.h }[6], [x23], #0x2\n"
+ "ld1 { v4.h }[6], [x23], #0x2\n"
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v3.b }[14], [x23], #0x1\n"
+ "ld1 { v4.b }[14], [x23], #0x1\n"
"b 33f\n"
"26:" // Oddments: Single input loop: Load: Bit 3: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v3.b }[12], [x23], #0x1\n"
+ "ld1 { v4.b }[12], [x23], #0x1\n"
"b 33f\n"
"27:" // Oddments: Single input loop: Load: Bit 3: Bit 2: Unset
"tbz %x[n_channels], #1, 28f\n"
- "ld1 { v3.h }[4], [x23], #0x2\n"
+ "ld1 { v4.h }[4], [x23], #0x2\n"
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v3.b }[10], [x23], #0x1\n"
+ "ld1 { v4.b }[10], [x23], #0x1\n"
"b 33f\n"
"28:" // Oddments: Single input loop: Load: Bit 3: Bit 2: Unset: Bit 1: Unset
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v3.b }[8], [x23], #0x1\n"
+ "ld1 { v4.b }[8], [x23], #0x1\n"
"b 33f\n"
"29:" // Oddments: Single input loop: Load: Bit 3: Unset
"tbz %x[n_channels], #2, 31f\n"
- "ldr s3, [x23], #0x4\n"
+ "ldr s4, [x23], #0x4\n"
"tbz %x[n_channels], #1, 30f\n"
- "ld1 { v3.h }[2], [x23], #0x2\n"
+ "ld1 { v4.h }[2], [x23], #0x2\n"
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v3.b }[6], [x23], #0x1\n"
+ "ld1 { v4.b }[6], [x23], #0x1\n"
"b 33f\n"
"30:" // Oddments: Single input loop: Load: Bit 3: Unset: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v3.b }[4], [x23], #0x1\n"
+ "ld1 { v4.b }[4], [x23], #0x1\n"
"b 33f\n"
"31:" // Oddments: Single input loop: Load: Bit 3: Unset: Bit 2: Unset
"tbz %x[n_channels], #1, 32f\n"
- "ldr h3, [x23], #0x2\n"
+ "ldr h4, [x23], #0x2\n"
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v3.b }[2], [x23], #0x1\n"
+ "ld1 { v4.b }[2], [x23], #0x1\n"
"b 33f\n"
"32:" // Oddments: Single input loop: Load: Bit 3: Unset: Bit 2: Unset: Bit 1: Unset
"tbz %x[n_channels], #0, 33f\n"
- "ldr b3, [x23], #0x1\n"
+ "ldr b4, [x23], #0x1\n"
"33:" // Oddments: Single input loop: Load: Bit 3: End
- "umax v4.16b, v4.16b, v3.16b\n"
- "subs x20, x20, #0x1\n"
+ "subs x21, x21, #0x1\n"
+ "umax v8.16b, v8.16b, v4.16b\n"
"bgt 25b\n"
"34:" // Oddments: Single input loop: End
- "uxtl v17.8h, v4.8b\n"
- "add x19, %x[quant_params], %[offsetof_qp_input_offset]\n"
- "ld1r { v5.4s }, [x19]\n"
- "uxtl2 v16.8h, v4.16b\n"
- "add x19, %x[quant_params], %[offsetof_qp_per_layer_mul]\n"
- "movi v30.4s, #0x0\n"
- "ld1r { v4.4s }, [x19]\n"
- "add x19, %x[quant_params], %[offsetof_qp_per_layer_left_shift]\n"
- "movi v29.4s, #0xff\n"
- "ld1r { v3.4s }, [x19]\n"
- "add x19, %x[quant_params], %[offsetof_qp_per_layer_right_shift]\n"
- "neg v5.4s, v5.4s\n"
- "ld1r { v2.4s }, [x19]\n"
- "add x19, %x[quant_params], %[offsetof_qp_output_offset]\n"
- "saddw v23.4s, v5.4s, v17.4h\n"
- "ld1r { v1.4s }, [x19]\n"
- "saddw2 v18.4s, v5.4s, v17.8h\n"
- "saddw v17.4s, v5.4s, v16.4h\n"
- "saddw2 v16.4s, v5.4s, v16.8h\n"
- "srshl v23.4s, v23.4s, v3.4s\n"
- "srshl v18.4s, v18.4s, v3.4s\n"
- "srshl v17.4s, v17.4s, v3.4s\n"
- "srshl v16.4s, v16.4s, v3.4s\n"
- "sqrdmulh v23.4s, v23.4s, v4.4s\n"
- "sqrdmulh v18.4s, v18.4s, v4.4s\n"
- "sqrdmulh v17.4s, v17.4s, v4.4s\n"
- "sqrdmulh v16.4s, v16.4s, v4.4s\n"
- "srshl v23.4s, v23.4s, v2.4s\n"
- "srshl v18.4s, v18.4s, v2.4s\n"
- "srshl v17.4s, v17.4s, v2.4s\n"
- "srshl v16.4s, v16.4s, v2.4s\n"
- "add v23.4s, v23.4s, v1.4s\n"
- "add v18.4s, v18.4s, v1.4s\n"
- "add v17.4s, v17.4s, v1.4s\n"
- "add v16.4s, v16.4s, v1.4s\n"
- "smax v23.4s, v23.4s, v30.4s\n"
- "smax v18.4s, v18.4s, v30.4s\n"
- "smax v17.4s, v17.4s, v30.4s\n"
- "smin v23.4s, v23.4s, v29.4s\n"
- "smin v18.4s, v18.4s, v29.4s\n"
- "smin v17.4s, v17.4s, v29.4s\n"
- "smax v16.4s, v16.4s, v30.4s\n"
- "uzp1 v26.16b, v23.16b, v18.16b\n"
- "smin v16.4s, v16.4s, v29.4s\n"
- "uzp1 v24.16b, v17.16b, v16.16b\n"
- "uzp1 v16.16b, v26.16b, v24.16b\n"
+ "add x20, %x[quant_params], %[offsetof_qp_input_offset]\n"
+ "ld1r { v18.4s }, [x20]\n"
+ "uxtl v17.8h, v8.8b\n"
+ "uxtl2 v16.8h, v8.16b\n"
+ "neg v18.4s, v18.4s\n"
+ "add x20, %x[quant_params], %[offsetof_qp_per_layer_left_shift]\n"
+ "ld1r { v23.4s }, [x20]\n"
+ "saddw v22.4s, v18.4s, v17.4h\n"
+ "saddw2 v21.4s, v18.4s, v17.8h\n"
+ "saddw v20.4s, v18.4s, v16.4h\n"
+ "add x20, %x[quant_params], %[offsetof_qp_per_layer_mul]\n"
+ "ld1r { v19.4s }, [x20]\n"
+ "saddw2 v18.4s, v18.4s, v16.8h\n"
+ "srshl v22.4s, v22.4s, v23.4s\n"
+ "add x20, %x[quant_params], %[offsetof_qp_per_layer_right_shift]\n"
+ "ld1r { v17.4s }, [x20]\n"
+ "srshl v21.4s, v21.4s, v23.4s\n"
+ "srshl v20.4s, v20.4s, v23.4s\n"
+ "add x20, %x[quant_params], %[offsetof_qp_output_offset]\n"
+ "ld1r { v16.4s }, [x20]\n"
+ "srshl v18.4s, v18.4s, v23.4s\n"
+ "sqrdmulh v22.4s, v22.4s, v19.4s\n"
+ "sqrdmulh v21.4s, v21.4s, v19.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v19.4s\n"
+ "sqrdmulh v18.4s, v18.4s, v19.4s\n"
+ "srshl v22.4s, v22.4s, v17.4s\n"
+ "srshl v21.4s, v21.4s, v17.4s\n"
+ "srshl v20.4s, v20.4s, v17.4s\n"
+ "srshl v18.4s, v18.4s, v17.4s\n"
+ "add v22.4s, v22.4s, v16.4s\n"
+ "add v21.4s, v21.4s, v16.4s\n"
+ "add v20.4s, v20.4s, v16.4s\n"
+ "add v18.4s, v18.4s, v16.4s\n"
+ "movi v16.4s, #0x0\n"
+ "smax v22.4s, v22.4s, v16.4s\n"
+ "smax v21.4s, v21.4s, v16.4s\n"
+ "smax v20.4s, v20.4s, v16.4s\n"
+ "smax v18.4s, v18.4s, v16.4s\n"
+ "movi v16.4s, #0xff\n"
+ "smin v22.4s, v22.4s, v16.4s\n"
+ "smin v21.4s, v21.4s, v16.4s\n"
+ "smin v20.4s, v20.4s, v16.4s\n"
+ "smin v18.4s, v18.4s, v16.4s\n"
+ "uzp1 v17.16b, v22.16b, v21.16b\n"
+ "uzp1 v16.16b, v20.16b, v18.16b\n"
+ "uzp1 v16.16b, v17.16b, v16.16b\n"
"tbz %x[n_channels], #3, 38f\n"
"st1 { v16.d }[0], [%x[outptr]], #0x8\n"
"tbz %x[n_channels], #2, 36f\n"
@@ -664,12 +664,10 @@ void a64_u8q_nhwc_max_generic_depthfirst_impl(
"tbz %x[n_channels], #0, 42f\n"
"st1 { v16.b }[0], [%x[outptr]], #0x1\n"
"42:" // Oddments: Store: Bit 3: End
-
"43:" // End
-
: [n_channels] "+&r" (n_channels), [outptr] "+&r" (outptr)
: [inptrs] "r" (inptrs), [n_valid_cells] "r" (n_valid_cells), [offsetof_qp_input_offset] "I" (offsetof(Requantize32, input_offset)), [offsetof_qp_output_offset] "I" (offsetof(Requantize32, output_offset)), [offsetof_qp_per_layer_left_shift] "I" (offsetof(Requantize32, per_layer_left_shift)), [offsetof_qp_per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [offsetof_qp_per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [quant_params] "r" (&qp)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/cpp_nhwc_1x1_stride_any_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/cpp_nhwc_1x1_stride_any_depthfirst.hpp
index 6dffdcf01c..225f1e42c9 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/cpp_nhwc_1x1_stride_any_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/cpp_nhwc_1x1_stride_any_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2020 Arm Limited.
+ * Copyright (c) 2020-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -33,18 +33,11 @@ template <typename T>
void cpp_nhwc_1x1_stride_any_depthfirst_impl(const uint64_t, const uint64_t, uint64_t n_channels, const T *const *const inptrs, T *outptr);
template <typename T>
-struct cpp_nhwc_1x1_stride_any_depthfirst
+struct cpp_nhwc_1x1_stride_any_depthfirst : IGenericDepthfirstStrategy<T, T, Nothing>
{
- typedef T operand_type;
- typedef T return_type;
-
- typedef void (*kern_type)(const uint64_t, const uint64_t, uint64_t n_channels, const operand_type *const *const inptrs, return_type *outptr);
-
- constexpr static PoolingType pooling_type(void) { return PoolingType::MAX; }
-
- kern_type kernel = cpp_nhwc_1x1_stride_any_depthfirst_impl;
-
+ using Parent = IGenericDepthfirstStrategy<T, T, Nothing>;
cpp_nhwc_1x1_stride_any_depthfirst(const CPUInfo *) {}
+ typename Parent::KernelType get_kernel(void) const override { return cpp_nhwc_1x1_stride_any_depthfirst_impl<T>; }
};
} // namespace pooling
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/cpp_nhwc_1x1_stride_any_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/cpp_nhwc_1x1_stride_any_depthfirst/generic.cpp
index 2bb22131f7..1f8f863de2 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/cpp_nhwc_1x1_stride_any_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/cpp_nhwc_1x1_stride_any_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2020 Arm Limited.
+ * Copyright (c) 2020, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,6 +24,10 @@
#include <cstdint>
#include <cstring>
+#ifdef ARM_COMPUTE_ENABLE_BF16
+#include "bfloat.hpp"
+using arm_gemm::bfloat16;
+#endif
namespace arm_conv {
namespace pooling {
@@ -41,9 +45,15 @@ void cpp_nhwc_1x1_stride_any_depthfirst_impl(
}
template void cpp_nhwc_1x1_stride_any_depthfirst_impl(uint64_t, uint64_t, uint64_t, const float *const *, float *);
-#if defined(__ARM_FP16_ARGS)
+
+#ifdef __ARM_FP16_ARGS
template void cpp_nhwc_1x1_stride_any_depthfirst_impl(uint64_t, uint64_t, uint64_t, const __fp16 *const *, __fp16 *);
-#endif // defined(__ARM_FP16_ARGS)
+#endif
+
+#ifdef ARM_COMPUTE_ENABLE_BF16
+template void cpp_nhwc_1x1_stride_any_depthfirst_impl(uint64_t, uint64_t, uint64_t, const bfloat16 *const *, bfloat16 *);
+#endif
+
template void cpp_nhwc_1x1_stride_any_depthfirst_impl(uint64_t, uint64_t, uint64_t, const int8_t *const *, int8_t *);
template void cpp_nhwc_1x1_stride_any_depthfirst_impl(uint64_t, uint64_t, uint64_t, const uint8_t *const *, uint8_t *);
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst.hpp
new file mode 100644
index 0000000000..f6682e75e2
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst.hpp
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#if defined(ARM_COMPUTE_ENABLE_SME) && defined(__ARM_FP16_ARGS)
+
+namespace arm_conv {
+namespace pooling {
+
+void sme_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst_impl(unsigned int, const __fp16 *const *const, __fp16 *const *const, bool, unsigned int, unsigned int, unsigned int, unsigned int);
+
+struct sme_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst : public DepthfirstStrategy<__fp16, __fp16>
+{
+ using Parent = DepthfirstStrategy<__fp16, __fp16>;
+
+ const static auto pooling_type = PoolingType::AVERAGE;
+ const static auto pool_rows = 3u, pool_cols = 3u;
+ const static auto stride_rows = 1u, stride_cols = 1u;
+
+ sme_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst(const CPUInfo *)
+ : Parent(pool_rows, pool_cols, stride_rows, stride_cols, 2, 2) {}
+
+ Parent::KernelType get_kernel(void) const { return sme_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst_impl; }
+};
+
+} // namespace pooling
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME) && defined(__ARM_FP16_ARGS)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst/generic.cpp
new file mode 100644
index 0000000000..67b07205cd
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst/generic.cpp
@@ -0,0 +1,209 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <algorithm>
+#include <cstddef>
+#include <cstdint>
+
+#if defined(ARM_COMPUTE_ENABLE_SME) && defined(__ARM_FP16_ARGS)
+
+namespace arm_conv {
+namespace pooling {
+
+void sme_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst_impl(
+ const unsigned int n_channels,
+ const __fp16 *const *const inptrs,
+ __fp16 *const *const outptrs,
+ const bool exclude_padding,
+ const unsigned int pad_left,
+ const unsigned int pad_top,
+ const unsigned int pad_right,
+ const unsigned int pad_bottom
+)
+{
+ struct KernelArgs
+ {
+ const uint64_t n_channels;
+ const __fp16 *const *const inptrs;
+ __fp16 *const *const outptrs;
+ __fp16 rescale_vals[4];
+
+ KernelArgs(
+ unsigned int channels,
+ const __fp16 *const *input_ptrs,
+ __fp16 *const * output_ptrs,
+ bool exclude_padding, unsigned int pad_left, unsigned int pad_top, unsigned int pad_right, unsigned int pad_bottom
+ ) : n_channels(channels),
+ inptrs(input_ptrs),
+ outptrs(output_ptrs)
+ {
+ for (unsigned int i = 0; i < 2; i++)
+ {
+ const int start_i = 1*i - static_cast<int>(pad_top);
+ const int end_i = std::min<int>(start_i + 3, 4 - pad_top - pad_bottom);
+ const int valid_rows = end_i - std::max<int>(0, start_i);
+
+ for (unsigned int j = 0; j < 2; j++)
+ {
+ const int start_j = 1*j - static_cast<int>(pad_left);
+ const int end_j = std::min<int>(start_j + 3, 4 - pad_left - pad_right);
+ const int valid_cols = end_j - std::max<int>(0, start_j);
+
+ rescale_vals[i*2 + j] = static_cast<__fp16>(1.0f / static_cast<float>(
+ exclude_padding ? valid_rows * valid_cols : 9
+ ));
+ }
+ }
+ }
+ };
+
+ const KernelArgs args(n_channels, inptrs, outptrs, exclude_padding,
+ pad_left, pad_top, pad_right, pad_bottom);
+
+ __asm__ __volatile__(
+ "ldr x21, [%x[args], %[offsetof_outptrs]]\n"
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "mov x3, #0x0\n"
+ "mov x20, #0x4\n"
+ "ldr x4, [%x[args], %[offsetof_inptrs]]\n"
+ "whilelt p0.h, XZR, x20\n"
+ "add x20, %x[args], %[offsetof_rescale]\n"
+ "ld1rqh { z4.h }, p0/Z, [x20]\n"
+ "ldr x5, [%x[args], %[offsetof_n_channels]]\n"
+ "whilelt p0.h, x3, x5\n"
+ "mov x6, #0x0\n"
+ "ldp x7, x8, [x21, #0x0]\n"
+ "ldp x17, x16, [x21, #0x10]\n"
+ "ldp x15, x14, [x4, #0x0]\n"
+ "ld1h { z3.h }, p0/Z, [x14, x3, LSL #1]\n"
+ "ldp x13, x12, [x4, #0x10]\n"
+ "ld1h { z2.h }, p0/Z, [x13, x3, LSL #1]\n"
+ "ldp x11, x10, [x4, #0x20]\n"
+ "ld1h { z1.h }, p0/Z, [x10, x3, LSL #1]\n"
+ "ldp x9, x28, [x4, #0x30]\n"
+ "ld1h { z0.h }, p0/Z, [x9, x3, LSL #1]\n"
+ "ldp x27, x26, [x4, #0x40]\n"
+ "ld1h { z31.h }, p0/Z, [x26, x3, LSL #1]\n"
+ "ldp x25, x24, [x4, #0x50]\n"
+ "ld1h { z30.h }, p0/Z, [x25, x3, LSL #1]\n"
+ "ldp x23, x22, [x4, #0x60]\n"
+ "ld1h { z29.h }, p0/Z, [x11, x3, LSL #1]\n"
+ "ldp x21, x20, [x4, #0x70]\n"
+ "ld1h { z28.h }, p0/Z, [x27, x3, LSL #1]\n"
+ "ld1h { z27.h }, p0/Z, [x28, x3, LSL #1]\n"
+ "ld1h { z22.h }, p0/Z, [x24, x3, LSL #1]\n"
+ "ld1h { z21.h }, p0/Z, [x22, x3, LSL #1]\n"
+ "ld1h { z20.h }, p0/Z, [x21, x3, LSL #1]\n"
+ "ld1h { z26.h }, p0/Z, [x15, x3, LSL #1]\n"
+ "ld1h { z25.h }, p0/Z, [x12, x3, LSL #1]\n"
+ "ld1h { z24.h }, p0/Z, [x23, x3, LSL #1]\n"
+ "ld1h { z23.h }, p0/Z, [x20, x3, LSL #1]\n"
+ "incw x3\n"
+ "whilelt p1.h, x3, x5\n"
+ "b.none 2f\n"
+ "1:" // Vector: Loop
+ "fadd z17.h, z1.h, z0.h\n"
+ "fadd z16.h, z31.h, z30.h\n"
+ "ld1h { z1.h }, p1/Z, [x10, x3, LSL #1]\n"
+ "whilelt p0.h, x6, x5\n"
+ "fadd z19.h, z17.h, z16.h\n"
+ "fadd z18.h, z3.h, z2.h\n"
+ "ld1h { z0.h }, p1/Z, [x9, x3, LSL #1]\n"
+ "fadd z17.h, z29.h, z28.h\n"
+ "fadd z22.h, z27.h, z22.h\n"
+ "ld1h { z31.h }, p1/Z, [x26, x3, LSL #1]\n"
+ "fadd z16.h, z21.h, z20.h\n"
+ "fadd z21.h, z18.h, z19.h\n"
+ "ld1h { z30.h }, p1/Z, [x25, x3, LSL #1]\n"
+ "fadd z20.h, z16.h, z19.h\n"
+ "fadd z19.h, z26.h, z17.h\n"
+ "ld1h { z3.h }, p1/Z, [x14, x3, LSL #1]\n"
+ "fadd z18.h, z25.h, z22.h\n"
+ "fadd z17.h, z24.h, z17.h\n"
+ "ld1h { z2.h }, p1/Z, [x13, x3, LSL #1]\n"
+ "fadd z16.h, z23.h, z22.h\n"
+ "fadd z19.h, z21.h, z19.h\n"
+ "ld1h { z29.h }, p1/Z, [x11, x3, LSL #1]\n"
+ "fadd z18.h, z21.h, z18.h\n"
+ "fadd z17.h, z17.h, z20.h\n"
+ "ld1h { z28.h }, p1/Z, [x27, x3, LSL #1]\n"
+ "fadd z16.h, z16.h, z20.h\n"
+ "ld1h { z27.h }, p1/Z, [x28, x3, LSL #1]\n"
+ "fmul z19.h, z19.h, z4.h[0]\n"
+ "ld1h { z22.h }, p1/Z, [x24, x3, LSL #1]\n"
+ "fmul z18.h, z18.h, z4.h[1]\n"
+ "fmul z17.h, z17.h, z4.h[2]\n"
+ "ld1h { z21.h }, p1/Z, [x22, x3, LSL #1]\n"
+ "fmul z16.h, z16.h, z4.h[3]\n"
+ "st1h { z19.h }, p0, [x7, x6, LSL #1]\n"
+ "ld1h { z20.h }, p1/Z, [x21, x3, LSL #1]\n"
+ "st1h { z18.h }, p0, [x8, x6, LSL #1]\n"
+ "ld1h { z26.h }, p1/Z, [x15, x3, LSL #1]\n"
+ "st1h { z17.h }, p0, [x17, x6, LSL #1]\n"
+ "ld1h { z25.h }, p1/Z, [x12, x3, LSL #1]\n"
+ "st1h { z16.h }, p0, [x16, x6, LSL #1]\n"
+ "incw x6\n"
+ "ld1h { z24.h }, p1/Z, [x23, x3, LSL #1]\n"
+ "ld1h { z23.h }, p1/Z, [x20, x3, LSL #1]\n"
+ "incw x3\n"
+ "whilelt p1.h, x3, x5\n"
+ "b.any 1b\n"
+ "2:" // Vector: Tail
+ "fadd z17.h, z1.h, z0.h\n"
+ "fadd z16.h, z31.h, z30.h\n"
+ "whilelt p0.h, x6, x5\n"
+ "fadd z19.h, z17.h, z16.h\n"
+ "fadd z18.h, z3.h, z2.h\n"
+ "fadd z17.h, z29.h, z28.h\n"
+ "fadd z22.h, z27.h, z22.h\n"
+ "fadd z16.h, z21.h, z20.h\n"
+ "fadd z21.h, z18.h, z19.h\n"
+ "fadd z20.h, z16.h, z19.h\n"
+ "fadd z19.h, z26.h, z17.h\n"
+ "fadd z18.h, z25.h, z22.h\n"
+ "fadd z17.h, z24.h, z17.h\n"
+ "fadd z16.h, z23.h, z22.h\n"
+ "fadd z19.h, z21.h, z19.h\n"
+ "fadd z18.h, z21.h, z18.h\n"
+ "fadd z17.h, z17.h, z20.h\n"
+ "fadd z16.h, z16.h, z20.h\n"
+ "fmul z19.h, z19.h, z4.h[0]\n"
+ "st1h { z19.h }, p0, [x7, x6, LSL #1]\n"
+ "fmul z18.h, z18.h, z4.h[1]\n"
+ "fmul z17.h, z17.h, z4.h[2]\n"
+ "st1h { z18.h }, p0, [x8, x6, LSL #1]\n"
+ "fmul z16.h, z16.h, z4.h[3]\n"
+ "st1h { z17.h }, p0, [x17, x6, LSL #1]\n"
+ "st1h { z16.h }, p0, [x16, x6, LSL #1]\n"
+ ".inst 0xd503467f // SMSTOP\n"
+ :
+ : [args] "r" (&args), [offsetof_inptrs] "I" (offsetof(KernelArgs, inptrs)), [offsetof_n_channels] "I" (offsetof(KernelArgs, n_channels)), [offsetof_outptrs] "I" (offsetof(KernelArgs, outptrs)), [offsetof_rescale] "I" (offsetof(KernelArgs, rescale_vals))
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace pooling
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME) && defined(__ARM_FP16_ARGS)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp16_nhwc_avg_generic_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp16_nhwc_avg_generic_depthfirst.hpp
new file mode 100644
index 0000000000..cf09f421c4
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp16_nhwc_avg_generic_depthfirst.hpp
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(ARM_COMPUTE_ENABLE_SME) && defined(__ARM_FP16_ARGS)
+
+namespace arm_conv {
+namespace pooling {
+
+void sme_fp16_nhwc_avg_generic_depthfirst_impl(const uint64_t window_cells, const uint64_t n_valid_cells, uint64_t n_channels, const __fp16 *const *const inptrs, __fp16 *outptr);
+
+struct sme_fp16_nhwc_avg_generic_depthfirst : IGenericDepthfirstStrategy<__fp16, __fp16>
+{
+ using Parent = IGenericDepthfirstStrategy<__fp16, __fp16>;
+ sme_fp16_nhwc_avg_generic_depthfirst(const CPUInfo *) {}
+ typename Parent::KernelType get_kernel(void) const override { return sme_fp16_nhwc_avg_generic_depthfirst_impl; }
+};
+
+} // namespace pooling
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME) && defined(__ARM_FP16_ARGS)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp16_nhwc_avg_generic_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp16_nhwc_avg_generic_depthfirst/generic.cpp
new file mode 100644
index 0000000000..60f17b7bc2
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp16_nhwc_avg_generic_depthfirst/generic.cpp
@@ -0,0 +1,233 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstdint>
+#include <cstddef>
+
+#if defined(ARM_COMPUTE_ENABLE_SME) && defined(__ARM_FP16_ARGS)
+
+namespace arm_conv {
+namespace pooling {
+
+
+void sme_fp16_nhwc_avg_generic_depthfirst_impl(
+ const uint64_t window_cells,
+ const uint64_t n_valid_cells,
+ uint64_t n_channels,
+ const __fp16 *const *const inptrs,
+ __fp16 *outptr
+)
+{
+ const auto rescale_value = static_cast<__fp16>(1.0f / static_cast<float>(window_cells));
+
+ __asm__ __volatile__(
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "mov x9, #0x0\n"
+ "cnth x28\n"
+ "cnth x27, ALL, MUL #2\n"
+ "cnth x26, ALL, MUL #3\n"
+ "ptrue p0.b\n"
+ "whilelt p3.h, x9, %x[n_channels]\n"
+ "ld1rh { z6.h }, p0/Z, [%x[rescale_ptr]]\n"
+ "whilelt p2.h, x28, %x[n_channels]\n"
+ "whilelt p1.h, x27, %x[n_channels]\n"
+ "whilelt p0.h, x26, %x[n_channels]\n"
+ "b.none 7f\n"
+ "1:" // 4-vectors of channels
+ "lsr x25, %x[n_valid_cells], #0x2\n"
+ "mov z5.b, #0x0\n"
+ "mov z4.b, #0x0\n"
+ "mov x24, %x[inptrs]\n"
+ "mov z3.b, #0x0\n"
+ "mov z2.b, #0x0\n"
+ "cbz x25, 4f\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "subs x25, x25, #0x1\n"
+ "ld1h { z1.h }, p3/Z, [x23, x9, LSL #1]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x24, x24, #0x20\n"
+ "ld1h { z0.h }, p3/Z, [x22, x9, LSL #1]\n"
+ "ld1h { z31.h }, p3/Z, [x21, x9, LSL #1]\n"
+ "ld1h { z30.h }, p3/Z, [x20, x9, LSL #1]\n"
+ "ld1h { z29.h }, p2/Z, [x23, x28, LSL #1]\n"
+ "ld1h { z22.h }, p2/Z, [x22, x28, LSL #1]\n"
+ "ld1h { z28.h }, p2/Z, [x21, x28, LSL #1]\n"
+ "ld1h { z18.h }, p2/Z, [x20, x28, LSL #1]\n"
+ "ld1h { z27.h }, p1/Z, [x23, x27, LSL #1]\n"
+ "ld1h { z21.h }, p1/Z, [x22, x27, LSL #1]\n"
+ "ld1h { z26.h }, p1/Z, [x21, x27, LSL #1]\n"
+ "ld1h { z17.h }, p1/Z, [x20, x27, LSL #1]\n"
+ "ld1h { z25.h }, p0/Z, [x23, x26, LSL #1]\n"
+ "ld1h { z20.h }, p0/Z, [x22, x26, LSL #1]\n"
+ "ld1h { z24.h }, p0/Z, [x21, x26, LSL #1]\n"
+ "ld1h { z16.h }, p0/Z, [x20, x26, LSL #1]\n"
+ "beq 3f\n"
+ "2:" // 4-vectors of channels: 4 inputs loop
+ "fadd z23.h, z1.h, z0.h\n"
+ "fadd z19.h, z31.h, z30.h\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "subs x25, x25, #0x1\n"
+ "fadd z22.h, z29.h, z22.h\n"
+ "fadd z18.h, z28.h, z18.h\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x24, x24, #0x20\n"
+ "fadd z21.h, z27.h, z21.h\n"
+ "fadd z17.h, z26.h, z17.h\n"
+ "ld1h { z1.h }, p3/Z, [x23, x9, LSL #1]\n"
+ "fadd z20.h, z25.h, z20.h\n"
+ "fadd z16.h, z24.h, z16.h\n"
+ "ld1h { z0.h }, p3/Z, [x22, x9, LSL #1]\n"
+ "fadd z19.h, z23.h, z19.h\n"
+ "fadd z18.h, z22.h, z18.h\n"
+ "ld1h { z31.h }, p3/Z, [x21, x9, LSL #1]\n"
+ "fadd z17.h, z21.h, z17.h\n"
+ "fadd z16.h, z20.h, z16.h\n"
+ "ld1h { z30.h }, p3/Z, [x20, x9, LSL #1]\n"
+ "fadd z5.h, z5.h, z19.h\n"
+ "fadd z4.h, z4.h, z18.h\n"
+ "ld1h { z29.h }, p2/Z, [x23, x28, LSL #1]\n"
+ "fadd z3.h, z3.h, z17.h\n"
+ "fadd z2.h, z2.h, z16.h\n"
+ "ld1h { z22.h }, p2/Z, [x22, x28, LSL #1]\n"
+ "ld1h { z28.h }, p2/Z, [x21, x28, LSL #1]\n"
+ "ld1h { z18.h }, p2/Z, [x20, x28, LSL #1]\n"
+ "ld1h { z27.h }, p1/Z, [x23, x27, LSL #1]\n"
+ "ld1h { z21.h }, p1/Z, [x22, x27, LSL #1]\n"
+ "ld1h { z26.h }, p1/Z, [x21, x27, LSL #1]\n"
+ "ld1h { z17.h }, p1/Z, [x20, x27, LSL #1]\n"
+ "ld1h { z25.h }, p0/Z, [x23, x26, LSL #1]\n"
+ "ld1h { z20.h }, p0/Z, [x22, x26, LSL #1]\n"
+ "ld1h { z24.h }, p0/Z, [x21, x26, LSL #1]\n"
+ "ld1h { z16.h }, p0/Z, [x20, x26, LSL #1]\n"
+ "bgt 2b\n"
+ "3:" // 4-vectors of channels: 4 inputs tail
+ "fadd z23.h, z1.h, z0.h\n"
+ "fadd z19.h, z31.h, z30.h\n"
+ "fadd z22.h, z29.h, z22.h\n"
+ "fadd z18.h, z28.h, z18.h\n"
+ "fadd z21.h, z27.h, z21.h\n"
+ "fadd z17.h, z26.h, z17.h\n"
+ "fadd z20.h, z25.h, z20.h\n"
+ "fadd z16.h, z24.h, z16.h\n"
+ "fadd z19.h, z23.h, z19.h\n"
+ "fadd z18.h, z22.h, z18.h\n"
+ "fadd z17.h, z21.h, z17.h\n"
+ "fadd z16.h, z20.h, z16.h\n"
+ "fadd z5.h, z5.h, z19.h\n"
+ "fadd z4.h, z4.h, z18.h\n"
+ "fadd z3.h, z3.h, z17.h\n"
+ "fadd z2.h, z2.h, z16.h\n"
+ "4:" // 4-vectors of channels: After loop
+ "ands x21, %x[n_valid_cells], #0x3\n"
+ "beq 6f\n"
+ "5:" // 4-vectors of channels: Single input loop
+ "ldr x20, [x24], #0x8\n"
+ "ld1h { z16.h }, p3/Z, [x20, x9, LSL #1]\n"
+ "subs x21, x21, #0x1\n"
+ "fadd z5.h, z5.h, z16.h\n"
+ "ld1h { z16.h }, p2/Z, [x20, x28, LSL #1]\n"
+ "fadd z4.h, z4.h, z16.h\n"
+ "ld1h { z16.h }, p1/Z, [x20, x27, LSL #1]\n"
+ "fadd z3.h, z3.h, z16.h\n"
+ "ld1h { z16.h }, p0/Z, [x20, x26, LSL #1]\n"
+ "fadd z2.h, z2.h, z16.h\n"
+ "bgt 5b\n"
+ "6:" // 4-vectors of channels: Single input loop: End
+ "fmul z5.h, z5.h, z6.h\n"
+ "fmul z4.h, z4.h, z6.h\n"
+ "st1h { z5.h }, p3, [%x[outptr], x9, LSL #1]\n"
+ "inch x9, ALL, MUL #4\n"
+ "fmul z3.h, z3.h, z6.h\n"
+ "fmul z2.h, z2.h, z6.h\n"
+ "st1h { z4.h }, p2, [%x[outptr], x28, LSL #1]\n"
+ "inch x28, ALL, MUL #4\n"
+ "st1h { z3.h }, p1, [%x[outptr], x27, LSL #1]\n"
+ "inch x27, ALL, MUL #4\n"
+ "st1h { z2.h }, p0, [%x[outptr], x26, LSL #1]\n"
+ "inch x26, ALL, MUL #4\n"
+ "whilelt p0.h, x26, %x[n_channels]\n"
+ "b.any 1b\n"
+ "7:" // Single vector of channels
+ "whilelt p3.h, x9, %x[n_channels]\n"
+ "b.none 14f\n"
+ "8:" // Single vector of channels: Loop
+ "lsr x25, %x[n_valid_cells], #0x2\n"
+ "mov z5.b, #0x0\n"
+ "mov x24, %x[inptrs]\n"
+ "cbz x25, 11f\n"
+ "ldp x20, x22, [x24, #0x0]\n"
+ "subs x25, x25, #0x1\n"
+ "ld1h { z1.h }, p3/Z, [x20, x9, LSL #1]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x24, x24, #0x20\n"
+ "ld1h { z0.h }, p3/Z, [x22, x9, LSL #1]\n"
+ "ld1h { z31.h }, p3/Z, [x21, x9, LSL #1]\n"
+ "ld1h { z30.h }, p3/Z, [x20, x9, LSL #1]\n"
+ "beq 10f\n"
+ "9:" // Single vector of channels: Loop: 4 inputs loop
+ "fadd z17.h, z1.h, z0.h\n"
+ "fadd z16.h, z31.h, z30.h\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "subs x25, x25, #0x1\n"
+ "fadd z16.h, z17.h, z16.h\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "fadd z5.h, z5.h, z16.h\n"
+ "add x24, x24, #0x20\n"
+ "ld1h { z1.h }, p3/Z, [x23, x9, LSL #1]\n"
+ "ld1h { z0.h }, p3/Z, [x22, x9, LSL #1]\n"
+ "ld1h { z31.h }, p3/Z, [x21, x9, LSL #1]\n"
+ "ld1h { z30.h }, p3/Z, [x20, x9, LSL #1]\n"
+ "bgt 9b\n"
+ "10:" // Single vector of channels: Loop: 4 inputs tail
+ "fadd z17.h, z1.h, z0.h\n"
+ "fadd z16.h, z31.h, z30.h\n"
+ "fadd z16.h, z17.h, z16.h\n"
+ "fadd z5.h, z5.h, z16.h\n"
+ "11:" // Single vector of channels: Loop: After loop
+ "ands x21, %x[n_valid_cells], #0x3\n"
+ "beq 13f\n"
+ "12:" // Single vector of channels: Loop: Single input loop
+ "ldr x20, [x24], #0x8\n"
+ "ld1h { z16.h }, p3/Z, [x20, x9, LSL #1]\n"
+ "subs x21, x21, #0x1\n"
+ "fadd z5.h, z5.h, z16.h\n"
+ "bgt 12b\n"
+ "13:" // Single vector of channels: Loop: Single input loop: End
+ "fmul z5.h, z5.h, z6.h\n"
+ "st1h { z5.h }, p3, [%x[outptr], x9, LSL #1]\n"
+ "inch x9\n"
+ "whilelt p3.h, x9, %x[n_channels]\n"
+ "b.any 8b\n"
+ "14:" // End
+ ".inst 0xd503467f // SMSTOP\n"
+ :
+ : [inptrs] "r" (inptrs), [n_channels] "r" (n_channels), [n_valid_cells] "r" (n_valid_cells), [outptr] "r" (outptr), [rescale_ptr] "r" (&rescale_value)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace pooling
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME) && defined(__ARM_FP16_ARGS)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp16_nhwc_max_2x2_s1_output2x2_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp16_nhwc_max_2x2_s1_output2x2_depthfirst.hpp
new file mode 100644
index 0000000000..cd6c7449a8
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp16_nhwc_max_2x2_s1_output2x2_depthfirst.hpp
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#if defined(ARM_COMPUTE_ENABLE_SME) && defined(__ARM_FP16_ARGS)
+
+namespace arm_conv {
+namespace pooling {
+
+void sme_fp16_nhwc_max_2x2_s1_output2x2_depthfirst_impl(unsigned int, const __fp16 *const *const, __fp16 *const *const, bool, unsigned int, unsigned int, unsigned int, unsigned int);
+
+struct sme_fp16_nhwc_max_2x2_s1_output2x2_depthfirst : public DepthfirstStrategy<__fp16, __fp16>
+{
+ using Parent = DepthfirstStrategy<__fp16, __fp16>;
+
+ const static auto pooling_type = PoolingType::MAX;
+ const static auto pool_rows = 2u, pool_cols = 2u;
+ const static auto stride_rows = 1u, stride_cols = 1u;
+
+ sme_fp16_nhwc_max_2x2_s1_output2x2_depthfirst(const CPUInfo *)
+ : Parent(pool_rows, pool_cols, stride_rows, stride_cols, 2, 2) {}
+
+ Parent::KernelType get_kernel(void) const { return sme_fp16_nhwc_max_2x2_s1_output2x2_depthfirst_impl; }
+};
+
+} // namespace pooling
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME) && defined(__ARM_FP16_ARGS)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp16_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp16_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp
new file mode 100644
index 0000000000..7fc776ed4e
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp16_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp
@@ -0,0 +1,148 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(ARM_COMPUTE_ENABLE_SME) && defined(__ARM_FP16_ARGS)
+
+namespace arm_conv {
+namespace pooling {
+
+void sme_fp16_nhwc_max_2x2_s1_output2x2_depthfirst_impl(
+ const unsigned int n_channels,
+ const __fp16 *const *const inptrs,
+ __fp16 *const *const outptrs,
+ const bool exclude_padding,
+ const unsigned int pad_left,
+ const unsigned int pad_top,
+ const unsigned int pad_right,
+ const unsigned int pad_bottom
+)
+{
+ struct KernelArgs
+ {
+ const uint64_t n_channels;
+ const __fp16 *const *const inptrs;
+ __fp16 *const *const outptrs;
+ KernelArgs(
+ unsigned int channels,
+ const __fp16 *const *input_ptrs,
+ __fp16 *const * output_ptrs,
+ bool, unsigned int, unsigned int, unsigned int, unsigned int
+ ) : n_channels(channels),
+ inptrs(input_ptrs),
+ outptrs(output_ptrs)
+ {
+ }
+ };
+
+ const KernelArgs args(n_channels, inptrs, outptrs, exclude_padding,
+ pad_left, pad_top, pad_right, pad_bottom);
+
+ __asm__ __volatile__(
+ "ldr x21, [%x[args], %[offsetof_outptrs]]\n"
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "mov x15, #0x0\n"
+ "ptrue p2.b\n"
+ "ldr x20, [%x[args], %[offsetof_inptrs]]\n"
+ "mov x14, #0x0\n"
+ "ldr x13, [%x[args], %[offsetof_n_channels]]\n"
+ "whilelt p0.h, x15, x13\n"
+ "ldp x12, x11, [x21, #0x0]\n"
+ "ldp x10, x9, [x21, #0x10]\n"
+ "ldp x28, x27, [x20, #0x0]\n"
+ "ld1h { z30.h }, p0/Z, [x27, x15, LSL #1]\n"
+ "ldp x26, x25, [x20, #0x10]\n"
+ "ld1h { z29.h }, p0/Z, [x25, x15, LSL #1]\n"
+ "ldp x24, x23, [x20, #0x20]\n"
+ "ld1h { z28.h }, p0/Z, [x24, x15, LSL #1]\n"
+ "ldp x22, x21, [x20, #0x30]\n"
+ "ld1h { z27.h }, p0/Z, [x21, x15, LSL #1]\n"
+ "ldr x20, [x20, #0x40]\n"
+ "ld1h { z26.h }, p0/Z, [x28, x15, LSL #1]\n"
+ "ld1h { z25.h }, p0/Z, [x26, x15, LSL #1]\n"
+ "ld1h { z24.h }, p0/Z, [x23, x15, LSL #1]\n"
+ "ld1h { z19.h }, p0/Z, [x22, x15, LSL #1]\n"
+ "ld1h { z23.h }, p0/Z, [x20, x15, LSL #1]\n"
+ "incw x15\n"
+ "whilelt p1.h, x15, x13\n"
+ "b.none 2f\n"
+ "1:" // Vector: Loop
+ "movprfx z22, z30\n fmax z22.h, p2/M, z22.h, z28.h\n"
+ "movprfx z21, z28\n fmax z21.h, p2/M, z21.h, z27.h\n"
+ "ld1h { z30.h }, p1/Z, [x27, x15, LSL #1]\n"
+ "whilelt p0.h, x14, x13\n"
+ "movprfx z18, z29\n fmax z18.h, p2/M, z18.h, z26.h\n"
+ "movprfx z17, z25\n fmax z17.h, p2/M, z17.h, z24.h\n"
+ "ld1h { z28.h }, p1/Z, [x24, x15, LSL #1]\n"
+ "movprfx z16, z29\n fmax z16.h, p2/M, z16.h, z19.h\n"
+ "movprfx z20, z24\n fmax z20.h, p2/M, z20.h, z23.h\n"
+ "ld1h { z27.h }, p1/Z, [x21, x15, LSL #1]\n"
+ "ld1h { z29.h }, p1/Z, [x25, x15, LSL #1]\n"
+ "movprfx z19, z22\n fmax z19.h, p2/M, z19.h, z18.h\n"
+ "movprfx z18, z17\n fmax z18.h, p2/M, z18.h, z22.h\n"
+ "ld1h { z26.h }, p1/Z, [x28, x15, LSL #1]\n"
+ "movprfx z17, z16\n fmax z17.h, p2/M, z17.h, z21.h\n"
+ "movprfx z16, z21\n fmax z16.h, p2/M, z16.h, z20.h\n"
+ "ld1h { z25.h }, p1/Z, [x26, x15, LSL #1]\n"
+ "st1h { z19.h }, p0, [x12, x14, LSL #1]\n"
+ "ld1h { z24.h }, p1/Z, [x23, x15, LSL #1]\n"
+ "st1h { z18.h }, p0, [x11, x14, LSL #1]\n"
+ "ld1h { z19.h }, p1/Z, [x22, x15, LSL #1]\n"
+ "st1h { z17.h }, p0, [x10, x14, LSL #1]\n"
+ "ld1h { z23.h }, p1/Z, [x20, x15, LSL #1]\n"
+ "incw x15\n"
+ "whilelt p1.h, x15, x13\n"
+ "st1h { z16.h }, p0, [x9, x14, LSL #1]\n"
+ "incw x14\n"
+ "b.any 1b\n"
+ "2:" // Vector: Tail
+ "movprfx z22, z30\n fmax z22.h, p2/M, z22.h, z28.h\n"
+ "movprfx z21, z28\n fmax z21.h, p2/M, z21.h, z27.h\n"
+ "whilelt p0.h, x14, x13\n"
+ "movprfx z20, z29\n fmax z20.h, p2/M, z20.h, z26.h\n"
+ "movprfx z18, z25\n fmax z18.h, p2/M, z18.h, z24.h\n"
+ "movprfx z17, z29\n fmax z17.h, p2/M, z17.h, z19.h\n"
+ "movprfx z19, z24\n fmax z19.h, p2/M, z19.h, z23.h\n"
+ "movprfx z16, z22\n fmax z16.h, p2/M, z16.h, z20.h\n"
+ "fmax z18.h, p2/M, z18.h, z22.h\n"
+ "st1h { z16.h }, p0, [x12, x14, LSL #1]\n"
+ "fmax z17.h, p2/M, z17.h, z21.h\n"
+ "movprfx z16, z21\n fmax z16.h, p2/M, z16.h, z19.h\n"
+ "st1h { z18.h }, p0, [x11, x14, LSL #1]\n"
+ "st1h { z17.h }, p0, [x10, x14, LSL #1]\n"
+ "st1h { z16.h }, p0, [x9, x14, LSL #1]\n"
+ ".inst 0xd503467f // SMSTOP\n"
+ :
+ : [args] "r" (&args), [offsetof_inptrs] "I" (offsetof(KernelArgs, inptrs)), [offsetof_n_channels] "I" (offsetof(KernelArgs, n_channels)), [offsetof_outptrs] "I" (offsetof(KernelArgs, outptrs))
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace pooling
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME) && defined(__ARM_FP16_ARGS)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp16_nhwc_max_generic_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp16_nhwc_max_generic_depthfirst.hpp
new file mode 100644
index 0000000000..bfb3bf5b1a
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp16_nhwc_max_generic_depthfirst.hpp
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(ARM_COMPUTE_ENABLE_SME) && defined(__ARM_FP16_ARGS)
+
+namespace arm_conv {
+namespace pooling {
+
+void sme_fp16_nhwc_max_generic_depthfirst_impl(const uint64_t, const uint64_t n_valid_cells, uint64_t n_channels, const __fp16 *const *const inptrs, __fp16 *outptr);
+
+struct sme_fp16_nhwc_max_generic_depthfirst : IGenericDepthfirstStrategy<__fp16, __fp16>  // Strategy wrapper exposing the SME fp16 NHWC generic max-pool kernel to the depthfirst driver.
+{
+  using Parent = IGenericDepthfirstStrategy<__fp16, __fp16>;
+  sme_fp16_nhwc_max_generic_depthfirst(const CPUInfo *) {}  // CPUInfo is accepted for interface uniformity but not used here.
+  typename Parent::KernelType get_kernel(void) const override { return sme_fp16_nhwc_max_generic_depthfirst_impl; }  // Returns the assembly kernel entry point.
+};
+
+} // namespace pooling
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME) && defined(__ARM_FP16_ARGS)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp16_nhwc_max_generic_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp16_nhwc_max_generic_depthfirst/generic.cpp
new file mode 100644
index 0000000000..afa2ccbd71
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp16_nhwc_max_generic_depthfirst/generic.cpp
@@ -0,0 +1,225 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstdint>
+#include <cstddef>
+
+#if defined(ARM_COMPUTE_ENABLE_SME) && defined(__ARM_FP16_ARGS)
+
+namespace arm_conv {
+namespace pooling {
+
+
+void sme_fp16_nhwc_max_generic_depthfirst_impl(  // Generic SME fp16 NHWC max pooling: per-channel max over n_valid_cells input cells.
+  const uint64_t,  // pooling window cell count — unused by max pooling (no rescale needed)
+  const uint64_t n_valid_cells,  // number of valid input pointers in inptrs
+  uint64_t n_channels,  // number of fp16 channels to reduce
+  const __fp16 *const *const inptrs,  // n_valid_cells pointers to per-cell channel data
+  __fp16 *outptr  // per-channel maxima are stored here
+)
+{
+  __asm__ __volatile__(  // Processes 4 SVE vectors of channels per pass, then a single-vector tail; cells consumed 4 at a time with a scalar remainder loop.
+    ".inst 0xd503477f  // SMSTART ZA\n"
+    "mov x9, #0x0\n"
+    "cnth x28\n"
+    "cnth x27, ALL, MUL #2\n"
+    "cnth x26, ALL, MUL #3\n"
+    "whilelt p4.h, x9, %x[n_channels]\n"
+    "whilelt p3.h, x28, %x[n_channels]\n"
+    "whilelt p2.h, x27, %x[n_channels]\n"
+    "whilelt p1.h, x26, %x[n_channels]\n"
+    "ptrue p0.b\n"
+    "b.none 7f\n"
+    "1:"  // 4-vectors of channels
+    "lsr x25, %x[n_valid_cells], #0x2\n"
+    "mov z4.h, #0xfc00\n"  // 0xfc00 is fp16 -infinity: identity value for the max reduction
+    "mov z3.h, #0xfc00\n"
+    "mov x24, %x[inptrs]\n"
+    "mov z2.h, #0xfc00\n"
+    "mov z1.h, #0xfc00\n"
+    "cbz x25, 4f\n"
+    "ldp x23, x22, [x24, #0x0]\n"
+    "subs x25, x25, #0x1\n"
+    "ld1h { z0.h }, p4/Z, [x23, x9, LSL #1]\n"
+    "ldp x21, x20, [x24, #0x10]\n"
+    "add x24, x24, #0x20\n"
+    "ld1h { z31.h }, p4/Z, [x22, x9, LSL #1]\n"
+    "ld1h { z23.h }, p4/Z, [x21, x9, LSL #1]\n"
+    "ld1h { z30.h }, p4/Z, [x20, x9, LSL #1]\n"
+    "ld1h { z18.h }, p3/Z, [x23, x28, LSL #1]\n"
+    "ld1h { z29.h }, p3/Z, [x22, x28, LSL #1]\n"
+    "ld1h { z22.h }, p3/Z, [x21, x28, LSL #1]\n"
+    "ld1h { z28.h }, p3/Z, [x20, x28, LSL #1]\n"
+    "ld1h { z17.h }, p2/Z, [x23, x27, LSL #1]\n"
+    "ld1h { z27.h }, p2/Z, [x22, x27, LSL #1]\n"
+    "ld1h { z21.h }, p2/Z, [x21, x27, LSL #1]\n"
+    "ld1h { z26.h }, p2/Z, [x20, x27, LSL #1]\n"
+    "ld1h { z16.h }, p1/Z, [x23, x26, LSL #1]\n"
+    "ld1h { z25.h }, p1/Z, [x22, x26, LSL #1]\n"
+    "ld1h { z20.h }, p1/Z, [x21, x26, LSL #1]\n"
+    "ld1h { z24.h }, p1/Z, [x20, x26, LSL #1]\n"
+    "beq 3f\n"
+    "2:"  // 4-vectors of channels: 4 inputs loop
+    "movprfx z19, z0\n fmax z19.h, p0/M, z19.h, z31.h\n"
+    "fmax z23.h, p0/M, z23.h, z30.h\n"
+    "ldp x23, x22, [x24, #0x0]\n"
+    "subs x25, x25, #0x1\n"
+    "fmax z18.h, p0/M, z18.h, z29.h\n"
+    "fmax z22.h, p0/M, z22.h, z28.h\n"
+    "ldp x21, x20, [x24, #0x10]\n"
+    "add x24, x24, #0x20\n"
+    "fmax z17.h, p0/M, z17.h, z27.h\n"
+    "fmax z21.h, p0/M, z21.h, z26.h\n"
+    "ld1h { z0.h }, p4/Z, [x23, x9, LSL #1]\n"
+    "fmax z16.h, p0/M, z16.h, z25.h\n"
+    "fmax z20.h, p0/M, z20.h, z24.h\n"
+    "ld1h { z31.h }, p4/Z, [x22, x9, LSL #1]\n"
+    "fmax z19.h, p0/M, z19.h, z23.h\n"
+    "fmax z18.h, p0/M, z18.h, z22.h\n"
+    "ld1h { z23.h }, p4/Z, [x21, x9, LSL #1]\n"
+    "fmax z17.h, p0/M, z17.h, z21.h\n"
+    "fmax z16.h, p0/M, z16.h, z20.h\n"
+    "ld1h { z30.h }, p4/Z, [x20, x9, LSL #1]\n"
+    "fmax z4.h, p0/M, z4.h, z19.h\n"
+    "fmax z3.h, p0/M, z3.h, z18.h\n"
+    "ld1h { z18.h }, p3/Z, [x23, x28, LSL #1]\n"
+    "fmax z2.h, p0/M, z2.h, z17.h\n"
+    "fmax z1.h, p0/M, z1.h, z16.h\n"
+    "ld1h { z29.h }, p3/Z, [x22, x28, LSL #1]\n"
+    "ld1h { z22.h }, p3/Z, [x21, x28, LSL #1]\n"
+    "ld1h { z28.h }, p3/Z, [x20, x28, LSL #1]\n"
+    "ld1h { z17.h }, p2/Z, [x23, x27, LSL #1]\n"
+    "ld1h { z27.h }, p2/Z, [x22, x27, LSL #1]\n"
+    "ld1h { z21.h }, p2/Z, [x21, x27, LSL #1]\n"
+    "ld1h { z26.h }, p2/Z, [x20, x27, LSL #1]\n"
+    "ld1h { z16.h }, p1/Z, [x23, x26, LSL #1]\n"
+    "ld1h { z25.h }, p1/Z, [x22, x26, LSL #1]\n"
+    "ld1h { z20.h }, p1/Z, [x21, x26, LSL #1]\n"
+    "ld1h { z24.h }, p1/Z, [x20, x26, LSL #1]\n"
+    "bgt 2b\n"
+    "3:"  // 4-vectors of channels: 4 inputs tail
+    "movprfx z19, z0\n fmax z19.h, p0/M, z19.h, z31.h\n"
+    "fmax z23.h, p0/M, z23.h, z30.h\n"
+    "fmax z18.h, p0/M, z18.h, z29.h\n"
+    "fmax z22.h, p0/M, z22.h, z28.h\n"
+    "fmax z17.h, p0/M, z17.h, z27.h\n"
+    "fmax z21.h, p0/M, z21.h, z26.h\n"
+    "fmax z16.h, p0/M, z16.h, z25.h\n"
+    "fmax z20.h, p0/M, z20.h, z24.h\n"
+    "fmax z19.h, p0/M, z19.h, z23.h\n"
+    "fmax z18.h, p0/M, z18.h, z22.h\n"
+    "fmax z17.h, p0/M, z17.h, z21.h\n"
+    "fmax z16.h, p0/M, z16.h, z20.h\n"
+    "fmax z4.h, p0/M, z4.h, z19.h\n"
+    "fmax z3.h, p0/M, z3.h, z18.h\n"
+    "fmax z2.h, p0/M, z2.h, z17.h\n"
+    "fmax z1.h, p0/M, z1.h, z16.h\n"
+    "4:"  // 4-vectors of channels: After loop
+    "ands x21, %x[n_valid_cells], #0x3\n"  // remaining cells not covered by the 4-at-a-time loop
+    "beq 6f\n"
+    "5:"  // 4-vectors of channels: Single input loop
+    "ldr x20, [x24], #0x8\n"
+    "ld1h { z16.h }, p4/Z, [x20, x9, LSL #1]\n"
+    "subs x21, x21, #0x1\n"
+    "fmax z4.h, p0/M, z4.h, z16.h\n"
+    "ld1h { z16.h }, p3/Z, [x20, x28, LSL #1]\n"
+    "fmax z3.h, p0/M, z3.h, z16.h\n"
+    "ld1h { z16.h }, p2/Z, [x20, x27, LSL #1]\n"
+    "fmax z2.h, p0/M, z2.h, z16.h\n"
+    "ld1h { z16.h }, p1/Z, [x20, x26, LSL #1]\n"
+    "fmax z1.h, p0/M, z1.h, z16.h\n"
+    "bgt 5b\n"
+    "6:"  // 4-vectors of channels: Single input loop: End
+    "st1h { z4.h }, p4, [%x[outptr], x9, LSL #1]\n"
+    "inch x9, ALL, MUL #4\n"
+    "st1h { z3.h }, p3, [%x[outptr], x28, LSL #1]\n"
+    "inch x28, ALL, MUL #4\n"
+    "st1h { z2.h }, p2, [%x[outptr], x27, LSL #1]\n"
+    "inch x27, ALL, MUL #4\n"
+    "st1h { z1.h }, p1, [%x[outptr], x26, LSL #1]\n"
+    "inch x26, ALL, MUL #4\n"
+    "whilelt p1.h, x26, %x[n_channels]\n"
+    "b.any 1b\n"
+    "7:"  // Single vector of channels
+    "whilelt p4.h, x9, %x[n_channels]\n"
+    "b.none 14f\n"
+    "8:"  // Single vector of channels: Loop
+    "lsr x25, %x[n_valid_cells], #0x2\n"
+    "mov z4.h, #0xfc00\n"  // fp16 -infinity accumulator for the single-vector tail
+    "mov x24, %x[inptrs]\n"
+    "cbz x25, 11f\n"
+    "ldp x20, x22, [x24, #0x0]\n"
+    "subs x25, x25, #0x1\n"
+    "ld1h { z0.h }, p4/Z, [x20, x9, LSL #1]\n"
+    "ldp x21, x20, [x24, #0x10]\n"
+    "add x24, x24, #0x20\n"
+    "ld1h { z31.h }, p4/Z, [x22, x9, LSL #1]\n"
+    "ld1h { z23.h }, p4/Z, [x21, x9, LSL #1]\n"
+    "ld1h { z30.h }, p4/Z, [x20, x9, LSL #1]\n"
+    "beq 10f\n"
+    "9:"  // Single vector of channels: Loop: 4 inputs loop
+    "movprfx z16, z0\n fmax z16.h, p0/M, z16.h, z31.h\n"
+    "movprfx z17, z23\n fmax z17.h, p0/M, z17.h, z30.h\n"
+    "ldp x23, x22, [x24, #0x0]\n"
+    "subs x25, x25, #0x1\n"
+    "fmax z16.h, p0/M, z16.h, z17.h\n"
+    "ldp x21, x20, [x24, #0x10]\n"
+    "fmax z4.h, p0/M, z4.h, z16.h\n"
+    "add x24, x24, #0x20\n"
+    "ld1h { z0.h }, p4/Z, [x23, x9, LSL #1]\n"
+    "ld1h { z31.h }, p4/Z, [x22, x9, LSL #1]\n"
+    "ld1h { z23.h }, p4/Z, [x21, x9, LSL #1]\n"
+    "ld1h { z30.h }, p4/Z, [x20, x9, LSL #1]\n"
+    "bgt 9b\n"
+    "10:"  // Single vector of channels: Loop: 4 inputs tail
+    "movprfx z16, z0\n fmax z16.h, p0/M, z16.h, z31.h\n"
+    "movprfx z17, z23\n fmax z17.h, p0/M, z17.h, z30.h\n"
+    "fmax z16.h, p0/M, z16.h, z17.h\n"
+    "fmax z4.h, p0/M, z4.h, z16.h\n"
+    "11:"  // Single vector of channels: Loop: After loop
+    "ands x21, %x[n_valid_cells], #0x3\n"
+    "beq 13f\n"
+    "12:"  // Single vector of channels: Loop: Single input loop
+    "ldr x20, [x24], #0x8\n"
+    "ld1h { z16.h }, p4/Z, [x20, x9, LSL #1]\n"
+    "subs x21, x21, #0x1\n"
+    "fmax z4.h, p0/M, z4.h, z16.h\n"
+    "bgt 12b\n"
+    "13:"  // Single vector of channels: Loop: Single input loop: End
+    "st1h { z4.h }, p4, [%x[outptr], x9, LSL #1]\n"
+    "inch x9\n"
+    "whilelt p4.h, x9, %x[n_channels]\n"
+    "b.any 8b\n"
+    "14:"  // End
+    ".inst 0xd503467f  // SMSTOP\n"
+    :
+    : [inptrs] "r" (inptrs), [n_channels] "r" (n_channels), [n_valid_cells] "r" (n_valid_cells), [outptr] "r" (outptr)
+    : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+  );
+}
+
+} // namespace pooling
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME) && defined(__ARM_FP16_ARGS)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst.hpp
new file mode 100644
index 0000000000..23a0eee04e
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst.hpp
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#if defined(ARM_COMPUTE_ENABLE_SME)
+
+namespace arm_conv {
+namespace pooling {
+
+void sme_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst_impl(unsigned int, const float *const *const, float *const *const, bool, unsigned int, unsigned int, unsigned int, unsigned int);
+
+struct sme_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst : public DepthfirstStrategy<float, float>  // Strategy wrapper for the SME fp32 3x3/stride-1 average-pool kernel producing a 2x2 output tile.
+{
+  using Parent = DepthfirstStrategy<float, float>;
+
+  const static auto pooling_type = PoolingType::AVERAGE;
+  const static auto pool_rows = 3u, pool_cols = 3u;  // 3x3 pooling window
+  const static auto stride_rows = 1u, stride_cols = 1u;  // unit stride in both dimensions
+
+  sme_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst(const CPUInfo *)  // CPUInfo is accepted for interface uniformity but not used here.
+  : Parent(pool_rows, pool_cols, stride_rows, stride_cols, 2, 2) {}  // trailing 2, 2 = output tile rows and columns
+
+  Parent::KernelType get_kernel(void) const { return sme_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst_impl; }  // Returns the assembly kernel entry point.
+};
+
+} // namespace pooling
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst/generic.cpp
new file mode 100644
index 0000000000..8c8532827a
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst/generic.cpp
@@ -0,0 +1,209 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <algorithm>
+#include <cstddef>
+#include <cstdint>
+
+#if defined(ARM_COMPUTE_ENABLE_SME)
+
+namespace arm_conv {
+namespace pooling {
+
+void sme_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst_impl(  // SME fp32 NHWC 3x3 stride-1 average pool over a 4x4 input patch, producing a 2x2 output tile.
+  const unsigned int n_channels,  // channels processed per output position
+  const float *const *const inptrs,  // 16 pointers into the 4x4 input patch (read via offsetof_inptrs)
+  float *const *const outptrs,  // 4 pointers for the 2x2 output tile (read via offsetof_outptrs)
+  const bool exclude_padding,  // true: average over valid cells only; false: always divide by 9
+  const unsigned int pad_left,
+  const unsigned int pad_top,
+  const unsigned int pad_right,
+  const unsigned int pad_bottom
+)
+{
+  struct KernelArgs  // Marshals arguments into a single struct the assembly addresses via offsetof.
+  {
+    const uint64_t n_channels;
+    const float *const *const inptrs;
+    float *const *const outptrs;
+    float rescale_vals[4];  // one averaging multiplier per 2x2 output position
+
+    KernelArgs(
+      unsigned int channels,
+      const float *const *input_ptrs,
+      float *const * output_ptrs,
+      bool exclude_padding, unsigned int pad_left, unsigned int pad_top, unsigned int pad_right, unsigned int pad_bottom
+    ) : n_channels(channels),
+        inptrs(input_ptrs),
+        outptrs(output_ptrs)
+    {
+      for (unsigned int i = 0; i < 2; i++)
+      {
+        const int start_i = 1*i - static_cast<int>(pad_top);  // first input row of the window for output row i (negative while in top padding)
+        const int end_i = std::min<int>(start_i + 3, 4 - pad_top - pad_bottom);  // one-past-last valid row within the 4-row patch
+        const int valid_rows = end_i - std::max<int>(0, start_i);
+
+        for (unsigned int j = 0; j < 2; j++)
+        {
+          const int start_j = 1*j - static_cast<int>(pad_left);  // first input column of the window for output column j
+          const int end_j = std::min<int>(start_j + 3, 4 - pad_left - pad_right);  // one-past-last valid column within the 4-column patch
+          const int valid_cols = end_j - std::max<int>(0, start_j);
+
+          rescale_vals[i*2 + j] = static_cast<float>(1.0f / static_cast<float>(
+            exclude_padding ? valid_rows * valid_cols : 9  // 1/(valid window area) or 1/9 for the full 3x3 window
+          ));
+        }
+      }
+    }
+  };
+
+  const KernelArgs args(n_channels, inptrs, outptrs, exclude_padding,
+                        pad_left, pad_top, pad_right, pad_bottom);
+
+  __asm__ __volatile__(  // Sums each 3x3 window via shared partial fadd trees, then multiplies by the per-position rescale value.
+    "ldr x21, [%x[args], %[offsetof_outptrs]]\n"
+    ".inst 0xd503477f  // SMSTART ZA\n"
+    "mov x3, #0x0\n"
+    "mov x20, #0x4\n"
+    "ldr x4, [%x[args], %[offsetof_inptrs]]\n"
+    "whilelt p0.s, XZR, x20\n"
+    "add x20, %x[args], %[offsetof_rescale]\n"
+    "ld1rqw { z4.s }, p0/Z, [x20]\n"  // z4 = the four per-output-position rescale multipliers
+    "ldr x5, [%x[args], %[offsetof_n_channels]]\n"
+    "whilelt p0.s, x3, x5\n"
+    "mov x6, #0x0\n"
+    "ldp x7, x8, [x21, #0x0]\n"
+    "ldp x17, x16, [x21, #0x10]\n"
+    "ldp x15, x14, [x4, #0x0]\n"
+    "ld1w { z3.s }, p0/Z, [x14, x3, LSL #2]\n"
+    "ldp x13, x12, [x4, #0x10]\n"
+    "ld1w { z2.s }, p0/Z, [x13, x3, LSL #2]\n"
+    "ldp x11, x10, [x4, #0x20]\n"
+    "ld1w { z1.s }, p0/Z, [x10, x3, LSL #2]\n"
+    "ldp x9, x28, [x4, #0x30]\n"
+    "ld1w { z0.s }, p0/Z, [x9, x3, LSL #2]\n"
+    "ldp x27, x26, [x4, #0x40]\n"
+    "ld1w { z31.s }, p0/Z, [x26, x3, LSL #2]\n"
+    "ldp x25, x24, [x4, #0x50]\n"
+    "ld1w { z30.s }, p0/Z, [x25, x3, LSL #2]\n"
+    "ldp x23, x22, [x4, #0x60]\n"
+    "ld1w { z29.s }, p0/Z, [x11, x3, LSL #2]\n"
+    "ldp x21, x20, [x4, #0x70]\n"
+    "ld1w { z28.s }, p0/Z, [x27, x3, LSL #2]\n"
+    "ld1w { z27.s }, p0/Z, [x28, x3, LSL #2]\n"
+    "ld1w { z22.s }, p0/Z, [x24, x3, LSL #2]\n"
+    "ld1w { z21.s }, p0/Z, [x22, x3, LSL #2]\n"
+    "ld1w { z20.s }, p0/Z, [x21, x3, LSL #2]\n"
+    "ld1w { z26.s }, p0/Z, [x15, x3, LSL #2]\n"
+    "ld1w { z25.s }, p0/Z, [x12, x3, LSL #2]\n"
+    "ld1w { z24.s }, p0/Z, [x23, x3, LSL #2]\n"
+    "ld1w { z23.s }, p0/Z, [x20, x3, LSL #2]\n"
+    "incw x3\n"
+    "whilelt p1.s, x3, x5\n"
+    "b.none 2f\n"
+    "1:"  // Vector: Loop
+    "fadd z17.s, z1.s, z0.s\n"
+    "fadd z16.s, z31.s, z30.s\n"
+    "ld1w { z1.s }, p1/Z, [x10, x3, LSL #2]\n"
+    "whilelt p0.s, x6, x5\n"
+    "fadd z19.s, z17.s, z16.s\n"
+    "fadd z18.s, z3.s, z2.s\n"
+    "ld1w { z0.s }, p1/Z, [x9, x3, LSL #2]\n"
+    "fadd z17.s, z29.s, z28.s\n"
+    "fadd z22.s, z27.s, z22.s\n"
+    "ld1w { z31.s }, p1/Z, [x26, x3, LSL #2]\n"
+    "fadd z16.s, z21.s, z20.s\n"
+    "fadd z21.s, z18.s, z19.s\n"
+    "ld1w { z30.s }, p1/Z, [x25, x3, LSL #2]\n"
+    "fadd z20.s, z16.s, z19.s\n"
+    "fadd z19.s, z26.s, z17.s\n"
+    "ld1w { z3.s }, p1/Z, [x14, x3, LSL #2]\n"
+    "fadd z18.s, z25.s, z22.s\n"
+    "fadd z17.s, z24.s, z17.s\n"
+    "ld1w { z2.s }, p1/Z, [x13, x3, LSL #2]\n"
+    "fadd z16.s, z23.s, z22.s\n"
+    "fadd z19.s, z21.s, z19.s\n"
+    "ld1w { z29.s }, p1/Z, [x11, x3, LSL #2]\n"
+    "fadd z18.s, z21.s, z18.s\n"
+    "fadd z17.s, z17.s, z20.s\n"
+    "ld1w { z28.s }, p1/Z, [x27, x3, LSL #2]\n"
+    "fadd z16.s, z16.s, z20.s\n"
+    "ld1w { z27.s }, p1/Z, [x28, x3, LSL #2]\n"
+    "fmul z19.s, z19.s, z4.s[0]\n"
+    "ld1w { z22.s }, p1/Z, [x24, x3, LSL #2]\n"
+    "fmul z18.s, z18.s, z4.s[1]\n"
+    "fmul z17.s, z17.s, z4.s[2]\n"
+    "ld1w { z21.s }, p1/Z, [x22, x3, LSL #2]\n"
+    "fmul z16.s, z16.s, z4.s[3]\n"
+    "st1w { z19.s }, p0, [x7, x6, LSL #2]\n"
+    "ld1w { z20.s }, p1/Z, [x21, x3, LSL #2]\n"
+    "st1w { z18.s }, p0, [x8, x6, LSL #2]\n"
+    "ld1w { z26.s }, p1/Z, [x15, x3, LSL #2]\n"
+    "st1w { z17.s }, p0, [x17, x6, LSL #2]\n"
+    "ld1w { z25.s }, p1/Z, [x12, x3, LSL #2]\n"
+    "st1w { z16.s }, p0, [x16, x6, LSL #2]\n"
+    "incw x6\n"
+    "ld1w { z24.s }, p1/Z, [x23, x3, LSL #2]\n"
+    "ld1w { z23.s }, p1/Z, [x20, x3, LSL #2]\n"
+    "incw x3\n"
+    "whilelt p1.s, x3, x5\n"
+    "b.any 1b\n"
+    "2:"  // Vector: Tail
+    "fadd z17.s, z1.s, z0.s\n"
+    "fadd z16.s, z31.s, z30.s\n"
+    "whilelt p0.s, x6, x5\n"
+    "fadd z19.s, z17.s, z16.s\n"
+    "fadd z18.s, z3.s, z2.s\n"
+    "fadd z17.s, z29.s, z28.s\n"
+    "fadd z22.s, z27.s, z22.s\n"
+    "fadd z16.s, z21.s, z20.s\n"
+    "fadd z21.s, z18.s, z19.s\n"
+    "fadd z20.s, z16.s, z19.s\n"
+    "fadd z19.s, z26.s, z17.s\n"
+    "fadd z18.s, z25.s, z22.s\n"
+    "fadd z17.s, z24.s, z17.s\n"
+    "fadd z16.s, z23.s, z22.s\n"
+    "fadd z19.s, z21.s, z19.s\n"
+    "fadd z18.s, z21.s, z18.s\n"
+    "fadd z17.s, z17.s, z20.s\n"
+    "fadd z16.s, z16.s, z20.s\n"
+    "fmul z19.s, z19.s, z4.s[0]\n"
+    "st1w { z19.s }, p0, [x7, x6, LSL #2]\n"
+    "fmul z18.s, z18.s, z4.s[1]\n"
+    "fmul z17.s, z17.s, z4.s[2]\n"
+    "st1w { z18.s }, p0, [x8, x6, LSL #2]\n"
+    "fmul z16.s, z16.s, z4.s[3]\n"
+    "st1w { z17.s }, p0, [x17, x6, LSL #2]\n"
+    "st1w { z16.s }, p0, [x16, x6, LSL #2]\n"
+    ".inst 0xd503467f  // SMSTOP\n"
+    :
+    : [args] "r" (&args), [offsetof_inptrs] "I" (offsetof(KernelArgs, inptrs)), [offsetof_n_channels] "I" (offsetof(KernelArgs, n_channels)), [offsetof_outptrs] "I" (offsetof(KernelArgs, outptrs)), [offsetof_rescale] "I" (offsetof(KernelArgs, rescale_vals))
+    : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+  );
+}
+
+} // namespace pooling
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp32_nhwc_avg_generic_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp32_nhwc_avg_generic_depthfirst.hpp
new file mode 100644
index 0000000000..29bcfc5a3b
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp32_nhwc_avg_generic_depthfirst.hpp
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(ARM_COMPUTE_ENABLE_SME)
+
+namespace arm_conv {
+namespace pooling {
+
+void sme_fp32_nhwc_avg_generic_depthfirst_impl(const uint64_t window_cells, const uint64_t n_valid_cells, uint64_t n_channels, const float *const *const inptrs, float *outptr);
+
+struct sme_fp32_nhwc_avg_generic_depthfirst : IGenericDepthfirstStrategy<float, float>  // Strategy wrapper exposing the SME fp32 NHWC generic average-pool kernel to the depthfirst driver.
+{
+  using Parent = IGenericDepthfirstStrategy<float, float>;
+  sme_fp32_nhwc_avg_generic_depthfirst(const CPUInfo *) {}  // CPUInfo is accepted for interface uniformity but not used here.
+  typename Parent::KernelType get_kernel(void) const override { return sme_fp32_nhwc_avg_generic_depthfirst_impl; }  // Returns the assembly kernel entry point.
+};
+
+} // namespace pooling
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp32_nhwc_avg_generic_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp32_nhwc_avg_generic_depthfirst/generic.cpp
new file mode 100644
index 0000000000..86e7f84542
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp32_nhwc_avg_generic_depthfirst/generic.cpp
@@ -0,0 +1,233 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstdint>
+#include <cstddef>
+
+#if defined(ARM_COMPUTE_ENABLE_SME)
+
+namespace arm_conv {
+namespace pooling {
+
+
+void sme_fp32_nhwc_avg_generic_depthfirst_impl(  // Generic SME fp32 NHWC average pooling: per-channel sum over n_valid_cells, scaled by 1/window_cells.
+  const uint64_t window_cells,  // full pooling window size; determines the rescale factor
+  const uint64_t n_valid_cells,  // number of valid input pointers in inptrs
+  uint64_t n_channels,  // number of fp32 channels to reduce
+  const float *const *const inptrs,  // n_valid_cells pointers to per-cell channel data
+  float *outptr  // per-channel averages are stored here
+)
+{
+  const auto rescale_value = static_cast<float>(1.0f / static_cast<float>(window_cells));  // precomputed so the kernel multiplies rather than divides
+
+  __asm__ __volatile__(  // Processes 4 SVE vectors of channels per pass, then a single-vector tail; cells consumed 4 at a time with a scalar remainder loop.
+    ".inst 0xd503477f  // SMSTART ZA\n"
+    "mov x9, #0x0\n"
+    "cntw x28\n"
+    "cntw x27, ALL, MUL #2\n"
+    "cntw x26, ALL, MUL #3\n"
+    "ptrue p0.b\n"
+    "whilelt p3.s, x9, %x[n_channels]\n"
+    "ld1rw { z6.s }, p0/Z, [%x[rescale_ptr]]\n"  // broadcast 1/window_cells into z6
+    "whilelt p2.s, x28, %x[n_channels]\n"
+    "whilelt p1.s, x27, %x[n_channels]\n"
+    "whilelt p0.s, x26, %x[n_channels]\n"
+    "b.none 7f\n"
+    "1:"  // 4-vectors of channels
+    "lsr x25, %x[n_valid_cells], #0x2\n"
+    "mov z5.b, #0x0\n"  // zero the accumulators: identity for the fadd reduction
+    "mov z4.b, #0x0\n"
+    "mov x24, %x[inptrs]\n"
+    "mov z3.b, #0x0\n"
+    "mov z2.b, #0x0\n"
+    "cbz x25, 4f\n"
+    "ldp x23, x22, [x24, #0x0]\n"
+    "subs x25, x25, #0x1\n"
+    "ld1w { z1.s }, p3/Z, [x23, x9, LSL #2]\n"
+    "ldp x21, x20, [x24, #0x10]\n"
+    "add x24, x24, #0x20\n"
+    "ld1w { z0.s }, p3/Z, [x22, x9, LSL #2]\n"
+    "ld1w { z31.s }, p3/Z, [x21, x9, LSL #2]\n"
+    "ld1w { z30.s }, p3/Z, [x20, x9, LSL #2]\n"
+    "ld1w { z29.s }, p2/Z, [x23, x28, LSL #2]\n"
+    "ld1w { z22.s }, p2/Z, [x22, x28, LSL #2]\n"
+    "ld1w { z28.s }, p2/Z, [x21, x28, LSL #2]\n"
+    "ld1w { z18.s }, p2/Z, [x20, x28, LSL #2]\n"
+    "ld1w { z27.s }, p1/Z, [x23, x27, LSL #2]\n"
+    "ld1w { z21.s }, p1/Z, [x22, x27, LSL #2]\n"
+    "ld1w { z26.s }, p1/Z, [x21, x27, LSL #2]\n"
+    "ld1w { z17.s }, p1/Z, [x20, x27, LSL #2]\n"
+    "ld1w { z25.s }, p0/Z, [x23, x26, LSL #2]\n"
+    "ld1w { z20.s }, p0/Z, [x22, x26, LSL #2]\n"
+    "ld1w { z24.s }, p0/Z, [x21, x26, LSL #2]\n"
+    "ld1w { z16.s }, p0/Z, [x20, x26, LSL #2]\n"
+    "beq 3f\n"
+    "2:"  // 4-vectors of channels: 4 inputs loop
+    "fadd z23.s, z1.s, z0.s\n"
+    "fadd z19.s, z31.s, z30.s\n"
+    "ldp x23, x22, [x24, #0x0]\n"
+    "subs x25, x25, #0x1\n"
+    "fadd z22.s, z29.s, z22.s\n"
+    "fadd z18.s, z28.s, z18.s\n"
+    "ldp x21, x20, [x24, #0x10]\n"
+    "add x24, x24, #0x20\n"
+    "fadd z21.s, z27.s, z21.s\n"
+    "fadd z17.s, z26.s, z17.s\n"
+    "ld1w { z1.s }, p3/Z, [x23, x9, LSL #2]\n"
+    "fadd z20.s, z25.s, z20.s\n"
+    "fadd z16.s, z24.s, z16.s\n"
+    "ld1w { z0.s }, p3/Z, [x22, x9, LSL #2]\n"
+    "fadd z19.s, z23.s, z19.s\n"
+    "fadd z18.s, z22.s, z18.s\n"
+    "ld1w { z31.s }, p3/Z, [x21, x9, LSL #2]\n"
+    "fadd z17.s, z21.s, z17.s\n"
+    "fadd z16.s, z20.s, z16.s\n"
+    "ld1w { z30.s }, p3/Z, [x20, x9, LSL #2]\n"
+    "fadd z5.s, z5.s, z19.s\n"
+    "fadd z4.s, z4.s, z18.s\n"
+    "ld1w { z29.s }, p2/Z, [x23, x28, LSL #2]\n"
+    "fadd z3.s, z3.s, z17.s\n"
+    "fadd z2.s, z2.s, z16.s\n"
+    "ld1w { z22.s }, p2/Z, [x22, x28, LSL #2]\n"
+    "ld1w { z28.s }, p2/Z, [x21, x28, LSL #2]\n"
+    "ld1w { z18.s }, p2/Z, [x20, x28, LSL #2]\n"
+    "ld1w { z27.s }, p1/Z, [x23, x27, LSL #2]\n"
+    "ld1w { z21.s }, p1/Z, [x22, x27, LSL #2]\n"
+    "ld1w { z26.s }, p1/Z, [x21, x27, LSL #2]\n"
+    "ld1w { z17.s }, p1/Z, [x20, x27, LSL #2]\n"
+    "ld1w { z25.s }, p0/Z, [x23, x26, LSL #2]\n"
+    "ld1w { z20.s }, p0/Z, [x22, x26, LSL #2]\n"
+    "ld1w { z24.s }, p0/Z, [x21, x26, LSL #2]\n"
+    "ld1w { z16.s }, p0/Z, [x20, x26, LSL #2]\n"
+    "bgt 2b\n"
+    "3:"  // 4-vectors of channels: 4 inputs tail
+    "fadd z23.s, z1.s, z0.s\n"
+    "fadd z19.s, z31.s, z30.s\n"
+    "fadd z22.s, z29.s, z22.s\n"
+    "fadd z18.s, z28.s, z18.s\n"
+    "fadd z21.s, z27.s, z21.s\n"
+    "fadd z17.s, z26.s, z17.s\n"
+    "fadd z20.s, z25.s, z20.s\n"
+    "fadd z16.s, z24.s, z16.s\n"
+    "fadd z19.s, z23.s, z19.s\n"
+    "fadd z18.s, z22.s, z18.s\n"
+    "fadd z17.s, z21.s, z17.s\n"
+    "fadd z16.s, z20.s, z16.s\n"
+    "fadd z5.s, z5.s, z19.s\n"
+    "fadd z4.s, z4.s, z18.s\n"
+    "fadd z3.s, z3.s, z17.s\n"
+    "fadd z2.s, z2.s, z16.s\n"
+    "4:"  // 4-vectors of channels: After loop
+    "ands x21, %x[n_valid_cells], #0x3\n"  // remaining cells not covered by the 4-at-a-time loop
+    "beq 6f\n"
+    "5:"  // 4-vectors of channels: Single input loop
+    "ldr x20, [x24], #0x8\n"
+    "ld1w { z16.s }, p3/Z, [x20, x9, LSL #2]\n"
+    "subs x21, x21, #0x1\n"
+    "fadd z5.s, z5.s, z16.s\n"
+    "ld1w { z16.s }, p2/Z, [x20, x28, LSL #2]\n"
+    "fadd z4.s, z4.s, z16.s\n"
+    "ld1w { z16.s }, p1/Z, [x20, x27, LSL #2]\n"
+    "fadd z3.s, z3.s, z16.s\n"
+    "ld1w { z16.s }, p0/Z, [x20, x26, LSL #2]\n"
+    "fadd z2.s, z2.s, z16.s\n"
+    "bgt 5b\n"
+    "6:"  // 4-vectors of channels: Single input loop: End
+    "fmul z5.s, z5.s, z6.s\n"  // scale the sums by 1/window_cells before storing
+    "fmul z4.s, z4.s, z6.s\n"
+    "st1w { z5.s }, p3, [%x[outptr], x9, LSL #2]\n"
+    "incw x9, ALL, MUL #4\n"
+    "fmul z3.s, z3.s, z6.s\n"
+    "fmul z2.s, z2.s, z6.s\n"
+    "st1w { z4.s }, p2, [%x[outptr], x28, LSL #2]\n"
+    "incw x28, ALL, MUL #4\n"
+    "st1w { z3.s }, p1, [%x[outptr], x27, LSL #2]\n"
+    "incw x27, ALL, MUL #4\n"
+    "st1w { z2.s }, p0, [%x[outptr], x26, LSL #2]\n"
+    "incw x26, ALL, MUL #4\n"
+    "whilelt p0.s, x26, %x[n_channels]\n"
+    "b.any 1b\n"
+    "7:"  // Single vector of channels
+    "whilelt p3.s, x9, %x[n_channels]\n"
+    "b.none 14f\n"
+    "8:"  // Single vector of channels: Loop
+    "lsr x25, %x[n_valid_cells], #0x2\n"
+    "mov z5.b, #0x0\n"  // zero accumulator for the single-vector tail
+    "mov x24, %x[inptrs]\n"
+    "cbz x25, 11f\n"
+    "ldp x20, x22, [x24, #0x0]\n"
+    "subs x25, x25, #0x1\n"
+    "ld1w { z1.s }, p3/Z, [x20, x9, LSL #2]\n"
+    "ldp x21, x20, [x24, #0x10]\n"
+    "add x24, x24, #0x20\n"
+    "ld1w { z0.s }, p3/Z, [x22, x9, LSL #2]\n"
+    "ld1w { z31.s }, p3/Z, [x21, x9, LSL #2]\n"
+    "ld1w { z30.s }, p3/Z, [x20, x9, LSL #2]\n"
+    "beq 10f\n"
+    "9:"  // Single vector of channels: Loop: 4 inputs loop
+    "fadd z17.s, z1.s, z0.s\n"
+    "fadd z16.s, z31.s, z30.s\n"
+    "ldp x23, x22, [x24, #0x0]\n"
+    "subs x25, x25, #0x1\n"
+    "fadd z16.s, z17.s, z16.s\n"
+    "ldp x21, x20, [x24, #0x10]\n"
+    "fadd z5.s, z5.s, z16.s\n"
+    "add x24, x24, #0x20\n"
+    "ld1w { z1.s }, p3/Z, [x23, x9, LSL #2]\n"
+    "ld1w { z0.s }, p3/Z, [x22, x9, LSL #2]\n"
+    "ld1w { z31.s }, p3/Z, [x21, x9, LSL #2]\n"
+    "ld1w { z30.s }, p3/Z, [x20, x9, LSL #2]\n"
+    "bgt 9b\n"
+    "10:"  // Single vector of channels: Loop: 4 inputs tail
+    "fadd z17.s, z1.s, z0.s\n"
+    "fadd z16.s, z31.s, z30.s\n"
+    "fadd z16.s, z17.s, z16.s\n"
+    "fadd z5.s, z5.s, z16.s\n"
+    "11:"  // Single vector of channels: Loop: After loop
+    "ands x21, %x[n_valid_cells], #0x3\n"
+    "beq 13f\n"
+    "12:"  // Single vector of channels: Loop: Single input loop
+    "ldr x20, [x24], #0x8\n"
+    "ld1w { z16.s }, p3/Z, [x20, x9, LSL #2]\n"
+    "subs x21, x21, #0x1\n"
+    "fadd z5.s, z5.s, z16.s\n"
+    "bgt 12b\n"
+    "13:"  // Single vector of channels: Loop: Single input loop: End
+    "fmul z5.s, z5.s, z6.s\n"
+    "st1w { z5.s }, p3, [%x[outptr], x9, LSL #2]\n"
+    "incw x9\n"
+    "whilelt p3.s, x9, %x[n_channels]\n"
+    "b.any 8b\n"
+    "14:"  // End
+    ".inst 0xd503467f  // SMSTOP\n"
+    :
+    : [inptrs] "r" (inptrs), [n_channels] "r" (n_channels), [n_valid_cells] "r" (n_valid_cells), [outptr] "r" (outptr), [rescale_ptr] "r" (&rescale_value)
+    : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+  );
+}
+
+} // namespace pooling
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp32_nhwc_max_2x2_s1_output2x2_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp32_nhwc_max_2x2_s1_output2x2_depthfirst.hpp
new file mode 100644
index 0000000000..338348231f
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp32_nhwc_max_2x2_s1_output2x2_depthfirst.hpp
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#if defined(ARM_COMPUTE_ENABLE_SME)
+
+namespace arm_conv {
+namespace pooling {
+
+void sme_fp32_nhwc_max_2x2_s1_output2x2_depthfirst_impl(unsigned int, const float *const *const, float *const *const, bool, unsigned int, unsigned int, unsigned int, unsigned int);
+
+struct sme_fp32_nhwc_max_2x2_s1_output2x2_depthfirst : public DepthfirstStrategy<float, float>
+{
+ using Parent = DepthfirstStrategy<float, float>;
+
+ const static auto pooling_type = PoolingType::MAX;
+ const static auto pool_rows = 2u, pool_cols = 2u;
+ const static auto stride_rows = 1u, stride_cols = 1u;
+
+ sme_fp32_nhwc_max_2x2_s1_output2x2_depthfirst(const CPUInfo *)
+ : Parent(pool_rows, pool_cols, stride_rows, stride_cols, 2, 2) {}
+
+ Parent::KernelType get_kernel(void) const { return sme_fp32_nhwc_max_2x2_s1_output2x2_depthfirst_impl; }
+};
+
+} // namespace pooling
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp32_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp32_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp
new file mode 100644
index 0000000000..3c7213a498
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp32_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp
@@ -0,0 +1,148 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(ARM_COMPUTE_ENABLE_SME)
+
+namespace arm_conv {
+namespace pooling {
+
+void sme_fp32_nhwc_max_2x2_s1_output2x2_depthfirst_impl(
+ const unsigned int n_channels,
+ const float *const *const inptrs,
+ float *const *const outptrs,
+ const bool exclude_padding,
+ const unsigned int pad_left,
+ const unsigned int pad_top,
+ const unsigned int pad_right,
+ const unsigned int pad_bottom
+)
+{
+ struct KernelArgs
+ {
+ const uint64_t n_channels;
+ const float *const *const inptrs;
+ float *const *const outptrs;
+ KernelArgs(
+ unsigned int channels,
+ const float *const *input_ptrs,
+ float *const * output_ptrs,
+ bool, unsigned int, unsigned int, unsigned int, unsigned int
+ ) : n_channels(channels),
+ inptrs(input_ptrs),
+ outptrs(output_ptrs)
+ {
+ }
+ };
+
+ const KernelArgs args(n_channels, inptrs, outptrs, exclude_padding,
+ pad_left, pad_top, pad_right, pad_bottom);
+
+ __asm__ __volatile__(
+ "ldr x21, [%x[args], %[offsetof_outptrs]]\n"
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "mov x15, #0x0\n"
+ "ptrue p2.b\n"
+ "ldr x20, [%x[args], %[offsetof_inptrs]]\n"
+ "mov x14, #0x0\n"
+ "ldr x13, [%x[args], %[offsetof_n_channels]]\n"
+ "whilelt p0.s, x15, x13\n"
+ "ldp x12, x11, [x21, #0x0]\n"
+ "ldp x10, x9, [x21, #0x10]\n"
+ "ldp x28, x27, [x20, #0x0]\n"
+ "ld1w { z30.s }, p0/Z, [x27, x15, LSL #2]\n"
+ "ldp x26, x25, [x20, #0x10]\n"
+ "ld1w { z29.s }, p0/Z, [x25, x15, LSL #2]\n"
+ "ldp x24, x23, [x20, #0x20]\n"
+ "ld1w { z28.s }, p0/Z, [x24, x15, LSL #2]\n"
+ "ldp x22, x21, [x20, #0x30]\n"
+ "ld1w { z27.s }, p0/Z, [x21, x15, LSL #2]\n"
+ "ldr x20, [x20, #0x40]\n"
+ "ld1w { z26.s }, p0/Z, [x28, x15, LSL #2]\n"
+ "ld1w { z25.s }, p0/Z, [x26, x15, LSL #2]\n"
+ "ld1w { z24.s }, p0/Z, [x23, x15, LSL #2]\n"
+ "ld1w { z19.s }, p0/Z, [x22, x15, LSL #2]\n"
+ "ld1w { z23.s }, p0/Z, [x20, x15, LSL #2]\n"
+ "incw x15\n"
+ "whilelt p1.s, x15, x13\n"
+ "b.none 2f\n"
+ "1:" // Vector: Loop
+ "movprfx z22, z30\n fmax z22.s, p2/M, z22.s, z28.s\n"
+ "movprfx z21, z28\n fmax z21.s, p2/M, z21.s, z27.s\n"
+ "ld1w { z30.s }, p1/Z, [x27, x15, LSL #2]\n"
+ "whilelt p0.s, x14, x13\n"
+ "movprfx z18, z29\n fmax z18.s, p2/M, z18.s, z26.s\n"
+ "movprfx z17, z25\n fmax z17.s, p2/M, z17.s, z24.s\n"
+ "ld1w { z28.s }, p1/Z, [x24, x15, LSL #2]\n"
+ "movprfx z16, z29\n fmax z16.s, p2/M, z16.s, z19.s\n"
+ "movprfx z20, z24\n fmax z20.s, p2/M, z20.s, z23.s\n"
+ "ld1w { z27.s }, p1/Z, [x21, x15, LSL #2]\n"
+ "ld1w { z29.s }, p1/Z, [x25, x15, LSL #2]\n"
+ "movprfx z19, z22\n fmax z19.s, p2/M, z19.s, z18.s\n"
+ "movprfx z18, z17\n fmax z18.s, p2/M, z18.s, z22.s\n"
+ "ld1w { z26.s }, p1/Z, [x28, x15, LSL #2]\n"
+ "movprfx z17, z16\n fmax z17.s, p2/M, z17.s, z21.s\n"
+ "movprfx z16, z21\n fmax z16.s, p2/M, z16.s, z20.s\n"
+ "ld1w { z25.s }, p1/Z, [x26, x15, LSL #2]\n"
+ "st1w { z19.s }, p0, [x12, x14, LSL #2]\n"
+ "ld1w { z24.s }, p1/Z, [x23, x15, LSL #2]\n"
+ "st1w { z18.s }, p0, [x11, x14, LSL #2]\n"
+ "ld1w { z19.s }, p1/Z, [x22, x15, LSL #2]\n"
+ "st1w { z17.s }, p0, [x10, x14, LSL #2]\n"
+ "ld1w { z23.s }, p1/Z, [x20, x15, LSL #2]\n"
+ "incw x15\n"
+ "whilelt p1.s, x15, x13\n"
+ "st1w { z16.s }, p0, [x9, x14, LSL #2]\n"
+ "incw x14\n"
+ "b.any 1b\n"
+ "2:" // Vector: Tail
+ "movprfx z22, z30\n fmax z22.s, p2/M, z22.s, z28.s\n"
+ "movprfx z21, z28\n fmax z21.s, p2/M, z21.s, z27.s\n"
+ "whilelt p0.s, x14, x13\n"
+ "movprfx z20, z29\n fmax z20.s, p2/M, z20.s, z26.s\n"
+ "movprfx z18, z25\n fmax z18.s, p2/M, z18.s, z24.s\n"
+ "movprfx z17, z29\n fmax z17.s, p2/M, z17.s, z19.s\n"
+ "movprfx z19, z24\n fmax z19.s, p2/M, z19.s, z23.s\n"
+ "movprfx z16, z22\n fmax z16.s, p2/M, z16.s, z20.s\n"
+ "fmax z18.s, p2/M, z18.s, z22.s\n"
+ "st1w { z16.s }, p0, [x12, x14, LSL #2]\n"
+ "fmax z17.s, p2/M, z17.s, z21.s\n"
+ "movprfx z16, z21\n fmax z16.s, p2/M, z16.s, z19.s\n"
+ "st1w { z18.s }, p0, [x11, x14, LSL #2]\n"
+ "st1w { z17.s }, p0, [x10, x14, LSL #2]\n"
+ "st1w { z16.s }, p0, [x9, x14, LSL #2]\n"
+ ".inst 0xd503467f // SMSTOP\n"
+ :
+ : [args] "r" (&args), [offsetof_inptrs] "I" (offsetof(KernelArgs, inptrs)), [offsetof_n_channels] "I" (offsetof(KernelArgs, n_channels)), [offsetof_outptrs] "I" (offsetof(KernelArgs, outptrs))
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace pooling
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp32_nhwc_max_generic_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp32_nhwc_max_generic_depthfirst.hpp
new file mode 100644
index 0000000000..9bc1f11601
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp32_nhwc_max_generic_depthfirst.hpp
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(ARM_COMPUTE_ENABLE_SME)
+
+namespace arm_conv {
+namespace pooling {
+
+void sme_fp32_nhwc_max_generic_depthfirst_impl(const uint64_t, const uint64_t n_valid_cells, uint64_t n_channels, const float *const *const inptrs, float *outptr);
+
+struct sme_fp32_nhwc_max_generic_depthfirst : IGenericDepthfirstStrategy<float, float>
+{
+ using Parent = IGenericDepthfirstStrategy<float, float>;
+ sme_fp32_nhwc_max_generic_depthfirst(const CPUInfo *) {}
+ typename Parent::KernelType get_kernel(void) const override { return sme_fp32_nhwc_max_generic_depthfirst_impl; }
+};
+
+} // namespace pooling
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp32_nhwc_max_generic_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp32_nhwc_max_generic_depthfirst/generic.cpp
new file mode 100644
index 0000000000..0dabc2f292
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp32_nhwc_max_generic_depthfirst/generic.cpp
@@ -0,0 +1,225 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstdint>
+#include <cstddef>
+
+#if defined(ARM_COMPUTE_ENABLE_SME)
+
+namespace arm_conv {
+namespace pooling {
+
+
+void sme_fp32_nhwc_max_generic_depthfirst_impl(
+ const uint64_t,
+ const uint64_t n_valid_cells,
+ uint64_t n_channels,
+ const float *const *const inptrs,
+ float *outptr
+)
+{
+ __asm__ __volatile__(
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "mov x9, #0x0\n"
+ "cntw x28\n"
+ "cntw x27, ALL, MUL #2\n"
+ "cntw x26, ALL, MUL #3\n"
+ "whilelt p4.s, x9, %x[n_channels]\n"
+ "whilelt p3.s, x28, %x[n_channels]\n"
+ "whilelt p2.s, x27, %x[n_channels]\n"
+ "whilelt p1.s, x26, %x[n_channels]\n"
+ "ptrue p0.b\n"
+ "b.none 7f\n"
+ "1:" // 4-vectors of channels
+ "lsr x25, %x[n_valid_cells], #0x2\n"
+ "mov z4.s, #0xff800000\n"
+ "mov z3.s, #0xff800000\n"
+ "mov x24, %x[inptrs]\n"
+ "mov z2.s, #0xff800000\n"
+ "mov z1.s, #0xff800000\n"
+ "cbz x25, 4f\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "subs x25, x25, #0x1\n"
+ "ld1w { z0.s }, p4/Z, [x23, x9, LSL #2]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x24, x24, #0x20\n"
+ "ld1w { z31.s }, p4/Z, [x22, x9, LSL #2]\n"
+ "ld1w { z23.s }, p4/Z, [x21, x9, LSL #2]\n"
+ "ld1w { z30.s }, p4/Z, [x20, x9, LSL #2]\n"
+ "ld1w { z18.s }, p3/Z, [x23, x28, LSL #2]\n"
+ "ld1w { z29.s }, p3/Z, [x22, x28, LSL #2]\n"
+ "ld1w { z22.s }, p3/Z, [x21, x28, LSL #2]\n"
+ "ld1w { z28.s }, p3/Z, [x20, x28, LSL #2]\n"
+ "ld1w { z17.s }, p2/Z, [x23, x27, LSL #2]\n"
+ "ld1w { z27.s }, p2/Z, [x22, x27, LSL #2]\n"
+ "ld1w { z21.s }, p2/Z, [x21, x27, LSL #2]\n"
+ "ld1w { z26.s }, p2/Z, [x20, x27, LSL #2]\n"
+ "ld1w { z16.s }, p1/Z, [x23, x26, LSL #2]\n"
+ "ld1w { z25.s }, p1/Z, [x22, x26, LSL #2]\n"
+ "ld1w { z20.s }, p1/Z, [x21, x26, LSL #2]\n"
+ "ld1w { z24.s }, p1/Z, [x20, x26, LSL #2]\n"
+ "beq 3f\n"
+ "2:" // 4-vectors of channels: 4 inputs loop
+ "movprfx z19, z0\n fmax z19.s, p0/M, z19.s, z31.s\n"
+ "fmax z23.s, p0/M, z23.s, z30.s\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "subs x25, x25, #0x1\n"
+ "fmax z18.s, p0/M, z18.s, z29.s\n"
+ "fmax z22.s, p0/M, z22.s, z28.s\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x24, x24, #0x20\n"
+ "fmax z17.s, p0/M, z17.s, z27.s\n"
+ "fmax z21.s, p0/M, z21.s, z26.s\n"
+ "ld1w { z0.s }, p4/Z, [x23, x9, LSL #2]\n"
+ "fmax z16.s, p0/M, z16.s, z25.s\n"
+ "fmax z20.s, p0/M, z20.s, z24.s\n"
+ "ld1w { z31.s }, p4/Z, [x22, x9, LSL #2]\n"
+ "fmax z19.s, p0/M, z19.s, z23.s\n"
+ "fmax z18.s, p0/M, z18.s, z22.s\n"
+ "ld1w { z23.s }, p4/Z, [x21, x9, LSL #2]\n"
+ "fmax z17.s, p0/M, z17.s, z21.s\n"
+ "fmax z16.s, p0/M, z16.s, z20.s\n"
+ "ld1w { z30.s }, p4/Z, [x20, x9, LSL #2]\n"
+ "fmax z4.s, p0/M, z4.s, z19.s\n"
+ "fmax z3.s, p0/M, z3.s, z18.s\n"
+ "ld1w { z18.s }, p3/Z, [x23, x28, LSL #2]\n"
+ "fmax z2.s, p0/M, z2.s, z17.s\n"
+ "fmax z1.s, p0/M, z1.s, z16.s\n"
+ "ld1w { z29.s }, p3/Z, [x22, x28, LSL #2]\n"
+ "ld1w { z22.s }, p3/Z, [x21, x28, LSL #2]\n"
+ "ld1w { z28.s }, p3/Z, [x20, x28, LSL #2]\n"
+ "ld1w { z17.s }, p2/Z, [x23, x27, LSL #2]\n"
+ "ld1w { z27.s }, p2/Z, [x22, x27, LSL #2]\n"
+ "ld1w { z21.s }, p2/Z, [x21, x27, LSL #2]\n"
+ "ld1w { z26.s }, p2/Z, [x20, x27, LSL #2]\n"
+ "ld1w { z16.s }, p1/Z, [x23, x26, LSL #2]\n"
+ "ld1w { z25.s }, p1/Z, [x22, x26, LSL #2]\n"
+ "ld1w { z20.s }, p1/Z, [x21, x26, LSL #2]\n"
+ "ld1w { z24.s }, p1/Z, [x20, x26, LSL #2]\n"
+ "bgt 2b\n"
+ "3:" // 4-vectors of channels: 4 inputs tail
+ "movprfx z19, z0\n fmax z19.s, p0/M, z19.s, z31.s\n"
+ "fmax z23.s, p0/M, z23.s, z30.s\n"
+ "fmax z18.s, p0/M, z18.s, z29.s\n"
+ "fmax z22.s, p0/M, z22.s, z28.s\n"
+ "fmax z17.s, p0/M, z17.s, z27.s\n"
+ "fmax z21.s, p0/M, z21.s, z26.s\n"
+ "fmax z16.s, p0/M, z16.s, z25.s\n"
+ "fmax z20.s, p0/M, z20.s, z24.s\n"
+ "fmax z19.s, p0/M, z19.s, z23.s\n"
+ "fmax z18.s, p0/M, z18.s, z22.s\n"
+ "fmax z17.s, p0/M, z17.s, z21.s\n"
+ "fmax z16.s, p0/M, z16.s, z20.s\n"
+ "fmax z4.s, p0/M, z4.s, z19.s\n"
+ "fmax z3.s, p0/M, z3.s, z18.s\n"
+ "fmax z2.s, p0/M, z2.s, z17.s\n"
+ "fmax z1.s, p0/M, z1.s, z16.s\n"
+ "4:" // 4-vectors of channels: After loop
+ "ands x21, %x[n_valid_cells], #0x3\n"
+ "beq 6f\n"
+ "5:" // 4-vectors of channels: Single input loop
+ "ldr x20, [x24], #0x8\n"
+ "ld1w { z16.s }, p4/Z, [x20, x9, LSL #2]\n"
+ "subs x21, x21, #0x1\n"
+ "fmax z4.s, p0/M, z4.s, z16.s\n"
+ "ld1w { z16.s }, p3/Z, [x20, x28, LSL #2]\n"
+ "fmax z3.s, p0/M, z3.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x20, x27, LSL #2]\n"
+ "fmax z2.s, p0/M, z2.s, z16.s\n"
+ "ld1w { z16.s }, p1/Z, [x20, x26, LSL #2]\n"
+ "fmax z1.s, p0/M, z1.s, z16.s\n"
+ "bgt 5b\n"
+ "6:" // 4-vectors of channels: Single input loop: End
+ "st1w { z4.s }, p4, [%x[outptr], x9, LSL #2]\n"
+ "incw x9, ALL, MUL #4\n"
+ "st1w { z3.s }, p3, [%x[outptr], x28, LSL #2]\n"
+ "incw x28, ALL, MUL #4\n"
+ "st1w { z2.s }, p2, [%x[outptr], x27, LSL #2]\n"
+ "incw x27, ALL, MUL #4\n"
+ "st1w { z1.s }, p1, [%x[outptr], x26, LSL #2]\n"
+ "incw x26, ALL, MUL #4\n"
+ "whilelt p1.s, x26, %x[n_channels]\n"
+ "b.any 1b\n"
+ "7:" // Single vector of channels
+ "whilelt p4.s, x9, %x[n_channels]\n"
+ "b.none 14f\n"
+ "8:" // Single vector of channels: Loop
+ "lsr x25, %x[n_valid_cells], #0x2\n"
+ "mov z4.s, #0xff800000\n"
+ "mov x24, %x[inptrs]\n"
+ "cbz x25, 11f\n"
+ "ldp x20, x22, [x24, #0x0]\n"
+ "subs x25, x25, #0x1\n"
+ "ld1w { z0.s }, p4/Z, [x20, x9, LSL #2]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x24, x24, #0x20\n"
+ "ld1w { z31.s }, p4/Z, [x22, x9, LSL #2]\n"
+ "ld1w { z23.s }, p4/Z, [x21, x9, LSL #2]\n"
+ "ld1w { z30.s }, p4/Z, [x20, x9, LSL #2]\n"
+ "beq 10f\n"
+ "9:" // Single vector of channels: Loop: 4 inputs loop
+ "movprfx z16, z0\n fmax z16.s, p0/M, z16.s, z31.s\n"
+ "movprfx z17, z23\n fmax z17.s, p0/M, z17.s, z30.s\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "subs x25, x25, #0x1\n"
+ "fmax z16.s, p0/M, z16.s, z17.s\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "fmax z4.s, p0/M, z4.s, z16.s\n"
+ "add x24, x24, #0x20\n"
+ "ld1w { z0.s }, p4/Z, [x23, x9, LSL #2]\n"
+ "ld1w { z31.s }, p4/Z, [x22, x9, LSL #2]\n"
+ "ld1w { z23.s }, p4/Z, [x21, x9, LSL #2]\n"
+ "ld1w { z30.s }, p4/Z, [x20, x9, LSL #2]\n"
+ "bgt 9b\n"
+ "10:" // Single vector of channels: Loop: 4 inputs tail
+ "movprfx z16, z0\n fmax z16.s, p0/M, z16.s, z31.s\n"
+ "movprfx z17, z23\n fmax z17.s, p0/M, z17.s, z30.s\n"
+ "fmax z16.s, p0/M, z16.s, z17.s\n"
+ "fmax z4.s, p0/M, z4.s, z16.s\n"
+ "11:" // Single vector of channels: Loop: After loop
+ "ands x21, %x[n_valid_cells], #0x3\n"
+ "beq 13f\n"
+ "12:" // Single vector of channels: Loop: Single input loop
+ "ldr x20, [x24], #0x8\n"
+ "ld1w { z16.s }, p4/Z, [x20, x9, LSL #2]\n"
+ "subs x21, x21, #0x1\n"
+ "fmax z4.s, p0/M, z4.s, z16.s\n"
+ "bgt 12b\n"
+ "13:" // Single vector of channels: Loop: Single input loop: End
+ "st1w { z4.s }, p4, [%x[outptr], x9, LSL #2]\n"
+ "incw x9\n"
+ "whilelt p4.s, x9, %x[n_channels]\n"
+ "b.any 8b\n"
+ "14:" // End
+ ".inst 0xd503467f // SMSTOP\n"
+ :
+ : [inptrs] "r" (inptrs), [n_channels] "r" (n_channels), [n_valid_cells] "r" (n_valid_cells), [outptr] "r" (outptr)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace pooling
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8_nhwc_avg_generic_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8_nhwc_avg_generic_depthfirst.hpp
new file mode 100644
index 0000000000..318510e697
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8_nhwc_avg_generic_depthfirst.hpp
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(ARM_COMPUTE_ENABLE_SME)
+
+namespace arm_conv {
+namespace pooling {
+
+void sme_s8_nhwc_avg_generic_depthfirst_impl(const uint64_t window_cells, const uint64_t n_valid_cells, uint64_t n_channels, const int8_t *const *const inptrs, int8_t *outptr);
+
+struct sme_s8_nhwc_avg_generic_depthfirst : IGenericDepthfirstStrategy<int8_t, int8_t>
+{
+ using Parent = IGenericDepthfirstStrategy<int8_t, int8_t>;
+ sme_s8_nhwc_avg_generic_depthfirst(const CPUInfo *) {}
+ typename Parent::KernelType get_kernel(void) const override { return sme_s8_nhwc_avg_generic_depthfirst_impl; }
+};
+
+} // namespace pooling
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8_nhwc_avg_generic_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8_nhwc_avg_generic_depthfirst/generic.cpp
new file mode 100644
index 0000000000..c24e977dc6
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8_nhwc_avg_generic_depthfirst/generic.cpp
@@ -0,0 +1,419 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstdint>
+#include <cstddef>
+#include <cstring>
+#include <cmath>
+
+
+#if defined(ARM_COMPUTE_ENABLE_SME)
+
+namespace arm_conv {
+namespace pooling {
+
+namespace {
+ struct RescaleParams
+ {
+ int32_t multiplier, shift;
+ };
+
+ constexpr RescaleParams rescale_params[8] = {
+ {0x40000000, -0}, // 1/2
+ {0x55555556, -1}, // 1/3
+ {0x40000000, -1}, // 1/4
+ {0x66666666, -2}, // 1/5
+ {0x55555556, -2}, // 1/6
+ {0x49249249, -2}, // 1/7
+ {0x40000000, -2}, // 1/8
+ {0x71c71c72, -3}, // 1/9
+ };
+}
+
+void sme_s8_nhwc_avg_generic_depthfirst_impl(
+ const uint64_t window_cells,
+ const uint64_t n_valid_cells,
+ uint64_t n_channels,
+ const int8_t *const *const inptrs,
+ int8_t *outptr
+)
+{
+ if (n_valid_cells == 1 && window_cells == 1)
+ {
+ // In this case, simply copy from the input to the output
+ std::memcpy(outptr, *inptrs, n_channels);
+ return;
+ }
+
+ // Compute (or look up) the rescale values
+ int32_t shift_value = 0, rescale_value = 0;
+ if (2 <= window_cells && window_cells <= 9)
+ {
+ auto &params = rescale_params[window_cells - 2];
+ rescale_value = params.multiplier;
+ shift_value = params.shift;
+ }
+ else
+ {
+ auto f_rescale_value = 1.0f / static_cast<float>(window_cells);
+
+ shift_value = 0;
+ while (f_rescale_value < 0.5f)
+ {
+ shift_value--;
+ f_rescale_value *= 2.0f;
+ }
+
+ int64_t long_rescale_value = round(f_rescale_value * static_cast<float>(1ll << 31));
+ if (long_rescale_value == (1ll << 31))
+ {
+ shift_value++;
+ long_rescale_value >>= 1;
+ }
+ rescale_value = static_cast<int32_t>(long_rescale_value);
+ }
+
+ __asm__ __volatile__(
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "mov x27, #0x0\n"
+ "cntb x26\n"
+ "cntb x25, ALL, MUL #2\n"
+ "cntb x24, ALL, MUL #3\n"
+ "whilelt p4.b, x27, %x[n_channels]\n"
+ "whilelt p3.b, x26, %x[n_channels]\n"
+ "whilelt p2.b, x25, %x[n_channels]\n"
+ "whilelt p1.b, x24, %x[n_channels]\n"
+ "ptrue p0.b\n"
+ "b.none 7f\n"
+ "1:" // 4-vectors of channels
+ "lsr x23, %x[n_valid_cells], #0x1\n"
+ "mov z15.s, #0x0\n"
+ "mov z14.s, #0x0\n"
+ "mov x22, %x[inptrs]\n"
+ "mov z13.s, #0x0\n"
+ "mov z12.s, #0x0\n"
+ "mov z11.s, #0x0\n"
+ "mov z10.s, #0x0\n"
+ "mov z9.s, #0x0\n"
+ "mov z8.s, #0x0\n"
+ "mov z7.s, #0x0\n"
+ "mov z6.s, #0x0\n"
+ "mov z5.s, #0x0\n"
+ "mov z4.s, #0x0\n"
+ "mov z3.s, #0x0\n"
+ "mov z2.s, #0x0\n"
+ "mov z1.s, #0x0\n"
+ "mov z0.s, #0x0\n"
+ "cbz x23, 4f\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "subs x23, x23, #0x1\n"
+ "add x22, x22, #0x10\n"
+ "ld1b { z31.b }, p4/Z, [x21, x27]\n"
+ "ld1b { z30.b }, p4/Z, [x20, x27]\n"
+ "ld1b { z29.b }, p3/Z, [x21, x26]\n"
+ "ld1b { z28.b }, p3/Z, [x20, x26]\n"
+ "ld1b { z27.b }, p2/Z, [x21, x25]\n"
+ "ld1b { z26.b }, p2/Z, [x20, x25]\n"
+ "ld1b { z25.b }, p1/Z, [x21, x24]\n"
+ "ld1b { z24.b }, p1/Z, [x20, x24]\n"
+ "beq 3f\n"
+ "2:" // 4-vectors of channels: 2 inputs loop
+ ".inst 0x455e03f7 // saddlb z23.h, z31.b, z30.b\n"
+ ".inst 0x455e07f6 // saddlt z22.h, z31.b, z30.b\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "subs x23, x23, #0x1\n"
+ ".inst 0x455c03b5 // saddlb z21.h, z29.b, z28.b\n"
+ ".inst 0x455c07b4 // saddlt z20.h, z29.b, z28.b\n"
+ "add x22, x22, #0x10\n"
+ "ld1b { z31.b }, p4/Z, [x21, x27]\n"
+ ".inst 0x455a0373 // saddlb z19.h, z27.b, z26.b\n"
+ ".inst 0x455a0772 // saddlt z18.h, z27.b, z26.b\n"
+ "ld1b { z30.b }, p4/Z, [x20, x27]\n"
+ ".inst 0x45580331 // saddlb z17.h, z25.b, z24.b\n"
+ ".inst 0x45580730 // saddlt z16.h, z25.b, z24.b\n"
+ "ld1b { z29.b }, p3/Z, [x21, x26]\n"
+ ".inst 0x459741ef // saddwb z15.s, z15.s, z23.h\n"
+ ".inst 0x459745ce // saddwt z14.s, z14.s, z23.h\n"
+ "ld1b { z28.b }, p3/Z, [x20, x26]\n"
+ ".inst 0x459641ad // saddwb z13.s, z13.s, z22.h\n"
+ ".inst 0x4596458c // saddwt z12.s, z12.s, z22.h\n"
+ "ld1b { z27.b }, p2/Z, [x21, x25]\n"
+ ".inst 0x4595416b // saddwb z11.s, z11.s, z21.h\n"
+ ".inst 0x4595454a // saddwt z10.s, z10.s, z21.h\n"
+ "ld1b { z26.b }, p2/Z, [x20, x25]\n"
+ ".inst 0x45944129 // saddwb z9.s, z9.s, z20.h\n"
+ ".inst 0x45944508 // saddwt z8.s, z8.s, z20.h\n"
+ "ld1b { z25.b }, p1/Z, [x21, x24]\n"
+ ".inst 0x459340e7 // saddwb z7.s, z7.s, z19.h\n"
+ ".inst 0x459344c6 // saddwt z6.s, z6.s, z19.h\n"
+ "ld1b { z24.b }, p1/Z, [x20, x24]\n"
+ ".inst 0x459240a5 // saddwb z5.s, z5.s, z18.h\n"
+ ".inst 0x45924484 // saddwt z4.s, z4.s, z18.h\n"
+ ".inst 0x45914063 // saddwb z3.s, z3.s, z17.h\n"
+ ".inst 0x45914442 // saddwt z2.s, z2.s, z17.h\n"
+ ".inst 0x45904021 // saddwb z1.s, z1.s, z16.h\n"
+ ".inst 0x45904400 // saddwt z0.s, z0.s, z16.h\n"
+ "bgt 2b\n"
+ "3:" // 4-vectors of channels: 2 inputs tail
+ ".inst 0x455e03f7 // saddlb z23.h, z31.b, z30.b\n"
+ ".inst 0x455e07f6 // saddlt z22.h, z31.b, z30.b\n"
+ ".inst 0x455c03b5 // saddlb z21.h, z29.b, z28.b\n"
+ ".inst 0x455c07b4 // saddlt z20.h, z29.b, z28.b\n"
+ ".inst 0x455a0373 // saddlb z19.h, z27.b, z26.b\n"
+ ".inst 0x455a0772 // saddlt z18.h, z27.b, z26.b\n"
+ ".inst 0x45580331 // saddlb z17.h, z25.b, z24.b\n"
+ ".inst 0x45580730 // saddlt z16.h, z25.b, z24.b\n"
+ ".inst 0x459741ef // saddwb z15.s, z15.s, z23.h\n"
+ ".inst 0x459745ce // saddwt z14.s, z14.s, z23.h\n"
+ ".inst 0x459641ad // saddwb z13.s, z13.s, z22.h\n"
+ ".inst 0x4596458c // saddwt z12.s, z12.s, z22.h\n"
+ ".inst 0x4595416b // saddwb z11.s, z11.s, z21.h\n"
+ ".inst 0x4595454a // saddwt z10.s, z10.s, z21.h\n"
+ ".inst 0x45944129 // saddwb z9.s, z9.s, z20.h\n"
+ ".inst 0x45944508 // saddwt z8.s, z8.s, z20.h\n"
+ ".inst 0x459340e7 // saddwb z7.s, z7.s, z19.h\n"
+ ".inst 0x459344c6 // saddwt z6.s, z6.s, z19.h\n"
+ ".inst 0x459240a5 // saddwb z5.s, z5.s, z18.h\n"
+ ".inst 0x45924484 // saddwt z4.s, z4.s, z18.h\n"
+ ".inst 0x45914063 // saddwb z3.s, z3.s, z17.h\n"
+ ".inst 0x45914442 // saddwt z2.s, z2.s, z17.h\n"
+ ".inst 0x45904021 // saddwb z1.s, z1.s, z16.h\n"
+ ".inst 0x45904400 // saddwt z0.s, z0.s, z16.h\n"
+ "4:" // 4-vectors of channels: After loop
+ "ands x21, %x[n_valid_cells], #0x1\n"
+ "beq 6f\n"
+ "5:" // 4-vectors of channels: Single input loop
+ "ldr x20, [x22], #0x8\n"
+ "ld1b { z16.b }, p4/Z, [x20, x27]\n"
+ ".inst 0x4508a217 // sshllb z23.h, z16.b, #0x0\n"
+ ".inst 0x4508a616 // sshllt z22.h, z16.b, #0x0\n"
+ "ld1b { z16.b }, p3/Z, [x20, x26]\n"
+ ".inst 0x4508a215 // sshllb z21.h, z16.b, #0x0\n"
+ ".inst 0x4508a614 // sshllt z20.h, z16.b, #0x0\n"
+ "subs x21, x21, #0x1\n"
+ "ld1b { z16.b }, p2/Z, [x20, x25]\n"
+ ".inst 0x4508a213 // sshllb z19.h, z16.b, #0x0\n"
+ ".inst 0x4508a612 // sshllt z18.h, z16.b, #0x0\n"
+ "ld1b { z16.b }, p1/Z, [x20, x24]\n"
+ ".inst 0x4508a211 // sshllb z17.h, z16.b, #0x0\n"
+ ".inst 0x4508a610 // sshllt z16.h, z16.b, #0x0\n"
+ ".inst 0x459741ef // saddwb z15.s, z15.s, z23.h\n"
+ ".inst 0x459745ce // saddwt z14.s, z14.s, z23.h\n"
+ ".inst 0x459641ad // saddwb z13.s, z13.s, z22.h\n"
+ ".inst 0x4596458c // saddwt z12.s, z12.s, z22.h\n"
+ ".inst 0x4595416b // saddwb z11.s, z11.s, z21.h\n"
+ ".inst 0x4595454a // saddwt z10.s, z10.s, z21.h\n"
+ ".inst 0x45944129 // saddwb z9.s, z9.s, z20.h\n"
+ ".inst 0x45944508 // saddwt z8.s, z8.s, z20.h\n"
+ ".inst 0x459340e7 // saddwb z7.s, z7.s, z19.h\n"
+ ".inst 0x459344c6 // saddwt z6.s, z6.s, z19.h\n"
+ ".inst 0x459240a5 // saddwb z5.s, z5.s, z18.h\n"
+ ".inst 0x45924484 // saddwt z4.s, z4.s, z18.h\n"
+ ".inst 0x45914063 // saddwb z3.s, z3.s, z17.h\n"
+ ".inst 0x45914442 // saddwt z2.s, z2.s, z17.h\n"
+ ".inst 0x45904021 // saddwb z1.s, z1.s, z16.h\n"
+ ".inst 0x45904400 // saddwt z0.s, z0.s, z16.h\n"
+ "bgt 5b\n"
+ "6:" // 4-vectors of channels: Single input loop: End
+ "ld1rw { z17.s }, p0/Z, [%x[rescale_ptr]]\n"
+ ".inst 0x04b175ef // sqdmulh z15.s, z15.s, z17.s\n"
+ ".inst 0x04b175ce // sqdmulh z14.s, z14.s, z17.s\n"
+ ".inst 0x04b175ad // sqdmulh z13.s, z13.s, z17.s\n"
+ ".inst 0x04b1758c // sqdmulh z12.s, z12.s, z17.s\n"
+ "ld1rw { z16.s }, p0/Z, [%x[shift_ptr]]\n"
+ ".inst 0x04b1756b // sqdmulh z11.s, z11.s, z17.s\n"
+ ".inst 0x04b1754a // sqdmulh z10.s, z10.s, z17.s\n"
+ ".inst 0x04b17529 // sqdmulh z9.s, z9.s, z17.s\n"
+ ".inst 0x04b17508 // sqdmulh z8.s, z8.s, z17.s\n"
+ ".inst 0x04b174e7 // sqdmulh z7.s, z7.s, z17.s\n"
+ ".inst 0x04b174c6 // sqdmulh z6.s, z6.s, z17.s\n"
+ ".inst 0x04b174a5 // sqdmulh z5.s, z5.s, z17.s\n"
+ ".inst 0x04b17484 // sqdmulh z4.s, z4.s, z17.s\n"
+ ".inst 0x04b17463 // sqdmulh z3.s, z3.s, z17.s\n"
+ ".inst 0x04b17442 // sqdmulh z2.s, z2.s, z17.s\n"
+ ".inst 0x04b17421 // sqdmulh z1.s, z1.s, z17.s\n"
+ ".inst 0x04b17400 // sqdmulh z0.s, z0.s, z17.s\n"
+ "mov z19.s, #0x7f\n"
+ ".inst 0x4482820f // srshl z15.s, p0/M, z15.s, z16.s\n"
+ ".inst 0x4482820e // srshl z14.s, p0/M, z14.s, z16.s\n"
+ ".inst 0x4482820d // srshl z13.s, p0/M, z13.s, z16.s\n"
+ ".inst 0x4482820c // srshl z12.s, p0/M, z12.s, z16.s\n"
+ ".inst 0x4482820b // srshl z11.s, p0/M, z11.s, z16.s\n"
+ ".inst 0x4482820a // srshl z10.s, p0/M, z10.s, z16.s\n"
+ ".inst 0x44828209 // srshl z9.s, p0/M, z9.s, z16.s\n"
+ ".inst 0x44828208 // srshl z8.s, p0/M, z8.s, z16.s\n"
+ ".inst 0x44828207 // srshl z7.s, p0/M, z7.s, z16.s\n"
+ ".inst 0x44828206 // srshl z6.s, p0/M, z6.s, z16.s\n"
+ ".inst 0x44828205 // srshl z5.s, p0/M, z5.s, z16.s\n"
+ ".inst 0x44828204 // srshl z4.s, p0/M, z4.s, z16.s\n"
+ ".inst 0x44828203 // srshl z3.s, p0/M, z3.s, z16.s\n"
+ ".inst 0x44828202 // srshl z2.s, p0/M, z2.s, z16.s\n"
+ ".inst 0x44828201 // srshl z1.s, p0/M, z1.s, z16.s\n"
+ ".inst 0x44828200 // srshl z0.s, p0/M, z0.s, z16.s\n"
+ "not z16.s, p0/M, z19.s\n"
+ "smax z15.s, p0/M, z15.s, z16.s\n"
+ "smax z14.s, p0/M, z14.s, z16.s\n"
+ "smax z13.s, p0/M, z13.s, z16.s\n"
+ "smax z12.s, p0/M, z12.s, z16.s\n"
+ "smax z11.s, p0/M, z11.s, z16.s\n"
+ "smax z10.s, p0/M, z10.s, z16.s\n"
+ "smax z9.s, p0/M, z9.s, z16.s\n"
+ "smax z8.s, p0/M, z8.s, z16.s\n"
+ "smax z7.s, p0/M, z7.s, z16.s\n"
+ "smax z6.s, p0/M, z6.s, z16.s\n"
+ "smax z5.s, p0/M, z5.s, z16.s\n"
+ "smax z4.s, p0/M, z4.s, z16.s\n"
+ "smax z3.s, p0/M, z3.s, z16.s\n"
+ "smax z2.s, p0/M, z2.s, z16.s\n"
+ "smax z1.s, p0/M, z1.s, z16.s\n"
+ "smax z0.s, p0/M, z0.s, z16.s\n"
+ "smin z15.s, p0/M, z15.s, z19.s\n"
+ "smin z14.s, p0/M, z14.s, z19.s\n"
+ "trn1 z23.h, z15.h, z14.h\n"
+ "smin z13.s, p0/M, z13.s, z19.s\n"
+ "smin z12.s, p0/M, z12.s, z19.s\n"
+ "trn1 z16.h, z13.h, z12.h\n"
+ "smin z11.s, p0/M, z11.s, z19.s\n"
+ "smin z10.s, p0/M, z10.s, z19.s\n"
+ "trn1 z22.h, z11.h, z10.h\n"
+ "smin z9.s, p0/M, z9.s, z19.s\n"
+ "smin z8.s, p0/M, z8.s, z19.s\n"
+ "trn1 z18.h, z9.h, z8.h\n"
+ "smin z7.s, p0/M, z7.s, z19.s\n"
+ "smin z6.s, p0/M, z6.s, z19.s\n"
+ "trn1 z21.h, z7.h, z6.h\n"
+ "smin z5.s, p0/M, z5.s, z19.s\n"
+ "smin z4.s, p0/M, z4.s, z19.s\n"
+ "trn1 z17.h, z5.h, z4.h\n"
+ "smin z3.s, p0/M, z3.s, z19.s\n"
+ "smin z2.s, p0/M, z2.s, z19.s\n"
+ "trn1 z20.h, z3.h, z2.h\n"
+ "smin z1.s, p0/M, z1.s, z19.s\n"
+ "smin z0.s, p0/M, z0.s, z19.s\n"
+ "trn1 z19.h, z1.h, z0.h\n"
+ "trn1 z16.b, z23.b, z16.b\n"
+ "trn1 z18.b, z22.b, z18.b\n"
+ "st1b { z16.b }, p4, [%x[outptr], x27]\n"
+ "incb x27, ALL, MUL #4\n"
+ "trn1 z17.b, z21.b, z17.b\n"
+ "trn1 z16.b, z20.b, z19.b\n"
+ "st1b { z18.b }, p3, [%x[outptr], x26]\n"
+ "incb x26, ALL, MUL #4\n"
+ "st1b { z17.b }, p2, [%x[outptr], x25]\n"
+ "incb x25, ALL, MUL #4\n"
+ "st1b { z16.b }, p1, [%x[outptr], x24]\n"
+ "incb x24, ALL, MUL #4\n"
+ "whilelt p1.b, x24, %x[n_channels]\n"
+ "b.any 1b\n"
+ "7:" // Single vector of channels
+ "whilelt p4.b, x27, %x[n_channels]\n"
+ "b.none 14f\n"
+ "8:" // Single vector of channels: Loop
+ "lsr x23, %x[n_valid_cells], #0x1\n"
+ "mov z15.s, #0x0\n"
+ "mov z14.s, #0x0\n"
+ "mov x22, %x[inptrs]\n"
+ "mov z13.s, #0x0\n"
+ "mov z12.s, #0x0\n"
+ "cbz x23, 11f\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "subs x23, x23, #0x1\n"
+ "add x22, x22, #0x10\n"
+ "ld1b { z31.b }, p4/Z, [x21, x27]\n"
+ "ld1b { z30.b }, p4/Z, [x20, x27]\n"
+ "beq 10f\n"
+ "9:" // Single vector of channels: Loop: 2 inputs loop
+ ".inst 0x455e03f1 // saddlb z17.h, z31.b, z30.b\n"
+ ".inst 0x455e07f0 // saddlt z16.h, z31.b, z30.b\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "subs x23, x23, #0x1\n"
+ ".inst 0x459141ef // saddwb z15.s, z15.s, z17.h\n"
+ ".inst 0x459145ce // saddwt z14.s, z14.s, z17.h\n"
+ "add x22, x22, #0x10\n"
+ "ld1b { z31.b }, p4/Z, [x21, x27]\n"
+ ".inst 0x459041ad // saddwb z13.s, z13.s, z16.h\n"
+ ".inst 0x4590458c // saddwt z12.s, z12.s, z16.h\n"
+ "ld1b { z30.b }, p4/Z, [x20, x27]\n"
+ "bgt 9b\n"
+ "10:" // Single vector of channels: Loop: 2 inputs tail
+ ".inst 0x455e03f1 // saddlb z17.h, z31.b, z30.b\n"
+ ".inst 0x455e07f0 // saddlt z16.h, z31.b, z30.b\n"
+ ".inst 0x459141ef // saddwb z15.s, z15.s, z17.h\n"
+ ".inst 0x459145ce // saddwt z14.s, z14.s, z17.h\n"
+ ".inst 0x459041ad // saddwb z13.s, z13.s, z16.h\n"
+ ".inst 0x4590458c // saddwt z12.s, z12.s, z16.h\n"
+ "11:" // Single vector of channels: Loop: After loop
+ "ands x21, %x[n_valid_cells], #0x1\n"
+ "beq 13f\n"
+ "12:" // Single vector of channels: Loop: Single input loop
+ "ldr x20, [x22], #0x8\n"
+ "ld1b { z16.b }, p4/Z, [x20, x27]\n"
+ ".inst 0x4508a211 // sshllb z17.h, z16.b, #0x0\n"
+ ".inst 0x4508a610 // sshllt z16.h, z16.b, #0x0\n"
+ "subs x21, x21, #0x1\n"
+ ".inst 0x459141ef // saddwb z15.s, z15.s, z17.h\n"
+ ".inst 0x459145ce // saddwt z14.s, z14.s, z17.h\n"
+ ".inst 0x459041ad // saddwb z13.s, z13.s, z16.h\n"
+ ".inst 0x4590458c // saddwt z12.s, z12.s, z16.h\n"
+ "bgt 12b\n"
+ "13:" // Single vector of channels: Loop: Single input loop: End
+ "ld1rw { z16.s }, p0/Z, [%x[rescale_ptr]]\n"
+ ".inst 0x04b075ef // sqdmulh z15.s, z15.s, z16.s\n"
+ ".inst 0x04b075ce // sqdmulh z14.s, z14.s, z16.s\n"
+ ".inst 0x04b075ad // sqdmulh z13.s, z13.s, z16.s\n"
+ ".inst 0x04b0758c // sqdmulh z12.s, z12.s, z16.s\n"
+ "ld1rw { z16.s }, p0/Z, [%x[shift_ptr]]\n"
+ "mov z18.s, #0x7f\n"
+ ".inst 0x4482820f // srshl z15.s, p0/M, z15.s, z16.s\n"
+ ".inst 0x4482820e // srshl z14.s, p0/M, z14.s, z16.s\n"
+ ".inst 0x4482820d // srshl z13.s, p0/M, z13.s, z16.s\n"
+ ".inst 0x4482820c // srshl z12.s, p0/M, z12.s, z16.s\n"
+ "not z16.s, p0/M, z18.s\n"
+ "smax z15.s, p0/M, z15.s, z16.s\n"
+ "smax z14.s, p0/M, z14.s, z16.s\n"
+ "smax z13.s, p0/M, z13.s, z16.s\n"
+ "smax z12.s, p0/M, z12.s, z16.s\n"
+ "smin z15.s, p0/M, z15.s, z18.s\n"
+ "smin z14.s, p0/M, z14.s, z18.s\n"
+ "trn1 z17.h, z15.h, z14.h\n"
+ "smin z13.s, p0/M, z13.s, z18.s\n"
+ "smin z12.s, p0/M, z12.s, z18.s\n"
+ "trn1 z16.h, z13.h, z12.h\n"
+ "trn1 z16.b, z17.b, z16.b\n"
+ "st1b { z16.b }, p4, [%x[outptr], x27]\n"
+ "incb x27\n"
+ "whilelt p4.b, x27, %x[n_channels]\n"
+ "b.any 8b\n"
+ "14:" // End
+ ".inst 0xd503467f // SMSTOP\n"
+ :
+ : [inptrs] "r" (inptrs), [n_channels] "r" (n_channels), [n_valid_cells] "r" (n_valid_cells), [outptr] "r" (outptr), [rescale_ptr] "r" (&rescale_value), [shift_ptr] "r" (&shift_value)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace pooling
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8_nhwc_max_2x2_s1_output2x2_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8_nhwc_max_2x2_s1_output2x2_depthfirst.hpp
new file mode 100644
index 0000000000..c9a80e6a5b
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8_nhwc_max_2x2_s1_output2x2_depthfirst.hpp
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#if defined(ARM_COMPUTE_ENABLE_SME)
+
+namespace arm_conv {
+namespace pooling {
+
+void sme_s8_nhwc_max_2x2_s1_output2x2_depthfirst_impl(unsigned int, const int8_t *const *const, int8_t *const *const, bool, unsigned int, unsigned int, unsigned int, unsigned int);
+
+struct sme_s8_nhwc_max_2x2_s1_output2x2_depthfirst : public DepthfirstStrategy<int8_t, int8_t>
+{
+ using Parent = DepthfirstStrategy<int8_t, int8_t>;
+
+ const static auto pooling_type = PoolingType::MAX;
+ const static auto pool_rows = 2u, pool_cols = 2u;
+ const static auto stride_rows = 1u, stride_cols = 1u;
+
+ sme_s8_nhwc_max_2x2_s1_output2x2_depthfirst(const CPUInfo *)
+ : Parent(pool_rows, pool_cols, stride_rows, stride_cols, 2, 2) {}
+
+ Parent::KernelType get_kernel(void) const { return sme_s8_nhwc_max_2x2_s1_output2x2_depthfirst_impl; }
+};
+
+} // namespace pooling
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp
new file mode 100644
index 0000000000..96617566a8
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp
@@ -0,0 +1,148 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(ARM_COMPUTE_ENABLE_SME)
+
+namespace arm_conv {
+namespace pooling {
+
+void sme_s8_nhwc_max_2x2_s1_output2x2_depthfirst_impl(
+ const unsigned int n_channels,
+ const int8_t *const *const inptrs,
+ int8_t *const *const outptrs,
+ const bool exclude_padding,
+ const unsigned int pad_left,
+ const unsigned int pad_top,
+ const unsigned int pad_right,
+ const unsigned int pad_bottom
+)
+{
+ struct KernelArgs
+ {
+ const uint64_t n_channels;
+ const int8_t *const *const inptrs;
+ int8_t *const *const outptrs;
+ KernelArgs(
+ unsigned int channels,
+ const int8_t *const *input_ptrs,
+ int8_t *const * output_ptrs,
+ bool, unsigned int, unsigned int, unsigned int, unsigned int
+ ) : n_channels(channels),
+ inptrs(input_ptrs),
+ outptrs(output_ptrs)
+ {
+ }
+ };
+
+ const KernelArgs args(n_channels, inptrs, outptrs, exclude_padding,
+ pad_left, pad_top, pad_right, pad_bottom);
+
+ __asm__ __volatile__(
+ "ldr x21, [%x[args], %[offsetof_outptrs]]\n"
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "mov x15, #0x0\n"
+ "ptrue p2.b\n"
+ "ldr x20, [%x[args], %[offsetof_inptrs]]\n"
+ "mov x14, #0x0\n"
+ "ldr x13, [%x[args], %[offsetof_n_channels]]\n"
+ "whilelt p0.b, x15, x13\n"
+ "ldp x12, x11, [x21, #0x0]\n"
+ "ldp x10, x9, [x21, #0x10]\n"
+ "ldp x28, x27, [x20, #0x0]\n"
+ "ld1b { z30.b }, p0/Z, [x27, x15]\n"
+ "ldp x26, x25, [x20, #0x10]\n"
+ "ld1b { z29.b }, p0/Z, [x25, x15]\n"
+ "ldp x24, x23, [x20, #0x20]\n"
+ "ld1b { z28.b }, p0/Z, [x24, x15]\n"
+ "ldp x22, x21, [x20, #0x30]\n"
+ "ld1b { z27.b }, p0/Z, [x21, x15]\n"
+ "ldr x20, [x20, #0x40]\n"
+ "ld1b { z26.b }, p0/Z, [x28, x15]\n"
+ "ld1b { z25.b }, p0/Z, [x26, x15]\n"
+ "ld1b { z24.b }, p0/Z, [x23, x15]\n"
+ "ld1b { z19.b }, p0/Z, [x22, x15]\n"
+ "ld1b { z23.b }, p0/Z, [x20, x15]\n"
+ "incw x15\n"
+ "whilelt p1.b, x15, x13\n"
+ "b.none 2f\n"
+ "1:" // Vector: Loop
+ "movprfx z22, z30\n smax z22.b, p2/M, z22.b, z28.b\n"
+ "movprfx z21, z28\n smax z21.b, p2/M, z21.b, z27.b\n"
+ "ld1b { z30.b }, p1/Z, [x27, x15]\n"
+ "whilelt p0.b, x14, x13\n"
+ "movprfx z18, z29\n smax z18.b, p2/M, z18.b, z26.b\n"
+ "movprfx z17, z25\n smax z17.b, p2/M, z17.b, z24.b\n"
+ "ld1b { z28.b }, p1/Z, [x24, x15]\n"
+ "movprfx z16, z29\n smax z16.b, p2/M, z16.b, z19.b\n"
+ "movprfx z20, z24\n smax z20.b, p2/M, z20.b, z23.b\n"
+ "ld1b { z27.b }, p1/Z, [x21, x15]\n"
+ "ld1b { z29.b }, p1/Z, [x25, x15]\n"
+ "movprfx z19, z22\n smax z19.b, p2/M, z19.b, z18.b\n"
+ "movprfx z18, z17\n smax z18.b, p2/M, z18.b, z22.b\n"
+ "ld1b { z26.b }, p1/Z, [x28, x15]\n"
+ "movprfx z17, z16\n smax z17.b, p2/M, z17.b, z21.b\n"
+ "movprfx z16, z21\n smax z16.b, p2/M, z16.b, z20.b\n"
+ "ld1b { z25.b }, p1/Z, [x26, x15]\n"
+ "st1b { z19.b }, p0, [x12, x14]\n"
+ "ld1b { z24.b }, p1/Z, [x23, x15]\n"
+ "st1b { z18.b }, p0, [x11, x14]\n"
+ "ld1b { z19.b }, p1/Z, [x22, x15]\n"
+ "st1b { z17.b }, p0, [x10, x14]\n"
+ "ld1b { z23.b }, p1/Z, [x20, x15]\n"
+ "incw x15\n"
+ "whilelt p1.b, x15, x13\n"
+ "st1b { z16.b }, p0, [x9, x14]\n"
+ "incw x14\n"
+ "b.any 1b\n"
+ "2:" // Vector: Tail
+ "movprfx z22, z30\n smax z22.b, p2/M, z22.b, z28.b\n"
+ "movprfx z21, z28\n smax z21.b, p2/M, z21.b, z27.b\n"
+ "whilelt p0.b, x14, x13\n"
+ "movprfx z20, z29\n smax z20.b, p2/M, z20.b, z26.b\n"
+ "movprfx z18, z25\n smax z18.b, p2/M, z18.b, z24.b\n"
+ "movprfx z17, z29\n smax z17.b, p2/M, z17.b, z19.b\n"
+ "movprfx z19, z24\n smax z19.b, p2/M, z19.b, z23.b\n"
+ "movprfx z16, z22\n smax z16.b, p2/M, z16.b, z20.b\n"
+ "smax z18.b, p2/M, z18.b, z22.b\n"
+ "st1b { z16.b }, p0, [x12, x14]\n"
+ "smax z17.b, p2/M, z17.b, z21.b\n"
+ "movprfx z16, z21\n smax z16.b, p2/M, z16.b, z19.b\n"
+ "st1b { z18.b }, p0, [x11, x14]\n"
+ "st1b { z17.b }, p0, [x10, x14]\n"
+ "st1b { z16.b }, p0, [x9, x14]\n"
+ ".inst 0xd503467f // SMSTOP\n"
+ :
+ : [args] "r" (&args), [offsetof_inptrs] "I" (offsetof(KernelArgs, inptrs)), [offsetof_n_channels] "I" (offsetof(KernelArgs, n_channels)), [offsetof_outptrs] "I" (offsetof(KernelArgs, outptrs))
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace pooling
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8_nhwc_max_generic_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8_nhwc_max_generic_depthfirst.hpp
new file mode 100644
index 0000000000..3e0d76c277
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8_nhwc_max_generic_depthfirst.hpp
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(ARM_COMPUTE_ENABLE_SME)
+
+namespace arm_conv {
+namespace pooling {
+
+void sme_s8_nhwc_max_generic_depthfirst_impl(const uint64_t, const uint64_t n_valid_cells, uint64_t n_channels, const int8_t *const *const inptrs, int8_t *outptr);
+
+struct sme_s8_nhwc_max_generic_depthfirst : IGenericDepthfirstStrategy<int8_t, int8_t>
+{
+ using Parent = IGenericDepthfirstStrategy<int8_t, int8_t>;
+ sme_s8_nhwc_max_generic_depthfirst(const CPUInfo *) {}
+ typename Parent::KernelType get_kernel(void) const override { return sme_s8_nhwc_max_generic_depthfirst_impl; }
+};
+
+} // namespace pooling
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8_nhwc_max_generic_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8_nhwc_max_generic_depthfirst/generic.cpp
new file mode 100644
index 0000000000..d2b45cd353
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8_nhwc_max_generic_depthfirst/generic.cpp
@@ -0,0 +1,225 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstdint>
+#include <cstddef>
+
+#if defined(ARM_COMPUTE_ENABLE_SME)
+
+namespace arm_conv {
+namespace pooling {
+
+
+void sme_s8_nhwc_max_generic_depthfirst_impl(
+ const uint64_t,
+ const uint64_t n_valid_cells,
+ uint64_t n_channels,
+ const int8_t *const *const inptrs,
+ int8_t *outptr
+)
+{
+ __asm__ __volatile__(
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "mov x9, #0x0\n"
+ "cntb x28\n"
+ "cntb x27, ALL, MUL #2\n"
+ "cntb x26, ALL, MUL #3\n"
+ "whilelt p4.b, x9, %x[n_channels]\n"
+ "whilelt p3.b, x28, %x[n_channels]\n"
+ "whilelt p2.b, x27, %x[n_channels]\n"
+ "whilelt p1.b, x26, %x[n_channels]\n"
+ "ptrue p0.b\n"
+ "b.none 7f\n"
+ "1:" // 4-vectors of channels
+ "lsr x25, %x[n_valid_cells], #0x2\n"
+ "mov z4.b, #0x80\n"
+ "mov z3.b, #0x80\n"
+ "mov x24, %x[inptrs]\n"
+ "mov z2.b, #0x80\n"
+ "mov z1.b, #0x80\n"
+ "cbz x25, 4f\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "subs x25, x25, #0x1\n"
+ "ld1b { z0.b }, p4/Z, [x23, x9]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x24, x24, #0x20\n"
+ "ld1b { z31.b }, p4/Z, [x22, x9]\n"
+ "ld1b { z23.b }, p4/Z, [x21, x9]\n"
+ "ld1b { z30.b }, p4/Z, [x20, x9]\n"
+ "ld1b { z18.b }, p3/Z, [x23, x28]\n"
+ "ld1b { z29.b }, p3/Z, [x22, x28]\n"
+ "ld1b { z22.b }, p3/Z, [x21, x28]\n"
+ "ld1b { z28.b }, p3/Z, [x20, x28]\n"
+ "ld1b { z17.b }, p2/Z, [x23, x27]\n"
+ "ld1b { z27.b }, p2/Z, [x22, x27]\n"
+ "ld1b { z21.b }, p2/Z, [x21, x27]\n"
+ "ld1b { z26.b }, p2/Z, [x20, x27]\n"
+ "ld1b { z16.b }, p1/Z, [x23, x26]\n"
+ "ld1b { z25.b }, p1/Z, [x22, x26]\n"
+ "ld1b { z20.b }, p1/Z, [x21, x26]\n"
+ "ld1b { z24.b }, p1/Z, [x20, x26]\n"
+ "beq 3f\n"
+ "2:" // 4-vectors of channels: 4 inputs loop
+ "movprfx z19, z0\n smax z19.b, p0/M, z19.b, z31.b\n"
+ "smax z23.b, p0/M, z23.b, z30.b\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "subs x25, x25, #0x1\n"
+ "smax z18.b, p0/M, z18.b, z29.b\n"
+ "smax z22.b, p0/M, z22.b, z28.b\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x24, x24, #0x20\n"
+ "smax z17.b, p0/M, z17.b, z27.b\n"
+ "smax z21.b, p0/M, z21.b, z26.b\n"
+ "ld1b { z0.b }, p4/Z, [x23, x9]\n"
+ "smax z16.b, p0/M, z16.b, z25.b\n"
+ "smax z20.b, p0/M, z20.b, z24.b\n"
+ "ld1b { z31.b }, p4/Z, [x22, x9]\n"
+ "smax z19.b, p0/M, z19.b, z23.b\n"
+ "smax z18.b, p0/M, z18.b, z22.b\n"
+ "ld1b { z23.b }, p4/Z, [x21, x9]\n"
+ "smax z17.b, p0/M, z17.b, z21.b\n"
+ "smax z16.b, p0/M, z16.b, z20.b\n"
+ "ld1b { z30.b }, p4/Z, [x20, x9]\n"
+ "smax z4.b, p0/M, z4.b, z19.b\n"
+ "smax z3.b, p0/M, z3.b, z18.b\n"
+ "ld1b { z18.b }, p3/Z, [x23, x28]\n"
+ "smax z2.b, p0/M, z2.b, z17.b\n"
+ "smax z1.b, p0/M, z1.b, z16.b\n"
+ "ld1b { z29.b }, p3/Z, [x22, x28]\n"
+ "ld1b { z22.b }, p3/Z, [x21, x28]\n"
+ "ld1b { z28.b }, p3/Z, [x20, x28]\n"
+ "ld1b { z17.b }, p2/Z, [x23, x27]\n"
+ "ld1b { z27.b }, p2/Z, [x22, x27]\n"
+ "ld1b { z21.b }, p2/Z, [x21, x27]\n"
+ "ld1b { z26.b }, p2/Z, [x20, x27]\n"
+ "ld1b { z16.b }, p1/Z, [x23, x26]\n"
+ "ld1b { z25.b }, p1/Z, [x22, x26]\n"
+ "ld1b { z20.b }, p1/Z, [x21, x26]\n"
+ "ld1b { z24.b }, p1/Z, [x20, x26]\n"
+ "bgt 2b\n"
+ "3:" // 4-vectors of channels: 4 inputs tail
+ "movprfx z19, z0\n smax z19.b, p0/M, z19.b, z31.b\n"
+ "smax z23.b, p0/M, z23.b, z30.b\n"
+ "smax z18.b, p0/M, z18.b, z29.b\n"
+ "smax z22.b, p0/M, z22.b, z28.b\n"
+ "smax z17.b, p0/M, z17.b, z27.b\n"
+ "smax z21.b, p0/M, z21.b, z26.b\n"
+ "smax z16.b, p0/M, z16.b, z25.b\n"
+ "smax z20.b, p0/M, z20.b, z24.b\n"
+ "smax z19.b, p0/M, z19.b, z23.b\n"
+ "smax z18.b, p0/M, z18.b, z22.b\n"
+ "smax z17.b, p0/M, z17.b, z21.b\n"
+ "smax z16.b, p0/M, z16.b, z20.b\n"
+ "smax z4.b, p0/M, z4.b, z19.b\n"
+ "smax z3.b, p0/M, z3.b, z18.b\n"
+ "smax z2.b, p0/M, z2.b, z17.b\n"
+ "smax z1.b, p0/M, z1.b, z16.b\n"
+ "4:" // 4-vectors of channels: After loop
+ "ands x21, %x[n_valid_cells], #0x3\n"
+ "beq 6f\n"
+ "5:" // 4-vectors of channels: Single input loop
+ "ldr x20, [x24], #0x8\n"
+ "ld1b { z16.b }, p4/Z, [x20, x9]\n"
+ "subs x21, x21, #0x1\n"
+ "smax z4.b, p0/M, z4.b, z16.b\n"
+ "ld1b { z16.b }, p3/Z, [x20, x28]\n"
+ "smax z3.b, p0/M, z3.b, z16.b\n"
+ "ld1b { z16.b }, p2/Z, [x20, x27]\n"
+ "smax z2.b, p0/M, z2.b, z16.b\n"
+ "ld1b { z16.b }, p1/Z, [x20, x26]\n"
+ "smax z1.b, p0/M, z1.b, z16.b\n"
+ "bgt 5b\n"
+ "6:" // 4-vectors of channels: Single input loop: End
+ "st1b { z4.b }, p4, [%x[outptr], x9]\n"
+ "incb x9, ALL, MUL #4\n"
+ "st1b { z3.b }, p3, [%x[outptr], x28]\n"
+ "incb x28, ALL, MUL #4\n"
+ "st1b { z2.b }, p2, [%x[outptr], x27]\n"
+ "incb x27, ALL, MUL #4\n"
+ "st1b { z1.b }, p1, [%x[outptr], x26]\n"
+ "incb x26, ALL, MUL #4\n"
+ "whilelt p1.b, x26, %x[n_channels]\n"
+ "b.any 1b\n"
+ "7:" // Single vector of channels
+ "whilelt p4.b, x9, %x[n_channels]\n"
+ "b.none 14f\n"
+ "8:" // Single vector of channels: Loop
+ "lsr x25, %x[n_valid_cells], #0x2\n"
+ "mov z4.b, #0x80\n"
+ "mov x24, %x[inptrs]\n"
+ "cbz x25, 11f\n"
+ "ldp x20, x22, [x24, #0x0]\n"
+ "subs x25, x25, #0x1\n"
+ "ld1b { z0.b }, p4/Z, [x20, x9]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x24, x24, #0x20\n"
+ "ld1b { z31.b }, p4/Z, [x22, x9]\n"
+ "ld1b { z23.b }, p4/Z, [x21, x9]\n"
+ "ld1b { z30.b }, p4/Z, [x20, x9]\n"
+ "beq 10f\n"
+ "9:" // Single vector of channels: Loop: 4 inputs loop
+ "movprfx z16, z0\n smax z16.b, p0/M, z16.b, z31.b\n"
+ "movprfx z17, z23\n smax z17.b, p0/M, z17.b, z30.b\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "subs x25, x25, #0x1\n"
+ "smax z16.b, p0/M, z16.b, z17.b\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "smax z4.b, p0/M, z4.b, z16.b\n"
+ "add x24, x24, #0x20\n"
+ "ld1b { z0.b }, p4/Z, [x23, x9]\n"
+ "ld1b { z31.b }, p4/Z, [x22, x9]\n"
+ "ld1b { z23.b }, p4/Z, [x21, x9]\n"
+ "ld1b { z30.b }, p4/Z, [x20, x9]\n"
+ "bgt 9b\n"
+ "10:" // Single vector of channels: Loop: 4 inputs tail
+ "movprfx z16, z0\n smax z16.b, p0/M, z16.b, z31.b\n"
+ "movprfx z17, z23\n smax z17.b, p0/M, z17.b, z30.b\n"
+ "smax z16.b, p0/M, z16.b, z17.b\n"
+ "smax z4.b, p0/M, z4.b, z16.b\n"
+ "11:" // Single vector of channels: Loop: After loop
+ "ands x21, %x[n_valid_cells], #0x3\n"
+ "beq 13f\n"
+ "12:" // Single vector of channels: Loop: Single input loop
+ "ldr x20, [x24], #0x8\n"
+ "ld1b { z16.b }, p4/Z, [x20, x9]\n"
+ "subs x21, x21, #0x1\n"
+ "smax z4.b, p0/M, z4.b, z16.b\n"
+ "bgt 12b\n"
+ "13:" // Single vector of channels: Loop: Single input loop: End
+ "st1b { z4.b }, p4, [%x[outptr], x9]\n"
+ "incb x9\n"
+ "whilelt p4.b, x9, %x[n_channels]\n"
+ "b.any 8b\n"
+ "14:" // End
+ ".inst 0xd503467f // SMSTOP\n"
+ :
+ : [inptrs] "r" (inptrs), [n_channels] "r" (n_channels), [n_valid_cells] "r" (n_valid_cells), [outptr] "r" (outptr)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace pooling
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8q_nhwc_avg_generic_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8q_nhwc_avg_generic_depthfirst.hpp
new file mode 100644
index 0000000000..c6263f5dbc
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8q_nhwc_avg_generic_depthfirst.hpp
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(ARM_COMPUTE_ENABLE_SME)
+
+namespace arm_conv {
+namespace pooling {
+
+void sme_s8q_nhwc_avg_generic_depthfirst_impl(const uint64_t window_cells, const uint64_t n_valid_cells, uint64_t n_channels, const int8_t *const *const inptrs, int8_t *outptr, const Requantize32 &qp);
+
+struct sme_s8q_nhwc_avg_generic_depthfirst : IGenericDepthfirstStrategy<int8_t, int8_t, Requantize32>
+{
+ using Parent = IGenericDepthfirstStrategy<int8_t, int8_t, Requantize32>;
+ sme_s8q_nhwc_avg_generic_depthfirst(const CPUInfo *) {}
+ typename Parent::KernelType get_kernel(void) const override { return sme_s8q_nhwc_avg_generic_depthfirst_impl; }
+};
+
+} // namespace pooling
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8q_nhwc_avg_generic_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8q_nhwc_avg_generic_depthfirst/generic.cpp
new file mode 100644
index 0000000000..91f2f7ab31
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8q_nhwc_avg_generic_depthfirst/generic.cpp
@@ -0,0 +1,460 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "pooling.hpp"
+#include <cstdint>
+#include <cstddef>
+#include <cstring>
+#include <cmath>
+
+
+#if defined(ARM_COMPUTE_ENABLE_SME)
+
+namespace arm_conv {
+namespace pooling {
+
+namespace {
+ struct RescaleParams
+ {
+ int32_t multiplier, shift;
+ };
+
+ constexpr RescaleParams rescale_params[8] = {
+ {0x40000000, -0}, // 1/2
+ {0x55555556, -1}, // 1/3
+ {0x40000000, -1}, // 1/4
+ {0x66666666, -2}, // 1/5
+ {0x55555556, -2}, // 1/6
+ {0x49249249, -2}, // 1/7
+ {0x40000000, -2}, // 1/8
+ {0x71c71c72, -3}, // 1/9
+ };
+}
+
+void sme_s8q_nhwc_avg_generic_depthfirst_impl(
+ const uint64_t window_cells,
+ const uint64_t n_valid_cells,
+ uint64_t n_channels,
+ const int8_t *const *const inptrs,
+ int8_t *outptr,
+ const Requantize32 &qp
+)
+{
+ if (n_valid_cells == 1 && window_cells == 1)
+ {
+ // In this case, simply copy from the input to the output
+ std::memcpy(outptr, *inptrs, n_channels);
+ return;
+ }
+
+ // Compute (or look up) the rescale values
+ int32_t shift_value = 0, rescale_value = 0;
+ if (2 <= window_cells && window_cells <= 9)
+ {
+ auto &params = rescale_params[window_cells - 2];
+ rescale_value = params.multiplier;
+ shift_value = params.shift;
+ }
+ else
+ {
+ auto f_rescale_value = 1.0f / static_cast<float>(window_cells);
+
+ shift_value = 0;
+ while (f_rescale_value < 0.5f)
+ {
+ shift_value--;
+ f_rescale_value *= 2.0f;
+ }
+
+ int64_t long_rescale_value = round(f_rescale_value * static_cast<float>(1ll << 31));
+ if (long_rescale_value == (1ll << 31))
+ {
+ shift_value++;
+ long_rescale_value >>= 1;
+ }
+ rescale_value = static_cast<int32_t>(long_rescale_value);
+ }
+
+ // Combine together the rescale value for the requantization and the scaling
+ // factor for the average pool.
+ const int32_t shift = qp.per_layer_left_shift - qp.per_layer_right_shift + shift_value;
+ const int32_t left_shift = shift > 0 ? shift : 0;
+ const int32_t right_shift = shift <= 0 ? shift : 0;
+
+ int32_t combined_rescale_value = 0;
+ __asm__ __volatile__ (
+ "mov v16.s[0], %w[per_layer_mul]\n"
+ "mov v17.s[0], %w[rescale_value]\n"
+ "sqrdmulh s18, s16, s17\n"
+ "mov %w[combined_rescale_value], v18.s[0]\n"
+ : [combined_rescale_value] "=r" (combined_rescale_value)
+ : [per_layer_mul] "r" (qp.per_layer_mul), [rescale_value] "r" (rescale_value)
+ : "v16", "v17", "v18"
+ );
+
+ __asm__ __volatile__(
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "mov x27, #0x0\n"
+ "cntb x26\n"
+ "cntb x25, ALL, MUL #2\n"
+ "cntb x24, ALL, MUL #3\n"
+ "whilelt p4.b, x27, %x[n_channels]\n"
+ "whilelt p3.b, x26, %x[n_channels]\n"
+ "whilelt p2.b, x25, %x[n_channels]\n"
+ "whilelt p1.b, x24, %x[n_channels]\n"
+ "ptrue p0.b\n"
+ "b.none 7f\n"
+ "1:" // 4-vectors of channels
+ "lsr x23, %x[n_valid_cells], #0x1\n"
+ "mov z15.s, #0x0\n"
+ "mov z14.s, #0x0\n"
+ "mov x22, %x[inptrs]\n"
+ "mov z13.s, #0x0\n"
+ "mov z12.s, #0x0\n"
+ "mov z11.s, #0x0\n"
+ "mov z10.s, #0x0\n"
+ "mov z9.s, #0x0\n"
+ "mov z8.s, #0x0\n"
+ "mov z7.s, #0x0\n"
+ "mov z6.s, #0x0\n"
+ "mov z5.s, #0x0\n"
+ "mov z4.s, #0x0\n"
+ "mov z3.s, #0x0\n"
+ "mov z2.s, #0x0\n"
+ "mov z1.s, #0x0\n"
+ "mov z0.s, #0x0\n"
+ "cbz x23, 4f\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "subs x23, x23, #0x1\n"
+ "add x22, x22, #0x10\n"
+ "ld1b { z31.b }, p4/Z, [x21, x27]\n"
+ "ld1b { z30.b }, p4/Z, [x20, x27]\n"
+ "ld1b { z29.b }, p3/Z, [x21, x26]\n"
+ "ld1b { z28.b }, p3/Z, [x20, x26]\n"
+ "ld1b { z27.b }, p2/Z, [x21, x25]\n"
+ "ld1b { z26.b }, p2/Z, [x20, x25]\n"
+ "ld1b { z25.b }, p1/Z, [x21, x24]\n"
+ "ld1b { z24.b }, p1/Z, [x20, x24]\n"
+ "beq 3f\n"
+ "2:" // 4-vectors of channels: 2 inputs loop
+ ".inst 0x455e03f7 // saddlb z23.h, z31.b, z30.b\n"
+ ".inst 0x455e07f6 // saddlt z22.h, z31.b, z30.b\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "subs x23, x23, #0x1\n"
+ ".inst 0x455c03b5 // saddlb z21.h, z29.b, z28.b\n"
+ ".inst 0x455c07b4 // saddlt z20.h, z29.b, z28.b\n"
+ "add x22, x22, #0x10\n"
+ "ld1b { z31.b }, p4/Z, [x21, x27]\n"
+ ".inst 0x455a0373 // saddlb z19.h, z27.b, z26.b\n"
+ ".inst 0x455a0772 // saddlt z18.h, z27.b, z26.b\n"
+ "ld1b { z30.b }, p4/Z, [x20, x27]\n"
+ ".inst 0x45580331 // saddlb z17.h, z25.b, z24.b\n"
+ ".inst 0x45580730 // saddlt z16.h, z25.b, z24.b\n"
+ "ld1b { z29.b }, p3/Z, [x21, x26]\n"
+ ".inst 0x459741ef // saddwb z15.s, z15.s, z23.h\n"
+ ".inst 0x459745ce // saddwt z14.s, z14.s, z23.h\n"
+ "ld1b { z28.b }, p3/Z, [x20, x26]\n"
+ ".inst 0x459641ad // saddwb z13.s, z13.s, z22.h\n"
+ ".inst 0x4596458c // saddwt z12.s, z12.s, z22.h\n"
+ "ld1b { z27.b }, p2/Z, [x21, x25]\n"
+ ".inst 0x4595416b // saddwb z11.s, z11.s, z21.h\n"
+ ".inst 0x4595454a // saddwt z10.s, z10.s, z21.h\n"
+ "ld1b { z26.b }, p2/Z, [x20, x25]\n"
+ ".inst 0x45944129 // saddwb z9.s, z9.s, z20.h\n"
+ ".inst 0x45944508 // saddwt z8.s, z8.s, z20.h\n"
+ "ld1b { z25.b }, p1/Z, [x21, x24]\n"
+ ".inst 0x459340e7 // saddwb z7.s, z7.s, z19.h\n"
+ ".inst 0x459344c6 // saddwt z6.s, z6.s, z19.h\n"
+ "ld1b { z24.b }, p1/Z, [x20, x24]\n"
+ ".inst 0x459240a5 // saddwb z5.s, z5.s, z18.h\n"
+ ".inst 0x45924484 // saddwt z4.s, z4.s, z18.h\n"
+ ".inst 0x45914063 // saddwb z3.s, z3.s, z17.h\n"
+ ".inst 0x45914442 // saddwt z2.s, z2.s, z17.h\n"
+ ".inst 0x45904021 // saddwb z1.s, z1.s, z16.h\n"
+ ".inst 0x45904400 // saddwt z0.s, z0.s, z16.h\n"
+ "bgt 2b\n"
+ "3:" // 4-vectors of channels: 2 inputs tail
+ ".inst 0x455e03f7 // saddlb z23.h, z31.b, z30.b\n"
+ ".inst 0x455e07f6 // saddlt z22.h, z31.b, z30.b\n"
+ ".inst 0x455c03b5 // saddlb z21.h, z29.b, z28.b\n"
+ ".inst 0x455c07b4 // saddlt z20.h, z29.b, z28.b\n"
+ ".inst 0x455a0373 // saddlb z19.h, z27.b, z26.b\n"
+ ".inst 0x455a0772 // saddlt z18.h, z27.b, z26.b\n"
+ ".inst 0x45580331 // saddlb z17.h, z25.b, z24.b\n"
+ ".inst 0x45580730 // saddlt z16.h, z25.b, z24.b\n"
+ ".inst 0x459741ef // saddwb z15.s, z15.s, z23.h\n"
+ ".inst 0x459745ce // saddwt z14.s, z14.s, z23.h\n"
+ ".inst 0x459641ad // saddwb z13.s, z13.s, z22.h\n"
+ ".inst 0x4596458c // saddwt z12.s, z12.s, z22.h\n"
+ ".inst 0x4595416b // saddwb z11.s, z11.s, z21.h\n"
+ ".inst 0x4595454a // saddwt z10.s, z10.s, z21.h\n"
+ ".inst 0x45944129 // saddwb z9.s, z9.s, z20.h\n"
+ ".inst 0x45944508 // saddwt z8.s, z8.s, z20.h\n"
+ ".inst 0x459340e7 // saddwb z7.s, z7.s, z19.h\n"
+ ".inst 0x459344c6 // saddwt z6.s, z6.s, z19.h\n"
+ ".inst 0x459240a5 // saddwb z5.s, z5.s, z18.h\n"
+ ".inst 0x45924484 // saddwt z4.s, z4.s, z18.h\n"
+ ".inst 0x45914063 // saddwb z3.s, z3.s, z17.h\n"
+ ".inst 0x45914442 // saddwt z2.s, z2.s, z17.h\n"
+ ".inst 0x45904021 // saddwb z1.s, z1.s, z16.h\n"
+ ".inst 0x45904400 // saddwt z0.s, z0.s, z16.h\n"
+ "4:" // 4-vectors of channels: After loop
+ "ands x21, %x[n_valid_cells], #0x1\n"
+ "beq 6f\n"
+ "5:" // 4-vectors of channels: Single input loop
+ "ldr x20, [x22], #0x8\n"
+ "ld1b { z16.b }, p4/Z, [x20, x27]\n"
+ ".inst 0x4508a217 // sshllb z23.h, z16.b, #0x0\n"
+ ".inst 0x4508a616 // sshllt z22.h, z16.b, #0x0\n"
+ "ld1b { z16.b }, p3/Z, [x20, x26]\n"
+ ".inst 0x4508a215 // sshllb z21.h, z16.b, #0x0\n"
+ ".inst 0x4508a614 // sshllt z20.h, z16.b, #0x0\n"
+ "subs x21, x21, #0x1\n"
+ "ld1b { z16.b }, p2/Z, [x20, x25]\n"
+ ".inst 0x4508a213 // sshllb z19.h, z16.b, #0x0\n"
+ ".inst 0x4508a612 // sshllt z18.h, z16.b, #0x0\n"
+ "ld1b { z16.b }, p1/Z, [x20, x24]\n"
+ ".inst 0x4508a211 // sshllb z17.h, z16.b, #0x0\n"
+ ".inst 0x4508a610 // sshllt z16.h, z16.b, #0x0\n"
+ ".inst 0x459741ef // saddwb z15.s, z15.s, z23.h\n"
+ ".inst 0x459745ce // saddwt z14.s, z14.s, z23.h\n"
+ ".inst 0x459641ad // saddwb z13.s, z13.s, z22.h\n"
+ ".inst 0x4596458c // saddwt z12.s, z12.s, z22.h\n"
+ ".inst 0x4595416b // saddwb z11.s, z11.s, z21.h\n"
+ ".inst 0x4595454a // saddwt z10.s, z10.s, z21.h\n"
+ ".inst 0x45944129 // saddwb z9.s, z9.s, z20.h\n"
+ ".inst 0x45944508 // saddwt z8.s, z8.s, z20.h\n"
+ ".inst 0x459340e7 // saddwb z7.s, z7.s, z19.h\n"
+ ".inst 0x459344c6 // saddwt z6.s, z6.s, z19.h\n"
+ ".inst 0x459240a5 // saddwb z5.s, z5.s, z18.h\n"
+ ".inst 0x45924484 // saddwt z4.s, z4.s, z18.h\n"
+ ".inst 0x45914063 // saddwb z3.s, z3.s, z17.h\n"
+ ".inst 0x45914442 // saddwt z2.s, z2.s, z17.h\n"
+ ".inst 0x45904021 // saddwb z1.s, z1.s, z16.h\n"
+ ".inst 0x45904400 // saddwt z0.s, z0.s, z16.h\n"
+ "bgt 5b\n"
+ "6:" // 4-vectors of channels: Single input loop: End
+ "ld1rw { z18.s }, p0/Z, [%x[left_shift]]\n"
+ ".inst 0x4482824f // srshl z15.s, p0/M, z15.s, z18.s\n"
+ ".inst 0x4482824e // srshl z14.s, p0/M, z14.s, z18.s\n"
+ ".inst 0x4482824d // srshl z13.s, p0/M, z13.s, z18.s\n"
+ ".inst 0x4482824c // srshl z12.s, p0/M, z12.s, z18.s\n"
+ "ld1rw { z17.s }, p0/Z, [%x[combined_rescale_value]]\n"
+ ".inst 0x4482824b // srshl z11.s, p0/M, z11.s, z18.s\n"
+ ".inst 0x4482824a // srshl z10.s, p0/M, z10.s, z18.s\n"
+ "ld1rw { z16.s }, p0/Z, [%x[right_shift]]\n"
+ ".inst 0x44828249 // srshl z9.s, p0/M, z9.s, z18.s\n"
+ ".inst 0x44828248 // srshl z8.s, p0/M, z8.s, z18.s\n"
+ ".inst 0x44828247 // srshl z7.s, p0/M, z7.s, z18.s\n"
+ ".inst 0x44828246 // srshl z6.s, p0/M, z6.s, z18.s\n"
+ ".inst 0x44828245 // srshl z5.s, p0/M, z5.s, z18.s\n"
+ ".inst 0x44828244 // srshl z4.s, p0/M, z4.s, z18.s\n"
+ ".inst 0x44828243 // srshl z3.s, p0/M, z3.s, z18.s\n"
+ ".inst 0x44828242 // srshl z2.s, p0/M, z2.s, z18.s\n"
+ ".inst 0x44828241 // srshl z1.s, p0/M, z1.s, z18.s\n"
+ ".inst 0x44828240 // srshl z0.s, p0/M, z0.s, z18.s\n"
+ ".inst 0x04b175ef // sqrdmulh z15.s, z15.s, z17.s\n"
+ ".inst 0x04b175ce // sqrdmulh z14.s, z14.s, z17.s\n"
+ ".inst 0x04b175ad // sqrdmulh z13.s, z13.s, z17.s\n"
+ ".inst 0x04b1758c // sqrdmulh z12.s, z12.s, z17.s\n"
+ ".inst 0x04b1756b // sqrdmulh z11.s, z11.s, z17.s\n"
+ ".inst 0x04b1754a // sqrdmulh z10.s, z10.s, z17.s\n"
+ ".inst 0x04b17529 // sqrdmulh z9.s, z9.s, z17.s\n"
+ ".inst 0x04b17508 // sqrdmulh z8.s, z8.s, z17.s\n"
+ ".inst 0x04b174e7 // sqrdmulh z7.s, z7.s, z17.s\n"
+ ".inst 0x04b174c6 // sqrdmulh z6.s, z6.s, z17.s\n"
+ ".inst 0x04b174a5 // sqrdmulh z5.s, z5.s, z17.s\n"
+ ".inst 0x04b17484 // sqrdmulh z4.s, z4.s, z17.s\n"
+ ".inst 0x04b17463 // sqrdmulh z3.s, z3.s, z17.s\n"
+ ".inst 0x04b17442 // sqrdmulh z2.s, z2.s, z17.s\n"
+ ".inst 0x04b17421 // sqrdmulh z1.s, z1.s, z17.s\n"
+ ".inst 0x04b17400 // sqrdmulh z0.s, z0.s, z17.s\n"
+ "mov z19.s, #0x7f\n"
+ ".inst 0x4482820f // srshl z15.s, p0/M, z15.s, z16.s\n"
+ ".inst 0x4482820e // srshl z14.s, p0/M, z14.s, z16.s\n"
+ ".inst 0x4482820d // srshl z13.s, p0/M, z13.s, z16.s\n"
+ ".inst 0x4482820c // srshl z12.s, p0/M, z12.s, z16.s\n"
+ ".inst 0x4482820b // srshl z11.s, p0/M, z11.s, z16.s\n"
+ ".inst 0x4482820a // srshl z10.s, p0/M, z10.s, z16.s\n"
+ ".inst 0x44828209 // srshl z9.s, p0/M, z9.s, z16.s\n"
+ ".inst 0x44828208 // srshl z8.s, p0/M, z8.s, z16.s\n"
+ ".inst 0x44828207 // srshl z7.s, p0/M, z7.s, z16.s\n"
+ ".inst 0x44828206 // srshl z6.s, p0/M, z6.s, z16.s\n"
+ ".inst 0x44828205 // srshl z5.s, p0/M, z5.s, z16.s\n"
+ ".inst 0x44828204 // srshl z4.s, p0/M, z4.s, z16.s\n"
+ ".inst 0x44828203 // srshl z3.s, p0/M, z3.s, z16.s\n"
+ ".inst 0x44828202 // srshl z2.s, p0/M, z2.s, z16.s\n"
+ ".inst 0x44828201 // srshl z1.s, p0/M, z1.s, z16.s\n"
+ ".inst 0x44828200 // srshl z0.s, p0/M, z0.s, z16.s\n"
+ "not z16.s, p0/M, z19.s\n"
+ "smax z15.s, p0/M, z15.s, z16.s\n"
+ "smax z14.s, p0/M, z14.s, z16.s\n"
+ "smax z13.s, p0/M, z13.s, z16.s\n"
+ "smax z12.s, p0/M, z12.s, z16.s\n"
+ "smax z11.s, p0/M, z11.s, z16.s\n"
+ "smax z10.s, p0/M, z10.s, z16.s\n"
+ "smax z9.s, p0/M, z9.s, z16.s\n"
+ "smax z8.s, p0/M, z8.s, z16.s\n"
+ "smax z7.s, p0/M, z7.s, z16.s\n"
+ "smax z6.s, p0/M, z6.s, z16.s\n"
+ "smax z5.s, p0/M, z5.s, z16.s\n"
+ "smax z4.s, p0/M, z4.s, z16.s\n"
+ "smax z3.s, p0/M, z3.s, z16.s\n"
+ "smax z2.s, p0/M, z2.s, z16.s\n"
+ "smax z1.s, p0/M, z1.s, z16.s\n"
+ "smax z0.s, p0/M, z0.s, z16.s\n"
+ "smin z15.s, p0/M, z15.s, z19.s\n"
+ "smin z14.s, p0/M, z14.s, z19.s\n"
+ "trn1 z23.h, z15.h, z14.h\n"
+ "smin z13.s, p0/M, z13.s, z19.s\n"
+ "smin z12.s, p0/M, z12.s, z19.s\n"
+ "trn1 z16.h, z13.h, z12.h\n"
+ "smin z11.s, p0/M, z11.s, z19.s\n"
+ "smin z10.s, p0/M, z10.s, z19.s\n"
+ "trn1 z22.h, z11.h, z10.h\n"
+ "smin z9.s, p0/M, z9.s, z19.s\n"
+ "smin z8.s, p0/M, z8.s, z19.s\n"
+ "trn1 z18.h, z9.h, z8.h\n"
+ "smin z7.s, p0/M, z7.s, z19.s\n"
+ "smin z6.s, p0/M, z6.s, z19.s\n"
+ "trn1 z21.h, z7.h, z6.h\n"
+ "smin z5.s, p0/M, z5.s, z19.s\n"
+ "smin z4.s, p0/M, z4.s, z19.s\n"
+ "trn1 z17.h, z5.h, z4.h\n"
+ "smin z3.s, p0/M, z3.s, z19.s\n"
+ "smin z2.s, p0/M, z2.s, z19.s\n"
+ "trn1 z20.h, z3.h, z2.h\n"
+ "smin z1.s, p0/M, z1.s, z19.s\n"
+ "smin z0.s, p0/M, z0.s, z19.s\n"
+ "trn1 z19.h, z1.h, z0.h\n"
+ "trn1 z16.b, z23.b, z16.b\n"
+ "trn1 z18.b, z22.b, z18.b\n"
+ "st1b { z16.b }, p4, [%x[outptr], x27]\n"
+ "incb x27, ALL, MUL #4\n"
+ "trn1 z17.b, z21.b, z17.b\n"
+ "trn1 z16.b, z20.b, z19.b\n"
+ "st1b { z18.b }, p3, [%x[outptr], x26]\n"
+ "incb x26, ALL, MUL #4\n"
+ "st1b { z17.b }, p2, [%x[outptr], x25]\n"
+ "incb x25, ALL, MUL #4\n"
+ "st1b { z16.b }, p1, [%x[outptr], x24]\n"
+ "incb x24, ALL, MUL #4\n"
+ "whilelt p1.b, x24, %x[n_channels]\n"
+ "b.any 1b\n"
+ "7:" // Single vector of channels
+ "whilelt p4.b, x27, %x[n_channels]\n"
+ "b.none 14f\n"
+ "8:" // Single vector of channels: Loop
+ "lsr x23, %x[n_valid_cells], #0x1\n"
+ "mov z15.s, #0x0\n"
+ "mov z14.s, #0x0\n"
+ "mov x22, %x[inptrs]\n"
+ "mov z13.s, #0x0\n"
+ "mov z12.s, #0x0\n"
+ "cbz x23, 11f\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "subs x23, x23, #0x1\n"
+ "add x22, x22, #0x10\n"
+ "ld1b { z31.b }, p4/Z, [x21, x27]\n"
+ "ld1b { z30.b }, p4/Z, [x20, x27]\n"
+ "beq 10f\n"
+ "9:" // Single vector of channels: Loop: 2 inputs loop
+ ".inst 0x455e03f1 // saddlb z17.h, z31.b, z30.b\n"
+ ".inst 0x455e07f0 // saddlt z16.h, z31.b, z30.b\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "subs x23, x23, #0x1\n"
+ ".inst 0x459141ef // saddwb z15.s, z15.s, z17.h\n"
+ ".inst 0x459145ce // saddwt z14.s, z14.s, z17.h\n"
+ "add x22, x22, #0x10\n"
+ "ld1b { z31.b }, p4/Z, [x21, x27]\n"
+ ".inst 0x459041ad // saddwb z13.s, z13.s, z16.h\n"
+ ".inst 0x4590458c // saddwt z12.s, z12.s, z16.h\n"
+ "ld1b { z30.b }, p4/Z, [x20, x27]\n"
+ "bgt 9b\n"
+ "10:" // Single vector of channels: Loop: 2 inputs tail
+ ".inst 0x455e03f1 // saddlb z17.h, z31.b, z30.b\n"
+ ".inst 0x455e07f0 // saddlt z16.h, z31.b, z30.b\n"
+ ".inst 0x459141ef // saddwb z15.s, z15.s, z17.h\n"
+ ".inst 0x459145ce // saddwt z14.s, z14.s, z17.h\n"
+ ".inst 0x459041ad // saddwb z13.s, z13.s, z16.h\n"
+ ".inst 0x4590458c // saddwt z12.s, z12.s, z16.h\n"
+ "11:" // Single vector of channels: Loop: After loop
+ "ands x21, %x[n_valid_cells], #0x1\n"
+ "beq 13f\n"
+ "12:" // Single vector of channels: Loop: Single input loop
+ "ldr x20, [x22], #0x8\n"
+ "ld1b { z16.b }, p4/Z, [x20, x27]\n"
+ ".inst 0x4508a211 // sshllb z17.h, z16.b, #0x0\n"
+ ".inst 0x4508a610 // sshllt z16.h, z16.b, #0x0\n"
+ "subs x21, x21, #0x1\n"
+ ".inst 0x459141ef // saddwb z15.s, z15.s, z17.h\n"
+ ".inst 0x459145ce // saddwt z14.s, z14.s, z17.h\n"
+ ".inst 0x459041ad // saddwb z13.s, z13.s, z16.h\n"
+ ".inst 0x4590458c // saddwt z12.s, z12.s, z16.h\n"
+ "bgt 12b\n"
+ "13:" // Single vector of channels: Loop: Single input loop: End
+ "ld1rw { z16.s }, p0/Z, [%x[left_shift]]\n"
+ ".inst 0x4482820f // srshl z15.s, p0/M, z15.s, z16.s\n"
+ ".inst 0x4482820e // srshl z14.s, p0/M, z14.s, z16.s\n"
+ ".inst 0x4482820d // srshl z13.s, p0/M, z13.s, z16.s\n"
+ ".inst 0x4482820c // srshl z12.s, p0/M, z12.s, z16.s\n"
+ "ld1rw { z17.s }, p0/Z, [%x[combined_rescale_value]]\n"
+ ".inst 0x04b175ef // sqrdmulh z15.s, z15.s, z17.s\n"
+ ".inst 0x04b175ce // sqrdmulh z14.s, z14.s, z17.s\n"
+ "ld1rw { z16.s }, p0/Z, [%x[right_shift]]\n"
+ ".inst 0x04b175ad // sqrdmulh z13.s, z13.s, z17.s\n"
+ ".inst 0x04b1758c // sqrdmulh z12.s, z12.s, z17.s\n"
+ "mov z18.s, #0x7f\n"
+ ".inst 0x4482820f // srshl z15.s, p0/M, z15.s, z16.s\n"
+ ".inst 0x4482820e // srshl z14.s, p0/M, z14.s, z16.s\n"
+ ".inst 0x4482820d // srshl z13.s, p0/M, z13.s, z16.s\n"
+ ".inst 0x4482820c // srshl z12.s, p0/M, z12.s, z16.s\n"
+ "not z16.s, p0/M, z18.s\n"
+ "smax z15.s, p0/M, z15.s, z16.s\n"
+ "smax z14.s, p0/M, z14.s, z16.s\n"
+ "smax z13.s, p0/M, z13.s, z16.s\n"
+ "smax z12.s, p0/M, z12.s, z16.s\n"
+ "smin z15.s, p0/M, z15.s, z18.s\n"
+ "smin z14.s, p0/M, z14.s, z18.s\n"
+ "trn1 z17.h, z15.h, z14.h\n"
+ "smin z13.s, p0/M, z13.s, z18.s\n"
+ "smin z12.s, p0/M, z12.s, z18.s\n"
+ "trn1 z16.h, z13.h, z12.h\n"
+ "trn1 z16.b, z17.b, z16.b\n"
+ "st1b { z16.b }, p4, [%x[outptr], x27]\n"
+ "incb x27\n"
+ "whilelt p4.b, x27, %x[n_channels]\n"
+ "b.any 8b\n"
+ "14:" // End
+ ".inst 0xd503467f // SMSTOP\n"
+ :
+ : [combined_rescale_value] "r" (&combined_rescale_value), [inptrs] "r" (inptrs), [left_shift] "r" (&left_shift), [n_channels] "r" (n_channels), [n_valid_cells] "r" (n_valid_cells), [outptr] "r" (outptr), [right_shift] "r" (&right_shift)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace pooling
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8q_nhwc_max_generic_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8q_nhwc_max_generic_depthfirst.hpp
new file mode 100644
index 0000000000..9667d37954
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8q_nhwc_max_generic_depthfirst.hpp
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(ARM_COMPUTE_ENABLE_SME)
+
+namespace arm_conv {
+namespace pooling {
+
+void sme_s8q_nhwc_max_generic_depthfirst_impl(const uint64_t, const uint64_t n_valid_cells, uint64_t n_channels, const int8_t *const *const inptrs, int8_t *outptr, const Requantize32 &qp);
+
+struct sme_s8q_nhwc_max_generic_depthfirst : IGenericDepthfirstStrategy<int8_t, int8_t, Requantize32>
+{
+ using Parent = IGenericDepthfirstStrategy<int8_t, int8_t, Requantize32>;
+ sme_s8q_nhwc_max_generic_depthfirst(const CPUInfo *) {}
+ typename Parent::KernelType get_kernel(void) const override { return sme_s8q_nhwc_max_generic_depthfirst_impl; }
+};
+
+} // namespace pooling
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8q_nhwc_max_generic_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8q_nhwc_max_generic_depthfirst/generic.cpp
new file mode 100644
index 0000000000..e9b586f4ce
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8q_nhwc_max_generic_depthfirst/generic.cpp
@@ -0,0 +1,388 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "pooling.hpp"
+#include <cstdint>
+#include <cstddef>
+
+#if defined(ARM_COMPUTE_ENABLE_SME)
+
+namespace arm_conv {
+namespace pooling {
+
+
+void sme_s8q_nhwc_max_generic_depthfirst_impl(
+ const uint64_t,
+ const uint64_t n_valid_cells,
+ uint64_t n_channels,
+ const int8_t *const *const inptrs,
+ int8_t *outptr,
+ const Requantize32 &qp
+)
+{
+ __asm__ __volatile__(
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "mov x9, #0x0\n"
+ "cntb x28\n"
+ "cntb x27, ALL, MUL #2\n"
+ "cntb x26, ALL, MUL #3\n"
+ "whilelt p4.b, x9, %x[n_channels]\n"
+ "whilelt p3.b, x28, %x[n_channels]\n"
+ "whilelt p2.b, x27, %x[n_channels]\n"
+ "whilelt p1.b, x26, %x[n_channels]\n"
+ "ptrue p0.b\n"
+ "b.none 7f\n"
+ "1:" // 4-vectors of channels
+ "lsr x25, %x[n_valid_cells], #0x2\n"
+ "mov z4.b, #0x80\n"
+ "mov z3.b, #0x80\n"
+ "mov x24, %x[inptrs]\n"
+ "mov z2.b, #0x80\n"
+ "mov z1.b, #0x80\n"
+ "cbz x25, 4f\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "subs x25, x25, #0x1\n"
+ "ld1b { z0.b }, p4/Z, [x23, x9]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x24, x24, #0x20\n"
+ "ld1b { z31.b }, p4/Z, [x22, x9]\n"
+ "ld1b { z23.b }, p4/Z, [x21, x9]\n"
+ "ld1b { z30.b }, p4/Z, [x20, x9]\n"
+ "ld1b { z18.b }, p3/Z, [x23, x28]\n"
+ "ld1b { z29.b }, p3/Z, [x22, x28]\n"
+ "ld1b { z22.b }, p3/Z, [x21, x28]\n"
+ "ld1b { z28.b }, p3/Z, [x20, x28]\n"
+ "ld1b { z17.b }, p2/Z, [x23, x27]\n"
+ "ld1b { z27.b }, p2/Z, [x22, x27]\n"
+ "ld1b { z21.b }, p2/Z, [x21, x27]\n"
+ "ld1b { z26.b }, p2/Z, [x20, x27]\n"
+ "ld1b { z16.b }, p1/Z, [x23, x26]\n"
+ "ld1b { z25.b }, p1/Z, [x22, x26]\n"
+ "ld1b { z20.b }, p1/Z, [x21, x26]\n"
+ "ld1b { z24.b }, p1/Z, [x20, x26]\n"
+ "beq 3f\n"
+ "2:" // 4-vectors of channels: 4 inputs loop
+ "movprfx z19, z0\n smax z19.b, p0/M, z19.b, z31.b\n"
+ "smax z23.b, p0/M, z23.b, z30.b\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "subs x25, x25, #0x1\n"
+ "smax z18.b, p0/M, z18.b, z29.b\n"
+ "smax z22.b, p0/M, z22.b, z28.b\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x24, x24, #0x20\n"
+ "smax z17.b, p0/M, z17.b, z27.b\n"
+ "smax z21.b, p0/M, z21.b, z26.b\n"
+ "ld1b { z0.b }, p4/Z, [x23, x9]\n"
+ "smax z16.b, p0/M, z16.b, z25.b\n"
+ "smax z20.b, p0/M, z20.b, z24.b\n"
+ "ld1b { z31.b }, p4/Z, [x22, x9]\n"
+ "smax z19.b, p0/M, z19.b, z23.b\n"
+ "smax z18.b, p0/M, z18.b, z22.b\n"
+ "ld1b { z23.b }, p4/Z, [x21, x9]\n"
+ "smax z17.b, p0/M, z17.b, z21.b\n"
+ "smax z16.b, p0/M, z16.b, z20.b\n"
+ "ld1b { z30.b }, p4/Z, [x20, x9]\n"
+ "smax z4.b, p0/M, z4.b, z19.b\n"
+ "smax z3.b, p0/M, z3.b, z18.b\n"
+ "ld1b { z18.b }, p3/Z, [x23, x28]\n"
+ "smax z2.b, p0/M, z2.b, z17.b\n"
+ "smax z1.b, p0/M, z1.b, z16.b\n"
+ "ld1b { z29.b }, p3/Z, [x22, x28]\n"
+ "ld1b { z22.b }, p3/Z, [x21, x28]\n"
+ "ld1b { z28.b }, p3/Z, [x20, x28]\n"
+ "ld1b { z17.b }, p2/Z, [x23, x27]\n"
+ "ld1b { z27.b }, p2/Z, [x22, x27]\n"
+ "ld1b { z21.b }, p2/Z, [x21, x27]\n"
+ "ld1b { z26.b }, p2/Z, [x20, x27]\n"
+ "ld1b { z16.b }, p1/Z, [x23, x26]\n"
+ "ld1b { z25.b }, p1/Z, [x22, x26]\n"
+ "ld1b { z20.b }, p1/Z, [x21, x26]\n"
+ "ld1b { z24.b }, p1/Z, [x20, x26]\n"
+ "bgt 2b\n"
+ "3:" // 4-vectors of channels: 4 inputs tail
+ "movprfx z19, z0\n smax z19.b, p0/M, z19.b, z31.b\n"
+ "smax z23.b, p0/M, z23.b, z30.b\n"
+ "smax z18.b, p0/M, z18.b, z29.b\n"
+ "smax z22.b, p0/M, z22.b, z28.b\n"
+ "smax z17.b, p0/M, z17.b, z27.b\n"
+ "smax z21.b, p0/M, z21.b, z26.b\n"
+ "smax z16.b, p0/M, z16.b, z25.b\n"
+ "smax z20.b, p0/M, z20.b, z24.b\n"
+ "smax z19.b, p0/M, z19.b, z23.b\n"
+ "smax z18.b, p0/M, z18.b, z22.b\n"
+ "smax z17.b, p0/M, z17.b, z21.b\n"
+ "smax z16.b, p0/M, z16.b, z20.b\n"
+ "smax z4.b, p0/M, z4.b, z19.b\n"
+ "smax z3.b, p0/M, z3.b, z18.b\n"
+ "smax z2.b, p0/M, z2.b, z17.b\n"
+ "smax z1.b, p0/M, z1.b, z16.b\n"
+ "4:" // 4-vectors of channels: After loop
+ "ands x21, %x[n_valid_cells], #0x3\n"
+ "beq 6f\n"
+ "5:" // 4-vectors of channels: Single input loop
+ "ldr x20, [x24], #0x8\n"
+ "ld1b { z16.b }, p4/Z, [x20, x9]\n"
+ "subs x21, x21, #0x1\n"
+ "smax z4.b, p0/M, z4.b, z16.b\n"
+ "ld1b { z16.b }, p3/Z, [x20, x28]\n"
+ "smax z3.b, p0/M, z3.b, z16.b\n"
+ "ld1b { z16.b }, p2/Z, [x20, x27]\n"
+ "smax z2.b, p0/M, z2.b, z16.b\n"
+ "ld1b { z16.b }, p1/Z, [x20, x26]\n"
+ "smax z1.b, p0/M, z1.b, z16.b\n"
+ "bgt 5b\n"
+ "6:" // 4-vectors of channels: Single input loop: End
+ ".inst 0x4508a097 // sshllb z23.h, z4.b, #0x0\n"
+ ".inst 0x4508a496 // sshllt z22.h, z4.b, #0x0\n"
+ "add x20, %x[quant_params], %[offsetof_qp_per_layer_left_shift]\n"
+ "ld1rw { z4.s }, p0/Z, [x20]\n"
+ ".inst 0x4508a075 // sshllb z21.h, z3.b, #0x0\n"
+ ".inst 0x4508a472 // sshllt z18.h, z3.b, #0x0\n"
+ "add x20, %x[quant_params], %[offsetof_qp_per_layer_mul]\n"
+ "ld1rw { z3.s }, p0/Z, [x20]\n"
+ ".inst 0x4508a054 // sshllb z20.h, z2.b, #0x0\n"
+ ".inst 0x4508a451 // sshllt z17.h, z2.b, #0x0\n"
+ "add x20, %x[quant_params], %[offsetof_qp_per_layer_right_shift]\n"
+ "ld1rw { z2.s }, p0/Z, [x20]\n"
+ ".inst 0x4508a033 // sshllb z19.h, z1.b, #0x0\n"
+ ".inst 0x4508a430 // sshllt z16.h, z1.b, #0x0\n"
+ ".inst 0x4510a2e1 // sshllb z1.s, z23.h, #0x0\n"
+ ".inst 0x4510a6f7 // sshllt z23.s, z23.h, #0x0\n"
+ ".inst 0x4510a2c0 // sshllb z0.s, z22.h, #0x0\n"
+ ".inst 0x4510a6df // sshllt z31.s, z22.h, #0x0\n"
+ ".inst 0x4510a2be // sshllb z30.s, z21.h, #0x0\n"
+ ".inst 0x4510a6b6 // sshllt z22.s, z21.h, #0x0\n"
+ ".inst 0x4510a25d // sshllb z29.s, z18.h, #0x0\n"
+ ".inst 0x4510a652 // sshllt z18.s, z18.h, #0x0\n"
+ ".inst 0x4510a29c // sshllb z28.s, z20.h, #0x0\n"
+ ".inst 0x4510a695 // sshllt z21.s, z20.h, #0x0\n"
+ ".inst 0x4510a23b // sshllb z27.s, z17.h, #0x0\n"
+ ".inst 0x4510a631 // sshllt z17.s, z17.h, #0x0\n"
+ ".inst 0x4510a27a // sshllb z26.s, z19.h, #0x0\n"
+ ".inst 0x4510a674 // sshllt z20.s, z19.h, #0x0\n"
+ ".inst 0x4510a219 // sshllb z25.s, z16.h, #0x0\n"
+ ".inst 0x4510a618 // sshllt z24.s, z16.h, #0x0\n"
+ ".inst 0x44828081 // srshl z1.s, p0/M, z1.s, z4.s\n"
+ ".inst 0x44828097 // srshl z23.s, p0/M, z23.s, z4.s\n"
+ ".inst 0x44828080 // srshl z0.s, p0/M, z0.s, z4.s\n"
+ ".inst 0x4482809f // srshl z31.s, p0/M, z31.s, z4.s\n"
+ ".inst 0x4482809e // srshl z30.s, p0/M, z30.s, z4.s\n"
+ ".inst 0x44828096 // srshl z22.s, p0/M, z22.s, z4.s\n"
+ ".inst 0x4482809d // srshl z29.s, p0/M, z29.s, z4.s\n"
+ ".inst 0x44828092 // srshl z18.s, p0/M, z18.s, z4.s\n"
+ ".inst 0x4482809c // srshl z28.s, p0/M, z28.s, z4.s\n"
+ ".inst 0x44828095 // srshl z21.s, p0/M, z21.s, z4.s\n"
+ ".inst 0x4482809b // srshl z27.s, p0/M, z27.s, z4.s\n"
+ ".inst 0x44828091 // srshl z17.s, p0/M, z17.s, z4.s\n"
+ ".inst 0x4482809a // srshl z26.s, p0/M, z26.s, z4.s\n"
+ ".inst 0x44828094 // srshl z20.s, p0/M, z20.s, z4.s\n"
+ ".inst 0x44828099 // srshl z25.s, p0/M, z25.s, z4.s\n"
+ ".inst 0x44828098 // srshl z24.s, p0/M, z24.s, z4.s\n"
+ ".inst 0x04a37421 // sqrdmulh z1.s, z1.s, z3.s\n"
+ ".inst 0x04a376f7 // sqrdmulh z23.s, z23.s, z3.s\n"
+ ".inst 0x04a37400 // sqrdmulh z0.s, z0.s, z3.s\n"
+ ".inst 0x04a377ff // sqrdmulh z31.s, z31.s, z3.s\n"
+ ".inst 0x04a377de // sqrdmulh z30.s, z30.s, z3.s\n"
+ ".inst 0x04a376d6 // sqrdmulh z22.s, z22.s, z3.s\n"
+ ".inst 0x04a377bd // sqrdmulh z29.s, z29.s, z3.s\n"
+ ".inst 0x04a37652 // sqrdmulh z18.s, z18.s, z3.s\n"
+ ".inst 0x04a3779c // sqrdmulh z28.s, z28.s, z3.s\n"
+ ".inst 0x04a376b5 // sqrdmulh z21.s, z21.s, z3.s\n"
+ ".inst 0x04a3777b // sqrdmulh z27.s, z27.s, z3.s\n"
+ ".inst 0x04a37631 // sqrdmulh z17.s, z17.s, z3.s\n"
+ ".inst 0x04a3775a // sqrdmulh z26.s, z26.s, z3.s\n"
+ ".inst 0x04a37694 // sqrdmulh z20.s, z20.s, z3.s\n"
+ ".inst 0x04a37739 // sqrdmulh z25.s, z25.s, z3.s\n"
+ ".inst 0x04a37718 // sqrdmulh z24.s, z24.s, z3.s\n"
+ "mov z19.s, #0x7f\n"
+ ".inst 0x44828041 // srshl z1.s, p0/M, z1.s, z2.s\n"
+ ".inst 0x44828057 // srshl z23.s, p0/M, z23.s, z2.s\n"
+ ".inst 0x44828040 // srshl z0.s, p0/M, z0.s, z2.s\n"
+ ".inst 0x4482805f // srshl z31.s, p0/M, z31.s, z2.s\n"
+ ".inst 0x4482805e // srshl z30.s, p0/M, z30.s, z2.s\n"
+ ".inst 0x44828056 // srshl z22.s, p0/M, z22.s, z2.s\n"
+ ".inst 0x4482805d // srshl z29.s, p0/M, z29.s, z2.s\n"
+ ".inst 0x44828052 // srshl z18.s, p0/M, z18.s, z2.s\n"
+ ".inst 0x4482805c // srshl z28.s, p0/M, z28.s, z2.s\n"
+ ".inst 0x44828055 // srshl z21.s, p0/M, z21.s, z2.s\n"
+ ".inst 0x4482805b // srshl z27.s, p0/M, z27.s, z2.s\n"
+ ".inst 0x44828051 // srshl z17.s, p0/M, z17.s, z2.s\n"
+ ".inst 0x4482805a // srshl z26.s, p0/M, z26.s, z2.s\n"
+ ".inst 0x44828054 // srshl z20.s, p0/M, z20.s, z2.s\n"
+ ".inst 0x44828059 // srshl z25.s, p0/M, z25.s, z2.s\n"
+ ".inst 0x44828058 // srshl z24.s, p0/M, z24.s, z2.s\n"
+ "not z16.s, p0/M, z19.s\n"
+ "smax z1.s, p0/M, z1.s, z16.s\n"
+ "smax z23.s, p0/M, z23.s, z16.s\n"
+ "smax z0.s, p0/M, z0.s, z16.s\n"
+ "smax z31.s, p0/M, z31.s, z16.s\n"
+ "smax z30.s, p0/M, z30.s, z16.s\n"
+ "smax z22.s, p0/M, z22.s, z16.s\n"
+ "smax z29.s, p0/M, z29.s, z16.s\n"
+ "smax z18.s, p0/M, z18.s, z16.s\n"
+ "smax z28.s, p0/M, z28.s, z16.s\n"
+ "smax z21.s, p0/M, z21.s, z16.s\n"
+ "smax z27.s, p0/M, z27.s, z16.s\n"
+ "smax z17.s, p0/M, z17.s, z16.s\n"
+ "smax z26.s, p0/M, z26.s, z16.s\n"
+ "smax z20.s, p0/M, z20.s, z16.s\n"
+ "smax z25.s, p0/M, z25.s, z16.s\n"
+ "smax z24.s, p0/M, z24.s, z16.s\n"
+ "smin z1.s, p0/M, z1.s, z19.s\n"
+ "smin z23.s, p0/M, z23.s, z19.s\n"
+ "trn1 z23.h, z1.h, z23.h\n"
+ "smin z0.s, p0/M, z0.s, z19.s\n"
+ "smin z31.s, p0/M, z31.s, z19.s\n"
+ "trn1 z16.h, z0.h, z31.h\n"
+ "smin z30.s, p0/M, z30.s, z19.s\n"
+ "smin z22.s, p0/M, z22.s, z19.s\n"
+ "trn1 z22.h, z30.h, z22.h\n"
+ "smin z29.s, p0/M, z29.s, z19.s\n"
+ "smin z18.s, p0/M, z18.s, z19.s\n"
+ "trn1 z18.h, z29.h, z18.h\n"
+ "smin z28.s, p0/M, z28.s, z19.s\n"
+ "smin z21.s, p0/M, z21.s, z19.s\n"
+ "trn1 z21.h, z28.h, z21.h\n"
+ "smin z27.s, p0/M, z27.s, z19.s\n"
+ "smin z17.s, p0/M, z17.s, z19.s\n"
+ "trn1 z17.h, z27.h, z17.h\n"
+ "smin z26.s, p0/M, z26.s, z19.s\n"
+ "smin z20.s, p0/M, z20.s, z19.s\n"
+ "trn1 z20.h, z26.h, z20.h\n"
+ "smin z25.s, p0/M, z25.s, z19.s\n"
+ "smin z24.s, p0/M, z24.s, z19.s\n"
+ "trn1 z19.h, z25.h, z24.h\n"
+ "trn1 z16.b, z23.b, z16.b\n"
+ "trn1 z18.b, z22.b, z18.b\n"
+ "st1b { z16.b }, p4, [%x[outptr], x9]\n"
+ "incb x9, ALL, MUL #4\n"
+ "trn1 z17.b, z21.b, z17.b\n"
+ "trn1 z16.b, z20.b, z19.b\n"
+ "st1b { z18.b }, p3, [%x[outptr], x28]\n"
+ "incb x28, ALL, MUL #4\n"
+ "st1b { z17.b }, p2, [%x[outptr], x27]\n"
+ "incb x27, ALL, MUL #4\n"
+ "st1b { z16.b }, p1, [%x[outptr], x26]\n"
+ "incb x26, ALL, MUL #4\n"
+ "whilelt p1.b, x26, %x[n_channels]\n"
+ "b.any 1b\n"
+ "7:" // Single vector of channels
+ "whilelt p4.b, x9, %x[n_channels]\n"
+ "b.none 14f\n"
+ "8:" // Single vector of channels: Loop
+ "lsr x25, %x[n_valid_cells], #0x2\n"
+ "mov z4.b, #0x80\n"
+ "mov x24, %x[inptrs]\n"
+ "cbz x25, 11f\n"
+ "ldp x20, x22, [x24, #0x0]\n"
+ "subs x25, x25, #0x1\n"
+ "ld1b { z0.b }, p4/Z, [x20, x9]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x24, x24, #0x20\n"
+ "ld1b { z31.b }, p4/Z, [x22, x9]\n"
+ "ld1b { z23.b }, p4/Z, [x21, x9]\n"
+ "ld1b { z30.b }, p4/Z, [x20, x9]\n"
+ "beq 10f\n"
+ "9:" // Single vector of channels: Loop: 4 inputs loop
+ "movprfx z16, z0\n smax z16.b, p0/M, z16.b, z31.b\n"
+ "movprfx z17, z23\n smax z17.b, p0/M, z17.b, z30.b\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "subs x25, x25, #0x1\n"
+ "smax z16.b, p0/M, z16.b, z17.b\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "smax z4.b, p0/M, z4.b, z16.b\n"
+ "add x24, x24, #0x20\n"
+ "ld1b { z0.b }, p4/Z, [x23, x9]\n"
+ "ld1b { z31.b }, p4/Z, [x22, x9]\n"
+ "ld1b { z23.b }, p4/Z, [x21, x9]\n"
+ "ld1b { z30.b }, p4/Z, [x20, x9]\n"
+ "bgt 9b\n"
+ "10:" // Single vector of channels: Loop: 4 inputs tail
+ "movprfx z16, z0\n smax z16.b, p0/M, z16.b, z31.b\n"
+ "movprfx z17, z23\n smax z17.b, p0/M, z17.b, z30.b\n"
+ "smax z16.b, p0/M, z16.b, z17.b\n"
+ "smax z4.b, p0/M, z4.b, z16.b\n"
+ "11:" // Single vector of channels: Loop: After loop
+ "ands x21, %x[n_valid_cells], #0x3\n"
+ "beq 13f\n"
+ "12:" // Single vector of channels: Loop: Single input loop
+ "ldr x20, [x24], #0x8\n"
+ "ld1b { z16.b }, p4/Z, [x20, x9]\n"
+ "subs x21, x21, #0x1\n"
+ "smax z4.b, p0/M, z4.b, z16.b\n"
+ "bgt 12b\n"
+ "13:" // Single vector of channels: Loop: Single input loop: End
+ ".inst 0x4508a091 // sshllb z17.h, z4.b, #0x0\n"
+ ".inst 0x4508a490 // sshllt z16.h, z4.b, #0x0\n"
+ "add x20, %x[quant_params], %[offsetof_qp_per_layer_left_shift]\n"
+ "ld1rw { z18.s }, p0/Z, [x20]\n"
+ ".inst 0x4510a236 // sshllb z22.s, z17.h, #0x0\n"
+ ".inst 0x4510a635 // sshllt z21.s, z17.h, #0x0\n"
+ "add x20, %x[quant_params], %[offsetof_qp_per_layer_mul]\n"
+ "ld1rw { z17.s }, p0/Z, [x20]\n"
+ ".inst 0x4510a214 // sshllb z20.s, z16.h, #0x0\n"
+ ".inst 0x4510a613 // sshllt z19.s, z16.h, #0x0\n"
+ "add x20, %x[quant_params], %[offsetof_qp_per_layer_right_shift]\n"
+ "ld1rw { z16.s }, p0/Z, [x20]\n"
+ ".inst 0x44828256 // srshl z22.s, p0/M, z22.s, z18.s\n"
+ ".inst 0x44828255 // srshl z21.s, p0/M, z21.s, z18.s\n"
+ ".inst 0x44828254 // srshl z20.s, p0/M, z20.s, z18.s\n"
+ ".inst 0x44828253 // srshl z19.s, p0/M, z19.s, z18.s\n"
+ ".inst 0x04b176d6 // sqrdmulh z22.s, z22.s, z17.s\n"
+ ".inst 0x04b176b5 // sqrdmulh z21.s, z21.s, z17.s\n"
+ ".inst 0x04b17694 // sqrdmulh z20.s, z20.s, z17.s\n"
+ ".inst 0x04b17673 // sqrdmulh z19.s, z19.s, z17.s\n"
+ "mov z18.s, #0x7f\n"
+ ".inst 0x44828216 // srshl z22.s, p0/M, z22.s, z16.s\n"
+ ".inst 0x44828215 // srshl z21.s, p0/M, z21.s, z16.s\n"
+ ".inst 0x44828214 // srshl z20.s, p0/M, z20.s, z16.s\n"
+ ".inst 0x44828213 // srshl z19.s, p0/M, z19.s, z16.s\n"
+ "not z16.s, p0/M, z18.s\n"
+ "smax z22.s, p0/M, z22.s, z16.s\n"
+ "smax z21.s, p0/M, z21.s, z16.s\n"
+ "smax z20.s, p0/M, z20.s, z16.s\n"
+ "smax z19.s, p0/M, z19.s, z16.s\n"
+ "smin z22.s, p0/M, z22.s, z18.s\n"
+ "smin z21.s, p0/M, z21.s, z18.s\n"
+ "trn1 z17.h, z22.h, z21.h\n"
+ "smin z20.s, p0/M, z20.s, z18.s\n"
+ "smin z19.s, p0/M, z19.s, z18.s\n"
+ "trn1 z16.h, z20.h, z19.h\n"
+ "trn1 z16.b, z17.b, z16.b\n"
+ "st1b { z16.b }, p4, [%x[outptr], x9]\n"
+ "incb x9\n"
+ "whilelt p4.b, x9, %x[n_channels]\n"
+ "b.any 8b\n"
+ "14:" // End
+ ".inst 0xd503467f // SMSTOP\n"
+ :
+ : [inptrs] "r" (inptrs), [n_channels] "r" (n_channels), [n_valid_cells] "r" (n_valid_cells), [offsetof_qp_per_layer_left_shift] "I" (offsetof(Requantize32, per_layer_left_shift)), [offsetof_qp_per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [offsetof_qp_per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [outptr] "r" (outptr), [quant_params] "r" (&qp)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace pooling
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8_nhwc_avg_generic_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8_nhwc_avg_generic_depthfirst.hpp
new file mode 100644
index 0000000000..29a03ec509
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8_nhwc_avg_generic_depthfirst.hpp
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(ARM_COMPUTE_ENABLE_SME)
+
+namespace arm_conv {
+namespace pooling {
+
+void sme_u8_nhwc_avg_generic_depthfirst_impl(const uint64_t window_cells, const uint64_t n_valid_cells, uint64_t n_channels, const uint8_t *const *const inptrs, uint8_t *outptr);
+
+struct sme_u8_nhwc_avg_generic_depthfirst : IGenericDepthfirstStrategy<uint8_t, uint8_t>
+{
+ using Parent = IGenericDepthfirstStrategy<uint8_t, uint8_t>;
+ sme_u8_nhwc_avg_generic_depthfirst(const CPUInfo *) {}
+ typename Parent::KernelType get_kernel(void) const override { return sme_u8_nhwc_avg_generic_depthfirst_impl; }
+};
+
+} // namespace pooling
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8_nhwc_avg_generic_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8_nhwc_avg_generic_depthfirst/generic.cpp
new file mode 100644
index 0000000000..f0e7bbf5cc
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8_nhwc_avg_generic_depthfirst/generic.cpp
@@ -0,0 +1,419 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstdint>
+#include <cstddef>
+#include <cstring>
+#include <cmath>
+
+
+#if defined(ARM_COMPUTE_ENABLE_SME)
+
+namespace arm_conv {
+namespace pooling {
+
+namespace {
+ struct RescaleParams
+ {
+ int32_t multiplier, shift;
+ };
+
+ constexpr RescaleParams rescale_params[8] = {
+ {0x40000000, -0}, // 1/2
+ {0x55555556, -1}, // 1/3
+ {0x40000000, -1}, // 1/4
+ {0x66666666, -2}, // 1/5
+ {0x55555556, -2}, // 1/6
+ {0x49249249, -2}, // 1/7
+ {0x40000000, -2}, // 1/8
+ {0x71c71c72, -3}, // 1/9
+ };
+}
+
+void sme_u8_nhwc_avg_generic_depthfirst_impl(
+ const uint64_t window_cells,
+ const uint64_t n_valid_cells,
+ uint64_t n_channels,
+ const uint8_t *const *const inptrs,
+ uint8_t *outptr
+)
+{
+ if (n_valid_cells == 1 && window_cells == 1)
+ {
+ // In this case, simply copy from the input to the output
+ std::memcpy(outptr, *inptrs, n_channels);
+ return;
+ }
+
+ // Compute (or look up) the rescale values
+ int32_t shift_value = 0, rescale_value = 0;
+ if (2 <= window_cells && window_cells <= 9)
+ {
+ auto &params = rescale_params[window_cells - 2];
+ rescale_value = params.multiplier;
+ shift_value = params.shift;
+ }
+ else
+ {
+ auto f_rescale_value = 1.0f / static_cast<float>(window_cells);
+
+ shift_value = 0;
+ while (f_rescale_value < 0.5f)
+ {
+ shift_value--;
+ f_rescale_value *= 2.0f;
+ }
+
+ int64_t long_rescale_value = round(f_rescale_value * static_cast<float>(1ll << 31));
+ if (long_rescale_value == (1ll << 31))
+ {
+ shift_value++;
+ long_rescale_value >>= 1;
+ }
+ rescale_value = static_cast<int32_t>(long_rescale_value);
+ }
+
+ __asm__ __volatile__(
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "mov x27, #0x0\n"
+ "cntb x26\n"
+ "cntb x25, ALL, MUL #2\n"
+ "cntb x24, ALL, MUL #3\n"
+ "whilelt p4.b, x27, %x[n_channels]\n"
+ "whilelt p3.b, x26, %x[n_channels]\n"
+ "whilelt p2.b, x25, %x[n_channels]\n"
+ "whilelt p1.b, x24, %x[n_channels]\n"
+ "ptrue p0.b\n"
+ "b.none 7f\n"
+ "1:" // 4-vectors of channels
+ "lsr x23, %x[n_valid_cells], #0x1\n"
+ "mov z15.s, #0x0\n"
+ "mov z14.s, #0x0\n"
+ "mov x22, %x[inptrs]\n"
+ "mov z13.s, #0x0\n"
+ "mov z12.s, #0x0\n"
+ "mov z11.s, #0x0\n"
+ "mov z10.s, #0x0\n"
+ "mov z9.s, #0x0\n"
+ "mov z8.s, #0x0\n"
+ "mov z7.s, #0x0\n"
+ "mov z6.s, #0x0\n"
+ "mov z5.s, #0x0\n"
+ "mov z4.s, #0x0\n"
+ "mov z3.s, #0x0\n"
+ "mov z2.s, #0x0\n"
+ "mov z1.s, #0x0\n"
+ "mov z0.s, #0x0\n"
+ "cbz x23, 4f\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "subs x23, x23, #0x1\n"
+ "add x22, x22, #0x10\n"
+ "ld1b { z31.b }, p4/Z, [x21, x27]\n"
+ "ld1b { z30.b }, p4/Z, [x20, x27]\n"
+ "ld1b { z29.b }, p3/Z, [x21, x26]\n"
+ "ld1b { z28.b }, p3/Z, [x20, x26]\n"
+ "ld1b { z27.b }, p2/Z, [x21, x25]\n"
+ "ld1b { z26.b }, p2/Z, [x20, x25]\n"
+ "ld1b { z25.b }, p1/Z, [x21, x24]\n"
+ "ld1b { z24.b }, p1/Z, [x20, x24]\n"
+ "beq 3f\n"
+ "2:" // 4-vectors of channels: 2 inputs loop
+ ".inst 0x455e0bf7 // uaddlb z23.h, z31.b, z30.b\n"
+ ".inst 0x455e0ff6 // uaddlt z22.h, z31.b, z30.b\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "subs x23, x23, #0x1\n"
+ ".inst 0x455c0bb5 // uaddlb z21.h, z29.b, z28.b\n"
+ ".inst 0x455c0fb4 // uaddlt z20.h, z29.b, z28.b\n"
+ "add x22, x22, #0x10\n"
+ "ld1b { z31.b }, p4/Z, [x21, x27]\n"
+ ".inst 0x455a0b73 // uaddlb z19.h, z27.b, z26.b\n"
+ ".inst 0x455a0f72 // uaddlt z18.h, z27.b, z26.b\n"
+ "ld1b { z30.b }, p4/Z, [x20, x27]\n"
+ ".inst 0x45580b31 // uaddlb z17.h, z25.b, z24.b\n"
+ ".inst 0x45580f30 // uaddlt z16.h, z25.b, z24.b\n"
+ "ld1b { z29.b }, p3/Z, [x21, x26]\n"
+ ".inst 0x459749ef // uaddwb z15.s, z15.s, z23.h\n"
+ ".inst 0x45974dce // uaddwt z14.s, z14.s, z23.h\n"
+ "ld1b { z28.b }, p3/Z, [x20, x26]\n"
+ ".inst 0x459649ad // uaddwb z13.s, z13.s, z22.h\n"
+ ".inst 0x45964d8c // uaddwt z12.s, z12.s, z22.h\n"
+ "ld1b { z27.b }, p2/Z, [x21, x25]\n"
+ ".inst 0x4595496b // uaddwb z11.s, z11.s, z21.h\n"
+ ".inst 0x45954d4a // uaddwt z10.s, z10.s, z21.h\n"
+ "ld1b { z26.b }, p2/Z, [x20, x25]\n"
+ ".inst 0x45944929 // uaddwb z9.s, z9.s, z20.h\n"
+ ".inst 0x45944d08 // uaddwt z8.s, z8.s, z20.h\n"
+ "ld1b { z25.b }, p1/Z, [x21, x24]\n"
+ ".inst 0x459348e7 // uaddwb z7.s, z7.s, z19.h\n"
+ ".inst 0x45934cc6 // uaddwt z6.s, z6.s, z19.h\n"
+ "ld1b { z24.b }, p1/Z, [x20, x24]\n"
+ ".inst 0x459248a5 // uaddwb z5.s, z5.s, z18.h\n"
+ ".inst 0x45924c84 // uaddwt z4.s, z4.s, z18.h\n"
+ ".inst 0x45914863 // uaddwb z3.s, z3.s, z17.h\n"
+ ".inst 0x45914c42 // uaddwt z2.s, z2.s, z17.h\n"
+ ".inst 0x45904821 // uaddwb z1.s, z1.s, z16.h\n"
+ ".inst 0x45904c00 // uaddwt z0.s, z0.s, z16.h\n"
+ "bgt 2b\n"
+ "3:" // 4-vectors of channels: 2 inputs tail
+ ".inst 0x455e0bf7 // uaddlb z23.h, z31.b, z30.b\n"
+ ".inst 0x455e0ff6 // uaddlt z22.h, z31.b, z30.b\n"
+ ".inst 0x455c0bb5 // uaddlb z21.h, z29.b, z28.b\n"
+ ".inst 0x455c0fb4 // uaddlt z20.h, z29.b, z28.b\n"
+ ".inst 0x455a0b73 // uaddlb z19.h, z27.b, z26.b\n"
+ ".inst 0x455a0f72 // uaddlt z18.h, z27.b, z26.b\n"
+ ".inst 0x45580b31 // uaddlb z17.h, z25.b, z24.b\n"
+ ".inst 0x45580f30 // uaddlt z16.h, z25.b, z24.b\n"
+ ".inst 0x459749ef // uaddwb z15.s, z15.s, z23.h\n"
+ ".inst 0x45974dce // uaddwt z14.s, z14.s, z23.h\n"
+ ".inst 0x459649ad // uaddwb z13.s, z13.s, z22.h\n"
+ ".inst 0x45964d8c // uaddwt z12.s, z12.s, z22.h\n"
+ ".inst 0x4595496b // uaddwb z11.s, z11.s, z21.h\n"
+ ".inst 0x45954d4a // uaddwt z10.s, z10.s, z21.h\n"
+ ".inst 0x45944929 // uaddwb z9.s, z9.s, z20.h\n"
+ ".inst 0x45944d08 // uaddwt z8.s, z8.s, z20.h\n"
+ ".inst 0x459348e7 // uaddwb z7.s, z7.s, z19.h\n"
+ ".inst 0x45934cc6 // uaddwt z6.s, z6.s, z19.h\n"
+ ".inst 0x459248a5 // uaddwb z5.s, z5.s, z18.h\n"
+ ".inst 0x45924c84 // uaddwt z4.s, z4.s, z18.h\n"
+ ".inst 0x45914863 // uaddwb z3.s, z3.s, z17.h\n"
+ ".inst 0x45914c42 // uaddwt z2.s, z2.s, z17.h\n"
+ ".inst 0x45904821 // uaddwb z1.s, z1.s, z16.h\n"
+ ".inst 0x45904c00 // uaddwt z0.s, z0.s, z16.h\n"
+ "4:" // 4-vectors of channels: After loop
+ "ands x21, %x[n_valid_cells], #0x1\n"
+ "beq 6f\n"
+ "5:" // 4-vectors of channels: Single input loop
+ "ldr x20, [x22], #0x8\n"
+ "ld1b { z16.b }, p4/Z, [x20, x27]\n"
+ ".inst 0x4508aa17 // ushllb z23.h, z16.b, #0x0\n"
+ ".inst 0x4508ae16 // ushllt z22.h, z16.b, #0x0\n"
+ "ld1b { z16.b }, p3/Z, [x20, x26]\n"
+ ".inst 0x4508aa15 // ushllb z21.h, z16.b, #0x0\n"
+ ".inst 0x4508ae14 // ushllt z20.h, z16.b, #0x0\n"
+ "subs x21, x21, #0x1\n"
+ "ld1b { z16.b }, p2/Z, [x20, x25]\n"
+ ".inst 0x4508aa13 // ushllb z19.h, z16.b, #0x0\n"
+ ".inst 0x4508ae12 // ushllt z18.h, z16.b, #0x0\n"
+ "ld1b { z16.b }, p1/Z, [x20, x24]\n"
+ ".inst 0x4508aa11 // ushllb z17.h, z16.b, #0x0\n"
+ ".inst 0x4508ae10 // ushllt z16.h, z16.b, #0x0\n"
+ ".inst 0x459749ef // uaddwb z15.s, z15.s, z23.h\n"
+ ".inst 0x45974dce // uaddwt z14.s, z14.s, z23.h\n"
+ ".inst 0x459649ad // uaddwb z13.s, z13.s, z22.h\n"
+ ".inst 0x45964d8c // uaddwt z12.s, z12.s, z22.h\n"
+ ".inst 0x4595496b // uaddwb z11.s, z11.s, z21.h\n"
+ ".inst 0x45954d4a // uaddwt z10.s, z10.s, z21.h\n"
+ ".inst 0x45944929 // uaddwb z9.s, z9.s, z20.h\n"
+ ".inst 0x45944d08 // uaddwt z8.s, z8.s, z20.h\n"
+ ".inst 0x459348e7 // uaddwb z7.s, z7.s, z19.h\n"
+ ".inst 0x45934cc6 // uaddwt z6.s, z6.s, z19.h\n"
+ ".inst 0x459248a5 // uaddwb z5.s, z5.s, z18.h\n"
+ ".inst 0x45924c84 // uaddwt z4.s, z4.s, z18.h\n"
+ ".inst 0x45914863 // uaddwb z3.s, z3.s, z17.h\n"
+ ".inst 0x45914c42 // uaddwt z2.s, z2.s, z17.h\n"
+ ".inst 0x45904821 // uaddwb z1.s, z1.s, z16.h\n"
+ ".inst 0x45904c00 // uaddwt z0.s, z0.s, z16.h\n"
+ "bgt 5b\n"
+ "6:" // 4-vectors of channels: Single input loop: End
+ "ld1rw { z17.s }, p0/Z, [%x[rescale_ptr]]\n"
+ ".inst 0x04b175ef // sqdmulh z15.s, z15.s, z17.s\n"
+ ".inst 0x04b175ce // sqdmulh z14.s, z14.s, z17.s\n"
+ ".inst 0x04b175ad // sqdmulh z13.s, z13.s, z17.s\n"
+ ".inst 0x04b1758c // sqdmulh z12.s, z12.s, z17.s\n"
+ "ld1rw { z16.s }, p0/Z, [%x[shift_ptr]]\n"
+ ".inst 0x04b1756b // sqdmulh z11.s, z11.s, z17.s\n"
+ ".inst 0x04b1754a // sqdmulh z10.s, z10.s, z17.s\n"
+ ".inst 0x04b17529 // sqdmulh z9.s, z9.s, z17.s\n"
+ ".inst 0x04b17508 // sqdmulh z8.s, z8.s, z17.s\n"
+ ".inst 0x04b174e7 // sqdmulh z7.s, z7.s, z17.s\n"
+ ".inst 0x04b174c6 // sqdmulh z6.s, z6.s, z17.s\n"
+ ".inst 0x04b174a5 // sqdmulh z5.s, z5.s, z17.s\n"
+ ".inst 0x04b17484 // sqdmulh z4.s, z4.s, z17.s\n"
+ ".inst 0x04b17463 // sqdmulh z3.s, z3.s, z17.s\n"
+ ".inst 0x04b17442 // sqdmulh z2.s, z2.s, z17.s\n"
+ ".inst 0x04b17421 // sqdmulh z1.s, z1.s, z17.s\n"
+ ".inst 0x04b17400 // sqdmulh z0.s, z0.s, z17.s\n"
+ ".inst 0x4482820f // srshl z15.s, p0/M, z15.s, z16.s\n"
+ ".inst 0x4482820e // srshl z14.s, p0/M, z14.s, z16.s\n"
+ ".inst 0x4482820d // srshl z13.s, p0/M, z13.s, z16.s\n"
+ ".inst 0x4482820c // srshl z12.s, p0/M, z12.s, z16.s\n"
+ ".inst 0x4482820b // srshl z11.s, p0/M, z11.s, z16.s\n"
+ ".inst 0x4482820a // srshl z10.s, p0/M, z10.s, z16.s\n"
+ ".inst 0x44828209 // srshl z9.s, p0/M, z9.s, z16.s\n"
+ ".inst 0x44828208 // srshl z8.s, p0/M, z8.s, z16.s\n"
+ ".inst 0x44828207 // srshl z7.s, p0/M, z7.s, z16.s\n"
+ ".inst 0x44828206 // srshl z6.s, p0/M, z6.s, z16.s\n"
+ ".inst 0x44828205 // srshl z5.s, p0/M, z5.s, z16.s\n"
+ ".inst 0x44828204 // srshl z4.s, p0/M, z4.s, z16.s\n"
+ ".inst 0x44828203 // srshl z3.s, p0/M, z3.s, z16.s\n"
+ ".inst 0x44828202 // srshl z2.s, p0/M, z2.s, z16.s\n"
+ ".inst 0x44828201 // srshl z1.s, p0/M, z1.s, z16.s\n"
+ ".inst 0x44828200 // srshl z0.s, p0/M, z0.s, z16.s\n"
+ "mov z16.s, #0x0\n"
+ "mov z19.s, #0xff\n"
+ "smax z15.s, p0/M, z15.s, z16.s\n"
+ "smax z14.s, p0/M, z14.s, z16.s\n"
+ "smax z13.s, p0/M, z13.s, z16.s\n"
+ "smax z12.s, p0/M, z12.s, z16.s\n"
+ "smax z11.s, p0/M, z11.s, z16.s\n"
+ "smax z10.s, p0/M, z10.s, z16.s\n"
+ "smax z9.s, p0/M, z9.s, z16.s\n"
+ "smax z8.s, p0/M, z8.s, z16.s\n"
+ "smax z7.s, p0/M, z7.s, z16.s\n"
+ "smax z6.s, p0/M, z6.s, z16.s\n"
+ "smax z5.s, p0/M, z5.s, z16.s\n"
+ "smax z4.s, p0/M, z4.s, z16.s\n"
+ "smax z3.s, p0/M, z3.s, z16.s\n"
+ "smax z2.s, p0/M, z2.s, z16.s\n"
+ "smax z1.s, p0/M, z1.s, z16.s\n"
+ "smax z0.s, p0/M, z0.s, z16.s\n"
+ "smin z15.s, p0/M, z15.s, z19.s\n"
+ "smin z14.s, p0/M, z14.s, z19.s\n"
+ "trn1 z23.h, z15.h, z14.h\n"
+ "smin z13.s, p0/M, z13.s, z19.s\n"
+ "smin z12.s, p0/M, z12.s, z19.s\n"
+ "trn1 z16.h, z13.h, z12.h\n"
+ "smin z11.s, p0/M, z11.s, z19.s\n"
+ "smin z10.s, p0/M, z10.s, z19.s\n"
+ "trn1 z22.h, z11.h, z10.h\n"
+ "smin z9.s, p0/M, z9.s, z19.s\n"
+ "smin z8.s, p0/M, z8.s, z19.s\n"
+ "trn1 z18.h, z9.h, z8.h\n"
+ "smin z7.s, p0/M, z7.s, z19.s\n"
+ "smin z6.s, p0/M, z6.s, z19.s\n"
+ "trn1 z21.h, z7.h, z6.h\n"
+ "smin z5.s, p0/M, z5.s, z19.s\n"
+ "smin z4.s, p0/M, z4.s, z19.s\n"
+ "trn1 z17.h, z5.h, z4.h\n"
+ "smin z3.s, p0/M, z3.s, z19.s\n"
+ "smin z2.s, p0/M, z2.s, z19.s\n"
+ "trn1 z20.h, z3.h, z2.h\n"
+ "smin z1.s, p0/M, z1.s, z19.s\n"
+ "smin z0.s, p0/M, z0.s, z19.s\n"
+ "trn1 z19.h, z1.h, z0.h\n"
+ "trn1 z16.b, z23.b, z16.b\n"
+ "trn1 z18.b, z22.b, z18.b\n"
+ "st1b { z16.b }, p4, [%x[outptr], x27]\n"
+ "incb x27, ALL, MUL #4\n"
+ "trn1 z17.b, z21.b, z17.b\n"
+ "trn1 z16.b, z20.b, z19.b\n"
+ "st1b { z18.b }, p3, [%x[outptr], x26]\n"
+ "incb x26, ALL, MUL #4\n"
+ "st1b { z17.b }, p2, [%x[outptr], x25]\n"
+ "incb x25, ALL, MUL #4\n"
+ "st1b { z16.b }, p1, [%x[outptr], x24]\n"
+ "incb x24, ALL, MUL #4\n"
+ "whilelt p1.b, x24, %x[n_channels]\n"
+ "b.any 1b\n"
+ "7:" // Single vector of channels
+ "whilelt p4.b, x27, %x[n_channels]\n"
+ "b.none 14f\n"
+ "8:" // Single vector of channels: Loop
+ "lsr x23, %x[n_valid_cells], #0x1\n"
+ "mov z15.s, #0x0\n"
+ "mov z14.s, #0x0\n"
+ "mov x22, %x[inptrs]\n"
+ "mov z13.s, #0x0\n"
+ "mov z12.s, #0x0\n"
+ "cbz x23, 11f\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "subs x23, x23, #0x1\n"
+ "add x22, x22, #0x10\n"
+ "ld1b { z31.b }, p4/Z, [x21, x27]\n"
+ "ld1b { z30.b }, p4/Z, [x20, x27]\n"
+ "beq 10f\n"
+ "9:" // Single vector of channels: Loop: 2 inputs loop
+ ".inst 0x455e0bf1 // uaddlb z17.h, z31.b, z30.b\n"
+ ".inst 0x455e0ff0 // uaddlt z16.h, z31.b, z30.b\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "subs x23, x23, #0x1\n"
+ ".inst 0x459149ef // uaddwb z15.s, z15.s, z17.h\n"
+ ".inst 0x45914dce // uaddwt z14.s, z14.s, z17.h\n"
+ "add x22, x22, #0x10\n"
+ "ld1b { z31.b }, p4/Z, [x21, x27]\n"
+ ".inst 0x459049ad // uaddwb z13.s, z13.s, z16.h\n"
+ ".inst 0x45904d8c // uaddwt z12.s, z12.s, z16.h\n"
+ "ld1b { z30.b }, p4/Z, [x20, x27]\n"
+ "bgt 9b\n"
+ "10:" // Single vector of channels: Loop: 2 inputs tail
+ ".inst 0x455e0bf1 // uaddlb z17.h, z31.b, z30.b\n"
+ ".inst 0x455e0ff0 // uaddlt z16.h, z31.b, z30.b\n"
+ ".inst 0x459149ef // uaddwb z15.s, z15.s, z17.h\n"
+ ".inst 0x45914dce // uaddwt z14.s, z14.s, z17.h\n"
+ ".inst 0x459049ad // uaddwb z13.s, z13.s, z16.h\n"
+ ".inst 0x45904d8c // uaddwt z12.s, z12.s, z16.h\n"
+ "11:" // Single vector of channels: Loop: After loop
+ "ands x21, %x[n_valid_cells], #0x1\n"
+ "beq 13f\n"
+ "12:" // Single vector of channels: Loop: Single input loop
+ "ldr x20, [x22], #0x8\n"
+ "ld1b { z16.b }, p4/Z, [x20, x27]\n"
+ ".inst 0x4508aa11 // ushllb z17.h, z16.b, #0x0\n"
+ ".inst 0x4508ae10 // ushllt z16.h, z16.b, #0x0\n"
+ "subs x21, x21, #0x1\n"
+ ".inst 0x459149ef // uaddwb z15.s, z15.s, z17.h\n"
+ ".inst 0x45914dce // uaddwt z14.s, z14.s, z17.h\n"
+ ".inst 0x459049ad // uaddwb z13.s, z13.s, z16.h\n"
+ ".inst 0x45904d8c // uaddwt z12.s, z12.s, z16.h\n"
+ "bgt 12b\n"
+ "13:" // Single vector of channels: Loop: Single input loop: End
+ "ld1rw { z16.s }, p0/Z, [%x[rescale_ptr]]\n"
+ ".inst 0x04b075ef // sqdmulh z15.s, z15.s, z16.s\n"
+ ".inst 0x04b075ce // sqdmulh z14.s, z14.s, z16.s\n"
+ ".inst 0x04b075ad // sqdmulh z13.s, z13.s, z16.s\n"
+ ".inst 0x04b0758c // sqdmulh z12.s, z12.s, z16.s\n"
+ "ld1rw { z16.s }, p0/Z, [%x[shift_ptr]]\n"
+ ".inst 0x4482820f // srshl z15.s, p0/M, z15.s, z16.s\n"
+ ".inst 0x4482820e // srshl z14.s, p0/M, z14.s, z16.s\n"
+ ".inst 0x4482820d // srshl z13.s, p0/M, z13.s, z16.s\n"
+ ".inst 0x4482820c // srshl z12.s, p0/M, z12.s, z16.s\n"
+ "mov z17.s, #0x0\n"
+ "mov z16.s, #0xff\n"
+ "smax z15.s, p0/M, z15.s, z17.s\n"
+ "smax z14.s, p0/M, z14.s, z17.s\n"
+ "smax z13.s, p0/M, z13.s, z17.s\n"
+ "smax z12.s, p0/M, z12.s, z17.s\n"
+ "smin z15.s, p0/M, z15.s, z16.s\n"
+ "smin z14.s, p0/M, z14.s, z16.s\n"
+ "trn1 z17.h, z15.h, z14.h\n"
+ "smin z13.s, p0/M, z13.s, z16.s\n"
+ "smin z12.s, p0/M, z12.s, z16.s\n"
+ "trn1 z16.h, z13.h, z12.h\n"
+ "trn1 z16.b, z17.b, z16.b\n"
+ "st1b { z16.b }, p4, [%x[outptr], x27]\n"
+ "incb x27\n"
+ "whilelt p4.b, x27, %x[n_channels]\n"
+ "b.any 8b\n"
+ "14:" // End
+ ".inst 0xd503467f // SMSTOP\n"
+ :
+ : [inptrs] "r" (inptrs), [n_channels] "r" (n_channels), [n_valid_cells] "r" (n_valid_cells), [outptr] "r" (outptr), [rescale_ptr] "r" (&rescale_value), [shift_ptr] "r" (&shift_value)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace pooling
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8_nhwc_max_2x2_s1_output2x2_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8_nhwc_max_2x2_s1_output2x2_depthfirst.hpp
new file mode 100644
index 0000000000..3df4e4efb8
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8_nhwc_max_2x2_s1_output2x2_depthfirst.hpp
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#if defined(ARM_COMPUTE_ENABLE_SME)
+
+namespace arm_conv {
+namespace pooling {
+
+void sme_u8_nhwc_max_2x2_s1_output2x2_depthfirst_impl(unsigned int, const uint8_t *const *const, uint8_t *const *const, bool, unsigned int, unsigned int, unsigned int, unsigned int);
+
+struct sme_u8_nhwc_max_2x2_s1_output2x2_depthfirst : public DepthfirstStrategy<uint8_t, uint8_t>
+{
+ using Parent = DepthfirstStrategy<uint8_t, uint8_t>;
+
+ const static auto pooling_type = PoolingType::MAX;
+ const static auto pool_rows = 2u, pool_cols = 2u;
+ const static auto stride_rows = 1u, stride_cols = 1u;
+
+ sme_u8_nhwc_max_2x2_s1_output2x2_depthfirst(const CPUInfo *)
+ : Parent(pool_rows, pool_cols, stride_rows, stride_cols, 2, 2) {}
+
+ Parent::KernelType get_kernel(void) const { return sme_u8_nhwc_max_2x2_s1_output2x2_depthfirst_impl; }
+};
+
+} // namespace pooling
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp
new file mode 100644
index 0000000000..9088cbde89
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp
@@ -0,0 +1,148 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(ARM_COMPUTE_ENABLE_SME)
+
+namespace arm_conv {
+namespace pooling {
+
+void sme_u8_nhwc_max_2x2_s1_output2x2_depthfirst_impl(
+ const unsigned int n_channels,
+ const uint8_t *const *const inptrs,
+ uint8_t *const *const outptrs,
+ const bool exclude_padding,
+ const unsigned int pad_left,
+ const unsigned int pad_top,
+ const unsigned int pad_right,
+ const unsigned int pad_bottom
+)
+{
+ struct KernelArgs
+ {
+ const uint64_t n_channels;
+ const uint8_t *const *const inptrs;
+ uint8_t *const *const outptrs;
+ KernelArgs(
+ unsigned int channels,
+ const uint8_t *const *input_ptrs,
+ uint8_t *const * output_ptrs,
+ bool, unsigned int, unsigned int, unsigned int, unsigned int
+ ) : n_channels(channels),
+ inptrs(input_ptrs),
+ outptrs(output_ptrs)
+ {
+ }
+ };
+
+ const KernelArgs args(n_channels, inptrs, outptrs, exclude_padding,
+ pad_left, pad_top, pad_right, pad_bottom);
+
+ __asm__ __volatile__(
+ "ldr x21, [%x[args], %[offsetof_outptrs]]\n"
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "mov x15, #0x0\n"
+ "ptrue p2.b\n"
+ "ldr x20, [%x[args], %[offsetof_inptrs]]\n"
+ "mov x14, #0x0\n"
+ "ldr x13, [%x[args], %[offsetof_n_channels]]\n"
+ "whilelt p0.b, x15, x13\n"
+ "ldp x12, x11, [x21, #0x0]\n"
+ "ldp x10, x9, [x21, #0x10]\n"
+ "ldp x28, x27, [x20, #0x0]\n"
+ "ld1b { z30.b }, p0/Z, [x27, x15]\n"
+ "ldp x26, x25, [x20, #0x10]\n"
+ "ld1b { z29.b }, p0/Z, [x25, x15]\n"
+ "ldp x24, x23, [x20, #0x20]\n"
+ "ld1b { z28.b }, p0/Z, [x24, x15]\n"
+ "ldp x22, x21, [x20, #0x30]\n"
+ "ld1b { z27.b }, p0/Z, [x21, x15]\n"
+ "ldr x20, [x20, #0x40]\n"
+ "ld1b { z26.b }, p0/Z, [x28, x15]\n"
+ "ld1b { z25.b }, p0/Z, [x26, x15]\n"
+ "ld1b { z24.b }, p0/Z, [x23, x15]\n"
+ "ld1b { z19.b }, p0/Z, [x22, x15]\n"
+ "ld1b { z23.b }, p0/Z, [x20, x15]\n"
+ "incw x15\n"
+ "whilelt p1.b, x15, x13\n"
+ "b.none 2f\n"
+ "1:" // Vector: Loop
+ "movprfx z22, z30\n umax z22.b, p2/M, z22.b, z28.b\n"
+ "movprfx z21, z28\n umax z21.b, p2/M, z21.b, z27.b\n"
+ "ld1b { z30.b }, p1/Z, [x27, x15]\n"
+ "whilelt p0.b, x14, x13\n"
+ "movprfx z18, z29\n umax z18.b, p2/M, z18.b, z26.b\n"
+ "movprfx z17, z25\n umax z17.b, p2/M, z17.b, z24.b\n"
+ "ld1b { z28.b }, p1/Z, [x24, x15]\n"
+ "movprfx z16, z29\n umax z16.b, p2/M, z16.b, z19.b\n"
+ "movprfx z20, z24\n umax z20.b, p2/M, z20.b, z23.b\n"
+ "ld1b { z27.b }, p1/Z, [x21, x15]\n"
+ "ld1b { z29.b }, p1/Z, [x25, x15]\n"
+ "movprfx z19, z22\n umax z19.b, p2/M, z19.b, z18.b\n"
+ "movprfx z18, z17\n umax z18.b, p2/M, z18.b, z22.b\n"
+ "ld1b { z26.b }, p1/Z, [x28, x15]\n"
+ "movprfx z17, z16\n umax z17.b, p2/M, z17.b, z21.b\n"
+ "movprfx z16, z21\n umax z16.b, p2/M, z16.b, z20.b\n"
+ "ld1b { z25.b }, p1/Z, [x26, x15]\n"
+ "st1b { z19.b }, p0, [x12, x14]\n"
+ "ld1b { z24.b }, p1/Z, [x23, x15]\n"
+ "st1b { z18.b }, p0, [x11, x14]\n"
+ "ld1b { z19.b }, p1/Z, [x22, x15]\n"
+ "st1b { z17.b }, p0, [x10, x14]\n"
+ "ld1b { z23.b }, p1/Z, [x20, x15]\n"
+ "incw x15\n"
+ "whilelt p1.b, x15, x13\n"
+ "st1b { z16.b }, p0, [x9, x14]\n"
+ "incw x14\n"
+ "b.any 1b\n"
+ "2:" // Vector: Tail
+ "movprfx z22, z30\n umax z22.b, p2/M, z22.b, z28.b\n"
+ "movprfx z21, z28\n umax z21.b, p2/M, z21.b, z27.b\n"
+ "whilelt p0.b, x14, x13\n"
+ "movprfx z20, z29\n umax z20.b, p2/M, z20.b, z26.b\n"
+ "movprfx z18, z25\n umax z18.b, p2/M, z18.b, z24.b\n"
+ "movprfx z17, z29\n umax z17.b, p2/M, z17.b, z19.b\n"
+ "movprfx z19, z24\n umax z19.b, p2/M, z19.b, z23.b\n"
+ "movprfx z16, z22\n umax z16.b, p2/M, z16.b, z20.b\n"
+ "umax z18.b, p2/M, z18.b, z22.b\n"
+ "st1b { z16.b }, p0, [x12, x14]\n"
+ "umax z17.b, p2/M, z17.b, z21.b\n"
+ "movprfx z16, z21\n umax z16.b, p2/M, z16.b, z19.b\n"
+ "st1b { z18.b }, p0, [x11, x14]\n"
+ "st1b { z17.b }, p0, [x10, x14]\n"
+ "st1b { z16.b }, p0, [x9, x14]\n"
+ ".inst 0xd503467f // SMSTOP\n"
+ :
+ : [args] "r" (&args), [offsetof_inptrs] "I" (offsetof(KernelArgs, inptrs)), [offsetof_n_channels] "I" (offsetof(KernelArgs, n_channels)), [offsetof_outptrs] "I" (offsetof(KernelArgs, outptrs))
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace pooling
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8_nhwc_max_generic_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8_nhwc_max_generic_depthfirst.hpp
new file mode 100644
index 0000000000..077c8ed2f7
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8_nhwc_max_generic_depthfirst.hpp
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(ARM_COMPUTE_ENABLE_SME)
+
+namespace arm_conv {
+namespace pooling {
+
+void sme_u8_nhwc_max_generic_depthfirst_impl(const uint64_t, const uint64_t n_valid_cells, uint64_t n_channels, const uint8_t *const *const inptrs, uint8_t *outptr);
+
+struct sme_u8_nhwc_max_generic_depthfirst : IGenericDepthfirstStrategy<uint8_t, uint8_t>
+{
+ using Parent = IGenericDepthfirstStrategy<uint8_t, uint8_t>;
+ sme_u8_nhwc_max_generic_depthfirst(const CPUInfo *) {}
+ typename Parent::KernelType get_kernel(void) const override { return sme_u8_nhwc_max_generic_depthfirst_impl; }
+};
+
+} // namespace pooling
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8_nhwc_max_generic_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8_nhwc_max_generic_depthfirst/generic.cpp
new file mode 100644
index 0000000000..06f13e8111
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8_nhwc_max_generic_depthfirst/generic.cpp
@@ -0,0 +1,225 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstdint>
+#include <cstddef>
+
+#if defined(ARM_COMPUTE_ENABLE_SME)
+
+namespace arm_conv {
+namespace pooling {
+
+
+void sme_u8_nhwc_max_generic_depthfirst_impl(
+ const uint64_t,
+ const uint64_t n_valid_cells,
+ uint64_t n_channels,
+ const uint8_t *const *const inptrs,
+ uint8_t *outptr
+)
+{
+ __asm__ __volatile__(
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "mov x9, #0x0\n"
+ "cntb x28\n"
+ "cntb x27, ALL, MUL #2\n"
+ "cntb x26, ALL, MUL #3\n"
+ "whilelt p4.b, x9, %x[n_channels]\n"
+ "whilelt p3.b, x28, %x[n_channels]\n"
+ "whilelt p2.b, x27, %x[n_channels]\n"
+ "whilelt p1.b, x26, %x[n_channels]\n"
+ "ptrue p0.b\n"
+ "b.none 7f\n"
+ "1:" // 4-vectors of channels
+ "lsr x25, %x[n_valid_cells], #0x2\n"
+ "mov z4.b, #0x0\n"
+ "mov z3.b, #0x0\n"
+ "mov x24, %x[inptrs]\n"
+ "mov z2.b, #0x0\n"
+ "mov z1.b, #0x0\n"
+ "cbz x25, 4f\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "subs x25, x25, #0x1\n"
+ "ld1b { z0.b }, p4/Z, [x23, x9]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x24, x24, #0x20\n"
+ "ld1b { z31.b }, p4/Z, [x22, x9]\n"
+ "ld1b { z23.b }, p4/Z, [x21, x9]\n"
+ "ld1b { z30.b }, p4/Z, [x20, x9]\n"
+ "ld1b { z18.b }, p3/Z, [x23, x28]\n"
+ "ld1b { z29.b }, p3/Z, [x22, x28]\n"
+ "ld1b { z22.b }, p3/Z, [x21, x28]\n"
+ "ld1b { z28.b }, p3/Z, [x20, x28]\n"
+ "ld1b { z17.b }, p2/Z, [x23, x27]\n"
+ "ld1b { z27.b }, p2/Z, [x22, x27]\n"
+ "ld1b { z21.b }, p2/Z, [x21, x27]\n"
+ "ld1b { z26.b }, p2/Z, [x20, x27]\n"
+ "ld1b { z16.b }, p1/Z, [x23, x26]\n"
+ "ld1b { z25.b }, p1/Z, [x22, x26]\n"
+ "ld1b { z20.b }, p1/Z, [x21, x26]\n"
+ "ld1b { z24.b }, p1/Z, [x20, x26]\n"
+ "beq 3f\n"
+ "2:" // 4-vectors of channels: 4 inputs loop
+ "movprfx z19, z0\n umax z19.b, p0/M, z19.b, z31.b\n"
+ "umax z23.b, p0/M, z23.b, z30.b\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "subs x25, x25, #0x1\n"
+ "umax z18.b, p0/M, z18.b, z29.b\n"
+ "umax z22.b, p0/M, z22.b, z28.b\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x24, x24, #0x20\n"
+ "umax z17.b, p0/M, z17.b, z27.b\n"
+ "umax z21.b, p0/M, z21.b, z26.b\n"
+ "ld1b { z0.b }, p4/Z, [x23, x9]\n"
+ "umax z16.b, p0/M, z16.b, z25.b\n"
+ "umax z20.b, p0/M, z20.b, z24.b\n"
+ "ld1b { z31.b }, p4/Z, [x22, x9]\n"
+ "umax z19.b, p0/M, z19.b, z23.b\n"
+ "umax z18.b, p0/M, z18.b, z22.b\n"
+ "ld1b { z23.b }, p4/Z, [x21, x9]\n"
+ "umax z17.b, p0/M, z17.b, z21.b\n"
+ "umax z16.b, p0/M, z16.b, z20.b\n"
+ "ld1b { z30.b }, p4/Z, [x20, x9]\n"
+ "umax z4.b, p0/M, z4.b, z19.b\n"
+ "umax z3.b, p0/M, z3.b, z18.b\n"
+ "ld1b { z18.b }, p3/Z, [x23, x28]\n"
+ "umax z2.b, p0/M, z2.b, z17.b\n"
+ "umax z1.b, p0/M, z1.b, z16.b\n"
+ "ld1b { z29.b }, p3/Z, [x22, x28]\n"
+ "ld1b { z22.b }, p3/Z, [x21, x28]\n"
+ "ld1b { z28.b }, p3/Z, [x20, x28]\n"
+ "ld1b { z17.b }, p2/Z, [x23, x27]\n"
+ "ld1b { z27.b }, p2/Z, [x22, x27]\n"
+ "ld1b { z21.b }, p2/Z, [x21, x27]\n"
+ "ld1b { z26.b }, p2/Z, [x20, x27]\n"
+ "ld1b { z16.b }, p1/Z, [x23, x26]\n"
+ "ld1b { z25.b }, p1/Z, [x22, x26]\n"
+ "ld1b { z20.b }, p1/Z, [x21, x26]\n"
+ "ld1b { z24.b }, p1/Z, [x20, x26]\n"
+ "bgt 2b\n"
+ "3:" // 4-vectors of channels: 4 inputs tail
+ "movprfx z19, z0\n umax z19.b, p0/M, z19.b, z31.b\n"
+ "umax z23.b, p0/M, z23.b, z30.b\n"
+ "umax z18.b, p0/M, z18.b, z29.b\n"
+ "umax z22.b, p0/M, z22.b, z28.b\n"
+ "umax z17.b, p0/M, z17.b, z27.b\n"
+ "umax z21.b, p0/M, z21.b, z26.b\n"
+ "umax z16.b, p0/M, z16.b, z25.b\n"
+ "umax z20.b, p0/M, z20.b, z24.b\n"
+ "umax z19.b, p0/M, z19.b, z23.b\n"
+ "umax z18.b, p0/M, z18.b, z22.b\n"
+ "umax z17.b, p0/M, z17.b, z21.b\n"
+ "umax z16.b, p0/M, z16.b, z20.b\n"
+ "umax z4.b, p0/M, z4.b, z19.b\n"
+ "umax z3.b, p0/M, z3.b, z18.b\n"
+ "umax z2.b, p0/M, z2.b, z17.b\n"
+ "umax z1.b, p0/M, z1.b, z16.b\n"
+ "4:" // 4-vectors of channels: After loop
+ "ands x21, %x[n_valid_cells], #0x3\n"
+ "beq 6f\n"
+ "5:" // 4-vectors of channels: Single input loop
+ "ldr x20, [x24], #0x8\n"
+ "ld1b { z16.b }, p4/Z, [x20, x9]\n"
+ "subs x21, x21, #0x1\n"
+ "umax z4.b, p0/M, z4.b, z16.b\n"
+ "ld1b { z16.b }, p3/Z, [x20, x28]\n"
+ "umax z3.b, p0/M, z3.b, z16.b\n"
+ "ld1b { z16.b }, p2/Z, [x20, x27]\n"
+ "umax z2.b, p0/M, z2.b, z16.b\n"
+ "ld1b { z16.b }, p1/Z, [x20, x26]\n"
+ "umax z1.b, p0/M, z1.b, z16.b\n"
+ "bgt 5b\n"
+ "6:" // 4-vectors of channels: Single input loop: End
+ "st1b { z4.b }, p4, [%x[outptr], x9]\n"
+ "incb x9, ALL, MUL #4\n"
+ "st1b { z3.b }, p3, [%x[outptr], x28]\n"
+ "incb x28, ALL, MUL #4\n"
+ "st1b { z2.b }, p2, [%x[outptr], x27]\n"
+ "incb x27, ALL, MUL #4\n"
+ "st1b { z1.b }, p1, [%x[outptr], x26]\n"
+ "incb x26, ALL, MUL #4\n"
+ "whilelt p1.b, x26, %x[n_channels]\n"
+ "b.any 1b\n"
+ "7:" // Single vector of channels
+ "whilelt p4.b, x9, %x[n_channels]\n"
+ "b.none 14f\n"
+ "8:" // Single vector of channels: Loop
+ "lsr x25, %x[n_valid_cells], #0x2\n"
+ "mov z4.b, #0x0\n"
+ "mov x24, %x[inptrs]\n"
+ "cbz x25, 11f\n"
+ "ldp x20, x22, [x24, #0x0]\n"
+ "subs x25, x25, #0x1\n"
+ "ld1b { z0.b }, p4/Z, [x20, x9]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x24, x24, #0x20\n"
+ "ld1b { z31.b }, p4/Z, [x22, x9]\n"
+ "ld1b { z23.b }, p4/Z, [x21, x9]\n"
+ "ld1b { z30.b }, p4/Z, [x20, x9]\n"
+ "beq 10f\n"
+ "9:" // Single vector of channels: Loop: 4 inputs loop
+ "movprfx z16, z0\n umax z16.b, p0/M, z16.b, z31.b\n"
+ "movprfx z17, z23\n umax z17.b, p0/M, z17.b, z30.b\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "subs x25, x25, #0x1\n"
+ "umax z16.b, p0/M, z16.b, z17.b\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "umax z4.b, p0/M, z4.b, z16.b\n"
+ "add x24, x24, #0x20\n"
+ "ld1b { z0.b }, p4/Z, [x23, x9]\n"
+ "ld1b { z31.b }, p4/Z, [x22, x9]\n"
+ "ld1b { z23.b }, p4/Z, [x21, x9]\n"
+ "ld1b { z30.b }, p4/Z, [x20, x9]\n"
+ "bgt 9b\n"
+ "10:" // Single vector of channels: Loop: 4 inputs tail
+ "movprfx z16, z0\n umax z16.b, p0/M, z16.b, z31.b\n"
+ "movprfx z17, z23\n umax z17.b, p0/M, z17.b, z30.b\n"
+ "umax z16.b, p0/M, z16.b, z17.b\n"
+ "umax z4.b, p0/M, z4.b, z16.b\n"
+ "11:" // Single vector of channels: Loop: After loop
+ "ands x21, %x[n_valid_cells], #0x3\n"
+ "beq 13f\n"
+ "12:" // Single vector of channels: Loop: Single input loop
+ "ldr x20, [x24], #0x8\n"
+ "ld1b { z16.b }, p4/Z, [x20, x9]\n"
+ "subs x21, x21, #0x1\n"
+ "umax z4.b, p0/M, z4.b, z16.b\n"
+ "bgt 12b\n"
+ "13:" // Single vector of channels: Loop: Single input loop: End
+ "st1b { z4.b }, p4, [%x[outptr], x9]\n"
+ "incb x9\n"
+ "whilelt p4.b, x9, %x[n_channels]\n"
+ "b.any 8b\n"
+ "14:" // End
+ ".inst 0xd503467f // SMSTOP\n"
+ :
+ : [inptrs] "r" (inptrs), [n_channels] "r" (n_channels), [n_valid_cells] "r" (n_valid_cells), [outptr] "r" (outptr)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace pooling
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8q_nhwc_avg_generic_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8q_nhwc_avg_generic_depthfirst.hpp
new file mode 100644
index 0000000000..bd30a32828
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8q_nhwc_avg_generic_depthfirst.hpp
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(ARM_COMPUTE_ENABLE_SME)
+
+namespace arm_conv {
+namespace pooling {
+
+void sme_u8q_nhwc_avg_generic_depthfirst_impl(const uint64_t window_cells, const uint64_t n_valid_cells, uint64_t n_channels, const uint8_t *const *const inptrs, uint8_t *outptr, const Requantize32 &qp);
+
+struct sme_u8q_nhwc_avg_generic_depthfirst : IGenericDepthfirstStrategy<uint8_t, uint8_t, Requantize32>
+{
+ using Parent = IGenericDepthfirstStrategy<uint8_t, uint8_t, Requantize32>;
+ sme_u8q_nhwc_avg_generic_depthfirst(const CPUInfo *) {}
+ typename Parent::KernelType get_kernel(void) const override { return sme_u8q_nhwc_avg_generic_depthfirst_impl; }
+};
+
+} // namespace pooling
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8q_nhwc_avg_generic_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8q_nhwc_avg_generic_depthfirst/generic.cpp
new file mode 100644
index 0000000000..52c52ccdb9
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8q_nhwc_avg_generic_depthfirst/generic.cpp
@@ -0,0 +1,489 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "pooling.hpp"
+#include <cstdint>
+#include <cstddef>
+#include <cstring>
+#include <cmath>
+
+
+#if defined(ARM_COMPUTE_ENABLE_SME)
+
+namespace arm_conv {
+namespace pooling {
+
+namespace {
+ struct RescaleParams
+ {
+ int32_t multiplier, shift;
+ };
+
+ constexpr RescaleParams rescale_params[8] = {
+ {0x40000000, -0}, // 1/2
+ {0x55555556, -1}, // 1/3
+ {0x40000000, -1}, // 1/4
+ {0x66666666, -2}, // 1/5
+ {0x55555556, -2}, // 1/6
+ {0x49249249, -2}, // 1/7
+ {0x40000000, -2}, // 1/8
+ {0x71c71c72, -3}, // 1/9
+ };
+}
+
+void sme_u8q_nhwc_avg_generic_depthfirst_impl(
+ const uint64_t window_cells,
+ const uint64_t n_valid_cells,
+ uint64_t n_channels,
+ const uint8_t *const *const inptrs,
+ uint8_t *outptr,
+ const Requantize32 &qp
+)
+{
+ if (n_valid_cells == 1 && window_cells == 1)
+ {
+ // In this case, simply copy from the input to the output
+ std::memcpy(outptr, *inptrs, n_channels);
+ return;
+ }
+
+ // Compute (or look up) the rescale values
+ int32_t shift_value = 0, rescale_value = 0;
+ if (2 <= window_cells && window_cells <= 9)
+ {
+ auto &params = rescale_params[window_cells - 2];
+ rescale_value = params.multiplier;
+ shift_value = params.shift;
+ }
+ else
+ {
+ auto f_rescale_value = 1.0f / static_cast<float>(window_cells);
+
+ shift_value = 0;
+ while (f_rescale_value < 0.5f)
+ {
+ shift_value--;
+ f_rescale_value *= 2.0f;
+ }
+
+ int64_t long_rescale_value = round(f_rescale_value * static_cast<float>(1ll << 31));
+ if (long_rescale_value == (1ll << 31))
+ {
+ shift_value++;
+ long_rescale_value >>= 1;
+ }
+ rescale_value = static_cast<int32_t>(long_rescale_value);
+ }
+
+
+ // Initialise the accumulators such that the offsets are subtracted for all
+ // valid inputs.
+ const int32_t accumulator_init = -qp.input_offset * n_valid_cells;
+
+ // Combine together the rescale value for the requantization and the scaling
+ // factor for the average pool.
+ const int32_t shift = qp.per_layer_left_shift - qp.per_layer_right_shift + shift_value;
+ const int32_t left_shift = shift > 0 ? shift : 0;
+ const int32_t right_shift = shift <= 0 ? shift : 0;
+
+ int32_t combined_rescale_value = 0;
+ __asm__ __volatile__ (
+ "mov v16.s[0], %w[per_layer_mul]\n"
+ "mov v17.s[0], %w[rescale_value]\n"
+ "sqrdmulh s18, s16, s17\n"
+ "mov %w[combined_rescale_value], v18.s[0]\n"
+ : [combined_rescale_value] "=r" (combined_rescale_value)
+ : [per_layer_mul] "r" (qp.per_layer_mul), [rescale_value] "r" (rescale_value)
+ : "v16", "v17", "v18"
+ );
+
+ __asm__ __volatile__(
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "mov x27, #0x0\n"
+ "cntb x26\n"
+ "cntb x25, ALL, MUL #2\n"
+ "cntb x24, ALL, MUL #3\n"
+ "whilelt p4.b, x27, %x[n_channels]\n"
+ "whilelt p3.b, x26, %x[n_channels]\n"
+ "whilelt p2.b, x25, %x[n_channels]\n"
+ "whilelt p1.b, x24, %x[n_channels]\n"
+ "ptrue p0.b\n"
+ "b.none 7f\n"
+ "1:" // 4-vectors of channels
+ "ld1rw { z15.s }, p0/Z, [%x[accumulator_init]]\n"
+ "lsr x23, %x[n_valid_cells], #0x1\n"
+ "mov z14.d, z15.d\n"
+ "mov z13.d, z15.d\n"
+ "mov z12.d, z15.d\n"
+ "mov z11.d, z15.d\n"
+ "mov x22, %x[inptrs]\n"
+ "mov z10.d, z15.d\n"
+ "mov z9.d, z15.d\n"
+ "mov z8.d, z15.d\n"
+ "mov z7.d, z15.d\n"
+ "mov z6.d, z15.d\n"
+ "mov z5.d, z15.d\n"
+ "mov z4.d, z15.d\n"
+ "mov z3.d, z15.d\n"
+ "mov z2.d, z15.d\n"
+ "mov z1.d, z15.d\n"
+ "mov z0.d, z15.d\n"
+ "cbz x23, 4f\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "subs x23, x23, #0x1\n"
+ "add x22, x22, #0x10\n"
+ "ld1b { z31.b }, p4/Z, [x21, x27]\n"
+ "ld1b { z30.b }, p4/Z, [x20, x27]\n"
+ "ld1b { z29.b }, p3/Z, [x21, x26]\n"
+ "ld1b { z28.b }, p3/Z, [x20, x26]\n"
+ "ld1b { z27.b }, p2/Z, [x21, x25]\n"
+ "ld1b { z26.b }, p2/Z, [x20, x25]\n"
+ "ld1b { z25.b }, p1/Z, [x21, x24]\n"
+ "ld1b { z24.b }, p1/Z, [x20, x24]\n"
+ "beq 3f\n"
+ "2:" // 4-vectors of channels: 2 inputs loop
+ ".inst 0x455e0bf7 // uaddlb z23.h, z31.b, z30.b\n"
+ ".inst 0x455e0ff6 // uaddlt z22.h, z31.b, z30.b\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "subs x23, x23, #0x1\n"
+ ".inst 0x455c0bb5 // uaddlb z21.h, z29.b, z28.b\n"
+ ".inst 0x455c0fb4 // uaddlt z20.h, z29.b, z28.b\n"
+ "add x22, x22, #0x10\n"
+ "ld1b { z31.b }, p4/Z, [x21, x27]\n"
+ ".inst 0x455a0b73 // uaddlb z19.h, z27.b, z26.b\n"
+ ".inst 0x455a0f72 // uaddlt z18.h, z27.b, z26.b\n"
+ "ld1b { z30.b }, p4/Z, [x20, x27]\n"
+ ".inst 0x45580b31 // uaddlb z17.h, z25.b, z24.b\n"
+ ".inst 0x45580f30 // uaddlt z16.h, z25.b, z24.b\n"
+ "ld1b { z29.b }, p3/Z, [x21, x26]\n"
+ ".inst 0x459749ef // uaddwb z15.s, z15.s, z23.h\n"
+ ".inst 0x45974dce // uaddwt z14.s, z14.s, z23.h\n"
+ "ld1b { z28.b }, p3/Z, [x20, x26]\n"
+ ".inst 0x459649ad // uaddwb z13.s, z13.s, z22.h\n"
+ ".inst 0x45964d8c // uaddwt z12.s, z12.s, z22.h\n"
+ "ld1b { z27.b }, p2/Z, [x21, x25]\n"
+ ".inst 0x4595496b // uaddwb z11.s, z11.s, z21.h\n"
+ ".inst 0x45954d4a // uaddwt z10.s, z10.s, z21.h\n"
+ "ld1b { z26.b }, p2/Z, [x20, x25]\n"
+ ".inst 0x45944929 // uaddwb z9.s, z9.s, z20.h\n"
+ ".inst 0x45944d08 // uaddwt z8.s, z8.s, z20.h\n"
+ "ld1b { z25.b }, p1/Z, [x21, x24]\n"
+ ".inst 0x459348e7 // uaddwb z7.s, z7.s, z19.h\n"
+ ".inst 0x45934cc6 // uaddwt z6.s, z6.s, z19.h\n"
+ "ld1b { z24.b }, p1/Z, [x20, x24]\n"
+ ".inst 0x459248a5 // uaddwb z5.s, z5.s, z18.h\n"
+ ".inst 0x45924c84 // uaddwt z4.s, z4.s, z18.h\n"
+ ".inst 0x45914863 // uaddwb z3.s, z3.s, z17.h\n"
+ ".inst 0x45914c42 // uaddwt z2.s, z2.s, z17.h\n"
+ ".inst 0x45904821 // uaddwb z1.s, z1.s, z16.h\n"
+ ".inst 0x45904c00 // uaddwt z0.s, z0.s, z16.h\n"
+ "bgt 2b\n"
+ "3:" // 4-vectors of channels: 2 inputs tail
+ ".inst 0x455e0bf7 // uaddlb z23.h, z31.b, z30.b\n"
+ ".inst 0x455e0ff6 // uaddlt z22.h, z31.b, z30.b\n"
+ ".inst 0x455c0bb5 // uaddlb z21.h, z29.b, z28.b\n"
+ ".inst 0x455c0fb4 // uaddlt z20.h, z29.b, z28.b\n"
+ ".inst 0x455a0b73 // uaddlb z19.h, z27.b, z26.b\n"
+ ".inst 0x455a0f72 // uaddlt z18.h, z27.b, z26.b\n"
+ ".inst 0x45580b31 // uaddlb z17.h, z25.b, z24.b\n"
+ ".inst 0x45580f30 // uaddlt z16.h, z25.b, z24.b\n"
+ ".inst 0x459749ef // uaddwb z15.s, z15.s, z23.h\n"
+ ".inst 0x45974dce // uaddwt z14.s, z14.s, z23.h\n"
+ ".inst 0x459649ad // uaddwb z13.s, z13.s, z22.h\n"
+ ".inst 0x45964d8c // uaddwt z12.s, z12.s, z22.h\n"
+ ".inst 0x4595496b // uaddwb z11.s, z11.s, z21.h\n"
+ ".inst 0x45954d4a // uaddwt z10.s, z10.s, z21.h\n"
+ ".inst 0x45944929 // uaddwb z9.s, z9.s, z20.h\n"
+ ".inst 0x45944d08 // uaddwt z8.s, z8.s, z20.h\n"
+ ".inst 0x459348e7 // uaddwb z7.s, z7.s, z19.h\n"
+ ".inst 0x45934cc6 // uaddwt z6.s, z6.s, z19.h\n"
+ ".inst 0x459248a5 // uaddwb z5.s, z5.s, z18.h\n"
+ ".inst 0x45924c84 // uaddwt z4.s, z4.s, z18.h\n"
+ ".inst 0x45914863 // uaddwb z3.s, z3.s, z17.h\n"
+ ".inst 0x45914c42 // uaddwt z2.s, z2.s, z17.h\n"
+ ".inst 0x45904821 // uaddwb z1.s, z1.s, z16.h\n"
+ ".inst 0x45904c00 // uaddwt z0.s, z0.s, z16.h\n"
+ "4:" // 4-vectors of channels: After loop
+ "ands x21, %x[n_valid_cells], #0x1\n"
+ "beq 6f\n"
+ "5:" // 4-vectors of channels: Single input loop
+ "ldr x20, [x22], #0x8\n"
+ "ld1b { z16.b }, p4/Z, [x20, x27]\n"
+ ".inst 0x4508aa17 // ushllb z23.h, z16.b, #0x0\n"
+ ".inst 0x4508ae16 // ushllt z22.h, z16.b, #0x0\n"
+ "ld1b { z16.b }, p3/Z, [x20, x26]\n"
+ ".inst 0x4508aa15 // ushllb z21.h, z16.b, #0x0\n"
+ ".inst 0x4508ae14 // ushllt z20.h, z16.b, #0x0\n"
+ "subs x21, x21, #0x1\n"
+ "ld1b { z16.b }, p2/Z, [x20, x25]\n"
+ ".inst 0x4508aa13 // ushllb z19.h, z16.b, #0x0\n"
+ ".inst 0x4508ae12 // ushllt z18.h, z16.b, #0x0\n"
+ "ld1b { z16.b }, p1/Z, [x20, x24]\n"
+ ".inst 0x4508aa11 // ushllb z17.h, z16.b, #0x0\n"
+ ".inst 0x4508ae10 // ushllt z16.h, z16.b, #0x0\n"
+ ".inst 0x459749ef // uaddwb z15.s, z15.s, z23.h\n"
+ ".inst 0x45974dce // uaddwt z14.s, z14.s, z23.h\n"
+ ".inst 0x459649ad // uaddwb z13.s, z13.s, z22.h\n"
+ ".inst 0x45964d8c // uaddwt z12.s, z12.s, z22.h\n"
+ ".inst 0x4595496b // uaddwb z11.s, z11.s, z21.h\n"
+ ".inst 0x45954d4a // uaddwt z10.s, z10.s, z21.h\n"
+ ".inst 0x45944929 // uaddwb z9.s, z9.s, z20.h\n"
+ ".inst 0x45944d08 // uaddwt z8.s, z8.s, z20.h\n"
+ ".inst 0x459348e7 // uaddwb z7.s, z7.s, z19.h\n"
+ ".inst 0x45934cc6 // uaddwt z6.s, z6.s, z19.h\n"
+ ".inst 0x459248a5 // uaddwb z5.s, z5.s, z18.h\n"
+ ".inst 0x45924c84 // uaddwt z4.s, z4.s, z18.h\n"
+ ".inst 0x45914863 // uaddwb z3.s, z3.s, z17.h\n"
+ ".inst 0x45914c42 // uaddwt z2.s, z2.s, z17.h\n"
+ ".inst 0x45904821 // uaddwb z1.s, z1.s, z16.h\n"
+ ".inst 0x45904c00 // uaddwt z0.s, z0.s, z16.h\n"
+ "bgt 5b\n"
+ "6:" // 4-vectors of channels: Single input loop: End
+ "ld1rw { z19.s }, p0/Z, [%x[left_shift]]\n"
+ ".inst 0x4482826f // srshl z15.s, p0/M, z15.s, z19.s\n"
+ ".inst 0x4482826e // srshl z14.s, p0/M, z14.s, z19.s\n"
+ "add x20, %x[quant_params], %[offsetof_qp_output_offset]\n"
+ ".inst 0x4482826d // srshl z13.s, p0/M, z13.s, z19.s\n"
+ ".inst 0x4482826c // srshl z12.s, p0/M, z12.s, z19.s\n"
+ "ld1rw { z18.s }, p0/Z, [%x[combined_rescale_value]]\n"
+ ".inst 0x4482826b // srshl z11.s, p0/M, z11.s, z19.s\n"
+ ".inst 0x4482826a // srshl z10.s, p0/M, z10.s, z19.s\n"
+ "ld1rw { z17.s }, p0/Z, [%x[right_shift]]\n"
+ ".inst 0x44828269 // srshl z9.s, p0/M, z9.s, z19.s\n"
+ ".inst 0x44828268 // srshl z8.s, p0/M, z8.s, z19.s\n"
+ "ld1rw { z16.s }, p0/Z, [x20]\n"
+ ".inst 0x44828267 // srshl z7.s, p0/M, z7.s, z19.s\n"
+ ".inst 0x44828266 // srshl z6.s, p0/M, z6.s, z19.s\n"
+ ".inst 0x44828265 // srshl z5.s, p0/M, z5.s, z19.s\n"
+ ".inst 0x44828264 // srshl z4.s, p0/M, z4.s, z19.s\n"
+ ".inst 0x44828263 // srshl z3.s, p0/M, z3.s, z19.s\n"
+ ".inst 0x44828262 // srshl z2.s, p0/M, z2.s, z19.s\n"
+ ".inst 0x44828261 // srshl z1.s, p0/M, z1.s, z19.s\n"
+ ".inst 0x44828260 // srshl z0.s, p0/M, z0.s, z19.s\n"
+ ".inst 0x04b275ef // sqrdmulh z15.s, z15.s, z18.s\n"
+ ".inst 0x04b275ce // sqrdmulh z14.s, z14.s, z18.s\n"
+ ".inst 0x04b275ad // sqrdmulh z13.s, z13.s, z18.s\n"
+ ".inst 0x04b2758c // sqrdmulh z12.s, z12.s, z18.s\n"
+ ".inst 0x04b2756b // sqrdmulh z11.s, z11.s, z18.s\n"
+ ".inst 0x04b2754a // sqrdmulh z10.s, z10.s, z18.s\n"
+ ".inst 0x04b27529 // sqrdmulh z9.s, z9.s, z18.s\n"
+ ".inst 0x04b27508 // sqrdmulh z8.s, z8.s, z18.s\n"
+ ".inst 0x04b274e7 // sqrdmulh z7.s, z7.s, z18.s\n"
+ ".inst 0x04b274c6 // sqrdmulh z6.s, z6.s, z18.s\n"
+ ".inst 0x04b274a5 // sqrdmulh z5.s, z5.s, z18.s\n"
+ ".inst 0x04b27484 // sqrdmulh z4.s, z4.s, z18.s\n"
+ ".inst 0x04b27463 // sqrdmulh z3.s, z3.s, z18.s\n"
+ ".inst 0x04b27442 // sqrdmulh z2.s, z2.s, z18.s\n"
+ ".inst 0x04b27421 // sqrdmulh z1.s, z1.s, z18.s\n"
+ ".inst 0x04b27400 // sqrdmulh z0.s, z0.s, z18.s\n"
+ ".inst 0x4482822f // srshl z15.s, p0/M, z15.s, z17.s\n"
+ ".inst 0x4482822e // srshl z14.s, p0/M, z14.s, z17.s\n"
+ ".inst 0x4482822d // srshl z13.s, p0/M, z13.s, z17.s\n"
+ ".inst 0x4482822c // srshl z12.s, p0/M, z12.s, z17.s\n"
+ ".inst 0x4482822b // srshl z11.s, p0/M, z11.s, z17.s\n"
+ ".inst 0x4482822a // srshl z10.s, p0/M, z10.s, z17.s\n"
+ ".inst 0x44828229 // srshl z9.s, p0/M, z9.s, z17.s\n"
+ ".inst 0x44828228 // srshl z8.s, p0/M, z8.s, z17.s\n"
+ ".inst 0x44828227 // srshl z7.s, p0/M, z7.s, z17.s\n"
+ ".inst 0x44828226 // srshl z6.s, p0/M, z6.s, z17.s\n"
+ ".inst 0x44828225 // srshl z5.s, p0/M, z5.s, z17.s\n"
+ ".inst 0x44828224 // srshl z4.s, p0/M, z4.s, z17.s\n"
+ ".inst 0x44828223 // srshl z3.s, p0/M, z3.s, z17.s\n"
+ ".inst 0x44828222 // srshl z2.s, p0/M, z2.s, z17.s\n"
+ ".inst 0x44828221 // srshl z1.s, p0/M, z1.s, z17.s\n"
+ ".inst 0x44828220 // srshl z0.s, p0/M, z0.s, z17.s\n"
+ "add z15.s, z15.s, z16.s\n"
+ "add z14.s, z14.s, z16.s\n"
+ "add z13.s, z13.s, z16.s\n"
+ "add z12.s, z12.s, z16.s\n"
+ "add z11.s, z11.s, z16.s\n"
+ "add z10.s, z10.s, z16.s\n"
+ "add z9.s, z9.s, z16.s\n"
+ "add z8.s, z8.s, z16.s\n"
+ "add z7.s, z7.s, z16.s\n"
+ "add z6.s, z6.s, z16.s\n"
+ "add z5.s, z5.s, z16.s\n"
+ "add z4.s, z4.s, z16.s\n"
+ "add z3.s, z3.s, z16.s\n"
+ "add z2.s, z2.s, z16.s\n"
+ "add z1.s, z1.s, z16.s\n"
+ "add z0.s, z0.s, z16.s\n"
+ "mov z16.s, #0x0\n"
+ "mov z19.s, #0xff\n"
+ "smax z15.s, p0/M, z15.s, z16.s\n"
+ "smax z14.s, p0/M, z14.s, z16.s\n"
+ "smax z13.s, p0/M, z13.s, z16.s\n"
+ "smax z12.s, p0/M, z12.s, z16.s\n"
+ "smax z11.s, p0/M, z11.s, z16.s\n"
+ "smax z10.s, p0/M, z10.s, z16.s\n"
+ "smax z9.s, p0/M, z9.s, z16.s\n"
+ "smax z8.s, p0/M, z8.s, z16.s\n"
+ "smax z7.s, p0/M, z7.s, z16.s\n"
+ "smax z6.s, p0/M, z6.s, z16.s\n"
+ "smax z5.s, p0/M, z5.s, z16.s\n"
+ "smax z4.s, p0/M, z4.s, z16.s\n"
+ "smax z3.s, p0/M, z3.s, z16.s\n"
+ "smax z2.s, p0/M, z2.s, z16.s\n"
+ "smax z1.s, p0/M, z1.s, z16.s\n"
+ "smax z0.s, p0/M, z0.s, z16.s\n"
+ "smin z15.s, p0/M, z15.s, z19.s\n"
+ "smin z14.s, p0/M, z14.s, z19.s\n"
+ "trn1 z23.h, z15.h, z14.h\n"
+ "smin z13.s, p0/M, z13.s, z19.s\n"
+ "smin z12.s, p0/M, z12.s, z19.s\n"
+ "trn1 z16.h, z13.h, z12.h\n"
+ "smin z11.s, p0/M, z11.s, z19.s\n"
+ "smin z10.s, p0/M, z10.s, z19.s\n"
+ "trn1 z22.h, z11.h, z10.h\n"
+ "smin z9.s, p0/M, z9.s, z19.s\n"
+ "smin z8.s, p0/M, z8.s, z19.s\n"
+ "trn1 z18.h, z9.h, z8.h\n"
+ "smin z7.s, p0/M, z7.s, z19.s\n"
+ "smin z6.s, p0/M, z6.s, z19.s\n"
+ "trn1 z21.h, z7.h, z6.h\n"
+ "smin z5.s, p0/M, z5.s, z19.s\n"
+ "smin z4.s, p0/M, z4.s, z19.s\n"
+ "trn1 z17.h, z5.h, z4.h\n"
+ "smin z3.s, p0/M, z3.s, z19.s\n"
+ "smin z2.s, p0/M, z2.s, z19.s\n"
+ "trn1 z20.h, z3.h, z2.h\n"
+ "smin z1.s, p0/M, z1.s, z19.s\n"
+ "smin z0.s, p0/M, z0.s, z19.s\n"
+ "trn1 z19.h, z1.h, z0.h\n"
+ "trn1 z16.b, z23.b, z16.b\n"
+ "trn1 z18.b, z22.b, z18.b\n"
+ "st1b { z16.b }, p4, [%x[outptr], x27]\n"
+ "incb x27, ALL, MUL #4\n"
+ "trn1 z17.b, z21.b, z17.b\n"
+ "trn1 z16.b, z20.b, z19.b\n"
+ "st1b { z18.b }, p3, [%x[outptr], x26]\n"
+ "incb x26, ALL, MUL #4\n"
+ "st1b { z17.b }, p2, [%x[outptr], x25]\n"
+ "incb x25, ALL, MUL #4\n"
+ "st1b { z16.b }, p1, [%x[outptr], x24]\n"
+ "incb x24, ALL, MUL #4\n"
+ "whilelt p1.b, x24, %x[n_channels]\n"
+ "b.any 1b\n"
+ "7:" // Single vector of channels
+ "whilelt p4.b, x27, %x[n_channels]\n"
+ "b.none 14f\n"
+ "8:" // Single vector of channels: Loop
+ "ld1rw { z15.s }, p0/Z, [%x[accumulator_init]]\n"
+ "lsr x23, %x[n_valid_cells], #0x1\n"
+ "mov z14.d, z15.d\n"
+ "mov z13.d, z15.d\n"
+ "mov z12.d, z15.d\n"
+ "mov x22, %x[inptrs]\n"
+ "cbz x23, 11f\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "subs x23, x23, #0x1\n"
+ "add x22, x22, #0x10\n"
+ "ld1b { z31.b }, p4/Z, [x21, x27]\n"
+ "ld1b { z30.b }, p4/Z, [x20, x27]\n"
+ "beq 10f\n"
+ "9:" // Single vector of channels: Loop: 2 inputs loop
+ ".inst 0x455e0bf1 // uaddlb z17.h, z31.b, z30.b\n"
+ ".inst 0x455e0ff0 // uaddlt z16.h, z31.b, z30.b\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "subs x23, x23, #0x1\n"
+ ".inst 0x459149ef // uaddwb z15.s, z15.s, z17.h\n"
+ ".inst 0x45914dce // uaddwt z14.s, z14.s, z17.h\n"
+ "add x22, x22, #0x10\n"
+ "ld1b { z31.b }, p4/Z, [x21, x27]\n"
+ ".inst 0x459049ad // uaddwb z13.s, z13.s, z16.h\n"
+ ".inst 0x45904d8c // uaddwt z12.s, z12.s, z16.h\n"
+ "ld1b { z30.b }, p4/Z, [x20, x27]\n"
+ "bgt 9b\n"
+ "10:" // Single vector of channels: Loop: 2 inputs tail
+ ".inst 0x455e0bf1 // uaddlb z17.h, z31.b, z30.b\n"
+ ".inst 0x455e0ff0 // uaddlt z16.h, z31.b, z30.b\n"
+ ".inst 0x459149ef // uaddwb z15.s, z15.s, z17.h\n"
+ ".inst 0x45914dce // uaddwt z14.s, z14.s, z17.h\n"
+ ".inst 0x459049ad // uaddwb z13.s, z13.s, z16.h\n"
+ ".inst 0x45904d8c // uaddwt z12.s, z12.s, z16.h\n"
+ "11:" // Single vector of channels: Loop: After loop
+ "ands x21, %x[n_valid_cells], #0x1\n"
+ "beq 13f\n"
+ "12:" // Single vector of channels: Loop: Single input loop
+ "ldr x20, [x22], #0x8\n"
+ "ld1b { z16.b }, p4/Z, [x20, x27]\n"
+ ".inst 0x4508aa11 // ushllb z17.h, z16.b, #0x0\n"
+ ".inst 0x4508ae10 // ushllt z16.h, z16.b, #0x0\n"
+ "subs x21, x21, #0x1\n"
+ ".inst 0x459149ef // uaddwb z15.s, z15.s, z17.h\n"
+ ".inst 0x45914dce // uaddwt z14.s, z14.s, z17.h\n"
+ ".inst 0x459049ad // uaddwb z13.s, z13.s, z16.h\n"
+ ".inst 0x45904d8c // uaddwt z12.s, z12.s, z16.h\n"
+ "bgt 12b\n"
+ "13:" // Single vector of channels: Loop: Single input loop: End
+ "ld1rw { z16.s }, p0/Z, [%x[left_shift]]\n"
+ ".inst 0x4482820f // srshl z15.s, p0/M, z15.s, z16.s\n"
+ ".inst 0x4482820e // srshl z14.s, p0/M, z14.s, z16.s\n"
+ "add x20, %x[quant_params], %[offsetof_qp_output_offset]\n"
+ ".inst 0x4482820d // srshl z13.s, p0/M, z13.s, z16.s\n"
+ ".inst 0x4482820c // srshl z12.s, p0/M, z12.s, z16.s\n"
+ "ld1rw { z16.s }, p0/Z, [%x[combined_rescale_value]]\n"
+ ".inst 0x04b075ef // sqrdmulh z15.s, z15.s, z16.s\n"
+ ".inst 0x04b075ce // sqrdmulh z14.s, z14.s, z16.s\n"
+ "ld1rw { z17.s }, p0/Z, [%x[right_shift]]\n"
+ ".inst 0x04b075ad // sqrdmulh z13.s, z13.s, z16.s\n"
+ ".inst 0x04b0758c // sqrdmulh z12.s, z12.s, z16.s\n"
+ "ld1rw { z16.s }, p0/Z, [x20]\n"
+ ".inst 0x4482822f // srshl z15.s, p0/M, z15.s, z17.s\n"
+ ".inst 0x4482822e // srshl z14.s, p0/M, z14.s, z17.s\n"
+ ".inst 0x4482822d // srshl z13.s, p0/M, z13.s, z17.s\n"
+ ".inst 0x4482822c // srshl z12.s, p0/M, z12.s, z17.s\n"
+ "add z15.s, z15.s, z16.s\n"
+ "add z14.s, z14.s, z16.s\n"
+ "add z13.s, z13.s, z16.s\n"
+ "add z12.s, z12.s, z16.s\n"
+ "mov z17.s, #0x0\n"
+ "mov z16.s, #0xff\n"
+ "smax z15.s, p0/M, z15.s, z17.s\n"
+ "smax z14.s, p0/M, z14.s, z17.s\n"
+ "smax z13.s, p0/M, z13.s, z17.s\n"
+ "smax z12.s, p0/M, z12.s, z17.s\n"
+ "smin z15.s, p0/M, z15.s, z16.s\n"
+ "smin z14.s, p0/M, z14.s, z16.s\n"
+ "trn1 z17.h, z15.h, z14.h\n"
+ "smin z13.s, p0/M, z13.s, z16.s\n"
+ "smin z12.s, p0/M, z12.s, z16.s\n"
+ "trn1 z16.h, z13.h, z12.h\n"
+ "trn1 z16.b, z17.b, z16.b\n"
+ "st1b { z16.b }, p4, [%x[outptr], x27]\n"
+ "incb x27\n"
+ "whilelt p4.b, x27, %x[n_channels]\n"
+ "b.any 8b\n"
+ "14:" // End
+ ".inst 0xd503467f // SMSTOP\n"
+ :
+ : [accumulator_init] "r" (&accumulator_init), [combined_rescale_value] "r" (&combined_rescale_value), [inptrs] "r" (inptrs), [left_shift] "r" (&left_shift), [n_channels] "r" (n_channels), [n_valid_cells] "r" (n_valid_cells), [offsetof_qp_output_offset] "I" (offsetof(Requantize32, output_offset)), [outptr] "r" (outptr), [quant_params] "r" (&qp), [right_shift] "r" (&right_shift)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace pooling
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8q_nhwc_max_generic_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8q_nhwc_max_generic_depthfirst.hpp
new file mode 100644
index 0000000000..69d627c047
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8q_nhwc_max_generic_depthfirst.hpp
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(ARM_COMPUTE_ENABLE_SME)
+
+namespace arm_conv {
+namespace pooling {
+
+void sme_u8q_nhwc_max_generic_depthfirst_impl(const uint64_t, const uint64_t n_valid_cells, uint64_t n_channels, const uint8_t *const *const inptrs, uint8_t *outptr, const Requantize32 &qp);
+
+struct sme_u8q_nhwc_max_generic_depthfirst : IGenericDepthfirstStrategy<uint8_t, uint8_t, Requantize32>
+{
+ using Parent = IGenericDepthfirstStrategy<uint8_t, uint8_t, Requantize32>;
+ sme_u8q_nhwc_max_generic_depthfirst(const CPUInfo *) {}
+ typename Parent::KernelType get_kernel(void) const override { return sme_u8q_nhwc_max_generic_depthfirst_impl; }
+};
+
+} // namespace pooling
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8q_nhwc_max_generic_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8q_nhwc_max_generic_depthfirst/generic.cpp
new file mode 100644
index 0000000000..c8e8e7d399
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8q_nhwc_max_generic_depthfirst/generic.cpp
@@ -0,0 +1,418 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "pooling.hpp"
+#include <cstdint>
+#include <cstddef>
+
+#if defined(ARM_COMPUTE_ENABLE_SME)
+
+namespace arm_conv {
+namespace pooling {
+
+
+void sme_u8q_nhwc_max_generic_depthfirst_impl(
+ const uint64_t,
+ const uint64_t n_valid_cells,
+ uint64_t n_channels,
+ const uint8_t *const *const inptrs,
+ uint8_t *outptr,
+ const Requantize32 &qp
+)
+{
+ __asm__ __volatile__(
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "mov x9, #0x0\n"
+ "cntb x28\n"
+ "cntb x27, ALL, MUL #2\n"
+ "cntb x26, ALL, MUL #3\n"
+ "whilelt p4.b, x9, %x[n_channels]\n"
+ "whilelt p3.b, x28, %x[n_channels]\n"
+ "whilelt p2.b, x27, %x[n_channels]\n"
+ "whilelt p1.b, x26, %x[n_channels]\n"
+ "ptrue p0.b\n"
+ "b.none 7f\n"
+ "1:" // 4-vectors of channels
+ "lsr x25, %x[n_valid_cells], #0x2\n"
+ "mov z5.b, #0x0\n"
+ "mov z3.b, #0x0\n"
+ "mov x24, %x[inptrs]\n"
+ "mov z2.b, #0x0\n"
+ "mov z1.b, #0x0\n"
+ "cbz x25, 4f\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "subs x25, x25, #0x1\n"
+ "ld1b { z0.b }, p4/Z, [x23, x9]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x24, x24, #0x20\n"
+ "ld1b { z31.b }, p4/Z, [x22, x9]\n"
+ "ld1b { z23.b }, p4/Z, [x21, x9]\n"
+ "ld1b { z30.b }, p4/Z, [x20, x9]\n"
+ "ld1b { z18.b }, p3/Z, [x23, x28]\n"
+ "ld1b { z29.b }, p3/Z, [x22, x28]\n"
+ "ld1b { z22.b }, p3/Z, [x21, x28]\n"
+ "ld1b { z28.b }, p3/Z, [x20, x28]\n"
+ "ld1b { z17.b }, p2/Z, [x23, x27]\n"
+ "ld1b { z27.b }, p2/Z, [x22, x27]\n"
+ "ld1b { z21.b }, p2/Z, [x21, x27]\n"
+ "ld1b { z26.b }, p2/Z, [x20, x27]\n"
+ "ld1b { z16.b }, p1/Z, [x23, x26]\n"
+ "ld1b { z25.b }, p1/Z, [x22, x26]\n"
+ "ld1b { z20.b }, p1/Z, [x21, x26]\n"
+ "ld1b { z24.b }, p1/Z, [x20, x26]\n"
+ "beq 3f\n"
+ "2:" // 4-vectors of channels: 4 inputs loop
+ "movprfx z19, z0\n umax z19.b, p0/M, z19.b, z31.b\n"
+ "umax z23.b, p0/M, z23.b, z30.b\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "subs x25, x25, #0x1\n"
+ "umax z18.b, p0/M, z18.b, z29.b\n"
+ "umax z22.b, p0/M, z22.b, z28.b\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x24, x24, #0x20\n"
+ "umax z17.b, p0/M, z17.b, z27.b\n"
+ "umax z21.b, p0/M, z21.b, z26.b\n"
+ "ld1b { z0.b }, p4/Z, [x23, x9]\n"
+ "umax z16.b, p0/M, z16.b, z25.b\n"
+ "umax z20.b, p0/M, z20.b, z24.b\n"
+ "ld1b { z31.b }, p4/Z, [x22, x9]\n"
+ "umax z19.b, p0/M, z19.b, z23.b\n"
+ "umax z18.b, p0/M, z18.b, z22.b\n"
+ "ld1b { z23.b }, p4/Z, [x21, x9]\n"
+ "umax z17.b, p0/M, z17.b, z21.b\n"
+ "umax z16.b, p0/M, z16.b, z20.b\n"
+ "ld1b { z30.b }, p4/Z, [x20, x9]\n"
+ "umax z5.b, p0/M, z5.b, z19.b\n"
+ "umax z3.b, p0/M, z3.b, z18.b\n"
+ "ld1b { z18.b }, p3/Z, [x23, x28]\n"
+ "umax z2.b, p0/M, z2.b, z17.b\n"
+ "umax z1.b, p0/M, z1.b, z16.b\n"
+ "ld1b { z29.b }, p3/Z, [x22, x28]\n"
+ "ld1b { z22.b }, p3/Z, [x21, x28]\n"
+ "ld1b { z28.b }, p3/Z, [x20, x28]\n"
+ "ld1b { z17.b }, p2/Z, [x23, x27]\n"
+ "ld1b { z27.b }, p2/Z, [x22, x27]\n"
+ "ld1b { z21.b }, p2/Z, [x21, x27]\n"
+ "ld1b { z26.b }, p2/Z, [x20, x27]\n"
+ "ld1b { z16.b }, p1/Z, [x23, x26]\n"
+ "ld1b { z25.b }, p1/Z, [x22, x26]\n"
+ "ld1b { z20.b }, p1/Z, [x21, x26]\n"
+ "ld1b { z24.b }, p1/Z, [x20, x26]\n"
+ "bgt 2b\n"
+ "3:" // 4-vectors of channels: 4 inputs tail
+ "movprfx z19, z0\n umax z19.b, p0/M, z19.b, z31.b\n"
+ "umax z23.b, p0/M, z23.b, z30.b\n"
+ "umax z18.b, p0/M, z18.b, z29.b\n"
+ "umax z22.b, p0/M, z22.b, z28.b\n"
+ "umax z17.b, p0/M, z17.b, z27.b\n"
+ "umax z21.b, p0/M, z21.b, z26.b\n"
+ "umax z16.b, p0/M, z16.b, z25.b\n"
+ "umax z20.b, p0/M, z20.b, z24.b\n"
+ "umax z19.b, p0/M, z19.b, z23.b\n"
+ "umax z18.b, p0/M, z18.b, z22.b\n"
+ "umax z17.b, p0/M, z17.b, z21.b\n"
+ "umax z16.b, p0/M, z16.b, z20.b\n"
+ "umax z5.b, p0/M, z5.b, z19.b\n"
+ "umax z3.b, p0/M, z3.b, z18.b\n"
+ "umax z2.b, p0/M, z2.b, z17.b\n"
+ "umax z1.b, p0/M, z1.b, z16.b\n"
+ "4:" // 4-vectors of channels: After loop
+ "ands x21, %x[n_valid_cells], #0x3\n"
+ "beq 6f\n"
+ "5:" // 4-vectors of channels: Single input loop
+ "ldr x20, [x24], #0x8\n"
+ "ld1b { z16.b }, p4/Z, [x20, x9]\n"
+ "subs x21, x21, #0x1\n"
+ "umax z5.b, p0/M, z5.b, z16.b\n"
+ "ld1b { z16.b }, p3/Z, [x20, x28]\n"
+ "umax z3.b, p0/M, z3.b, z16.b\n"
+ "ld1b { z16.b }, p2/Z, [x20, x27]\n"
+ "umax z2.b, p0/M, z2.b, z16.b\n"
+ "ld1b { z16.b }, p1/Z, [x20, x26]\n"
+ "umax z1.b, p0/M, z1.b, z16.b\n"
+ "bgt 5b\n"
+ "6:" // 4-vectors of channels: Single input loop: End
+ "add x20, %x[quant_params], %[offsetof_qp_input_offset]\n"
+ "ld1rw { z4.s }, p0/Z, [x20]\n"
+ ".inst 0x4508a8b7 // ushllb z23.h, z5.b, #0x0\n"
+ ".inst 0x4508acb9 // ushllt z25.h, z5.b, #0x0\n"
+ ".inst 0x4508a876 // ushllb z22.h, z3.b, #0x0\n"
+ ".inst 0x4508ac72 // ushllt z18.h, z3.b, #0x0\n"
+ "add x20, %x[quant_params], %[offsetof_qp_per_layer_left_shift]\n"
+ "ld1rw { z3.s }, p0/Z, [x20]\n"
+ ".inst 0x4508a855 // ushllb z21.h, z2.b, #0x0\n"
+ ".inst 0x4508ac51 // ushllt z17.h, z2.b, #0x0\n"
+ "add x20, %x[quant_params], %[offsetof_qp_per_layer_mul]\n"
+ "ld1rw { z2.s }, p0/Z, [x20]\n"
+ ".inst 0x4508a834 // ushllb z20.h, z1.b, #0x0\n"
+ ".inst 0x4508ac38 // ushllt z24.h, z1.b, #0x0\n"
+ "add x20, %x[quant_params], %[offsetof_qp_per_layer_right_shift]\n"
+ "ld1rw { z19.s }, p0/Z, [x20]\n"
+ "neg z4.s, p0/M, z4.s\n"
+ ".inst 0x45974081 // saddwb z1.s, z4.s, z23.h\n"
+ "add x20, %x[quant_params], %[offsetof_qp_output_offset]\n"
+ "ld1rw { z16.s }, p0/Z, [x20]\n"
+ ".inst 0x45974497 // saddwt z23.s, z4.s, z23.h\n"
+ ".inst 0x45994080 // saddwb z0.s, z4.s, z25.h\n"
+ ".inst 0x4599449f // saddwt z31.s, z4.s, z25.h\n"
+ ".inst 0x4596409e // saddwb z30.s, z4.s, z22.h\n"
+ ".inst 0x45964496 // saddwt z22.s, z4.s, z22.h\n"
+ ".inst 0x4592409d // saddwb z29.s, z4.s, z18.h\n"
+ ".inst 0x45924492 // saddwt z18.s, z4.s, z18.h\n"
+ ".inst 0x4595409c // saddwb z28.s, z4.s, z21.h\n"
+ ".inst 0x45954495 // saddwt z21.s, z4.s, z21.h\n"
+ ".inst 0x4591409b // saddwb z27.s, z4.s, z17.h\n"
+ ".inst 0x45914491 // saddwt z17.s, z4.s, z17.h\n"
+ ".inst 0x4594409a // saddwb z26.s, z4.s, z20.h\n"
+ ".inst 0x45944494 // saddwt z20.s, z4.s, z20.h\n"
+ ".inst 0x45984099 // saddwb z25.s, z4.s, z24.h\n"
+ ".inst 0x45984498 // saddwt z24.s, z4.s, z24.h\n"
+ ".inst 0x44828061 // srshl z1.s, p0/M, z1.s, z3.s\n"
+ ".inst 0x44828077 // srshl z23.s, p0/M, z23.s, z3.s\n"
+ ".inst 0x44828060 // srshl z0.s, p0/M, z0.s, z3.s\n"
+ ".inst 0x4482807f // srshl z31.s, p0/M, z31.s, z3.s\n"
+ ".inst 0x4482807e // srshl z30.s, p0/M, z30.s, z3.s\n"
+ ".inst 0x44828076 // srshl z22.s, p0/M, z22.s, z3.s\n"
+ ".inst 0x4482807d // srshl z29.s, p0/M, z29.s, z3.s\n"
+ ".inst 0x44828072 // srshl z18.s, p0/M, z18.s, z3.s\n"
+ ".inst 0x4482807c // srshl z28.s, p0/M, z28.s, z3.s\n"
+ ".inst 0x44828075 // srshl z21.s, p0/M, z21.s, z3.s\n"
+ ".inst 0x4482807b // srshl z27.s, p0/M, z27.s, z3.s\n"
+ ".inst 0x44828071 // srshl z17.s, p0/M, z17.s, z3.s\n"
+ ".inst 0x4482807a // srshl z26.s, p0/M, z26.s, z3.s\n"
+ ".inst 0x44828074 // srshl z20.s, p0/M, z20.s, z3.s\n"
+ ".inst 0x44828079 // srshl z25.s, p0/M, z25.s, z3.s\n"
+ ".inst 0x44828078 // srshl z24.s, p0/M, z24.s, z3.s\n"
+ ".inst 0x04a27421 // sqrdmulh z1.s, z1.s, z2.s\n"
+ ".inst 0x04a276f7 // sqrdmulh z23.s, z23.s, z2.s\n"
+ ".inst 0x04a27400 // sqrdmulh z0.s, z0.s, z2.s\n"
+ ".inst 0x04a277ff // sqrdmulh z31.s, z31.s, z2.s\n"
+ ".inst 0x04a277de // sqrdmulh z30.s, z30.s, z2.s\n"
+ ".inst 0x04a276d6 // sqrdmulh z22.s, z22.s, z2.s\n"
+ ".inst 0x04a277bd // sqrdmulh z29.s, z29.s, z2.s\n"
+ ".inst 0x04a27652 // sqrdmulh z18.s, z18.s, z2.s\n"
+ ".inst 0x04a2779c // sqrdmulh z28.s, z28.s, z2.s\n"
+ ".inst 0x04a276b5 // sqrdmulh z21.s, z21.s, z2.s\n"
+ ".inst 0x04a2777b // sqrdmulh z27.s, z27.s, z2.s\n"
+ ".inst 0x04a27631 // sqrdmulh z17.s, z17.s, z2.s\n"
+ ".inst 0x04a2775a // sqrdmulh z26.s, z26.s, z2.s\n"
+ ".inst 0x04a27694 // sqrdmulh z20.s, z20.s, z2.s\n"
+ ".inst 0x04a27739 // sqrdmulh z25.s, z25.s, z2.s\n"
+ ".inst 0x04a27718 // sqrdmulh z24.s, z24.s, z2.s\n"
+ ".inst 0x44828261 // srshl z1.s, p0/M, z1.s, z19.s\n"
+ ".inst 0x44828277 // srshl z23.s, p0/M, z23.s, z19.s\n"
+ ".inst 0x44828260 // srshl z0.s, p0/M, z0.s, z19.s\n"
+ ".inst 0x4482827f // srshl z31.s, p0/M, z31.s, z19.s\n"
+ ".inst 0x4482827e // srshl z30.s, p0/M, z30.s, z19.s\n"
+ ".inst 0x44828276 // srshl z22.s, p0/M, z22.s, z19.s\n"
+ ".inst 0x4482827d // srshl z29.s, p0/M, z29.s, z19.s\n"
+ ".inst 0x44828272 // srshl z18.s, p0/M, z18.s, z19.s\n"
+ ".inst 0x4482827c // srshl z28.s, p0/M, z28.s, z19.s\n"
+ ".inst 0x44828275 // srshl z21.s, p0/M, z21.s, z19.s\n"
+ ".inst 0x4482827b // srshl z27.s, p0/M, z27.s, z19.s\n"
+ ".inst 0x44828271 // srshl z17.s, p0/M, z17.s, z19.s\n"
+ ".inst 0x4482827a // srshl z26.s, p0/M, z26.s, z19.s\n"
+ ".inst 0x44828274 // srshl z20.s, p0/M, z20.s, z19.s\n"
+ ".inst 0x44828279 // srshl z25.s, p0/M, z25.s, z19.s\n"
+ ".inst 0x44828278 // srshl z24.s, p0/M, z24.s, z19.s\n"
+ "add z1.s, z1.s, z16.s\n"
+ "add z23.s, z23.s, z16.s\n"
+ "add z0.s, z0.s, z16.s\n"
+ "add z31.s, z31.s, z16.s\n"
+ "add z30.s, z30.s, z16.s\n"
+ "add z22.s, z22.s, z16.s\n"
+ "add z29.s, z29.s, z16.s\n"
+ "add z18.s, z18.s, z16.s\n"
+ "add z28.s, z28.s, z16.s\n"
+ "add z21.s, z21.s, z16.s\n"
+ "add z27.s, z27.s, z16.s\n"
+ "add z17.s, z17.s, z16.s\n"
+ "add z26.s, z26.s, z16.s\n"
+ "add z20.s, z20.s, z16.s\n"
+ "add z25.s, z25.s, z16.s\n"
+ "add z24.s, z24.s, z16.s\n"
+ "mov z16.s, #0x0\n"
+ "mov z19.s, #0xff\n"
+ "smax z1.s, p0/M, z1.s, z16.s\n"
+ "smax z23.s, p0/M, z23.s, z16.s\n"
+ "smax z0.s, p0/M, z0.s, z16.s\n"
+ "smax z31.s, p0/M, z31.s, z16.s\n"
+ "smax z30.s, p0/M, z30.s, z16.s\n"
+ "smax z22.s, p0/M, z22.s, z16.s\n"
+ "smax z29.s, p0/M, z29.s, z16.s\n"
+ "smax z18.s, p0/M, z18.s, z16.s\n"
+ "smax z28.s, p0/M, z28.s, z16.s\n"
+ "smax z21.s, p0/M, z21.s, z16.s\n"
+ "smax z27.s, p0/M, z27.s, z16.s\n"
+ "smax z17.s, p0/M, z17.s, z16.s\n"
+ "smax z26.s, p0/M, z26.s, z16.s\n"
+ "smax z20.s, p0/M, z20.s, z16.s\n"
+ "smax z25.s, p0/M, z25.s, z16.s\n"
+ "smax z24.s, p0/M, z24.s, z16.s\n"
+ "smin z1.s, p0/M, z1.s, z19.s\n"
+ "smin z23.s, p0/M, z23.s, z19.s\n"
+ "smin z0.s, p0/M, z0.s, z19.s\n"
+ "trn1 z23.h, z1.h, z23.h\n"
+ "smin z31.s, p0/M, z31.s, z19.s\n"
+ "smin z30.s, p0/M, z30.s, z19.s\n"
+ "trn1 z16.h, z0.h, z31.h\n"
+ "smin z22.s, p0/M, z22.s, z19.s\n"
+ "smin z29.s, p0/M, z29.s, z19.s\n"
+ "trn1 z22.h, z30.h, z22.h\n"
+ "smin z18.s, p0/M, z18.s, z19.s\n"
+ "smin z28.s, p0/M, z28.s, z19.s\n"
+ "trn1 z18.h, z29.h, z18.h\n"
+ "smin z21.s, p0/M, z21.s, z19.s\n"
+ "smin z27.s, p0/M, z27.s, z19.s\n"
+ "trn1 z21.h, z28.h, z21.h\n"
+ "smin z17.s, p0/M, z17.s, z19.s\n"
+ "smin z26.s, p0/M, z26.s, z19.s\n"
+ "trn1 z17.h, z27.h, z17.h\n"
+ "smin z20.s, p0/M, z20.s, z19.s\n"
+ "smin z25.s, p0/M, z25.s, z19.s\n"
+ "trn1 z20.h, z26.h, z20.h\n"
+ "smin z24.s, p0/M, z24.s, z19.s\n"
+ "trn1 z19.h, z25.h, z24.h\n"
+ "trn1 z16.b, z23.b, z16.b\n"
+ "trn1 z18.b, z22.b, z18.b\n"
+ "trn1 z17.b, z21.b, z17.b\n"
+ "st1b { z16.b }, p4, [%x[outptr], x9]\n"
+ "incb x9, ALL, MUL #4\n"
+ "trn1 z16.b, z20.b, z19.b\n"
+ "st1b { z18.b }, p3, [%x[outptr], x28]\n"
+ "incb x28, ALL, MUL #4\n"
+ "st1b { z17.b }, p2, [%x[outptr], x27]\n"
+ "incb x27, ALL, MUL #4\n"
+ "st1b { z16.b }, p1, [%x[outptr], x26]\n"
+ "incb x26, ALL, MUL #4\n"
+ "whilelt p1.b, x26, %x[n_channels]\n"
+ "b.any 1b\n"
+ "7:" // Single vector of channels
+ "whilelt p4.b, x9, %x[n_channels]\n"
+ "b.none 14f\n"
+ "8:" // Single vector of channels: Loop
+ "lsr x25, %x[n_valid_cells], #0x2\n"
+ "mov z5.b, #0x0\n"
+ "mov x24, %x[inptrs]\n"
+ "cbz x25, 11f\n"
+ "ldp x20, x22, [x24, #0x0]\n"
+ "subs x25, x25, #0x1\n"
+ "ld1b { z0.b }, p4/Z, [x20, x9]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x24, x24, #0x20\n"
+ "ld1b { z31.b }, p4/Z, [x22, x9]\n"
+ "ld1b { z23.b }, p4/Z, [x21, x9]\n"
+ "ld1b { z30.b }, p4/Z, [x20, x9]\n"
+ "beq 10f\n"
+ "9:" // Single vector of channels: Loop: 4 inputs loop
+ "movprfx z16, z0\n umax z16.b, p0/M, z16.b, z31.b\n"
+ "movprfx z17, z23\n umax z17.b, p0/M, z17.b, z30.b\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "subs x25, x25, #0x1\n"
+ "umax z16.b, p0/M, z16.b, z17.b\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "umax z5.b, p0/M, z5.b, z16.b\n"
+ "add x24, x24, #0x20\n"
+ "ld1b { z0.b }, p4/Z, [x23, x9]\n"
+ "ld1b { z31.b }, p4/Z, [x22, x9]\n"
+ "ld1b { z23.b }, p4/Z, [x21, x9]\n"
+ "ld1b { z30.b }, p4/Z, [x20, x9]\n"
+ "bgt 9b\n"
+ "10:" // Single vector of channels: Loop: 4 inputs tail
+ "movprfx z16, z0\n umax z16.b, p0/M, z16.b, z31.b\n"
+ "movprfx z17, z23\n umax z17.b, p0/M, z17.b, z30.b\n"
+ "umax z16.b, p0/M, z16.b, z17.b\n"
+ "umax z5.b, p0/M, z5.b, z16.b\n"
+ "11:" // Single vector of channels: Loop: After loop
+ "ands x21, %x[n_valid_cells], #0x3\n"
+ "beq 13f\n"
+ "12:" // Single vector of channels: Loop: Single input loop
+ "ldr x20, [x24], #0x8\n"
+ "ld1b { z16.b }, p4/Z, [x20, x9]\n"
+ "subs x21, x21, #0x1\n"
+ "umax z5.b, p0/M, z5.b, z16.b\n"
+ "bgt 12b\n"
+ "13:" // Single vector of channels: Loop: Single input loop: End
+ "add x20, %x[quant_params], %[offsetof_qp_input_offset]\n"
+ "ld1rw { z18.s }, p0/Z, [x20]\n"
+ ".inst 0x4508a8b1 // ushllb z17.h, z5.b, #0x0\n"
+ ".inst 0x4508acb0 // ushllt z16.h, z5.b, #0x0\n"
+ "neg z18.s, p0/M, z18.s\n"
+ ".inst 0x45914257 // saddwb z23.s, z18.s, z17.h\n"
+ "add x20, %x[quant_params], %[offsetof_qp_per_layer_left_shift]\n"
+ "ld1rw { z22.s }, p0/Z, [x20]\n"
+ ".inst 0x45914655 // saddwt z21.s, z18.s, z17.h\n"
+ ".inst 0x45904254 // saddwb z20.s, z18.s, z16.h\n"
+ "add x20, %x[quant_params], %[offsetof_qp_per_layer_mul]\n"
+ "ld1rw { z19.s }, p0/Z, [x20]\n"
+ ".inst 0x45904652 // saddwt z18.s, z18.s, z16.h\n"
+ ".inst 0x448282d7 // srshl z23.s, p0/M, z23.s, z22.s\n"
+ "add x20, %x[quant_params], %[offsetof_qp_per_layer_right_shift]\n"
+ "ld1rw { z17.s }, p0/Z, [x20]\n"
+ ".inst 0x448282d5 // srshl z21.s, p0/M, z21.s, z22.s\n"
+ ".inst 0x448282d4 // srshl z20.s, p0/M, z20.s, z22.s\n"
+ "add x20, %x[quant_params], %[offsetof_qp_output_offset]\n"
+ "ld1rw { z16.s }, p0/Z, [x20]\n"
+ ".inst 0x448282d2 // srshl z18.s, p0/M, z18.s, z22.s\n"
+ ".inst 0x04b376f7 // sqrdmulh z23.s, z23.s, z19.s\n"
+ ".inst 0x04b376b5 // sqrdmulh z21.s, z21.s, z19.s\n"
+ ".inst 0x04b37694 // sqrdmulh z20.s, z20.s, z19.s\n"
+ ".inst 0x04b37652 // sqrdmulh z18.s, z18.s, z19.s\n"
+ ".inst 0x44828237 // srshl z23.s, p0/M, z23.s, z17.s\n"
+ ".inst 0x44828235 // srshl z21.s, p0/M, z21.s, z17.s\n"
+ ".inst 0x44828234 // srshl z20.s, p0/M, z20.s, z17.s\n"
+ ".inst 0x44828232 // srshl z18.s, p0/M, z18.s, z17.s\n"
+ "add z23.s, z23.s, z16.s\n"
+ "add z21.s, z21.s, z16.s\n"
+ "add z20.s, z20.s, z16.s\n"
+ "add z18.s, z18.s, z16.s\n"
+ "mov z17.s, #0x0\n"
+ "mov z16.s, #0xff\n"
+ "smax z23.s, p0/M, z23.s, z17.s\n"
+ "smax z21.s, p0/M, z21.s, z17.s\n"
+ "smax z20.s, p0/M, z20.s, z17.s\n"
+ "smax z18.s, p0/M, z18.s, z17.s\n"
+ "smin z23.s, p0/M, z23.s, z16.s\n"
+ "smin z21.s, p0/M, z21.s, z16.s\n"
+ "smin z20.s, p0/M, z20.s, z16.s\n"
+ "trn1 z17.h, z23.h, z21.h\n"
+ "smin z18.s, p0/M, z18.s, z16.s\n"
+ "trn1 z16.h, z20.h, z18.h\n"
+ "trn1 z16.b, z17.b, z16.b\n"
+ "st1b { z16.b }, p4, [%x[outptr], x9]\n"
+ "incb x9\n"
+ "whilelt p4.b, x9, %x[n_channels]\n"
+ "b.any 8b\n"
+ "14:" // End
+ ".inst 0xd503467f // SMSTOP\n"
+ :
+ : [inptrs] "r" (inptrs), [n_channels] "r" (n_channels), [n_valid_cells] "r" (n_valid_cells), [offsetof_qp_input_offset] "I" (offsetof(Requantize32, input_offset)), [offsetof_qp_output_offset] "I" (offsetof(Requantize32, output_offset)), [offsetof_qp_per_layer_left_shift] "I" (offsetof(Requantize32, per_layer_left_shift)), [offsetof_qp_per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [offsetof_qp_per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [outptr] "r" (outptr), [quant_params] "r" (&qp)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace pooling
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst.hpp
index 8c7a497376..f8293233e6 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,37 +24,28 @@
#pragma once
-#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FP16_ARGS)
+#if defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS)
namespace arm_conv {
namespace pooling {
void sve_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst_impl(unsigned int, const __fp16 *const *const, __fp16 *const *const, bool, unsigned int, unsigned int, unsigned int, unsigned int);
-struct sve_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst
+struct sve_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst : public DepthfirstStrategy<__fp16, __fp16>
{
- typedef __fp16 operand_type;
- typedef __fp16 return_type;
+ using Parent = DepthfirstStrategy<__fp16, __fp16>;
- typedef void (*kern_type)(unsigned int, const __fp16 *const *const, __fp16 *const *const, bool, unsigned int, unsigned int, unsigned int, unsigned int);
+ const static auto pooling_type = PoolingType::AVERAGE;
+ const static auto pool_rows = 3u, pool_cols = 3u;
+ const static auto stride_rows = 1u, stride_cols = 1u;
- constexpr static PoolingType pooling_type(void) { return PoolingType::AVERAGE; }
+ sve_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst(const CPUInfo *)
+ : Parent(pool_rows, pool_cols, stride_rows, stride_cols, 2, 2) {}
- constexpr static unsigned int pool_rows(void) { return 3; }
- constexpr static unsigned int pool_cols(void) { return 3; }
-
- constexpr static unsigned int stride_rows(void) { return 1; }
- constexpr static unsigned int stride_cols(void) { return 1; }
-
- constexpr static unsigned int out_rows(void) { return 2; }
- constexpr static unsigned int out_cols(void) { return 2; }
-
- kern_type kernel = sve_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst_impl;
-
- sve_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst(const CPUInfo *) {}
+ Parent::KernelType get_kernel(void) const { return sve_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst_impl; }
};
} // namespace pooling
} // namespace arm_conv
-#endif // defined(__ARM_FEATURE_SVE) && defined(__ARM_FP16_ARGS)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst/generic.cpp
index 3c1858633b..1ba78f3fba 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -26,7 +26,7 @@
#include <cstddef>
#include <cstdint>
-#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FP16_ARGS)
+#if defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS)
namespace arm_conv {
namespace pooling {
@@ -82,126 +82,126 @@ void sve_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst_impl(
pad_left, pad_top, pad_right, pad_bottom);
__asm__ __volatile__(
- "ldr x3, [%x[args], %[offsetof_n_channels]]\n"
- "mov x4, #0x0\n"
- "ldr x20, [%x[args], %[offsetof_outptrs]]\n"
- "mov x5, #0x0\n"
- "ldr x6, [%x[args], %[offsetof_inptrs]]\n"
- "mov x19, #0x4\n"
- "add x7, %x[args], %[offsetof_rescale]\n"
- "ldp x8, x17, [x20, #0x0]\n"
- "ldp x16, x15, [x20, #0x10]\n"
- "whilelt p0.h, XZR, x19\n"
- "ldp x14, x13, [x6, #0x0]\n"
- "whilelt p1.h, x4, x3\n"
- "ldp x12, x11, [x6, #0x10]\n"
- "ldp x10, x9, [x6, #0x20]\n"
- "ldp x28, x27, [x6, #0x30]\n"
- "ldp x26, x25, [x6, #0x40]\n"
- "ldp x24, x23, [x6, #0x50]\n"
- "ldp x22, x21, [x6, #0x60]\n"
- "ldp x20, x19, [x6, #0x70]\n"
- "ld1rqh { z7.h }, p0/Z, [x7]\n"
- "ld1h { z8.h }, p1/Z, [x9, x4, LSL #1]\n"
- "ld1h { z6.h }, p1/Z, [x28, x4, LSL #1]\n"
- "ld1h { z5.h }, p1/Z, [x25, x4, LSL #1]\n"
- "ld1h { z4.h }, p1/Z, [x24, x4, LSL #1]\n"
- "ld1h { z3.h }, p1/Z, [x13, x4, LSL #1]\n"
- "ld1h { z2.h }, p1/Z, [x12, x4, LSL #1]\n"
- "ld1h { z1.h }, p1/Z, [x10, x4, LSL #1]\n"
- "ld1h { z0.h }, p1/Z, [x26, x4, LSL #1]\n"
- "ld1h { z31.h }, p1/Z, [x27, x4, LSL #1]\n"
- "ld1h { z30.h }, p1/Z, [x23, x4, LSL #1]\n"
- "ld1h { z29.h }, p1/Z, [x21, x4, LSL #1]\n"
- "ld1h { z28.h }, p1/Z, [x20, x4, LSL #1]\n"
- "ld1h { z27.h }, p1/Z, [x14, x4, LSL #1]\n"
- "ld1h { z26.h }, p1/Z, [x11, x4, LSL #1]\n"
- "ld1h { z25.h }, p1/Z, [x22, x4, LSL #1]\n"
- "ld1h { z24.h }, p1/Z, [x19, x4, LSL #1]\n"
- "incw x4\n"
- "whilelt p1.h, x4, x3\n"
+ "ldr x2, [%x[args], %[offsetof_n_channels]]\n"
+ "ldr x21, [%x[args], %[offsetof_outptrs]]\n"
+ "mov x3, #0x0\n"
+ "mov x20, #0x4\n"
+ "ldr x4, [%x[args], %[offsetof_inptrs]]\n"
+ "ldp x5, x6, [x21, #0x0]\n"
+ "whilelt p2.h, XZR, x20\n"
+ "whilelt p0.h, x3, x2\n"
+ "ldp x7, x8, [x21, #0x10]\n"
+ "ldp x17, x16, [x4, #0x0]\n"
+ "add x15, %x[args], %[offsetof_rescale]\n"
+ "mov x14, #0x0\n"
+ "ldp x13, x12, [x4, #0x10]\n"
+ "ldp x11, x10, [x4, #0x20]\n"
+ "ldp x9, x28, [x4, #0x30]\n"
+ "ldp x27, x26, [x4, #0x40]\n"
+ "ldp x25, x24, [x4, #0x50]\n"
+ "ldp x23, x22, [x4, #0x60]\n"
+ "ldp x21, x20, [x4, #0x70]\n"
+ "ld1h { z7.h }, p0/Z, [x10, x3, LSL #1]\n"
+ "ld1h { z6.h }, p0/Z, [x9, x3, LSL #1]\n"
+ "ld1h { z5.h }, p0/Z, [x26, x3, LSL #1]\n"
+ "ld1h { z4.h }, p0/Z, [x25, x3, LSL #1]\n"
+ "ld1h { z3.h }, p0/Z, [x16, x3, LSL #1]\n"
+ "ld1h { z2.h }, p0/Z, [x13, x3, LSL #1]\n"
+ "ld1h { z1.h }, p0/Z, [x11, x3, LSL #1]\n"
+ "ld1h { z31.h }, p0/Z, [x27, x3, LSL #1]\n"
+ "ld1h { z30.h }, p0/Z, [x28, x3, LSL #1]\n"
+ "ld1h { z29.h }, p0/Z, [x24, x3, LSL #1]\n"
+ "ld1h { z28.h }, p0/Z, [x22, x3, LSL #1]\n"
+ "ld1h { z27.h }, p0/Z, [x21, x3, LSL #1]\n"
+ "ld1h { z26.h }, p0/Z, [x17, x3, LSL #1]\n"
+ "ld1h { z25.h }, p0/Z, [x12, x3, LSL #1]\n"
+ "ld1h { z24.h }, p0/Z, [x23, x3, LSL #1]\n"
+ "ld1h { z23.h }, p0/Z, [x20, x3, LSL #1]\n"
+ "incw x3\n"
+ "whilelt p1.h, x3, x2\n"
+ "ld1rqh { z0.h }, p2/Z, [x15]\n"
"b.none 2f\n"
"1:" // Vector: Loop
- "fadd z17.h, z8.h, z6.h\n"
- "ld1h { z8.h }, p1/Z, [x9, x4, LSL #1]\n"
- "whilelt p0.h, x5, x3\n"
+ "fadd z17.h, z7.h, z6.h\n"
"fadd z16.h, z5.h, z4.h\n"
- "ld1h { z6.h }, p1/Z, [x28, x4, LSL #1]\n"
+ "ld1h { z7.h }, p1/Z, [x10, x3, LSL #1]\n"
+ "ld1h { z6.h }, p1/Z, [x9, x3, LSL #1]\n"
+ "fadd z19.h, z17.h, z16.h\n"
"fadd z18.h, z3.h, z2.h\n"
- "ld1h { z5.h }, p1/Z, [x25, x4, LSL #1]\n"
- "fadd z23.h, z1.h, z0.h\n"
- "ld1h { z4.h }, p1/Z, [x24, x4, LSL #1]\n"
- "fadd z22.h, z31.h, z30.h\n"
- "ld1h { z3.h }, p1/Z, [x13, x4, LSL #1]\n"
- "fadd z17.h, z17.h, z16.h\n"
- "ld1h { z2.h }, p1/Z, [x12, x4, LSL #1]\n"
- "fadd z16.h, z29.h, z28.h\n"
- "ld1h { z1.h }, p1/Z, [x10, x4, LSL #1]\n"
- "fadd z19.h, z27.h, z23.h\n"
- "ld1h { z0.h }, p1/Z, [x26, x4, LSL #1]\n"
- "fadd z21.h, z18.h, z17.h\n"
- "ld1h { z31.h }, p1/Z, [x27, x4, LSL #1]\n"
- "fadd z20.h, z16.h, z17.h\n"
- "ld1h { z30.h }, p1/Z, [x23, x4, LSL #1]\n"
- "fadd z18.h, z26.h, z22.h\n"
- "ld1h { z29.h }, p1/Z, [x21, x4, LSL #1]\n"
- "fadd z17.h, z25.h, z23.h\n"
- "ld1h { z28.h }, p1/Z, [x20, x4, LSL #1]\n"
- "fadd z16.h, z24.h, z22.h\n"
- "ld1h { z27.h }, p1/Z, [x14, x4, LSL #1]\n"
+ "ld1h { z5.h }, p1/Z, [x26, x3, LSL #1]\n"
+ "ld1h { z4.h }, p1/Z, [x25, x3, LSL #1]\n"
+ "fadd z17.h, z1.h, z31.h\n"
+ "fadd z22.h, z30.h, z29.h\n"
+ "ld1h { z3.h }, p1/Z, [x16, x3, LSL #1]\n"
+ "ld1h { z2.h }, p1/Z, [x13, x3, LSL #1]\n"
+ "fadd z16.h, z28.h, z27.h\n"
+ "fadd z21.h, z18.h, z19.h\n"
+ "ld1h { z1.h }, p1/Z, [x11, x3, LSL #1]\n"
+ "ld1h { z31.h }, p1/Z, [x27, x3, LSL #1]\n"
+ "fadd z20.h, z16.h, z19.h\n"
+ "fadd z19.h, z26.h, z17.h\n"
+ "ld1h { z30.h }, p1/Z, [x28, x3, LSL #1]\n"
+ "ld1h { z29.h }, p1/Z, [x24, x3, LSL #1]\n"
+ "fadd z18.h, z25.h, z22.h\n"
+ "fadd z17.h, z24.h, z17.h\n"
+ "ld1h { z28.h }, p1/Z, [x22, x3, LSL #1]\n"
+ "ld1h { z27.h }, p1/Z, [x21, x3, LSL #1]\n"
+ "fadd z16.h, z23.h, z22.h\n"
+ "ld1h { z26.h }, p1/Z, [x17, x3, LSL #1]\n"
+ "ld1h { z25.h }, p1/Z, [x12, x3, LSL #1]\n"
"fadd z19.h, z21.h, z19.h\n"
- "ld1h { z26.h }, p1/Z, [x11, x4, LSL #1]\n"
+ "ld1h { z24.h }, p1/Z, [x23, x3, LSL #1]\n"
+ "ld1h { z23.h }, p1/Z, [x20, x3, LSL #1]\n"
+ "incw x3\n"
"fadd z18.h, z21.h, z18.h\n"
- "ld1h { z25.h }, p1/Z, [x22, x4, LSL #1]\n"
"fadd z17.h, z17.h, z20.h\n"
- "ld1h { z24.h }, p1/Z, [x19, x4, LSL #1]\n"
- "incw x4\n"
- "fadd z16.h, z20.h, z16.h\n"
- "whilelt p1.h, x4, x3\n"
- "fmul z19.h, z19.h, z7.h[0]\n"
- "st1h { z19.h }, p0, [x8, x5, LSL #1]\n"
- "fmul z18.h, z18.h, z7.h[1]\n"
- "fmul z17.h, z17.h, z7.h[2]\n"
- "st1h { z18.h }, p0, [x17, x5, LSL #1]\n"
- "fmul z16.h, z16.h, z7.h[3]\n"
- "st1h { z17.h }, p0, [x16, x5, LSL #1]\n"
- "st1h { z16.h }, p0, [x15, x5, LSL #1]\n"
- "incw x5\n"
+ "fadd z16.h, z16.h, z20.h\n"
+ "whilelt p0.h, x14, x2\n"
+ "whilelt p1.h, x3, x2\n"
+ "fmul z19.h, z19.h, z0.h[0]\n"
+ "fmul z18.h, z18.h, z0.h[1]\n"
+ "st1h { z19.h }, p0, [x5, x14, LSL #1]\n"
+ "fmul z17.h, z17.h, z0.h[2]\n"
+ "fmul z16.h, z16.h, z0.h[3]\n"
+ "st1h { z18.h }, p0, [x6, x14, LSL #1]\n"
+ "st1h { z17.h }, p0, [x7, x14, LSL #1]\n"
+ "st1h { z16.h }, p0, [x8, x14, LSL #1]\n"
+ "incw x14\n"
"b.any 1b\n"
"2:" // Vector: Tail
- "fadd z17.h, z8.h, z6.h\n"
- "whilelt p0.h, x5, x3\n"
+ "fadd z17.h, z7.h, z6.h\n"
"fadd z16.h, z5.h, z4.h\n"
+ "whilelt p0.h, x14, x2\n"
+ "fadd z20.h, z17.h, z16.h\n"
"fadd z18.h, z3.h, z2.h\n"
- "fadd z23.h, z1.h, z0.h\n"
- "fadd z17.h, z17.h, z16.h\n"
- "fadd z22.h, z31.h, z30.h\n"
- "fadd z16.h, z29.h, z28.h\n"
- "fadd z21.h, z18.h, z17.h\n"
- "fadd z19.h, z27.h, z23.h\n"
- "fadd z20.h, z16.h, z17.h\n"
- "fadd z18.h, z26.h, z22.h\n"
- "fadd z17.h, z25.h, z23.h\n"
- "fadd z16.h, z24.h, z22.h\n"
- "fadd z19.h, z21.h, z19.h\n"
+ "fadd z17.h, z1.h, z31.h\n"
+ "fadd z19.h, z30.h, z29.h\n"
+ "fadd z16.h, z28.h, z27.h\n"
+ "fadd z21.h, z18.h, z20.h\n"
+ "fadd z20.h, z16.h, z20.h\n"
+ "fadd z16.h, z26.h, z17.h\n"
+ "fadd z18.h, z25.h, z19.h\n"
+ "fadd z17.h, z24.h, z17.h\n"
+ "fadd z19.h, z23.h, z19.h\n"
+ "fadd z16.h, z21.h, z16.h\n"
+ "fmul z16.h, z16.h, z0.h[0]\n"
+ "st1h { z16.h }, p0, [x5, x14, LSL #1]\n"
"fadd z18.h, z21.h, z18.h\n"
"fadd z17.h, z17.h, z20.h\n"
- "fadd z16.h, z20.h, z16.h\n"
- "fmul z19.h, z19.h, z7.h[0]\n"
- "st1h { z19.h }, p0, [x8, x5, LSL #1]\n"
- "fmul z18.h, z18.h, z7.h[1]\n"
- "fmul z17.h, z17.h, z7.h[2]\n"
- "st1h { z18.h }, p0, [x17, x5, LSL #1]\n"
- "fmul z16.h, z16.h, z7.h[3]\n"
- "st1h { z17.h }, p0, [x16, x5, LSL #1]\n"
- "st1h { z16.h }, p0, [x15, x5, LSL #1]\n"
+ "fmul z18.h, z18.h, z0.h[1]\n"
+ "fmul z17.h, z17.h, z0.h[2]\n"
+ "fadd z16.h, z19.h, z20.h\n"
+ "fmul z16.h, z16.h, z0.h[3]\n"
+ "st1h { z18.h }, p0, [x6, x14, LSL #1]\n"
+ "st1h { z17.h }, p0, [x7, x14, LSL #1]\n"
+ "st1h { z16.h }, p0, [x8, x14, LSL #1]\n"
:
: [args] "r" (&args), [offsetof_inptrs] "I" (offsetof(KernelArgs, inptrs)), [offsetof_n_channels] "I" (offsetof(KernelArgs, n_channels)), [offsetof_outptrs] "I" (offsetof(KernelArgs, outptrs)), [offsetof_rescale] "I" (offsetof(KernelArgs, rescale_vals))
- : "cc", "memory", "p0", "p1", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
} // namespace pooling
} // namespace arm_conv
-#endif // defined(__ARM_FEATURE_SVE) && defined(__ARM_FP16_ARGS)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp16_nhwc_avg_generic_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp16_nhwc_avg_generic_depthfirst.hpp
index 391d47cf41..49231484e6 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp16_nhwc_avg_generic_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp16_nhwc_avg_generic_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -26,29 +26,21 @@
#pragma once
-#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FP16_ARGS)
+#if defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS)
namespace arm_conv {
namespace pooling {
void sve_fp16_nhwc_avg_generic_depthfirst_impl(const uint64_t window_cells, const uint64_t n_valid_cells, uint64_t n_channels, const __fp16 *const *const inptrs, __fp16 *outptr);
-struct sve_fp16_nhwc_avg_generic_depthfirst
+struct sve_fp16_nhwc_avg_generic_depthfirst : IGenericDepthfirstStrategy<__fp16, __fp16>
{
- typedef __fp16 operand_type;
- typedef __fp16 return_type;
-
- typedef void (*kern_type)(const uint64_t window_cells, const uint64_t n_valid_cells, uint64_t n_channels, const __fp16 *const *const inptrs, __fp16 *outptr);
-
- constexpr static PoolingType pooling_type(void) { return PoolingType::AVERAGE; }
-
-
- kern_type kernel = sve_fp16_nhwc_avg_generic_depthfirst_impl;
-
+ using Parent = IGenericDepthfirstStrategy<__fp16, __fp16>;
sve_fp16_nhwc_avg_generic_depthfirst(const CPUInfo *) {}
+ typename Parent::KernelType get_kernel(void) const override { return sve_fp16_nhwc_avg_generic_depthfirst_impl; }
};
} // namespace pooling
} // namespace arm_conv
-#endif // defined(__ARM_FEATURE_SVE) && defined(__ARM_FP16_ARGS)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp16_nhwc_avg_generic_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp16_nhwc_avg_generic_depthfirst/generic.cpp
index 84a6acf80d..2bef44ea5c 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp16_nhwc_avg_generic_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp16_nhwc_avg_generic_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,8 +23,9 @@
*/
#include <cstdint>
+#include <cstddef>
-#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FP16_ARGS)
+#if defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS)
namespace arm_conv {
namespace pooling {
@@ -41,88 +42,88 @@ void sve_fp16_nhwc_avg_generic_depthfirst_impl(
const auto rescale_value = static_cast<__fp16>(1.0f / static_cast<float>(window_cells));
__asm__ __volatile__(
+ "mov x9, #0x0\n"
+ "cnth x28\n"
+ "cnth x27, ALL, MUL #2\n"
+ "cnth x26, ALL, MUL #3\n"
"ptrue p0.b\n"
- "ld1rh { z8.h }, p0/Z, [%x[rescale_ptr]]\n"
- "mov x28, #0x0\n"
- "cnth x27\n"
- "cnth x26, ALL, MUL #2\n"
- "cnth x25, ALL, MUL #3\n"
- "whilelt p3.h, x28, %x[n_channels]\n"
- "whilelt p2.h, x27, %x[n_channels]\n"
- "whilelt p1.h, x26, %x[n_channels]\n"
- "whilelt p0.h, x25, %x[n_channels]\n"
+ "whilelt p3.h, x9, %x[n_channels]\n"
+ "ld1rh { z7.h }, p0/Z, [%x[rescale_ptr]]\n"
+ "whilelt p2.h, x28, %x[n_channels]\n"
+ "whilelt p1.h, x27, %x[n_channels]\n"
+ "whilelt p0.h, x26, %x[n_channels]\n"
"b.none 7f\n"
"1:" // 4-vectors of channels
- "mov z7.b, #0x0\n"
- "mov x19, %x[inptrs]\n"
+ "lsr x25, %x[n_valid_cells], #0x2\n"
"mov z6.b, #0x0\n"
- "lsr x24, %x[n_valid_cells], #0x2\n"
"mov z5.b, #0x0\n"
+ "mov x24, %x[inptrs]\n"
"mov z4.b, #0x0\n"
- "cbz x24, 4f\n"
- "ldp x23, x22, [x19, #0x0]\n"
- "ldp x21, x20, [x19, #0x10]\n"
- "add x19, x19, #0x20\n"
- "subs x24, x24, #0x1\n"
- "ld1h { z3.h }, p3/Z, [x23, x28, LSL #1]\n"
- "ld1h { z2.h }, p3/Z, [x22, x28, LSL #1]\n"
- "ld1h { z1.h }, p3/Z, [x21, x28, LSL #1]\n"
- "ld1h { z0.h }, p3/Z, [x20, x28, LSL #1]\n"
- "ld1h { z31.h }, p2/Z, [x23, x27, LSL #1]\n"
- "ld1h { z30.h }, p2/Z, [x22, x27, LSL #1]\n"
- "ld1h { z29.h }, p2/Z, [x21, x27, LSL #1]\n"
- "ld1h { z28.h }, p2/Z, [x20, x27, LSL #1]\n"
- "ld1h { z27.h }, p1/Z, [x23, x26, LSL #1]\n"
- "ld1h { z21.h }, p1/Z, [x22, x26, LSL #1]\n"
- "ld1h { z26.h }, p1/Z, [x21, x26, LSL #1]\n"
- "ld1h { z17.h }, p1/Z, [x20, x26, LSL #1]\n"
- "ld1h { z25.h }, p0/Z, [x23, x25, LSL #1]\n"
- "ld1h { z20.h }, p0/Z, [x22, x25, LSL #1]\n"
- "ld1h { z24.h }, p0/Z, [x21, x25, LSL #1]\n"
- "ld1h { z16.h }, p0/Z, [x20, x25, LSL #1]\n"
+ "mov z3.b, #0x0\n"
+ "cbz x25, 4f\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "subs x25, x25, #0x1\n"
+ "add x24, x24, #0x20\n"
+ "ld1h { z2.h }, p3/Z, [x23, x9, LSL #1]\n"
+ "ld1h { z1.h }, p3/Z, [x22, x9, LSL #1]\n"
+ "ld1h { z0.h }, p3/Z, [x21, x9, LSL #1]\n"
+ "ld1h { z31.h }, p3/Z, [x20, x9, LSL #1]\n"
+ "ld1h { z30.h }, p2/Z, [x23, x28, LSL #1]\n"
+ "ld1h { z22.h }, p2/Z, [x22, x28, LSL #1]\n"
+ "ld1h { z29.h }, p2/Z, [x21, x28, LSL #1]\n"
+ "ld1h { z28.h }, p2/Z, [x20, x28, LSL #1]\n"
+ "ld1h { z27.h }, p1/Z, [x23, x27, LSL #1]\n"
+ "ld1h { z21.h }, p1/Z, [x22, x27, LSL #1]\n"
+ "ld1h { z26.h }, p1/Z, [x21, x27, LSL #1]\n"
+ "ld1h { z17.h }, p1/Z, [x20, x27, LSL #1]\n"
+ "ld1h { z25.h }, p0/Z, [x23, x26, LSL #1]\n"
+ "ld1h { z20.h }, p0/Z, [x22, x26, LSL #1]\n"
+ "ld1h { z24.h }, p0/Z, [x21, x26, LSL #1]\n"
+ "ld1h { z16.h }, p0/Z, [x20, x26, LSL #1]\n"
"beq 3f\n"
"2:" // 4-vectors of channels: 4 inputs loop
- "fadd z23.h, z3.h, z2.h\n"
- "ldp x23, x22, [x19, #0x0]\n"
- "subs x24, x24, #0x1\n"
- "fadd z19.h, z1.h, z0.h\n"
- "ldp x21, x20, [x19, #0x10]\n"
- "add x19, x19, #0x20\n"
- "fadd z22.h, z31.h, z30.h\n"
- "ld1h { z3.h }, p3/Z, [x23, x28, LSL #1]\n"
+ "fadd z23.h, z2.h, z1.h\n"
+ "fadd z19.h, z0.h, z31.h\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "fadd z22.h, z30.h, z22.h\n"
"fadd z18.h, z29.h, z28.h\n"
+ "subs x25, x25, #0x1\n"
+ "add x24, x24, #0x20\n"
"fadd z21.h, z27.h, z21.h\n"
- "ld1h { z2.h }, p3/Z, [x22, x28, LSL #1]\n"
"fadd z17.h, z26.h, z17.h\n"
- "ld1h { z1.h }, p3/Z, [x21, x28, LSL #1]\n"
+ "ld1h { z2.h }, p3/Z, [x23, x9, LSL #1]\n"
+ "ld1h { z1.h }, p3/Z, [x22, x9, LSL #1]\n"
"fadd z20.h, z25.h, z20.h\n"
- "ld1h { z0.h }, p3/Z, [x20, x28, LSL #1]\n"
"fadd z16.h, z24.h, z16.h\n"
- "ld1h { z31.h }, p2/Z, [x23, x27, LSL #1]\n"
+ "ld1h { z0.h }, p3/Z, [x21, x9, LSL #1]\n"
+ "ld1h { z31.h }, p3/Z, [x20, x9, LSL #1]\n"
"fadd z19.h, z23.h, z19.h\n"
- "ld1h { z30.h }, p2/Z, [x22, x27, LSL #1]\n"
"fadd z18.h, z22.h, z18.h\n"
- "ld1h { z29.h }, p2/Z, [x21, x27, LSL #1]\n"
+ "ld1h { z30.h }, p2/Z, [x23, x28, LSL #1]\n"
+ "ld1h { z22.h }, p2/Z, [x22, x28, LSL #1]\n"
"fadd z17.h, z21.h, z17.h\n"
- "ld1h { z28.h }, p2/Z, [x20, x27, LSL #1]\n"
"fadd z16.h, z20.h, z16.h\n"
- "ld1h { z27.h }, p1/Z, [x23, x26, LSL #1]\n"
- "fadd z7.h, z7.h, z19.h\n"
- "ld1h { z21.h }, p1/Z, [x22, x26, LSL #1]\n"
- "fadd z6.h, z6.h, z18.h\n"
- "ld1h { z26.h }, p1/Z, [x21, x26, LSL #1]\n"
- "fadd z5.h, z5.h, z17.h\n"
- "ld1h { z17.h }, p1/Z, [x20, x26, LSL #1]\n"
- "fadd z4.h, z4.h, z16.h\n"
- "ld1h { z25.h }, p0/Z, [x23, x25, LSL #1]\n"
- "ld1h { z20.h }, p0/Z, [x22, x25, LSL #1]\n"
- "ld1h { z24.h }, p0/Z, [x21, x25, LSL #1]\n"
- "ld1h { z16.h }, p0/Z, [x20, x25, LSL #1]\n"
+ "ld1h { z29.h }, p2/Z, [x21, x28, LSL #1]\n"
+ "ld1h { z28.h }, p2/Z, [x20, x28, LSL #1]\n"
+ "fadd z6.h, z6.h, z19.h\n"
+ "fadd z5.h, z5.h, z18.h\n"
+ "ld1h { z27.h }, p1/Z, [x23, x27, LSL #1]\n"
+ "ld1h { z21.h }, p1/Z, [x22, x27, LSL #1]\n"
+ "fadd z4.h, z4.h, z17.h\n"
+ "fadd z3.h, z3.h, z16.h\n"
+ "ld1h { z26.h }, p1/Z, [x21, x27, LSL #1]\n"
+ "ld1h { z17.h }, p1/Z, [x20, x27, LSL #1]\n"
+ "ld1h { z25.h }, p0/Z, [x23, x26, LSL #1]\n"
+ "ld1h { z20.h }, p0/Z, [x22, x26, LSL #1]\n"
+ "ld1h { z24.h }, p0/Z, [x21, x26, LSL #1]\n"
+ "ld1h { z16.h }, p0/Z, [x20, x26, LSL #1]\n"
"bgt 2b\n"
"3:" // 4-vectors of channels: 4 inputs tail
- "fadd z23.h, z3.h, z2.h\n"
- "fadd z19.h, z1.h, z0.h\n"
- "fadd z22.h, z31.h, z30.h\n"
+ "fadd z23.h, z2.h, z1.h\n"
+ "fadd z19.h, z0.h, z31.h\n"
+ "fadd z22.h, z30.h, z22.h\n"
"fadd z18.h, z29.h, z28.h\n"
"fadd z21.h, z27.h, z21.h\n"
"fadd z17.h, z26.h, z17.h\n"
@@ -132,100 +133,99 @@ void sve_fp16_nhwc_avg_generic_depthfirst_impl(
"fadd z18.h, z22.h, z18.h\n"
"fadd z17.h, z21.h, z17.h\n"
"fadd z16.h, z20.h, z16.h\n"
- "fadd z7.h, z7.h, z19.h\n"
- "fadd z6.h, z6.h, z18.h\n"
- "fadd z5.h, z5.h, z17.h\n"
- "fadd z4.h, z4.h, z16.h\n"
+ "fadd z6.h, z6.h, z19.h\n"
+ "fadd z5.h, z5.h, z18.h\n"
+ "fadd z4.h, z4.h, z17.h\n"
+ "fadd z3.h, z3.h, z16.h\n"
"4:" // 4-vectors of channels: After loop
- "ands x20, %x[n_valid_cells], #0x3\n"
+ "ands x21, %x[n_valid_cells], #0x3\n"
"beq 6f\n"
"5:" // 4-vectors of channels: Single input loop
- "ldr x23, [x19], #0x8\n"
- "subs x20, x20, #0x1\n"
- "ld1h { z3.h }, p3/Z, [x23, x28, LSL #1]\n"
- "fadd z7.h, z7.h, z3.h\n"
- "ld1h { z31.h }, p2/Z, [x23, x27, LSL #1]\n"
- "ld1h { z27.h }, p1/Z, [x23, x26, LSL #1]\n"
- "fadd z6.h, z6.h, z31.h\n"
- "ld1h { z25.h }, p0/Z, [x23, x25, LSL #1]\n"
- "fadd z5.h, z5.h, z27.h\n"
- "fadd z4.h, z4.h, z25.h\n"
+ "ldr x20, [x24], #0x8\n"
+ "ld1h { z16.h }, p3/Z, [x20, x9, LSL #1]\n"
+ "subs x21, x21, #0x1\n"
+ "fadd z6.h, z6.h, z16.h\n"
+ "ld1h { z17.h }, p2/Z, [x20, x28, LSL #1]\n"
+ "ld1h { z16.h }, p1/Z, [x20, x27, LSL #1]\n"
+ "fadd z5.h, z5.h, z17.h\n"
+ "fadd z4.h, z4.h, z16.h\n"
+ "ld1h { z16.h }, p0/Z, [x20, x26, LSL #1]\n"
+ "fadd z3.h, z3.h, z16.h\n"
"bgt 5b\n"
"6:" // 4-vectors of channels: Single input loop: End
- "fmul z7.h, z7.h, z8.h\n"
- "st1h { z7.h }, p3, [%x[outptr], x28, LSL #1]\n"
- "fmul z6.h, z6.h, z8.h\n"
+ "fmul z6.h, z6.h, z7.h\n"
+ "fmul z5.h, z5.h, z7.h\n"
+ "st1h { z6.h }, p3, [%x[outptr], x9, LSL #1]\n"
+ "fmul z4.h, z4.h, z7.h\n"
+ "fmul z3.h, z3.h, z7.h\n"
+ "st1h { z5.h }, p2, [%x[outptr], x28, LSL #1]\n"
+ "st1h { z4.h }, p1, [%x[outptr], x27, LSL #1]\n"
+ "inch x9, ALL, MUL #4\n"
"inch x28, ALL, MUL #4\n"
- "fmul z5.h, z5.h, z8.h\n"
- "st1h { z6.h }, p2, [%x[outptr], x27, LSL #1]\n"
- "fmul z4.h, z4.h, z8.h\n"
- "inch x27, ALL, MUL #4\n"
- "st1h { z5.h }, p1, [%x[outptr], x26, LSL #1]\n"
+ "st1h { z3.h }, p0, [%x[outptr], x26, LSL #1]\n"
"inch x26, ALL, MUL #4\n"
- "st1h { z4.h }, p0, [%x[outptr], x25, LSL #1]\n"
- "inch x25, ALL, MUL #4\n"
- "whilelt p0.h, x25, %x[n_channels]\n"
+ "whilelt p0.h, x26, %x[n_channels]\n"
+ "inch x27, ALL, MUL #4\n"
"b.any 1b\n"
"7:" // Single vector of channels
- "whilelt p3.h, x28, %x[n_channels]\n"
+ "whilelt p3.h, x9, %x[n_channels]\n"
"b.none 14f\n"
"8:" // Single vector of channels: Loop
- "mov z7.b, #0x0\n"
- "mov x19, %x[inptrs]\n"
- "lsr x24, %x[n_valid_cells], #0x2\n"
- "cbz x24, 11f\n"
- "ldp x23, x22, [x19, #0x0]\n"
- "ldp x21, x20, [x19, #0x10]\n"
- "add x19, x19, #0x20\n"
- "subs x24, x24, #0x1\n"
- "ld1h { z3.h }, p3/Z, [x23, x28, LSL #1]\n"
- "ld1h { z2.h }, p3/Z, [x22, x28, LSL #1]\n"
- "ld1h { z1.h }, p3/Z, [x21, x28, LSL #1]\n"
- "ld1h { z0.h }, p3/Z, [x20, x28, LSL #1]\n"
+ "lsr x25, %x[n_valid_cells], #0x2\n"
+ "mov z6.b, #0x0\n"
+ "mov x24, %x[inptrs]\n"
+ "cbz x25, 11f\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "subs x25, x25, #0x1\n"
+ "add x24, x24, #0x20\n"
+ "ld1h { z2.h }, p3/Z, [x23, x9, LSL #1]\n"
+ "ld1h { z1.h }, p3/Z, [x22, x9, LSL #1]\n"
+ "ld1h { z0.h }, p3/Z, [x21, x9, LSL #1]\n"
+ "ld1h { z31.h }, p3/Z, [x20, x9, LSL #1]\n"
"beq 10f\n"
"9:" // Single vector of channels: Loop: 4 inputs loop
- "fadd z23.h, z3.h, z2.h\n"
- "ldp x23, x22, [x19, #0x0]\n"
- "subs x24, x24, #0x1\n"
- "fadd z19.h, z1.h, z0.h\n"
- "ldp x21, x20, [x19, #0x10]\n"
- "add x19, x19, #0x20\n"
- "fadd z19.h, z23.h, z19.h\n"
- "ld1h { z3.h }, p3/Z, [x23, x28, LSL #1]\n"
- "ld1h { z2.h }, p3/Z, [x22, x28, LSL #1]\n"
- "fadd z7.h, z7.h, z19.h\n"
- "ld1h { z1.h }, p3/Z, [x21, x28, LSL #1]\n"
- "ld1h { z0.h }, p3/Z, [x20, x28, LSL #1]\n"
+ "fadd z17.h, z2.h, z1.h\n"
+ "fadd z16.h, z0.h, z31.h\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "fadd z16.h, z17.h, z16.h\n"
+ "subs x25, x25, #0x1\n"
+ "fadd z6.h, z6.h, z16.h\n"
+ "add x24, x24, #0x20\n"
+ "ld1h { z2.h }, p3/Z, [x23, x9, LSL #1]\n"
+ "ld1h { z1.h }, p3/Z, [x22, x9, LSL #1]\n"
+ "ld1h { z0.h }, p3/Z, [x21, x9, LSL #1]\n"
+ "ld1h { z31.h }, p3/Z, [x20, x9, LSL #1]\n"
"bgt 9b\n"
"10:" // Single vector of channels: Loop: 4 inputs tail
- "fadd z23.h, z3.h, z2.h\n"
- "fadd z19.h, z1.h, z0.h\n"
- "fadd z19.h, z23.h, z19.h\n"
- "fadd z7.h, z7.h, z19.h\n"
+ "fadd z17.h, z2.h, z1.h\n"
+ "fadd z16.h, z0.h, z31.h\n"
+ "fadd z16.h, z17.h, z16.h\n"
+ "fadd z6.h, z6.h, z16.h\n"
"11:" // Single vector of channels: Loop: After loop
- "ands x20, %x[n_valid_cells], #0x3\n"
+ "ands x21, %x[n_valid_cells], #0x3\n"
"beq 13f\n"
"12:" // Single vector of channels: Loop: Single input loop
- "ldr x23, [x19], #0x8\n"
- "subs x20, x20, #0x1\n"
- "ld1h { z3.h }, p3/Z, [x23, x28, LSL #1]\n"
- "fadd z7.h, z7.h, z3.h\n"
+ "ldr x20, [x24], #0x8\n"
+ "ld1h { z16.h }, p3/Z, [x20, x9, LSL #1]\n"
+ "subs x21, x21, #0x1\n"
+ "fadd z6.h, z6.h, z16.h\n"
"bgt 12b\n"
"13:" // Single vector of channels: Loop: Single input loop: End
- "fmul z7.h, z7.h, z8.h\n"
- "st1h { z7.h }, p3, [%x[outptr], x28, LSL #1]\n"
- "inch x28\n"
- "whilelt p3.h, x28, %x[n_channels]\n"
+ "fmul z6.h, z6.h, z7.h\n"
+ "st1h { z6.h }, p3, [%x[outptr], x9, LSL #1]\n"
+ "inch x9\n"
+ "whilelt p3.h, x9, %x[n_channels]\n"
"b.any 8b\n"
"14:" // End
-
:
: [inptrs] "r" (inptrs), [n_channels] "r" (n_channels), [n_valid_cells] "r" (n_valid_cells), [outptr] "r" (outptr), [rescale_ptr] "r" (&rescale_value)
- : "cc", "memory", "p0", "p1", "p2", "p3", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "x9", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
} // namespace pooling
} // namespace arm_conv
-#endif // defined(__ARM_FEATURE_SVE) && defined(__ARM_FP16_ARGS)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp16_nhwc_max_2x2_s1_output2x2_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp16_nhwc_max_2x2_s1_output2x2_depthfirst.hpp
index 5fb297eb49..3691b6cb28 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp16_nhwc_max_2x2_s1_output2x2_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp16_nhwc_max_2x2_s1_output2x2_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,37 +24,28 @@
#pragma once
-#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FP16_ARGS)
+#if defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS)
namespace arm_conv {
namespace pooling {
void sve_fp16_nhwc_max_2x2_s1_output2x2_depthfirst_impl(unsigned int, const __fp16 *const *const, __fp16 *const *const, bool, unsigned int, unsigned int, unsigned int, unsigned int);
-struct sve_fp16_nhwc_max_2x2_s1_output2x2_depthfirst
+struct sve_fp16_nhwc_max_2x2_s1_output2x2_depthfirst : public DepthfirstStrategy<__fp16, __fp16>
{
- typedef __fp16 operand_type;
- typedef __fp16 return_type;
+ using Parent = DepthfirstStrategy<__fp16, __fp16>;
- typedef void (*kern_type)(unsigned int, const __fp16 *const *const, __fp16 *const *const, bool, unsigned int, unsigned int, unsigned int, unsigned int);
+ const static auto pooling_type = PoolingType::MAX;
+ const static auto pool_rows = 2u, pool_cols = 2u;
+ const static auto stride_rows = 1u, stride_cols = 1u;
- constexpr static PoolingType pooling_type(void) { return PoolingType::MAX; }
+ sve_fp16_nhwc_max_2x2_s1_output2x2_depthfirst(const CPUInfo *)
+ : Parent(pool_rows, pool_cols, stride_rows, stride_cols, 2, 2) {}
- constexpr static unsigned int pool_rows(void) { return 2; }
- constexpr static unsigned int pool_cols(void) { return 2; }
-
- constexpr static unsigned int stride_rows(void) { return 1; }
- constexpr static unsigned int stride_cols(void) { return 1; }
-
- constexpr static unsigned int out_rows(void) { return 2; }
- constexpr static unsigned int out_cols(void) { return 2; }
-
- kern_type kernel = sve_fp16_nhwc_max_2x2_s1_output2x2_depthfirst_impl;
-
- sve_fp16_nhwc_max_2x2_s1_output2x2_depthfirst(const CPUInfo *) {}
+ Parent::KernelType get_kernel(void) const { return sve_fp16_nhwc_max_2x2_s1_output2x2_depthfirst_impl; }
};
} // namespace pooling
} // namespace arm_conv
-#endif // defined(__ARM_FEATURE_SVE) && defined(__ARM_FP16_ARGS)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp16_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp16_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp
index f6e23215b8..31bbfd085e 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp16_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp16_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -26,7 +26,7 @@
#include <cstddef>
#include <cstdint>
-#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FP16_ARGS)
+#if defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS)
namespace arm_conv {
namespace pooling {
@@ -63,84 +63,84 @@ void sve_fp16_nhwc_max_2x2_s1_output2x2_depthfirst_impl(
pad_left, pad_top, pad_right, pad_bottom);
__asm__ __volatile__(
- "ldr x14, [%x[args], %[offsetof_n_channels]]\n"
+ "ldr x15, [%x[args], %[offsetof_n_channels]]\n"
+ "ldr x21, [%x[args], %[offsetof_outptrs]]\n"
+ "mov x14, #0x0\n"
+ "whilelt p0.h, x14, x15\n"
+ "ldr x20, [%x[args], %[offsetof_inptrs]]\n"
+ "ldp x13, x12, [x21, #0x0]\n"
"ptrue p2.b\n"
- "ldr x20, [%x[args], %[offsetof_outptrs]]\n"
- "mov x13, #0x0\n"
- "ldr x19, [%x[args], %[offsetof_inptrs]]\n"
- "mov x12, #0x0\n"
- "ldp x11, x10, [x20, #0x0]\n"
- "whilelt p1.h, x13, x14\n"
- "ldp x9, x28, [x20, #0x10]\n"
- "ldp x27, x26, [x19, #0x0]\n"
- "ldp x25, x24, [x19, #0x10]\n"
- "ldp x23, x22, [x19, #0x20]\n"
- "ldp x21, x20, [x19, #0x30]\n"
- "ldr x19, [x19, #0x40]\n"
- "ld1h { z31.h }, p1/Z, [x26, x13, LSL #1]\n"
- "ld1h { z30.h }, p1/Z, [x23, x13, LSL #1]\n"
- "ld1h { z29.h }, p1/Z, [x20, x13, LSL #1]\n"
- "ld1h { z28.h }, p1/Z, [x24, x13, LSL #1]\n"
- "ld1h { z27.h }, p1/Z, [x27, x13, LSL #1]\n"
- "ld1h { z26.h }, p1/Z, [x22, x13, LSL #1]\n"
- "ld1h { z25.h }, p1/Z, [x25, x13, LSL #1]\n"
- "ld1h { z24.h }, p1/Z, [x21, x13, LSL #1]\n"
- "ld1h { z23.h }, p1/Z, [x19, x13, LSL #1]\n"
- "incw x13\n"
- "whilelt p1.h, x13, x14\n"
+ "mov x11, #0x0\n"
+ "ldp x10, x9, [x21, #0x10]\n"
+ "ldp x28, x27, [x20, #0x0]\n"
+ "ldp x26, x25, [x20, #0x10]\n"
+ "ldp x24, x23, [x20, #0x20]\n"
+ "ldp x22, x21, [x20, #0x30]\n"
+ "ldr x20, [x20, #0x40]\n"
+ "ld1h { z31.h }, p0/Z, [x27, x14, LSL #1]\n"
+ "ld1h { z30.h }, p0/Z, [x24, x14, LSL #1]\n"
+ "ld1h { z29.h }, p0/Z, [x21, x14, LSL #1]\n"
+ "ld1h { z28.h }, p0/Z, [x25, x14, LSL #1]\n"
+ "ld1h { z27.h }, p0/Z, [x28, x14, LSL #1]\n"
+ "ld1h { z26.h }, p0/Z, [x26, x14, LSL #1]\n"
+ "ld1h { z25.h }, p0/Z, [x23, x14, LSL #1]\n"
+ "ld1h { z24.h }, p0/Z, [x22, x14, LSL #1]\n"
+ "ld1h { z23.h }, p0/Z, [x20, x14, LSL #1]\n"
+ "incw x14\n"
+ "whilelt p1.h, x14, x15\n"
"b.none 2f\n"
"1:" // Vector: Loop
"movprfx z22, z31\n fmax z22.h, p2/M, z22.h, z30.h\n"
- "ld1h { z31.h }, p1/Z, [x26, x13, LSL #1]\n"
- "whilelt p0.h, x12, x14\n"
"movprfx z21, z30\n fmax z21.h, p2/M, z21.h, z29.h\n"
- "ld1h { z30.h }, p1/Z, [x23, x13, LSL #1]\n"
- "movprfx z18, z28\n fmax z18.h, p2/M, z18.h, z27.h\n"
- "ld1h { z29.h }, p1/Z, [x20, x13, LSL #1]\n"
- "movprfx z17, z26\n fmax z17.h, p2/M, z17.h, z25.h\n"
- "ld1h { z27.h }, p1/Z, [x27, x13, LSL #1]\n"
- "movprfx z16, z24\n fmax z16.h, p2/M, z16.h, z28.h\n"
- "ld1h { z28.h }, p1/Z, [x24, x13, LSL #1]\n"
- "movprfx z20, z26\n fmax z20.h, p2/M, z20.h, z23.h\n"
- "ld1h { z26.h }, p1/Z, [x22, x13, LSL #1]\n"
- "movprfx z19, z22\n fmax z19.h, p2/M, z19.h, z18.h\n"
- "ld1h { z25.h }, p1/Z, [x25, x13, LSL #1]\n"
- "movprfx z18, z22\n fmax z18.h, p2/M, z18.h, z17.h\n"
- "ld1h { z24.h }, p1/Z, [x21, x13, LSL #1]\n"
- "movprfx z17, z21\n fmax z17.h, p2/M, z17.h, z16.h\n"
- "ld1h { z23.h }, p1/Z, [x19, x13, LSL #1]\n"
- "incw x13\n"
- "movprfx z16, z21\n fmax z16.h, p2/M, z16.h, z20.h\n"
- "st1h { z19.h }, p0, [x11, x12, LSL #1]\n"
- "whilelt p1.h, x13, x14\n"
- "st1h { z18.h }, p0, [x10, x12, LSL #1]\n"
- "st1h { z17.h }, p0, [x9, x12, LSL #1]\n"
- "st1h { z16.h }, p0, [x28, x12, LSL #1]\n"
- "incw x12\n"
+ "ld1h { z31.h }, p1/Z, [x27, x14, LSL #1]\n"
+ "ld1h { z30.h }, p1/Z, [x24, x14, LSL #1]\n"
+ "movprfx z20, z28\n fmax z20.h, p2/M, z20.h, z27.h\n"
+ "movprfx z19, z26\n fmax z19.h, p2/M, z19.h, z25.h\n"
+ "ld1h { z29.h }, p1/Z, [x21, x14, LSL #1]\n"
+ "ld1h { z27.h }, p1/Z, [x28, x14, LSL #1]\n"
+ "movprfx z17, z28\n fmax z17.h, p2/M, z17.h, z24.h\n"
+ "movprfx z18, z25\n fmax z18.h, p2/M, z18.h, z23.h\n"
+ "ld1h { z28.h }, p1/Z, [x25, x14, LSL #1]\n"
+ "ld1h { z26.h }, p1/Z, [x26, x14, LSL #1]\n"
+ "ld1h { z25.h }, p1/Z, [x23, x14, LSL #1]\n"
+ "ld1h { z24.h }, p1/Z, [x22, x14, LSL #1]\n"
+ "whilelt p0.h, x11, x15\n"
+ "movprfx z16, z22\n fmax z16.h, p2/M, z16.h, z20.h\n"
+ "ld1h { z23.h }, p1/Z, [x20, x14, LSL #1]\n"
+ "incw x14\n"
+ "whilelt p1.h, x14, x15\n"
+ "st1h { z16.h }, p0, [x13, x11, LSL #1]\n"
+ "movprfx z16, z19\n fmax z16.h, p2/M, z16.h, z22.h\n"
+ "fmax z17.h, p2/M, z17.h, z21.h\n"
+ "st1h { z16.h }, p0, [x12, x11, LSL #1]\n"
+ "movprfx z16, z21\n fmax z16.h, p2/M, z16.h, z18.h\n"
+ "st1h { z17.h }, p0, [x10, x11, LSL #1]\n"
+ "st1h { z16.h }, p0, [x9, x11, LSL #1]\n"
+ "incw x11\n"
"b.any 1b\n"
"2:" // Vector: Tail
"movprfx z22, z31\n fmax z22.h, p2/M, z22.h, z30.h\n"
- "whilelt p0.h, x12, x14\n"
"movprfx z21, z30\n fmax z21.h, p2/M, z21.h, z29.h\n"
- "movprfx z18, z28\n fmax z18.h, p2/M, z18.h, z27.h\n"
- "movprfx z17, z26\n fmax z17.h, p2/M, z17.h, z25.h\n"
- "movprfx z16, z24\n fmax z16.h, p2/M, z16.h, z28.h\n"
- "movprfx z20, z26\n fmax z20.h, p2/M, z20.h, z23.h\n"
- "movprfx z19, z22\n fmax z19.h, p2/M, z19.h, z18.h\n"
- "st1h { z19.h }, p0, [x11, x12, LSL #1]\n"
- "movprfx z18, z22\n fmax z18.h, p2/M, z18.h, z17.h\n"
- "movprfx z17, z21\n fmax z17.h, p2/M, z17.h, z16.h\n"
- "st1h { z18.h }, p0, [x10, x12, LSL #1]\n"
- "movprfx z16, z21\n fmax z16.h, p2/M, z16.h, z20.h\n"
- "st1h { z17.h }, p0, [x9, x12, LSL #1]\n"
- "st1h { z16.h }, p0, [x28, x12, LSL #1]\n"
+ "movprfx z20, z28\n fmax z20.h, p2/M, z20.h, z27.h\n"
+ "movprfx z19, z26\n fmax z19.h, p2/M, z19.h, z25.h\n"
+ "movprfx z17, z28\n fmax z17.h, p2/M, z17.h, z24.h\n"
+ "movprfx z18, z25\n fmax z18.h, p2/M, z18.h, z23.h\n"
+ "whilelt p0.h, x11, x15\n"
+ "movprfx z16, z22\n fmax z16.h, p2/M, z16.h, z20.h\n"
+ "st1h { z16.h }, p0, [x13, x11, LSL #1]\n"
+ "movprfx z16, z19\n fmax z16.h, p2/M, z16.h, z22.h\n"
+ "fmax z17.h, p2/M, z17.h, z21.h\n"
+ "st1h { z16.h }, p0, [x12, x11, LSL #1]\n"
+ "movprfx z16, z21\n fmax z16.h, p2/M, z16.h, z18.h\n"
+ "st1h { z17.h }, p0, [x10, x11, LSL #1]\n"
+ "st1h { z16.h }, p0, [x9, x11, LSL #1]\n"
:
: [args] "r" (&args), [offsetof_inptrs] "I" (offsetof(KernelArgs, inptrs)), [offsetof_n_channels] "I" (offsetof(KernelArgs, n_channels)), [offsetof_outptrs] "I" (offsetof(KernelArgs, outptrs))
- : "cc", "memory", "p0", "p1", "p2", "x9", "x10", "x11", "x12", "x13", "x14", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
} // namespace pooling
} // namespace arm_conv
-#endif // defined(__ARM_FEATURE_SVE) && defined(__ARM_FP16_ARGS)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp16_nhwc_max_generic_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp16_nhwc_max_generic_depthfirst.hpp
index 1c17c27619..0ef0a793cc 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp16_nhwc_max_generic_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp16_nhwc_max_generic_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -26,29 +26,21 @@
#pragma once
-#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FP16_ARGS)
+#if defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS)
namespace arm_conv {
namespace pooling {
void sve_fp16_nhwc_max_generic_depthfirst_impl(const uint64_t, const uint64_t n_valid_cells, uint64_t n_channels, const __fp16 *const *const inptrs, __fp16 *outptr);
-struct sve_fp16_nhwc_max_generic_depthfirst
+struct sve_fp16_nhwc_max_generic_depthfirst : IGenericDepthfirstStrategy<__fp16, __fp16>
{
- typedef __fp16 operand_type;
- typedef __fp16 return_type;
-
- typedef void (*kern_type)(const uint64_t, const uint64_t n_valid_cells, uint64_t n_channels, const __fp16 *const *const inptrs, __fp16 *outptr);
-
- constexpr static PoolingType pooling_type(void) { return PoolingType::MAX; }
-
-
- kern_type kernel = sve_fp16_nhwc_max_generic_depthfirst_impl;
-
+ using Parent = IGenericDepthfirstStrategy<__fp16, __fp16>;
sve_fp16_nhwc_max_generic_depthfirst(const CPUInfo *) {}
+ typename Parent::KernelType get_kernel(void) const override { return sve_fp16_nhwc_max_generic_depthfirst_impl; }
};
} // namespace pooling
} // namespace arm_conv
-#endif // defined(__ARM_FEATURE_SVE) && defined(__ARM_FP16_ARGS)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp16_nhwc_max_generic_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp16_nhwc_max_generic_depthfirst/generic.cpp
index 58ab915605..1a01412836 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp16_nhwc_max_generic_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp16_nhwc_max_generic_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,8 +23,9 @@
*/
#include <cstdint>
+#include <cstddef>
-#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FP16_ARGS)
+#if defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS)
namespace arm_conv {
namespace pooling {
@@ -39,185 +40,184 @@ void sve_fp16_nhwc_max_generic_depthfirst_impl(
)
{
__asm__ __volatile__(
- "ptrue p4.b\n"
- "mov x28, #0x0\n"
- "cnth x27\n"
- "cnth x26, ALL, MUL #2\n"
- "cnth x25, ALL, MUL #3\n"
+ "mov x9, #0x0\n"
+ "cnth x28\n"
+ "cnth x27, ALL, MUL #2\n"
+ "cnth x26, ALL, MUL #3\n"
+ "whilelt p4.h, x9, %x[n_channels]\n"
"whilelt p3.h, x28, %x[n_channels]\n"
"whilelt p2.h, x27, %x[n_channels]\n"
"whilelt p1.h, x26, %x[n_channels]\n"
- "whilelt p0.h, x25, %x[n_channels]\n"
+ "ptrue p0.b\n"
"b.none 7f\n"
"1:" // 4-vectors of channels
+ "lsr x25, %x[n_valid_cells], #0x2\n"
+ "mov z8.h, #0xfc00\n"
"mov z7.h, #0xfc00\n"
- "mov x19, %x[inptrs]\n"
+ "mov x24, %x[inptrs]\n"
"mov z6.h, #0xfc00\n"
- "lsr x24, %x[n_valid_cells], #0x2\n"
"mov z5.h, #0xfc00\n"
- "mov z4.h, #0xfc00\n"
- "cbz x24, 4f\n"
- "ldp x23, x22, [x19, #0x0]\n"
- "ldp x21, x20, [x19, #0x10]\n"
- "add x19, x19, #0x20\n"
- "subs x24, x24, #0x1\n"
- "ld1h { z3.h }, p3/Z, [x23, x28, LSL #1]\n"
- "ld1h { z2.h }, p3/Z, [x22, x28, LSL #1]\n"
- "ld1h { z1.h }, p3/Z, [x21, x28, LSL #1]\n"
- "ld1h { z0.h }, p3/Z, [x20, x28, LSL #1]\n"
- "ld1h { z31.h }, p2/Z, [x23, x27, LSL #1]\n"
- "ld1h { z30.h }, p2/Z, [x22, x27, LSL #1]\n"
- "ld1h { z22.h }, p2/Z, [x21, x27, LSL #1]\n"
- "ld1h { z29.h }, p2/Z, [x20, x27, LSL #1]\n"
- "ld1h { z28.h }, p1/Z, [x23, x26, LSL #1]\n"
- "ld1h { z27.h }, p1/Z, [x22, x26, LSL #1]\n"
- "ld1h { z21.h }, p1/Z, [x21, x26, LSL #1]\n"
- "ld1h { z26.h }, p1/Z, [x20, x26, LSL #1]\n"
- "ld1h { z16.h }, p0/Z, [x23, x25, LSL #1]\n"
- "ld1h { z25.h }, p0/Z, [x22, x25, LSL #1]\n"
- "ld1h { z20.h }, p0/Z, [x21, x25, LSL #1]\n"
- "ld1h { z24.h }, p0/Z, [x20, x25, LSL #1]\n"
+ "cbz x25, 4f\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "subs x25, x25, #0x1\n"
+ "add x24, x24, #0x20\n"
+ "ld1h { z4.h }, p4/Z, [x23, x9, LSL #1]\n"
+ "ld1h { z3.h }, p4/Z, [x22, x9, LSL #1]\n"
+ "ld1h { z2.h }, p4/Z, [x21, x9, LSL #1]\n"
+ "ld1h { z1.h }, p4/Z, [x20, x9, LSL #1]\n"
+ "ld1h { z0.h }, p3/Z, [x23, x28, LSL #1]\n"
+ "ld1h { z31.h }, p3/Z, [x22, x28, LSL #1]\n"
+ "ld1h { z22.h }, p3/Z, [x21, x28, LSL #1]\n"
+ "ld1h { z30.h }, p3/Z, [x20, x28, LSL #1]\n"
+ "ld1h { z29.h }, p2/Z, [x23, x27, LSL #1]\n"
+ "ld1h { z28.h }, p2/Z, [x22, x27, LSL #1]\n"
+ "ld1h { z21.h }, p2/Z, [x21, x27, LSL #1]\n"
+ "ld1h { z27.h }, p2/Z, [x20, x27, LSL #1]\n"
+ "ld1h { z26.h }, p1/Z, [x23, x26, LSL #1]\n"
+ "ld1h { z25.h }, p1/Z, [x22, x26, LSL #1]\n"
+ "ld1h { z20.h }, p1/Z, [x21, x26, LSL #1]\n"
+ "ld1h { z24.h }, p1/Z, [x20, x26, LSL #1]\n"
"beq 3f\n"
"2:" // 4-vectors of channels: 4 inputs loop
- "movprfx z19, z3\n fmax z19.h, p4/M, z19.h, z2.h\n"
- "ldp x23, x22, [x19, #0x0]\n"
- "subs x24, x24, #0x1\n"
- "movprfx z23, z1\n fmax z23.h, p4/M, z23.h, z0.h\n"
- "ldp x21, x20, [x19, #0x10]\n"
- "add x19, x19, #0x20\n"
- "movprfx z18, z31\n fmax z18.h, p4/M, z18.h, z30.h\n"
- "ld1h { z3.h }, p3/Z, [x23, x28, LSL #1]\n"
- "fmax z22.h, p4/M, z22.h, z29.h\n"
- "movprfx z17, z28\n fmax z17.h, p4/M, z17.h, z27.h\n"
- "ld1h { z2.h }, p3/Z, [x22, x28, LSL #1]\n"
- "fmax z21.h, p4/M, z21.h, z26.h\n"
- "ld1h { z1.h }, p3/Z, [x21, x28, LSL #1]\n"
- "fmax z16.h, p4/M, z16.h, z25.h\n"
- "ld1h { z0.h }, p3/Z, [x20, x28, LSL #1]\n"
- "fmax z20.h, p4/M, z20.h, z24.h\n"
- "ld1h { z31.h }, p2/Z, [x23, x27, LSL #1]\n"
- "fmax z19.h, p4/M, z19.h, z23.h\n"
- "ld1h { z30.h }, p2/Z, [x22, x27, LSL #1]\n"
- "fmax z18.h, p4/M, z18.h, z22.h\n"
- "ld1h { z22.h }, p2/Z, [x21, x27, LSL #1]\n"
- "fmax z17.h, p4/M, z17.h, z21.h\n"
- "ld1h { z29.h }, p2/Z, [x20, x27, LSL #1]\n"
- "fmax z16.h, p4/M, z16.h, z20.h\n"
- "ld1h { z28.h }, p1/Z, [x23, x26, LSL #1]\n"
- "fmax z7.h, p4/M, z7.h, z19.h\n"
- "ld1h { z27.h }, p1/Z, [x22, x26, LSL #1]\n"
- "fmax z6.h, p4/M, z6.h, z18.h\n"
- "ld1h { z21.h }, p1/Z, [x21, x26, LSL #1]\n"
- "fmax z5.h, p4/M, z5.h, z17.h\n"
- "ld1h { z26.h }, p1/Z, [x20, x26, LSL #1]\n"
- "fmax z4.h, p4/M, z4.h, z16.h\n"
- "ld1h { z16.h }, p0/Z, [x23, x25, LSL #1]\n"
- "ld1h { z25.h }, p0/Z, [x22, x25, LSL #1]\n"
- "ld1h { z20.h }, p0/Z, [x21, x25, LSL #1]\n"
- "ld1h { z24.h }, p0/Z, [x20, x25, LSL #1]\n"
+ "movprfx z19, z4\n fmax z19.h, p0/M, z19.h, z3.h\n"
+ "movprfx z23, z2\n fmax z23.h, p0/M, z23.h, z1.h\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "movprfx z18, z0\n fmax z18.h, p0/M, z18.h, z31.h\n"
+ "fmax z22.h, p0/M, z22.h, z30.h\n"
+ "ld1h { z4.h }, p4/Z, [x23, x9, LSL #1]\n"
+ "ld1h { z3.h }, p4/Z, [x22, x9, LSL #1]\n"
+ "movprfx z17, z29\n fmax z17.h, p0/M, z17.h, z28.h\n"
+ "fmax z21.h, p0/M, z21.h, z27.h\n"
+ "ld1h { z2.h }, p4/Z, [x21, x9, LSL #1]\n"
+ "ld1h { z1.h }, p4/Z, [x20, x9, LSL #1]\n"
+ "movprfx z16, z26\n fmax z16.h, p0/M, z16.h, z25.h\n"
+ "fmax z20.h, p0/M, z20.h, z24.h\n"
+ "ld1h { z0.h }, p3/Z, [x23, x28, LSL #1]\n"
+ "ld1h { z31.h }, p3/Z, [x22, x28, LSL #1]\n"
+ "fmax z19.h, p0/M, z19.h, z23.h\n"
+ "fmax z18.h, p0/M, z18.h, z22.h\n"
+ "ld1h { z22.h }, p3/Z, [x21, x28, LSL #1]\n"
+ "ld1h { z30.h }, p3/Z, [x20, x28, LSL #1]\n"
+ "fmax z17.h, p0/M, z17.h, z21.h\n"
+ "fmax z16.h, p0/M, z16.h, z20.h\n"
+ "ld1h { z29.h }, p2/Z, [x23, x27, LSL #1]\n"
+ "ld1h { z28.h }, p2/Z, [x22, x27, LSL #1]\n"
+ "subs x25, x25, #0x1\n"
+ "fmax z8.h, p0/M, z8.h, z19.h\n"
+ "ld1h { z21.h }, p2/Z, [x21, x27, LSL #1]\n"
+ "ld1h { z27.h }, p2/Z, [x20, x27, LSL #1]\n"
+ "fmax z7.h, p0/M, z7.h, z18.h\n"
+ "fmax z6.h, p0/M, z6.h, z17.h\n"
+ "ld1h { z26.h }, p1/Z, [x23, x26, LSL #1]\n"
+ "ld1h { z25.h }, p1/Z, [x22, x26, LSL #1]\n"
+ "fmax z5.h, p0/M, z5.h, z16.h\n"
+ "add x24, x24, #0x20\n"
+ "ld1h { z20.h }, p1/Z, [x21, x26, LSL #1]\n"
+ "ld1h { z24.h }, p1/Z, [x20, x26, LSL #1]\n"
"bgt 2b\n"
"3:" // 4-vectors of channels: 4 inputs tail
- "movprfx z19, z3\n fmax z19.h, p4/M, z19.h, z2.h\n"
- "movprfx z23, z1\n fmax z23.h, p4/M, z23.h, z0.h\n"
- "movprfx z18, z31\n fmax z18.h, p4/M, z18.h, z30.h\n"
- "fmax z22.h, p4/M, z22.h, z29.h\n"
- "movprfx z17, z28\n fmax z17.h, p4/M, z17.h, z27.h\n"
- "fmax z21.h, p4/M, z21.h, z26.h\n"
- "fmax z16.h, p4/M, z16.h, z25.h\n"
- "fmax z20.h, p4/M, z20.h, z24.h\n"
- "fmax z19.h, p4/M, z19.h, z23.h\n"
- "fmax z18.h, p4/M, z18.h, z22.h\n"
- "fmax z17.h, p4/M, z17.h, z21.h\n"
- "fmax z16.h, p4/M, z16.h, z20.h\n"
- "fmax z7.h, p4/M, z7.h, z19.h\n"
- "fmax z6.h, p4/M, z6.h, z18.h\n"
- "fmax z5.h, p4/M, z5.h, z17.h\n"
- "fmax z4.h, p4/M, z4.h, z16.h\n"
+ "movprfx z19, z4\n fmax z19.h, p0/M, z19.h, z3.h\n"
+ "movprfx z23, z2\n fmax z23.h, p0/M, z23.h, z1.h\n"
+ "movprfx z18, z0\n fmax z18.h, p0/M, z18.h, z31.h\n"
+ "fmax z22.h, p0/M, z22.h, z30.h\n"
+ "movprfx z17, z29\n fmax z17.h, p0/M, z17.h, z28.h\n"
+ "fmax z21.h, p0/M, z21.h, z27.h\n"
+ "movprfx z16, z26\n fmax z16.h, p0/M, z16.h, z25.h\n"
+ "fmax z20.h, p0/M, z20.h, z24.h\n"
+ "fmax z19.h, p0/M, z19.h, z23.h\n"
+ "fmax z18.h, p0/M, z18.h, z22.h\n"
+ "fmax z17.h, p0/M, z17.h, z21.h\n"
+ "fmax z16.h, p0/M, z16.h, z20.h\n"
+ "fmax z8.h, p0/M, z8.h, z19.h\n"
+ "fmax z7.h, p0/M, z7.h, z18.h\n"
+ "fmax z6.h, p0/M, z6.h, z17.h\n"
+ "fmax z5.h, p0/M, z5.h, z16.h\n"
"4:" // 4-vectors of channels: After loop
- "ands x20, %x[n_valid_cells], #0x3\n"
+ "ands x21, %x[n_valid_cells], #0x3\n"
"beq 6f\n"
"5:" // 4-vectors of channels: Single input loop
- "ldr x23, [x19], #0x8\n"
- "subs x20, x20, #0x1\n"
- "ld1h { z3.h }, p3/Z, [x23, x28, LSL #1]\n"
- "fmax z7.h, p4/M, z7.h, z3.h\n"
- "ld1h { z31.h }, p2/Z, [x23, x27, LSL #1]\n"
- "ld1h { z28.h }, p1/Z, [x23, x26, LSL #1]\n"
- "fmax z6.h, p4/M, z6.h, z31.h\n"
- "ld1h { z16.h }, p0/Z, [x23, x25, LSL #1]\n"
- "fmax z5.h, p4/M, z5.h, z28.h\n"
- "fmax z4.h, p4/M, z4.h, z16.h\n"
+ "ldr x20, [x24], #0x8\n"
+ "ld1h { z16.h }, p4/Z, [x20, x9, LSL #1]\n"
+ "subs x21, x21, #0x1\n"
+ "fmax z8.h, p0/M, z8.h, z16.h\n"
+ "ld1h { z17.h }, p3/Z, [x20, x28, LSL #1]\n"
+ "ld1h { z16.h }, p2/Z, [x20, x27, LSL #1]\n"
+ "fmax z7.h, p0/M, z7.h, z17.h\n"
+ "fmax z6.h, p0/M, z6.h, z16.h\n"
+ "ld1h { z16.h }, p1/Z, [x20, x26, LSL #1]\n"
+ "fmax z5.h, p0/M, z5.h, z16.h\n"
"bgt 5b\n"
"6:" // 4-vectors of channels: Single input loop: End
+ "st1h { z8.h }, p4, [%x[outptr], x9, LSL #1]\n"
+ "inch x9, ALL, MUL #4\n"
"st1h { z7.h }, p3, [%x[outptr], x28, LSL #1]\n"
"inch x28, ALL, MUL #4\n"
"st1h { z6.h }, p2, [%x[outptr], x27, LSL #1]\n"
"inch x27, ALL, MUL #4\n"
"st1h { z5.h }, p1, [%x[outptr], x26, LSL #1]\n"
"inch x26, ALL, MUL #4\n"
- "st1h { z4.h }, p0, [%x[outptr], x25, LSL #1]\n"
- "inch x25, ALL, MUL #4\n"
- "whilelt p0.h, x25, %x[n_channels]\n"
+ "whilelt p1.h, x26, %x[n_channels]\n"
"b.any 1b\n"
"7:" // Single vector of channels
- "whilelt p3.h, x28, %x[n_channels]\n"
+ "whilelt p4.h, x9, %x[n_channels]\n"
"b.none 14f\n"
"8:" // Single vector of channels: Loop
- "mov z7.h, #0xfc00\n"
- "mov x19, %x[inptrs]\n"
- "lsr x24, %x[n_valid_cells], #0x2\n"
- "cbz x24, 11f\n"
- "ldp x23, x22, [x19, #0x0]\n"
- "ldp x21, x20, [x19, #0x10]\n"
- "add x19, x19, #0x20\n"
- "subs x24, x24, #0x1\n"
- "ld1h { z3.h }, p3/Z, [x23, x28, LSL #1]\n"
- "ld1h { z2.h }, p3/Z, [x22, x28, LSL #1]\n"
- "ld1h { z1.h }, p3/Z, [x21, x28, LSL #1]\n"
- "ld1h { z0.h }, p3/Z, [x20, x28, LSL #1]\n"
+ "lsr x25, %x[n_valid_cells], #0x2\n"
+ "mov z8.h, #0xfc00\n"
+ "mov x24, %x[inptrs]\n"
+ "cbz x25, 11f\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "subs x25, x25, #0x1\n"
+ "add x24, x24, #0x20\n"
+ "ld1h { z4.h }, p4/Z, [x23, x9, LSL #1]\n"
+ "ld1h { z3.h }, p4/Z, [x22, x9, LSL #1]\n"
+ "ld1h { z2.h }, p4/Z, [x21, x9, LSL #1]\n"
+ "ld1h { z1.h }, p4/Z, [x20, x9, LSL #1]\n"
"beq 10f\n"
"9:" // Single vector of channels: Loop: 4 inputs loop
- "movprfx z19, z3\n fmax z19.h, p4/M, z19.h, z2.h\n"
- "ldp x23, x22, [x19, #0x0]\n"
- "subs x24, x24, #0x1\n"
- "movprfx z23, z1\n fmax z23.h, p4/M, z23.h, z0.h\n"
- "ldp x21, x20, [x19, #0x10]\n"
- "add x19, x19, #0x20\n"
- "fmax z19.h, p4/M, z19.h, z23.h\n"
- "ld1h { z3.h }, p3/Z, [x23, x28, LSL #1]\n"
- "ld1h { z2.h }, p3/Z, [x22, x28, LSL #1]\n"
- "fmax z7.h, p4/M, z7.h, z19.h\n"
- "ld1h { z1.h }, p3/Z, [x21, x28, LSL #1]\n"
- "ld1h { z0.h }, p3/Z, [x20, x28, LSL #1]\n"
+ "movprfx z16, z4\n fmax z16.h, p0/M, z16.h, z3.h\n"
+ "movprfx z17, z2\n fmax z17.h, p0/M, z17.h, z1.h\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "fmax z16.h, p0/M, z16.h, z17.h\n"
+ "subs x25, x25, #0x1\n"
+ "ld1h { z4.h }, p4/Z, [x23, x9, LSL #1]\n"
+ "ld1h { z3.h }, p4/Z, [x22, x9, LSL #1]\n"
+ "fmax z8.h, p0/M, z8.h, z16.h\n"
+ "add x24, x24, #0x20\n"
+ "ld1h { z2.h }, p4/Z, [x21, x9, LSL #1]\n"
+ "ld1h { z1.h }, p4/Z, [x20, x9, LSL #1]\n"
"bgt 9b\n"
"10:" // Single vector of channels: Loop: 4 inputs tail
- "movprfx z19, z3\n fmax z19.h, p4/M, z19.h, z2.h\n"
- "movprfx z23, z1\n fmax z23.h, p4/M, z23.h, z0.h\n"
- "fmax z19.h, p4/M, z19.h, z23.h\n"
- "fmax z7.h, p4/M, z7.h, z19.h\n"
+ "movprfx z16, z4\n fmax z16.h, p0/M, z16.h, z3.h\n"
+ "movprfx z17, z2\n fmax z17.h, p0/M, z17.h, z1.h\n"
+ "fmax z16.h, p0/M, z16.h, z17.h\n"
+ "fmax z8.h, p0/M, z8.h, z16.h\n"
"11:" // Single vector of channels: Loop: After loop
- "ands x20, %x[n_valid_cells], #0x3\n"
+ "ands x21, %x[n_valid_cells], #0x3\n"
"beq 13f\n"
"12:" // Single vector of channels: Loop: Single input loop
- "ldr x23, [x19], #0x8\n"
- "subs x20, x20, #0x1\n"
- "ld1h { z3.h }, p3/Z, [x23, x28, LSL #1]\n"
- "fmax z7.h, p4/M, z7.h, z3.h\n"
+ "ldr x20, [x24], #0x8\n"
+ "ld1h { z16.h }, p4/Z, [x20, x9, LSL #1]\n"
+ "subs x21, x21, #0x1\n"
+ "fmax z8.h, p0/M, z8.h, z16.h\n"
"bgt 12b\n"
"13:" // Single vector of channels: Loop: Single input loop: End
- "st1h { z7.h }, p3, [%x[outptr], x28, LSL #1]\n"
- "inch x28\n"
- "whilelt p3.h, x28, %x[n_channels]\n"
+ "st1h { z8.h }, p4, [%x[outptr], x9, LSL #1]\n"
+ "inch x9\n"
+ "whilelt p4.h, x9, %x[n_channels]\n"
"b.any 8b\n"
"14:" // End
-
:
: [inptrs] "r" (inptrs), [n_channels] "r" (n_channels), [n_valid_cells] "r" (n_valid_cells), [outptr] "r" (outptr)
- : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x9", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
} // namespace pooling
} // namespace arm_conv
-#endif // defined(__ARM_FEATURE_SVE) && defined(__ARM_FP16_ARGS)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst.hpp
index 9cbdb8a58d..d5578d617f 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,37 +24,28 @@
#pragma once
-#if defined(__ARM_FEATURE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
namespace arm_conv {
namespace pooling {
void sve_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst_impl(unsigned int, const float *const *const, float *const *const, bool, unsigned int, unsigned int, unsigned int, unsigned int);
-struct sve_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst
+struct sve_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst : public DepthfirstStrategy<float, float>
{
- typedef float operand_type;
- typedef float return_type;
+ using Parent = DepthfirstStrategy<float, float>;
- typedef void (*kern_type)(unsigned int, const float *const *const, float *const *const, bool, unsigned int, unsigned int, unsigned int, unsigned int);
+ const static auto pooling_type = PoolingType::AVERAGE;
+ const static auto pool_rows = 3u, pool_cols = 3u;
+ const static auto stride_rows = 1u, stride_cols = 1u;
- constexpr static PoolingType pooling_type(void) { return PoolingType::AVERAGE; }
+ sve_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst(const CPUInfo *)
+ : Parent(pool_rows, pool_cols, stride_rows, stride_cols, 2, 2) {}
- constexpr static unsigned int pool_rows(void) { return 3; }
- constexpr static unsigned int pool_cols(void) { return 3; }
-
- constexpr static unsigned int stride_rows(void) { return 1; }
- constexpr static unsigned int stride_cols(void) { return 1; }
-
- constexpr static unsigned int out_rows(void) { return 2; }
- constexpr static unsigned int out_cols(void) { return 2; }
-
- kern_type kernel = sve_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst_impl;
-
- sve_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst(const CPUInfo *) {}
+ Parent::KernelType get_kernel(void) const { return sve_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst_impl; }
};
} // namespace pooling
} // namespace arm_conv
-#endif // defined(__ARM_FEATURE_SVE)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst/generic.cpp
index 50f5da4c3d..c5ea5adea0 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -26,7 +26,7 @@
#include <cstddef>
#include <cstdint>
-#if defined(__ARM_FEATURE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
namespace arm_conv {
namespace pooling {
@@ -82,126 +82,126 @@ void sve_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst_impl(
pad_left, pad_top, pad_right, pad_bottom);
__asm__ __volatile__(
- "ldr x3, [%x[args], %[offsetof_n_channels]]\n"
- "mov x4, #0x0\n"
- "ldr x20, [%x[args], %[offsetof_outptrs]]\n"
- "mov x5, #0x0\n"
- "ldr x6, [%x[args], %[offsetof_inptrs]]\n"
- "mov x19, #0x4\n"
- "add x7, %x[args], %[offsetof_rescale]\n"
- "ldp x8, x17, [x20, #0x0]\n"
- "ldp x16, x15, [x20, #0x10]\n"
- "whilelt p0.s, XZR, x19\n"
- "ldp x14, x13, [x6, #0x0]\n"
- "whilelt p1.s, x4, x3\n"
- "ldp x12, x11, [x6, #0x10]\n"
- "ldp x10, x9, [x6, #0x20]\n"
- "ldp x28, x27, [x6, #0x30]\n"
- "ldp x26, x25, [x6, #0x40]\n"
- "ldp x24, x23, [x6, #0x50]\n"
- "ldp x22, x21, [x6, #0x60]\n"
- "ldp x20, x19, [x6, #0x70]\n"
- "ld1rqw { z7.s }, p0/Z, [x7]\n"
- "ld1w { z8.s }, p1/Z, [x9, x4, LSL #2]\n"
- "ld1w { z6.s }, p1/Z, [x28, x4, LSL #2]\n"
- "ld1w { z5.s }, p1/Z, [x25, x4, LSL #2]\n"
- "ld1w { z4.s }, p1/Z, [x24, x4, LSL #2]\n"
- "ld1w { z3.s }, p1/Z, [x13, x4, LSL #2]\n"
- "ld1w { z2.s }, p1/Z, [x12, x4, LSL #2]\n"
- "ld1w { z1.s }, p1/Z, [x10, x4, LSL #2]\n"
- "ld1w { z0.s }, p1/Z, [x26, x4, LSL #2]\n"
- "ld1w { z31.s }, p1/Z, [x27, x4, LSL #2]\n"
- "ld1w { z30.s }, p1/Z, [x23, x4, LSL #2]\n"
- "ld1w { z29.s }, p1/Z, [x21, x4, LSL #2]\n"
- "ld1w { z28.s }, p1/Z, [x20, x4, LSL #2]\n"
- "ld1w { z27.s }, p1/Z, [x14, x4, LSL #2]\n"
- "ld1w { z26.s }, p1/Z, [x11, x4, LSL #2]\n"
- "ld1w { z25.s }, p1/Z, [x22, x4, LSL #2]\n"
- "ld1w { z24.s }, p1/Z, [x19, x4, LSL #2]\n"
- "incw x4\n"
- "whilelt p1.s, x4, x3\n"
+ "ldr x2, [%x[args], %[offsetof_n_channels]]\n"
+ "ldr x21, [%x[args], %[offsetof_outptrs]]\n"
+ "mov x3, #0x0\n"
+ "mov x20, #0x4\n"
+ "ldr x4, [%x[args], %[offsetof_inptrs]]\n"
+ "ldp x5, x6, [x21, #0x0]\n"
+ "whilelt p2.s, XZR, x20\n"
+ "whilelt p0.s, x3, x2\n"
+ "ldp x7, x8, [x21, #0x10]\n"
+ "ldp x17, x16, [x4, #0x0]\n"
+ "add x15, %x[args], %[offsetof_rescale]\n"
+ "mov x14, #0x0\n"
+ "ldp x13, x12, [x4, #0x10]\n"
+ "ldp x11, x10, [x4, #0x20]\n"
+ "ldp x9, x28, [x4, #0x30]\n"
+ "ldp x27, x26, [x4, #0x40]\n"
+ "ldp x25, x24, [x4, #0x50]\n"
+ "ldp x23, x22, [x4, #0x60]\n"
+ "ldp x21, x20, [x4, #0x70]\n"
+ "ld1w { z7.s }, p0/Z, [x10, x3, LSL #2]\n"
+ "ld1w { z6.s }, p0/Z, [x9, x3, LSL #2]\n"
+ "ld1w { z5.s }, p0/Z, [x26, x3, LSL #2]\n"
+ "ld1w { z4.s }, p0/Z, [x25, x3, LSL #2]\n"
+ "ld1w { z3.s }, p0/Z, [x16, x3, LSL #2]\n"
+ "ld1w { z2.s }, p0/Z, [x13, x3, LSL #2]\n"
+ "ld1w { z1.s }, p0/Z, [x11, x3, LSL #2]\n"
+ "ld1w { z31.s }, p0/Z, [x27, x3, LSL #2]\n"
+ "ld1w { z30.s }, p0/Z, [x28, x3, LSL #2]\n"
+ "ld1w { z29.s }, p0/Z, [x24, x3, LSL #2]\n"
+ "ld1w { z28.s }, p0/Z, [x22, x3, LSL #2]\n"
+ "ld1w { z27.s }, p0/Z, [x21, x3, LSL #2]\n"
+ "ld1w { z26.s }, p0/Z, [x17, x3, LSL #2]\n"
+ "ld1w { z25.s }, p0/Z, [x12, x3, LSL #2]\n"
+ "ld1w { z24.s }, p0/Z, [x23, x3, LSL #2]\n"
+ "ld1w { z23.s }, p0/Z, [x20, x3, LSL #2]\n"
+ "incw x3\n"
+ "whilelt p1.s, x3, x2\n"
+ "ld1rqw { z0.s }, p2/Z, [x15]\n"
"b.none 2f\n"
"1:" // Vector: Loop
- "fadd z17.s, z8.s, z6.s\n"
- "ld1w { z8.s }, p1/Z, [x9, x4, LSL #2]\n"
- "whilelt p0.s, x5, x3\n"
+ "fadd z17.s, z7.s, z6.s\n"
"fadd z16.s, z5.s, z4.s\n"
- "ld1w { z6.s }, p1/Z, [x28, x4, LSL #2]\n"
+ "ld1w { z7.s }, p1/Z, [x10, x3, LSL #2]\n"
+ "ld1w { z6.s }, p1/Z, [x9, x3, LSL #2]\n"
+ "fadd z19.s, z17.s, z16.s\n"
"fadd z18.s, z3.s, z2.s\n"
- "ld1w { z5.s }, p1/Z, [x25, x4, LSL #2]\n"
- "fadd z23.s, z1.s, z0.s\n"
- "ld1w { z4.s }, p1/Z, [x24, x4, LSL #2]\n"
- "fadd z22.s, z31.s, z30.s\n"
- "ld1w { z3.s }, p1/Z, [x13, x4, LSL #2]\n"
- "fadd z17.s, z17.s, z16.s\n"
- "ld1w { z2.s }, p1/Z, [x12, x4, LSL #2]\n"
- "fadd z16.s, z29.s, z28.s\n"
- "ld1w { z1.s }, p1/Z, [x10, x4, LSL #2]\n"
- "fadd z19.s, z27.s, z23.s\n"
- "ld1w { z0.s }, p1/Z, [x26, x4, LSL #2]\n"
- "fadd z21.s, z18.s, z17.s\n"
- "ld1w { z31.s }, p1/Z, [x27, x4, LSL #2]\n"
- "fadd z20.s, z16.s, z17.s\n"
- "ld1w { z30.s }, p1/Z, [x23, x4, LSL #2]\n"
- "fadd z18.s, z26.s, z22.s\n"
- "ld1w { z29.s }, p1/Z, [x21, x4, LSL #2]\n"
- "fadd z17.s, z25.s, z23.s\n"
- "ld1w { z28.s }, p1/Z, [x20, x4, LSL #2]\n"
- "fadd z16.s, z24.s, z22.s\n"
- "ld1w { z27.s }, p1/Z, [x14, x4, LSL #2]\n"
+ "ld1w { z5.s }, p1/Z, [x26, x3, LSL #2]\n"
+ "ld1w { z4.s }, p1/Z, [x25, x3, LSL #2]\n"
+ "fadd z17.s, z1.s, z31.s\n"
+ "fadd z22.s, z30.s, z29.s\n"
+ "ld1w { z3.s }, p1/Z, [x16, x3, LSL #2]\n"
+ "ld1w { z2.s }, p1/Z, [x13, x3, LSL #2]\n"
+ "fadd z16.s, z28.s, z27.s\n"
+ "fadd z21.s, z18.s, z19.s\n"
+ "ld1w { z1.s }, p1/Z, [x11, x3, LSL #2]\n"
+ "ld1w { z31.s }, p1/Z, [x27, x3, LSL #2]\n"
+ "fadd z20.s, z16.s, z19.s\n"
+ "fadd z19.s, z26.s, z17.s\n"
+ "ld1w { z30.s }, p1/Z, [x28, x3, LSL #2]\n"
+ "ld1w { z29.s }, p1/Z, [x24, x3, LSL #2]\n"
+ "fadd z18.s, z25.s, z22.s\n"
+ "fadd z17.s, z24.s, z17.s\n"
+ "ld1w { z28.s }, p1/Z, [x22, x3, LSL #2]\n"
+ "ld1w { z27.s }, p1/Z, [x21, x3, LSL #2]\n"
+ "fadd z16.s, z23.s, z22.s\n"
+ "ld1w { z26.s }, p1/Z, [x17, x3, LSL #2]\n"
+ "ld1w { z25.s }, p1/Z, [x12, x3, LSL #2]\n"
"fadd z19.s, z21.s, z19.s\n"
- "ld1w { z26.s }, p1/Z, [x11, x4, LSL #2]\n"
+ "ld1w { z24.s }, p1/Z, [x23, x3, LSL #2]\n"
+ "ld1w { z23.s }, p1/Z, [x20, x3, LSL #2]\n"
+ "incw x3\n"
"fadd z18.s, z21.s, z18.s\n"
- "ld1w { z25.s }, p1/Z, [x22, x4, LSL #2]\n"
"fadd z17.s, z17.s, z20.s\n"
- "ld1w { z24.s }, p1/Z, [x19, x4, LSL #2]\n"
- "incw x4\n"
- "fadd z16.s, z20.s, z16.s\n"
- "whilelt p1.s, x4, x3\n"
- "fmul z19.s, z19.s, z7.s[0]\n"
- "st1w { z19.s }, p0, [x8, x5, LSL #2]\n"
- "fmul z18.s, z18.s, z7.s[1]\n"
- "fmul z17.s, z17.s, z7.s[2]\n"
- "st1w { z18.s }, p0, [x17, x5, LSL #2]\n"
- "fmul z16.s, z16.s, z7.s[3]\n"
- "st1w { z17.s }, p0, [x16, x5, LSL #2]\n"
- "st1w { z16.s }, p0, [x15, x5, LSL #2]\n"
- "incw x5\n"
+ "fadd z16.s, z16.s, z20.s\n"
+ "whilelt p0.s, x14, x2\n"
+ "whilelt p1.s, x3, x2\n"
+ "fmul z19.s, z19.s, z0.s[0]\n"
+ "fmul z18.s, z18.s, z0.s[1]\n"
+ "st1w { z19.s }, p0, [x5, x14, LSL #2]\n"
+ "fmul z17.s, z17.s, z0.s[2]\n"
+ "fmul z16.s, z16.s, z0.s[3]\n"
+ "st1w { z18.s }, p0, [x6, x14, LSL #2]\n"
+ "st1w { z17.s }, p0, [x7, x14, LSL #2]\n"
+ "st1w { z16.s }, p0, [x8, x14, LSL #2]\n"
+ "incw x14\n"
"b.any 1b\n"
"2:" // Vector: Tail
- "fadd z17.s, z8.s, z6.s\n"
- "whilelt p0.s, x5, x3\n"
+ "fadd z17.s, z7.s, z6.s\n"
"fadd z16.s, z5.s, z4.s\n"
+ "whilelt p0.s, x14, x2\n"
+ "fadd z20.s, z17.s, z16.s\n"
"fadd z18.s, z3.s, z2.s\n"
- "fadd z23.s, z1.s, z0.s\n"
- "fadd z17.s, z17.s, z16.s\n"
- "fadd z22.s, z31.s, z30.s\n"
- "fadd z16.s, z29.s, z28.s\n"
- "fadd z21.s, z18.s, z17.s\n"
- "fadd z19.s, z27.s, z23.s\n"
- "fadd z20.s, z16.s, z17.s\n"
- "fadd z18.s, z26.s, z22.s\n"
- "fadd z17.s, z25.s, z23.s\n"
- "fadd z16.s, z24.s, z22.s\n"
- "fadd z19.s, z21.s, z19.s\n"
+ "fadd z17.s, z1.s, z31.s\n"
+ "fadd z19.s, z30.s, z29.s\n"
+ "fadd z16.s, z28.s, z27.s\n"
+ "fadd z21.s, z18.s, z20.s\n"
+ "fadd z20.s, z16.s, z20.s\n"
+ "fadd z16.s, z26.s, z17.s\n"
+ "fadd z18.s, z25.s, z19.s\n"
+ "fadd z17.s, z24.s, z17.s\n"
+ "fadd z19.s, z23.s, z19.s\n"
+ "fadd z16.s, z21.s, z16.s\n"
+ "fmul z16.s, z16.s, z0.s[0]\n"
+ "st1w { z16.s }, p0, [x5, x14, LSL #2]\n"
"fadd z18.s, z21.s, z18.s\n"
"fadd z17.s, z17.s, z20.s\n"
- "fadd z16.s, z20.s, z16.s\n"
- "fmul z19.s, z19.s, z7.s[0]\n"
- "st1w { z19.s }, p0, [x8, x5, LSL #2]\n"
- "fmul z18.s, z18.s, z7.s[1]\n"
- "fmul z17.s, z17.s, z7.s[2]\n"
- "st1w { z18.s }, p0, [x17, x5, LSL #2]\n"
- "fmul z16.s, z16.s, z7.s[3]\n"
- "st1w { z17.s }, p0, [x16, x5, LSL #2]\n"
- "st1w { z16.s }, p0, [x15, x5, LSL #2]\n"
+ "fmul z18.s, z18.s, z0.s[1]\n"
+ "fmul z17.s, z17.s, z0.s[2]\n"
+ "fadd z16.s, z19.s, z20.s\n"
+ "fmul z16.s, z16.s, z0.s[3]\n"
+ "st1w { z18.s }, p0, [x6, x14, LSL #2]\n"
+ "st1w { z17.s }, p0, [x7, x14, LSL #2]\n"
+ "st1w { z16.s }, p0, [x8, x14, LSL #2]\n"
:
: [args] "r" (&args), [offsetof_inptrs] "I" (offsetof(KernelArgs, inptrs)), [offsetof_n_channels] "I" (offsetof(KernelArgs, n_channels)), [offsetof_outptrs] "I" (offsetof(KernelArgs, outptrs)), [offsetof_rescale] "I" (offsetof(KernelArgs, rescale_vals))
- : "cc", "memory", "p0", "p1", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
} // namespace pooling
} // namespace arm_conv
-#endif // defined(__ARM_FEATURE_SVE)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp32_nhwc_avg_generic_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp32_nhwc_avg_generic_depthfirst.hpp
index 0daa046a02..a9e6b034e7 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp32_nhwc_avg_generic_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp32_nhwc_avg_generic_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -26,29 +26,21 @@
#pragma once
-#if defined(__ARM_FEATURE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
namespace arm_conv {
namespace pooling {
void sve_fp32_nhwc_avg_generic_depthfirst_impl(const uint64_t window_cells, const uint64_t n_valid_cells, uint64_t n_channels, const float *const *const inptrs, float *outptr);
-struct sve_fp32_nhwc_avg_generic_depthfirst
+struct sve_fp32_nhwc_avg_generic_depthfirst : IGenericDepthfirstStrategy<float, float>
{
- typedef float operand_type;
- typedef float return_type;
-
- typedef void (*kern_type)(const uint64_t window_cells, const uint64_t n_valid_cells, uint64_t n_channels, const float *const *const inptrs, float *outptr);
-
- constexpr static PoolingType pooling_type(void) { return PoolingType::AVERAGE; }
-
-
- kern_type kernel = sve_fp32_nhwc_avg_generic_depthfirst_impl;
-
+ using Parent = IGenericDepthfirstStrategy<float, float>;
sve_fp32_nhwc_avg_generic_depthfirst(const CPUInfo *) {}
+ typename Parent::KernelType get_kernel(void) const override { return sve_fp32_nhwc_avg_generic_depthfirst_impl; }
};
} // namespace pooling
} // namespace arm_conv
-#endif // defined(__ARM_FEATURE_SVE)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp32_nhwc_avg_generic_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp32_nhwc_avg_generic_depthfirst/generic.cpp
index c2f5745adc..7c94894892 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp32_nhwc_avg_generic_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp32_nhwc_avg_generic_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,8 +23,9 @@
*/
#include <cstdint>
+#include <cstddef>
-#if defined(__ARM_FEATURE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
namespace arm_conv {
namespace pooling {
@@ -41,88 +42,88 @@ void sve_fp32_nhwc_avg_generic_depthfirst_impl(
const auto rescale_value = static_cast<float>(1.0f / static_cast<float>(window_cells));
__asm__ __volatile__(
+ "mov x9, #0x0\n"
+ "cntw x28\n"
+ "cntw x27, ALL, MUL #2\n"
+ "cntw x26, ALL, MUL #3\n"
"ptrue p0.b\n"
- "ld1rw { z8.s }, p0/Z, [%x[rescale_ptr]]\n"
- "mov x28, #0x0\n"
- "cntw x27\n"
- "cntw x26, ALL, MUL #2\n"
- "cntw x25, ALL, MUL #3\n"
- "whilelt p3.s, x28, %x[n_channels]\n"
- "whilelt p2.s, x27, %x[n_channels]\n"
- "whilelt p1.s, x26, %x[n_channels]\n"
- "whilelt p0.s, x25, %x[n_channels]\n"
+ "whilelt p3.s, x9, %x[n_channels]\n"
+ "ld1rw { z7.s }, p0/Z, [%x[rescale_ptr]]\n"
+ "whilelt p2.s, x28, %x[n_channels]\n"
+ "whilelt p1.s, x27, %x[n_channels]\n"
+ "whilelt p0.s, x26, %x[n_channels]\n"
"b.none 7f\n"
"1:" // 4-vectors of channels
- "mov z7.b, #0x0\n"
- "mov x19, %x[inptrs]\n"
+ "lsr x25, %x[n_valid_cells], #0x2\n"
"mov z6.b, #0x0\n"
- "lsr x24, %x[n_valid_cells], #0x2\n"
"mov z5.b, #0x0\n"
+ "mov x24, %x[inptrs]\n"
"mov z4.b, #0x0\n"
- "cbz x24, 4f\n"
- "ldp x23, x22, [x19, #0x0]\n"
- "ldp x21, x20, [x19, #0x10]\n"
- "add x19, x19, #0x20\n"
- "subs x24, x24, #0x1\n"
- "ld1w { z3.s }, p3/Z, [x23, x28, LSL #2]\n"
- "ld1w { z2.s }, p3/Z, [x22, x28, LSL #2]\n"
- "ld1w { z1.s }, p3/Z, [x21, x28, LSL #2]\n"
- "ld1w { z0.s }, p3/Z, [x20, x28, LSL #2]\n"
- "ld1w { z31.s }, p2/Z, [x23, x27, LSL #2]\n"
- "ld1w { z30.s }, p2/Z, [x22, x27, LSL #2]\n"
- "ld1w { z29.s }, p2/Z, [x21, x27, LSL #2]\n"
- "ld1w { z28.s }, p2/Z, [x20, x27, LSL #2]\n"
- "ld1w { z27.s }, p1/Z, [x23, x26, LSL #2]\n"
- "ld1w { z21.s }, p1/Z, [x22, x26, LSL #2]\n"
- "ld1w { z26.s }, p1/Z, [x21, x26, LSL #2]\n"
- "ld1w { z17.s }, p1/Z, [x20, x26, LSL #2]\n"
- "ld1w { z25.s }, p0/Z, [x23, x25, LSL #2]\n"
- "ld1w { z20.s }, p0/Z, [x22, x25, LSL #2]\n"
- "ld1w { z24.s }, p0/Z, [x21, x25, LSL #2]\n"
- "ld1w { z16.s }, p0/Z, [x20, x25, LSL #2]\n"
+ "mov z3.b, #0x0\n"
+ "cbz x25, 4f\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "subs x25, x25, #0x1\n"
+ "add x24, x24, #0x20\n"
+ "ld1w { z2.s }, p3/Z, [x23, x9, LSL #2]\n"
+ "ld1w { z1.s }, p3/Z, [x22, x9, LSL #2]\n"
+ "ld1w { z0.s }, p3/Z, [x21, x9, LSL #2]\n"
+ "ld1w { z31.s }, p3/Z, [x20, x9, LSL #2]\n"
+ "ld1w { z30.s }, p2/Z, [x23, x28, LSL #2]\n"
+ "ld1w { z22.s }, p2/Z, [x22, x28, LSL #2]\n"
+ "ld1w { z29.s }, p2/Z, [x21, x28, LSL #2]\n"
+ "ld1w { z28.s }, p2/Z, [x20, x28, LSL #2]\n"
+ "ld1w { z27.s }, p1/Z, [x23, x27, LSL #2]\n"
+ "ld1w { z21.s }, p1/Z, [x22, x27, LSL #2]\n"
+ "ld1w { z26.s }, p1/Z, [x21, x27, LSL #2]\n"
+ "ld1w { z17.s }, p1/Z, [x20, x27, LSL #2]\n"
+ "ld1w { z25.s }, p0/Z, [x23, x26, LSL #2]\n"
+ "ld1w { z20.s }, p0/Z, [x22, x26, LSL #2]\n"
+ "ld1w { z24.s }, p0/Z, [x21, x26, LSL #2]\n"
+ "ld1w { z16.s }, p0/Z, [x20, x26, LSL #2]\n"
"beq 3f\n"
"2:" // 4-vectors of channels: 4 inputs loop
- "fadd z23.s, z3.s, z2.s\n"
- "ldp x23, x22, [x19, #0x0]\n"
- "subs x24, x24, #0x1\n"
- "fadd z19.s, z1.s, z0.s\n"
- "ldp x21, x20, [x19, #0x10]\n"
- "add x19, x19, #0x20\n"
- "fadd z22.s, z31.s, z30.s\n"
- "ld1w { z3.s }, p3/Z, [x23, x28, LSL #2]\n"
+ "fadd z23.s, z2.s, z1.s\n"
+ "fadd z19.s, z0.s, z31.s\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "fadd z22.s, z30.s, z22.s\n"
"fadd z18.s, z29.s, z28.s\n"
+ "subs x25, x25, #0x1\n"
+ "add x24, x24, #0x20\n"
"fadd z21.s, z27.s, z21.s\n"
- "ld1w { z2.s }, p3/Z, [x22, x28, LSL #2]\n"
"fadd z17.s, z26.s, z17.s\n"
- "ld1w { z1.s }, p3/Z, [x21, x28, LSL #2]\n"
+ "ld1w { z2.s }, p3/Z, [x23, x9, LSL #2]\n"
+ "ld1w { z1.s }, p3/Z, [x22, x9, LSL #2]\n"
"fadd z20.s, z25.s, z20.s\n"
- "ld1w { z0.s }, p3/Z, [x20, x28, LSL #2]\n"
"fadd z16.s, z24.s, z16.s\n"
- "ld1w { z31.s }, p2/Z, [x23, x27, LSL #2]\n"
+ "ld1w { z0.s }, p3/Z, [x21, x9, LSL #2]\n"
+ "ld1w { z31.s }, p3/Z, [x20, x9, LSL #2]\n"
"fadd z19.s, z23.s, z19.s\n"
- "ld1w { z30.s }, p2/Z, [x22, x27, LSL #2]\n"
"fadd z18.s, z22.s, z18.s\n"
- "ld1w { z29.s }, p2/Z, [x21, x27, LSL #2]\n"
+ "ld1w { z30.s }, p2/Z, [x23, x28, LSL #2]\n"
+ "ld1w { z22.s }, p2/Z, [x22, x28, LSL #2]\n"
"fadd z17.s, z21.s, z17.s\n"
- "ld1w { z28.s }, p2/Z, [x20, x27, LSL #2]\n"
"fadd z16.s, z20.s, z16.s\n"
- "ld1w { z27.s }, p1/Z, [x23, x26, LSL #2]\n"
- "fadd z7.s, z7.s, z19.s\n"
- "ld1w { z21.s }, p1/Z, [x22, x26, LSL #2]\n"
- "fadd z6.s, z6.s, z18.s\n"
- "ld1w { z26.s }, p1/Z, [x21, x26, LSL #2]\n"
- "fadd z5.s, z5.s, z17.s\n"
- "ld1w { z17.s }, p1/Z, [x20, x26, LSL #2]\n"
- "fadd z4.s, z4.s, z16.s\n"
- "ld1w { z25.s }, p0/Z, [x23, x25, LSL #2]\n"
- "ld1w { z20.s }, p0/Z, [x22, x25, LSL #2]\n"
- "ld1w { z24.s }, p0/Z, [x21, x25, LSL #2]\n"
- "ld1w { z16.s }, p0/Z, [x20, x25, LSL #2]\n"
+ "ld1w { z29.s }, p2/Z, [x21, x28, LSL #2]\n"
+ "ld1w { z28.s }, p2/Z, [x20, x28, LSL #2]\n"
+ "fadd z6.s, z6.s, z19.s\n"
+ "fadd z5.s, z5.s, z18.s\n"
+ "ld1w { z27.s }, p1/Z, [x23, x27, LSL #2]\n"
+ "ld1w { z21.s }, p1/Z, [x22, x27, LSL #2]\n"
+ "fadd z4.s, z4.s, z17.s\n"
+ "fadd z3.s, z3.s, z16.s\n"
+ "ld1w { z26.s }, p1/Z, [x21, x27, LSL #2]\n"
+ "ld1w { z17.s }, p1/Z, [x20, x27, LSL #2]\n"
+ "ld1w { z25.s }, p0/Z, [x23, x26, LSL #2]\n"
+ "ld1w { z20.s }, p0/Z, [x22, x26, LSL #2]\n"
+ "ld1w { z24.s }, p0/Z, [x21, x26, LSL #2]\n"
+ "ld1w { z16.s }, p0/Z, [x20, x26, LSL #2]\n"
"bgt 2b\n"
"3:" // 4-vectors of channels: 4 inputs tail
- "fadd z23.s, z3.s, z2.s\n"
- "fadd z19.s, z1.s, z0.s\n"
- "fadd z22.s, z31.s, z30.s\n"
+ "fadd z23.s, z2.s, z1.s\n"
+ "fadd z19.s, z0.s, z31.s\n"
+ "fadd z22.s, z30.s, z22.s\n"
"fadd z18.s, z29.s, z28.s\n"
"fadd z21.s, z27.s, z21.s\n"
"fadd z17.s, z26.s, z17.s\n"
@@ -132,100 +133,99 @@ void sve_fp32_nhwc_avg_generic_depthfirst_impl(
"fadd z18.s, z22.s, z18.s\n"
"fadd z17.s, z21.s, z17.s\n"
"fadd z16.s, z20.s, z16.s\n"
- "fadd z7.s, z7.s, z19.s\n"
- "fadd z6.s, z6.s, z18.s\n"
- "fadd z5.s, z5.s, z17.s\n"
- "fadd z4.s, z4.s, z16.s\n"
+ "fadd z6.s, z6.s, z19.s\n"
+ "fadd z5.s, z5.s, z18.s\n"
+ "fadd z4.s, z4.s, z17.s\n"
+ "fadd z3.s, z3.s, z16.s\n"
"4:" // 4-vectors of channels: After loop
- "ands x20, %x[n_valid_cells], #0x3\n"
+ "ands x21, %x[n_valid_cells], #0x3\n"
"beq 6f\n"
"5:" // 4-vectors of channels: Single input loop
- "ldr x23, [x19], #0x8\n"
- "subs x20, x20, #0x1\n"
- "ld1w { z3.s }, p3/Z, [x23, x28, LSL #2]\n"
- "fadd z7.s, z7.s, z3.s\n"
- "ld1w { z31.s }, p2/Z, [x23, x27, LSL #2]\n"
- "ld1w { z27.s }, p1/Z, [x23, x26, LSL #2]\n"
- "fadd z6.s, z6.s, z31.s\n"
- "ld1w { z25.s }, p0/Z, [x23, x25, LSL #2]\n"
- "fadd z5.s, z5.s, z27.s\n"
- "fadd z4.s, z4.s, z25.s\n"
+ "ldr x20, [x24], #0x8\n"
+ "ld1w { z16.s }, p3/Z, [x20, x9, LSL #2]\n"
+ "subs x21, x21, #0x1\n"
+ "fadd z6.s, z6.s, z16.s\n"
+ "ld1w { z17.s }, p2/Z, [x20, x28, LSL #2]\n"
+ "ld1w { z16.s }, p1/Z, [x20, x27, LSL #2]\n"
+ "fadd z5.s, z5.s, z17.s\n"
+ "fadd z4.s, z4.s, z16.s\n"
+ "ld1w { z16.s }, p0/Z, [x20, x26, LSL #2]\n"
+ "fadd z3.s, z3.s, z16.s\n"
"bgt 5b\n"
"6:" // 4-vectors of channels: Single input loop: End
- "fmul z7.s, z7.s, z8.s\n"
- "st1w { z7.s }, p3, [%x[outptr], x28, LSL #2]\n"
- "fmul z6.s, z6.s, z8.s\n"
+ "fmul z6.s, z6.s, z7.s\n"
+ "fmul z5.s, z5.s, z7.s\n"
+ "st1w { z6.s }, p3, [%x[outptr], x9, LSL #2]\n"
+ "fmul z4.s, z4.s, z7.s\n"
+ "fmul z3.s, z3.s, z7.s\n"
+ "st1w { z5.s }, p2, [%x[outptr], x28, LSL #2]\n"
+ "st1w { z4.s }, p1, [%x[outptr], x27, LSL #2]\n"
+ "incw x9, ALL, MUL #4\n"
"incw x28, ALL, MUL #4\n"
- "fmul z5.s, z5.s, z8.s\n"
- "st1w { z6.s }, p2, [%x[outptr], x27, LSL #2]\n"
- "fmul z4.s, z4.s, z8.s\n"
- "incw x27, ALL, MUL #4\n"
- "st1w { z5.s }, p1, [%x[outptr], x26, LSL #2]\n"
+ "st1w { z3.s }, p0, [%x[outptr], x26, LSL #2]\n"
"incw x26, ALL, MUL #4\n"
- "st1w { z4.s }, p0, [%x[outptr], x25, LSL #2]\n"
- "incw x25, ALL, MUL #4\n"
- "whilelt p0.s, x25, %x[n_channels]\n"
+ "whilelt p0.s, x26, %x[n_channels]\n"
+ "incw x27, ALL, MUL #4\n"
"b.any 1b\n"
"7:" // Single vector of channels
- "whilelt p3.s, x28, %x[n_channels]\n"
+ "whilelt p3.s, x9, %x[n_channels]\n"
"b.none 14f\n"
"8:" // Single vector of channels: Loop
- "mov z7.b, #0x0\n"
- "mov x19, %x[inptrs]\n"
- "lsr x24, %x[n_valid_cells], #0x2\n"
- "cbz x24, 11f\n"
- "ldp x23, x22, [x19, #0x0]\n"
- "ldp x21, x20, [x19, #0x10]\n"
- "add x19, x19, #0x20\n"
- "subs x24, x24, #0x1\n"
- "ld1w { z3.s }, p3/Z, [x23, x28, LSL #2]\n"
- "ld1w { z2.s }, p3/Z, [x22, x28, LSL #2]\n"
- "ld1w { z1.s }, p3/Z, [x21, x28, LSL #2]\n"
- "ld1w { z0.s }, p3/Z, [x20, x28, LSL #2]\n"
+ "lsr x25, %x[n_valid_cells], #0x2\n"
+ "mov z6.b, #0x0\n"
+ "mov x24, %x[inptrs]\n"
+ "cbz x25, 11f\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "subs x25, x25, #0x1\n"
+ "add x24, x24, #0x20\n"
+ "ld1w { z2.s }, p3/Z, [x23, x9, LSL #2]\n"
+ "ld1w { z1.s }, p3/Z, [x22, x9, LSL #2]\n"
+ "ld1w { z0.s }, p3/Z, [x21, x9, LSL #2]\n"
+ "ld1w { z31.s }, p3/Z, [x20, x9, LSL #2]\n"
"beq 10f\n"
"9:" // Single vector of channels: Loop: 4 inputs loop
- "fadd z23.s, z3.s, z2.s\n"
- "ldp x23, x22, [x19, #0x0]\n"
- "subs x24, x24, #0x1\n"
- "fadd z19.s, z1.s, z0.s\n"
- "ldp x21, x20, [x19, #0x10]\n"
- "add x19, x19, #0x20\n"
- "fadd z19.s, z23.s, z19.s\n"
- "ld1w { z3.s }, p3/Z, [x23, x28, LSL #2]\n"
- "ld1w { z2.s }, p3/Z, [x22, x28, LSL #2]\n"
- "fadd z7.s, z7.s, z19.s\n"
- "ld1w { z1.s }, p3/Z, [x21, x28, LSL #2]\n"
- "ld1w { z0.s }, p3/Z, [x20, x28, LSL #2]\n"
+ "fadd z17.s, z2.s, z1.s\n"
+ "fadd z16.s, z0.s, z31.s\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "fadd z16.s, z17.s, z16.s\n"
+ "subs x25, x25, #0x1\n"
+ "fadd z6.s, z6.s, z16.s\n"
+ "add x24, x24, #0x20\n"
+ "ld1w { z2.s }, p3/Z, [x23, x9, LSL #2]\n"
+ "ld1w { z1.s }, p3/Z, [x22, x9, LSL #2]\n"
+ "ld1w { z0.s }, p3/Z, [x21, x9, LSL #2]\n"
+ "ld1w { z31.s }, p3/Z, [x20, x9, LSL #2]\n"
"bgt 9b\n"
"10:" // Single vector of channels: Loop: 4 inputs tail
- "fadd z23.s, z3.s, z2.s\n"
- "fadd z19.s, z1.s, z0.s\n"
- "fadd z19.s, z23.s, z19.s\n"
- "fadd z7.s, z7.s, z19.s\n"
+ "fadd z17.s, z2.s, z1.s\n"
+ "fadd z16.s, z0.s, z31.s\n"
+ "fadd z16.s, z17.s, z16.s\n"
+ "fadd z6.s, z6.s, z16.s\n"
"11:" // Single vector of channels: Loop: After loop
- "ands x20, %x[n_valid_cells], #0x3\n"
+ "ands x21, %x[n_valid_cells], #0x3\n"
"beq 13f\n"
"12:" // Single vector of channels: Loop: Single input loop
- "ldr x23, [x19], #0x8\n"
- "subs x20, x20, #0x1\n"
- "ld1w { z3.s }, p3/Z, [x23, x28, LSL #2]\n"
- "fadd z7.s, z7.s, z3.s\n"
+ "ldr x20, [x24], #0x8\n"
+ "ld1w { z16.s }, p3/Z, [x20, x9, LSL #2]\n"
+ "subs x21, x21, #0x1\n"
+ "fadd z6.s, z6.s, z16.s\n"
"bgt 12b\n"
"13:" // Single vector of channels: Loop: Single input loop: End
- "fmul z7.s, z7.s, z8.s\n"
- "st1w { z7.s }, p3, [%x[outptr], x28, LSL #2]\n"
- "incw x28\n"
- "whilelt p3.s, x28, %x[n_channels]\n"
+ "fmul z6.s, z6.s, z7.s\n"
+ "st1w { z6.s }, p3, [%x[outptr], x9, LSL #2]\n"
+ "incw x9\n"
+ "whilelt p3.s, x9, %x[n_channels]\n"
"b.any 8b\n"
"14:" // End
-
:
: [inptrs] "r" (inptrs), [n_channels] "r" (n_channels), [n_valid_cells] "r" (n_valid_cells), [outptr] "r" (outptr), [rescale_ptr] "r" (&rescale_value)
- : "cc", "memory", "p0", "p1", "p2", "p3", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "x9", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
} // namespace pooling
} // namespace arm_conv
-#endif // defined(__ARM_FEATURE_SVE)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp32_nhwc_max_2x2_s1_output2x2_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp32_nhwc_max_2x2_s1_output2x2_depthfirst.hpp
index 086f49e957..b97e3623c4 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp32_nhwc_max_2x2_s1_output2x2_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp32_nhwc_max_2x2_s1_output2x2_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,37 +24,28 @@
#pragma once
-#if defined(__ARM_FEATURE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
namespace arm_conv {
namespace pooling {
void sve_fp32_nhwc_max_2x2_s1_output2x2_depthfirst_impl(unsigned int, const float *const *const, float *const *const, bool, unsigned int, unsigned int, unsigned int, unsigned int);
-struct sve_fp32_nhwc_max_2x2_s1_output2x2_depthfirst
+struct sve_fp32_nhwc_max_2x2_s1_output2x2_depthfirst : public DepthfirstStrategy<float, float>
{
- typedef float operand_type;
- typedef float return_type;
+ using Parent = DepthfirstStrategy<float, float>;
- typedef void (*kern_type)(unsigned int, const float *const *const, float *const *const, bool, unsigned int, unsigned int, unsigned int, unsigned int);
+ const static auto pooling_type = PoolingType::MAX;
+ const static auto pool_rows = 2u, pool_cols = 2u;
+ const static auto stride_rows = 1u, stride_cols = 1u;
- constexpr static PoolingType pooling_type(void) { return PoolingType::MAX; }
+ sve_fp32_nhwc_max_2x2_s1_output2x2_depthfirst(const CPUInfo *)
+ : Parent(pool_rows, pool_cols, stride_rows, stride_cols, 2, 2) {}
- constexpr static unsigned int pool_rows(void) { return 2; }
- constexpr static unsigned int pool_cols(void) { return 2; }
-
- constexpr static unsigned int stride_rows(void) { return 1; }
- constexpr static unsigned int stride_cols(void) { return 1; }
-
- constexpr static unsigned int out_rows(void) { return 2; }
- constexpr static unsigned int out_cols(void) { return 2; }
-
- kern_type kernel = sve_fp32_nhwc_max_2x2_s1_output2x2_depthfirst_impl;
-
- sve_fp32_nhwc_max_2x2_s1_output2x2_depthfirst(const CPUInfo *) {}
+ Parent::KernelType get_kernel(void) const { return sve_fp32_nhwc_max_2x2_s1_output2x2_depthfirst_impl; }
};
} // namespace pooling
} // namespace arm_conv
-#endif // defined(__ARM_FEATURE_SVE)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp32_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp32_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp
index 250cc24226..d9cebd1363 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp32_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp32_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -26,7 +26,7 @@
#include <cstddef>
#include <cstdint>
-#if defined(__ARM_FEATURE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
namespace arm_conv {
namespace pooling {
@@ -63,84 +63,84 @@ void sve_fp32_nhwc_max_2x2_s1_output2x2_depthfirst_impl(
pad_left, pad_top, pad_right, pad_bottom);
__asm__ __volatile__(
- "ldr x14, [%x[args], %[offsetof_n_channels]]\n"
+ "ldr x15, [%x[args], %[offsetof_n_channels]]\n"
+ "ldr x21, [%x[args], %[offsetof_outptrs]]\n"
+ "mov x14, #0x0\n"
+ "whilelt p0.s, x14, x15\n"
+ "ldr x20, [%x[args], %[offsetof_inptrs]]\n"
+ "ldp x13, x12, [x21, #0x0]\n"
"ptrue p2.b\n"
- "ldr x20, [%x[args], %[offsetof_outptrs]]\n"
- "mov x13, #0x0\n"
- "ldr x19, [%x[args], %[offsetof_inptrs]]\n"
- "mov x12, #0x0\n"
- "ldp x11, x10, [x20, #0x0]\n"
- "whilelt p1.s, x13, x14\n"
- "ldp x9, x28, [x20, #0x10]\n"
- "ldp x27, x26, [x19, #0x0]\n"
- "ldp x25, x24, [x19, #0x10]\n"
- "ldp x23, x22, [x19, #0x20]\n"
- "ldp x21, x20, [x19, #0x30]\n"
- "ldr x19, [x19, #0x40]\n"
- "ld1w { z31.s }, p1/Z, [x26, x13, LSL #2]\n"
- "ld1w { z30.s }, p1/Z, [x23, x13, LSL #2]\n"
- "ld1w { z29.s }, p1/Z, [x20, x13, LSL #2]\n"
- "ld1w { z28.s }, p1/Z, [x24, x13, LSL #2]\n"
- "ld1w { z27.s }, p1/Z, [x27, x13, LSL #2]\n"
- "ld1w { z26.s }, p1/Z, [x22, x13, LSL #2]\n"
- "ld1w { z25.s }, p1/Z, [x25, x13, LSL #2]\n"
- "ld1w { z24.s }, p1/Z, [x21, x13, LSL #2]\n"
- "ld1w { z23.s }, p1/Z, [x19, x13, LSL #2]\n"
- "incw x13\n"
- "whilelt p1.s, x13, x14\n"
+ "mov x11, #0x0\n"
+ "ldp x10, x9, [x21, #0x10]\n"
+ "ldp x28, x27, [x20, #0x0]\n"
+ "ldp x26, x25, [x20, #0x10]\n"
+ "ldp x24, x23, [x20, #0x20]\n"
+ "ldp x22, x21, [x20, #0x30]\n"
+ "ldr x20, [x20, #0x40]\n"
+ "ld1w { z31.s }, p0/Z, [x27, x14, LSL #2]\n"
+ "ld1w { z30.s }, p0/Z, [x24, x14, LSL #2]\n"
+ "ld1w { z29.s }, p0/Z, [x21, x14, LSL #2]\n"
+ "ld1w { z28.s }, p0/Z, [x25, x14, LSL #2]\n"
+ "ld1w { z27.s }, p0/Z, [x28, x14, LSL #2]\n"
+ "ld1w { z26.s }, p0/Z, [x26, x14, LSL #2]\n"
+ "ld1w { z25.s }, p0/Z, [x23, x14, LSL #2]\n"
+ "ld1w { z24.s }, p0/Z, [x22, x14, LSL #2]\n"
+ "ld1w { z23.s }, p0/Z, [x20, x14, LSL #2]\n"
+ "incw x14\n"
+ "whilelt p1.s, x14, x15\n"
"b.none 2f\n"
"1:" // Vector: Loop
"movprfx z22, z31\n fmax z22.s, p2/M, z22.s, z30.s\n"
- "ld1w { z31.s }, p1/Z, [x26, x13, LSL #2]\n"
- "whilelt p0.s, x12, x14\n"
"movprfx z21, z30\n fmax z21.s, p2/M, z21.s, z29.s\n"
- "ld1w { z30.s }, p1/Z, [x23, x13, LSL #2]\n"
- "movprfx z18, z28\n fmax z18.s, p2/M, z18.s, z27.s\n"
- "ld1w { z29.s }, p1/Z, [x20, x13, LSL #2]\n"
- "movprfx z17, z26\n fmax z17.s, p2/M, z17.s, z25.s\n"
- "ld1w { z27.s }, p1/Z, [x27, x13, LSL #2]\n"
- "movprfx z16, z24\n fmax z16.s, p2/M, z16.s, z28.s\n"
- "ld1w { z28.s }, p1/Z, [x24, x13, LSL #2]\n"
- "movprfx z20, z26\n fmax z20.s, p2/M, z20.s, z23.s\n"
- "ld1w { z26.s }, p1/Z, [x22, x13, LSL #2]\n"
- "movprfx z19, z22\n fmax z19.s, p2/M, z19.s, z18.s\n"
- "ld1w { z25.s }, p1/Z, [x25, x13, LSL #2]\n"
- "movprfx z18, z22\n fmax z18.s, p2/M, z18.s, z17.s\n"
- "ld1w { z24.s }, p1/Z, [x21, x13, LSL #2]\n"
- "movprfx z17, z21\n fmax z17.s, p2/M, z17.s, z16.s\n"
- "ld1w { z23.s }, p1/Z, [x19, x13, LSL #2]\n"
- "incw x13\n"
- "movprfx z16, z21\n fmax z16.s, p2/M, z16.s, z20.s\n"
- "st1w { z19.s }, p0, [x11, x12, LSL #2]\n"
- "whilelt p1.s, x13, x14\n"
- "st1w { z18.s }, p0, [x10, x12, LSL #2]\n"
- "st1w { z17.s }, p0, [x9, x12, LSL #2]\n"
- "st1w { z16.s }, p0, [x28, x12, LSL #2]\n"
- "incw x12\n"
+ "ld1w { z31.s }, p1/Z, [x27, x14, LSL #2]\n"
+ "ld1w { z30.s }, p1/Z, [x24, x14, LSL #2]\n"
+ "movprfx z20, z28\n fmax z20.s, p2/M, z20.s, z27.s\n"
+ "movprfx z19, z26\n fmax z19.s, p2/M, z19.s, z25.s\n"
+ "ld1w { z29.s }, p1/Z, [x21, x14, LSL #2]\n"
+ "ld1w { z27.s }, p1/Z, [x28, x14, LSL #2]\n"
+ "movprfx z17, z28\n fmax z17.s, p2/M, z17.s, z24.s\n"
+ "movprfx z18, z25\n fmax z18.s, p2/M, z18.s, z23.s\n"
+ "ld1w { z28.s }, p1/Z, [x25, x14, LSL #2]\n"
+ "ld1w { z26.s }, p1/Z, [x26, x14, LSL #2]\n"
+ "ld1w { z25.s }, p1/Z, [x23, x14, LSL #2]\n"
+ "ld1w { z24.s }, p1/Z, [x22, x14, LSL #2]\n"
+ "whilelt p0.s, x11, x15\n"
+ "movprfx z16, z22\n fmax z16.s, p2/M, z16.s, z20.s\n"
+ "ld1w { z23.s }, p1/Z, [x20, x14, LSL #2]\n"
+ "incw x14\n"
+ "whilelt p1.s, x14, x15\n"
+ "st1w { z16.s }, p0, [x13, x11, LSL #2]\n"
+ "movprfx z16, z19\n fmax z16.s, p2/M, z16.s, z22.s\n"
+ "fmax z17.s, p2/M, z17.s, z21.s\n"
+ "st1w { z16.s }, p0, [x12, x11, LSL #2]\n"
+ "movprfx z16, z21\n fmax z16.s, p2/M, z16.s, z18.s\n"
+ "st1w { z17.s }, p0, [x10, x11, LSL #2]\n"
+ "st1w { z16.s }, p0, [x9, x11, LSL #2]\n"
+ "incw x11\n"
"b.any 1b\n"
"2:" // Vector: Tail
"movprfx z22, z31\n fmax z22.s, p2/M, z22.s, z30.s\n"
- "whilelt p0.s, x12, x14\n"
"movprfx z21, z30\n fmax z21.s, p2/M, z21.s, z29.s\n"
- "movprfx z18, z28\n fmax z18.s, p2/M, z18.s, z27.s\n"
- "movprfx z17, z26\n fmax z17.s, p2/M, z17.s, z25.s\n"
- "movprfx z16, z24\n fmax z16.s, p2/M, z16.s, z28.s\n"
- "movprfx z20, z26\n fmax z20.s, p2/M, z20.s, z23.s\n"
- "movprfx z19, z22\n fmax z19.s, p2/M, z19.s, z18.s\n"
- "st1w { z19.s }, p0, [x11, x12, LSL #2]\n"
- "movprfx z18, z22\n fmax z18.s, p2/M, z18.s, z17.s\n"
- "movprfx z17, z21\n fmax z17.s, p2/M, z17.s, z16.s\n"
- "st1w { z18.s }, p0, [x10, x12, LSL #2]\n"
- "movprfx z16, z21\n fmax z16.s, p2/M, z16.s, z20.s\n"
- "st1w { z17.s }, p0, [x9, x12, LSL #2]\n"
- "st1w { z16.s }, p0, [x28, x12, LSL #2]\n"
+ "movprfx z20, z28\n fmax z20.s, p2/M, z20.s, z27.s\n"
+ "movprfx z19, z26\n fmax z19.s, p2/M, z19.s, z25.s\n"
+ "movprfx z17, z28\n fmax z17.s, p2/M, z17.s, z24.s\n"
+ "movprfx z18, z25\n fmax z18.s, p2/M, z18.s, z23.s\n"
+ "whilelt p0.s, x11, x15\n"
+ "movprfx z16, z22\n fmax z16.s, p2/M, z16.s, z20.s\n"
+ "st1w { z16.s }, p0, [x13, x11, LSL #2]\n"
+ "movprfx z16, z19\n fmax z16.s, p2/M, z16.s, z22.s\n"
+ "fmax z17.s, p2/M, z17.s, z21.s\n"
+ "st1w { z16.s }, p0, [x12, x11, LSL #2]\n"
+ "movprfx z16, z21\n fmax z16.s, p2/M, z16.s, z18.s\n"
+ "st1w { z17.s }, p0, [x10, x11, LSL #2]\n"
+ "st1w { z16.s }, p0, [x9, x11, LSL #2]\n"
:
: [args] "r" (&args), [offsetof_inptrs] "I" (offsetof(KernelArgs, inptrs)), [offsetof_n_channels] "I" (offsetof(KernelArgs, n_channels)), [offsetof_outptrs] "I" (offsetof(KernelArgs, outptrs))
- : "cc", "memory", "p0", "p1", "p2", "x9", "x10", "x11", "x12", "x13", "x14", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
} // namespace pooling
} // namespace arm_conv
-#endif // defined(__ARM_FEATURE_SVE)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp32_nhwc_max_generic_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp32_nhwc_max_generic_depthfirst.hpp
index 17e3e5f0ba..5f6535072b 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp32_nhwc_max_generic_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp32_nhwc_max_generic_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -26,29 +26,21 @@
#pragma once
-#if defined(__ARM_FEATURE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
namespace arm_conv {
namespace pooling {
void sve_fp32_nhwc_max_generic_depthfirst_impl(const uint64_t, const uint64_t n_valid_cells, uint64_t n_channels, const float *const *const inptrs, float *outptr);
-struct sve_fp32_nhwc_max_generic_depthfirst
+struct sve_fp32_nhwc_max_generic_depthfirst : IGenericDepthfirstStrategy<float, float>
{
- typedef float operand_type;
- typedef float return_type;
-
- typedef void (*kern_type)(const uint64_t, const uint64_t n_valid_cells, uint64_t n_channels, const float *const *const inptrs, float *outptr);
-
- constexpr static PoolingType pooling_type(void) { return PoolingType::MAX; }
-
-
- kern_type kernel = sve_fp32_nhwc_max_generic_depthfirst_impl;
-
+ using Parent = IGenericDepthfirstStrategy<float, float>;
sve_fp32_nhwc_max_generic_depthfirst(const CPUInfo *) {}
+ typename Parent::KernelType get_kernel(void) const override { return sve_fp32_nhwc_max_generic_depthfirst_impl; }
};
} // namespace pooling
} // namespace arm_conv
-#endif // defined(__ARM_FEATURE_SVE)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp32_nhwc_max_generic_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp32_nhwc_max_generic_depthfirst/generic.cpp
index 8166379ce4..87fc75adda 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp32_nhwc_max_generic_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp32_nhwc_max_generic_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,8 +23,9 @@
*/
#include <cstdint>
+#include <cstddef>
-#if defined(__ARM_FEATURE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
namespace arm_conv {
namespace pooling {
@@ -39,185 +40,184 @@ void sve_fp32_nhwc_max_generic_depthfirst_impl(
)
{
__asm__ __volatile__(
- "ptrue p4.b\n"
- "mov x28, #0x0\n"
- "cntw x27\n"
- "cntw x26, ALL, MUL #2\n"
- "cntw x25, ALL, MUL #3\n"
+ "mov x9, #0x0\n"
+ "cntw x28\n"
+ "cntw x27, ALL, MUL #2\n"
+ "cntw x26, ALL, MUL #3\n"
+ "whilelt p4.s, x9, %x[n_channels]\n"
"whilelt p3.s, x28, %x[n_channels]\n"
"whilelt p2.s, x27, %x[n_channels]\n"
"whilelt p1.s, x26, %x[n_channels]\n"
- "whilelt p0.s, x25, %x[n_channels]\n"
+ "ptrue p0.b\n"
"b.none 7f\n"
"1:" // 4-vectors of channels
+ "lsr x25, %x[n_valid_cells], #0x2\n"
+ "mov z8.s, #0xff800000\n"
"mov z7.s, #0xff800000\n"
- "mov x19, %x[inptrs]\n"
+ "mov x24, %x[inptrs]\n"
"mov z6.s, #0xff800000\n"
- "lsr x24, %x[n_valid_cells], #0x2\n"
"mov z5.s, #0xff800000\n"
- "mov z4.s, #0xff800000\n"
- "cbz x24, 4f\n"
- "ldp x23, x22, [x19, #0x0]\n"
- "ldp x21, x20, [x19, #0x10]\n"
- "add x19, x19, #0x20\n"
- "subs x24, x24, #0x1\n"
- "ld1w { z3.s }, p3/Z, [x23, x28, LSL #2]\n"
- "ld1w { z2.s }, p3/Z, [x22, x28, LSL #2]\n"
- "ld1w { z1.s }, p3/Z, [x21, x28, LSL #2]\n"
- "ld1w { z0.s }, p3/Z, [x20, x28, LSL #2]\n"
- "ld1w { z31.s }, p2/Z, [x23, x27, LSL #2]\n"
- "ld1w { z30.s }, p2/Z, [x22, x27, LSL #2]\n"
- "ld1w { z22.s }, p2/Z, [x21, x27, LSL #2]\n"
- "ld1w { z29.s }, p2/Z, [x20, x27, LSL #2]\n"
- "ld1w { z28.s }, p1/Z, [x23, x26, LSL #2]\n"
- "ld1w { z27.s }, p1/Z, [x22, x26, LSL #2]\n"
- "ld1w { z21.s }, p1/Z, [x21, x26, LSL #2]\n"
- "ld1w { z26.s }, p1/Z, [x20, x26, LSL #2]\n"
- "ld1w { z16.s }, p0/Z, [x23, x25, LSL #2]\n"
- "ld1w { z25.s }, p0/Z, [x22, x25, LSL #2]\n"
- "ld1w { z20.s }, p0/Z, [x21, x25, LSL #2]\n"
- "ld1w { z24.s }, p0/Z, [x20, x25, LSL #2]\n"
+ "cbz x25, 4f\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "subs x25, x25, #0x1\n"
+ "add x24, x24, #0x20\n"
+ "ld1w { z4.s }, p4/Z, [x23, x9, LSL #2]\n"
+ "ld1w { z3.s }, p4/Z, [x22, x9, LSL #2]\n"
+ "ld1w { z2.s }, p4/Z, [x21, x9, LSL #2]\n"
+ "ld1w { z1.s }, p4/Z, [x20, x9, LSL #2]\n"
+ "ld1w { z0.s }, p3/Z, [x23, x28, LSL #2]\n"
+ "ld1w { z31.s }, p3/Z, [x22, x28, LSL #2]\n"
+ "ld1w { z22.s }, p3/Z, [x21, x28, LSL #2]\n"
+ "ld1w { z30.s }, p3/Z, [x20, x28, LSL #2]\n"
+ "ld1w { z29.s }, p2/Z, [x23, x27, LSL #2]\n"
+ "ld1w { z28.s }, p2/Z, [x22, x27, LSL #2]\n"
+ "ld1w { z21.s }, p2/Z, [x21, x27, LSL #2]\n"
+ "ld1w { z27.s }, p2/Z, [x20, x27, LSL #2]\n"
+ "ld1w { z26.s }, p1/Z, [x23, x26, LSL #2]\n"
+ "ld1w { z25.s }, p1/Z, [x22, x26, LSL #2]\n"
+ "ld1w { z20.s }, p1/Z, [x21, x26, LSL #2]\n"
+ "ld1w { z24.s }, p1/Z, [x20, x26, LSL #2]\n"
"beq 3f\n"
"2:" // 4-vectors of channels: 4 inputs loop
- "movprfx z19, z3\n fmax z19.s, p4/M, z19.s, z2.s\n"
- "ldp x23, x22, [x19, #0x0]\n"
- "subs x24, x24, #0x1\n"
- "movprfx z23, z1\n fmax z23.s, p4/M, z23.s, z0.s\n"
- "ldp x21, x20, [x19, #0x10]\n"
- "add x19, x19, #0x20\n"
- "movprfx z18, z31\n fmax z18.s, p4/M, z18.s, z30.s\n"
- "ld1w { z3.s }, p3/Z, [x23, x28, LSL #2]\n"
- "fmax z22.s, p4/M, z22.s, z29.s\n"
- "movprfx z17, z28\n fmax z17.s, p4/M, z17.s, z27.s\n"
- "ld1w { z2.s }, p3/Z, [x22, x28, LSL #2]\n"
- "fmax z21.s, p4/M, z21.s, z26.s\n"
- "ld1w { z1.s }, p3/Z, [x21, x28, LSL #2]\n"
- "fmax z16.s, p4/M, z16.s, z25.s\n"
- "ld1w { z0.s }, p3/Z, [x20, x28, LSL #2]\n"
- "fmax z20.s, p4/M, z20.s, z24.s\n"
- "ld1w { z31.s }, p2/Z, [x23, x27, LSL #2]\n"
- "fmax z19.s, p4/M, z19.s, z23.s\n"
- "ld1w { z30.s }, p2/Z, [x22, x27, LSL #2]\n"
- "fmax z18.s, p4/M, z18.s, z22.s\n"
- "ld1w { z22.s }, p2/Z, [x21, x27, LSL #2]\n"
- "fmax z17.s, p4/M, z17.s, z21.s\n"
- "ld1w { z29.s }, p2/Z, [x20, x27, LSL #2]\n"
- "fmax z16.s, p4/M, z16.s, z20.s\n"
- "ld1w { z28.s }, p1/Z, [x23, x26, LSL #2]\n"
- "fmax z7.s, p4/M, z7.s, z19.s\n"
- "ld1w { z27.s }, p1/Z, [x22, x26, LSL #2]\n"
- "fmax z6.s, p4/M, z6.s, z18.s\n"
- "ld1w { z21.s }, p1/Z, [x21, x26, LSL #2]\n"
- "fmax z5.s, p4/M, z5.s, z17.s\n"
- "ld1w { z26.s }, p1/Z, [x20, x26, LSL #2]\n"
- "fmax z4.s, p4/M, z4.s, z16.s\n"
- "ld1w { z16.s }, p0/Z, [x23, x25, LSL #2]\n"
- "ld1w { z25.s }, p0/Z, [x22, x25, LSL #2]\n"
- "ld1w { z20.s }, p0/Z, [x21, x25, LSL #2]\n"
- "ld1w { z24.s }, p0/Z, [x20, x25, LSL #2]\n"
+ "movprfx z19, z4\n fmax z19.s, p0/M, z19.s, z3.s\n"
+ "movprfx z23, z2\n fmax z23.s, p0/M, z23.s, z1.s\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "movprfx z18, z0\n fmax z18.s, p0/M, z18.s, z31.s\n"
+ "fmax z22.s, p0/M, z22.s, z30.s\n"
+ "ld1w { z4.s }, p4/Z, [x23, x9, LSL #2]\n"
+ "ld1w { z3.s }, p4/Z, [x22, x9, LSL #2]\n"
+ "movprfx z17, z29\n fmax z17.s, p0/M, z17.s, z28.s\n"
+ "fmax z21.s, p0/M, z21.s, z27.s\n"
+ "ld1w { z2.s }, p4/Z, [x21, x9, LSL #2]\n"
+ "ld1w { z1.s }, p4/Z, [x20, x9, LSL #2]\n"
+ "movprfx z16, z26\n fmax z16.s, p0/M, z16.s, z25.s\n"
+ "fmax z20.s, p0/M, z20.s, z24.s\n"
+ "ld1w { z0.s }, p3/Z, [x23, x28, LSL #2]\n"
+ "ld1w { z31.s }, p3/Z, [x22, x28, LSL #2]\n"
+ "fmax z19.s, p0/M, z19.s, z23.s\n"
+ "fmax z18.s, p0/M, z18.s, z22.s\n"
+ "ld1w { z22.s }, p3/Z, [x21, x28, LSL #2]\n"
+ "ld1w { z30.s }, p3/Z, [x20, x28, LSL #2]\n"
+ "fmax z17.s, p0/M, z17.s, z21.s\n"
+ "fmax z16.s, p0/M, z16.s, z20.s\n"
+ "ld1w { z29.s }, p2/Z, [x23, x27, LSL #2]\n"
+ "ld1w { z28.s }, p2/Z, [x22, x27, LSL #2]\n"
+ "subs x25, x25, #0x1\n"
+ "fmax z8.s, p0/M, z8.s, z19.s\n"
+ "ld1w { z21.s }, p2/Z, [x21, x27, LSL #2]\n"
+ "ld1w { z27.s }, p2/Z, [x20, x27, LSL #2]\n"
+ "fmax z7.s, p0/M, z7.s, z18.s\n"
+ "fmax z6.s, p0/M, z6.s, z17.s\n"
+ "ld1w { z26.s }, p1/Z, [x23, x26, LSL #2]\n"
+ "ld1w { z25.s }, p1/Z, [x22, x26, LSL #2]\n"
+ "fmax z5.s, p0/M, z5.s, z16.s\n"
+ "add x24, x24, #0x20\n"
+ "ld1w { z20.s }, p1/Z, [x21, x26, LSL #2]\n"
+ "ld1w { z24.s }, p1/Z, [x20, x26, LSL #2]\n"
"bgt 2b\n"
"3:" // 4-vectors of channels: 4 inputs tail
- "movprfx z19, z3\n fmax z19.s, p4/M, z19.s, z2.s\n"
- "movprfx z23, z1\n fmax z23.s, p4/M, z23.s, z0.s\n"
- "movprfx z18, z31\n fmax z18.s, p4/M, z18.s, z30.s\n"
- "fmax z22.s, p4/M, z22.s, z29.s\n"
- "movprfx z17, z28\n fmax z17.s, p4/M, z17.s, z27.s\n"
- "fmax z21.s, p4/M, z21.s, z26.s\n"
- "fmax z16.s, p4/M, z16.s, z25.s\n"
- "fmax z20.s, p4/M, z20.s, z24.s\n"
- "fmax z19.s, p4/M, z19.s, z23.s\n"
- "fmax z18.s, p4/M, z18.s, z22.s\n"
- "fmax z17.s, p4/M, z17.s, z21.s\n"
- "fmax z16.s, p4/M, z16.s, z20.s\n"
- "fmax z7.s, p4/M, z7.s, z19.s\n"
- "fmax z6.s, p4/M, z6.s, z18.s\n"
- "fmax z5.s, p4/M, z5.s, z17.s\n"
- "fmax z4.s, p4/M, z4.s, z16.s\n"
+ "movprfx z19, z4\n fmax z19.s, p0/M, z19.s, z3.s\n"
+ "movprfx z23, z2\n fmax z23.s, p0/M, z23.s, z1.s\n"
+ "movprfx z18, z0\n fmax z18.s, p0/M, z18.s, z31.s\n"
+ "fmax z22.s, p0/M, z22.s, z30.s\n"
+ "movprfx z17, z29\n fmax z17.s, p0/M, z17.s, z28.s\n"
+ "fmax z21.s, p0/M, z21.s, z27.s\n"
+ "movprfx z16, z26\n fmax z16.s, p0/M, z16.s, z25.s\n"
+ "fmax z20.s, p0/M, z20.s, z24.s\n"
+ "fmax z19.s, p0/M, z19.s, z23.s\n"
+ "fmax z18.s, p0/M, z18.s, z22.s\n"
+ "fmax z17.s, p0/M, z17.s, z21.s\n"
+ "fmax z16.s, p0/M, z16.s, z20.s\n"
+ "fmax z8.s, p0/M, z8.s, z19.s\n"
+ "fmax z7.s, p0/M, z7.s, z18.s\n"
+ "fmax z6.s, p0/M, z6.s, z17.s\n"
+ "fmax z5.s, p0/M, z5.s, z16.s\n"
"4:" // 4-vectors of channels: After loop
- "ands x20, %x[n_valid_cells], #0x3\n"
+ "ands x21, %x[n_valid_cells], #0x3\n"
"beq 6f\n"
"5:" // 4-vectors of channels: Single input loop
- "ldr x23, [x19], #0x8\n"
- "subs x20, x20, #0x1\n"
- "ld1w { z3.s }, p3/Z, [x23, x28, LSL #2]\n"
- "fmax z7.s, p4/M, z7.s, z3.s\n"
- "ld1w { z31.s }, p2/Z, [x23, x27, LSL #2]\n"
- "ld1w { z28.s }, p1/Z, [x23, x26, LSL #2]\n"
- "fmax z6.s, p4/M, z6.s, z31.s\n"
- "ld1w { z16.s }, p0/Z, [x23, x25, LSL #2]\n"
- "fmax z5.s, p4/M, z5.s, z28.s\n"
- "fmax z4.s, p4/M, z4.s, z16.s\n"
+ "ldr x20, [x24], #0x8\n"
+ "ld1w { z16.s }, p4/Z, [x20, x9, LSL #2]\n"
+ "subs x21, x21, #0x1\n"
+ "fmax z8.s, p0/M, z8.s, z16.s\n"
+ "ld1w { z17.s }, p3/Z, [x20, x28, LSL #2]\n"
+ "ld1w { z16.s }, p2/Z, [x20, x27, LSL #2]\n"
+ "fmax z7.s, p0/M, z7.s, z17.s\n"
+ "fmax z6.s, p0/M, z6.s, z16.s\n"
+ "ld1w { z16.s }, p1/Z, [x20, x26, LSL #2]\n"
+ "fmax z5.s, p0/M, z5.s, z16.s\n"
"bgt 5b\n"
"6:" // 4-vectors of channels: Single input loop: End
+ "st1w { z8.s }, p4, [%x[outptr], x9, LSL #2]\n"
+ "incw x9, ALL, MUL #4\n"
"st1w { z7.s }, p3, [%x[outptr], x28, LSL #2]\n"
"incw x28, ALL, MUL #4\n"
"st1w { z6.s }, p2, [%x[outptr], x27, LSL #2]\n"
"incw x27, ALL, MUL #4\n"
"st1w { z5.s }, p1, [%x[outptr], x26, LSL #2]\n"
"incw x26, ALL, MUL #4\n"
- "st1w { z4.s }, p0, [%x[outptr], x25, LSL #2]\n"
- "incw x25, ALL, MUL #4\n"
- "whilelt p0.s, x25, %x[n_channels]\n"
+ "whilelt p1.s, x26, %x[n_channels]\n"
"b.any 1b\n"
"7:" // Single vector of channels
- "whilelt p3.s, x28, %x[n_channels]\n"
+ "whilelt p4.s, x9, %x[n_channels]\n"
"b.none 14f\n"
"8:" // Single vector of channels: Loop
- "mov z7.s, #0xff800000\n"
- "mov x19, %x[inptrs]\n"
- "lsr x24, %x[n_valid_cells], #0x2\n"
- "cbz x24, 11f\n"
- "ldp x23, x22, [x19, #0x0]\n"
- "ldp x21, x20, [x19, #0x10]\n"
- "add x19, x19, #0x20\n"
- "subs x24, x24, #0x1\n"
- "ld1w { z3.s }, p3/Z, [x23, x28, LSL #2]\n"
- "ld1w { z2.s }, p3/Z, [x22, x28, LSL #2]\n"
- "ld1w { z1.s }, p3/Z, [x21, x28, LSL #2]\n"
- "ld1w { z0.s }, p3/Z, [x20, x28, LSL #2]\n"
+ "lsr x25, %x[n_valid_cells], #0x2\n"
+ "mov z8.s, #0xff800000\n"
+ "mov x24, %x[inptrs]\n"
+ "cbz x25, 11f\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "subs x25, x25, #0x1\n"
+ "add x24, x24, #0x20\n"
+ "ld1w { z4.s }, p4/Z, [x23, x9, LSL #2]\n"
+ "ld1w { z3.s }, p4/Z, [x22, x9, LSL #2]\n"
+ "ld1w { z2.s }, p4/Z, [x21, x9, LSL #2]\n"
+ "ld1w { z1.s }, p4/Z, [x20, x9, LSL #2]\n"
"beq 10f\n"
"9:" // Single vector of channels: Loop: 4 inputs loop
- "movprfx z19, z3\n fmax z19.s, p4/M, z19.s, z2.s\n"
- "ldp x23, x22, [x19, #0x0]\n"
- "subs x24, x24, #0x1\n"
- "movprfx z23, z1\n fmax z23.s, p4/M, z23.s, z0.s\n"
- "ldp x21, x20, [x19, #0x10]\n"
- "add x19, x19, #0x20\n"
- "fmax z19.s, p4/M, z19.s, z23.s\n"
- "ld1w { z3.s }, p3/Z, [x23, x28, LSL #2]\n"
- "ld1w { z2.s }, p3/Z, [x22, x28, LSL #2]\n"
- "fmax z7.s, p4/M, z7.s, z19.s\n"
- "ld1w { z1.s }, p3/Z, [x21, x28, LSL #2]\n"
- "ld1w { z0.s }, p3/Z, [x20, x28, LSL #2]\n"
+ "movprfx z16, z4\n fmax z16.s, p0/M, z16.s, z3.s\n"
+ "movprfx z17, z2\n fmax z17.s, p0/M, z17.s, z1.s\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "fmax z16.s, p0/M, z16.s, z17.s\n"
+ "subs x25, x25, #0x1\n"
+ "ld1w { z4.s }, p4/Z, [x23, x9, LSL #2]\n"
+ "ld1w { z3.s }, p4/Z, [x22, x9, LSL #2]\n"
+ "fmax z8.s, p0/M, z8.s, z16.s\n"
+ "add x24, x24, #0x20\n"
+ "ld1w { z2.s }, p4/Z, [x21, x9, LSL #2]\n"
+ "ld1w { z1.s }, p4/Z, [x20, x9, LSL #2]\n"
"bgt 9b\n"
"10:" // Single vector of channels: Loop: 4 inputs tail
- "movprfx z19, z3\n fmax z19.s, p4/M, z19.s, z2.s\n"
- "movprfx z23, z1\n fmax z23.s, p4/M, z23.s, z0.s\n"
- "fmax z19.s, p4/M, z19.s, z23.s\n"
- "fmax z7.s, p4/M, z7.s, z19.s\n"
+ "movprfx z16, z4\n fmax z16.s, p0/M, z16.s, z3.s\n"
+ "movprfx z17, z2\n fmax z17.s, p0/M, z17.s, z1.s\n"
+ "fmax z16.s, p0/M, z16.s, z17.s\n"
+ "fmax z8.s, p0/M, z8.s, z16.s\n"
"11:" // Single vector of channels: Loop: After loop
- "ands x20, %x[n_valid_cells], #0x3\n"
+ "ands x21, %x[n_valid_cells], #0x3\n"
"beq 13f\n"
"12:" // Single vector of channels: Loop: Single input loop
- "ldr x23, [x19], #0x8\n"
- "subs x20, x20, #0x1\n"
- "ld1w { z3.s }, p3/Z, [x23, x28, LSL #2]\n"
- "fmax z7.s, p4/M, z7.s, z3.s\n"
+ "ldr x20, [x24], #0x8\n"
+ "ld1w { z16.s }, p4/Z, [x20, x9, LSL #2]\n"
+ "subs x21, x21, #0x1\n"
+ "fmax z8.s, p0/M, z8.s, z16.s\n"
"bgt 12b\n"
"13:" // Single vector of channels: Loop: Single input loop: End
- "st1w { z7.s }, p3, [%x[outptr], x28, LSL #2]\n"
- "incw x28\n"
- "whilelt p3.s, x28, %x[n_channels]\n"
+ "st1w { z8.s }, p4, [%x[outptr], x9, LSL #2]\n"
+ "incw x9\n"
+ "whilelt p4.s, x9, %x[n_channels]\n"
"b.any 8b\n"
"14:" // End
-
:
: [inptrs] "r" (inptrs), [n_channels] "r" (n_channels), [n_valid_cells] "r" (n_valid_cells), [outptr] "r" (outptr)
- : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x9", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
} // namespace pooling
} // namespace arm_conv
-#endif // defined(__ARM_FEATURE_SVE)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8_nhwc_avg_generic_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8_nhwc_avg_generic_depthfirst.hpp
index 2ae38b5b2f..dd2ff4fd2e 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8_nhwc_avg_generic_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8_nhwc_avg_generic_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -26,29 +26,21 @@
#pragma once
-#if defined(__ARM_FEATURE_SVE) && defined(SVE2)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
namespace arm_conv {
namespace pooling {
void sve_s8_nhwc_avg_generic_depthfirst_impl(const uint64_t window_cells, const uint64_t n_valid_cells, uint64_t n_channels, const int8_t *const *const inptrs, int8_t *outptr);
-struct sve_s8_nhwc_avg_generic_depthfirst
+struct sve_s8_nhwc_avg_generic_depthfirst : IGenericDepthfirstStrategy<int8_t, int8_t>
{
- typedef int8_t operand_type;
- typedef int8_t return_type;
-
- typedef void (*kern_type)(const uint64_t window_cells, const uint64_t n_valid_cells, uint64_t n_channels, const int8_t *const *const inptrs, int8_t *outptr);
-
- constexpr static PoolingType pooling_type(void) { return PoolingType::AVERAGE; }
-
-
- kern_type kernel = sve_s8_nhwc_avg_generic_depthfirst_impl;
-
+ using Parent = IGenericDepthfirstStrategy<int8_t, int8_t>;
sve_s8_nhwc_avg_generic_depthfirst(const CPUInfo *) {}
+ typename Parent::KernelType get_kernel(void) const override { return sve_s8_nhwc_avg_generic_depthfirst_impl; }
};
} // namespace pooling
} // namespace arm_conv
-#endif // defined(__ARM_FEATURE_SVE) && defined(SVE2)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8_nhwc_avg_generic_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8_nhwc_avg_generic_depthfirst/generic.cpp
index 2ea5b90561..7925905e64 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8_nhwc_avg_generic_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8_nhwc_avg_generic_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,11 +23,12 @@
*/
#include <cstdint>
+#include <cstddef>
#include <cstring>
#include <cmath>
-#if defined(__ARM_FEATURE_SVE) && defined(SVE2)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
namespace arm_conv {
namespace pooling {
@@ -84,30 +85,31 @@ void sve_s8_nhwc_avg_generic_depthfirst_impl(
f_rescale_value *= 2.0f;
}
- rescale_value = static_cast<int32_t>(round(f_rescale_value * static_cast<float>(1ll << 31)));
- if (static_cast<int64_t>(rescale_value) == (1ll << 31))
+ int64_t long_rescale_value = round(f_rescale_value * static_cast<float>(1ll << 31));
+ if (long_rescale_value == (1ll << 31))
{
shift_value++;
- rescale_value >>= 1;
+ long_rescale_value >>= 1;
}
+ rescale_value = static_cast<int32_t>(long_rescale_value);
}
__asm__ __volatile__(
- "ptrue p4.b\n"
- "mov x26, #0x0\n"
- "cntb x25\n"
- "cntb x24, ALL, MUL #2\n"
- "cntb x23, ALL, MUL #3\n"
+ "mov x27, #0x0\n"
+ "cntb x26\n"
+ "cntb x25, ALL, MUL #2\n"
+ "cntb x24, ALL, MUL #3\n"
+ "whilelt p4.b, x27, %x[n_channels]\n"
"whilelt p3.b, x26, %x[n_channels]\n"
"whilelt p2.b, x25, %x[n_channels]\n"
"whilelt p1.b, x24, %x[n_channels]\n"
- "whilelt p0.b, x23, %x[n_channels]\n"
+ "ptrue p0.b\n"
"b.none 7f\n"
"1:" // 4-vectors of channels
+ "lsr x23, %x[n_valid_cells], #0x1\n"
"mov z15.s, #0x0\n"
- "mov x19, %x[inptrs]\n"
"mov z14.s, #0x0\n"
- "lsr x22, %x[n_valid_cells], #0x1\n"
+ "mov x22, %x[inptrs]\n"
"mov z13.s, #0x0\n"
"mov z12.s, #0x0\n"
"mov z11.s, #0x0\n"
@@ -122,43 +124,43 @@ void sve_s8_nhwc_avg_generic_depthfirst_impl(
"mov z2.s, #0x0\n"
"mov z1.s, #0x0\n"
"mov z0.s, #0x0\n"
- "cbz x22, 4f\n"
- "ldp x21, x20, [x19, #0x0]\n"
- "ld1b { z31.b }, p3/Z, [x21, x26]\n"
- "add x19, x19, #0x10\n"
- "ld1b { z30.b }, p3/Z, [x20, x26]\n"
- "subs x22, x22, #0x1\n"
- "ld1b { z29.b }, p2/Z, [x21, x25]\n"
- "ld1b { z28.b }, p2/Z, [x20, x25]\n"
- "ld1b { z27.b }, p1/Z, [x21, x24]\n"
- "ld1b { z26.b }, p1/Z, [x20, x24]\n"
- "ld1b { z25.b }, p0/Z, [x21, x23]\n"
- "ld1b { z24.b }, p0/Z, [x20, x23]\n"
+ "cbz x23, 4f\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "subs x23, x23, #0x1\n"
+ "add x22, x22, #0x10\n"
+ "ld1b { z31.b }, p4/Z, [x21, x27]\n"
+ "ld1b { z30.b }, p4/Z, [x20, x27]\n"
+ "ld1b { z29.b }, p3/Z, [x21, x26]\n"
+ "ld1b { z28.b }, p3/Z, [x20, x26]\n"
+ "ld1b { z27.b }, p2/Z, [x21, x25]\n"
+ "ld1b { z26.b }, p2/Z, [x20, x25]\n"
+ "ld1b { z25.b }, p1/Z, [x21, x24]\n"
+ "ld1b { z24.b }, p1/Z, [x20, x24]\n"
"beq 3f\n"
"2:" // 4-vectors of channels: 2 inputs loop
".inst 0x455e03f7 // saddlb z23.h, z31.b, z30.b\n"
- "ldp x21, x20, [x19, #0x0]\n"
- "add x19, x19, #0x10\n"
".inst 0x455e07f6 // saddlt z22.h, z31.b, z30.b\n"
- "ld1b { z31.b }, p3/Z, [x21, x26]\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "subs x23, x23, #0x1\n"
".inst 0x455c03b5 // saddlb z21.h, z29.b, z28.b\n"
- "subs x22, x22, #0x1\n"
".inst 0x455c07b4 // saddlt z20.h, z29.b, z28.b\n"
- "ld1b { z30.b }, p3/Z, [x20, x26]\n"
+ "add x22, x22, #0x10\n"
+ "ld1b { z31.b }, p4/Z, [x21, x27]\n"
".inst 0x455a0373 // saddlb z19.h, z27.b, z26.b\n"
- "ld1b { z29.b }, p2/Z, [x21, x25]\n"
".inst 0x455a0772 // saddlt z18.h, z27.b, z26.b\n"
- "ld1b { z28.b }, p2/Z, [x20, x25]\n"
+ "ld1b { z30.b }, p4/Z, [x20, x27]\n"
+ "ld1b { z29.b }, p3/Z, [x21, x26]\n"
".inst 0x45580331 // saddlb z17.h, z25.b, z24.b\n"
- "ld1b { z27.b }, p1/Z, [x21, x24]\n"
".inst 0x45580730 // saddlt z16.h, z25.b, z24.b\n"
- "ld1b { z26.b }, p1/Z, [x20, x24]\n"
+ "ld1b { z28.b }, p3/Z, [x20, x26]\n"
+ "ld1b { z27.b }, p2/Z, [x21, x25]\n"
".inst 0x459741ef // saddwb z15.s, z15.s, z23.h\n"
- "ld1b { z25.b }, p0/Z, [x21, x23]\n"
".inst 0x459745ce // saddwt z14.s, z14.s, z23.h\n"
- "ld1b { z24.b }, p0/Z, [x20, x23]\n"
+ "ld1b { z26.b }, p2/Z, [x20, x25]\n"
+ "ld1b { z25.b }, p1/Z, [x21, x24]\n"
".inst 0x459641ad // saddwb z13.s, z13.s, z22.h\n"
".inst 0x4596458c // saddwt z12.s, z12.s, z22.h\n"
+ "ld1b { z24.b }, p1/Z, [x20, x24]\n"
".inst 0x4595416b // saddwb z11.s, z11.s, z21.h\n"
".inst 0x4595454a // saddwt z10.s, z10.s, z21.h\n"
".inst 0x45944129 // saddwb z9.s, z9.s, z20.h\n"
@@ -198,219 +200,218 @@ void sve_s8_nhwc_avg_generic_depthfirst_impl(
".inst 0x45904021 // saddwb z1.s, z1.s, z16.h\n"
".inst 0x45904400 // saddwt z0.s, z0.s, z16.h\n"
"4:" // 4-vectors of channels: After loop
- "ands x20, %x[n_valid_cells], #0x1\n"
+ "ands x21, %x[n_valid_cells], #0x1\n"
"beq 6f\n"
"5:" // 4-vectors of channels: Single input loop
- "ldr x21, [x19], #0x8\n"
- "subs x20, x20, #0x1\n"
- "ld1b { z31.b }, p3/Z, [x21, x26]\n"
- ".inst 0x4508a3f1 // sshllb z17.h, z31.b, #0x0\n"
- "ld1b { z29.b }, p2/Z, [x21, x25]\n"
- ".inst 0x4508a7f0 // sshllt z16.h, z31.b, #0x0\n"
- "ld1b { z27.b }, p1/Z, [x21, x24]\n"
- ".inst 0x459141ef // saddwb z15.s, z15.s, z17.h\n"
- "ld1b { z25.b }, p0/Z, [x21, x23]\n"
- ".inst 0x459145ce // saddwt z14.s, z14.s, z17.h\n"
- ".inst 0x459041ad // saddwb z13.s, z13.s, z16.h\n"
- ".inst 0x4590458c // saddwt z12.s, z12.s, z16.h\n"
- ".inst 0x4508a3b0 // sshllb z16.h, z29.b, #0x0\n"
- ".inst 0x4590416b // saddwb z11.s, z11.s, z16.h\n"
- ".inst 0x4590454a // saddwt z10.s, z10.s, z16.h\n"
- ".inst 0x4508a7b0 // sshllt z16.h, z29.b, #0x0\n"
- ".inst 0x45904129 // saddwb z9.s, z9.s, z16.h\n"
- ".inst 0x45904508 // saddwt z8.s, z8.s, z16.h\n"
- ".inst 0x4508a370 // sshllb z16.h, z27.b, #0x0\n"
- ".inst 0x459040e7 // saddwb z7.s, z7.s, z16.h\n"
- ".inst 0x459044c6 // saddwt z6.s, z6.s, z16.h\n"
- ".inst 0x4508a770 // sshllt z16.h, z27.b, #0x0\n"
- ".inst 0x459040a5 // saddwb z5.s, z5.s, z16.h\n"
- ".inst 0x45904484 // saddwt z4.s, z4.s, z16.h\n"
- ".inst 0x4508a330 // sshllb z16.h, z25.b, #0x0\n"
- ".inst 0x45904063 // saddwb z3.s, z3.s, z16.h\n"
- ".inst 0x45904442 // saddwt z2.s, z2.s, z16.h\n"
- ".inst 0x4508a730 // sshllt z16.h, z25.b, #0x0\n"
+ "ldr x20, [x22], #0x8\n"
+ "ld1b { z16.b }, p4/Z, [x20, x27]\n"
+ ".inst 0x4508a217 // sshllb z23.h, z16.b, #0x0\n"
+ ".inst 0x4508a616 // sshllt z22.h, z16.b, #0x0\n"
+ "ld1b { z16.b }, p3/Z, [x20, x26]\n"
+ "ld1b { z17.b }, p2/Z, [x20, x25]\n"
+ ".inst 0x4508a215 // sshllb z21.h, z16.b, #0x0\n"
+ ".inst 0x4508a614 // sshllt z20.h, z16.b, #0x0\n"
+ "ld1b { z16.b }, p1/Z, [x20, x24]\n"
+ ".inst 0x4508a233 // sshllb z19.h, z17.b, #0x0\n"
+ ".inst 0x4508a632 // sshllt z18.h, z17.b, #0x0\n"
+ "subs x21, x21, #0x1\n"
+ ".inst 0x4508a211 // sshllb z17.h, z16.b, #0x0\n"
+ ".inst 0x4508a610 // sshllt z16.h, z16.b, #0x0\n"
+ ".inst 0x459741ef // saddwb z15.s, z15.s, z23.h\n"
+ ".inst 0x459745ce // saddwt z14.s, z14.s, z23.h\n"
+ ".inst 0x459641ad // saddwb z13.s, z13.s, z22.h\n"
+ ".inst 0x4596458c // saddwt z12.s, z12.s, z22.h\n"
+ ".inst 0x4595416b // saddwb z11.s, z11.s, z21.h\n"
+ ".inst 0x4595454a // saddwt z10.s, z10.s, z21.h\n"
+ ".inst 0x45944129 // saddwb z9.s, z9.s, z20.h\n"
+ ".inst 0x45944508 // saddwt z8.s, z8.s, z20.h\n"
+ ".inst 0x459340e7 // saddwb z7.s, z7.s, z19.h\n"
+ ".inst 0x459344c6 // saddwt z6.s, z6.s, z19.h\n"
+ ".inst 0x459240a5 // saddwb z5.s, z5.s, z18.h\n"
+ ".inst 0x45924484 // saddwt z4.s, z4.s, z18.h\n"
+ ".inst 0x45914063 // saddwb z3.s, z3.s, z17.h\n"
+ ".inst 0x45914442 // saddwt z2.s, z2.s, z17.h\n"
".inst 0x45904021 // saddwb z1.s, z1.s, z16.h\n"
".inst 0x45904400 // saddwt z0.s, z0.s, z16.h\n"
"bgt 5b\n"
"6:" // 4-vectors of channels: Single input loop: End
- "mov z20.s, #0x7f\n"
- "ld1rw { z17.s }, p4/Z, [%x[rescale_ptr]]\n"
- "ld1rw { z16.s }, p4/Z, [%x[shift_ptr]]\n"
- "not z19.s, p4/M, z20.s\n"
+ "ld1rw { z17.s }, p0/Z, [%x[rescale_ptr]]\n"
+ "ld1rw { z16.s }, p0/Z, [%x[shift_ptr]]\n"
".inst 0x04b175ef // sqdmulh z15.s, z15.s, z17.s\n"
".inst 0x04b175ce // sqdmulh z14.s, z14.s, z17.s\n"
".inst 0x04b175ad // sqdmulh z13.s, z13.s, z17.s\n"
".inst 0x04b1758c // sqdmulh z12.s, z12.s, z17.s\n"
+ ".inst 0x4482820f // srshl z15.s, p0/M, z15.s, z16.s\n"
+ ".inst 0x4482820e // srshl z14.s, p0/M, z14.s, z16.s\n"
".inst 0x04b1756b // sqdmulh z11.s, z11.s, z17.s\n"
".inst 0x04b1754a // sqdmulh z10.s, z10.s, z17.s\n"
+ ".inst 0x4482820d // srshl z13.s, p0/M, z13.s, z16.s\n"
+ ".inst 0x4482820c // srshl z12.s, p0/M, z12.s, z16.s\n"
".inst 0x04b17529 // sqdmulh z9.s, z9.s, z17.s\n"
".inst 0x04b17508 // sqdmulh z8.s, z8.s, z17.s\n"
+ ".inst 0x4482820b // srshl z11.s, p0/M, z11.s, z16.s\n"
+ ".inst 0x4482820a // srshl z10.s, p0/M, z10.s, z16.s\n"
".inst 0x04b174e7 // sqdmulh z7.s, z7.s, z17.s\n"
".inst 0x04b174c6 // sqdmulh z6.s, z6.s, z17.s\n"
+ ".inst 0x44828209 // srshl z9.s, p0/M, z9.s, z16.s\n"
+ ".inst 0x44828208 // srshl z8.s, p0/M, z8.s, z16.s\n"
".inst 0x04b174a5 // sqdmulh z5.s, z5.s, z17.s\n"
".inst 0x04b17484 // sqdmulh z4.s, z4.s, z17.s\n"
+ ".inst 0x44828207 // srshl z7.s, p0/M, z7.s, z16.s\n"
+ ".inst 0x44828206 // srshl z6.s, p0/M, z6.s, z16.s\n"
".inst 0x04b17463 // sqdmulh z3.s, z3.s, z17.s\n"
".inst 0x04b17442 // sqdmulh z2.s, z2.s, z17.s\n"
+ ".inst 0x44828205 // srshl z5.s, p0/M, z5.s, z16.s\n"
+ ".inst 0x44828204 // srshl z4.s, p0/M, z4.s, z16.s\n"
".inst 0x04b17421 // sqdmulh z1.s, z1.s, z17.s\n"
".inst 0x04b17400 // sqdmulh z0.s, z0.s, z17.s\n"
- ".inst 0x4482920f // srshl z15.s, p4/M, z15.s, z16.s\n"
- ".inst 0x4482920e // srshl z14.s, p4/M, z14.s, z16.s\n"
- ".inst 0x4482920d // srshl z13.s, p4/M, z13.s, z16.s\n"
- ".inst 0x4482920c // srshl z12.s, p4/M, z12.s, z16.s\n"
- ".inst 0x4482920b // srshl z11.s, p4/M, z11.s, z16.s\n"
- ".inst 0x4482920a // srshl z10.s, p4/M, z10.s, z16.s\n"
- ".inst 0x44829209 // srshl z9.s, p4/M, z9.s, z16.s\n"
- ".inst 0x44829208 // srshl z8.s, p4/M, z8.s, z16.s\n"
- ".inst 0x44829207 // srshl z7.s, p4/M, z7.s, z16.s\n"
- ".inst 0x44829206 // srshl z6.s, p4/M, z6.s, z16.s\n"
- ".inst 0x44829205 // srshl z5.s, p4/M, z5.s, z16.s\n"
- ".inst 0x44829204 // srshl z4.s, p4/M, z4.s, z16.s\n"
- ".inst 0x44829203 // srshl z3.s, p4/M, z3.s, z16.s\n"
- ".inst 0x44829202 // srshl z2.s, p4/M, z2.s, z16.s\n"
- ".inst 0x44829201 // srshl z1.s, p4/M, z1.s, z16.s\n"
- ".inst 0x44829200 // srshl z0.s, p4/M, z0.s, z16.s\n"
- "smax z15.s, p4/M, z15.s, z19.s\n"
- "smax z14.s, p4/M, z14.s, z19.s\n"
- "smax z13.s, p4/M, z13.s, z19.s\n"
- "smax z12.s, p4/M, z12.s, z19.s\n"
- "smin z15.s, p4/M, z15.s, z20.s\n"
- "smin z14.s, p4/M, z14.s, z20.s\n"
- "smin z13.s, p4/M, z13.s, z20.s\n"
- "smin z12.s, p4/M, z12.s, z20.s\n"
- "smax z11.s, p4/M, z11.s, z19.s\n"
+ ".inst 0x44828203 // srshl z3.s, p0/M, z3.s, z16.s\n"
+ ".inst 0x44828202 // srshl z2.s, p0/M, z2.s, z16.s\n"
+ "mov z18.s, #0x7f\n"
+ ".inst 0x44828201 // srshl z1.s, p0/M, z1.s, z16.s\n"
+ ".inst 0x44828200 // srshl z0.s, p0/M, z0.s, z16.s\n"
+ "not z16.s, p0/M, z18.s\n"
+ "smax z15.s, p0/M, z15.s, z16.s\n"
+ "smax z14.s, p0/M, z14.s, z16.s\n"
+ "smax z13.s, p0/M, z13.s, z16.s\n"
+ "smax z12.s, p0/M, z12.s, z16.s\n"
+ "smax z11.s, p0/M, z11.s, z16.s\n"
+ "smax z10.s, p0/M, z10.s, z16.s\n"
+ "smax z9.s, p0/M, z9.s, z16.s\n"
+ "smax z8.s, p0/M, z8.s, z16.s\n"
+ "smax z7.s, p0/M, z7.s, z16.s\n"
+ "smax z6.s, p0/M, z6.s, z16.s\n"
+ "smax z5.s, p0/M, z5.s, z16.s\n"
+ "smax z4.s, p0/M, z4.s, z16.s\n"
+ "smax z3.s, p0/M, z3.s, z16.s\n"
+ "smax z2.s, p0/M, z2.s, z16.s\n"
+ "smax z1.s, p0/M, z1.s, z16.s\n"
+ "smax z0.s, p0/M, z0.s, z16.s\n"
+ "smin z15.s, p0/M, z15.s, z18.s\n"
+ "smin z14.s, p0/M, z14.s, z18.s\n"
+ "smin z13.s, p0/M, z13.s, z18.s\n"
"trn1 z17.h, z15.h, z14.h\n"
- "smax z10.s, p4/M, z10.s, z19.s\n"
+ "smin z12.s, p0/M, z12.s, z18.s\n"
+ "smin z11.s, p0/M, z11.s, z18.s\n"
"trn1 z16.h, z13.h, z12.h\n"
- "smin z11.s, p4/M, z11.s, z20.s\n"
"trn1 z16.b, z17.b, z16.b\n"
- "st1b { z16.b }, p3, [%x[outptr], x26]\n"
- "smin z10.s, p4/M, z10.s, z20.s\n"
- "incb x26, ALL, MUL #4\n"
- "smax z9.s, p4/M, z9.s, z19.s\n"
- "smax z8.s, p4/M, z8.s, z19.s\n"
- "smax z7.s, p4/M, z7.s, z19.s\n"
- "smax z6.s, p4/M, z6.s, z19.s\n"
- "trn1 z18.h, z11.h, z10.h\n"
- "smin z9.s, p4/M, z9.s, z20.s\n"
- "smin z8.s, p4/M, z8.s, z20.s\n"
- "smin z7.s, p4/M, z7.s, z20.s\n"
- "smin z6.s, p4/M, z6.s, z20.s\n"
- "smax z5.s, p4/M, z5.s, z19.s\n"
+ "smin z10.s, p0/M, z10.s, z18.s\n"
+ "smin z9.s, p0/M, z9.s, z18.s\n"
+ "trn1 z17.h, z11.h, z10.h\n"
+ "st1b { z16.b }, p4, [%x[outptr], x27]\n"
+ "smin z8.s, p0/M, z8.s, z18.s\n"
+ "smin z7.s, p0/M, z7.s, z18.s\n"
"trn1 z16.h, z9.h, z8.h\n"
- "smax z4.s, p4/M, z4.s, z19.s\n"
+ "trn1 z16.b, z17.b, z16.b\n"
+ "smin z6.s, p0/M, z6.s, z18.s\n"
+ "smin z5.s, p0/M, z5.s, z18.s\n"
"trn1 z17.h, z7.h, z6.h\n"
- "trn1 z16.b, z18.b, z16.b\n"
- "st1b { z16.b }, p2, [%x[outptr], x25]\n"
- "smin z5.s, p4/M, z5.s, z20.s\n"
- "incb x25, ALL, MUL #4\n"
- "smin z4.s, p4/M, z4.s, z20.s\n"
- "smax z3.s, p4/M, z3.s, z19.s\n"
- "smax z2.s, p4/M, z2.s, z19.s\n"
- "smax z1.s, p4/M, z1.s, z19.s\n"
- "smax z0.s, p4/M, z0.s, z19.s\n"
+ "st1b { z16.b }, p3, [%x[outptr], x26]\n"
+ "smin z4.s, p0/M, z4.s, z18.s\n"
+ "smin z3.s, p0/M, z3.s, z18.s\n"
"trn1 z16.h, z5.h, z4.h\n"
- "smin z3.s, p4/M, z3.s, z20.s\n"
"trn1 z16.b, z17.b, z16.b\n"
- "st1b { z16.b }, p1, [%x[outptr], x24]\n"
- "smin z2.s, p4/M, z2.s, z20.s\n"
- "incb x24, ALL, MUL #4\n"
- "smin z1.s, p4/M, z1.s, z20.s\n"
- "smin z0.s, p4/M, z0.s, z20.s\n"
+ "smin z2.s, p0/M, z2.s, z18.s\n"
+ "smin z1.s, p0/M, z1.s, z18.s\n"
"trn1 z17.h, z3.h, z2.h\n"
+ "st1b { z16.b }, p2, [%x[outptr], x25]\n"
+ "smin z0.s, p0/M, z0.s, z18.s\n"
"trn1 z16.h, z1.h, z0.h\n"
"trn1 z16.b, z17.b, z16.b\n"
- "st1b { z16.b }, p0, [%x[outptr], x23]\n"
- "incb x23, ALL, MUL #4\n"
- "whilelt p0.b, x23, %x[n_channels]\n"
+ "st1b { z16.b }, p1, [%x[outptr], x24]\n"
+ "incb x24, ALL, MUL #4\n"
+ "whilelt p1.b, x24, %x[n_channels]\n"
+ "incb x27, ALL, MUL #4\n"
+ "incb x26, ALL, MUL #4\n"
+ "incb x25, ALL, MUL #4\n"
"b.any 1b\n"
"7:" // Single vector of channels
- "whilelt p3.b, x26, %x[n_channels]\n"
+ "whilelt p4.b, x27, %x[n_channels]\n"
"b.none 14f\n"
"8:" // Single vector of channels: Loop
+ "lsr x23, %x[n_valid_cells], #0x1\n"
"mov z15.s, #0x0\n"
- "mov x19, %x[inptrs]\n"
"mov z14.s, #0x0\n"
- "lsr x22, %x[n_valid_cells], #0x1\n"
+ "mov x22, %x[inptrs]\n"
"mov z13.s, #0x0\n"
"mov z12.s, #0x0\n"
- "cbz x22, 11f\n"
- "ldp x21, x20, [x19, #0x0]\n"
- "ld1b { z31.b }, p3/Z, [x21, x26]\n"
- "add x19, x19, #0x10\n"
- "ld1b { z30.b }, p3/Z, [x20, x26]\n"
- "subs x22, x22, #0x1\n"
+ "cbz x23, 11f\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "subs x23, x23, #0x1\n"
+ "add x22, x22, #0x10\n"
+ "ld1b { z31.b }, p4/Z, [x21, x27]\n"
+ "ld1b { z30.b }, p4/Z, [x20, x27]\n"
"beq 10f\n"
"9:" // Single vector of channels: Loop: 2 inputs loop
- ".inst 0x455e03f7 // saddlb z23.h, z31.b, z30.b\n"
- "ldp x21, x20, [x19, #0x0]\n"
- "add x19, x19, #0x10\n"
- ".inst 0x455e07f6 // saddlt z22.h, z31.b, z30.b\n"
- "ld1b { z31.b }, p3/Z, [x21, x26]\n"
- ".inst 0x459741ef // saddwb z15.s, z15.s, z23.h\n"
- "subs x22, x22, #0x1\n"
- ".inst 0x459745ce // saddwt z14.s, z14.s, z23.h\n"
- "ld1b { z30.b }, p3/Z, [x20, x26]\n"
- ".inst 0x459641ad // saddwb z13.s, z13.s, z22.h\n"
- ".inst 0x4596458c // saddwt z12.s, z12.s, z22.h\n"
+ ".inst 0x455e03f1 // saddlb z17.h, z31.b, z30.b\n"
+ ".inst 0x455e07f0 // saddlt z16.h, z31.b, z30.b\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "subs x23, x23, #0x1\n"
+ ".inst 0x459141ef // saddwb z15.s, z15.s, z17.h\n"
+ ".inst 0x459145ce // saddwt z14.s, z14.s, z17.h\n"
+ "add x22, x22, #0x10\n"
+ "ld1b { z31.b }, p4/Z, [x21, x27]\n"
+ ".inst 0x459041ad // saddwb z13.s, z13.s, z16.h\n"
+ ".inst 0x4590458c // saddwt z12.s, z12.s, z16.h\n"
+ "ld1b { z30.b }, p4/Z, [x20, x27]\n"
"bgt 9b\n"
"10:" // Single vector of channels: Loop: 2 inputs tail
- ".inst 0x455e03f7 // saddlb z23.h, z31.b, z30.b\n"
- ".inst 0x455e07f6 // saddlt z22.h, z31.b, z30.b\n"
- ".inst 0x459741ef // saddwb z15.s, z15.s, z23.h\n"
- ".inst 0x459745ce // saddwt z14.s, z14.s, z23.h\n"
- ".inst 0x459641ad // saddwb z13.s, z13.s, z22.h\n"
- ".inst 0x4596458c // saddwt z12.s, z12.s, z22.h\n"
+ ".inst 0x455e03f1 // saddlb z17.h, z31.b, z30.b\n"
+ ".inst 0x455e07f0 // saddlt z16.h, z31.b, z30.b\n"
+ ".inst 0x459141ef // saddwb z15.s, z15.s, z17.h\n"
+ ".inst 0x459145ce // saddwt z14.s, z14.s, z17.h\n"
+ ".inst 0x459041ad // saddwb z13.s, z13.s, z16.h\n"
+ ".inst 0x4590458c // saddwt z12.s, z12.s, z16.h\n"
"11:" // Single vector of channels: Loop: After loop
- "ands x20, %x[n_valid_cells], #0x1\n"
+ "ands x21, %x[n_valid_cells], #0x1\n"
"beq 13f\n"
"12:" // Single vector of channels: Loop: Single input loop
- "ldr x21, [x19], #0x8\n"
- "subs x20, x20, #0x1\n"
- "ld1b { z31.b }, p3/Z, [x21, x26]\n"
- ".inst 0x4508a3f1 // sshllb z17.h, z31.b, #0x0\n"
- ".inst 0x4508a7f0 // sshllt z16.h, z31.b, #0x0\n"
+ "ldr x20, [x22], #0x8\n"
+ "ld1b { z16.b }, p4/Z, [x20, x27]\n"
+ ".inst 0x4508a211 // sshllb z17.h, z16.b, #0x0\n"
+ ".inst 0x4508a610 // sshllt z16.h, z16.b, #0x0\n"
+ "subs x21, x21, #0x1\n"
".inst 0x459141ef // saddwb z15.s, z15.s, z17.h\n"
".inst 0x459145ce // saddwt z14.s, z14.s, z17.h\n"
".inst 0x459041ad // saddwb z13.s, z13.s, z16.h\n"
".inst 0x4590458c // saddwt z12.s, z12.s, z16.h\n"
"bgt 12b\n"
"13:" // Single vector of channels: Loop: Single input loop: End
- "mov z20.s, #0x7f\n"
- "ld1rw { z17.s }, p4/Z, [%x[rescale_ptr]]\n"
- "ld1rw { z16.s }, p4/Z, [%x[shift_ptr]]\n"
- "not z19.s, p4/M, z20.s\n"
+ "ld1rw { z17.s }, p0/Z, [%x[rescale_ptr]]\n"
+ "ld1rw { z16.s }, p0/Z, [%x[shift_ptr]]\n"
".inst 0x04b175ef // sqdmulh z15.s, z15.s, z17.s\n"
".inst 0x04b175ce // sqdmulh z14.s, z14.s, z17.s\n"
".inst 0x04b175ad // sqdmulh z13.s, z13.s, z17.s\n"
".inst 0x04b1758c // sqdmulh z12.s, z12.s, z17.s\n"
- ".inst 0x4482920f // srshl z15.s, p4/M, z15.s, z16.s\n"
- ".inst 0x4482920e // srshl z14.s, p4/M, z14.s, z16.s\n"
- ".inst 0x4482920d // srshl z13.s, p4/M, z13.s, z16.s\n"
- ".inst 0x4482920c // srshl z12.s, p4/M, z12.s, z16.s\n"
- "smax z15.s, p4/M, z15.s, z19.s\n"
- "smax z14.s, p4/M, z14.s, z19.s\n"
- "smax z13.s, p4/M, z13.s, z19.s\n"
- "smax z12.s, p4/M, z12.s, z19.s\n"
- "smin z15.s, p4/M, z15.s, z20.s\n"
- "smin z14.s, p4/M, z14.s, z20.s\n"
- "smin z13.s, p4/M, z13.s, z20.s\n"
- "smin z12.s, p4/M, z12.s, z20.s\n"
+ ".inst 0x4482820f // srshl z15.s, p0/M, z15.s, z16.s\n"
+ ".inst 0x4482820e // srshl z14.s, p0/M, z14.s, z16.s\n"
+ "mov z18.s, #0x7f\n"
+ ".inst 0x4482820d // srshl z13.s, p0/M, z13.s, z16.s\n"
+ ".inst 0x4482820c // srshl z12.s, p0/M, z12.s, z16.s\n"
+ "not z16.s, p0/M, z18.s\n"
+ "smax z15.s, p0/M, z15.s, z16.s\n"
+ "smax z14.s, p0/M, z14.s, z16.s\n"
+ "smax z13.s, p0/M, z13.s, z16.s\n"
+ "smax z12.s, p0/M, z12.s, z16.s\n"
+ "smin z15.s, p0/M, z15.s, z18.s\n"
+ "smin z14.s, p0/M, z14.s, z18.s\n"
+ "smin z13.s, p0/M, z13.s, z18.s\n"
"trn1 z17.h, z15.h, z14.h\n"
+ "smin z12.s, p0/M, z12.s, z18.s\n"
"trn1 z16.h, z13.h, z12.h\n"
"trn1 z16.b, z17.b, z16.b\n"
- "st1b { z16.b }, p3, [%x[outptr], x26]\n"
- "incb x26\n"
- "whilelt p3.b, x26, %x[n_channels]\n"
+ "st1b { z16.b }, p4, [%x[outptr], x27]\n"
+ "incb x27\n"
+ "whilelt p4.b, x27, %x[n_channels]\n"
"b.any 8b\n"
"14:" // End
-
:
: [inptrs] "r" (inptrs), [n_channels] "r" (n_channels), [n_valid_cells] "r" (n_valid_cells), [outptr] "r" (outptr), [rescale_ptr] "r" (&rescale_value), [shift_ptr] "r" (&shift_value)
- : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
} // namespace pooling
} // namespace arm_conv
-#endif // defined(__ARM_FEATURE_SVE) && defined(SVE2)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8_nhwc_max_2x2_s1_output2x2_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8_nhwc_max_2x2_s1_output2x2_depthfirst.hpp
index 071e79c93d..ac842ac623 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8_nhwc_max_2x2_s1_output2x2_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8_nhwc_max_2x2_s1_output2x2_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,37 +24,28 @@
#pragma once
-#if defined(__ARM_FEATURE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
namespace arm_conv {
namespace pooling {
void sve_s8_nhwc_max_2x2_s1_output2x2_depthfirst_impl(unsigned int, const int8_t *const *const, int8_t *const *const, bool, unsigned int, unsigned int, unsigned int, unsigned int);
-struct sve_s8_nhwc_max_2x2_s1_output2x2_depthfirst
+struct sve_s8_nhwc_max_2x2_s1_output2x2_depthfirst : public DepthfirstStrategy<int8_t, int8_t>
{
- typedef int8_t operand_type;
- typedef int8_t return_type;
+ using Parent = DepthfirstStrategy<int8_t, int8_t>;
- typedef void (*kern_type)(unsigned int, const int8_t *const *const, int8_t *const *const, bool, unsigned int, unsigned int, unsigned int, unsigned int);
+ const static auto pooling_type = PoolingType::MAX;
+ const static auto pool_rows = 2u, pool_cols = 2u;
+ const static auto stride_rows = 1u, stride_cols = 1u;
- constexpr static PoolingType pooling_type(void) { return PoolingType::MAX; }
+ sve_s8_nhwc_max_2x2_s1_output2x2_depthfirst(const CPUInfo *)
+ : Parent(pool_rows, pool_cols, stride_rows, stride_cols, 2, 2) {}
- constexpr static unsigned int pool_rows(void) { return 2; }
- constexpr static unsigned int pool_cols(void) { return 2; }
-
- constexpr static unsigned int stride_rows(void) { return 1; }
- constexpr static unsigned int stride_cols(void) { return 1; }
-
- constexpr static unsigned int out_rows(void) { return 2; }
- constexpr static unsigned int out_cols(void) { return 2; }
-
- kern_type kernel = sve_s8_nhwc_max_2x2_s1_output2x2_depthfirst_impl;
-
- sve_s8_nhwc_max_2x2_s1_output2x2_depthfirst(const CPUInfo *) {}
+ Parent::KernelType get_kernel(void) const { return sve_s8_nhwc_max_2x2_s1_output2x2_depthfirst_impl; }
};
} // namespace pooling
} // namespace arm_conv
-#endif // defined(__ARM_FEATURE_SVE)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp
index bdf3f53292..5681cc1f3d 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -26,7 +26,7 @@
#include <cstddef>
#include <cstdint>
-#if defined(__ARM_FEATURE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
namespace arm_conv {
namespace pooling {
@@ -63,84 +63,84 @@ void sve_s8_nhwc_max_2x2_s1_output2x2_depthfirst_impl(
pad_left, pad_top, pad_right, pad_bottom);
__asm__ __volatile__(
- "ldr x14, [%x[args], %[offsetof_n_channels]]\n"
+ "ldr x15, [%x[args], %[offsetof_n_channels]]\n"
+ "ldr x21, [%x[args], %[offsetof_outptrs]]\n"
+ "mov x14, #0x0\n"
+ "whilelt p0.b, x14, x15\n"
+ "ldr x20, [%x[args], %[offsetof_inptrs]]\n"
+ "ldp x13, x12, [x21, #0x0]\n"
"ptrue p2.b\n"
- "ldr x20, [%x[args], %[offsetof_outptrs]]\n"
- "mov x13, #0x0\n"
- "ldr x19, [%x[args], %[offsetof_inptrs]]\n"
- "mov x12, #0x0\n"
- "ldp x11, x10, [x20, #0x0]\n"
- "whilelt p1.b, x13, x14\n"
- "ldp x9, x28, [x20, #0x10]\n"
- "ldp x27, x26, [x19, #0x0]\n"
- "ldp x25, x24, [x19, #0x10]\n"
- "ldp x23, x22, [x19, #0x20]\n"
- "ldp x21, x20, [x19, #0x30]\n"
- "ldr x19, [x19, #0x40]\n"
- "ld1b { z31.b }, p1/Z, [x26, x13]\n"
- "ld1b { z30.b }, p1/Z, [x23, x13]\n"
- "ld1b { z29.b }, p1/Z, [x20, x13]\n"
- "ld1b { z28.b }, p1/Z, [x24, x13]\n"
- "ld1b { z27.b }, p1/Z, [x27, x13]\n"
- "ld1b { z26.b }, p1/Z, [x22, x13]\n"
- "ld1b { z25.b }, p1/Z, [x25, x13]\n"
- "ld1b { z24.b }, p1/Z, [x21, x13]\n"
- "ld1b { z23.b }, p1/Z, [x19, x13]\n"
- "incw x13\n"
- "whilelt p1.b, x13, x14\n"
+ "mov x11, #0x0\n"
+ "ldp x10, x9, [x21, #0x10]\n"
+ "ldp x28, x27, [x20, #0x0]\n"
+ "ldp x26, x25, [x20, #0x10]\n"
+ "ldp x24, x23, [x20, #0x20]\n"
+ "ldp x22, x21, [x20, #0x30]\n"
+ "ldr x20, [x20, #0x40]\n"
+ "ld1b { z31.b }, p0/Z, [x27, x14]\n"
+ "ld1b { z30.b }, p0/Z, [x24, x14]\n"
+ "ld1b { z29.b }, p0/Z, [x21, x14]\n"
+ "ld1b { z28.b }, p0/Z, [x25, x14]\n"
+ "ld1b { z27.b }, p0/Z, [x28, x14]\n"
+ "ld1b { z26.b }, p0/Z, [x26, x14]\n"
+ "ld1b { z25.b }, p0/Z, [x23, x14]\n"
+ "ld1b { z24.b }, p0/Z, [x22, x14]\n"
+ "ld1b { z23.b }, p0/Z, [x20, x14]\n"
+ "incw x14\n"
+ "whilelt p1.b, x14, x15\n"
"b.none 2f\n"
"1:" // Vector: Loop
"movprfx z22, z31\n smax z22.b, p2/M, z22.b, z30.b\n"
- "ld1b { z31.b }, p1/Z, [x26, x13]\n"
- "whilelt p0.b, x12, x14\n"
"movprfx z21, z30\n smax z21.b, p2/M, z21.b, z29.b\n"
- "ld1b { z30.b }, p1/Z, [x23, x13]\n"
- "movprfx z18, z28\n smax z18.b, p2/M, z18.b, z27.b\n"
- "ld1b { z29.b }, p1/Z, [x20, x13]\n"
- "movprfx z17, z26\n smax z17.b, p2/M, z17.b, z25.b\n"
- "ld1b { z27.b }, p1/Z, [x27, x13]\n"
- "movprfx z16, z24\n smax z16.b, p2/M, z16.b, z28.b\n"
- "ld1b { z28.b }, p1/Z, [x24, x13]\n"
- "movprfx z20, z26\n smax z20.b, p2/M, z20.b, z23.b\n"
- "ld1b { z26.b }, p1/Z, [x22, x13]\n"
- "movprfx z19, z22\n smax z19.b, p2/M, z19.b, z18.b\n"
- "ld1b { z25.b }, p1/Z, [x25, x13]\n"
- "movprfx z18, z22\n smax z18.b, p2/M, z18.b, z17.b\n"
- "ld1b { z24.b }, p1/Z, [x21, x13]\n"
- "movprfx z17, z21\n smax z17.b, p2/M, z17.b, z16.b\n"
- "ld1b { z23.b }, p1/Z, [x19, x13]\n"
- "incw x13\n"
- "movprfx z16, z21\n smax z16.b, p2/M, z16.b, z20.b\n"
- "st1b { z19.b }, p0, [x11, x12]\n"
- "whilelt p1.b, x13, x14\n"
- "st1b { z18.b }, p0, [x10, x12]\n"
- "st1b { z17.b }, p0, [x9, x12]\n"
- "st1b { z16.b }, p0, [x28, x12]\n"
- "incw x12\n"
+ "ld1b { z31.b }, p1/Z, [x27, x14]\n"
+ "ld1b { z30.b }, p1/Z, [x24, x14]\n"
+ "movprfx z20, z28\n smax z20.b, p2/M, z20.b, z27.b\n"
+ "movprfx z19, z26\n smax z19.b, p2/M, z19.b, z25.b\n"
+ "ld1b { z29.b }, p1/Z, [x21, x14]\n"
+ "ld1b { z27.b }, p1/Z, [x28, x14]\n"
+ "movprfx z17, z28\n smax z17.b, p2/M, z17.b, z24.b\n"
+ "movprfx z18, z25\n smax z18.b, p2/M, z18.b, z23.b\n"
+ "ld1b { z28.b }, p1/Z, [x25, x14]\n"
+ "ld1b { z26.b }, p1/Z, [x26, x14]\n"
+ "ld1b { z25.b }, p1/Z, [x23, x14]\n"
+ "ld1b { z24.b }, p1/Z, [x22, x14]\n"
+ "whilelt p0.b, x11, x15\n"
+ "movprfx z16, z22\n smax z16.b, p2/M, z16.b, z20.b\n"
+ "ld1b { z23.b }, p1/Z, [x20, x14]\n"
+ "incw x14\n"
+ "whilelt p1.b, x14, x15\n"
+ "st1b { z16.b }, p0, [x13, x11]\n"
+ "movprfx z16, z19\n smax z16.b, p2/M, z16.b, z22.b\n"
+ "smax z17.b, p2/M, z17.b, z21.b\n"
+ "st1b { z16.b }, p0, [x12, x11]\n"
+ "movprfx z16, z21\n smax z16.b, p2/M, z16.b, z18.b\n"
+ "st1b { z17.b }, p0, [x10, x11]\n"
+ "st1b { z16.b }, p0, [x9, x11]\n"
+ "incw x11\n"
"b.any 1b\n"
"2:" // Vector: Tail
"movprfx z22, z31\n smax z22.b, p2/M, z22.b, z30.b\n"
- "whilelt p0.b, x12, x14\n"
"movprfx z21, z30\n smax z21.b, p2/M, z21.b, z29.b\n"
- "movprfx z18, z28\n smax z18.b, p2/M, z18.b, z27.b\n"
- "movprfx z17, z26\n smax z17.b, p2/M, z17.b, z25.b\n"
- "movprfx z16, z24\n smax z16.b, p2/M, z16.b, z28.b\n"
- "movprfx z20, z26\n smax z20.b, p2/M, z20.b, z23.b\n"
- "movprfx z19, z22\n smax z19.b, p2/M, z19.b, z18.b\n"
- "st1b { z19.b }, p0, [x11, x12]\n"
- "movprfx z18, z22\n smax z18.b, p2/M, z18.b, z17.b\n"
- "movprfx z17, z21\n smax z17.b, p2/M, z17.b, z16.b\n"
- "st1b { z18.b }, p0, [x10, x12]\n"
- "movprfx z16, z21\n smax z16.b, p2/M, z16.b, z20.b\n"
- "st1b { z17.b }, p0, [x9, x12]\n"
- "st1b { z16.b }, p0, [x28, x12]\n"
+ "movprfx z20, z28\n smax z20.b, p2/M, z20.b, z27.b\n"
+ "movprfx z19, z26\n smax z19.b, p2/M, z19.b, z25.b\n"
+ "movprfx z17, z28\n smax z17.b, p2/M, z17.b, z24.b\n"
+ "movprfx z18, z25\n smax z18.b, p2/M, z18.b, z23.b\n"
+ "whilelt p0.b, x11, x15\n"
+ "movprfx z16, z22\n smax z16.b, p2/M, z16.b, z20.b\n"
+ "st1b { z16.b }, p0, [x13, x11]\n"
+ "movprfx z16, z19\n smax z16.b, p2/M, z16.b, z22.b\n"
+ "smax z17.b, p2/M, z17.b, z21.b\n"
+ "st1b { z16.b }, p0, [x12, x11]\n"
+ "movprfx z16, z21\n smax z16.b, p2/M, z16.b, z18.b\n"
+ "st1b { z17.b }, p0, [x10, x11]\n"
+ "st1b { z16.b }, p0, [x9, x11]\n"
:
: [args] "r" (&args), [offsetof_inptrs] "I" (offsetof(KernelArgs, inptrs)), [offsetof_n_channels] "I" (offsetof(KernelArgs, n_channels)), [offsetof_outptrs] "I" (offsetof(KernelArgs, outptrs))
- : "cc", "memory", "p0", "p1", "p2", "x9", "x10", "x11", "x12", "x13", "x14", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
} // namespace pooling
} // namespace arm_conv
-#endif // defined(__ARM_FEATURE_SVE)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8_nhwc_max_generic_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8_nhwc_max_generic_depthfirst.hpp
index 428902ad61..2ee5bc0527 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8_nhwc_max_generic_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8_nhwc_max_generic_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -26,29 +26,21 @@
#pragma once
-#if defined(__ARM_FEATURE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
namespace arm_conv {
namespace pooling {
void sve_s8_nhwc_max_generic_depthfirst_impl(const uint64_t, const uint64_t n_valid_cells, uint64_t n_channels, const int8_t *const *const inptrs, int8_t *outptr);
-struct sve_s8_nhwc_max_generic_depthfirst
+struct sve_s8_nhwc_max_generic_depthfirst : IGenericDepthfirstStrategy<int8_t, int8_t>
{
- typedef int8_t operand_type;
- typedef int8_t return_type;
-
- typedef void (*kern_type)(const uint64_t, const uint64_t n_valid_cells, uint64_t n_channels, const int8_t *const *const inptrs, int8_t *outptr);
-
- constexpr static PoolingType pooling_type(void) { return PoolingType::MAX; }
-
-
- kern_type kernel = sve_s8_nhwc_max_generic_depthfirst_impl;
-
+ using Parent = IGenericDepthfirstStrategy<int8_t, int8_t>;
sve_s8_nhwc_max_generic_depthfirst(const CPUInfo *) {}
+ typename Parent::KernelType get_kernel(void) const override { return sve_s8_nhwc_max_generic_depthfirst_impl; }
};
} // namespace pooling
} // namespace arm_conv
-#endif // defined(__ARM_FEATURE_SVE)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8_nhwc_max_generic_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8_nhwc_max_generic_depthfirst/generic.cpp
index 3e88c8729c..da9e1408f9 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8_nhwc_max_generic_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8_nhwc_max_generic_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,8 +23,9 @@
*/
#include <cstdint>
+#include <cstddef>
-#if defined(__ARM_FEATURE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
namespace arm_conv {
namespace pooling {
@@ -39,185 +40,184 @@ void sve_s8_nhwc_max_generic_depthfirst_impl(
)
{
__asm__ __volatile__(
- "ptrue p4.b\n"
- "mov x28, #0x0\n"
- "cntb x27\n"
- "cntb x26, ALL, MUL #2\n"
- "cntb x25, ALL, MUL #3\n"
+ "mov x9, #0x0\n"
+ "cntb x28\n"
+ "cntb x27, ALL, MUL #2\n"
+ "cntb x26, ALL, MUL #3\n"
+ "whilelt p4.b, x9, %x[n_channels]\n"
"whilelt p3.b, x28, %x[n_channels]\n"
"whilelt p2.b, x27, %x[n_channels]\n"
"whilelt p1.b, x26, %x[n_channels]\n"
- "whilelt p0.b, x25, %x[n_channels]\n"
+ "ptrue p0.b\n"
"b.none 7f\n"
"1:" // 4-vectors of channels
+ "lsr x25, %x[n_valid_cells], #0x2\n"
+ "mov z8.b, #0x80\n"
"mov z7.b, #0x80\n"
- "mov x19, %x[inptrs]\n"
+ "mov x24, %x[inptrs]\n"
"mov z6.b, #0x80\n"
- "lsr x24, %x[n_valid_cells], #0x2\n"
"mov z5.b, #0x80\n"
- "mov z4.b, #0x80\n"
- "cbz x24, 4f\n"
- "ldp x23, x22, [x19, #0x0]\n"
- "ldp x21, x20, [x19, #0x10]\n"
- "add x19, x19, #0x20\n"
- "subs x24, x24, #0x1\n"
- "ld1b { z3.b }, p3/Z, [x23, x28]\n"
- "ld1b { z2.b }, p3/Z, [x22, x28]\n"
- "ld1b { z1.b }, p3/Z, [x21, x28]\n"
- "ld1b { z0.b }, p3/Z, [x20, x28]\n"
- "ld1b { z31.b }, p2/Z, [x23, x27]\n"
- "ld1b { z30.b }, p2/Z, [x22, x27]\n"
- "ld1b { z22.b }, p2/Z, [x21, x27]\n"
- "ld1b { z29.b }, p2/Z, [x20, x27]\n"
- "ld1b { z28.b }, p1/Z, [x23, x26]\n"
- "ld1b { z27.b }, p1/Z, [x22, x26]\n"
- "ld1b { z21.b }, p1/Z, [x21, x26]\n"
- "ld1b { z26.b }, p1/Z, [x20, x26]\n"
- "ld1b { z16.b }, p0/Z, [x23, x25]\n"
- "ld1b { z25.b }, p0/Z, [x22, x25]\n"
- "ld1b { z20.b }, p0/Z, [x21, x25]\n"
- "ld1b { z24.b }, p0/Z, [x20, x25]\n"
+ "cbz x25, 4f\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "subs x25, x25, #0x1\n"
+ "add x24, x24, #0x20\n"
+ "ld1b { z4.b }, p4/Z, [x23, x9]\n"
+ "ld1b { z3.b }, p4/Z, [x22, x9]\n"
+ "ld1b { z2.b }, p4/Z, [x21, x9]\n"
+ "ld1b { z1.b }, p4/Z, [x20, x9]\n"
+ "ld1b { z0.b }, p3/Z, [x23, x28]\n"
+ "ld1b { z31.b }, p3/Z, [x22, x28]\n"
+ "ld1b { z22.b }, p3/Z, [x21, x28]\n"
+ "ld1b { z30.b }, p3/Z, [x20, x28]\n"
+ "ld1b { z29.b }, p2/Z, [x23, x27]\n"
+ "ld1b { z28.b }, p2/Z, [x22, x27]\n"
+ "ld1b { z21.b }, p2/Z, [x21, x27]\n"
+ "ld1b { z27.b }, p2/Z, [x20, x27]\n"
+ "ld1b { z26.b }, p1/Z, [x23, x26]\n"
+ "ld1b { z25.b }, p1/Z, [x22, x26]\n"
+ "ld1b { z20.b }, p1/Z, [x21, x26]\n"
+ "ld1b { z24.b }, p1/Z, [x20, x26]\n"
"beq 3f\n"
"2:" // 4-vectors of channels: 4 inputs loop
- "movprfx z19, z3\n smax z19.b, p4/M, z19.b, z2.b\n"
- "ldp x23, x22, [x19, #0x0]\n"
- "subs x24, x24, #0x1\n"
- "movprfx z23, z1\n smax z23.b, p4/M, z23.b, z0.b\n"
- "ldp x21, x20, [x19, #0x10]\n"
- "add x19, x19, #0x20\n"
- "movprfx z18, z31\n smax z18.b, p4/M, z18.b, z30.b\n"
- "ld1b { z3.b }, p3/Z, [x23, x28]\n"
- "smax z22.b, p4/M, z22.b, z29.b\n"
- "movprfx z17, z28\n smax z17.b, p4/M, z17.b, z27.b\n"
- "ld1b { z2.b }, p3/Z, [x22, x28]\n"
- "smax z21.b, p4/M, z21.b, z26.b\n"
- "ld1b { z1.b }, p3/Z, [x21, x28]\n"
- "smax z16.b, p4/M, z16.b, z25.b\n"
- "ld1b { z0.b }, p3/Z, [x20, x28]\n"
- "smax z20.b, p4/M, z20.b, z24.b\n"
- "ld1b { z31.b }, p2/Z, [x23, x27]\n"
- "smax z19.b, p4/M, z19.b, z23.b\n"
- "ld1b { z30.b }, p2/Z, [x22, x27]\n"
- "smax z18.b, p4/M, z18.b, z22.b\n"
- "ld1b { z22.b }, p2/Z, [x21, x27]\n"
- "smax z17.b, p4/M, z17.b, z21.b\n"
- "ld1b { z29.b }, p2/Z, [x20, x27]\n"
- "smax z16.b, p4/M, z16.b, z20.b\n"
- "ld1b { z28.b }, p1/Z, [x23, x26]\n"
- "smax z7.b, p4/M, z7.b, z19.b\n"
- "ld1b { z27.b }, p1/Z, [x22, x26]\n"
- "smax z6.b, p4/M, z6.b, z18.b\n"
- "ld1b { z21.b }, p1/Z, [x21, x26]\n"
- "smax z5.b, p4/M, z5.b, z17.b\n"
- "ld1b { z26.b }, p1/Z, [x20, x26]\n"
- "smax z4.b, p4/M, z4.b, z16.b\n"
- "ld1b { z16.b }, p0/Z, [x23, x25]\n"
- "ld1b { z25.b }, p0/Z, [x22, x25]\n"
- "ld1b { z20.b }, p0/Z, [x21, x25]\n"
- "ld1b { z24.b }, p0/Z, [x20, x25]\n"
+ "movprfx z19, z4\n smax z19.b, p0/M, z19.b, z3.b\n"
+ "movprfx z23, z2\n smax z23.b, p0/M, z23.b, z1.b\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "movprfx z18, z0\n smax z18.b, p0/M, z18.b, z31.b\n"
+ "smax z22.b, p0/M, z22.b, z30.b\n"
+ "ld1b { z4.b }, p4/Z, [x23, x9]\n"
+ "ld1b { z3.b }, p4/Z, [x22, x9]\n"
+ "movprfx z17, z29\n smax z17.b, p0/M, z17.b, z28.b\n"
+ "smax z21.b, p0/M, z21.b, z27.b\n"
+ "ld1b { z2.b }, p4/Z, [x21, x9]\n"
+ "ld1b { z1.b }, p4/Z, [x20, x9]\n"
+ "movprfx z16, z26\n smax z16.b, p0/M, z16.b, z25.b\n"
+ "smax z20.b, p0/M, z20.b, z24.b\n"
+ "ld1b { z0.b }, p3/Z, [x23, x28]\n"
+ "ld1b { z31.b }, p3/Z, [x22, x28]\n"
+ "smax z19.b, p0/M, z19.b, z23.b\n"
+ "smax z18.b, p0/M, z18.b, z22.b\n"
+ "ld1b { z22.b }, p3/Z, [x21, x28]\n"
+ "ld1b { z30.b }, p3/Z, [x20, x28]\n"
+ "smax z17.b, p0/M, z17.b, z21.b\n"
+ "smax z16.b, p0/M, z16.b, z20.b\n"
+ "ld1b { z29.b }, p2/Z, [x23, x27]\n"
+ "ld1b { z28.b }, p2/Z, [x22, x27]\n"
+ "subs x25, x25, #0x1\n"
+ "smax z8.b, p0/M, z8.b, z19.b\n"
+ "ld1b { z21.b }, p2/Z, [x21, x27]\n"
+ "ld1b { z27.b }, p2/Z, [x20, x27]\n"
+ "smax z7.b, p0/M, z7.b, z18.b\n"
+ "smax z6.b, p0/M, z6.b, z17.b\n"
+ "ld1b { z26.b }, p1/Z, [x23, x26]\n"
+ "ld1b { z25.b }, p1/Z, [x22, x26]\n"
+ "smax z5.b, p0/M, z5.b, z16.b\n"
+ "add x24, x24, #0x20\n"
+ "ld1b { z20.b }, p1/Z, [x21, x26]\n"
+ "ld1b { z24.b }, p1/Z, [x20, x26]\n"
"bgt 2b\n"
"3:" // 4-vectors of channels: 4 inputs tail
- "movprfx z19, z3\n smax z19.b, p4/M, z19.b, z2.b\n"
- "movprfx z23, z1\n smax z23.b, p4/M, z23.b, z0.b\n"
- "movprfx z18, z31\n smax z18.b, p4/M, z18.b, z30.b\n"
- "smax z22.b, p4/M, z22.b, z29.b\n"
- "movprfx z17, z28\n smax z17.b, p4/M, z17.b, z27.b\n"
- "smax z21.b, p4/M, z21.b, z26.b\n"
- "smax z16.b, p4/M, z16.b, z25.b\n"
- "smax z20.b, p4/M, z20.b, z24.b\n"
- "smax z19.b, p4/M, z19.b, z23.b\n"
- "smax z18.b, p4/M, z18.b, z22.b\n"
- "smax z17.b, p4/M, z17.b, z21.b\n"
- "smax z16.b, p4/M, z16.b, z20.b\n"
- "smax z7.b, p4/M, z7.b, z19.b\n"
- "smax z6.b, p4/M, z6.b, z18.b\n"
- "smax z5.b, p4/M, z5.b, z17.b\n"
- "smax z4.b, p4/M, z4.b, z16.b\n"
+ "movprfx z19, z4\n smax z19.b, p0/M, z19.b, z3.b\n"
+ "movprfx z23, z2\n smax z23.b, p0/M, z23.b, z1.b\n"
+ "movprfx z18, z0\n smax z18.b, p0/M, z18.b, z31.b\n"
+ "smax z22.b, p0/M, z22.b, z30.b\n"
+ "movprfx z17, z29\n smax z17.b, p0/M, z17.b, z28.b\n"
+ "smax z21.b, p0/M, z21.b, z27.b\n"
+ "movprfx z16, z26\n smax z16.b, p0/M, z16.b, z25.b\n"
+ "smax z20.b, p0/M, z20.b, z24.b\n"
+ "smax z19.b, p0/M, z19.b, z23.b\n"
+ "smax z18.b, p0/M, z18.b, z22.b\n"
+ "smax z17.b, p0/M, z17.b, z21.b\n"
+ "smax z16.b, p0/M, z16.b, z20.b\n"
+ "smax z8.b, p0/M, z8.b, z19.b\n"
+ "smax z7.b, p0/M, z7.b, z18.b\n"
+ "smax z6.b, p0/M, z6.b, z17.b\n"
+ "smax z5.b, p0/M, z5.b, z16.b\n"
"4:" // 4-vectors of channels: After loop
- "ands x20, %x[n_valid_cells], #0x3\n"
+ "ands x21, %x[n_valid_cells], #0x3\n"
"beq 6f\n"
"5:" // 4-vectors of channels: Single input loop
- "ldr x23, [x19], #0x8\n"
- "subs x20, x20, #0x1\n"
- "ld1b { z3.b }, p3/Z, [x23, x28]\n"
- "smax z7.b, p4/M, z7.b, z3.b\n"
- "ld1b { z31.b }, p2/Z, [x23, x27]\n"
- "ld1b { z28.b }, p1/Z, [x23, x26]\n"
- "smax z6.b, p4/M, z6.b, z31.b\n"
- "ld1b { z16.b }, p0/Z, [x23, x25]\n"
- "smax z5.b, p4/M, z5.b, z28.b\n"
- "smax z4.b, p4/M, z4.b, z16.b\n"
+ "ldr x20, [x24], #0x8\n"
+ "ld1b { z16.b }, p4/Z, [x20, x9]\n"
+ "subs x21, x21, #0x1\n"
+ "smax z8.b, p0/M, z8.b, z16.b\n"
+ "ld1b { z17.b }, p3/Z, [x20, x28]\n"
+ "ld1b { z16.b }, p2/Z, [x20, x27]\n"
+ "smax z7.b, p0/M, z7.b, z17.b\n"
+ "smax z6.b, p0/M, z6.b, z16.b\n"
+ "ld1b { z16.b }, p1/Z, [x20, x26]\n"
+ "smax z5.b, p0/M, z5.b, z16.b\n"
"bgt 5b\n"
"6:" // 4-vectors of channels: Single input loop: End
+ "st1b { z8.b }, p4, [%x[outptr], x9]\n"
+ "incb x9, ALL, MUL #4\n"
"st1b { z7.b }, p3, [%x[outptr], x28]\n"
"incb x28, ALL, MUL #4\n"
"st1b { z6.b }, p2, [%x[outptr], x27]\n"
"incb x27, ALL, MUL #4\n"
"st1b { z5.b }, p1, [%x[outptr], x26]\n"
"incb x26, ALL, MUL #4\n"
- "st1b { z4.b }, p0, [%x[outptr], x25]\n"
- "incb x25, ALL, MUL #4\n"
- "whilelt p0.b, x25, %x[n_channels]\n"
+ "whilelt p1.b, x26, %x[n_channels]\n"
"b.any 1b\n"
"7:" // Single vector of channels
- "whilelt p3.b, x28, %x[n_channels]\n"
+ "whilelt p4.b, x9, %x[n_channels]\n"
"b.none 14f\n"
"8:" // Single vector of channels: Loop
- "mov z7.b, #0x80\n"
- "mov x19, %x[inptrs]\n"
- "lsr x24, %x[n_valid_cells], #0x2\n"
- "cbz x24, 11f\n"
- "ldp x23, x22, [x19, #0x0]\n"
- "ldp x21, x20, [x19, #0x10]\n"
- "add x19, x19, #0x20\n"
- "subs x24, x24, #0x1\n"
- "ld1b { z3.b }, p3/Z, [x23, x28]\n"
- "ld1b { z2.b }, p3/Z, [x22, x28]\n"
- "ld1b { z1.b }, p3/Z, [x21, x28]\n"
- "ld1b { z0.b }, p3/Z, [x20, x28]\n"
+ "lsr x25, %x[n_valid_cells], #0x2\n"
+ "mov z8.b, #0x80\n"
+ "mov x24, %x[inptrs]\n"
+ "cbz x25, 11f\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "subs x25, x25, #0x1\n"
+ "add x24, x24, #0x20\n"
+ "ld1b { z4.b }, p4/Z, [x23, x9]\n"
+ "ld1b { z3.b }, p4/Z, [x22, x9]\n"
+ "ld1b { z2.b }, p4/Z, [x21, x9]\n"
+ "ld1b { z1.b }, p4/Z, [x20, x9]\n"
"beq 10f\n"
"9:" // Single vector of channels: Loop: 4 inputs loop
- "movprfx z19, z3\n smax z19.b, p4/M, z19.b, z2.b\n"
- "ldp x23, x22, [x19, #0x0]\n"
- "subs x24, x24, #0x1\n"
- "movprfx z23, z1\n smax z23.b, p4/M, z23.b, z0.b\n"
- "ldp x21, x20, [x19, #0x10]\n"
- "add x19, x19, #0x20\n"
- "smax z19.b, p4/M, z19.b, z23.b\n"
- "ld1b { z3.b }, p3/Z, [x23, x28]\n"
- "ld1b { z2.b }, p3/Z, [x22, x28]\n"
- "smax z7.b, p4/M, z7.b, z19.b\n"
- "ld1b { z1.b }, p3/Z, [x21, x28]\n"
- "ld1b { z0.b }, p3/Z, [x20, x28]\n"
+ "movprfx z16, z4\n smax z16.b, p0/M, z16.b, z3.b\n"
+ "movprfx z17, z2\n smax z17.b, p0/M, z17.b, z1.b\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "smax z16.b, p0/M, z16.b, z17.b\n"
+ "subs x25, x25, #0x1\n"
+ "ld1b { z4.b }, p4/Z, [x23, x9]\n"
+ "ld1b { z3.b }, p4/Z, [x22, x9]\n"
+ "smax z8.b, p0/M, z8.b, z16.b\n"
+ "add x24, x24, #0x20\n"
+ "ld1b { z2.b }, p4/Z, [x21, x9]\n"
+ "ld1b { z1.b }, p4/Z, [x20, x9]\n"
"bgt 9b\n"
"10:" // Single vector of channels: Loop: 4 inputs tail
- "movprfx z19, z3\n smax z19.b, p4/M, z19.b, z2.b\n"
- "movprfx z23, z1\n smax z23.b, p4/M, z23.b, z0.b\n"
- "smax z19.b, p4/M, z19.b, z23.b\n"
- "smax z7.b, p4/M, z7.b, z19.b\n"
+ "movprfx z16, z4\n smax z16.b, p0/M, z16.b, z3.b\n"
+ "movprfx z17, z2\n smax z17.b, p0/M, z17.b, z1.b\n"
+ "smax z16.b, p0/M, z16.b, z17.b\n"
+ "smax z8.b, p0/M, z8.b, z16.b\n"
"11:" // Single vector of channels: Loop: After loop
- "ands x20, %x[n_valid_cells], #0x3\n"
+ "ands x21, %x[n_valid_cells], #0x3\n"
"beq 13f\n"
"12:" // Single vector of channels: Loop: Single input loop
- "ldr x23, [x19], #0x8\n"
- "subs x20, x20, #0x1\n"
- "ld1b { z3.b }, p3/Z, [x23, x28]\n"
- "smax z7.b, p4/M, z7.b, z3.b\n"
+ "ldr x20, [x24], #0x8\n"
+ "ld1b { z16.b }, p4/Z, [x20, x9]\n"
+ "subs x21, x21, #0x1\n"
+ "smax z8.b, p0/M, z8.b, z16.b\n"
"bgt 12b\n"
"13:" // Single vector of channels: Loop: Single input loop: End
- "st1b { z7.b }, p3, [%x[outptr], x28]\n"
- "incb x28\n"
- "whilelt p3.b, x28, %x[n_channels]\n"
+ "st1b { z8.b }, p4, [%x[outptr], x9]\n"
+ "incb x9\n"
+ "whilelt p4.b, x9, %x[n_channels]\n"
"b.any 8b\n"
"14:" // End
-
:
: [inptrs] "r" (inptrs), [n_channels] "r" (n_channels), [n_valid_cells] "r" (n_valid_cells), [outptr] "r" (outptr)
- : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x9", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
} // namespace pooling
} // namespace arm_conv
-#endif // defined(__ARM_FEATURE_SVE)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8q_nhwc_avg_generic_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8q_nhwc_avg_generic_depthfirst.hpp
index 1242eaf530..6f34faa121 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8q_nhwc_avg_generic_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8q_nhwc_avg_generic_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -26,29 +26,21 @@
#pragma once
-#if defined(__ARM_FEATURE_SVE) && defined(SVE2)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
namespace arm_conv {
namespace pooling {
void sve_s8q_nhwc_avg_generic_depthfirst_impl(const uint64_t window_cells, const uint64_t n_valid_cells, uint64_t n_channels, const int8_t *const *const inptrs, int8_t *outptr, const Requantize32 &qp);
-struct sve_s8q_nhwc_avg_generic_depthfirst
+struct sve_s8q_nhwc_avg_generic_depthfirst : IGenericDepthfirstStrategy<int8_t, int8_t, Requantize32>
{
- typedef int8_t operand_type;
- typedef int8_t return_type;
-
- typedef void (*kern_type)(const uint64_t window_cells, const uint64_t n_valid_cells, uint64_t n_channels, const int8_t *const *const inptrs, int8_t *outptr, const Requantize32 &qp);
-
- constexpr static PoolingType pooling_type(void) { return PoolingType::AVERAGE; }
-
-
- kern_type kernel = sve_s8q_nhwc_avg_generic_depthfirst_impl;
-
+ using Parent = IGenericDepthfirstStrategy<int8_t, int8_t, Requantize32>;
sve_s8q_nhwc_avg_generic_depthfirst(const CPUInfo *) {}
+ typename Parent::KernelType get_kernel(void) const override { return sve_s8q_nhwc_avg_generic_depthfirst_impl; }
};
} // namespace pooling
} // namespace arm_conv
-#endif // defined(__ARM_FEATURE_SVE) && defined(SVE2)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8q_nhwc_avg_generic_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8q_nhwc_avg_generic_depthfirst/generic.cpp
index 928eb412b5..19a3b112ad 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8q_nhwc_avg_generic_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8q_nhwc_avg_generic_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,11 +24,12 @@
#include "pooling.hpp"
#include <cstdint>
+#include <cstddef>
#include <cstring>
#include <cmath>
-#if defined(__ARM_FEATURE_SVE) && defined(SVE2)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
namespace arm_conv {
namespace pooling {
@@ -86,12 +87,13 @@ void sve_s8q_nhwc_avg_generic_depthfirst_impl(
f_rescale_value *= 2.0f;
}
- rescale_value = static_cast<int32_t>(round(f_rescale_value * static_cast<float>(1ll << 31)));
- if (static_cast<int64_t>(rescale_value) == (1ll << 31))
+ int64_t long_rescale_value = round(f_rescale_value * static_cast<float>(1ll << 31));
+ if (long_rescale_value == (1ll << 31))
{
shift_value++;
- rescale_value >>= 1;
+ long_rescale_value >>= 1;
}
+ rescale_value = static_cast<int32_t>(long_rescale_value);
}
// Combine together the rescale value for the requantization and the scaling
@@ -112,21 +114,21 @@ void sve_s8q_nhwc_avg_generic_depthfirst_impl(
);
__asm__ __volatile__(
- "ptrue p4.b\n"
- "mov x26, #0x0\n"
- "cntb x25\n"
- "cntb x24, ALL, MUL #2\n"
- "cntb x23, ALL, MUL #3\n"
+ "mov x27, #0x0\n"
+ "cntb x26\n"
+ "cntb x25, ALL, MUL #2\n"
+ "cntb x24, ALL, MUL #3\n"
+ "whilelt p4.b, x27, %x[n_channels]\n"
"whilelt p3.b, x26, %x[n_channels]\n"
"whilelt p2.b, x25, %x[n_channels]\n"
"whilelt p1.b, x24, %x[n_channels]\n"
- "whilelt p0.b, x23, %x[n_channels]\n"
+ "ptrue p0.b\n"
"b.none 7f\n"
"1:" // 4-vectors of channels
+ "lsr x23, %x[n_valid_cells], #0x1\n"
"mov z15.s, #0x0\n"
- "mov x19, %x[inptrs]\n"
"mov z14.s, #0x0\n"
- "lsr x22, %x[n_valid_cells], #0x1\n"
+ "mov x22, %x[inptrs]\n"
"mov z13.s, #0x0\n"
"mov z12.s, #0x0\n"
"mov z11.s, #0x0\n"
@@ -141,43 +143,43 @@ void sve_s8q_nhwc_avg_generic_depthfirst_impl(
"mov z2.s, #0x0\n"
"mov z1.s, #0x0\n"
"mov z0.s, #0x0\n"
- "cbz x22, 4f\n"
- "ldp x21, x20, [x19, #0x0]\n"
- "ld1b { z31.b }, p3/Z, [x21, x26]\n"
- "add x19, x19, #0x10\n"
- "ld1b { z30.b }, p3/Z, [x20, x26]\n"
- "subs x22, x22, #0x1\n"
- "ld1b { z29.b }, p2/Z, [x21, x25]\n"
- "ld1b { z28.b }, p2/Z, [x20, x25]\n"
- "ld1b { z27.b }, p1/Z, [x21, x24]\n"
- "ld1b { z26.b }, p1/Z, [x20, x24]\n"
- "ld1b { z25.b }, p0/Z, [x21, x23]\n"
- "ld1b { z24.b }, p0/Z, [x20, x23]\n"
+ "cbz x23, 4f\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "subs x23, x23, #0x1\n"
+ "add x22, x22, #0x10\n"
+ "ld1b { z31.b }, p4/Z, [x21, x27]\n"
+ "ld1b { z30.b }, p4/Z, [x20, x27]\n"
+ "ld1b { z29.b }, p3/Z, [x21, x26]\n"
+ "ld1b { z28.b }, p3/Z, [x20, x26]\n"
+ "ld1b { z27.b }, p2/Z, [x21, x25]\n"
+ "ld1b { z26.b }, p2/Z, [x20, x25]\n"
+ "ld1b { z25.b }, p1/Z, [x21, x24]\n"
+ "ld1b { z24.b }, p1/Z, [x20, x24]\n"
"beq 3f\n"
"2:" // 4-vectors of channels: 2 inputs loop
".inst 0x455e03f7 // saddlb z23.h, z31.b, z30.b\n"
- "ldp x21, x20, [x19, #0x0]\n"
- "add x19, x19, #0x10\n"
".inst 0x455e07f6 // saddlt z22.h, z31.b, z30.b\n"
- "ld1b { z31.b }, p3/Z, [x21, x26]\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "subs x23, x23, #0x1\n"
".inst 0x455c03b5 // saddlb z21.h, z29.b, z28.b\n"
- "subs x22, x22, #0x1\n"
".inst 0x455c07b4 // saddlt z20.h, z29.b, z28.b\n"
- "ld1b { z30.b }, p3/Z, [x20, x26]\n"
+ "add x22, x22, #0x10\n"
+ "ld1b { z31.b }, p4/Z, [x21, x27]\n"
".inst 0x455a0373 // saddlb z19.h, z27.b, z26.b\n"
- "ld1b { z29.b }, p2/Z, [x21, x25]\n"
".inst 0x455a0772 // saddlt z18.h, z27.b, z26.b\n"
- "ld1b { z28.b }, p2/Z, [x20, x25]\n"
+ "ld1b { z30.b }, p4/Z, [x20, x27]\n"
+ "ld1b { z29.b }, p3/Z, [x21, x26]\n"
".inst 0x45580331 // saddlb z17.h, z25.b, z24.b\n"
- "ld1b { z27.b }, p1/Z, [x21, x24]\n"
".inst 0x45580730 // saddlt z16.h, z25.b, z24.b\n"
- "ld1b { z26.b }, p1/Z, [x20, x24]\n"
+ "ld1b { z28.b }, p3/Z, [x20, x26]\n"
+ "ld1b { z27.b }, p2/Z, [x21, x25]\n"
".inst 0x459741ef // saddwb z15.s, z15.s, z23.h\n"
- "ld1b { z25.b }, p0/Z, [x21, x23]\n"
".inst 0x459745ce // saddwt z14.s, z14.s, z23.h\n"
- "ld1b { z24.b }, p0/Z, [x20, x23]\n"
+ "ld1b { z26.b }, p2/Z, [x20, x25]\n"
+ "ld1b { z25.b }, p1/Z, [x21, x24]\n"
".inst 0x459641ad // saddwb z13.s, z13.s, z22.h\n"
".inst 0x4596458c // saddwt z12.s, z12.s, z22.h\n"
+ "ld1b { z24.b }, p1/Z, [x20, x24]\n"
".inst 0x4595416b // saddwb z11.s, z11.s, z21.h\n"
".inst 0x4595454a // saddwt z10.s, z10.s, z21.h\n"
".inst 0x45944129 // saddwb z9.s, z9.s, z20.h\n"
@@ -217,241 +219,240 @@ void sve_s8q_nhwc_avg_generic_depthfirst_impl(
".inst 0x45904021 // saddwb z1.s, z1.s, z16.h\n"
".inst 0x45904400 // saddwt z0.s, z0.s, z16.h\n"
"4:" // 4-vectors of channels: After loop
- "ands x20, %x[n_valid_cells], #0x1\n"
+ "ands x21, %x[n_valid_cells], #0x1\n"
"beq 6f\n"
"5:" // 4-vectors of channels: Single input loop
- "ldr x21, [x19], #0x8\n"
- "subs x20, x20, #0x1\n"
- "ld1b { z31.b }, p3/Z, [x21, x26]\n"
- ".inst 0x4508a3f1 // sshllb z17.h, z31.b, #0x0\n"
- "ld1b { z29.b }, p2/Z, [x21, x25]\n"
- ".inst 0x4508a7f0 // sshllt z16.h, z31.b, #0x0\n"
- "ld1b { z27.b }, p1/Z, [x21, x24]\n"
- ".inst 0x459141ef // saddwb z15.s, z15.s, z17.h\n"
- "ld1b { z25.b }, p0/Z, [x21, x23]\n"
- ".inst 0x459145ce // saddwt z14.s, z14.s, z17.h\n"
- ".inst 0x459041ad // saddwb z13.s, z13.s, z16.h\n"
- ".inst 0x4590458c // saddwt z12.s, z12.s, z16.h\n"
- ".inst 0x4508a3b0 // sshllb z16.h, z29.b, #0x0\n"
- ".inst 0x4590416b // saddwb z11.s, z11.s, z16.h\n"
- ".inst 0x4590454a // saddwt z10.s, z10.s, z16.h\n"
- ".inst 0x4508a7b0 // sshllt z16.h, z29.b, #0x0\n"
- ".inst 0x45904129 // saddwb z9.s, z9.s, z16.h\n"
- ".inst 0x45904508 // saddwt z8.s, z8.s, z16.h\n"
- ".inst 0x4508a370 // sshllb z16.h, z27.b, #0x0\n"
- ".inst 0x459040e7 // saddwb z7.s, z7.s, z16.h\n"
- ".inst 0x459044c6 // saddwt z6.s, z6.s, z16.h\n"
- ".inst 0x4508a770 // sshllt z16.h, z27.b, #0x0\n"
- ".inst 0x459040a5 // saddwb z5.s, z5.s, z16.h\n"
- ".inst 0x45904484 // saddwt z4.s, z4.s, z16.h\n"
- ".inst 0x4508a330 // sshllb z16.h, z25.b, #0x0\n"
- ".inst 0x45904063 // saddwb z3.s, z3.s, z16.h\n"
- ".inst 0x45904442 // saddwt z2.s, z2.s, z16.h\n"
- ".inst 0x4508a730 // sshllt z16.h, z25.b, #0x0\n"
+ "ldr x20, [x22], #0x8\n"
+ "ld1b { z16.b }, p4/Z, [x20, x27]\n"
+ ".inst 0x4508a217 // sshllb z23.h, z16.b, #0x0\n"
+ ".inst 0x4508a616 // sshllt z22.h, z16.b, #0x0\n"
+ "ld1b { z16.b }, p3/Z, [x20, x26]\n"
+ "ld1b { z17.b }, p2/Z, [x20, x25]\n"
+ ".inst 0x4508a215 // sshllb z21.h, z16.b, #0x0\n"
+ ".inst 0x4508a614 // sshllt z20.h, z16.b, #0x0\n"
+ "ld1b { z16.b }, p1/Z, [x20, x24]\n"
+ ".inst 0x4508a233 // sshllb z19.h, z17.b, #0x0\n"
+ ".inst 0x4508a632 // sshllt z18.h, z17.b, #0x0\n"
+ "subs x21, x21, #0x1\n"
+ ".inst 0x4508a211 // sshllb z17.h, z16.b, #0x0\n"
+ ".inst 0x4508a610 // sshllt z16.h, z16.b, #0x0\n"
+ ".inst 0x459741ef // saddwb z15.s, z15.s, z23.h\n"
+ ".inst 0x459745ce // saddwt z14.s, z14.s, z23.h\n"
+ ".inst 0x459641ad // saddwb z13.s, z13.s, z22.h\n"
+ ".inst 0x4596458c // saddwt z12.s, z12.s, z22.h\n"
+ ".inst 0x4595416b // saddwb z11.s, z11.s, z21.h\n"
+ ".inst 0x4595454a // saddwt z10.s, z10.s, z21.h\n"
+ ".inst 0x45944129 // saddwb z9.s, z9.s, z20.h\n"
+ ".inst 0x45944508 // saddwt z8.s, z8.s, z20.h\n"
+ ".inst 0x459340e7 // saddwb z7.s, z7.s, z19.h\n"
+ ".inst 0x459344c6 // saddwt z6.s, z6.s, z19.h\n"
+ ".inst 0x459240a5 // saddwb z5.s, z5.s, z18.h\n"
+ ".inst 0x45924484 // saddwt z4.s, z4.s, z18.h\n"
+ ".inst 0x45914063 // saddwb z3.s, z3.s, z17.h\n"
+ ".inst 0x45914442 // saddwt z2.s, z2.s, z17.h\n"
".inst 0x45904021 // saddwb z1.s, z1.s, z16.h\n"
".inst 0x45904400 // saddwt z0.s, z0.s, z16.h\n"
"bgt 5b\n"
"6:" // 4-vectors of channels: Single input loop: End
- "mov z20.s, #0x7f\n"
- "ld1rw { z18.s }, p4/Z, [%x[combined_rescale_value]]\n"
- "ld1rw { z17.s }, p4/Z, [%x[left_shift]]\n"
- "not z19.s, p4/M, z20.s\n"
- "ld1rw { z16.s }, p4/Z, [%x[right_shift]]\n"
- ".inst 0x4482922f // srshl z15.s, p4/M, z15.s, z17.s\n"
- ".inst 0x4482922e // srshl z14.s, p4/M, z14.s, z17.s\n"
- ".inst 0x4482922d // srshl z13.s, p4/M, z13.s, z17.s\n"
- ".inst 0x4482922c // srshl z12.s, p4/M, z12.s, z17.s\n"
- ".inst 0x4482922b // srshl z11.s, p4/M, z11.s, z17.s\n"
- ".inst 0x04b275ef // sqrdmulh z15.s, z15.s, z18.s\n"
- ".inst 0x04b275ce // sqrdmulh z14.s, z14.s, z18.s\n"
- ".inst 0x04b275ad // sqrdmulh z13.s, z13.s, z18.s\n"
- ".inst 0x04b2758c // sqrdmulh z12.s, z12.s, z18.s\n"
- ".inst 0x04b2756b // sqrdmulh z11.s, z11.s, z18.s\n"
- ".inst 0x4482920f // srshl z15.s, p4/M, z15.s, z16.s\n"
- ".inst 0x4482920e // srshl z14.s, p4/M, z14.s, z16.s\n"
- ".inst 0x4482920d // srshl z13.s, p4/M, z13.s, z16.s\n"
- ".inst 0x4482920c // srshl z12.s, p4/M, z12.s, z16.s\n"
- ".inst 0x4482920b // srshl z11.s, p4/M, z11.s, z16.s\n"
- ".inst 0x4482922a // srshl z10.s, p4/M, z10.s, z17.s\n"
- ".inst 0x44829229 // srshl z9.s, p4/M, z9.s, z17.s\n"
- ".inst 0x44829228 // srshl z8.s, p4/M, z8.s, z17.s\n"
- ".inst 0x44829227 // srshl z7.s, p4/M, z7.s, z17.s\n"
- ".inst 0x04b2754a // sqrdmulh z10.s, z10.s, z18.s\n"
- ".inst 0x04b27529 // sqrdmulh z9.s, z9.s, z18.s\n"
- ".inst 0x04b27508 // sqrdmulh z8.s, z8.s, z18.s\n"
- ".inst 0x04b274e7 // sqrdmulh z7.s, z7.s, z18.s\n"
- ".inst 0x4482920a // srshl z10.s, p4/M, z10.s, z16.s\n"
- ".inst 0x44829209 // srshl z9.s, p4/M, z9.s, z16.s\n"
- ".inst 0x44829208 // srshl z8.s, p4/M, z8.s, z16.s\n"
- ".inst 0x44829207 // srshl z7.s, p4/M, z7.s, z16.s\n"
- ".inst 0x44829226 // srshl z6.s, p4/M, z6.s, z17.s\n"
- ".inst 0x44829225 // srshl z5.s, p4/M, z5.s, z17.s\n"
- ".inst 0x44829224 // srshl z4.s, p4/M, z4.s, z17.s\n"
- ".inst 0x44829223 // srshl z3.s, p4/M, z3.s, z17.s\n"
- ".inst 0x04b274c6 // sqrdmulh z6.s, z6.s, z18.s\n"
- ".inst 0x04b274a5 // sqrdmulh z5.s, z5.s, z18.s\n"
- ".inst 0x04b27484 // sqrdmulh z4.s, z4.s, z18.s\n"
- ".inst 0x04b27463 // sqrdmulh z3.s, z3.s, z18.s\n"
- ".inst 0x44829206 // srshl z6.s, p4/M, z6.s, z16.s\n"
- ".inst 0x44829205 // srshl z5.s, p4/M, z5.s, z16.s\n"
- ".inst 0x44829204 // srshl z4.s, p4/M, z4.s, z16.s\n"
- ".inst 0x44829203 // srshl z3.s, p4/M, z3.s, z16.s\n"
- ".inst 0x44829222 // srshl z2.s, p4/M, z2.s, z17.s\n"
- ".inst 0x44829221 // srshl z1.s, p4/M, z1.s, z17.s\n"
- ".inst 0x44829220 // srshl z0.s, p4/M, z0.s, z17.s\n"
- "smax z15.s, p4/M, z15.s, z19.s\n"
- ".inst 0x04b27442 // sqrdmulh z2.s, z2.s, z18.s\n"
- ".inst 0x04b27421 // sqrdmulh z1.s, z1.s, z18.s\n"
- ".inst 0x04b27400 // sqrdmulh z0.s, z0.s, z18.s\n"
- "smin z15.s, p4/M, z15.s, z20.s\n"
- ".inst 0x44829202 // srshl z2.s, p4/M, z2.s, z16.s\n"
- ".inst 0x44829201 // srshl z1.s, p4/M, z1.s, z16.s\n"
- ".inst 0x44829200 // srshl z0.s, p4/M, z0.s, z16.s\n"
- "smax z14.s, p4/M, z14.s, z19.s\n"
- "smax z13.s, p4/M, z13.s, z19.s\n"
- "smax z12.s, p4/M, z12.s, z19.s\n"
- "smax z11.s, p4/M, z11.s, z19.s\n"
- "smin z14.s, p4/M, z14.s, z20.s\n"
- "smin z13.s, p4/M, z13.s, z20.s\n"
- "smin z12.s, p4/M, z12.s, z20.s\n"
- "smin z11.s, p4/M, z11.s, z20.s\n"
+ "ld1rw { z18.s }, p0/Z, [%x[left_shift]]\n"
+ "ld1rw { z17.s }, p0/Z, [%x[combined_rescale_value]]\n"
+ ".inst 0x4482824f // srshl z15.s, p0/M, z15.s, z18.s\n"
+ ".inst 0x4482824e // srshl z14.s, p0/M, z14.s, z18.s\n"
+ ".inst 0x4482824d // srshl z13.s, p0/M, z13.s, z18.s\n"
+ ".inst 0x4482824c // srshl z12.s, p0/M, z12.s, z18.s\n"
+ "ld1rw { z16.s }, p0/Z, [%x[right_shift]]\n"
+ ".inst 0x04b175ef // sqrdmulh z15.s, z15.s, z17.s\n"
+ ".inst 0x4482824b // srshl z11.s, p0/M, z11.s, z18.s\n"
+ ".inst 0x4482824a // srshl z10.s, p0/M, z10.s, z18.s\n"
+ ".inst 0x04b175ce // sqrdmulh z14.s, z14.s, z17.s\n"
+ ".inst 0x04b175ad // sqrdmulh z13.s, z13.s, z17.s\n"
+ ".inst 0x44828249 // srshl z9.s, p0/M, z9.s, z18.s\n"
+ ".inst 0x44828248 // srshl z8.s, p0/M, z8.s, z18.s\n"
+ ".inst 0x04b1758c // sqrdmulh z12.s, z12.s, z17.s\n"
+ ".inst 0x04b1756b // sqrdmulh z11.s, z11.s, z17.s\n"
+ ".inst 0x44828247 // srshl z7.s, p0/M, z7.s, z18.s\n"
+ ".inst 0x44828246 // srshl z6.s, p0/M, z6.s, z18.s\n"
+ ".inst 0x04b1754a // sqrdmulh z10.s, z10.s, z17.s\n"
+ ".inst 0x04b17529 // sqrdmulh z9.s, z9.s, z17.s\n"
+ ".inst 0x44828245 // srshl z5.s, p0/M, z5.s, z18.s\n"
+ ".inst 0x44828244 // srshl z4.s, p0/M, z4.s, z18.s\n"
+ ".inst 0x04b17508 // sqrdmulh z8.s, z8.s, z17.s\n"
+ ".inst 0x04b174e7 // sqrdmulh z7.s, z7.s, z17.s\n"
+ ".inst 0x44828243 // srshl z3.s, p0/M, z3.s, z18.s\n"
+ ".inst 0x44828242 // srshl z2.s, p0/M, z2.s, z18.s\n"
+ ".inst 0x04b174c6 // sqrdmulh z6.s, z6.s, z17.s\n"
+ ".inst 0x04b174a5 // sqrdmulh z5.s, z5.s, z17.s\n"
+ ".inst 0x44828241 // srshl z1.s, p0/M, z1.s, z18.s\n"
+ ".inst 0x44828240 // srshl z0.s, p0/M, z0.s, z18.s\n"
+ ".inst 0x04b17484 // sqrdmulh z4.s, z4.s, z17.s\n"
+ ".inst 0x04b17463 // sqrdmulh z3.s, z3.s, z17.s\n"
+ ".inst 0x04b17442 // sqrdmulh z2.s, z2.s, z17.s\n"
+ ".inst 0x04b17421 // sqrdmulh z1.s, z1.s, z17.s\n"
+ ".inst 0x4482820f // srshl z15.s, p0/M, z15.s, z16.s\n"
+ ".inst 0x4482820e // srshl z14.s, p0/M, z14.s, z16.s\n"
+ ".inst 0x04b17400 // sqrdmulh z0.s, z0.s, z17.s\n"
+ "mov z18.s, #0x7f\n"
+ ".inst 0x4482820d // srshl z13.s, p0/M, z13.s, z16.s\n"
+ ".inst 0x4482820c // srshl z12.s, p0/M, z12.s, z16.s\n"
+ ".inst 0x4482820b // srshl z11.s, p0/M, z11.s, z16.s\n"
+ ".inst 0x4482820a // srshl z10.s, p0/M, z10.s, z16.s\n"
+ ".inst 0x44828209 // srshl z9.s, p0/M, z9.s, z16.s\n"
+ ".inst 0x44828208 // srshl z8.s, p0/M, z8.s, z16.s\n"
+ ".inst 0x44828207 // srshl z7.s, p0/M, z7.s, z16.s\n"
+ ".inst 0x44828206 // srshl z6.s, p0/M, z6.s, z16.s\n"
+ ".inst 0x44828205 // srshl z5.s, p0/M, z5.s, z16.s\n"
+ ".inst 0x44828204 // srshl z4.s, p0/M, z4.s, z16.s\n"
+ ".inst 0x44828203 // srshl z3.s, p0/M, z3.s, z16.s\n"
+ ".inst 0x44828202 // srshl z2.s, p0/M, z2.s, z16.s\n"
+ ".inst 0x44828201 // srshl z1.s, p0/M, z1.s, z16.s\n"
+ ".inst 0x44828200 // srshl z0.s, p0/M, z0.s, z16.s\n"
+ "not z16.s, p0/M, z18.s\n"
+ "smax z15.s, p0/M, z15.s, z16.s\n"
+ "smax z14.s, p0/M, z14.s, z16.s\n"
+ "smax z13.s, p0/M, z13.s, z16.s\n"
+ "smax z12.s, p0/M, z12.s, z16.s\n"
+ "smax z11.s, p0/M, z11.s, z16.s\n"
+ "smax z10.s, p0/M, z10.s, z16.s\n"
+ "smax z9.s, p0/M, z9.s, z16.s\n"
+ "smax z8.s, p0/M, z8.s, z16.s\n"
+ "smax z7.s, p0/M, z7.s, z16.s\n"
+ "smax z6.s, p0/M, z6.s, z16.s\n"
+ "smax z5.s, p0/M, z5.s, z16.s\n"
+ "smax z4.s, p0/M, z4.s, z16.s\n"
+ "smax z3.s, p0/M, z3.s, z16.s\n"
+ "smax z2.s, p0/M, z2.s, z16.s\n"
+ "smax z1.s, p0/M, z1.s, z16.s\n"
+ "smax z0.s, p0/M, z0.s, z16.s\n"
+ "smin z15.s, p0/M, z15.s, z18.s\n"
+ "smin z14.s, p0/M, z14.s, z18.s\n"
+ "smin z13.s, p0/M, z13.s, z18.s\n"
"trn1 z17.h, z15.h, z14.h\n"
- "smax z10.s, p4/M, z10.s, z19.s\n"
+ "smin z12.s, p0/M, z12.s, z18.s\n"
+ "smin z11.s, p0/M, z11.s, z18.s\n"
"trn1 z16.h, z13.h, z12.h\n"
- "smax z9.s, p4/M, z9.s, z19.s\n"
"trn1 z16.b, z17.b, z16.b\n"
- "st1b { z16.b }, p3, [%x[outptr], x26]\n"
- "smin z10.s, p4/M, z10.s, z20.s\n"
- "incb x26, ALL, MUL #4\n"
- "smin z9.s, p4/M, z9.s, z20.s\n"
- "smax z8.s, p4/M, z8.s, z19.s\n"
- "smax z7.s, p4/M, z7.s, z19.s\n"
- "smax z6.s, p4/M, z6.s, z19.s\n"
- "trn1 z18.h, z11.h, z10.h\n"
- "smin z8.s, p4/M, z8.s, z20.s\n"
- "smin z7.s, p4/M, z7.s, z20.s\n"
- "smin z6.s, p4/M, z6.s, z20.s\n"
- "smax z5.s, p4/M, z5.s, z19.s\n"
+ "smin z10.s, p0/M, z10.s, z18.s\n"
+ "smin z9.s, p0/M, z9.s, z18.s\n"
+ "trn1 z17.h, z11.h, z10.h\n"
+ "st1b { z16.b }, p4, [%x[outptr], x27]\n"
+ "smin z8.s, p0/M, z8.s, z18.s\n"
+ "smin z7.s, p0/M, z7.s, z18.s\n"
"trn1 z16.h, z9.h, z8.h\n"
- "smax z4.s, p4/M, z4.s, z19.s\n"
+ "trn1 z16.b, z17.b, z16.b\n"
+ "smin z6.s, p0/M, z6.s, z18.s\n"
+ "smin z5.s, p0/M, z5.s, z18.s\n"
"trn1 z17.h, z7.h, z6.h\n"
- "trn1 z16.b, z18.b, z16.b\n"
- "st1b { z16.b }, p2, [%x[outptr], x25]\n"
- "smin z5.s, p4/M, z5.s, z20.s\n"
- "incb x25, ALL, MUL #4\n"
- "smin z4.s, p4/M, z4.s, z20.s\n"
- "smax z3.s, p4/M, z3.s, z19.s\n"
- "smax z2.s, p4/M, z2.s, z19.s\n"
- "smax z1.s, p4/M, z1.s, z19.s\n"
- "smax z0.s, p4/M, z0.s, z19.s\n"
+ "st1b { z16.b }, p3, [%x[outptr], x26]\n"
+ "smin z4.s, p0/M, z4.s, z18.s\n"
+ "smin z3.s, p0/M, z3.s, z18.s\n"
"trn1 z16.h, z5.h, z4.h\n"
- "smin z3.s, p4/M, z3.s, z20.s\n"
"trn1 z16.b, z17.b, z16.b\n"
- "st1b { z16.b }, p1, [%x[outptr], x24]\n"
- "smin z2.s, p4/M, z2.s, z20.s\n"
- "incb x24, ALL, MUL #4\n"
- "smin z1.s, p4/M, z1.s, z20.s\n"
- "smin z0.s, p4/M, z0.s, z20.s\n"
+ "smin z2.s, p0/M, z2.s, z18.s\n"
+ "smin z1.s, p0/M, z1.s, z18.s\n"
"trn1 z17.h, z3.h, z2.h\n"
+ "st1b { z16.b }, p2, [%x[outptr], x25]\n"
+ "smin z0.s, p0/M, z0.s, z18.s\n"
"trn1 z16.h, z1.h, z0.h\n"
"trn1 z16.b, z17.b, z16.b\n"
- "st1b { z16.b }, p0, [%x[outptr], x23]\n"
- "incb x23, ALL, MUL #4\n"
- "whilelt p0.b, x23, %x[n_channels]\n"
+ "st1b { z16.b }, p1, [%x[outptr], x24]\n"
+ "incb x24, ALL, MUL #4\n"
+ "whilelt p1.b, x24, %x[n_channels]\n"
+ "incb x27, ALL, MUL #4\n"
+ "incb x26, ALL, MUL #4\n"
+ "incb x25, ALL, MUL #4\n"
"b.any 1b\n"
"7:" // Single vector of channels
- "whilelt p3.b, x26, %x[n_channels]\n"
+ "whilelt p4.b, x27, %x[n_channels]\n"
"b.none 14f\n"
"8:" // Single vector of channels: Loop
+ "lsr x23, %x[n_valid_cells], #0x1\n"
"mov z15.s, #0x0\n"
- "mov x19, %x[inptrs]\n"
"mov z14.s, #0x0\n"
- "lsr x22, %x[n_valid_cells], #0x1\n"
+ "mov x22, %x[inptrs]\n"
"mov z13.s, #0x0\n"
"mov z12.s, #0x0\n"
- "cbz x22, 11f\n"
- "ldp x21, x20, [x19, #0x0]\n"
- "ld1b { z31.b }, p3/Z, [x21, x26]\n"
- "add x19, x19, #0x10\n"
- "ld1b { z30.b }, p3/Z, [x20, x26]\n"
- "subs x22, x22, #0x1\n"
+ "cbz x23, 11f\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "subs x23, x23, #0x1\n"
+ "add x22, x22, #0x10\n"
+ "ld1b { z31.b }, p4/Z, [x21, x27]\n"
+ "ld1b { z30.b }, p4/Z, [x20, x27]\n"
"beq 10f\n"
"9:" // Single vector of channels: Loop: 2 inputs loop
- ".inst 0x455e03f7 // saddlb z23.h, z31.b, z30.b\n"
- "ldp x21, x20, [x19, #0x0]\n"
- "add x19, x19, #0x10\n"
- ".inst 0x455e07f6 // saddlt z22.h, z31.b, z30.b\n"
- "ld1b { z31.b }, p3/Z, [x21, x26]\n"
- ".inst 0x459741ef // saddwb z15.s, z15.s, z23.h\n"
- "subs x22, x22, #0x1\n"
- ".inst 0x459745ce // saddwt z14.s, z14.s, z23.h\n"
- "ld1b { z30.b }, p3/Z, [x20, x26]\n"
- ".inst 0x459641ad // saddwb z13.s, z13.s, z22.h\n"
- ".inst 0x4596458c // saddwt z12.s, z12.s, z22.h\n"
+ ".inst 0x455e03f1 // saddlb z17.h, z31.b, z30.b\n"
+ ".inst 0x455e07f0 // saddlt z16.h, z31.b, z30.b\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "subs x23, x23, #0x1\n"
+ ".inst 0x459141ef // saddwb z15.s, z15.s, z17.h\n"
+ ".inst 0x459145ce // saddwt z14.s, z14.s, z17.h\n"
+ "add x22, x22, #0x10\n"
+ "ld1b { z31.b }, p4/Z, [x21, x27]\n"
+ ".inst 0x459041ad // saddwb z13.s, z13.s, z16.h\n"
+ ".inst 0x4590458c // saddwt z12.s, z12.s, z16.h\n"
+ "ld1b { z30.b }, p4/Z, [x20, x27]\n"
"bgt 9b\n"
"10:" // Single vector of channels: Loop: 2 inputs tail
- ".inst 0x455e03f7 // saddlb z23.h, z31.b, z30.b\n"
- ".inst 0x455e07f6 // saddlt z22.h, z31.b, z30.b\n"
- ".inst 0x459741ef // saddwb z15.s, z15.s, z23.h\n"
- ".inst 0x459745ce // saddwt z14.s, z14.s, z23.h\n"
- ".inst 0x459641ad // saddwb z13.s, z13.s, z22.h\n"
- ".inst 0x4596458c // saddwt z12.s, z12.s, z22.h\n"
+ ".inst 0x455e03f1 // saddlb z17.h, z31.b, z30.b\n"
+ ".inst 0x455e07f0 // saddlt z16.h, z31.b, z30.b\n"
+ ".inst 0x459141ef // saddwb z15.s, z15.s, z17.h\n"
+ ".inst 0x459145ce // saddwt z14.s, z14.s, z17.h\n"
+ ".inst 0x459041ad // saddwb z13.s, z13.s, z16.h\n"
+ ".inst 0x4590458c // saddwt z12.s, z12.s, z16.h\n"
"11:" // Single vector of channels: Loop: After loop
- "ands x20, %x[n_valid_cells], #0x1\n"
+ "ands x21, %x[n_valid_cells], #0x1\n"
"beq 13f\n"
"12:" // Single vector of channels: Loop: Single input loop
- "ldr x21, [x19], #0x8\n"
- "subs x20, x20, #0x1\n"
- "ld1b { z31.b }, p3/Z, [x21, x26]\n"
- ".inst 0x4508a3f1 // sshllb z17.h, z31.b, #0x0\n"
- ".inst 0x4508a7f0 // sshllt z16.h, z31.b, #0x0\n"
+ "ldr x20, [x22], #0x8\n"
+ "ld1b { z16.b }, p4/Z, [x20, x27]\n"
+ ".inst 0x4508a211 // sshllb z17.h, z16.b, #0x0\n"
+ ".inst 0x4508a610 // sshllt z16.h, z16.b, #0x0\n"
+ "subs x21, x21, #0x1\n"
".inst 0x459141ef // saddwb z15.s, z15.s, z17.h\n"
".inst 0x459145ce // saddwt z14.s, z14.s, z17.h\n"
".inst 0x459041ad // saddwb z13.s, z13.s, z16.h\n"
".inst 0x4590458c // saddwt z12.s, z12.s, z16.h\n"
"bgt 12b\n"
"13:" // Single vector of channels: Loop: Single input loop: End
- "mov z20.s, #0x7f\n"
- "ld1rw { z18.s }, p4/Z, [%x[combined_rescale_value]]\n"
- "ld1rw { z17.s }, p4/Z, [%x[left_shift]]\n"
- "not z19.s, p4/M, z20.s\n"
- "ld1rw { z16.s }, p4/Z, [%x[right_shift]]\n"
- ".inst 0x4482922f // srshl z15.s, p4/M, z15.s, z17.s\n"
- ".inst 0x4482922e // srshl z14.s, p4/M, z14.s, z17.s\n"
- ".inst 0x4482922d // srshl z13.s, p4/M, z13.s, z17.s\n"
- ".inst 0x4482922c // srshl z12.s, p4/M, z12.s, z17.s\n"
- ".inst 0x04b275ef // sqrdmulh z15.s, z15.s, z18.s\n"
- ".inst 0x04b275ce // sqrdmulh z14.s, z14.s, z18.s\n"
- ".inst 0x04b275ad // sqrdmulh z13.s, z13.s, z18.s\n"
- ".inst 0x04b2758c // sqrdmulh z12.s, z12.s, z18.s\n"
- ".inst 0x4482920f // srshl z15.s, p4/M, z15.s, z16.s\n"
- ".inst 0x4482920e // srshl z14.s, p4/M, z14.s, z16.s\n"
- ".inst 0x4482920d // srshl z13.s, p4/M, z13.s, z16.s\n"
- ".inst 0x4482920c // srshl z12.s, p4/M, z12.s, z16.s\n"
- "smax z15.s, p4/M, z15.s, z19.s\n"
- "smax z14.s, p4/M, z14.s, z19.s\n"
- "smax z13.s, p4/M, z13.s, z19.s\n"
- "smax z12.s, p4/M, z12.s, z19.s\n"
- "smin z15.s, p4/M, z15.s, z20.s\n"
- "smin z14.s, p4/M, z14.s, z20.s\n"
- "smin z13.s, p4/M, z13.s, z20.s\n"
- "smin z12.s, p4/M, z12.s, z20.s\n"
+ "ld1rw { z16.s }, p0/Z, [%x[left_shift]]\n"
+ "ld1rw { z17.s }, p0/Z, [%x[combined_rescale_value]]\n"
+ ".inst 0x4482820f // srshl z15.s, p0/M, z15.s, z16.s\n"
+ ".inst 0x4482820e // srshl z14.s, p0/M, z14.s, z16.s\n"
+ ".inst 0x4482820d // srshl z13.s, p0/M, z13.s, z16.s\n"
+ ".inst 0x4482820c // srshl z12.s, p0/M, z12.s, z16.s\n"
+ "ld1rw { z16.s }, p0/Z, [%x[right_shift]]\n"
+ ".inst 0x04b175ef // sqrdmulh z15.s, z15.s, z17.s\n"
+ ".inst 0x04b175ce // sqrdmulh z14.s, z14.s, z17.s\n"
+ ".inst 0x04b175ad // sqrdmulh z13.s, z13.s, z17.s\n"
+ ".inst 0x4482820f // srshl z15.s, p0/M, z15.s, z16.s\n"
+ ".inst 0x4482820e // srshl z14.s, p0/M, z14.s, z16.s\n"
+ ".inst 0x04b1758c // sqrdmulh z12.s, z12.s, z17.s\n"
+ "mov z18.s, #0x7f\n"
+ ".inst 0x4482820d // srshl z13.s, p0/M, z13.s, z16.s\n"
+ ".inst 0x4482820c // srshl z12.s, p0/M, z12.s, z16.s\n"
+ "not z16.s, p0/M, z18.s\n"
+ "smax z15.s, p0/M, z15.s, z16.s\n"
+ "smax z14.s, p0/M, z14.s, z16.s\n"
+ "smax z13.s, p0/M, z13.s, z16.s\n"
+ "smax z12.s, p0/M, z12.s, z16.s\n"
+ "smin z15.s, p0/M, z15.s, z18.s\n"
+ "smin z14.s, p0/M, z14.s, z18.s\n"
+ "smin z13.s, p0/M, z13.s, z18.s\n"
"trn1 z17.h, z15.h, z14.h\n"
+ "smin z12.s, p0/M, z12.s, z18.s\n"
"trn1 z16.h, z13.h, z12.h\n"
"trn1 z16.b, z17.b, z16.b\n"
- "st1b { z16.b }, p3, [%x[outptr], x26]\n"
- "incb x26\n"
- "whilelt p3.b, x26, %x[n_channels]\n"
+ "st1b { z16.b }, p4, [%x[outptr], x27]\n"
+ "incb x27\n"
+ "whilelt p4.b, x27, %x[n_channels]\n"
"b.any 8b\n"
"14:" // End
-
:
: [combined_rescale_value] "r" (&combined_rescale_value), [inptrs] "r" (inptrs), [left_shift] "r" (&left_shift), [n_channels] "r" (n_channels), [n_valid_cells] "r" (n_valid_cells), [outptr] "r" (outptr), [right_shift] "r" (&right_shift)
- : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
} // namespace pooling
} // namespace arm_conv
-#endif // defined(__ARM_FEATURE_SVE) && defined(SVE2)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8q_nhwc_max_generic_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8q_nhwc_max_generic_depthfirst.hpp
index 84aa0d3d6b..fc06ed09f6 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8q_nhwc_max_generic_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8q_nhwc_max_generic_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -26,29 +26,21 @@
#pragma once
-#if defined(__ARM_FEATURE_SVE) && defined(SVE2)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
namespace arm_conv {
namespace pooling {
void sve_s8q_nhwc_max_generic_depthfirst_impl(const uint64_t, const uint64_t n_valid_cells, uint64_t n_channels, const int8_t *const *const inptrs, int8_t *outptr, const Requantize32 &qp);
-struct sve_s8q_nhwc_max_generic_depthfirst
+struct sve_s8q_nhwc_max_generic_depthfirst : IGenericDepthfirstStrategy<int8_t, int8_t, Requantize32>
{
- typedef int8_t operand_type;
- typedef int8_t return_type;
-
- typedef void (*kern_type)(const uint64_t, const uint64_t n_valid_cells, uint64_t n_channels, const int8_t *const *const inptrs, int8_t *outptr, const Requantize32 &qp);
-
- constexpr static PoolingType pooling_type(void) { return PoolingType::MAX; }
-
-
- kern_type kernel = sve_s8q_nhwc_max_generic_depthfirst_impl;
-
+ using Parent = IGenericDepthfirstStrategy<int8_t, int8_t, Requantize32>;
sve_s8q_nhwc_max_generic_depthfirst(const CPUInfo *) {}
+ typename Parent::KernelType get_kernel(void) const override { return sve_s8q_nhwc_max_generic_depthfirst_impl; }
};
} // namespace pooling
} // namespace arm_conv
-#endif // defined(__ARM_FEATURE_SVE) && defined(SVE2)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8q_nhwc_max_generic_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8q_nhwc_max_generic_depthfirst/generic.cpp
index 3717f8cb30..4fc1532d5a 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8q_nhwc_max_generic_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8q_nhwc_max_generic_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,8 +24,9 @@
#include "pooling.hpp"
#include <cstdint>
+#include <cstddef>
-#if defined(__ARM_FEATURE_SVE) && defined(SVE2)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
namespace arm_conv {
namespace pooling {
@@ -41,346 +42,345 @@ void sve_s8q_nhwc_max_generic_depthfirst_impl(
)
{
__asm__ __volatile__(
- "ptrue p4.b\n"
- "mov x28, #0x0\n"
- "cntb x27\n"
- "cntb x26, ALL, MUL #2\n"
- "cntb x25, ALL, MUL #3\n"
+ "mov x9, #0x0\n"
+ "cntb x28\n"
+ "cntb x27, ALL, MUL #2\n"
+ "cntb x26, ALL, MUL #3\n"
+ "whilelt p4.b, x9, %x[n_channels]\n"
"whilelt p3.b, x28, %x[n_channels]\n"
"whilelt p2.b, x27, %x[n_channels]\n"
"whilelt p1.b, x26, %x[n_channels]\n"
- "whilelt p0.b, x25, %x[n_channels]\n"
+ "ptrue p0.b\n"
"b.none 7f\n"
"1:" // 4-vectors of channels
+ "lsr x25, %x[n_valid_cells], #0x2\n"
"mov z8.b, #0x80\n"
- "mov x19, %x[inptrs]\n"
"mov z7.b, #0x80\n"
- "lsr x24, %x[n_valid_cells], #0x2\n"
+ "mov x24, %x[inptrs]\n"
"mov z6.b, #0x80\n"
"mov z5.b, #0x80\n"
- "cbz x24, 4f\n"
- "ldp x23, x22, [x19, #0x0]\n"
- "ldp x21, x20, [x19, #0x10]\n"
- "add x19, x19, #0x20\n"
- "subs x24, x24, #0x1\n"
- "ld1b { z3.b }, p3/Z, [x23, x28]\n"
- "ld1b { z2.b }, p3/Z, [x22, x28]\n"
- "ld1b { z1.b }, p3/Z, [x21, x28]\n"
- "ld1b { z0.b }, p3/Z, [x20, x28]\n"
- "ld1b { z31.b }, p2/Z, [x23, x27]\n"
- "ld1b { z30.b }, p2/Z, [x22, x27]\n"
- "ld1b { z22.b }, p2/Z, [x21, x27]\n"
- "ld1b { z29.b }, p2/Z, [x20, x27]\n"
- "ld1b { z28.b }, p1/Z, [x23, x26]\n"
- "ld1b { z27.b }, p1/Z, [x22, x26]\n"
- "ld1b { z21.b }, p1/Z, [x21, x26]\n"
- "ld1b { z26.b }, p1/Z, [x20, x26]\n"
- "ld1b { z16.b }, p0/Z, [x23, x25]\n"
- "ld1b { z25.b }, p0/Z, [x22, x25]\n"
- "ld1b { z20.b }, p0/Z, [x21, x25]\n"
- "ld1b { z24.b }, p0/Z, [x20, x25]\n"
+ "cbz x25, 4f\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "subs x25, x25, #0x1\n"
+ "add x24, x24, #0x20\n"
+ "ld1b { z4.b }, p4/Z, [x23, x9]\n"
+ "ld1b { z3.b }, p4/Z, [x22, x9]\n"
+ "ld1b { z2.b }, p4/Z, [x21, x9]\n"
+ "ld1b { z1.b }, p4/Z, [x20, x9]\n"
+ "ld1b { z0.b }, p3/Z, [x23, x28]\n"
+ "ld1b { z31.b }, p3/Z, [x22, x28]\n"
+ "ld1b { z22.b }, p3/Z, [x21, x28]\n"
+ "ld1b { z30.b }, p3/Z, [x20, x28]\n"
+ "ld1b { z29.b }, p2/Z, [x23, x27]\n"
+ "ld1b { z28.b }, p2/Z, [x22, x27]\n"
+ "ld1b { z21.b }, p2/Z, [x21, x27]\n"
+ "ld1b { z27.b }, p2/Z, [x20, x27]\n"
+ "ld1b { z26.b }, p1/Z, [x23, x26]\n"
+ "ld1b { z25.b }, p1/Z, [x22, x26]\n"
+ "ld1b { z20.b }, p1/Z, [x21, x26]\n"
+ "ld1b { z24.b }, p1/Z, [x20, x26]\n"
"beq 3f\n"
"2:" // 4-vectors of channels: 4 inputs loop
- "movprfx z19, z3\n smax z19.b, p4/M, z19.b, z2.b\n"
- "ldp x23, x22, [x19, #0x0]\n"
- "subs x24, x24, #0x1\n"
- "movprfx z23, z1\n smax z23.b, p4/M, z23.b, z0.b\n"
- "ldp x21, x20, [x19, #0x10]\n"
- "add x19, x19, #0x20\n"
- "movprfx z18, z31\n smax z18.b, p4/M, z18.b, z30.b\n"
- "ld1b { z3.b }, p3/Z, [x23, x28]\n"
- "smax z22.b, p4/M, z22.b, z29.b\n"
- "movprfx z17, z28\n smax z17.b, p4/M, z17.b, z27.b\n"
- "ld1b { z2.b }, p3/Z, [x22, x28]\n"
- "smax z21.b, p4/M, z21.b, z26.b\n"
- "ld1b { z1.b }, p3/Z, [x21, x28]\n"
- "smax z16.b, p4/M, z16.b, z25.b\n"
- "ld1b { z0.b }, p3/Z, [x20, x28]\n"
- "smax z20.b, p4/M, z20.b, z24.b\n"
- "ld1b { z31.b }, p2/Z, [x23, x27]\n"
- "smax z19.b, p4/M, z19.b, z23.b\n"
- "ld1b { z30.b }, p2/Z, [x22, x27]\n"
- "smax z18.b, p4/M, z18.b, z22.b\n"
- "ld1b { z22.b }, p2/Z, [x21, x27]\n"
- "smax z17.b, p4/M, z17.b, z21.b\n"
- "ld1b { z29.b }, p2/Z, [x20, x27]\n"
- "smax z16.b, p4/M, z16.b, z20.b\n"
- "ld1b { z28.b }, p1/Z, [x23, x26]\n"
- "smax z8.b, p4/M, z8.b, z19.b\n"
- "ld1b { z27.b }, p1/Z, [x22, x26]\n"
- "smax z7.b, p4/M, z7.b, z18.b\n"
- "ld1b { z21.b }, p1/Z, [x21, x26]\n"
- "smax z6.b, p4/M, z6.b, z17.b\n"
- "ld1b { z26.b }, p1/Z, [x20, x26]\n"
- "smax z5.b, p4/M, z5.b, z16.b\n"
- "ld1b { z16.b }, p0/Z, [x23, x25]\n"
- "ld1b { z25.b }, p0/Z, [x22, x25]\n"
- "ld1b { z20.b }, p0/Z, [x21, x25]\n"
- "ld1b { z24.b }, p0/Z, [x20, x25]\n"
+ "movprfx z19, z4\n smax z19.b, p0/M, z19.b, z3.b\n"
+ "movprfx z23, z2\n smax z23.b, p0/M, z23.b, z1.b\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "movprfx z18, z0\n smax z18.b, p0/M, z18.b, z31.b\n"
+ "smax z22.b, p0/M, z22.b, z30.b\n"
+ "ld1b { z4.b }, p4/Z, [x23, x9]\n"
+ "ld1b { z3.b }, p4/Z, [x22, x9]\n"
+ "movprfx z17, z29\n smax z17.b, p0/M, z17.b, z28.b\n"
+ "smax z21.b, p0/M, z21.b, z27.b\n"
+ "ld1b { z2.b }, p4/Z, [x21, x9]\n"
+ "ld1b { z1.b }, p4/Z, [x20, x9]\n"
+ "movprfx z16, z26\n smax z16.b, p0/M, z16.b, z25.b\n"
+ "smax z20.b, p0/M, z20.b, z24.b\n"
+ "ld1b { z0.b }, p3/Z, [x23, x28]\n"
+ "ld1b { z31.b }, p3/Z, [x22, x28]\n"
+ "smax z19.b, p0/M, z19.b, z23.b\n"
+ "smax z18.b, p0/M, z18.b, z22.b\n"
+ "ld1b { z22.b }, p3/Z, [x21, x28]\n"
+ "ld1b { z30.b }, p3/Z, [x20, x28]\n"
+ "smax z17.b, p0/M, z17.b, z21.b\n"
+ "smax z16.b, p0/M, z16.b, z20.b\n"
+ "ld1b { z29.b }, p2/Z, [x23, x27]\n"
+ "ld1b { z28.b }, p2/Z, [x22, x27]\n"
+ "subs x25, x25, #0x1\n"
+ "smax z8.b, p0/M, z8.b, z19.b\n"
+ "ld1b { z21.b }, p2/Z, [x21, x27]\n"
+ "ld1b { z27.b }, p2/Z, [x20, x27]\n"
+ "smax z7.b, p0/M, z7.b, z18.b\n"
+ "smax z6.b, p0/M, z6.b, z17.b\n"
+ "ld1b { z26.b }, p1/Z, [x23, x26]\n"
+ "ld1b { z25.b }, p1/Z, [x22, x26]\n"
+ "smax z5.b, p0/M, z5.b, z16.b\n"
+ "add x24, x24, #0x20\n"
+ "ld1b { z20.b }, p1/Z, [x21, x26]\n"
+ "ld1b { z24.b }, p1/Z, [x20, x26]\n"
"bgt 2b\n"
"3:" // 4-vectors of channels: 4 inputs tail
- "movprfx z19, z3\n smax z19.b, p4/M, z19.b, z2.b\n"
- "movprfx z23, z1\n smax z23.b, p4/M, z23.b, z0.b\n"
- "movprfx z18, z31\n smax z18.b, p4/M, z18.b, z30.b\n"
- "smax z22.b, p4/M, z22.b, z29.b\n"
- "movprfx z17, z28\n smax z17.b, p4/M, z17.b, z27.b\n"
- "smax z21.b, p4/M, z21.b, z26.b\n"
- "smax z16.b, p4/M, z16.b, z25.b\n"
- "smax z20.b, p4/M, z20.b, z24.b\n"
- "smax z19.b, p4/M, z19.b, z23.b\n"
- "smax z18.b, p4/M, z18.b, z22.b\n"
- "smax z17.b, p4/M, z17.b, z21.b\n"
- "smax z16.b, p4/M, z16.b, z20.b\n"
- "smax z8.b, p4/M, z8.b, z19.b\n"
- "smax z7.b, p4/M, z7.b, z18.b\n"
- "smax z6.b, p4/M, z6.b, z17.b\n"
- "smax z5.b, p4/M, z5.b, z16.b\n"
+ "movprfx z19, z4\n smax z19.b, p0/M, z19.b, z3.b\n"
+ "movprfx z23, z2\n smax z23.b, p0/M, z23.b, z1.b\n"
+ "movprfx z18, z0\n smax z18.b, p0/M, z18.b, z31.b\n"
+ "smax z22.b, p0/M, z22.b, z30.b\n"
+ "movprfx z17, z29\n smax z17.b, p0/M, z17.b, z28.b\n"
+ "smax z21.b, p0/M, z21.b, z27.b\n"
+ "movprfx z16, z26\n smax z16.b, p0/M, z16.b, z25.b\n"
+ "smax z20.b, p0/M, z20.b, z24.b\n"
+ "smax z19.b, p0/M, z19.b, z23.b\n"
+ "smax z18.b, p0/M, z18.b, z22.b\n"
+ "smax z17.b, p0/M, z17.b, z21.b\n"
+ "smax z16.b, p0/M, z16.b, z20.b\n"
+ "smax z8.b, p0/M, z8.b, z19.b\n"
+ "smax z7.b, p0/M, z7.b, z18.b\n"
+ "smax z6.b, p0/M, z6.b, z17.b\n"
+ "smax z5.b, p0/M, z5.b, z16.b\n"
"4:" // 4-vectors of channels: After loop
- "ands x20, %x[n_valid_cells], #0x3\n"
+ "ands x21, %x[n_valid_cells], #0x3\n"
"beq 6f\n"
"5:" // 4-vectors of channels: Single input loop
- "ldr x23, [x19], #0x8\n"
- "subs x20, x20, #0x1\n"
- "ld1b { z3.b }, p3/Z, [x23, x28]\n"
- "smax z8.b, p4/M, z8.b, z3.b\n"
- "ld1b { z31.b }, p2/Z, [x23, x27]\n"
- "ld1b { z28.b }, p1/Z, [x23, x26]\n"
- "smax z7.b, p4/M, z7.b, z31.b\n"
- "ld1b { z16.b }, p0/Z, [x23, x25]\n"
- "smax z6.b, p4/M, z6.b, z28.b\n"
- "smax z5.b, p4/M, z5.b, z16.b\n"
+ "ldr x20, [x24], #0x8\n"
+ "ld1b { z16.b }, p4/Z, [x20, x9]\n"
+ "subs x21, x21, #0x1\n"
+ "smax z8.b, p0/M, z8.b, z16.b\n"
+ "ld1b { z17.b }, p3/Z, [x20, x28]\n"
+ "ld1b { z16.b }, p2/Z, [x20, x27]\n"
+ "smax z7.b, p0/M, z7.b, z17.b\n"
+ "smax z6.b, p0/M, z6.b, z16.b\n"
+ "ld1b { z16.b }, p1/Z, [x20, x26]\n"
+ "smax z5.b, p0/M, z5.b, z16.b\n"
"bgt 5b\n"
"6:" // 4-vectors of channels: Single input loop: End
- "mov z4.s, #0x7f\n"
- "add x19, %x[quant_params], %[offsetof_qp_per_layer_mul]\n"
- "ld1rw { z3.s }, p4/Z, [x19]\n"
".inst 0x4508a111 // sshllb z17.h, z8.b, #0x0\n"
- "add x19, %x[quant_params], %[offsetof_qp_per_layer_left_shift]\n"
- ".inst 0x4508a510 // sshllt z16.h, z8.b, #0x0\n"
- "ld1rw { z2.s }, p4/Z, [x19]\n"
- "add x19, %x[quant_params], %[offsetof_qp_per_layer_right_shift]\n"
- ".inst 0x4508a0f2 // sshllb z18.h, z7.b, #0x0\n"
- "ld1rw { z1.s }, p4/Z, [x19]\n"
- ".inst 0x4508a4f7 // sshllt z23.h, z7.b, #0x0\n"
- ".inst 0x4508a0d6 // sshllb z22.h, z6.b, #0x0\n"
- ".inst 0x4508a4d5 // sshllt z21.h, z6.b, #0x0\n"
- ".inst 0x4508a0b4 // sshllb z20.h, z5.b, #0x0\n"
- ".inst 0x4508a4b3 // sshllt z19.h, z5.b, #0x0\n"
- ".inst 0x4510a220 // sshllb z0.s, z17.h, #0x0\n"
+ ".inst 0x4508a517 // sshllt z23.h, z8.b, #0x0\n"
+ "add x20, %x[quant_params], %[offsetof_qp_per_layer_left_shift]\n"
+ "ld1rw { z4.s }, p0/Z, [x20]\n"
+ ".inst 0x4508a0f6 // sshllb z22.h, z7.b, #0x0\n"
+ ".inst 0x4508a4f5 // sshllt z21.h, z7.b, #0x0\n"
+ "add x20, %x[quant_params], %[offsetof_qp_per_layer_mul]\n"
+ "ld1rw { z3.s }, p0/Z, [x20]\n"
+ ".inst 0x4508a0d4 // sshllb z20.h, z6.b, #0x0\n"
+ ".inst 0x4508a4d3 // sshllt z19.h, z6.b, #0x0\n"
+ "add x20, %x[quant_params], %[offsetof_qp_per_layer_right_shift]\n"
+ "ld1rw { z2.s }, p0/Z, [x20]\n"
+ ".inst 0x4508a0b2 // sshllb z18.h, z5.b, #0x0\n"
+ ".inst 0x4508a4b0 // sshllt z16.h, z5.b, #0x0\n"
+ ".inst 0x4510a221 // sshllb z1.s, z17.h, #0x0\n"
".inst 0x4510a631 // sshllt z17.s, z17.h, #0x0\n"
- ".inst 0x4510a21f // sshllb z31.s, z16.h, #0x0\n"
- ".inst 0x4510a610 // sshllt z16.s, z16.h, #0x0\n"
- ".inst 0x4510a25e // sshllb z30.s, z18.h, #0x0\n"
- ".inst 0x4510a652 // sshllt z18.s, z18.h, #0x0\n"
- ".inst 0x4510a2fd // sshllb z29.s, z23.h, #0x0\n"
- ".inst 0x4510a6fc // sshllt z28.s, z23.h, #0x0\n"
- ".inst 0x4510a2db // sshllb z27.s, z22.h, #0x0\n"
- ".inst 0x4510a6da // sshllt z26.s, z22.h, #0x0\n"
- ".inst 0x4510a2b9 // sshllb z25.s, z21.h, #0x0\n"
- ".inst 0x4510a6b8 // sshllt z24.s, z21.h, #0x0\n"
- ".inst 0x4510a297 // sshllb z23.s, z20.h, #0x0\n"
- ".inst 0x4510a696 // sshllt z22.s, z20.h, #0x0\n"
- ".inst 0x4510a275 // sshllb z21.s, z19.h, #0x0\n"
- ".inst 0x4510a674 // sshllt z20.s, z19.h, #0x0\n"
- ".inst 0x44829040 // srshl z0.s, p4/M, z0.s, z2.s\n"
- ".inst 0x44829051 // srshl z17.s, p4/M, z17.s, z2.s\n"
- ".inst 0x4482905f // srshl z31.s, p4/M, z31.s, z2.s\n"
- ".inst 0x44829050 // srshl z16.s, p4/M, z16.s, z2.s\n"
- ".inst 0x04a37400 // sqrdmulh z0.s, z0.s, z3.s\n"
+ ".inst 0x44828081 // srshl z1.s, p0/M, z1.s, z4.s\n"
+ ".inst 0x44828091 // srshl z17.s, p0/M, z17.s, z4.s\n"
+ ".inst 0x4510a2e0 // sshllb z0.s, z23.h, #0x0\n"
+ ".inst 0x4510a6ff // sshllt z31.s, z23.h, #0x0\n"
+ ".inst 0x44828080 // srshl z0.s, p0/M, z0.s, z4.s\n"
+ ".inst 0x4482809f // srshl z31.s, p0/M, z31.s, z4.s\n"
+ ".inst 0x4510a2de // sshllb z30.s, z22.h, #0x0\n"
+ ".inst 0x4510a6dd // sshllt z29.s, z22.h, #0x0\n"
+ ".inst 0x4482809e // srshl z30.s, p0/M, z30.s, z4.s\n"
+ ".inst 0x4482809d // srshl z29.s, p0/M, z29.s, z4.s\n"
+ ".inst 0x4510a2bc // sshllb z28.s, z21.h, #0x0\n"
+ ".inst 0x4510a6bb // sshllt z27.s, z21.h, #0x0\n"
+ ".inst 0x4482809c // srshl z28.s, p0/M, z28.s, z4.s\n"
+ ".inst 0x4482809b // srshl z27.s, p0/M, z27.s, z4.s\n"
+ ".inst 0x4510a29a // sshllb z26.s, z20.h, #0x0\n"
+ ".inst 0x4510a699 // sshllt z25.s, z20.h, #0x0\n"
+ ".inst 0x4482809a // srshl z26.s, p0/M, z26.s, z4.s\n"
+ ".inst 0x44828099 // srshl z25.s, p0/M, z25.s, z4.s\n"
+ ".inst 0x4510a278 // sshllb z24.s, z19.h, #0x0\n"
+ ".inst 0x4510a677 // sshllt z23.s, z19.h, #0x0\n"
+ ".inst 0x44828098 // srshl z24.s, p0/M, z24.s, z4.s\n"
+ ".inst 0x44828097 // srshl z23.s, p0/M, z23.s, z4.s\n"
+ ".inst 0x4510a256 // sshllb z22.s, z18.h, #0x0\n"
+ ".inst 0x4510a655 // sshllt z21.s, z18.h, #0x0\n"
+ ".inst 0x44828096 // srshl z22.s, p0/M, z22.s, z4.s\n"
+ ".inst 0x44828095 // srshl z21.s, p0/M, z21.s, z4.s\n"
+ ".inst 0x4510a214 // sshllb z20.s, z16.h, #0x0\n"
+ ".inst 0x4510a613 // sshllt z19.s, z16.h, #0x0\n"
+ ".inst 0x44828094 // srshl z20.s, p0/M, z20.s, z4.s\n"
+ ".inst 0x44828093 // srshl z19.s, p0/M, z19.s, z4.s\n"
+ ".inst 0x04a37421 // sqrdmulh z1.s, z1.s, z3.s\n"
".inst 0x04a37631 // sqrdmulh z17.s, z17.s, z3.s\n"
+ ".inst 0x44828041 // srshl z1.s, p0/M, z1.s, z2.s\n"
+ ".inst 0x44828051 // srshl z17.s, p0/M, z17.s, z2.s\n"
+ ".inst 0x04a37400 // sqrdmulh z0.s, z0.s, z3.s\n"
".inst 0x04a377ff // sqrdmulh z31.s, z31.s, z3.s\n"
- ".inst 0x04a37610 // sqrdmulh z16.s, z16.s, z3.s\n"
- ".inst 0x44829020 // srshl z0.s, p4/M, z0.s, z1.s\n"
- ".inst 0x44829031 // srshl z17.s, p4/M, z17.s, z1.s\n"
- ".inst 0x4482903f // srshl z31.s, p4/M, z31.s, z1.s\n"
- ".inst 0x44829030 // srshl z16.s, p4/M, z16.s, z1.s\n"
- ".inst 0x4482905e // srshl z30.s, p4/M, z30.s, z2.s\n"
- ".inst 0x44829052 // srshl z18.s, p4/M, z18.s, z2.s\n"
- ".inst 0x4482905d // srshl z29.s, p4/M, z29.s, z2.s\n"
- ".inst 0x4482905c // srshl z28.s, p4/M, z28.s, z2.s\n"
+ ".inst 0x44828040 // srshl z0.s, p0/M, z0.s, z2.s\n"
+ ".inst 0x4482805f // srshl z31.s, p0/M, z31.s, z2.s\n"
".inst 0x04a377de // sqrdmulh z30.s, z30.s, z3.s\n"
- ".inst 0x04a37652 // sqrdmulh z18.s, z18.s, z3.s\n"
".inst 0x04a377bd // sqrdmulh z29.s, z29.s, z3.s\n"
+ ".inst 0x4482805e // srshl z30.s, p0/M, z30.s, z2.s\n"
+ ".inst 0x4482805d // srshl z29.s, p0/M, z29.s, z2.s\n"
".inst 0x04a3779c // sqrdmulh z28.s, z28.s, z3.s\n"
- ".inst 0x4482903e // srshl z30.s, p4/M, z30.s, z1.s\n"
- ".inst 0x44829032 // srshl z18.s, p4/M, z18.s, z1.s\n"
- ".inst 0x4482903d // srshl z29.s, p4/M, z29.s, z1.s\n"
- ".inst 0x4482903c // srshl z28.s, p4/M, z28.s, z1.s\n"
- ".inst 0x4482905b // srshl z27.s, p4/M, z27.s, z2.s\n"
- ".inst 0x4482905a // srshl z26.s, p4/M, z26.s, z2.s\n"
- ".inst 0x44829059 // srshl z25.s, p4/M, z25.s, z2.s\n"
- ".inst 0x44829058 // srshl z24.s, p4/M, z24.s, z2.s\n"
".inst 0x04a3777b // sqrdmulh z27.s, z27.s, z3.s\n"
+ ".inst 0x4482805c // srshl z28.s, p0/M, z28.s, z2.s\n"
+ ".inst 0x4482805b // srshl z27.s, p0/M, z27.s, z2.s\n"
".inst 0x04a3775a // sqrdmulh z26.s, z26.s, z3.s\n"
".inst 0x04a37739 // sqrdmulh z25.s, z25.s, z3.s\n"
+ ".inst 0x4482805a // srshl z26.s, p0/M, z26.s, z2.s\n"
+ ".inst 0x44828059 // srshl z25.s, p0/M, z25.s, z2.s\n"
".inst 0x04a37718 // sqrdmulh z24.s, z24.s, z3.s\n"
- ".inst 0x4482903b // srshl z27.s, p4/M, z27.s, z1.s\n"
- ".inst 0x4482903a // srshl z26.s, p4/M, z26.s, z1.s\n"
- ".inst 0x44829039 // srshl z25.s, p4/M, z25.s, z1.s\n"
- ".inst 0x44829038 // srshl z24.s, p4/M, z24.s, z1.s\n"
- ".inst 0x44829057 // srshl z23.s, p4/M, z23.s, z2.s\n"
- ".inst 0x44829056 // srshl z22.s, p4/M, z22.s, z2.s\n"
- ".inst 0x44829055 // srshl z21.s, p4/M, z21.s, z2.s\n"
- ".inst 0x44829054 // srshl z20.s, p4/M, z20.s, z2.s\n"
".inst 0x04a376f7 // sqrdmulh z23.s, z23.s, z3.s\n"
+ ".inst 0x44828058 // srshl z24.s, p0/M, z24.s, z2.s\n"
+ ".inst 0x44828057 // srshl z23.s, p0/M, z23.s, z2.s\n"
".inst 0x04a376d6 // sqrdmulh z22.s, z22.s, z3.s\n"
".inst 0x04a376b5 // sqrdmulh z21.s, z21.s, z3.s\n"
+ ".inst 0x44828056 // srshl z22.s, p0/M, z22.s, z2.s\n"
+ ".inst 0x44828055 // srshl z21.s, p0/M, z21.s, z2.s\n"
".inst 0x04a37694 // sqrdmulh z20.s, z20.s, z3.s\n"
- ".inst 0x44829037 // srshl z23.s, p4/M, z23.s, z1.s\n"
- ".inst 0x44829036 // srshl z22.s, p4/M, z22.s, z1.s\n"
- ".inst 0x44829035 // srshl z21.s, p4/M, z21.s, z1.s\n"
- ".inst 0x44829034 // srshl z20.s, p4/M, z20.s, z1.s\n"
- "not z19.s, p4/M, z4.s\n"
- "smax z0.s, p4/M, z0.s, z19.s\n"
- "smax z17.s, p4/M, z17.s, z19.s\n"
- "smax z31.s, p4/M, z31.s, z19.s\n"
- "smax z16.s, p4/M, z16.s, z19.s\n"
- "smin z0.s, p4/M, z0.s, z4.s\n"
- "smin z17.s, p4/M, z17.s, z4.s\n"
- "smin z31.s, p4/M, z31.s, z4.s\n"
- "smin z16.s, p4/M, z16.s, z4.s\n"
- "smax z30.s, p4/M, z30.s, z19.s\n"
- "trn1 z17.h, z0.h, z17.h\n"
- "smax z18.s, p4/M, z18.s, z19.s\n"
- "trn1 z16.h, z31.h, z16.h\n"
- "smin z30.s, p4/M, z30.s, z4.s\n"
+ ".inst 0x04a37673 // sqrdmulh z19.s, z19.s, z3.s\n"
+ ".inst 0x44828054 // srshl z20.s, p0/M, z20.s, z2.s\n"
+ ".inst 0x44828053 // srshl z19.s, p0/M, z19.s, z2.s\n"
+ "mov z18.s, #0x7f\n"
+ "not z16.s, p0/M, z18.s\n"
+ "smax z1.s, p0/M, z1.s, z16.s\n"
+ "smax z17.s, p0/M, z17.s, z16.s\n"
+ "smax z0.s, p0/M, z0.s, z16.s\n"
+ "smax z31.s, p0/M, z31.s, z16.s\n"
+ "smax z30.s, p0/M, z30.s, z16.s\n"
+ "smax z29.s, p0/M, z29.s, z16.s\n"
+ "smax z28.s, p0/M, z28.s, z16.s\n"
+ "smax z27.s, p0/M, z27.s, z16.s\n"
+ "smax z26.s, p0/M, z26.s, z16.s\n"
+ "smax z25.s, p0/M, z25.s, z16.s\n"
+ "smax z24.s, p0/M, z24.s, z16.s\n"
+ "smax z23.s, p0/M, z23.s, z16.s\n"
+ "smax z22.s, p0/M, z22.s, z16.s\n"
+ "smax z21.s, p0/M, z21.s, z16.s\n"
+ "smax z20.s, p0/M, z20.s, z16.s\n"
+ "smax z19.s, p0/M, z19.s, z16.s\n"
+ "smin z1.s, p0/M, z1.s, z18.s\n"
+ "smin z17.s, p0/M, z17.s, z18.s\n"
+ "smin z0.s, p0/M, z0.s, z18.s\n"
+ "trn1 z17.h, z1.h, z17.h\n"
+ "smin z31.s, p0/M, z31.s, z18.s\n"
+ "smin z30.s, p0/M, z30.s, z18.s\n"
+ "trn1 z16.h, z0.h, z31.h\n"
+ "trn1 z16.b, z17.b, z16.b\n"
+ "smin z29.s, p0/M, z29.s, z18.s\n"
+ "smin z28.s, p0/M, z28.s, z18.s\n"
+ "trn1 z17.h, z30.h, z29.h\n"
+ "st1b { z16.b }, p4, [%x[outptr], x9]\n"
+ "smin z27.s, p0/M, z27.s, z18.s\n"
+ "smin z26.s, p0/M, z26.s, z18.s\n"
+ "trn1 z16.h, z28.h, z27.h\n"
"trn1 z16.b, z17.b, z16.b\n"
+ "smin z25.s, p0/M, z25.s, z18.s\n"
+ "smin z24.s, p0/M, z24.s, z18.s\n"
+ "trn1 z17.h, z26.h, z25.h\n"
"st1b { z16.b }, p3, [%x[outptr], x28]\n"
- "smin z18.s, p4/M, z18.s, z4.s\n"
- "incb x28, ALL, MUL #4\n"
- "smax z29.s, p4/M, z29.s, z19.s\n"
- "smax z28.s, p4/M, z28.s, z19.s\n"
- "smax z27.s, p4/M, z27.s, z19.s\n"
- "smax z26.s, p4/M, z26.s, z19.s\n"
- "trn1 z18.h, z30.h, z18.h\n"
- "smin z29.s, p4/M, z29.s, z4.s\n"
- "smin z28.s, p4/M, z28.s, z4.s\n"
- "smin z27.s, p4/M, z27.s, z4.s\n"
- "smin z26.s, p4/M, z26.s, z4.s\n"
- "smax z25.s, p4/M, z25.s, z19.s\n"
- "trn1 z16.h, z29.h, z28.h\n"
- "smax z24.s, p4/M, z24.s, z19.s\n"
- "trn1 z17.h, z27.h, z26.h\n"
- "trn1 z16.b, z18.b, z16.b\n"
+ "smin z23.s, p0/M, z23.s, z18.s\n"
+ "smin z22.s, p0/M, z22.s, z18.s\n"
+ "trn1 z16.h, z24.h, z23.h\n"
+ "trn1 z16.b, z17.b, z16.b\n"
+ "smin z21.s, p0/M, z21.s, z18.s\n"
+ "smin z20.s, p0/M, z20.s, z18.s\n"
+ "trn1 z17.h, z22.h, z21.h\n"
"st1b { z16.b }, p2, [%x[outptr], x27]\n"
- "smin z25.s, p4/M, z25.s, z4.s\n"
- "incb x27, ALL, MUL #4\n"
- "smin z24.s, p4/M, z24.s, z4.s\n"
- "smax z23.s, p4/M, z23.s, z19.s\n"
- "smax z22.s, p4/M, z22.s, z19.s\n"
- "smax z21.s, p4/M, z21.s, z19.s\n"
- "smax z20.s, p4/M, z20.s, z19.s\n"
- "trn1 z16.h, z25.h, z24.h\n"
- "smin z23.s, p4/M, z23.s, z4.s\n"
+ "smin z19.s, p0/M, z19.s, z18.s\n"
+ "trn1 z16.h, z20.h, z19.h\n"
"trn1 z16.b, z17.b, z16.b\n"
"st1b { z16.b }, p1, [%x[outptr], x26]\n"
- "smin z22.s, p4/M, z22.s, z4.s\n"
"incb x26, ALL, MUL #4\n"
- "smin z21.s, p4/M, z21.s, z4.s\n"
- "smin z20.s, p4/M, z20.s, z4.s\n"
- "trn1 z17.h, z23.h, z22.h\n"
- "trn1 z16.h, z21.h, z20.h\n"
- "trn1 z16.b, z17.b, z16.b\n"
- "st1b { z16.b }, p0, [%x[outptr], x25]\n"
- "incb x25, ALL, MUL #4\n"
- "whilelt p0.b, x25, %x[n_channels]\n"
+ "whilelt p1.b, x26, %x[n_channels]\n"
+ "incb x9, ALL, MUL #4\n"
+ "incb x28, ALL, MUL #4\n"
+ "incb x27, ALL, MUL #4\n"
"b.any 1b\n"
"7:" // Single vector of channels
- "whilelt p3.b, x28, %x[n_channels]\n"
+ "whilelt p4.b, x9, %x[n_channels]\n"
"b.none 14f\n"
"8:" // Single vector of channels: Loop
+ "lsr x25, %x[n_valid_cells], #0x2\n"
"mov z8.b, #0x80\n"
- "mov x19, %x[inptrs]\n"
- "lsr x24, %x[n_valid_cells], #0x2\n"
- "cbz x24, 11f\n"
- "ldp x23, x22, [x19, #0x0]\n"
- "ldp x21, x20, [x19, #0x10]\n"
- "add x19, x19, #0x20\n"
- "subs x24, x24, #0x1\n"
- "ld1b { z3.b }, p3/Z, [x23, x28]\n"
- "ld1b { z2.b }, p3/Z, [x22, x28]\n"
- "ld1b { z1.b }, p3/Z, [x21, x28]\n"
- "ld1b { z0.b }, p3/Z, [x20, x28]\n"
+ "mov x24, %x[inptrs]\n"
+ "cbz x25, 11f\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "subs x25, x25, #0x1\n"
+ "add x24, x24, #0x20\n"
+ "ld1b { z4.b }, p4/Z, [x23, x9]\n"
+ "ld1b { z3.b }, p4/Z, [x22, x9]\n"
+ "ld1b { z2.b }, p4/Z, [x21, x9]\n"
+ "ld1b { z1.b }, p4/Z, [x20, x9]\n"
"beq 10f\n"
"9:" // Single vector of channels: Loop: 4 inputs loop
- "movprfx z19, z3\n smax z19.b, p4/M, z19.b, z2.b\n"
- "ldp x23, x22, [x19, #0x0]\n"
- "subs x24, x24, #0x1\n"
- "movprfx z23, z1\n smax z23.b, p4/M, z23.b, z0.b\n"
- "ldp x21, x20, [x19, #0x10]\n"
- "add x19, x19, #0x20\n"
- "smax z19.b, p4/M, z19.b, z23.b\n"
- "ld1b { z3.b }, p3/Z, [x23, x28]\n"
- "ld1b { z2.b }, p3/Z, [x22, x28]\n"
- "smax z8.b, p4/M, z8.b, z19.b\n"
- "ld1b { z1.b }, p3/Z, [x21, x28]\n"
- "ld1b { z0.b }, p3/Z, [x20, x28]\n"
+ "movprfx z16, z4\n smax z16.b, p0/M, z16.b, z3.b\n"
+ "movprfx z17, z2\n smax z17.b, p0/M, z17.b, z1.b\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "smax z16.b, p0/M, z16.b, z17.b\n"
+ "subs x25, x25, #0x1\n"
+ "ld1b { z4.b }, p4/Z, [x23, x9]\n"
+ "ld1b { z3.b }, p4/Z, [x22, x9]\n"
+ "smax z8.b, p0/M, z8.b, z16.b\n"
+ "add x24, x24, #0x20\n"
+ "ld1b { z2.b }, p4/Z, [x21, x9]\n"
+ "ld1b { z1.b }, p4/Z, [x20, x9]\n"
"bgt 9b\n"
"10:" // Single vector of channels: Loop: 4 inputs tail
- "movprfx z19, z3\n smax z19.b, p4/M, z19.b, z2.b\n"
- "movprfx z23, z1\n smax z23.b, p4/M, z23.b, z0.b\n"
- "smax z19.b, p4/M, z19.b, z23.b\n"
- "smax z8.b, p4/M, z8.b, z19.b\n"
+ "movprfx z16, z4\n smax z16.b, p0/M, z16.b, z3.b\n"
+ "movprfx z17, z2\n smax z17.b, p0/M, z17.b, z1.b\n"
+ "smax z16.b, p0/M, z16.b, z17.b\n"
+ "smax z8.b, p0/M, z8.b, z16.b\n"
"11:" // Single vector of channels: Loop: After loop
- "ands x20, %x[n_valid_cells], #0x3\n"
+ "ands x21, %x[n_valid_cells], #0x3\n"
"beq 13f\n"
"12:" // Single vector of channels: Loop: Single input loop
- "ldr x23, [x19], #0x8\n"
- "subs x20, x20, #0x1\n"
- "ld1b { z3.b }, p3/Z, [x23, x28]\n"
- "smax z8.b, p4/M, z8.b, z3.b\n"
+ "ldr x20, [x24], #0x8\n"
+ "ld1b { z16.b }, p4/Z, [x20, x9]\n"
+ "subs x21, x21, #0x1\n"
+ "smax z8.b, p0/M, z8.b, z16.b\n"
"bgt 12b\n"
"13:" // Single vector of channels: Loop: Single input loop: End
- "mov z4.s, #0x7f\n"
- "add x19, %x[quant_params], %[offsetof_qp_per_layer_mul]\n"
- "ld1rw { z3.s }, p4/Z, [x19]\n"
".inst 0x4508a111 // sshllb z17.h, z8.b, #0x0\n"
- "add x19, %x[quant_params], %[offsetof_qp_per_layer_left_shift]\n"
- ".inst 0x4508a510 // sshllt z16.h, z8.b, #0x0\n"
- "ld1rw { z2.s }, p4/Z, [x19]\n"
- "add x19, %x[quant_params], %[offsetof_qp_per_layer_right_shift]\n"
- ".inst 0x4510a220 // sshllb z0.s, z17.h, #0x0\n"
- "ld1rw { z1.s }, p4/Z, [x19]\n"
- ".inst 0x4510a631 // sshllt z17.s, z17.h, #0x0\n"
- ".inst 0x4510a21f // sshllb z31.s, z16.h, #0x0\n"
- ".inst 0x4510a610 // sshllt z16.s, z16.h, #0x0\n"
- ".inst 0x44829040 // srshl z0.s, p4/M, z0.s, z2.s\n"
- ".inst 0x44829051 // srshl z17.s, p4/M, z17.s, z2.s\n"
- ".inst 0x4482905f // srshl z31.s, p4/M, z31.s, z2.s\n"
- ".inst 0x44829050 // srshl z16.s, p4/M, z16.s, z2.s\n"
- ".inst 0x04a37400 // sqrdmulh z0.s, z0.s, z3.s\n"
- ".inst 0x04a37631 // sqrdmulh z17.s, z17.s, z3.s\n"
- ".inst 0x04a377ff // sqrdmulh z31.s, z31.s, z3.s\n"
- ".inst 0x04a37610 // sqrdmulh z16.s, z16.s, z3.s\n"
- ".inst 0x44829020 // srshl z0.s, p4/M, z0.s, z1.s\n"
- ".inst 0x44829031 // srshl z17.s, p4/M, z17.s, z1.s\n"
- ".inst 0x4482903f // srshl z31.s, p4/M, z31.s, z1.s\n"
- ".inst 0x44829030 // srshl z16.s, p4/M, z16.s, z1.s\n"
- "not z19.s, p4/M, z4.s\n"
- "smax z0.s, p4/M, z0.s, z19.s\n"
- "smax z17.s, p4/M, z17.s, z19.s\n"
- "smax z31.s, p4/M, z31.s, z19.s\n"
- "smax z16.s, p4/M, z16.s, z19.s\n"
- "smin z0.s, p4/M, z0.s, z4.s\n"
- "smin z17.s, p4/M, z17.s, z4.s\n"
- "smin z31.s, p4/M, z31.s, z4.s\n"
- "smin z16.s, p4/M, z16.s, z4.s\n"
- "trn1 z17.h, z0.h, z17.h\n"
- "trn1 z16.h, z31.h, z16.h\n"
+ ".inst 0x4508a512 // sshllt z18.h, z8.b, #0x0\n"
+ "add x20, %x[quant_params], %[offsetof_qp_per_layer_left_shift]\n"
+ "ld1rw { z16.s }, p0/Z, [x20]\n"
+ ".inst 0x4510a236 // sshllb z22.s, z17.h, #0x0\n"
+ ".inst 0x4510a635 // sshllt z21.s, z17.h, #0x0\n"
+ "add x20, %x[quant_params], %[offsetof_qp_per_layer_mul]\n"
+ "ld1rw { z17.s }, p0/Z, [x20]\n"
+ ".inst 0x4510a254 // sshllb z20.s, z18.h, #0x0\n"
+ ".inst 0x4510a653 // sshllt z19.s, z18.h, #0x0\n"
+ ".inst 0x44828216 // srshl z22.s, p0/M, z22.s, z16.s\n"
+ ".inst 0x44828215 // srshl z21.s, p0/M, z21.s, z16.s\n"
+ ".inst 0x44828214 // srshl z20.s, p0/M, z20.s, z16.s\n"
+ ".inst 0x44828213 // srshl z19.s, p0/M, z19.s, z16.s\n"
+ ".inst 0x04b176d6 // sqrdmulh z22.s, z22.s, z17.s\n"
+ ".inst 0x04b176b5 // sqrdmulh z21.s, z21.s, z17.s\n"
+ "add x20, %x[quant_params], %[offsetof_qp_per_layer_right_shift]\n"
+ "ld1rw { z16.s }, p0/Z, [x20]\n"
+ ".inst 0x04b17694 // sqrdmulh z20.s, z20.s, z17.s\n"
+ ".inst 0x04b17673 // sqrdmulh z19.s, z19.s, z17.s\n"
+ "mov z18.s, #0x7f\n"
+ ".inst 0x44828216 // srshl z22.s, p0/M, z22.s, z16.s\n"
+ ".inst 0x44828215 // srshl z21.s, p0/M, z21.s, z16.s\n"
+ ".inst 0x44828214 // srshl z20.s, p0/M, z20.s, z16.s\n"
+ ".inst 0x44828213 // srshl z19.s, p0/M, z19.s, z16.s\n"
+ "not z16.s, p0/M, z18.s\n"
+ "smax z22.s, p0/M, z22.s, z16.s\n"
+ "smax z21.s, p0/M, z21.s, z16.s\n"
+ "smax z20.s, p0/M, z20.s, z16.s\n"
+ "smax z19.s, p0/M, z19.s, z16.s\n"
+ "smin z22.s, p0/M, z22.s, z18.s\n"
+ "smin z21.s, p0/M, z21.s, z18.s\n"
+ "smin z20.s, p0/M, z20.s, z18.s\n"
+ "trn1 z17.h, z22.h, z21.h\n"
+ "smin z19.s, p0/M, z19.s, z18.s\n"
+ "trn1 z16.h, z20.h, z19.h\n"
"trn1 z16.b, z17.b, z16.b\n"
- "st1b { z16.b }, p3, [%x[outptr], x28]\n"
- "incb x28\n"
- "whilelt p3.b, x28, %x[n_channels]\n"
+ "st1b { z16.b }, p4, [%x[outptr], x9]\n"
+ "incb x9\n"
+ "whilelt p4.b, x9, %x[n_channels]\n"
"b.any 8b\n"
"14:" // End
-
:
: [inptrs] "r" (inptrs), [n_channels] "r" (n_channels), [n_valid_cells] "r" (n_valid_cells), [offsetof_qp_per_layer_left_shift] "I" (offsetof(Requantize32, per_layer_left_shift)), [offsetof_qp_per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [offsetof_qp_per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [outptr] "r" (outptr), [quant_params] "r" (&qp)
- : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x9", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
} // namespace pooling
} // namespace arm_conv
-#endif // defined(__ARM_FEATURE_SVE) && defined(SVE2)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8_nhwc_avg_generic_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8_nhwc_avg_generic_depthfirst.hpp
index 299e55c9be..714530bc43 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8_nhwc_avg_generic_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8_nhwc_avg_generic_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -26,29 +26,21 @@
#pragma once
-#if defined(__ARM_FEATURE_SVE) && defined(SVE2)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
namespace arm_conv {
namespace pooling {
void sve_u8_nhwc_avg_generic_depthfirst_impl(const uint64_t window_cells, const uint64_t n_valid_cells, uint64_t n_channels, const uint8_t *const *const inptrs, uint8_t *outptr);
-struct sve_u8_nhwc_avg_generic_depthfirst
+struct sve_u8_nhwc_avg_generic_depthfirst : IGenericDepthfirstStrategy<uint8_t, uint8_t>
{
- typedef uint8_t operand_type;
- typedef uint8_t return_type;
-
- typedef void (*kern_type)(const uint64_t window_cells, const uint64_t n_valid_cells, uint64_t n_channels, const uint8_t *const *const inptrs, uint8_t *outptr);
-
- constexpr static PoolingType pooling_type(void) { return PoolingType::AVERAGE; }
-
-
- kern_type kernel = sve_u8_nhwc_avg_generic_depthfirst_impl;
-
+ using Parent = IGenericDepthfirstStrategy<uint8_t, uint8_t>;
sve_u8_nhwc_avg_generic_depthfirst(const CPUInfo *) {}
+ typename Parent::KernelType get_kernel(void) const override { return sve_u8_nhwc_avg_generic_depthfirst_impl; }
};
} // namespace pooling
} // namespace arm_conv
-#endif // defined(__ARM_FEATURE_SVE) && defined(SVE2)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8_nhwc_avg_generic_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8_nhwc_avg_generic_depthfirst/generic.cpp
index 51a69a42be..f3f4950a1f 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8_nhwc_avg_generic_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8_nhwc_avg_generic_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,11 +23,12 @@
*/
#include <cstdint>
+#include <cstddef>
#include <cstring>
#include <cmath>
-#if defined(__ARM_FEATURE_SVE) && defined(SVE2)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
namespace arm_conv {
namespace pooling {
@@ -84,30 +85,31 @@ void sve_u8_nhwc_avg_generic_depthfirst_impl(
f_rescale_value *= 2.0f;
}
- rescale_value = static_cast<int32_t>(round(f_rescale_value * static_cast<float>(1ll << 31)));
- if (static_cast<int64_t>(rescale_value) == (1ll << 31))
+ int64_t long_rescale_value = round(f_rescale_value * static_cast<float>(1ll << 31));
+ if (long_rescale_value == (1ll << 31))
{
shift_value++;
- rescale_value >>= 1;
+ long_rescale_value >>= 1;
}
+ rescale_value = static_cast<int32_t>(long_rescale_value);
}
__asm__ __volatile__(
- "ptrue p4.b\n"
- "mov x26, #0x0\n"
- "cntb x25\n"
- "cntb x24, ALL, MUL #2\n"
- "cntb x23, ALL, MUL #3\n"
+ "mov x27, #0x0\n"
+ "cntb x26\n"
+ "cntb x25, ALL, MUL #2\n"
+ "cntb x24, ALL, MUL #3\n"
+ "whilelt p4.b, x27, %x[n_channels]\n"
"whilelt p3.b, x26, %x[n_channels]\n"
"whilelt p2.b, x25, %x[n_channels]\n"
"whilelt p1.b, x24, %x[n_channels]\n"
- "whilelt p0.b, x23, %x[n_channels]\n"
+ "ptrue p0.b\n"
"b.none 7f\n"
"1:" // 4-vectors of channels
+ "lsr x23, %x[n_valid_cells], #0x1\n"
"mov z15.s, #0x0\n"
- "mov x19, %x[inptrs]\n"
"mov z14.s, #0x0\n"
- "lsr x22, %x[n_valid_cells], #0x1\n"
+ "mov x22, %x[inptrs]\n"
"mov z13.s, #0x0\n"
"mov z12.s, #0x0\n"
"mov z11.s, #0x0\n"
@@ -122,43 +124,43 @@ void sve_u8_nhwc_avg_generic_depthfirst_impl(
"mov z2.s, #0x0\n"
"mov z1.s, #0x0\n"
"mov z0.s, #0x0\n"
- "cbz x22, 4f\n"
- "ldp x21, x20, [x19, #0x0]\n"
- "ld1b { z31.b }, p3/Z, [x21, x26]\n"
- "add x19, x19, #0x10\n"
- "ld1b { z30.b }, p3/Z, [x20, x26]\n"
- "subs x22, x22, #0x1\n"
- "ld1b { z29.b }, p2/Z, [x21, x25]\n"
- "ld1b { z28.b }, p2/Z, [x20, x25]\n"
- "ld1b { z27.b }, p1/Z, [x21, x24]\n"
- "ld1b { z26.b }, p1/Z, [x20, x24]\n"
- "ld1b { z25.b }, p0/Z, [x21, x23]\n"
- "ld1b { z24.b }, p0/Z, [x20, x23]\n"
+ "cbz x23, 4f\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "subs x23, x23, #0x1\n"
+ "add x22, x22, #0x10\n"
+ "ld1b { z31.b }, p4/Z, [x21, x27]\n"
+ "ld1b { z30.b }, p4/Z, [x20, x27]\n"
+ "ld1b { z29.b }, p3/Z, [x21, x26]\n"
+ "ld1b { z28.b }, p3/Z, [x20, x26]\n"
+ "ld1b { z27.b }, p2/Z, [x21, x25]\n"
+ "ld1b { z26.b }, p2/Z, [x20, x25]\n"
+ "ld1b { z25.b }, p1/Z, [x21, x24]\n"
+ "ld1b { z24.b }, p1/Z, [x20, x24]\n"
"beq 3f\n"
"2:" // 4-vectors of channels: 2 inputs loop
".inst 0x455e0bf7 // uaddlb z23.h, z31.b, z30.b\n"
- "ldp x21, x20, [x19, #0x0]\n"
- "add x19, x19, #0x10\n"
".inst 0x455e0ff6 // uaddlt z22.h, z31.b, z30.b\n"
- "ld1b { z31.b }, p3/Z, [x21, x26]\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "subs x23, x23, #0x1\n"
".inst 0x455c0bb5 // uaddlb z21.h, z29.b, z28.b\n"
- "subs x22, x22, #0x1\n"
".inst 0x455c0fb4 // uaddlt z20.h, z29.b, z28.b\n"
- "ld1b { z30.b }, p3/Z, [x20, x26]\n"
+ "add x22, x22, #0x10\n"
+ "ld1b { z31.b }, p4/Z, [x21, x27]\n"
".inst 0x455a0b73 // uaddlb z19.h, z27.b, z26.b\n"
- "ld1b { z29.b }, p2/Z, [x21, x25]\n"
".inst 0x455a0f72 // uaddlt z18.h, z27.b, z26.b\n"
- "ld1b { z28.b }, p2/Z, [x20, x25]\n"
+ "ld1b { z30.b }, p4/Z, [x20, x27]\n"
+ "ld1b { z29.b }, p3/Z, [x21, x26]\n"
".inst 0x45580b31 // uaddlb z17.h, z25.b, z24.b\n"
- "ld1b { z27.b }, p1/Z, [x21, x24]\n"
".inst 0x45580f30 // uaddlt z16.h, z25.b, z24.b\n"
- "ld1b { z26.b }, p1/Z, [x20, x24]\n"
+ "ld1b { z28.b }, p3/Z, [x20, x26]\n"
+ "ld1b { z27.b }, p2/Z, [x21, x25]\n"
".inst 0x459749ef // uaddwb z15.s, z15.s, z23.h\n"
- "ld1b { z25.b }, p0/Z, [x21, x23]\n"
".inst 0x45974dce // uaddwt z14.s, z14.s, z23.h\n"
- "ld1b { z24.b }, p0/Z, [x20, x23]\n"
+ "ld1b { z26.b }, p2/Z, [x20, x25]\n"
+ "ld1b { z25.b }, p1/Z, [x21, x24]\n"
".inst 0x459649ad // uaddwb z13.s, z13.s, z22.h\n"
".inst 0x45964d8c // uaddwt z12.s, z12.s, z22.h\n"
+ "ld1b { z24.b }, p1/Z, [x20, x24]\n"
".inst 0x4595496b // uaddwb z11.s, z11.s, z21.h\n"
".inst 0x45954d4a // uaddwt z10.s, z10.s, z21.h\n"
".inst 0x45944929 // uaddwb z9.s, z9.s, z20.h\n"
@@ -198,219 +200,218 @@ void sve_u8_nhwc_avg_generic_depthfirst_impl(
".inst 0x45904821 // uaddwb z1.s, z1.s, z16.h\n"
".inst 0x45904c00 // uaddwt z0.s, z0.s, z16.h\n"
"4:" // 4-vectors of channels: After loop
- "ands x20, %x[n_valid_cells], #0x1\n"
+ "ands x21, %x[n_valid_cells], #0x1\n"
"beq 6f\n"
"5:" // 4-vectors of channels: Single input loop
- "ldr x21, [x19], #0x8\n"
- "subs x20, x20, #0x1\n"
- "ld1b { z31.b }, p3/Z, [x21, x26]\n"
- ".inst 0x4508abf1 // ushllb z17.h, z31.b, #0x0\n"
- "ld1b { z29.b }, p2/Z, [x21, x25]\n"
- ".inst 0x4508aff0 // ushllt z16.h, z31.b, #0x0\n"
- "ld1b { z27.b }, p1/Z, [x21, x24]\n"
- ".inst 0x459149ef // uaddwb z15.s, z15.s, z17.h\n"
- "ld1b { z25.b }, p0/Z, [x21, x23]\n"
- ".inst 0x45914dce // uaddwt z14.s, z14.s, z17.h\n"
- ".inst 0x459049ad // uaddwb z13.s, z13.s, z16.h\n"
- ".inst 0x45904d8c // uaddwt z12.s, z12.s, z16.h\n"
- ".inst 0x4508abb0 // ushllb z16.h, z29.b, #0x0\n"
- ".inst 0x4590496b // uaddwb z11.s, z11.s, z16.h\n"
- ".inst 0x45904d4a // uaddwt z10.s, z10.s, z16.h\n"
- ".inst 0x4508afb0 // ushllt z16.h, z29.b, #0x0\n"
- ".inst 0x45904929 // uaddwb z9.s, z9.s, z16.h\n"
- ".inst 0x45904d08 // uaddwt z8.s, z8.s, z16.h\n"
- ".inst 0x4508ab70 // ushllb z16.h, z27.b, #0x0\n"
- ".inst 0x459048e7 // uaddwb z7.s, z7.s, z16.h\n"
- ".inst 0x45904cc6 // uaddwt z6.s, z6.s, z16.h\n"
- ".inst 0x4508af70 // ushllt z16.h, z27.b, #0x0\n"
- ".inst 0x459048a5 // uaddwb z5.s, z5.s, z16.h\n"
- ".inst 0x45904c84 // uaddwt z4.s, z4.s, z16.h\n"
- ".inst 0x4508ab30 // ushllb z16.h, z25.b, #0x0\n"
- ".inst 0x45904863 // uaddwb z3.s, z3.s, z16.h\n"
- ".inst 0x45904c42 // uaddwt z2.s, z2.s, z16.h\n"
- ".inst 0x4508af30 // ushllt z16.h, z25.b, #0x0\n"
+ "ldr x20, [x22], #0x8\n"
+ "ld1b { z16.b }, p4/Z, [x20, x27]\n"
+ ".inst 0x4508aa17 // ushllb z23.h, z16.b, #0x0\n"
+ ".inst 0x4508ae16 // ushllt z22.h, z16.b, #0x0\n"
+ "ld1b { z16.b }, p3/Z, [x20, x26]\n"
+ "ld1b { z17.b }, p2/Z, [x20, x25]\n"
+ ".inst 0x4508aa15 // ushllb z21.h, z16.b, #0x0\n"
+ ".inst 0x4508ae14 // ushllt z20.h, z16.b, #0x0\n"
+ "ld1b { z16.b }, p1/Z, [x20, x24]\n"
+ ".inst 0x4508aa33 // ushllb z19.h, z17.b, #0x0\n"
+ ".inst 0x4508ae32 // ushllt z18.h, z17.b, #0x0\n"
+ "subs x21, x21, #0x1\n"
+ ".inst 0x4508aa11 // ushllb z17.h, z16.b, #0x0\n"
+ ".inst 0x4508ae10 // ushllt z16.h, z16.b, #0x0\n"
+ ".inst 0x459749ef // uaddwb z15.s, z15.s, z23.h\n"
+ ".inst 0x45974dce // uaddwt z14.s, z14.s, z23.h\n"
+ ".inst 0x459649ad // uaddwb z13.s, z13.s, z22.h\n"
+ ".inst 0x45964d8c // uaddwt z12.s, z12.s, z22.h\n"
+ ".inst 0x4595496b // uaddwb z11.s, z11.s, z21.h\n"
+ ".inst 0x45954d4a // uaddwt z10.s, z10.s, z21.h\n"
+ ".inst 0x45944929 // uaddwb z9.s, z9.s, z20.h\n"
+ ".inst 0x45944d08 // uaddwt z8.s, z8.s, z20.h\n"
+ ".inst 0x459348e7 // uaddwb z7.s, z7.s, z19.h\n"
+ ".inst 0x45934cc6 // uaddwt z6.s, z6.s, z19.h\n"
+ ".inst 0x459248a5 // uaddwb z5.s, z5.s, z18.h\n"
+ ".inst 0x45924c84 // uaddwt z4.s, z4.s, z18.h\n"
+ ".inst 0x45914863 // uaddwb z3.s, z3.s, z17.h\n"
+ ".inst 0x45914c42 // uaddwt z2.s, z2.s, z17.h\n"
".inst 0x45904821 // uaddwb z1.s, z1.s, z16.h\n"
".inst 0x45904c00 // uaddwt z0.s, z0.s, z16.h\n"
"bgt 5b\n"
"6:" // 4-vectors of channels: Single input loop: End
- "mov z20.s, #0x0\n"
- "ld1rw { z17.s }, p4/Z, [%x[rescale_ptr]]\n"
- "mov z19.s, #0xff\n"
- "ld1rw { z16.s }, p4/Z, [%x[shift_ptr]]\n"
+ "ld1rw { z17.s }, p0/Z, [%x[rescale_ptr]]\n"
+ "ld1rw { z16.s }, p0/Z, [%x[shift_ptr]]\n"
".inst 0x04b175ef // sqdmulh z15.s, z15.s, z17.s\n"
".inst 0x04b175ce // sqdmulh z14.s, z14.s, z17.s\n"
".inst 0x04b175ad // sqdmulh z13.s, z13.s, z17.s\n"
".inst 0x04b1758c // sqdmulh z12.s, z12.s, z17.s\n"
+ ".inst 0x4482820f // srshl z15.s, p0/M, z15.s, z16.s\n"
+ ".inst 0x4482820e // srshl z14.s, p0/M, z14.s, z16.s\n"
".inst 0x04b1756b // sqdmulh z11.s, z11.s, z17.s\n"
".inst 0x04b1754a // sqdmulh z10.s, z10.s, z17.s\n"
+ ".inst 0x4482820d // srshl z13.s, p0/M, z13.s, z16.s\n"
+ ".inst 0x4482820c // srshl z12.s, p0/M, z12.s, z16.s\n"
".inst 0x04b17529 // sqdmulh z9.s, z9.s, z17.s\n"
".inst 0x04b17508 // sqdmulh z8.s, z8.s, z17.s\n"
+ ".inst 0x4482820b // srshl z11.s, p0/M, z11.s, z16.s\n"
+ ".inst 0x4482820a // srshl z10.s, p0/M, z10.s, z16.s\n"
".inst 0x04b174e7 // sqdmulh z7.s, z7.s, z17.s\n"
".inst 0x04b174c6 // sqdmulh z6.s, z6.s, z17.s\n"
+ ".inst 0x44828209 // srshl z9.s, p0/M, z9.s, z16.s\n"
+ ".inst 0x44828208 // srshl z8.s, p0/M, z8.s, z16.s\n"
".inst 0x04b174a5 // sqdmulh z5.s, z5.s, z17.s\n"
".inst 0x04b17484 // sqdmulh z4.s, z4.s, z17.s\n"
+ ".inst 0x44828207 // srshl z7.s, p0/M, z7.s, z16.s\n"
+ ".inst 0x44828206 // srshl z6.s, p0/M, z6.s, z16.s\n"
".inst 0x04b17463 // sqdmulh z3.s, z3.s, z17.s\n"
".inst 0x04b17442 // sqdmulh z2.s, z2.s, z17.s\n"
+ ".inst 0x44828205 // srshl z5.s, p0/M, z5.s, z16.s\n"
+ ".inst 0x44828204 // srshl z4.s, p0/M, z4.s, z16.s\n"
".inst 0x04b17421 // sqdmulh z1.s, z1.s, z17.s\n"
".inst 0x04b17400 // sqdmulh z0.s, z0.s, z17.s\n"
- ".inst 0x4482920f // srshl z15.s, p4/M, z15.s, z16.s\n"
- ".inst 0x4482920e // srshl z14.s, p4/M, z14.s, z16.s\n"
- ".inst 0x4482920d // srshl z13.s, p4/M, z13.s, z16.s\n"
- ".inst 0x4482920c // srshl z12.s, p4/M, z12.s, z16.s\n"
- ".inst 0x4482920b // srshl z11.s, p4/M, z11.s, z16.s\n"
- ".inst 0x4482920a // srshl z10.s, p4/M, z10.s, z16.s\n"
- ".inst 0x44829209 // srshl z9.s, p4/M, z9.s, z16.s\n"
- ".inst 0x44829208 // srshl z8.s, p4/M, z8.s, z16.s\n"
- ".inst 0x44829207 // srshl z7.s, p4/M, z7.s, z16.s\n"
- ".inst 0x44829206 // srshl z6.s, p4/M, z6.s, z16.s\n"
- ".inst 0x44829205 // srshl z5.s, p4/M, z5.s, z16.s\n"
- ".inst 0x44829204 // srshl z4.s, p4/M, z4.s, z16.s\n"
- ".inst 0x44829203 // srshl z3.s, p4/M, z3.s, z16.s\n"
- ".inst 0x44829202 // srshl z2.s, p4/M, z2.s, z16.s\n"
- ".inst 0x44829201 // srshl z1.s, p4/M, z1.s, z16.s\n"
- ".inst 0x44829200 // srshl z0.s, p4/M, z0.s, z16.s\n"
- "smax z15.s, p4/M, z15.s, z20.s\n"
- "smax z14.s, p4/M, z14.s, z20.s\n"
- "smax z13.s, p4/M, z13.s, z20.s\n"
- "smax z12.s, p4/M, z12.s, z20.s\n"
- "smin z15.s, p4/M, z15.s, z19.s\n"
- "smin z14.s, p4/M, z14.s, z19.s\n"
- "smin z13.s, p4/M, z13.s, z19.s\n"
- "smin z12.s, p4/M, z12.s, z19.s\n"
- "smax z11.s, p4/M, z11.s, z20.s\n"
+ ".inst 0x44828203 // srshl z3.s, p0/M, z3.s, z16.s\n"
+ ".inst 0x44828202 // srshl z2.s, p0/M, z2.s, z16.s\n"
+ ".inst 0x44828201 // srshl z1.s, p0/M, z1.s, z16.s\n"
+ ".inst 0x44828200 // srshl z0.s, p0/M, z0.s, z16.s\n"
+ "mov z16.s, #0x0\n"
+ "mov z18.s, #0xff\n"
+ "smax z15.s, p0/M, z15.s, z16.s\n"
+ "smax z14.s, p0/M, z14.s, z16.s\n"
+ "smax z13.s, p0/M, z13.s, z16.s\n"
+ "smax z12.s, p0/M, z12.s, z16.s\n"
+ "smax z11.s, p0/M, z11.s, z16.s\n"
+ "smax z10.s, p0/M, z10.s, z16.s\n"
+ "smax z9.s, p0/M, z9.s, z16.s\n"
+ "smax z8.s, p0/M, z8.s, z16.s\n"
+ "smax z7.s, p0/M, z7.s, z16.s\n"
+ "smax z6.s, p0/M, z6.s, z16.s\n"
+ "smax z5.s, p0/M, z5.s, z16.s\n"
+ "smax z4.s, p0/M, z4.s, z16.s\n"
+ "smax z3.s, p0/M, z3.s, z16.s\n"
+ "smax z2.s, p0/M, z2.s, z16.s\n"
+ "smax z1.s, p0/M, z1.s, z16.s\n"
+ "smax z0.s, p0/M, z0.s, z16.s\n"
+ "smin z15.s, p0/M, z15.s, z18.s\n"
+ "smin z14.s, p0/M, z14.s, z18.s\n"
"trn1 z17.h, z15.h, z14.h\n"
- "smax z10.s, p4/M, z10.s, z20.s\n"
+ "smin z13.s, p0/M, z13.s, z18.s\n"
+ "smin z12.s, p0/M, z12.s, z18.s\n"
"trn1 z16.h, z13.h, z12.h\n"
- "smin z11.s, p4/M, z11.s, z19.s\n"
"trn1 z16.b, z17.b, z16.b\n"
- "st1b { z16.b }, p3, [%x[outptr], x26]\n"
- "smin z10.s, p4/M, z10.s, z19.s\n"
- "incb x26, ALL, MUL #4\n"
- "smax z9.s, p4/M, z9.s, z20.s\n"
- "smax z8.s, p4/M, z8.s, z20.s\n"
- "smax z7.s, p4/M, z7.s, z20.s\n"
- "smax z6.s, p4/M, z6.s, z20.s\n"
- "trn1 z18.h, z11.h, z10.h\n"
- "smin z9.s, p4/M, z9.s, z19.s\n"
- "smin z8.s, p4/M, z8.s, z19.s\n"
- "smin z7.s, p4/M, z7.s, z19.s\n"
- "smin z6.s, p4/M, z6.s, z19.s\n"
- "smax z5.s, p4/M, z5.s, z20.s\n"
+ "smin z11.s, p0/M, z11.s, z18.s\n"
+ "smin z10.s, p0/M, z10.s, z18.s\n"
+ "trn1 z17.h, z11.h, z10.h\n"
+ "st1b { z16.b }, p4, [%x[outptr], x27]\n"
+ "smin z9.s, p0/M, z9.s, z18.s\n"
+ "smin z8.s, p0/M, z8.s, z18.s\n"
"trn1 z16.h, z9.h, z8.h\n"
- "smax z4.s, p4/M, z4.s, z20.s\n"
+ "trn1 z16.b, z17.b, z16.b\n"
+ "smin z7.s, p0/M, z7.s, z18.s\n"
+ "smin z6.s, p0/M, z6.s, z18.s\n"
"trn1 z17.h, z7.h, z6.h\n"
- "trn1 z16.b, z18.b, z16.b\n"
- "st1b { z16.b }, p2, [%x[outptr], x25]\n"
- "smin z5.s, p4/M, z5.s, z19.s\n"
- "incb x25, ALL, MUL #4\n"
- "smin z4.s, p4/M, z4.s, z19.s\n"
- "smax z3.s, p4/M, z3.s, z20.s\n"
- "smax z2.s, p4/M, z2.s, z20.s\n"
- "smax z1.s, p4/M, z1.s, z20.s\n"
- "smax z0.s, p4/M, z0.s, z20.s\n"
+ "st1b { z16.b }, p3, [%x[outptr], x26]\n"
+ "smin z5.s, p0/M, z5.s, z18.s\n"
+ "smin z4.s, p0/M, z4.s, z18.s\n"
"trn1 z16.h, z5.h, z4.h\n"
- "smin z3.s, p4/M, z3.s, z19.s\n"
"trn1 z16.b, z17.b, z16.b\n"
- "st1b { z16.b }, p1, [%x[outptr], x24]\n"
- "smin z2.s, p4/M, z2.s, z19.s\n"
- "incb x24, ALL, MUL #4\n"
- "smin z1.s, p4/M, z1.s, z19.s\n"
- "smin z0.s, p4/M, z0.s, z19.s\n"
+ "smin z3.s, p0/M, z3.s, z18.s\n"
+ "smin z2.s, p0/M, z2.s, z18.s\n"
"trn1 z17.h, z3.h, z2.h\n"
+ "st1b { z16.b }, p2, [%x[outptr], x25]\n"
+ "smin z1.s, p0/M, z1.s, z18.s\n"
+ "smin z0.s, p0/M, z0.s, z18.s\n"
"trn1 z16.h, z1.h, z0.h\n"
"trn1 z16.b, z17.b, z16.b\n"
- "st1b { z16.b }, p0, [%x[outptr], x23]\n"
- "incb x23, ALL, MUL #4\n"
- "whilelt p0.b, x23, %x[n_channels]\n"
+ "st1b { z16.b }, p1, [%x[outptr], x24]\n"
+ "incb x24, ALL, MUL #4\n"
+ "whilelt p1.b, x24, %x[n_channels]\n"
+ "incb x27, ALL, MUL #4\n"
+ "incb x26, ALL, MUL #4\n"
+ "incb x25, ALL, MUL #4\n"
"b.any 1b\n"
"7:" // Single vector of channels
- "whilelt p3.b, x26, %x[n_channels]\n"
+ "whilelt p4.b, x27, %x[n_channels]\n"
"b.none 14f\n"
"8:" // Single vector of channels: Loop
+ "lsr x23, %x[n_valid_cells], #0x1\n"
"mov z15.s, #0x0\n"
- "mov x19, %x[inptrs]\n"
"mov z14.s, #0x0\n"
- "lsr x22, %x[n_valid_cells], #0x1\n"
+ "mov x22, %x[inptrs]\n"
"mov z13.s, #0x0\n"
"mov z12.s, #0x0\n"
- "cbz x22, 11f\n"
- "ldp x21, x20, [x19, #0x0]\n"
- "ld1b { z31.b }, p3/Z, [x21, x26]\n"
- "add x19, x19, #0x10\n"
- "ld1b { z30.b }, p3/Z, [x20, x26]\n"
- "subs x22, x22, #0x1\n"
+ "cbz x23, 11f\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "subs x23, x23, #0x1\n"
+ "add x22, x22, #0x10\n"
+ "ld1b { z31.b }, p4/Z, [x21, x27]\n"
+ "ld1b { z30.b }, p4/Z, [x20, x27]\n"
"beq 10f\n"
"9:" // Single vector of channels: Loop: 2 inputs loop
- ".inst 0x455e0bf7 // uaddlb z23.h, z31.b, z30.b\n"
- "ldp x21, x20, [x19, #0x0]\n"
- "add x19, x19, #0x10\n"
- ".inst 0x455e0ff6 // uaddlt z22.h, z31.b, z30.b\n"
- "ld1b { z31.b }, p3/Z, [x21, x26]\n"
- ".inst 0x459749ef // uaddwb z15.s, z15.s, z23.h\n"
- "subs x22, x22, #0x1\n"
- ".inst 0x45974dce // uaddwt z14.s, z14.s, z23.h\n"
- "ld1b { z30.b }, p3/Z, [x20, x26]\n"
- ".inst 0x459649ad // uaddwb z13.s, z13.s, z22.h\n"
- ".inst 0x45964d8c // uaddwt z12.s, z12.s, z22.h\n"
+ ".inst 0x455e0bf1 // uaddlb z17.h, z31.b, z30.b\n"
+ ".inst 0x455e0ff0 // uaddlt z16.h, z31.b, z30.b\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "subs x23, x23, #0x1\n"
+ ".inst 0x459149ef // uaddwb z15.s, z15.s, z17.h\n"
+ ".inst 0x45914dce // uaddwt z14.s, z14.s, z17.h\n"
+ "add x22, x22, #0x10\n"
+ "ld1b { z31.b }, p4/Z, [x21, x27]\n"
+ ".inst 0x459049ad // uaddwb z13.s, z13.s, z16.h\n"
+ ".inst 0x45904d8c // uaddwt z12.s, z12.s, z16.h\n"
+ "ld1b { z30.b }, p4/Z, [x20, x27]\n"
"bgt 9b\n"
"10:" // Single vector of channels: Loop: 2 inputs tail
- ".inst 0x455e0bf7 // uaddlb z23.h, z31.b, z30.b\n"
- ".inst 0x455e0ff6 // uaddlt z22.h, z31.b, z30.b\n"
- ".inst 0x459749ef // uaddwb z15.s, z15.s, z23.h\n"
- ".inst 0x45974dce // uaddwt z14.s, z14.s, z23.h\n"
- ".inst 0x459649ad // uaddwb z13.s, z13.s, z22.h\n"
- ".inst 0x45964d8c // uaddwt z12.s, z12.s, z22.h\n"
+ ".inst 0x455e0bf1 // uaddlb z17.h, z31.b, z30.b\n"
+ ".inst 0x455e0ff0 // uaddlt z16.h, z31.b, z30.b\n"
+ ".inst 0x459149ef // uaddwb z15.s, z15.s, z17.h\n"
+ ".inst 0x45914dce // uaddwt z14.s, z14.s, z17.h\n"
+ ".inst 0x459049ad // uaddwb z13.s, z13.s, z16.h\n"
+ ".inst 0x45904d8c // uaddwt z12.s, z12.s, z16.h\n"
"11:" // Single vector of channels: Loop: After loop
- "ands x20, %x[n_valid_cells], #0x1\n"
+ "ands x21, %x[n_valid_cells], #0x1\n"
"beq 13f\n"
"12:" // Single vector of channels: Loop: Single input loop
- "ldr x21, [x19], #0x8\n"
- "subs x20, x20, #0x1\n"
- "ld1b { z31.b }, p3/Z, [x21, x26]\n"
- ".inst 0x4508abf1 // ushllb z17.h, z31.b, #0x0\n"
- ".inst 0x4508aff0 // ushllt z16.h, z31.b, #0x0\n"
+ "ldr x20, [x22], #0x8\n"
+ "ld1b { z16.b }, p4/Z, [x20, x27]\n"
+ ".inst 0x4508aa11 // ushllb z17.h, z16.b, #0x0\n"
+ ".inst 0x4508ae10 // ushllt z16.h, z16.b, #0x0\n"
+ "subs x21, x21, #0x1\n"
".inst 0x459149ef // uaddwb z15.s, z15.s, z17.h\n"
".inst 0x45914dce // uaddwt z14.s, z14.s, z17.h\n"
".inst 0x459049ad // uaddwb z13.s, z13.s, z16.h\n"
".inst 0x45904d8c // uaddwt z12.s, z12.s, z16.h\n"
"bgt 12b\n"
"13:" // Single vector of channels: Loop: Single input loop: End
- "mov z20.s, #0x0\n"
- "ld1rw { z17.s }, p4/Z, [%x[rescale_ptr]]\n"
- "mov z19.s, #0xff\n"
- "ld1rw { z16.s }, p4/Z, [%x[shift_ptr]]\n"
+ "ld1rw { z17.s }, p0/Z, [%x[rescale_ptr]]\n"
+ "ld1rw { z16.s }, p0/Z, [%x[shift_ptr]]\n"
".inst 0x04b175ef // sqdmulh z15.s, z15.s, z17.s\n"
".inst 0x04b175ce // sqdmulh z14.s, z14.s, z17.s\n"
".inst 0x04b175ad // sqdmulh z13.s, z13.s, z17.s\n"
".inst 0x04b1758c // sqdmulh z12.s, z12.s, z17.s\n"
- ".inst 0x4482920f // srshl z15.s, p4/M, z15.s, z16.s\n"
- ".inst 0x4482920e // srshl z14.s, p4/M, z14.s, z16.s\n"
- ".inst 0x4482920d // srshl z13.s, p4/M, z13.s, z16.s\n"
- ".inst 0x4482920c // srshl z12.s, p4/M, z12.s, z16.s\n"
- "smax z15.s, p4/M, z15.s, z20.s\n"
- "smax z14.s, p4/M, z14.s, z20.s\n"
- "smax z13.s, p4/M, z13.s, z20.s\n"
- "smax z12.s, p4/M, z12.s, z20.s\n"
- "smin z15.s, p4/M, z15.s, z19.s\n"
- "smin z14.s, p4/M, z14.s, z19.s\n"
- "smin z13.s, p4/M, z13.s, z19.s\n"
- "smin z12.s, p4/M, z12.s, z19.s\n"
+ ".inst 0x4482820f // srshl z15.s, p0/M, z15.s, z16.s\n"
+ ".inst 0x4482820e // srshl z14.s, p0/M, z14.s, z16.s\n"
+ ".inst 0x4482820d // srshl z13.s, p0/M, z13.s, z16.s\n"
+ ".inst 0x4482820c // srshl z12.s, p0/M, z12.s, z16.s\n"
+ "mov z17.s, #0x0\n"
+ "mov z16.s, #0xff\n"
+ "smax z15.s, p0/M, z15.s, z17.s\n"
+ "smax z14.s, p0/M, z14.s, z17.s\n"
+ "smax z13.s, p0/M, z13.s, z17.s\n"
+ "smax z12.s, p0/M, z12.s, z17.s\n"
+ "smin z15.s, p0/M, z15.s, z16.s\n"
+ "smin z14.s, p0/M, z14.s, z16.s\n"
"trn1 z17.h, z15.h, z14.h\n"
+ "smin z13.s, p0/M, z13.s, z16.s\n"
+ "smin z12.s, p0/M, z12.s, z16.s\n"
"trn1 z16.h, z13.h, z12.h\n"
"trn1 z16.b, z17.b, z16.b\n"
- "st1b { z16.b }, p3, [%x[outptr], x26]\n"
- "incb x26\n"
- "whilelt p3.b, x26, %x[n_channels]\n"
+ "st1b { z16.b }, p4, [%x[outptr], x27]\n"
+ "incb x27\n"
+ "whilelt p4.b, x27, %x[n_channels]\n"
"b.any 8b\n"
"14:" // End
-
:
: [inptrs] "r" (inptrs), [n_channels] "r" (n_channels), [n_valid_cells] "r" (n_valid_cells), [outptr] "r" (outptr), [rescale_ptr] "r" (&rescale_value), [shift_ptr] "r" (&shift_value)
- : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
} // namespace pooling
} // namespace arm_conv
-#endif // defined(__ARM_FEATURE_SVE) && defined(SVE2)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8_nhwc_max_2x2_s1_output2x2_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8_nhwc_max_2x2_s1_output2x2_depthfirst.hpp
index 06df1515ad..eae83b99fe 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8_nhwc_max_2x2_s1_output2x2_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8_nhwc_max_2x2_s1_output2x2_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,37 +24,28 @@
#pragma once
-#if defined(__ARM_FEATURE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
namespace arm_conv {
namespace pooling {
void sve_u8_nhwc_max_2x2_s1_output2x2_depthfirst_impl(unsigned int, const uint8_t *const *const, uint8_t *const *const, bool, unsigned int, unsigned int, unsigned int, unsigned int);
-struct sve_u8_nhwc_max_2x2_s1_output2x2_depthfirst
+struct sve_u8_nhwc_max_2x2_s1_output2x2_depthfirst : public DepthfirstStrategy<uint8_t, uint8_t>
{
- typedef uint8_t operand_type;
- typedef uint8_t return_type;
+ using Parent = DepthfirstStrategy<uint8_t, uint8_t>;
- typedef void (*kern_type)(unsigned int, const uint8_t *const *const, uint8_t *const *const, bool, unsigned int, unsigned int, unsigned int, unsigned int);
+ const static auto pooling_type = PoolingType::MAX;
+ const static auto pool_rows = 2u, pool_cols = 2u;
+ const static auto stride_rows = 1u, stride_cols = 1u;
- constexpr static PoolingType pooling_type(void) { return PoolingType::MAX; }
+ sve_u8_nhwc_max_2x2_s1_output2x2_depthfirst(const CPUInfo *)
+ : Parent(pool_rows, pool_cols, stride_rows, stride_cols, 2, 2) {}
- constexpr static unsigned int pool_rows(void) { return 2; }
- constexpr static unsigned int pool_cols(void) { return 2; }
-
- constexpr static unsigned int stride_rows(void) { return 1; }
- constexpr static unsigned int stride_cols(void) { return 1; }
-
- constexpr static unsigned int out_rows(void) { return 2; }
- constexpr static unsigned int out_cols(void) { return 2; }
-
- kern_type kernel = sve_u8_nhwc_max_2x2_s1_output2x2_depthfirst_impl;
-
- sve_u8_nhwc_max_2x2_s1_output2x2_depthfirst(const CPUInfo *) {}
+ Parent::KernelType get_kernel(void) const { return sve_u8_nhwc_max_2x2_s1_output2x2_depthfirst_impl; }
};
} // namespace pooling
} // namespace arm_conv
-#endif // defined(__ARM_FEATURE_SVE)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp
index e921f345d5..8612555bfb 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -26,7 +26,7 @@
#include <cstddef>
#include <cstdint>
-#if defined(__ARM_FEATURE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
namespace arm_conv {
namespace pooling {
@@ -63,84 +63,84 @@ void sve_u8_nhwc_max_2x2_s1_output2x2_depthfirst_impl(
pad_left, pad_top, pad_right, pad_bottom);
__asm__ __volatile__(
- "ldr x14, [%x[args], %[offsetof_n_channels]]\n"
+ "ldr x15, [%x[args], %[offsetof_n_channels]]\n"
+ "ldr x21, [%x[args], %[offsetof_outptrs]]\n"
+ "mov x14, #0x0\n"
+ "whilelt p0.b, x14, x15\n"
+ "ldr x20, [%x[args], %[offsetof_inptrs]]\n"
+ "ldp x13, x12, [x21, #0x0]\n"
"ptrue p2.b\n"
- "ldr x20, [%x[args], %[offsetof_outptrs]]\n"
- "mov x13, #0x0\n"
- "ldr x19, [%x[args], %[offsetof_inptrs]]\n"
- "mov x12, #0x0\n"
- "ldp x11, x10, [x20, #0x0]\n"
- "whilelt p1.b, x13, x14\n"
- "ldp x9, x28, [x20, #0x10]\n"
- "ldp x27, x26, [x19, #0x0]\n"
- "ldp x25, x24, [x19, #0x10]\n"
- "ldp x23, x22, [x19, #0x20]\n"
- "ldp x21, x20, [x19, #0x30]\n"
- "ldr x19, [x19, #0x40]\n"
- "ld1b { z31.b }, p1/Z, [x26, x13]\n"
- "ld1b { z30.b }, p1/Z, [x23, x13]\n"
- "ld1b { z29.b }, p1/Z, [x20, x13]\n"
- "ld1b { z28.b }, p1/Z, [x24, x13]\n"
- "ld1b { z27.b }, p1/Z, [x27, x13]\n"
- "ld1b { z26.b }, p1/Z, [x22, x13]\n"
- "ld1b { z25.b }, p1/Z, [x25, x13]\n"
- "ld1b { z24.b }, p1/Z, [x21, x13]\n"
- "ld1b { z23.b }, p1/Z, [x19, x13]\n"
- "incw x13\n"
- "whilelt p1.b, x13, x14\n"
+ "mov x11, #0x0\n"
+ "ldp x10, x9, [x21, #0x10]\n"
+ "ldp x28, x27, [x20, #0x0]\n"
+ "ldp x26, x25, [x20, #0x10]\n"
+ "ldp x24, x23, [x20, #0x20]\n"
+ "ldp x22, x21, [x20, #0x30]\n"
+ "ldr x20, [x20, #0x40]\n"
+ "ld1b { z31.b }, p0/Z, [x27, x14]\n"
+ "ld1b { z30.b }, p0/Z, [x24, x14]\n"
+ "ld1b { z29.b }, p0/Z, [x21, x14]\n"
+ "ld1b { z28.b }, p0/Z, [x25, x14]\n"
+ "ld1b { z27.b }, p0/Z, [x28, x14]\n"
+ "ld1b { z26.b }, p0/Z, [x26, x14]\n"
+ "ld1b { z25.b }, p0/Z, [x23, x14]\n"
+ "ld1b { z24.b }, p0/Z, [x22, x14]\n"
+ "ld1b { z23.b }, p0/Z, [x20, x14]\n"
+ "incw x14\n"
+ "whilelt p1.b, x14, x15\n"
"b.none 2f\n"
"1:" // Vector: Loop
"movprfx z22, z31\n umax z22.b, p2/M, z22.b, z30.b\n"
- "ld1b { z31.b }, p1/Z, [x26, x13]\n"
- "whilelt p0.b, x12, x14\n"
"movprfx z21, z30\n umax z21.b, p2/M, z21.b, z29.b\n"
- "ld1b { z30.b }, p1/Z, [x23, x13]\n"
- "movprfx z18, z28\n umax z18.b, p2/M, z18.b, z27.b\n"
- "ld1b { z29.b }, p1/Z, [x20, x13]\n"
- "movprfx z17, z26\n umax z17.b, p2/M, z17.b, z25.b\n"
- "ld1b { z27.b }, p1/Z, [x27, x13]\n"
- "movprfx z16, z24\n umax z16.b, p2/M, z16.b, z28.b\n"
- "ld1b { z28.b }, p1/Z, [x24, x13]\n"
- "movprfx z20, z26\n umax z20.b, p2/M, z20.b, z23.b\n"
- "ld1b { z26.b }, p1/Z, [x22, x13]\n"
- "movprfx z19, z22\n umax z19.b, p2/M, z19.b, z18.b\n"
- "ld1b { z25.b }, p1/Z, [x25, x13]\n"
- "movprfx z18, z22\n umax z18.b, p2/M, z18.b, z17.b\n"
- "ld1b { z24.b }, p1/Z, [x21, x13]\n"
- "movprfx z17, z21\n umax z17.b, p2/M, z17.b, z16.b\n"
- "ld1b { z23.b }, p1/Z, [x19, x13]\n"
- "incw x13\n"
- "movprfx z16, z21\n umax z16.b, p2/M, z16.b, z20.b\n"
- "st1b { z19.b }, p0, [x11, x12]\n"
- "whilelt p1.b, x13, x14\n"
- "st1b { z18.b }, p0, [x10, x12]\n"
- "st1b { z17.b }, p0, [x9, x12]\n"
- "st1b { z16.b }, p0, [x28, x12]\n"
- "incw x12\n"
+ "ld1b { z31.b }, p1/Z, [x27, x14]\n"
+ "ld1b { z30.b }, p1/Z, [x24, x14]\n"
+ "movprfx z20, z28\n umax z20.b, p2/M, z20.b, z27.b\n"
+ "movprfx z19, z26\n umax z19.b, p2/M, z19.b, z25.b\n"
+ "ld1b { z29.b }, p1/Z, [x21, x14]\n"
+ "ld1b { z27.b }, p1/Z, [x28, x14]\n"
+ "movprfx z17, z28\n umax z17.b, p2/M, z17.b, z24.b\n"
+ "movprfx z18, z25\n umax z18.b, p2/M, z18.b, z23.b\n"
+ "ld1b { z28.b }, p1/Z, [x25, x14]\n"
+ "ld1b { z26.b }, p1/Z, [x26, x14]\n"
+ "ld1b { z25.b }, p1/Z, [x23, x14]\n"
+ "ld1b { z24.b }, p1/Z, [x22, x14]\n"
+ "whilelt p0.b, x11, x15\n"
+ "movprfx z16, z22\n umax z16.b, p2/M, z16.b, z20.b\n"
+ "ld1b { z23.b }, p1/Z, [x20, x14]\n"
+ "incw x14\n"
+ "whilelt p1.b, x14, x15\n"
+ "st1b { z16.b }, p0, [x13, x11]\n"
+ "movprfx z16, z19\n umax z16.b, p2/M, z16.b, z22.b\n"
+ "umax z17.b, p2/M, z17.b, z21.b\n"
+ "st1b { z16.b }, p0, [x12, x11]\n"
+ "movprfx z16, z21\n umax z16.b, p2/M, z16.b, z18.b\n"
+ "st1b { z17.b }, p0, [x10, x11]\n"
+ "st1b { z16.b }, p0, [x9, x11]\n"
+ "incw x11\n"
"b.any 1b\n"
"2:" // Vector: Tail
"movprfx z22, z31\n umax z22.b, p2/M, z22.b, z30.b\n"
- "whilelt p0.b, x12, x14\n"
"movprfx z21, z30\n umax z21.b, p2/M, z21.b, z29.b\n"
- "movprfx z18, z28\n umax z18.b, p2/M, z18.b, z27.b\n"
- "movprfx z17, z26\n umax z17.b, p2/M, z17.b, z25.b\n"
- "movprfx z16, z24\n umax z16.b, p2/M, z16.b, z28.b\n"
- "movprfx z20, z26\n umax z20.b, p2/M, z20.b, z23.b\n"
- "movprfx z19, z22\n umax z19.b, p2/M, z19.b, z18.b\n"
- "st1b { z19.b }, p0, [x11, x12]\n"
- "movprfx z18, z22\n umax z18.b, p2/M, z18.b, z17.b\n"
- "movprfx z17, z21\n umax z17.b, p2/M, z17.b, z16.b\n"
- "st1b { z18.b }, p0, [x10, x12]\n"
- "movprfx z16, z21\n umax z16.b, p2/M, z16.b, z20.b\n"
- "st1b { z17.b }, p0, [x9, x12]\n"
- "st1b { z16.b }, p0, [x28, x12]\n"
+ "movprfx z20, z28\n umax z20.b, p2/M, z20.b, z27.b\n"
+ "movprfx z19, z26\n umax z19.b, p2/M, z19.b, z25.b\n"
+ "movprfx z17, z28\n umax z17.b, p2/M, z17.b, z24.b\n"
+ "movprfx z18, z25\n umax z18.b, p2/M, z18.b, z23.b\n"
+ "whilelt p0.b, x11, x15\n"
+ "movprfx z16, z22\n umax z16.b, p2/M, z16.b, z20.b\n"
+ "st1b { z16.b }, p0, [x13, x11]\n"
+ "movprfx z16, z19\n umax z16.b, p2/M, z16.b, z22.b\n"
+ "umax z17.b, p2/M, z17.b, z21.b\n"
+ "st1b { z16.b }, p0, [x12, x11]\n"
+ "movprfx z16, z21\n umax z16.b, p2/M, z16.b, z18.b\n"
+ "st1b { z17.b }, p0, [x10, x11]\n"
+ "st1b { z16.b }, p0, [x9, x11]\n"
:
: [args] "r" (&args), [offsetof_inptrs] "I" (offsetof(KernelArgs, inptrs)), [offsetof_n_channels] "I" (offsetof(KernelArgs, n_channels)), [offsetof_outptrs] "I" (offsetof(KernelArgs, outptrs))
- : "cc", "memory", "p0", "p1", "p2", "x9", "x10", "x11", "x12", "x13", "x14", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
} // namespace pooling
} // namespace arm_conv
-#endif // defined(__ARM_FEATURE_SVE)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8_nhwc_max_generic_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8_nhwc_max_generic_depthfirst.hpp
index 59cd4b9c78..9f3c3a435d 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8_nhwc_max_generic_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8_nhwc_max_generic_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -26,29 +26,21 @@
#pragma once
-#if defined(__ARM_FEATURE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
namespace arm_conv {
namespace pooling {
void sve_u8_nhwc_max_generic_depthfirst_impl(const uint64_t, const uint64_t n_valid_cells, uint64_t n_channels, const uint8_t *const *const inptrs, uint8_t *outptr);
-struct sve_u8_nhwc_max_generic_depthfirst
+struct sve_u8_nhwc_max_generic_depthfirst : IGenericDepthfirstStrategy<uint8_t, uint8_t>
{
- typedef uint8_t operand_type;
- typedef uint8_t return_type;
-
- typedef void (*kern_type)(const uint64_t, const uint64_t n_valid_cells, uint64_t n_channels, const uint8_t *const *const inptrs, uint8_t *outptr);
-
- constexpr static PoolingType pooling_type(void) { return PoolingType::MAX; }
-
-
- kern_type kernel = sve_u8_nhwc_max_generic_depthfirst_impl;
-
+ using Parent = IGenericDepthfirstStrategy<uint8_t, uint8_t>;
sve_u8_nhwc_max_generic_depthfirst(const CPUInfo *) {}
+ typename Parent::KernelType get_kernel(void) const override { return sve_u8_nhwc_max_generic_depthfirst_impl; }
};
} // namespace pooling
} // namespace arm_conv
-#endif // defined(__ARM_FEATURE_SVE)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8_nhwc_max_generic_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8_nhwc_max_generic_depthfirst/generic.cpp
index 164847480b..be0eb398ae 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8_nhwc_max_generic_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8_nhwc_max_generic_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,8 +23,9 @@
*/
#include <cstdint>
+#include <cstddef>
-#if defined(__ARM_FEATURE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
namespace arm_conv {
namespace pooling {
@@ -39,185 +40,184 @@ void sve_u8_nhwc_max_generic_depthfirst_impl(
)
{
__asm__ __volatile__(
- "ptrue p4.b\n"
- "mov x28, #0x0\n"
- "cntb x27\n"
- "cntb x26, ALL, MUL #2\n"
- "cntb x25, ALL, MUL #3\n"
+ "mov x9, #0x0\n"
+ "cntb x28\n"
+ "cntb x27, ALL, MUL #2\n"
+ "cntb x26, ALL, MUL #3\n"
+ "whilelt p4.b, x9, %x[n_channels]\n"
"whilelt p3.b, x28, %x[n_channels]\n"
"whilelt p2.b, x27, %x[n_channels]\n"
"whilelt p1.b, x26, %x[n_channels]\n"
- "whilelt p0.b, x25, %x[n_channels]\n"
+ "ptrue p0.b\n"
"b.none 7f\n"
"1:" // 4-vectors of channels
+ "lsr x25, %x[n_valid_cells], #0x2\n"
+ "mov z8.b, #0x0\n"
"mov z7.b, #0x0\n"
- "mov x19, %x[inptrs]\n"
+ "mov x24, %x[inptrs]\n"
"mov z6.b, #0x0\n"
- "lsr x24, %x[n_valid_cells], #0x2\n"
"mov z5.b, #0x0\n"
- "mov z4.b, #0x0\n"
- "cbz x24, 4f\n"
- "ldp x23, x22, [x19, #0x0]\n"
- "ldp x21, x20, [x19, #0x10]\n"
- "add x19, x19, #0x20\n"
- "subs x24, x24, #0x1\n"
- "ld1b { z3.b }, p3/Z, [x23, x28]\n"
- "ld1b { z2.b }, p3/Z, [x22, x28]\n"
- "ld1b { z1.b }, p3/Z, [x21, x28]\n"
- "ld1b { z0.b }, p3/Z, [x20, x28]\n"
- "ld1b { z31.b }, p2/Z, [x23, x27]\n"
- "ld1b { z30.b }, p2/Z, [x22, x27]\n"
- "ld1b { z22.b }, p2/Z, [x21, x27]\n"
- "ld1b { z29.b }, p2/Z, [x20, x27]\n"
- "ld1b { z28.b }, p1/Z, [x23, x26]\n"
- "ld1b { z27.b }, p1/Z, [x22, x26]\n"
- "ld1b { z21.b }, p1/Z, [x21, x26]\n"
- "ld1b { z26.b }, p1/Z, [x20, x26]\n"
- "ld1b { z16.b }, p0/Z, [x23, x25]\n"
- "ld1b { z25.b }, p0/Z, [x22, x25]\n"
- "ld1b { z20.b }, p0/Z, [x21, x25]\n"
- "ld1b { z24.b }, p0/Z, [x20, x25]\n"
+ "cbz x25, 4f\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "subs x25, x25, #0x1\n"
+ "add x24, x24, #0x20\n"
+ "ld1b { z4.b }, p4/Z, [x23, x9]\n"
+ "ld1b { z3.b }, p4/Z, [x22, x9]\n"
+ "ld1b { z2.b }, p4/Z, [x21, x9]\n"
+ "ld1b { z1.b }, p4/Z, [x20, x9]\n"
+ "ld1b { z0.b }, p3/Z, [x23, x28]\n"
+ "ld1b { z31.b }, p3/Z, [x22, x28]\n"
+ "ld1b { z22.b }, p3/Z, [x21, x28]\n"
+ "ld1b { z30.b }, p3/Z, [x20, x28]\n"
+ "ld1b { z29.b }, p2/Z, [x23, x27]\n"
+ "ld1b { z28.b }, p2/Z, [x22, x27]\n"
+ "ld1b { z21.b }, p2/Z, [x21, x27]\n"
+ "ld1b { z27.b }, p2/Z, [x20, x27]\n"
+ "ld1b { z26.b }, p1/Z, [x23, x26]\n"
+ "ld1b { z25.b }, p1/Z, [x22, x26]\n"
+ "ld1b { z20.b }, p1/Z, [x21, x26]\n"
+ "ld1b { z24.b }, p1/Z, [x20, x26]\n"
"beq 3f\n"
"2:" // 4-vectors of channels: 4 inputs loop
- "movprfx z19, z3\n umax z19.b, p4/M, z19.b, z2.b\n"
- "ldp x23, x22, [x19, #0x0]\n"
- "subs x24, x24, #0x1\n"
- "movprfx z23, z1\n umax z23.b, p4/M, z23.b, z0.b\n"
- "ldp x21, x20, [x19, #0x10]\n"
- "add x19, x19, #0x20\n"
- "movprfx z18, z31\n umax z18.b, p4/M, z18.b, z30.b\n"
- "ld1b { z3.b }, p3/Z, [x23, x28]\n"
- "umax z22.b, p4/M, z22.b, z29.b\n"
- "movprfx z17, z28\n umax z17.b, p4/M, z17.b, z27.b\n"
- "ld1b { z2.b }, p3/Z, [x22, x28]\n"
- "umax z21.b, p4/M, z21.b, z26.b\n"
- "ld1b { z1.b }, p3/Z, [x21, x28]\n"
- "umax z16.b, p4/M, z16.b, z25.b\n"
- "ld1b { z0.b }, p3/Z, [x20, x28]\n"
- "umax z20.b, p4/M, z20.b, z24.b\n"
- "ld1b { z31.b }, p2/Z, [x23, x27]\n"
- "umax z19.b, p4/M, z19.b, z23.b\n"
- "ld1b { z30.b }, p2/Z, [x22, x27]\n"
- "umax z18.b, p4/M, z18.b, z22.b\n"
- "ld1b { z22.b }, p2/Z, [x21, x27]\n"
- "umax z17.b, p4/M, z17.b, z21.b\n"
- "ld1b { z29.b }, p2/Z, [x20, x27]\n"
- "umax z16.b, p4/M, z16.b, z20.b\n"
- "ld1b { z28.b }, p1/Z, [x23, x26]\n"
- "umax z7.b, p4/M, z7.b, z19.b\n"
- "ld1b { z27.b }, p1/Z, [x22, x26]\n"
- "umax z6.b, p4/M, z6.b, z18.b\n"
- "ld1b { z21.b }, p1/Z, [x21, x26]\n"
- "umax z5.b, p4/M, z5.b, z17.b\n"
- "ld1b { z26.b }, p1/Z, [x20, x26]\n"
- "umax z4.b, p4/M, z4.b, z16.b\n"
- "ld1b { z16.b }, p0/Z, [x23, x25]\n"
- "ld1b { z25.b }, p0/Z, [x22, x25]\n"
- "ld1b { z20.b }, p0/Z, [x21, x25]\n"
- "ld1b { z24.b }, p0/Z, [x20, x25]\n"
+ "movprfx z19, z4\n umax z19.b, p0/M, z19.b, z3.b\n"
+ "movprfx z23, z2\n umax z23.b, p0/M, z23.b, z1.b\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "movprfx z18, z0\n umax z18.b, p0/M, z18.b, z31.b\n"
+ "umax z22.b, p0/M, z22.b, z30.b\n"
+ "ld1b { z4.b }, p4/Z, [x23, x9]\n"
+ "ld1b { z3.b }, p4/Z, [x22, x9]\n"
+ "movprfx z17, z29\n umax z17.b, p0/M, z17.b, z28.b\n"
+ "umax z21.b, p0/M, z21.b, z27.b\n"
+ "ld1b { z2.b }, p4/Z, [x21, x9]\n"
+ "ld1b { z1.b }, p4/Z, [x20, x9]\n"
+ "movprfx z16, z26\n umax z16.b, p0/M, z16.b, z25.b\n"
+ "umax z20.b, p0/M, z20.b, z24.b\n"
+ "ld1b { z0.b }, p3/Z, [x23, x28]\n"
+ "ld1b { z31.b }, p3/Z, [x22, x28]\n"
+ "umax z19.b, p0/M, z19.b, z23.b\n"
+ "umax z18.b, p0/M, z18.b, z22.b\n"
+ "ld1b { z22.b }, p3/Z, [x21, x28]\n"
+ "ld1b { z30.b }, p3/Z, [x20, x28]\n"
+ "umax z17.b, p0/M, z17.b, z21.b\n"
+ "umax z16.b, p0/M, z16.b, z20.b\n"
+ "ld1b { z29.b }, p2/Z, [x23, x27]\n"
+ "ld1b { z28.b }, p2/Z, [x22, x27]\n"
+ "subs x25, x25, #0x1\n"
+ "umax z8.b, p0/M, z8.b, z19.b\n"
+ "ld1b { z21.b }, p2/Z, [x21, x27]\n"
+ "ld1b { z27.b }, p2/Z, [x20, x27]\n"
+ "umax z7.b, p0/M, z7.b, z18.b\n"
+ "umax z6.b, p0/M, z6.b, z17.b\n"
+ "ld1b { z26.b }, p1/Z, [x23, x26]\n"
+ "ld1b { z25.b }, p1/Z, [x22, x26]\n"
+ "umax z5.b, p0/M, z5.b, z16.b\n"
+ "add x24, x24, #0x20\n"
+ "ld1b { z20.b }, p1/Z, [x21, x26]\n"
+ "ld1b { z24.b }, p1/Z, [x20, x26]\n"
"bgt 2b\n"
"3:" // 4-vectors of channels: 4 inputs tail
- "movprfx z19, z3\n umax z19.b, p4/M, z19.b, z2.b\n"
- "movprfx z23, z1\n umax z23.b, p4/M, z23.b, z0.b\n"
- "movprfx z18, z31\n umax z18.b, p4/M, z18.b, z30.b\n"
- "umax z22.b, p4/M, z22.b, z29.b\n"
- "movprfx z17, z28\n umax z17.b, p4/M, z17.b, z27.b\n"
- "umax z21.b, p4/M, z21.b, z26.b\n"
- "umax z16.b, p4/M, z16.b, z25.b\n"
- "umax z20.b, p4/M, z20.b, z24.b\n"
- "umax z19.b, p4/M, z19.b, z23.b\n"
- "umax z18.b, p4/M, z18.b, z22.b\n"
- "umax z17.b, p4/M, z17.b, z21.b\n"
- "umax z16.b, p4/M, z16.b, z20.b\n"
- "umax z7.b, p4/M, z7.b, z19.b\n"
- "umax z6.b, p4/M, z6.b, z18.b\n"
- "umax z5.b, p4/M, z5.b, z17.b\n"
- "umax z4.b, p4/M, z4.b, z16.b\n"
+ "movprfx z19, z4\n umax z19.b, p0/M, z19.b, z3.b\n"
+ "movprfx z23, z2\n umax z23.b, p0/M, z23.b, z1.b\n"
+ "movprfx z18, z0\n umax z18.b, p0/M, z18.b, z31.b\n"
+ "umax z22.b, p0/M, z22.b, z30.b\n"
+ "movprfx z17, z29\n umax z17.b, p0/M, z17.b, z28.b\n"
+ "umax z21.b, p0/M, z21.b, z27.b\n"
+ "movprfx z16, z26\n umax z16.b, p0/M, z16.b, z25.b\n"
+ "umax z20.b, p0/M, z20.b, z24.b\n"
+ "umax z19.b, p0/M, z19.b, z23.b\n"
+ "umax z18.b, p0/M, z18.b, z22.b\n"
+ "umax z17.b, p0/M, z17.b, z21.b\n"
+ "umax z16.b, p0/M, z16.b, z20.b\n"
+ "umax z8.b, p0/M, z8.b, z19.b\n"
+ "umax z7.b, p0/M, z7.b, z18.b\n"
+ "umax z6.b, p0/M, z6.b, z17.b\n"
+ "umax z5.b, p0/M, z5.b, z16.b\n"
"4:" // 4-vectors of channels: After loop
- "ands x20, %x[n_valid_cells], #0x3\n"
+ "ands x21, %x[n_valid_cells], #0x3\n"
"beq 6f\n"
"5:" // 4-vectors of channels: Single input loop
- "ldr x23, [x19], #0x8\n"
- "subs x20, x20, #0x1\n"
- "ld1b { z3.b }, p3/Z, [x23, x28]\n"
- "umax z7.b, p4/M, z7.b, z3.b\n"
- "ld1b { z31.b }, p2/Z, [x23, x27]\n"
- "ld1b { z28.b }, p1/Z, [x23, x26]\n"
- "umax z6.b, p4/M, z6.b, z31.b\n"
- "ld1b { z16.b }, p0/Z, [x23, x25]\n"
- "umax z5.b, p4/M, z5.b, z28.b\n"
- "umax z4.b, p4/M, z4.b, z16.b\n"
+ "ldr x20, [x24], #0x8\n"
+ "ld1b { z16.b }, p4/Z, [x20, x9]\n"
+ "subs x21, x21, #0x1\n"
+ "umax z8.b, p0/M, z8.b, z16.b\n"
+ "ld1b { z17.b }, p3/Z, [x20, x28]\n"
+ "ld1b { z16.b }, p2/Z, [x20, x27]\n"
+ "umax z7.b, p0/M, z7.b, z17.b\n"
+ "umax z6.b, p0/M, z6.b, z16.b\n"
+ "ld1b { z16.b }, p1/Z, [x20, x26]\n"
+ "umax z5.b, p0/M, z5.b, z16.b\n"
"bgt 5b\n"
"6:" // 4-vectors of channels: Single input loop: End
+ "st1b { z8.b }, p4, [%x[outptr], x9]\n"
+ "incb x9, ALL, MUL #4\n"
"st1b { z7.b }, p3, [%x[outptr], x28]\n"
"incb x28, ALL, MUL #4\n"
"st1b { z6.b }, p2, [%x[outptr], x27]\n"
"incb x27, ALL, MUL #4\n"
"st1b { z5.b }, p1, [%x[outptr], x26]\n"
"incb x26, ALL, MUL #4\n"
- "st1b { z4.b }, p0, [%x[outptr], x25]\n"
- "incb x25, ALL, MUL #4\n"
- "whilelt p0.b, x25, %x[n_channels]\n"
+ "whilelt p1.b, x26, %x[n_channels]\n"
"b.any 1b\n"
"7:" // Single vector of channels
- "whilelt p3.b, x28, %x[n_channels]\n"
+ "whilelt p4.b, x9, %x[n_channels]\n"
"b.none 14f\n"
"8:" // Single vector of channels: Loop
- "mov z7.b, #0x0\n"
- "mov x19, %x[inptrs]\n"
- "lsr x24, %x[n_valid_cells], #0x2\n"
- "cbz x24, 11f\n"
- "ldp x23, x22, [x19, #0x0]\n"
- "ldp x21, x20, [x19, #0x10]\n"
- "add x19, x19, #0x20\n"
- "subs x24, x24, #0x1\n"
- "ld1b { z3.b }, p3/Z, [x23, x28]\n"
- "ld1b { z2.b }, p3/Z, [x22, x28]\n"
- "ld1b { z1.b }, p3/Z, [x21, x28]\n"
- "ld1b { z0.b }, p3/Z, [x20, x28]\n"
+ "lsr x25, %x[n_valid_cells], #0x2\n"
+ "mov z8.b, #0x0\n"
+ "mov x24, %x[inptrs]\n"
+ "cbz x25, 11f\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "subs x25, x25, #0x1\n"
+ "add x24, x24, #0x20\n"
+ "ld1b { z4.b }, p4/Z, [x23, x9]\n"
+ "ld1b { z3.b }, p4/Z, [x22, x9]\n"
+ "ld1b { z2.b }, p4/Z, [x21, x9]\n"
+ "ld1b { z1.b }, p4/Z, [x20, x9]\n"
"beq 10f\n"
"9:" // Single vector of channels: Loop: 4 inputs loop
- "movprfx z19, z3\n umax z19.b, p4/M, z19.b, z2.b\n"
- "ldp x23, x22, [x19, #0x0]\n"
- "subs x24, x24, #0x1\n"
- "movprfx z23, z1\n umax z23.b, p4/M, z23.b, z0.b\n"
- "ldp x21, x20, [x19, #0x10]\n"
- "add x19, x19, #0x20\n"
- "umax z19.b, p4/M, z19.b, z23.b\n"
- "ld1b { z3.b }, p3/Z, [x23, x28]\n"
- "ld1b { z2.b }, p3/Z, [x22, x28]\n"
- "umax z7.b, p4/M, z7.b, z19.b\n"
- "ld1b { z1.b }, p3/Z, [x21, x28]\n"
- "ld1b { z0.b }, p3/Z, [x20, x28]\n"
+ "movprfx z16, z4\n umax z16.b, p0/M, z16.b, z3.b\n"
+ "movprfx z17, z2\n umax z17.b, p0/M, z17.b, z1.b\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "umax z16.b, p0/M, z16.b, z17.b\n"
+ "subs x25, x25, #0x1\n"
+ "ld1b { z4.b }, p4/Z, [x23, x9]\n"
+ "ld1b { z3.b }, p4/Z, [x22, x9]\n"
+ "umax z8.b, p0/M, z8.b, z16.b\n"
+ "add x24, x24, #0x20\n"
+ "ld1b { z2.b }, p4/Z, [x21, x9]\n"
+ "ld1b { z1.b }, p4/Z, [x20, x9]\n"
"bgt 9b\n"
"10:" // Single vector of channels: Loop: 4 inputs tail
- "movprfx z19, z3\n umax z19.b, p4/M, z19.b, z2.b\n"
- "movprfx z23, z1\n umax z23.b, p4/M, z23.b, z0.b\n"
- "umax z19.b, p4/M, z19.b, z23.b\n"
- "umax z7.b, p4/M, z7.b, z19.b\n"
+ "movprfx z16, z4\n umax z16.b, p0/M, z16.b, z3.b\n"
+ "movprfx z17, z2\n umax z17.b, p0/M, z17.b, z1.b\n"
+ "umax z16.b, p0/M, z16.b, z17.b\n"
+ "umax z8.b, p0/M, z8.b, z16.b\n"
"11:" // Single vector of channels: Loop: After loop
- "ands x20, %x[n_valid_cells], #0x3\n"
+ "ands x21, %x[n_valid_cells], #0x3\n"
"beq 13f\n"
"12:" // Single vector of channels: Loop: Single input loop
- "ldr x23, [x19], #0x8\n"
- "subs x20, x20, #0x1\n"
- "ld1b { z3.b }, p3/Z, [x23, x28]\n"
- "umax z7.b, p4/M, z7.b, z3.b\n"
+ "ldr x20, [x24], #0x8\n"
+ "ld1b { z16.b }, p4/Z, [x20, x9]\n"
+ "subs x21, x21, #0x1\n"
+ "umax z8.b, p0/M, z8.b, z16.b\n"
"bgt 12b\n"
"13:" // Single vector of channels: Loop: Single input loop: End
- "st1b { z7.b }, p3, [%x[outptr], x28]\n"
- "incb x28\n"
- "whilelt p3.b, x28, %x[n_channels]\n"
+ "st1b { z8.b }, p4, [%x[outptr], x9]\n"
+ "incb x9\n"
+ "whilelt p4.b, x9, %x[n_channels]\n"
"b.any 8b\n"
"14:" // End
-
:
: [inptrs] "r" (inptrs), [n_channels] "r" (n_channels), [n_valid_cells] "r" (n_valid_cells), [outptr] "r" (outptr)
- : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x9", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
} // namespace pooling
} // namespace arm_conv
-#endif // defined(__ARM_FEATURE_SVE)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8q_nhwc_avg_generic_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8q_nhwc_avg_generic_depthfirst.hpp
index f6fc1a58c1..f9d25a1b45 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8q_nhwc_avg_generic_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8q_nhwc_avg_generic_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -26,29 +26,21 @@
#pragma once
-#if defined(__ARM_FEATURE_SVE) && defined(SVE2)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
namespace arm_conv {
namespace pooling {
void sve_u8q_nhwc_avg_generic_depthfirst_impl(const uint64_t window_cells, const uint64_t n_valid_cells, uint64_t n_channels, const uint8_t *const *const inptrs, uint8_t *outptr, const Requantize32 &qp);
-struct sve_u8q_nhwc_avg_generic_depthfirst
+struct sve_u8q_nhwc_avg_generic_depthfirst : IGenericDepthfirstStrategy<uint8_t, uint8_t, Requantize32>
{
- typedef uint8_t operand_type;
- typedef uint8_t return_type;
-
- typedef void (*kern_type)(const uint64_t window_cells, const uint64_t n_valid_cells, uint64_t n_channels, const uint8_t *const *const inptrs, uint8_t *outptr, const Requantize32 &qp);
-
- constexpr static PoolingType pooling_type(void) { return PoolingType::AVERAGE; }
-
-
- kern_type kernel = sve_u8q_nhwc_avg_generic_depthfirst_impl;
-
+ using Parent = IGenericDepthfirstStrategy<uint8_t, uint8_t, Requantize32>;
sve_u8q_nhwc_avg_generic_depthfirst(const CPUInfo *) {}
+ typename Parent::KernelType get_kernel(void) const override { return sve_u8q_nhwc_avg_generic_depthfirst_impl; }
};
} // namespace pooling
} // namespace arm_conv
-#endif // defined(__ARM_FEATURE_SVE) && defined(SVE2)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8q_nhwc_avg_generic_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8q_nhwc_avg_generic_depthfirst/generic.cpp
index 373848ad2b..e8339a2cd9 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8q_nhwc_avg_generic_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8q_nhwc_avg_generic_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,11 +24,12 @@
#include "pooling.hpp"
#include <cstdint>
+#include <cstddef>
#include <cstring>
#include <cmath>
-#if defined(__ARM_FEATURE_SVE) && defined(SVE2)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
namespace arm_conv {
namespace pooling {
@@ -86,12 +87,13 @@ void sve_u8q_nhwc_avg_generic_depthfirst_impl(
f_rescale_value *= 2.0f;
}
- rescale_value = static_cast<int32_t>(round(f_rescale_value * static_cast<float>(1ll << 31)));
- if (static_cast<int64_t>(rescale_value) == (1ll << 31))
+ int64_t long_rescale_value = round(f_rescale_value * static_cast<float>(1ll << 31));
+ if (long_rescale_value == (1ll << 31))
{
shift_value++;
- rescale_value >>= 1;
+ long_rescale_value >>= 1;
}
+ rescale_value = static_cast<int32_t>(long_rescale_value);
}
@@ -117,24 +119,24 @@ void sve_u8q_nhwc_avg_generic_depthfirst_impl(
);
__asm__ __volatile__(
- "ptrue p4.b\n"
- "mov x26, #0x0\n"
- "cntb x25\n"
- "cntb x24, ALL, MUL #2\n"
- "cntb x23, ALL, MUL #3\n"
+ "mov x27, #0x0\n"
+ "cntb x26\n"
+ "cntb x25, ALL, MUL #2\n"
+ "cntb x24, ALL, MUL #3\n"
+ "whilelt p4.b, x27, %x[n_channels]\n"
"whilelt p3.b, x26, %x[n_channels]\n"
"whilelt p2.b, x25, %x[n_channels]\n"
"whilelt p1.b, x24, %x[n_channels]\n"
- "whilelt p0.b, x23, %x[n_channels]\n"
+ "ptrue p0.b\n"
"b.none 7f\n"
"1:" // 4-vectors of channels
- "ld1rw { z15.s }, p4/Z, [%x[accumulator_init]]\n"
+ "ld1rw { z15.s }, p0/Z, [%x[accumulator_init]]\n"
+ "lsr x23, %x[n_valid_cells], #0x1\n"
"mov z14.d, z15.d\n"
- "mov x19, %x[inptrs]\n"
"mov z13.d, z15.d\n"
- "lsr x22, %x[n_valid_cells], #0x1\n"
"mov z12.d, z15.d\n"
"mov z11.d, z15.d\n"
+ "mov x22, %x[inptrs]\n"
"mov z10.d, z15.d\n"
"mov z9.d, z15.d\n"
"mov z8.d, z15.d\n"
@@ -146,43 +148,43 @@ void sve_u8q_nhwc_avg_generic_depthfirst_impl(
"mov z2.d, z15.d\n"
"mov z1.d, z15.d\n"
"mov z0.d, z15.d\n"
- "cbz x22, 4f\n"
- "ldp x21, x20, [x19, #0x0]\n"
- "ld1b { z31.b }, p3/Z, [x21, x26]\n"
- "add x19, x19, #0x10\n"
- "ld1b { z30.b }, p3/Z, [x20, x26]\n"
- "subs x22, x22, #0x1\n"
- "ld1b { z29.b }, p2/Z, [x21, x25]\n"
- "ld1b { z28.b }, p2/Z, [x20, x25]\n"
- "ld1b { z27.b }, p1/Z, [x21, x24]\n"
- "ld1b { z26.b }, p1/Z, [x20, x24]\n"
- "ld1b { z25.b }, p0/Z, [x21, x23]\n"
- "ld1b { z24.b }, p0/Z, [x20, x23]\n"
+ "cbz x23, 4f\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "subs x23, x23, #0x1\n"
+ "add x22, x22, #0x10\n"
+ "ld1b { z31.b }, p4/Z, [x21, x27]\n"
+ "ld1b { z30.b }, p4/Z, [x20, x27]\n"
+ "ld1b { z29.b }, p3/Z, [x21, x26]\n"
+ "ld1b { z28.b }, p3/Z, [x20, x26]\n"
+ "ld1b { z27.b }, p2/Z, [x21, x25]\n"
+ "ld1b { z26.b }, p2/Z, [x20, x25]\n"
+ "ld1b { z25.b }, p1/Z, [x21, x24]\n"
+ "ld1b { z24.b }, p1/Z, [x20, x24]\n"
"beq 3f\n"
"2:" // 4-vectors of channels: 2 inputs loop
".inst 0x455e0bf7 // uaddlb z23.h, z31.b, z30.b\n"
- "ldp x21, x20, [x19, #0x0]\n"
- "add x19, x19, #0x10\n"
".inst 0x455e0ff6 // uaddlt z22.h, z31.b, z30.b\n"
- "ld1b { z31.b }, p3/Z, [x21, x26]\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "subs x23, x23, #0x1\n"
".inst 0x455c0bb5 // uaddlb z21.h, z29.b, z28.b\n"
- "subs x22, x22, #0x1\n"
".inst 0x455c0fb4 // uaddlt z20.h, z29.b, z28.b\n"
- "ld1b { z30.b }, p3/Z, [x20, x26]\n"
+ "add x22, x22, #0x10\n"
+ "ld1b { z31.b }, p4/Z, [x21, x27]\n"
".inst 0x455a0b73 // uaddlb z19.h, z27.b, z26.b\n"
- "ld1b { z29.b }, p2/Z, [x21, x25]\n"
".inst 0x455a0f72 // uaddlt z18.h, z27.b, z26.b\n"
- "ld1b { z28.b }, p2/Z, [x20, x25]\n"
+ "ld1b { z30.b }, p4/Z, [x20, x27]\n"
+ "ld1b { z29.b }, p3/Z, [x21, x26]\n"
".inst 0x45580b31 // uaddlb z17.h, z25.b, z24.b\n"
- "ld1b { z27.b }, p1/Z, [x21, x24]\n"
".inst 0x45580f30 // uaddlt z16.h, z25.b, z24.b\n"
- "ld1b { z26.b }, p1/Z, [x20, x24]\n"
+ "ld1b { z28.b }, p3/Z, [x20, x26]\n"
+ "ld1b { z27.b }, p2/Z, [x21, x25]\n"
".inst 0x459749ef // uaddwb z15.s, z15.s, z23.h\n"
- "ld1b { z25.b }, p0/Z, [x21, x23]\n"
".inst 0x45974dce // uaddwt z14.s, z14.s, z23.h\n"
- "ld1b { z24.b }, p0/Z, [x20, x23]\n"
+ "ld1b { z26.b }, p2/Z, [x20, x25]\n"
+ "ld1b { z25.b }, p1/Z, [x21, x24]\n"
".inst 0x459649ad // uaddwb z13.s, z13.s, z22.h\n"
".inst 0x45964d8c // uaddwt z12.s, z12.s, z22.h\n"
+ "ld1b { z24.b }, p1/Z, [x20, x24]\n"
".inst 0x4595496b // uaddwb z11.s, z11.s, z21.h\n"
".inst 0x45954d4a // uaddwt z10.s, z10.s, z21.h\n"
".inst 0x45944929 // uaddwb z9.s, z9.s, z20.h\n"
@@ -222,265 +224,264 @@ void sve_u8q_nhwc_avg_generic_depthfirst_impl(
".inst 0x45904821 // uaddwb z1.s, z1.s, z16.h\n"
".inst 0x45904c00 // uaddwt z0.s, z0.s, z16.h\n"
"4:" // 4-vectors of channels: After loop
- "ands x20, %x[n_valid_cells], #0x1\n"
+ "ands x21, %x[n_valid_cells], #0x1\n"
"beq 6f\n"
"5:" // 4-vectors of channels: Single input loop
- "ldr x21, [x19], #0x8\n"
- "subs x20, x20, #0x1\n"
- "ld1b { z31.b }, p3/Z, [x21, x26]\n"
- ".inst 0x4508abf1 // ushllb z17.h, z31.b, #0x0\n"
- "ld1b { z29.b }, p2/Z, [x21, x25]\n"
- ".inst 0x4508aff0 // ushllt z16.h, z31.b, #0x0\n"
- "ld1b { z27.b }, p1/Z, [x21, x24]\n"
- ".inst 0x459149ef // uaddwb z15.s, z15.s, z17.h\n"
- "ld1b { z25.b }, p0/Z, [x21, x23]\n"
- ".inst 0x45914dce // uaddwt z14.s, z14.s, z17.h\n"
- ".inst 0x459049ad // uaddwb z13.s, z13.s, z16.h\n"
- ".inst 0x45904d8c // uaddwt z12.s, z12.s, z16.h\n"
- ".inst 0x4508abb0 // ushllb z16.h, z29.b, #0x0\n"
- ".inst 0x4590496b // uaddwb z11.s, z11.s, z16.h\n"
- ".inst 0x45904d4a // uaddwt z10.s, z10.s, z16.h\n"
- ".inst 0x4508afb0 // ushllt z16.h, z29.b, #0x0\n"
- ".inst 0x45904929 // uaddwb z9.s, z9.s, z16.h\n"
- ".inst 0x45904d08 // uaddwt z8.s, z8.s, z16.h\n"
- ".inst 0x4508ab70 // ushllb z16.h, z27.b, #0x0\n"
- ".inst 0x459048e7 // uaddwb z7.s, z7.s, z16.h\n"
- ".inst 0x45904cc6 // uaddwt z6.s, z6.s, z16.h\n"
- ".inst 0x4508af70 // ushllt z16.h, z27.b, #0x0\n"
- ".inst 0x459048a5 // uaddwb z5.s, z5.s, z16.h\n"
- ".inst 0x45904c84 // uaddwt z4.s, z4.s, z16.h\n"
- ".inst 0x4508ab30 // ushllb z16.h, z25.b, #0x0\n"
- ".inst 0x45904863 // uaddwb z3.s, z3.s, z16.h\n"
- ".inst 0x45904c42 // uaddwt z2.s, z2.s, z16.h\n"
- ".inst 0x4508af30 // ushllt z16.h, z25.b, #0x0\n"
+ "ldr x20, [x22], #0x8\n"
+ "ld1b { z16.b }, p4/Z, [x20, x27]\n"
+ ".inst 0x4508aa17 // ushllb z23.h, z16.b, #0x0\n"
+ ".inst 0x4508ae16 // ushllt z22.h, z16.b, #0x0\n"
+ "ld1b { z16.b }, p3/Z, [x20, x26]\n"
+ "ld1b { z17.b }, p2/Z, [x20, x25]\n"
+ ".inst 0x4508aa15 // ushllb z21.h, z16.b, #0x0\n"
+ ".inst 0x4508ae14 // ushllt z20.h, z16.b, #0x0\n"
+ "ld1b { z16.b }, p1/Z, [x20, x24]\n"
+ ".inst 0x4508aa33 // ushllb z19.h, z17.b, #0x0\n"
+ ".inst 0x4508ae32 // ushllt z18.h, z17.b, #0x0\n"
+ "subs x21, x21, #0x1\n"
+ ".inst 0x4508aa11 // ushllb z17.h, z16.b, #0x0\n"
+ ".inst 0x4508ae10 // ushllt z16.h, z16.b, #0x0\n"
+ ".inst 0x459749ef // uaddwb z15.s, z15.s, z23.h\n"
+ ".inst 0x45974dce // uaddwt z14.s, z14.s, z23.h\n"
+ ".inst 0x459649ad // uaddwb z13.s, z13.s, z22.h\n"
+ ".inst 0x45964d8c // uaddwt z12.s, z12.s, z22.h\n"
+ ".inst 0x4595496b // uaddwb z11.s, z11.s, z21.h\n"
+ ".inst 0x45954d4a // uaddwt z10.s, z10.s, z21.h\n"
+ ".inst 0x45944929 // uaddwb z9.s, z9.s, z20.h\n"
+ ".inst 0x45944d08 // uaddwt z8.s, z8.s, z20.h\n"
+ ".inst 0x459348e7 // uaddwb z7.s, z7.s, z19.h\n"
+ ".inst 0x45934cc6 // uaddwt z6.s, z6.s, z19.h\n"
+ ".inst 0x459248a5 // uaddwb z5.s, z5.s, z18.h\n"
+ ".inst 0x45924c84 // uaddwt z4.s, z4.s, z18.h\n"
+ ".inst 0x45914863 // uaddwb z3.s, z3.s, z17.h\n"
+ ".inst 0x45914c42 // uaddwt z2.s, z2.s, z17.h\n"
".inst 0x45904821 // uaddwb z1.s, z1.s, z16.h\n"
".inst 0x45904c00 // uaddwt z0.s, z0.s, z16.h\n"
"bgt 5b\n"
"6:" // 4-vectors of channels: Single input loop: End
- "mov z21.s, #0x0\n"
- "ld1rw { z20.s }, p4/Z, [%x[combined_rescale_value]]\n"
- "add x19, %x[quant_params], %[offsetof_qp_output_offset]\n"
- "mov z19.s, #0xff\n"
- "ld1rw { z18.s }, p4/Z, [%x[left_shift]]\n"
- "ld1rw { z17.s }, p4/Z, [%x[right_shift]]\n"
- ".inst 0x4482924f // srshl z15.s, p4/M, z15.s, z18.s\n"
- "ld1rw { z16.s }, p4/Z, [x19]\n"
- ".inst 0x4482924e // srshl z14.s, p4/M, z14.s, z18.s\n"
- ".inst 0x4482924d // srshl z13.s, p4/M, z13.s, z18.s\n"
- ".inst 0x4482924c // srshl z12.s, p4/M, z12.s, z18.s\n"
- ".inst 0x4482924b // srshl z11.s, p4/M, z11.s, z18.s\n"
- ".inst 0x04b475ef // sqrdmulh z15.s, z15.s, z20.s\n"
- ".inst 0x04b475ce // sqrdmulh z14.s, z14.s, z20.s\n"
- ".inst 0x04b475ad // sqrdmulh z13.s, z13.s, z20.s\n"
- ".inst 0x04b4758c // sqrdmulh z12.s, z12.s, z20.s\n"
- ".inst 0x04b4756b // sqrdmulh z11.s, z11.s, z20.s\n"
- ".inst 0x4482922f // srshl z15.s, p4/M, z15.s, z17.s\n"
- ".inst 0x4482922e // srshl z14.s, p4/M, z14.s, z17.s\n"
- ".inst 0x4482922d // srshl z13.s, p4/M, z13.s, z17.s\n"
- ".inst 0x4482922c // srshl z12.s, p4/M, z12.s, z17.s\n"
+ "ld1rw { z18.s }, p0/Z, [%x[left_shift]]\n"
+ "ld1rw { z16.s }, p0/Z, [%x[combined_rescale_value]]\n"
+ ".inst 0x4482824f // srshl z15.s, p0/M, z15.s, z18.s\n"
+ ".inst 0x4482824e // srshl z14.s, p0/M, z14.s, z18.s\n"
+ ".inst 0x4482824d // srshl z13.s, p0/M, z13.s, z18.s\n"
+ ".inst 0x4482824c // srshl z12.s, p0/M, z12.s, z18.s\n"
+ "ld1rw { z17.s }, p0/Z, [%x[right_shift]]\n"
+ ".inst 0x04b075ef // sqrdmulh z15.s, z15.s, z16.s\n"
+ ".inst 0x4482824b // srshl z11.s, p0/M, z11.s, z18.s\n"
+ ".inst 0x4482824a // srshl z10.s, p0/M, z10.s, z18.s\n"
+ ".inst 0x04b075ce // sqrdmulh z14.s, z14.s, z16.s\n"
+ ".inst 0x04b075ad // sqrdmulh z13.s, z13.s, z16.s\n"
+ ".inst 0x44828249 // srshl z9.s, p0/M, z9.s, z18.s\n"
+ ".inst 0x44828248 // srshl z8.s, p0/M, z8.s, z18.s\n"
+ ".inst 0x04b0758c // sqrdmulh z12.s, z12.s, z16.s\n"
+ ".inst 0x04b0756b // sqrdmulh z11.s, z11.s, z16.s\n"
+ ".inst 0x44828247 // srshl z7.s, p0/M, z7.s, z18.s\n"
+ ".inst 0x44828246 // srshl z6.s, p0/M, z6.s, z18.s\n"
+ ".inst 0x04b0754a // sqrdmulh z10.s, z10.s, z16.s\n"
+ ".inst 0x04b07529 // sqrdmulh z9.s, z9.s, z16.s\n"
+ ".inst 0x44828245 // srshl z5.s, p0/M, z5.s, z18.s\n"
+ ".inst 0x44828244 // srshl z4.s, p0/M, z4.s, z18.s\n"
+ ".inst 0x04b07508 // sqrdmulh z8.s, z8.s, z16.s\n"
+ ".inst 0x04b074e7 // sqrdmulh z7.s, z7.s, z16.s\n"
+ ".inst 0x44828243 // srshl z3.s, p0/M, z3.s, z18.s\n"
+ ".inst 0x44828242 // srshl z2.s, p0/M, z2.s, z18.s\n"
+ ".inst 0x04b074c6 // sqrdmulh z6.s, z6.s, z16.s\n"
+ ".inst 0x04b074a5 // sqrdmulh z5.s, z5.s, z16.s\n"
+ ".inst 0x44828241 // srshl z1.s, p0/M, z1.s, z18.s\n"
+ ".inst 0x44828240 // srshl z0.s, p0/M, z0.s, z18.s\n"
+ ".inst 0x04b07484 // sqrdmulh z4.s, z4.s, z16.s\n"
+ ".inst 0x04b07463 // sqrdmulh z3.s, z3.s, z16.s\n"
+ ".inst 0x04b07442 // sqrdmulh z2.s, z2.s, z16.s\n"
+ ".inst 0x04b07421 // sqrdmulh z1.s, z1.s, z16.s\n"
+ "add x20, %x[quant_params], %[offsetof_qp_output_offset]\n"
+ ".inst 0x4482822f // srshl z15.s, p0/M, z15.s, z17.s\n"
+ ".inst 0x04b07400 // sqrdmulh z0.s, z0.s, z16.s\n"
+ ".inst 0x4482822e // srshl z14.s, p0/M, z14.s, z17.s\n"
+ ".inst 0x4482822d // srshl z13.s, p0/M, z13.s, z17.s\n"
+ "ld1rw { z16.s }, p0/Z, [x20]\n"
+ ".inst 0x4482822c // srshl z12.s, p0/M, z12.s, z17.s\n"
+ ".inst 0x4482822b // srshl z11.s, p0/M, z11.s, z17.s\n"
"add z15.s, z15.s, z16.s\n"
"add z14.s, z14.s, z16.s\n"
+ ".inst 0x4482822a // srshl z10.s, p0/M, z10.s, z17.s\n"
+ ".inst 0x44828229 // srshl z9.s, p0/M, z9.s, z17.s\n"
"add z13.s, z13.s, z16.s\n"
"add z12.s, z12.s, z16.s\n"
- ".inst 0x4482922b // srshl z11.s, p4/M, z11.s, z17.s\n"
- ".inst 0x4482924a // srshl z10.s, p4/M, z10.s, z18.s\n"
- ".inst 0x44829249 // srshl z9.s, p4/M, z9.s, z18.s\n"
- ".inst 0x44829248 // srshl z8.s, p4/M, z8.s, z18.s\n"
+ ".inst 0x44828228 // srshl z8.s, p0/M, z8.s, z17.s\n"
+ ".inst 0x44828227 // srshl z7.s, p0/M, z7.s, z17.s\n"
"add z11.s, z11.s, z16.s\n"
- ".inst 0x04b4754a // sqrdmulh z10.s, z10.s, z20.s\n"
- ".inst 0x04b47529 // sqrdmulh z9.s, z9.s, z20.s\n"
- ".inst 0x04b47508 // sqrdmulh z8.s, z8.s, z20.s\n"
- ".inst 0x44829247 // srshl z7.s, p4/M, z7.s, z18.s\n"
- ".inst 0x4482922a // srshl z10.s, p4/M, z10.s, z17.s\n"
- ".inst 0x44829229 // srshl z9.s, p4/M, z9.s, z17.s\n"
- ".inst 0x44829228 // srshl z8.s, p4/M, z8.s, z17.s\n"
- ".inst 0x04b474e7 // sqrdmulh z7.s, z7.s, z20.s\n"
"add z10.s, z10.s, z16.s\n"
+ ".inst 0x44828226 // srshl z6.s, p0/M, z6.s, z17.s\n"
+ ".inst 0x44828225 // srshl z5.s, p0/M, z5.s, z17.s\n"
"add z9.s, z9.s, z16.s\n"
"add z8.s, z8.s, z16.s\n"
- ".inst 0x44829227 // srshl z7.s, p4/M, z7.s, z17.s\n"
- ".inst 0x44829246 // srshl z6.s, p4/M, z6.s, z18.s\n"
- ".inst 0x44829245 // srshl z5.s, p4/M, z5.s, z18.s\n"
- ".inst 0x44829244 // srshl z4.s, p4/M, z4.s, z18.s\n"
+ ".inst 0x44828224 // srshl z4.s, p0/M, z4.s, z17.s\n"
+ ".inst 0x44828223 // srshl z3.s, p0/M, z3.s, z17.s\n"
"add z7.s, z7.s, z16.s\n"
- ".inst 0x04b474c6 // sqrdmulh z6.s, z6.s, z20.s\n"
- ".inst 0x04b474a5 // sqrdmulh z5.s, z5.s, z20.s\n"
- ".inst 0x04b47484 // sqrdmulh z4.s, z4.s, z20.s\n"
- ".inst 0x44829243 // srshl z3.s, p4/M, z3.s, z18.s\n"
- ".inst 0x44829226 // srshl z6.s, p4/M, z6.s, z17.s\n"
- ".inst 0x44829225 // srshl z5.s, p4/M, z5.s, z17.s\n"
- ".inst 0x44829224 // srshl z4.s, p4/M, z4.s, z17.s\n"
- ".inst 0x04b47463 // sqrdmulh z3.s, z3.s, z20.s\n"
"add z6.s, z6.s, z16.s\n"
+ ".inst 0x44828222 // srshl z2.s, p0/M, z2.s, z17.s\n"
+ ".inst 0x44828221 // srshl z1.s, p0/M, z1.s, z17.s\n"
"add z5.s, z5.s, z16.s\n"
"add z4.s, z4.s, z16.s\n"
- ".inst 0x44829223 // srshl z3.s, p4/M, z3.s, z17.s\n"
- ".inst 0x44829242 // srshl z2.s, p4/M, z2.s, z18.s\n"
- ".inst 0x44829241 // srshl z1.s, p4/M, z1.s, z18.s\n"
- ".inst 0x44829240 // srshl z0.s, p4/M, z0.s, z18.s\n"
+ ".inst 0x44828220 // srshl z0.s, p0/M, z0.s, z17.s\n"
"add z3.s, z3.s, z16.s\n"
- ".inst 0x04b47442 // sqrdmulh z2.s, z2.s, z20.s\n"
- ".inst 0x04b47421 // sqrdmulh z1.s, z1.s, z20.s\n"
- ".inst 0x04b47400 // sqrdmulh z0.s, z0.s, z20.s\n"
- "smax z15.s, p4/M, z15.s, z21.s\n"
- ".inst 0x44829222 // srshl z2.s, p4/M, z2.s, z17.s\n"
- ".inst 0x44829221 // srshl z1.s, p4/M, z1.s, z17.s\n"
- ".inst 0x44829220 // srshl z0.s, p4/M, z0.s, z17.s\n"
- "smin z15.s, p4/M, z15.s, z19.s\n"
"add z2.s, z2.s, z16.s\n"
"add z1.s, z1.s, z16.s\n"
"add z0.s, z0.s, z16.s\n"
- "smax z14.s, p4/M, z14.s, z21.s\n"
- "smax z13.s, p4/M, z13.s, z21.s\n"
- "smax z12.s, p4/M, z12.s, z21.s\n"
- "smax z11.s, p4/M, z11.s, z21.s\n"
- "smin z14.s, p4/M, z14.s, z19.s\n"
- "smin z13.s, p4/M, z13.s, z19.s\n"
- "smin z12.s, p4/M, z12.s, z19.s\n"
- "smin z11.s, p4/M, z11.s, z19.s\n"
+ "mov z16.s, #0x0\n"
+ "smax z15.s, p0/M, z15.s, z16.s\n"
+ "smax z14.s, p0/M, z14.s, z16.s\n"
+ "mov z18.s, #0xff\n"
+ "smax z13.s, p0/M, z13.s, z16.s\n"
+ "smax z12.s, p0/M, z12.s, z16.s\n"
+ "smax z11.s, p0/M, z11.s, z16.s\n"
+ "smax z10.s, p0/M, z10.s, z16.s\n"
+ "smax z9.s, p0/M, z9.s, z16.s\n"
+ "smax z8.s, p0/M, z8.s, z16.s\n"
+ "smax z7.s, p0/M, z7.s, z16.s\n"
+ "smax z6.s, p0/M, z6.s, z16.s\n"
+ "smax z5.s, p0/M, z5.s, z16.s\n"
+ "smax z4.s, p0/M, z4.s, z16.s\n"
+ "smax z3.s, p0/M, z3.s, z16.s\n"
+ "smax z2.s, p0/M, z2.s, z16.s\n"
+ "smax z1.s, p0/M, z1.s, z16.s\n"
+ "smax z0.s, p0/M, z0.s, z16.s\n"
+ "smin z15.s, p0/M, z15.s, z18.s\n"
+ "smin z14.s, p0/M, z14.s, z18.s\n"
"trn1 z17.h, z15.h, z14.h\n"
- "smax z10.s, p4/M, z10.s, z21.s\n"
+ "smin z13.s, p0/M, z13.s, z18.s\n"
+ "smin z12.s, p0/M, z12.s, z18.s\n"
"trn1 z16.h, z13.h, z12.h\n"
- "smax z9.s, p4/M, z9.s, z21.s\n"
"trn1 z16.b, z17.b, z16.b\n"
- "st1b { z16.b }, p3, [%x[outptr], x26]\n"
- "smin z10.s, p4/M, z10.s, z19.s\n"
- "incb x26, ALL, MUL #4\n"
- "smin z9.s, p4/M, z9.s, z19.s\n"
- "smax z8.s, p4/M, z8.s, z21.s\n"
- "smax z7.s, p4/M, z7.s, z21.s\n"
- "smax z6.s, p4/M, z6.s, z21.s\n"
- "trn1 z18.h, z11.h, z10.h\n"
- "smin z8.s, p4/M, z8.s, z19.s\n"
- "smin z7.s, p4/M, z7.s, z19.s\n"
- "smin z6.s, p4/M, z6.s, z19.s\n"
- "smax z5.s, p4/M, z5.s, z21.s\n"
+ "smin z11.s, p0/M, z11.s, z18.s\n"
+ "smin z10.s, p0/M, z10.s, z18.s\n"
+ "trn1 z17.h, z11.h, z10.h\n"
+ "st1b { z16.b }, p4, [%x[outptr], x27]\n"
+ "smin z9.s, p0/M, z9.s, z18.s\n"
+ "smin z8.s, p0/M, z8.s, z18.s\n"
"trn1 z16.h, z9.h, z8.h\n"
- "smax z4.s, p4/M, z4.s, z21.s\n"
+ "trn1 z16.b, z17.b, z16.b\n"
+ "smin z7.s, p0/M, z7.s, z18.s\n"
+ "smin z6.s, p0/M, z6.s, z18.s\n"
"trn1 z17.h, z7.h, z6.h\n"
- "trn1 z16.b, z18.b, z16.b\n"
- "st1b { z16.b }, p2, [%x[outptr], x25]\n"
- "smin z5.s, p4/M, z5.s, z19.s\n"
- "incb x25, ALL, MUL #4\n"
- "smin z4.s, p4/M, z4.s, z19.s\n"
- "smax z3.s, p4/M, z3.s, z21.s\n"
- "smax z2.s, p4/M, z2.s, z21.s\n"
- "smax z1.s, p4/M, z1.s, z21.s\n"
- "smax z0.s, p4/M, z0.s, z21.s\n"
+ "st1b { z16.b }, p3, [%x[outptr], x26]\n"
+ "smin z5.s, p0/M, z5.s, z18.s\n"
+ "smin z4.s, p0/M, z4.s, z18.s\n"
"trn1 z16.h, z5.h, z4.h\n"
- "smin z3.s, p4/M, z3.s, z19.s\n"
"trn1 z16.b, z17.b, z16.b\n"
- "st1b { z16.b }, p1, [%x[outptr], x24]\n"
- "smin z2.s, p4/M, z2.s, z19.s\n"
- "incb x24, ALL, MUL #4\n"
- "smin z1.s, p4/M, z1.s, z19.s\n"
- "smin z0.s, p4/M, z0.s, z19.s\n"
+ "smin z3.s, p0/M, z3.s, z18.s\n"
+ "smin z2.s, p0/M, z2.s, z18.s\n"
"trn1 z17.h, z3.h, z2.h\n"
+ "st1b { z16.b }, p2, [%x[outptr], x25]\n"
+ "smin z1.s, p0/M, z1.s, z18.s\n"
+ "smin z0.s, p0/M, z0.s, z18.s\n"
"trn1 z16.h, z1.h, z0.h\n"
"trn1 z16.b, z17.b, z16.b\n"
- "st1b { z16.b }, p0, [%x[outptr], x23]\n"
- "incb x23, ALL, MUL #4\n"
- "whilelt p0.b, x23, %x[n_channels]\n"
+ "st1b { z16.b }, p1, [%x[outptr], x24]\n"
+ "incb x24, ALL, MUL #4\n"
+ "whilelt p1.b, x24, %x[n_channels]\n"
+ "incb x27, ALL, MUL #4\n"
+ "incb x26, ALL, MUL #4\n"
+ "incb x25, ALL, MUL #4\n"
"b.any 1b\n"
"7:" // Single vector of channels
- "whilelt p3.b, x26, %x[n_channels]\n"
+ "whilelt p4.b, x27, %x[n_channels]\n"
"b.none 14f\n"
"8:" // Single vector of channels: Loop
- "ld1rw { z15.s }, p4/Z, [%x[accumulator_init]]\n"
+ "ld1rw { z15.s }, p0/Z, [%x[accumulator_init]]\n"
+ "lsr x23, %x[n_valid_cells], #0x1\n"
"mov z14.d, z15.d\n"
- "mov x19, %x[inptrs]\n"
"mov z13.d, z15.d\n"
- "lsr x22, %x[n_valid_cells], #0x1\n"
"mov z12.d, z15.d\n"
- "cbz x22, 11f\n"
- "ldp x21, x20, [x19, #0x0]\n"
- "ld1b { z31.b }, p3/Z, [x21, x26]\n"
- "add x19, x19, #0x10\n"
- "ld1b { z30.b }, p3/Z, [x20, x26]\n"
- "subs x22, x22, #0x1\n"
+ "mov x22, %x[inptrs]\n"
+ "cbz x23, 11f\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "subs x23, x23, #0x1\n"
+ "add x22, x22, #0x10\n"
+ "ld1b { z31.b }, p4/Z, [x21, x27]\n"
+ "ld1b { z30.b }, p4/Z, [x20, x27]\n"
"beq 10f\n"
"9:" // Single vector of channels: Loop: 2 inputs loop
- ".inst 0x455e0bf7 // uaddlb z23.h, z31.b, z30.b\n"
- "ldp x21, x20, [x19, #0x0]\n"
- "add x19, x19, #0x10\n"
- ".inst 0x455e0ff6 // uaddlt z22.h, z31.b, z30.b\n"
- "ld1b { z31.b }, p3/Z, [x21, x26]\n"
- ".inst 0x459749ef // uaddwb z15.s, z15.s, z23.h\n"
- "subs x22, x22, #0x1\n"
- ".inst 0x45974dce // uaddwt z14.s, z14.s, z23.h\n"
- "ld1b { z30.b }, p3/Z, [x20, x26]\n"
- ".inst 0x459649ad // uaddwb z13.s, z13.s, z22.h\n"
- ".inst 0x45964d8c // uaddwt z12.s, z12.s, z22.h\n"
+ ".inst 0x455e0bf1 // uaddlb z17.h, z31.b, z30.b\n"
+ ".inst 0x455e0ff0 // uaddlt z16.h, z31.b, z30.b\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "subs x23, x23, #0x1\n"
+ ".inst 0x459149ef // uaddwb z15.s, z15.s, z17.h\n"
+ ".inst 0x45914dce // uaddwt z14.s, z14.s, z17.h\n"
+ "add x22, x22, #0x10\n"
+ "ld1b { z31.b }, p4/Z, [x21, x27]\n"
+ ".inst 0x459049ad // uaddwb z13.s, z13.s, z16.h\n"
+ ".inst 0x45904d8c // uaddwt z12.s, z12.s, z16.h\n"
+ "ld1b { z30.b }, p4/Z, [x20, x27]\n"
"bgt 9b\n"
"10:" // Single vector of channels: Loop: 2 inputs tail
- ".inst 0x455e0bf7 // uaddlb z23.h, z31.b, z30.b\n"
- ".inst 0x455e0ff6 // uaddlt z22.h, z31.b, z30.b\n"
- ".inst 0x459749ef // uaddwb z15.s, z15.s, z23.h\n"
- ".inst 0x45974dce // uaddwt z14.s, z14.s, z23.h\n"
- ".inst 0x459649ad // uaddwb z13.s, z13.s, z22.h\n"
- ".inst 0x45964d8c // uaddwt z12.s, z12.s, z22.h\n"
+ ".inst 0x455e0bf1 // uaddlb z17.h, z31.b, z30.b\n"
+ ".inst 0x455e0ff0 // uaddlt z16.h, z31.b, z30.b\n"
+ ".inst 0x459149ef // uaddwb z15.s, z15.s, z17.h\n"
+ ".inst 0x45914dce // uaddwt z14.s, z14.s, z17.h\n"
+ ".inst 0x459049ad // uaddwb z13.s, z13.s, z16.h\n"
+ ".inst 0x45904d8c // uaddwt z12.s, z12.s, z16.h\n"
"11:" // Single vector of channels: Loop: After loop
- "ands x20, %x[n_valid_cells], #0x1\n"
+ "ands x21, %x[n_valid_cells], #0x1\n"
"beq 13f\n"
"12:" // Single vector of channels: Loop: Single input loop
- "ldr x21, [x19], #0x8\n"
- "subs x20, x20, #0x1\n"
- "ld1b { z31.b }, p3/Z, [x21, x26]\n"
- ".inst 0x4508abf1 // ushllb z17.h, z31.b, #0x0\n"
- ".inst 0x4508aff0 // ushllt z16.h, z31.b, #0x0\n"
+ "ldr x20, [x22], #0x8\n"
+ "ld1b { z16.b }, p4/Z, [x20, x27]\n"
+ ".inst 0x4508aa11 // ushllb z17.h, z16.b, #0x0\n"
+ ".inst 0x4508ae10 // ushllt z16.h, z16.b, #0x0\n"
+ "subs x21, x21, #0x1\n"
".inst 0x459149ef // uaddwb z15.s, z15.s, z17.h\n"
".inst 0x45914dce // uaddwt z14.s, z14.s, z17.h\n"
".inst 0x459049ad // uaddwb z13.s, z13.s, z16.h\n"
".inst 0x45904d8c // uaddwt z12.s, z12.s, z16.h\n"
"bgt 12b\n"
"13:" // Single vector of channels: Loop: Single input loop: End
- "mov z21.s, #0x0\n"
- "ld1rw { z20.s }, p4/Z, [%x[combined_rescale_value]]\n"
- "add x19, %x[quant_params], %[offsetof_qp_output_offset]\n"
- "mov z19.s, #0xff\n"
- "ld1rw { z18.s }, p4/Z, [%x[left_shift]]\n"
- "ld1rw { z17.s }, p4/Z, [%x[right_shift]]\n"
- ".inst 0x4482924f // srshl z15.s, p4/M, z15.s, z18.s\n"
- "ld1rw { z16.s }, p4/Z, [x19]\n"
- ".inst 0x4482924e // srshl z14.s, p4/M, z14.s, z18.s\n"
- ".inst 0x4482924d // srshl z13.s, p4/M, z13.s, z18.s\n"
- ".inst 0x4482924c // srshl z12.s, p4/M, z12.s, z18.s\n"
- ".inst 0x04b475ef // sqrdmulh z15.s, z15.s, z20.s\n"
- ".inst 0x04b475ce // sqrdmulh z14.s, z14.s, z20.s\n"
- ".inst 0x04b475ad // sqrdmulh z13.s, z13.s, z20.s\n"
- ".inst 0x04b4758c // sqrdmulh z12.s, z12.s, z20.s\n"
- ".inst 0x4482922f // srshl z15.s, p4/M, z15.s, z17.s\n"
- ".inst 0x4482922e // srshl z14.s, p4/M, z14.s, z17.s\n"
- ".inst 0x4482922d // srshl z13.s, p4/M, z13.s, z17.s\n"
- ".inst 0x4482922c // srshl z12.s, p4/M, z12.s, z17.s\n"
+ "ld1rw { z17.s }, p0/Z, [%x[left_shift]]\n"
+ "ld1rw { z16.s }, p0/Z, [%x[combined_rescale_value]]\n"
+ ".inst 0x4482822f // srshl z15.s, p0/M, z15.s, z17.s\n"
+ ".inst 0x4482822e // srshl z14.s, p0/M, z14.s, z17.s\n"
+ ".inst 0x4482822d // srshl z13.s, p0/M, z13.s, z17.s\n"
+ ".inst 0x4482822c // srshl z12.s, p0/M, z12.s, z17.s\n"
+ "ld1rw { z17.s }, p0/Z, [%x[right_shift]]\n"
+ ".inst 0x04b075ef // sqrdmulh z15.s, z15.s, z16.s\n"
+ ".inst 0x04b075ce // sqrdmulh z14.s, z14.s, z16.s\n"
+ ".inst 0x04b075ad // sqrdmulh z13.s, z13.s, z16.s\n"
+ "add x20, %x[quant_params], %[offsetof_qp_output_offset]\n"
+ ".inst 0x4482822f // srshl z15.s, p0/M, z15.s, z17.s\n"
+ ".inst 0x04b0758c // sqrdmulh z12.s, z12.s, z16.s\n"
+ ".inst 0x4482822e // srshl z14.s, p0/M, z14.s, z17.s\n"
+ ".inst 0x4482822d // srshl z13.s, p0/M, z13.s, z17.s\n"
+ "ld1rw { z16.s }, p0/Z, [x20]\n"
+ ".inst 0x4482822c // srshl z12.s, p0/M, z12.s, z17.s\n"
"add z15.s, z15.s, z16.s\n"
"add z14.s, z14.s, z16.s\n"
"add z13.s, z13.s, z16.s\n"
"add z12.s, z12.s, z16.s\n"
- "smax z15.s, p4/M, z15.s, z21.s\n"
- "smax z14.s, p4/M, z14.s, z21.s\n"
- "smax z13.s, p4/M, z13.s, z21.s\n"
- "smax z12.s, p4/M, z12.s, z21.s\n"
- "smin z15.s, p4/M, z15.s, z19.s\n"
- "smin z14.s, p4/M, z14.s, z19.s\n"
- "smin z13.s, p4/M, z13.s, z19.s\n"
- "smin z12.s, p4/M, z12.s, z19.s\n"
+ "mov z17.s, #0x0\n"
+ "smax z15.s, p0/M, z15.s, z17.s\n"
+ "smax z14.s, p0/M, z14.s, z17.s\n"
+ "mov z16.s, #0xff\n"
+ "smax z13.s, p0/M, z13.s, z17.s\n"
+ "smax z12.s, p0/M, z12.s, z17.s\n"
+ "smin z15.s, p0/M, z15.s, z16.s\n"
+ "smin z14.s, p0/M, z14.s, z16.s\n"
"trn1 z17.h, z15.h, z14.h\n"
+ "smin z13.s, p0/M, z13.s, z16.s\n"
+ "smin z12.s, p0/M, z12.s, z16.s\n"
"trn1 z16.h, z13.h, z12.h\n"
"trn1 z16.b, z17.b, z16.b\n"
- "st1b { z16.b }, p3, [%x[outptr], x26]\n"
- "incb x26\n"
- "whilelt p3.b, x26, %x[n_channels]\n"
+ "st1b { z16.b }, p4, [%x[outptr], x27]\n"
+ "incb x27\n"
+ "whilelt p4.b, x27, %x[n_channels]\n"
"b.any 8b\n"
"14:" // End
-
:
: [accumulator_init] "r" (&accumulator_init), [combined_rescale_value] "r" (&combined_rescale_value), [inptrs] "r" (inptrs), [left_shift] "r" (&left_shift), [n_channels] "r" (n_channels), [n_valid_cells] "r" (n_valid_cells), [offsetof_qp_output_offset] "I" (offsetof(Requantize32, output_offset)), [outptr] "r" (outptr), [quant_params] "r" (&qp), [right_shift] "r" (&right_shift)
- : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
} // namespace pooling
} // namespace arm_conv
-#endif // defined(__ARM_FEATURE_SVE) && defined(SVE2)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8q_nhwc_max_generic_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8q_nhwc_max_generic_depthfirst.hpp
index c3c0edd0d5..eece6c0578 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8q_nhwc_max_generic_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8q_nhwc_max_generic_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -26,29 +26,21 @@
#pragma once
-#if defined(__ARM_FEATURE_SVE) && defined(SVE2)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
namespace arm_conv {
namespace pooling {
void sve_u8q_nhwc_max_generic_depthfirst_impl(const uint64_t, const uint64_t n_valid_cells, uint64_t n_channels, const uint8_t *const *const inptrs, uint8_t *outptr, const Requantize32 &qp);
-struct sve_u8q_nhwc_max_generic_depthfirst
+struct sve_u8q_nhwc_max_generic_depthfirst : IGenericDepthfirstStrategy<uint8_t, uint8_t, Requantize32>
{
- typedef uint8_t operand_type;
- typedef uint8_t return_type;
-
- typedef void (*kern_type)(const uint64_t, const uint64_t n_valid_cells, uint64_t n_channels, const uint8_t *const *const inptrs, uint8_t *outptr, const Requantize32 &qp);
-
- constexpr static PoolingType pooling_type(void) { return PoolingType::MAX; }
-
-
- kern_type kernel = sve_u8q_nhwc_max_generic_depthfirst_impl;
-
+ using Parent = IGenericDepthfirstStrategy<uint8_t, uint8_t, Requantize32>;
sve_u8q_nhwc_max_generic_depthfirst(const CPUInfo *) {}
+ typename Parent::KernelType get_kernel(void) const override { return sve_u8q_nhwc_max_generic_depthfirst_impl; }
};
} // namespace pooling
} // namespace arm_conv
-#endif // defined(__ARM_FEATURE_SVE) && defined(SVE2)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8q_nhwc_max_generic_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8q_nhwc_max_generic_depthfirst/generic.cpp
index c1c1d29613..94522cdaaa 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8q_nhwc_max_generic_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8q_nhwc_max_generic_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,8 +24,9 @@
#include "pooling.hpp"
#include <cstdint>
+#include <cstddef>
-#if defined(__ARM_FEATURE_SVE) && defined(SVE2)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
namespace arm_conv {
namespace pooling {
@@ -41,376 +42,375 @@ void sve_u8q_nhwc_max_generic_depthfirst_impl(
)
{
__asm__ __volatile__(
- "ptrue p4.b\n"
- "mov x28, #0x0\n"
- "cntb x27\n"
- "cntb x26, ALL, MUL #2\n"
- "cntb x25, ALL, MUL #3\n"
+ "mov x9, #0x0\n"
+ "cntb x28\n"
+ "cntb x27, ALL, MUL #2\n"
+ "cntb x26, ALL, MUL #3\n"
+ "whilelt p4.b, x9, %x[n_channels]\n"
"whilelt p3.b, x28, %x[n_channels]\n"
"whilelt p2.b, x27, %x[n_channels]\n"
"whilelt p1.b, x26, %x[n_channels]\n"
- "whilelt p0.b, x25, %x[n_channels]\n"
+ "ptrue p0.b\n"
"b.none 7f\n"
"1:" // 4-vectors of channels
- "mov z10.b, #0x0\n"
- "mov x19, %x[inptrs]\n"
- "mov z9.b, #0x0\n"
- "lsr x24, %x[n_valid_cells], #0x2\n"
+ "lsr x25, %x[n_valid_cells], #0x2\n"
"mov z8.b, #0x0\n"
"mov z7.b, #0x0\n"
- "cbz x24, 4f\n"
- "ldp x23, x22, [x19, #0x0]\n"
- "ldp x21, x20, [x19, #0x10]\n"
- "add x19, x19, #0x20\n"
- "subs x24, x24, #0x1\n"
- "ld1b { z3.b }, p3/Z, [x23, x28]\n"
- "ld1b { z2.b }, p3/Z, [x22, x28]\n"
- "ld1b { z1.b }, p3/Z, [x21, x28]\n"
- "ld1b { z0.b }, p3/Z, [x20, x28]\n"
- "ld1b { z31.b }, p2/Z, [x23, x27]\n"
- "ld1b { z30.b }, p2/Z, [x22, x27]\n"
- "ld1b { z22.b }, p2/Z, [x21, x27]\n"
- "ld1b { z29.b }, p2/Z, [x20, x27]\n"
- "ld1b { z28.b }, p1/Z, [x23, x26]\n"
- "ld1b { z27.b }, p1/Z, [x22, x26]\n"
- "ld1b { z21.b }, p1/Z, [x21, x26]\n"
- "ld1b { z26.b }, p1/Z, [x20, x26]\n"
- "ld1b { z16.b }, p0/Z, [x23, x25]\n"
- "ld1b { z25.b }, p0/Z, [x22, x25]\n"
- "ld1b { z20.b }, p0/Z, [x21, x25]\n"
- "ld1b { z24.b }, p0/Z, [x20, x25]\n"
+ "mov x24, %x[inptrs]\n"
+ "mov z6.b, #0x0\n"
+ "mov z5.b, #0x0\n"
+ "cbz x25, 4f\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "subs x25, x25, #0x1\n"
+ "add x24, x24, #0x20\n"
+ "ld1b { z4.b }, p4/Z, [x23, x9]\n"
+ "ld1b { z3.b }, p4/Z, [x22, x9]\n"
+ "ld1b { z2.b }, p4/Z, [x21, x9]\n"
+ "ld1b { z1.b }, p4/Z, [x20, x9]\n"
+ "ld1b { z0.b }, p3/Z, [x23, x28]\n"
+ "ld1b { z31.b }, p3/Z, [x22, x28]\n"
+ "ld1b { z22.b }, p3/Z, [x21, x28]\n"
+ "ld1b { z30.b }, p3/Z, [x20, x28]\n"
+ "ld1b { z29.b }, p2/Z, [x23, x27]\n"
+ "ld1b { z28.b }, p2/Z, [x22, x27]\n"
+ "ld1b { z21.b }, p2/Z, [x21, x27]\n"
+ "ld1b { z27.b }, p2/Z, [x20, x27]\n"
+ "ld1b { z26.b }, p1/Z, [x23, x26]\n"
+ "ld1b { z25.b }, p1/Z, [x22, x26]\n"
+ "ld1b { z20.b }, p1/Z, [x21, x26]\n"
+ "ld1b { z24.b }, p1/Z, [x20, x26]\n"
"beq 3f\n"
"2:" // 4-vectors of channels: 4 inputs loop
- "movprfx z19, z3\n umax z19.b, p4/M, z19.b, z2.b\n"
- "ldp x23, x22, [x19, #0x0]\n"
- "subs x24, x24, #0x1\n"
- "movprfx z23, z1\n umax z23.b, p4/M, z23.b, z0.b\n"
- "ldp x21, x20, [x19, #0x10]\n"
- "add x19, x19, #0x20\n"
- "movprfx z18, z31\n umax z18.b, p4/M, z18.b, z30.b\n"
- "ld1b { z3.b }, p3/Z, [x23, x28]\n"
- "umax z22.b, p4/M, z22.b, z29.b\n"
- "movprfx z17, z28\n umax z17.b, p4/M, z17.b, z27.b\n"
- "ld1b { z2.b }, p3/Z, [x22, x28]\n"
- "umax z21.b, p4/M, z21.b, z26.b\n"
- "ld1b { z1.b }, p3/Z, [x21, x28]\n"
- "umax z16.b, p4/M, z16.b, z25.b\n"
- "ld1b { z0.b }, p3/Z, [x20, x28]\n"
- "umax z20.b, p4/M, z20.b, z24.b\n"
- "ld1b { z31.b }, p2/Z, [x23, x27]\n"
- "umax z19.b, p4/M, z19.b, z23.b\n"
- "ld1b { z30.b }, p2/Z, [x22, x27]\n"
- "umax z18.b, p4/M, z18.b, z22.b\n"
- "ld1b { z22.b }, p2/Z, [x21, x27]\n"
- "umax z17.b, p4/M, z17.b, z21.b\n"
- "ld1b { z29.b }, p2/Z, [x20, x27]\n"
- "umax z16.b, p4/M, z16.b, z20.b\n"
- "ld1b { z28.b }, p1/Z, [x23, x26]\n"
- "umax z10.b, p4/M, z10.b, z19.b\n"
- "ld1b { z27.b }, p1/Z, [x22, x26]\n"
- "umax z9.b, p4/M, z9.b, z18.b\n"
- "ld1b { z21.b }, p1/Z, [x21, x26]\n"
- "umax z8.b, p4/M, z8.b, z17.b\n"
- "ld1b { z26.b }, p1/Z, [x20, x26]\n"
- "umax z7.b, p4/M, z7.b, z16.b\n"
- "ld1b { z16.b }, p0/Z, [x23, x25]\n"
- "ld1b { z25.b }, p0/Z, [x22, x25]\n"
- "ld1b { z20.b }, p0/Z, [x21, x25]\n"
- "ld1b { z24.b }, p0/Z, [x20, x25]\n"
+ "movprfx z19, z4\n umax z19.b, p0/M, z19.b, z3.b\n"
+ "movprfx z23, z2\n umax z23.b, p0/M, z23.b, z1.b\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "movprfx z18, z0\n umax z18.b, p0/M, z18.b, z31.b\n"
+ "umax z22.b, p0/M, z22.b, z30.b\n"
+ "ld1b { z4.b }, p4/Z, [x23, x9]\n"
+ "ld1b { z3.b }, p4/Z, [x22, x9]\n"
+ "movprfx z17, z29\n umax z17.b, p0/M, z17.b, z28.b\n"
+ "umax z21.b, p0/M, z21.b, z27.b\n"
+ "ld1b { z2.b }, p4/Z, [x21, x9]\n"
+ "ld1b { z1.b }, p4/Z, [x20, x9]\n"
+ "movprfx z16, z26\n umax z16.b, p0/M, z16.b, z25.b\n"
+ "umax z20.b, p0/M, z20.b, z24.b\n"
+ "ld1b { z0.b }, p3/Z, [x23, x28]\n"
+ "ld1b { z31.b }, p3/Z, [x22, x28]\n"
+ "umax z19.b, p0/M, z19.b, z23.b\n"
+ "umax z18.b, p0/M, z18.b, z22.b\n"
+ "ld1b { z22.b }, p3/Z, [x21, x28]\n"
+ "ld1b { z30.b }, p3/Z, [x20, x28]\n"
+ "umax z17.b, p0/M, z17.b, z21.b\n"
+ "umax z16.b, p0/M, z16.b, z20.b\n"
+ "ld1b { z29.b }, p2/Z, [x23, x27]\n"
+ "ld1b { z28.b }, p2/Z, [x22, x27]\n"
+ "subs x25, x25, #0x1\n"
+ "umax z8.b, p0/M, z8.b, z19.b\n"
+ "ld1b { z21.b }, p2/Z, [x21, x27]\n"
+ "ld1b { z27.b }, p2/Z, [x20, x27]\n"
+ "umax z7.b, p0/M, z7.b, z18.b\n"
+ "umax z6.b, p0/M, z6.b, z17.b\n"
+ "ld1b { z26.b }, p1/Z, [x23, x26]\n"
+ "ld1b { z25.b }, p1/Z, [x22, x26]\n"
+ "umax z5.b, p0/M, z5.b, z16.b\n"
+ "add x24, x24, #0x20\n"
+ "ld1b { z20.b }, p1/Z, [x21, x26]\n"
+ "ld1b { z24.b }, p1/Z, [x20, x26]\n"
"bgt 2b\n"
"3:" // 4-vectors of channels: 4 inputs tail
- "movprfx z19, z3\n umax z19.b, p4/M, z19.b, z2.b\n"
- "movprfx z23, z1\n umax z23.b, p4/M, z23.b, z0.b\n"
- "movprfx z18, z31\n umax z18.b, p4/M, z18.b, z30.b\n"
- "umax z22.b, p4/M, z22.b, z29.b\n"
- "movprfx z17, z28\n umax z17.b, p4/M, z17.b, z27.b\n"
- "umax z21.b, p4/M, z21.b, z26.b\n"
- "umax z16.b, p4/M, z16.b, z25.b\n"
- "umax z20.b, p4/M, z20.b, z24.b\n"
- "umax z19.b, p4/M, z19.b, z23.b\n"
- "umax z18.b, p4/M, z18.b, z22.b\n"
- "umax z17.b, p4/M, z17.b, z21.b\n"
- "umax z16.b, p4/M, z16.b, z20.b\n"
- "umax z10.b, p4/M, z10.b, z19.b\n"
- "umax z9.b, p4/M, z9.b, z18.b\n"
- "umax z8.b, p4/M, z8.b, z17.b\n"
- "umax z7.b, p4/M, z7.b, z16.b\n"
+ "movprfx z19, z4\n umax z19.b, p0/M, z19.b, z3.b\n"
+ "movprfx z23, z2\n umax z23.b, p0/M, z23.b, z1.b\n"
+ "movprfx z18, z0\n umax z18.b, p0/M, z18.b, z31.b\n"
+ "umax z22.b, p0/M, z22.b, z30.b\n"
+ "movprfx z17, z29\n umax z17.b, p0/M, z17.b, z28.b\n"
+ "umax z21.b, p0/M, z21.b, z27.b\n"
+ "movprfx z16, z26\n umax z16.b, p0/M, z16.b, z25.b\n"
+ "umax z20.b, p0/M, z20.b, z24.b\n"
+ "umax z19.b, p0/M, z19.b, z23.b\n"
+ "umax z18.b, p0/M, z18.b, z22.b\n"
+ "umax z17.b, p0/M, z17.b, z21.b\n"
+ "umax z16.b, p0/M, z16.b, z20.b\n"
+ "umax z8.b, p0/M, z8.b, z19.b\n"
+ "umax z7.b, p0/M, z7.b, z18.b\n"
+ "umax z6.b, p0/M, z6.b, z17.b\n"
+ "umax z5.b, p0/M, z5.b, z16.b\n"
"4:" // 4-vectors of channels: After loop
- "ands x20, %x[n_valid_cells], #0x3\n"
+ "ands x21, %x[n_valid_cells], #0x3\n"
"beq 6f\n"
"5:" // 4-vectors of channels: Single input loop
- "ldr x23, [x19], #0x8\n"
- "subs x20, x20, #0x1\n"
- "ld1b { z3.b }, p3/Z, [x23, x28]\n"
- "umax z10.b, p4/M, z10.b, z3.b\n"
- "ld1b { z31.b }, p2/Z, [x23, x27]\n"
- "ld1b { z28.b }, p1/Z, [x23, x26]\n"
- "umax z9.b, p4/M, z9.b, z31.b\n"
- "ld1b { z16.b }, p0/Z, [x23, x25]\n"
- "umax z8.b, p4/M, z8.b, z28.b\n"
- "umax z7.b, p4/M, z7.b, z16.b\n"
+ "ldr x20, [x24], #0x8\n"
+ "ld1b { z16.b }, p4/Z, [x20, x9]\n"
+ "subs x21, x21, #0x1\n"
+ "umax z8.b, p0/M, z8.b, z16.b\n"
+ "ld1b { z17.b }, p3/Z, [x20, x28]\n"
+ "ld1b { z16.b }, p2/Z, [x20, x27]\n"
+ "umax z7.b, p0/M, z7.b, z17.b\n"
+ "umax z6.b, p0/M, z6.b, z16.b\n"
+ "ld1b { z16.b }, p1/Z, [x20, x26]\n"
+ "umax z5.b, p0/M, z5.b, z16.b\n"
"bgt 5b\n"
"6:" // 4-vectors of channels: Single input loop: End
- "mov z6.s, #0x0\n"
- "add x19, %x[quant_params], %[offsetof_qp_input_offset]\n"
- "ld1rw { z5.s }, p4/Z, [x19]\n"
- "mov z4.s, #0xff\n"
- "add x19, %x[quant_params], %[offsetof_qp_per_layer_mul]\n"
- ".inst 0x4508a951 // ushllb z17.h, z10.b, #0x0\n"
- "ld1rw { z3.s }, p4/Z, [x19]\n"
- "add x19, %x[quant_params], %[offsetof_qp_per_layer_left_shift]\n"
- ".inst 0x4508ad50 // ushllt z16.h, z10.b, #0x0\n"
- "ld1rw { z2.s }, p4/Z, [x19]\n"
- "add x19, %x[quant_params], %[offsetof_qp_per_layer_right_shift]\n"
- ".inst 0x4508a937 // ushllb z23.h, z9.b, #0x0\n"
- "ld1rw { z1.s }, p4/Z, [x19]\n"
- "add x19, %x[quant_params], %[offsetof_qp_output_offset]\n"
- ".inst 0x4508ad36 // ushllt z22.h, z9.b, #0x0\n"
- "ld1rw { z0.s }, p4/Z, [x19]\n"
- ".inst 0x4508a912 // ushllb z18.h, z8.b, #0x0\n"
- ".inst 0x4508ad15 // ushllt z21.h, z8.b, #0x0\n"
- ".inst 0x4508a8f4 // ushllb z20.h, z7.b, #0x0\n"
- ".inst 0x4508acf3 // ushllt z19.h, z7.b, #0x0\n"
- "neg z5.s, p4/M, z5.s\n"
- ".inst 0x459140bf // saddwb z31.s, z5.s, z17.h\n"
- ".inst 0x459144b1 // saddwt z17.s, z5.s, z17.h\n"
- ".inst 0x459040be // saddwb z30.s, z5.s, z16.h\n"
- ".inst 0x459044b0 // saddwt z16.s, z5.s, z16.h\n"
- ".inst 0x459740bd // saddwb z29.s, z5.s, z23.h\n"
- ".inst 0x459744bc // saddwt z28.s, z5.s, z23.h\n"
- ".inst 0x459640bb // saddwb z27.s, z5.s, z22.h\n"
- ".inst 0x459644ba // saddwt z26.s, z5.s, z22.h\n"
- ".inst 0x459240b9 // saddwb z25.s, z5.s, z18.h\n"
- ".inst 0x459244b2 // saddwt z18.s, z5.s, z18.h\n"
- ".inst 0x459540b8 // saddwb z24.s, z5.s, z21.h\n"
- ".inst 0x459544b7 // saddwt z23.s, z5.s, z21.h\n"
- ".inst 0x459440b6 // saddwb z22.s, z5.s, z20.h\n"
- ".inst 0x459444b5 // saddwt z21.s, z5.s, z20.h\n"
- ".inst 0x459340b4 // saddwb z20.s, z5.s, z19.h\n"
- ".inst 0x459344b3 // saddwt z19.s, z5.s, z19.h\n"
- ".inst 0x4482905f // srshl z31.s, p4/M, z31.s, z2.s\n"
- ".inst 0x44829051 // srshl z17.s, p4/M, z17.s, z2.s\n"
- ".inst 0x4482905e // srshl z30.s, p4/M, z30.s, z2.s\n"
- ".inst 0x44829050 // srshl z16.s, p4/M, z16.s, z2.s\n"
- ".inst 0x04a377ff // sqrdmulh z31.s, z31.s, z3.s\n"
- ".inst 0x04a37631 // sqrdmulh z17.s, z17.s, z3.s\n"
- ".inst 0x04a377de // sqrdmulh z30.s, z30.s, z3.s\n"
- ".inst 0x04a37610 // sqrdmulh z16.s, z16.s, z3.s\n"
- ".inst 0x4482903f // srshl z31.s, p4/M, z31.s, z1.s\n"
- ".inst 0x44829031 // srshl z17.s, p4/M, z17.s, z1.s\n"
- ".inst 0x4482903e // srshl z30.s, p4/M, z30.s, z1.s\n"
- ".inst 0x44829030 // srshl z16.s, p4/M, z16.s, z1.s\n"
- "add z31.s, z31.s, z0.s\n"
- "add z17.s, z17.s, z0.s\n"
- "add z30.s, z30.s, z0.s\n"
- "add z16.s, z16.s, z0.s\n"
- ".inst 0x4482905d // srshl z29.s, p4/M, z29.s, z2.s\n"
- ".inst 0x4482905c // srshl z28.s, p4/M, z28.s, z2.s\n"
- ".inst 0x4482905b // srshl z27.s, p4/M, z27.s, z2.s\n"
- ".inst 0x4482905a // srshl z26.s, p4/M, z26.s, z2.s\n"
- ".inst 0x04a377bd // sqrdmulh z29.s, z29.s, z3.s\n"
- ".inst 0x04a3779c // sqrdmulh z28.s, z28.s, z3.s\n"
- ".inst 0x04a3777b // sqrdmulh z27.s, z27.s, z3.s\n"
- ".inst 0x04a3775a // sqrdmulh z26.s, z26.s, z3.s\n"
- ".inst 0x4482903d // srshl z29.s, p4/M, z29.s, z1.s\n"
- ".inst 0x4482903c // srshl z28.s, p4/M, z28.s, z1.s\n"
- ".inst 0x4482903b // srshl z27.s, p4/M, z27.s, z1.s\n"
- ".inst 0x4482903a // srshl z26.s, p4/M, z26.s, z1.s\n"
- "add z29.s, z29.s, z0.s\n"
- "add z28.s, z28.s, z0.s\n"
- "add z27.s, z27.s, z0.s\n"
- "add z26.s, z26.s, z0.s\n"
- ".inst 0x44829059 // srshl z25.s, p4/M, z25.s, z2.s\n"
- ".inst 0x44829052 // srshl z18.s, p4/M, z18.s, z2.s\n"
- "smax z31.s, p4/M, z31.s, z6.s\n"
- "smax z17.s, p4/M, z17.s, z6.s\n"
- ".inst 0x04a37739 // sqrdmulh z25.s, z25.s, z3.s\n"
- ".inst 0x04a37652 // sqrdmulh z18.s, z18.s, z3.s\n"
- "smin z31.s, p4/M, z31.s, z4.s\n"
- "smin z17.s, p4/M, z17.s, z4.s\n"
- ".inst 0x44829039 // srshl z25.s, p4/M, z25.s, z1.s\n"
- ".inst 0x44829032 // srshl z18.s, p4/M, z18.s, z1.s\n"
- "smax z30.s, p4/M, z30.s, z6.s\n"
- "trn1 z17.h, z31.h, z17.h\n"
- "add z25.s, z25.s, z0.s\n"
- "add z18.s, z18.s, z0.s\n"
- ".inst 0x44829058 // srshl z24.s, p4/M, z24.s, z2.s\n"
- ".inst 0x44829057 // srshl z23.s, p4/M, z23.s, z2.s\n"
- "smin z30.s, p4/M, z30.s, z4.s\n"
- "smax z16.s, p4/M, z16.s, z6.s\n"
- ".inst 0x04a37718 // sqrdmulh z24.s, z24.s, z3.s\n"
- ".inst 0x04a376f7 // sqrdmulh z23.s, z23.s, z3.s\n"
- "smax z29.s, p4/M, z29.s, z6.s\n"
- "smin z16.s, p4/M, z16.s, z4.s\n"
- ".inst 0x44829038 // srshl z24.s, p4/M, z24.s, z1.s\n"
- ".inst 0x44829037 // srshl z23.s, p4/M, z23.s, z1.s\n"
- "smin z29.s, p4/M, z29.s, z4.s\n"
- "trn1 z16.h, z30.h, z16.h\n"
- "add z24.s, z24.s, z0.s\n"
- "add z23.s, z23.s, z0.s\n"
+ "add x20, %x[quant_params], %[offsetof_qp_input_offset]\n"
+ "ld1rw { z3.s }, p0/Z, [x20]\n"
+ ".inst 0x4508a911 // ushllb z17.h, z8.b, #0x0\n"
+ ".inst 0x4508ad18 // ushllt z24.h, z8.b, #0x0\n"
+ ".inst 0x4508a8f7 // ushllb z23.h, z7.b, #0x0\n"
+ ".inst 0x4508acf6 // ushllt z22.h, z7.b, #0x0\n"
+ "neg z3.s, p0/M, z3.s\n"
+ "add x20, %x[quant_params], %[offsetof_qp_per_layer_left_shift]\n"
+ ".inst 0x4508a8d5 // ushllb z21.h, z6.b, #0x0\n"
+ ".inst 0x4508acd4 // ushllt z20.h, z6.b, #0x0\n"
+ "ld1rw { z2.s }, p0/Z, [x20]\n"
+ "add x20, %x[quant_params], %[offsetof_qp_per_layer_mul]\n"
+ ".inst 0x4508a8b3 // ushllb z19.h, z5.b, #0x0\n"
+ ".inst 0x4508acb0 // ushllt z16.h, z5.b, #0x0\n"
+ "ld1rw { z18.s }, p0/Z, [x20]\n"
+ "add x20, %x[quant_params], %[offsetof_qp_per_layer_right_shift]\n"
+ ".inst 0x45914061 // saddwb z1.s, z3.s, z17.h\n"
+ ".inst 0x45914471 // saddwt z17.s, z3.s, z17.h\n"
+ ".inst 0x44828041 // srshl z1.s, p0/M, z1.s, z2.s\n"
+ ".inst 0x44828051 // srshl z17.s, p0/M, z17.s, z2.s\n"
+ ".inst 0x45984060 // saddwb z0.s, z3.s, z24.h\n"
+ ".inst 0x4598447f // saddwt z31.s, z3.s, z24.h\n"
+ ".inst 0x44828040 // srshl z0.s, p0/M, z0.s, z2.s\n"
+ ".inst 0x4482805f // srshl z31.s, p0/M, z31.s, z2.s\n"
+ ".inst 0x4597407e // saddwb z30.s, z3.s, z23.h\n"
+ ".inst 0x4597447d // saddwt z29.s, z3.s, z23.h\n"
+ ".inst 0x4482805e // srshl z30.s, p0/M, z30.s, z2.s\n"
+ ".inst 0x4482805d // srshl z29.s, p0/M, z29.s, z2.s\n"
+ ".inst 0x4596407c // saddwb z28.s, z3.s, z22.h\n"
+ ".inst 0x4596447b // saddwt z27.s, z3.s, z22.h\n"
+ ".inst 0x4482805c // srshl z28.s, p0/M, z28.s, z2.s\n"
+ ".inst 0x4482805b // srshl z27.s, p0/M, z27.s, z2.s\n"
+ ".inst 0x4595407a // saddwb z26.s, z3.s, z21.h\n"
+ ".inst 0x45954479 // saddwt z25.s, z3.s, z21.h\n"
+ ".inst 0x4482805a // srshl z26.s, p0/M, z26.s, z2.s\n"
+ ".inst 0x44828059 // srshl z25.s, p0/M, z25.s, z2.s\n"
+ ".inst 0x45944078 // saddwb z24.s, z3.s, z20.h\n"
+ ".inst 0x45944477 // saddwt z23.s, z3.s, z20.h\n"
+ ".inst 0x44828058 // srshl z24.s, p0/M, z24.s, z2.s\n"
+ ".inst 0x44828057 // srshl z23.s, p0/M, z23.s, z2.s\n"
+ ".inst 0x45934076 // saddwb z22.s, z3.s, z19.h\n"
+ ".inst 0x45934475 // saddwt z21.s, z3.s, z19.h\n"
+ ".inst 0x44828056 // srshl z22.s, p0/M, z22.s, z2.s\n"
+ ".inst 0x44828055 // srshl z21.s, p0/M, z21.s, z2.s\n"
+ ".inst 0x45904074 // saddwb z20.s, z3.s, z16.h\n"
+ ".inst 0x45904473 // saddwt z19.s, z3.s, z16.h\n"
+ ".inst 0x44828054 // srshl z20.s, p0/M, z20.s, z2.s\n"
+ ".inst 0x44828053 // srshl z19.s, p0/M, z19.s, z2.s\n"
+ "ld1rw { z16.s }, p0/Z, [x20]\n"
+ ".inst 0x04b27421 // sqrdmulh z1.s, z1.s, z18.s\n"
+ ".inst 0x04b27631 // sqrdmulh z17.s, z17.s, z18.s\n"
+ "add x20, %x[quant_params], %[offsetof_qp_output_offset]\n"
+ ".inst 0x04b27400 // sqrdmulh z0.s, z0.s, z18.s\n"
+ ".inst 0x04b277ff // sqrdmulh z31.s, z31.s, z18.s\n"
+ ".inst 0x44828201 // srshl z1.s, p0/M, z1.s, z16.s\n"
+ ".inst 0x44828211 // srshl z17.s, p0/M, z17.s, z16.s\n"
+ ".inst 0x04b277de // sqrdmulh z30.s, z30.s, z18.s\n"
+ ".inst 0x04b277bd // sqrdmulh z29.s, z29.s, z18.s\n"
+ ".inst 0x44828200 // srshl z0.s, p0/M, z0.s, z16.s\n"
+ ".inst 0x4482821f // srshl z31.s, p0/M, z31.s, z16.s\n"
+ ".inst 0x04b2779c // sqrdmulh z28.s, z28.s, z18.s\n"
+ ".inst 0x04b2777b // sqrdmulh z27.s, z27.s, z18.s\n"
+ ".inst 0x4482821e // srshl z30.s, p0/M, z30.s, z16.s\n"
+ ".inst 0x4482821d // srshl z29.s, p0/M, z29.s, z16.s\n"
+ ".inst 0x04b2775a // sqrdmulh z26.s, z26.s, z18.s\n"
+ ".inst 0x04b27739 // sqrdmulh z25.s, z25.s, z18.s\n"
+ ".inst 0x4482821c // srshl z28.s, p0/M, z28.s, z16.s\n"
+ ".inst 0x4482821b // srshl z27.s, p0/M, z27.s, z16.s\n"
+ ".inst 0x04b27718 // sqrdmulh z24.s, z24.s, z18.s\n"
+ ".inst 0x04b276f7 // sqrdmulh z23.s, z23.s, z18.s\n"
+ ".inst 0x4482821a // srshl z26.s, p0/M, z26.s, z16.s\n"
+ ".inst 0x44828219 // srshl z25.s, p0/M, z25.s, z16.s\n"
+ ".inst 0x04b276d6 // sqrdmulh z22.s, z22.s, z18.s\n"
+ ".inst 0x04b276b5 // sqrdmulh z21.s, z21.s, z18.s\n"
+ ".inst 0x44828218 // srshl z24.s, p0/M, z24.s, z16.s\n"
+ ".inst 0x44828217 // srshl z23.s, p0/M, z23.s, z16.s\n"
+ ".inst 0x04b27694 // sqrdmulh z20.s, z20.s, z18.s\n"
+ ".inst 0x04b27673 // sqrdmulh z19.s, z19.s, z18.s\n"
+ ".inst 0x44828216 // srshl z22.s, p0/M, z22.s, z16.s\n"
+ ".inst 0x44828215 // srshl z21.s, p0/M, z21.s, z16.s\n"
+ ".inst 0x44828214 // srshl z20.s, p0/M, z20.s, z16.s\n"
+ ".inst 0x44828213 // srshl z19.s, p0/M, z19.s, z16.s\n"
+ "ld1rw { z16.s }, p0/Z, [x20]\n"
+ "add z1.s, z1.s, z16.s\n"
+ "add z17.s, z17.s, z16.s\n"
+ "add z0.s, z0.s, z16.s\n"
+ "add z31.s, z31.s, z16.s\n"
+ "add z30.s, z30.s, z16.s\n"
+ "add z29.s, z29.s, z16.s\n"
+ "add z28.s, z28.s, z16.s\n"
+ "add z27.s, z27.s, z16.s\n"
+ "add z26.s, z26.s, z16.s\n"
+ "add z25.s, z25.s, z16.s\n"
+ "add z24.s, z24.s, z16.s\n"
+ "add z23.s, z23.s, z16.s\n"
+ "add z22.s, z22.s, z16.s\n"
+ "add z21.s, z21.s, z16.s\n"
+ "add z20.s, z20.s, z16.s\n"
+ "add z19.s, z19.s, z16.s\n"
+ "mov z16.s, #0x0\n"
+ "smax z1.s, p0/M, z1.s, z16.s\n"
+ "smax z17.s, p0/M, z17.s, z16.s\n"
+ "smax z0.s, p0/M, z0.s, z16.s\n"
+ "smax z31.s, p0/M, z31.s, z16.s\n"
+ "mov z18.s, #0xff\n"
+ "smax z30.s, p0/M, z30.s, z16.s\n"
+ "smax z29.s, p0/M, z29.s, z16.s\n"
+ "smax z28.s, p0/M, z28.s, z16.s\n"
+ "smax z27.s, p0/M, z27.s, z16.s\n"
+ "smax z26.s, p0/M, z26.s, z16.s\n"
+ "smax z25.s, p0/M, z25.s, z16.s\n"
+ "smax z24.s, p0/M, z24.s, z16.s\n"
+ "smax z23.s, p0/M, z23.s, z16.s\n"
+ "smax z22.s, p0/M, z22.s, z16.s\n"
+ "smax z21.s, p0/M, z21.s, z16.s\n"
+ "smax z20.s, p0/M, z20.s, z16.s\n"
+ "smax z19.s, p0/M, z19.s, z16.s\n"
+ "smin z1.s, p0/M, z1.s, z18.s\n"
+ "smin z17.s, p0/M, z17.s, z18.s\n"
+ "trn1 z17.h, z1.h, z17.h\n"
+ "smin z0.s, p0/M, z0.s, z18.s\n"
+ "smin z31.s, p0/M, z31.s, z18.s\n"
+ "trn1 z16.h, z0.h, z31.h\n"
"trn1 z16.b, z17.b, z16.b\n"
- "st1b { z16.b }, p3, [%x[outptr], x28]\n"
- ".inst 0x44829056 // srshl z22.s, p4/M, z22.s, z2.s\n"
- "incb x28, ALL, MUL #4\n"
- ".inst 0x44829055 // srshl z21.s, p4/M, z21.s, z2.s\n"
- ".inst 0x44829054 // srshl z20.s, p4/M, z20.s, z2.s\n"
- ".inst 0x44829053 // srshl z19.s, p4/M, z19.s, z2.s\n"
- "smax z28.s, p4/M, z28.s, z6.s\n"
- ".inst 0x04a376d6 // sqrdmulh z22.s, z22.s, z3.s\n"
- ".inst 0x04a376b5 // sqrdmulh z21.s, z21.s, z3.s\n"
- ".inst 0x04a37694 // sqrdmulh z20.s, z20.s, z3.s\n"
- ".inst 0x04a37673 // sqrdmulh z19.s, z19.s, z3.s\n"
- ".inst 0x44829036 // srshl z22.s, p4/M, z22.s, z1.s\n"
- ".inst 0x44829035 // srshl z21.s, p4/M, z21.s, z1.s\n"
- ".inst 0x44829034 // srshl z20.s, p4/M, z20.s, z1.s\n"
- ".inst 0x44829033 // srshl z19.s, p4/M, z19.s, z1.s\n"
- "add z22.s, z22.s, z0.s\n"
- "add z21.s, z21.s, z0.s\n"
- "add z20.s, z20.s, z0.s\n"
- "add z19.s, z19.s, z0.s\n"
- "smax z27.s, p4/M, z27.s, z6.s\n"
- "smax z26.s, p4/M, z26.s, z6.s\n"
- "smax z25.s, p4/M, z25.s, z6.s\n"
- "smin z28.s, p4/M, z28.s, z4.s\n"
- "smin z27.s, p4/M, z27.s, z4.s\n"
- "smin z26.s, p4/M, z26.s, z4.s\n"
- "smin z25.s, p4/M, z25.s, z4.s\n"
- "trn1 z17.h, z29.h, z28.h\n"
- "smax z18.s, p4/M, z18.s, z6.s\n"
- "trn1 z16.h, z27.h, z26.h\n"
- "smax z24.s, p4/M, z24.s, z6.s\n"
+ "smin z30.s, p0/M, z30.s, z18.s\n"
+ "smin z29.s, p0/M, z29.s, z18.s\n"
+ "trn1 z17.h, z30.h, z29.h\n"
+ "st1b { z16.b }, p4, [%x[outptr], x9]\n"
+ "smin z28.s, p0/M, z28.s, z18.s\n"
+ "smin z27.s, p0/M, z27.s, z18.s\n"
+ "trn1 z16.h, z28.h, z27.h\n"
"trn1 z16.b, z17.b, z16.b\n"
- "st1b { z16.b }, p2, [%x[outptr], x27]\n"
- "smin z18.s, p4/M, z18.s, z4.s\n"
- "incb x27, ALL, MUL #4\n"
- "smin z24.s, p4/M, z24.s, z4.s\n"
- "smax z23.s, p4/M, z23.s, z6.s\n"
- "smax z22.s, p4/M, z22.s, z6.s\n"
- "smax z21.s, p4/M, z21.s, z6.s\n"
- "trn1 z18.h, z25.h, z18.h\n"
- "smin z23.s, p4/M, z23.s, z4.s\n"
- "smin z22.s, p4/M, z22.s, z4.s\n"
- "smin z21.s, p4/M, z21.s, z4.s\n"
- "smax z20.s, p4/M, z20.s, z6.s\n"
+ "smin z26.s, p0/M, z26.s, z18.s\n"
+ "smin z25.s, p0/M, z25.s, z18.s\n"
+ "trn1 z17.h, z26.h, z25.h\n"
+ "st1b { z16.b }, p3, [%x[outptr], x28]\n"
+ "smin z24.s, p0/M, z24.s, z18.s\n"
+ "smin z23.s, p0/M, z23.s, z18.s\n"
"trn1 z16.h, z24.h, z23.h\n"
- "smax z19.s, p4/M, z19.s, z6.s\n"
+ "trn1 z16.b, z17.b, z16.b\n"
+ "smin z22.s, p0/M, z22.s, z18.s\n"
+ "smin z21.s, p0/M, z21.s, z18.s\n"
"trn1 z17.h, z22.h, z21.h\n"
- "trn1 z16.b, z18.b, z16.b\n"
- "st1b { z16.b }, p1, [%x[outptr], x26]\n"
- "smin z20.s, p4/M, z20.s, z4.s\n"
- "incb x26, ALL, MUL #4\n"
- "smin z19.s, p4/M, z19.s, z4.s\n"
+ "st1b { z16.b }, p2, [%x[outptr], x27]\n"
+ "smin z20.s, p0/M, z20.s, z18.s\n"
+ "smin z19.s, p0/M, z19.s, z18.s\n"
"trn1 z16.h, z20.h, z19.h\n"
"trn1 z16.b, z17.b, z16.b\n"
- "st1b { z16.b }, p0, [%x[outptr], x25]\n"
- "incb x25, ALL, MUL #4\n"
- "whilelt p0.b, x25, %x[n_channels]\n"
+ "st1b { z16.b }, p1, [%x[outptr], x26]\n"
+ "incb x26, ALL, MUL #4\n"
+ "whilelt p1.b, x26, %x[n_channels]\n"
+ "incb x9, ALL, MUL #4\n"
+ "incb x28, ALL, MUL #4\n"
+ "incb x27, ALL, MUL #4\n"
"b.any 1b\n"
"7:" // Single vector of channels
- "whilelt p3.b, x28, %x[n_channels]\n"
+ "whilelt p4.b, x9, %x[n_channels]\n"
"b.none 14f\n"
"8:" // Single vector of channels: Loop
- "mov z10.b, #0x0\n"
- "mov x19, %x[inptrs]\n"
- "lsr x24, %x[n_valid_cells], #0x2\n"
- "cbz x24, 11f\n"
- "ldp x23, x22, [x19, #0x0]\n"
- "ldp x21, x20, [x19, #0x10]\n"
- "add x19, x19, #0x20\n"
- "subs x24, x24, #0x1\n"
- "ld1b { z3.b }, p3/Z, [x23, x28]\n"
- "ld1b { z2.b }, p3/Z, [x22, x28]\n"
- "ld1b { z1.b }, p3/Z, [x21, x28]\n"
- "ld1b { z0.b }, p3/Z, [x20, x28]\n"
+ "lsr x25, %x[n_valid_cells], #0x2\n"
+ "mov z8.b, #0x0\n"
+ "mov x24, %x[inptrs]\n"
+ "cbz x25, 11f\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "subs x25, x25, #0x1\n"
+ "add x24, x24, #0x20\n"
+ "ld1b { z4.b }, p4/Z, [x23, x9]\n"
+ "ld1b { z3.b }, p4/Z, [x22, x9]\n"
+ "ld1b { z2.b }, p4/Z, [x21, x9]\n"
+ "ld1b { z1.b }, p4/Z, [x20, x9]\n"
"beq 10f\n"
"9:" // Single vector of channels: Loop: 4 inputs loop
- "movprfx z19, z3\n umax z19.b, p4/M, z19.b, z2.b\n"
- "ldp x23, x22, [x19, #0x0]\n"
- "subs x24, x24, #0x1\n"
- "movprfx z23, z1\n umax z23.b, p4/M, z23.b, z0.b\n"
- "ldp x21, x20, [x19, #0x10]\n"
- "add x19, x19, #0x20\n"
- "umax z19.b, p4/M, z19.b, z23.b\n"
- "ld1b { z3.b }, p3/Z, [x23, x28]\n"
- "ld1b { z2.b }, p3/Z, [x22, x28]\n"
- "umax z10.b, p4/M, z10.b, z19.b\n"
- "ld1b { z1.b }, p3/Z, [x21, x28]\n"
- "ld1b { z0.b }, p3/Z, [x20, x28]\n"
+ "movprfx z16, z4\n umax z16.b, p0/M, z16.b, z3.b\n"
+ "movprfx z17, z2\n umax z17.b, p0/M, z17.b, z1.b\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "umax z16.b, p0/M, z16.b, z17.b\n"
+ "subs x25, x25, #0x1\n"
+ "ld1b { z4.b }, p4/Z, [x23, x9]\n"
+ "ld1b { z3.b }, p4/Z, [x22, x9]\n"
+ "umax z8.b, p0/M, z8.b, z16.b\n"
+ "add x24, x24, #0x20\n"
+ "ld1b { z2.b }, p4/Z, [x21, x9]\n"
+ "ld1b { z1.b }, p4/Z, [x20, x9]\n"
"bgt 9b\n"
"10:" // Single vector of channels: Loop: 4 inputs tail
- "movprfx z19, z3\n umax z19.b, p4/M, z19.b, z2.b\n"
- "movprfx z23, z1\n umax z23.b, p4/M, z23.b, z0.b\n"
- "umax z19.b, p4/M, z19.b, z23.b\n"
- "umax z10.b, p4/M, z10.b, z19.b\n"
+ "movprfx z16, z4\n umax z16.b, p0/M, z16.b, z3.b\n"
+ "movprfx z17, z2\n umax z17.b, p0/M, z17.b, z1.b\n"
+ "umax z16.b, p0/M, z16.b, z17.b\n"
+ "umax z8.b, p0/M, z8.b, z16.b\n"
"11:" // Single vector of channels: Loop: After loop
- "ands x20, %x[n_valid_cells], #0x3\n"
+ "ands x21, %x[n_valid_cells], #0x3\n"
"beq 13f\n"
"12:" // Single vector of channels: Loop: Single input loop
- "ldr x23, [x19], #0x8\n"
- "subs x20, x20, #0x1\n"
- "ld1b { z3.b }, p3/Z, [x23, x28]\n"
- "umax z10.b, p4/M, z10.b, z3.b\n"
+ "ldr x20, [x24], #0x8\n"
+ "ld1b { z16.b }, p4/Z, [x20, x9]\n"
+ "subs x21, x21, #0x1\n"
+ "umax z8.b, p0/M, z8.b, z16.b\n"
"bgt 12b\n"
"13:" // Single vector of channels: Loop: Single input loop: End
- "mov z6.s, #0x0\n"
- "add x19, %x[quant_params], %[offsetof_qp_input_offset]\n"
- "ld1rw { z5.s }, p4/Z, [x19]\n"
- "mov z4.s, #0xff\n"
- "add x19, %x[quant_params], %[offsetof_qp_per_layer_mul]\n"
- ".inst 0x4508a951 // ushllb z17.h, z10.b, #0x0\n"
- "ld1rw { z3.s }, p4/Z, [x19]\n"
- "add x19, %x[quant_params], %[offsetof_qp_per_layer_left_shift]\n"
- ".inst 0x4508ad50 // ushllt z16.h, z10.b, #0x0\n"
- "ld1rw { z2.s }, p4/Z, [x19]\n"
- "add x19, %x[quant_params], %[offsetof_qp_per_layer_right_shift]\n"
- "neg z5.s, p4/M, z5.s\n"
- "ld1rw { z1.s }, p4/Z, [x19]\n"
- "add x19, %x[quant_params], %[offsetof_qp_output_offset]\n"
- ".inst 0x459140bf // saddwb z31.s, z5.s, z17.h\n"
- "ld1rw { z0.s }, p4/Z, [x19]\n"
- ".inst 0x459144b1 // saddwt z17.s, z5.s, z17.h\n"
- ".inst 0x459040be // saddwb z30.s, z5.s, z16.h\n"
- ".inst 0x459044b0 // saddwt z16.s, z5.s, z16.h\n"
- ".inst 0x4482905f // srshl z31.s, p4/M, z31.s, z2.s\n"
- ".inst 0x44829051 // srshl z17.s, p4/M, z17.s, z2.s\n"
- ".inst 0x4482905e // srshl z30.s, p4/M, z30.s, z2.s\n"
- ".inst 0x44829050 // srshl z16.s, p4/M, z16.s, z2.s\n"
- ".inst 0x04a377ff // sqrdmulh z31.s, z31.s, z3.s\n"
- ".inst 0x04a37631 // sqrdmulh z17.s, z17.s, z3.s\n"
- ".inst 0x04a377de // sqrdmulh z30.s, z30.s, z3.s\n"
- ".inst 0x04a37610 // sqrdmulh z16.s, z16.s, z3.s\n"
- ".inst 0x4482903f // srshl z31.s, p4/M, z31.s, z1.s\n"
- ".inst 0x44829031 // srshl z17.s, p4/M, z17.s, z1.s\n"
- ".inst 0x4482903e // srshl z30.s, p4/M, z30.s, z1.s\n"
- ".inst 0x44829030 // srshl z16.s, p4/M, z16.s, z1.s\n"
- "add z31.s, z31.s, z0.s\n"
- "add z17.s, z17.s, z0.s\n"
- "add z30.s, z30.s, z0.s\n"
- "add z16.s, z16.s, z0.s\n"
- "smax z31.s, p4/M, z31.s, z6.s\n"
- "smax z17.s, p4/M, z17.s, z6.s\n"
- "smax z30.s, p4/M, z30.s, z6.s\n"
- "smax z16.s, p4/M, z16.s, z6.s\n"
- "smin z31.s, p4/M, z31.s, z4.s\n"
- "smin z17.s, p4/M, z17.s, z4.s\n"
- "smin z30.s, p4/M, z30.s, z4.s\n"
- "smin z16.s, p4/M, z16.s, z4.s\n"
- "trn1 z17.h, z31.h, z17.h\n"
- "trn1 z16.h, z30.h, z16.h\n"
+ "add x20, %x[quant_params], %[offsetof_qp_input_offset]\n"
+ "ld1rw { z18.s }, p0/Z, [x20]\n"
+ ".inst 0x4508a911 // ushllb z17.h, z8.b, #0x0\n"
+ ".inst 0x4508ad10 // ushllt z16.h, z8.b, #0x0\n"
+ "neg z18.s, p0/M, z18.s\n"
+ "add x20, %x[quant_params], %[offsetof_qp_per_layer_left_shift]\n"
+ ".inst 0x45914255 // saddwb z21.s, z18.s, z17.h\n"
+ ".inst 0x45914654 // saddwt z20.s, z18.s, z17.h\n"
+ ".inst 0x45904253 // saddwb z19.s, z18.s, z16.h\n"
+ ".inst 0x45904652 // saddwt z18.s, z18.s, z16.h\n"
+ "ld1rw { z17.s }, p0/Z, [x20]\n"
+ "add x20, %x[quant_params], %[offsetof_qp_per_layer_mul]\n"
+ "ld1rw { z16.s }, p0/Z, [x20]\n"
+ ".inst 0x44828235 // srshl z21.s, p0/M, z21.s, z17.s\n"
+ ".inst 0x44828234 // srshl z20.s, p0/M, z20.s, z17.s\n"
+ ".inst 0x04b076b5 // sqrdmulh z21.s, z21.s, z16.s\n"
+ ".inst 0x44828233 // srshl z19.s, p0/M, z19.s, z17.s\n"
+ ".inst 0x44828232 // srshl z18.s, p0/M, z18.s, z17.s\n"
+ ".inst 0x04b07694 // sqrdmulh z20.s, z20.s, z16.s\n"
+ ".inst 0x04b07673 // sqrdmulh z19.s, z19.s, z16.s\n"
+ "add x20, %x[quant_params], %[offsetof_qp_per_layer_right_shift]\n"
+ "ld1rw { z17.s }, p0/Z, [x20]\n"
+ ".inst 0x04b07652 // sqrdmulh z18.s, z18.s, z16.s\n"
+ "add x20, %x[quant_params], %[offsetof_qp_output_offset]\n"
+ ".inst 0x44828235 // srshl z21.s, p0/M, z21.s, z17.s\n"
+ ".inst 0x44828234 // srshl z20.s, p0/M, z20.s, z17.s\n"
+ "ld1rw { z16.s }, p0/Z, [x20]\n"
+ "add z21.s, z21.s, z16.s\n"
+ ".inst 0x44828233 // srshl z19.s, p0/M, z19.s, z17.s\n"
+ ".inst 0x44828232 // srshl z18.s, p0/M, z18.s, z17.s\n"
+ "add z20.s, z20.s, z16.s\n"
+ "add z19.s, z19.s, z16.s\n"
+ "add z18.s, z18.s, z16.s\n"
+ "mov z16.s, #0x0\n"
+ "smax z21.s, p0/M, z21.s, z16.s\n"
+ "smax z20.s, p0/M, z20.s, z16.s\n"
+ "smax z19.s, p0/M, z19.s, z16.s\n"
+ "smax z18.s, p0/M, z18.s, z16.s\n"
+ "mov z16.s, #0xff\n"
+ "smin z21.s, p0/M, z21.s, z16.s\n"
+ "smin z20.s, p0/M, z20.s, z16.s\n"
+ "trn1 z17.h, z21.h, z20.h\n"
+ "smin z19.s, p0/M, z19.s, z16.s\n"
+ "smin z18.s, p0/M, z18.s, z16.s\n"
+ "trn1 z16.h, z19.h, z18.h\n"
"trn1 z16.b, z17.b, z16.b\n"
- "st1b { z16.b }, p3, [%x[outptr], x28]\n"
- "incb x28\n"
- "whilelt p3.b, x28, %x[n_channels]\n"
+ "st1b { z16.b }, p4, [%x[outptr], x9]\n"
+ "incb x9\n"
+ "whilelt p4.b, x9, %x[n_channels]\n"
"b.any 8b\n"
"14:" // End
-
:
: [inptrs] "r" (inptrs), [n_channels] "r" (n_channels), [n_valid_cells] "r" (n_valid_cells), [offsetof_qp_input_offset] "I" (offsetof(Requantize32, input_offset)), [offsetof_qp_output_offset] "I" (offsetof(Requantize32, output_offset)), [offsetof_qp_per_layer_left_shift] "I" (offsetof(Requantize32, per_layer_left_shift)), [offsetof_qp_per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [offsetof_qp_per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [outptr] "r" (outptr), [quant_params] "r" (&qp)
- : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x9", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
} // namespace pooling
} // namespace arm_conv
-#endif // defined(__ARM_FEATURE_SVE) && defined(SVE2)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/pooling_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/pooling/pooling_depthfirst.hpp
index ad95207fb3..1ca478513c 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/pooling_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/pooling_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,288 +24,262 @@
#pragma once
-#include "pool_common.hpp"
+#include "depthfirst_driver.hpp"
+#include "src/core/NEON/kernels/arm_conv/addressing.hpp"
#include "utils.hpp"
-
-#include "arm_compute/core/Types.h"
+#if !defined(_WIN64) && !defined(__OpenBSD__)
+#include <alloca.h>
+#endif /* !defined(_WIN64) && !defined(__OpenBSD__) */
#include <limits>
namespace arm_conv {
namespace pooling {
-template <class strategy>
-class PoolingDepthfirst : public PoolingCommon<typename strategy::operand_type, typename strategy::return_type>
+template <typename TInput, typename TOutput>
+class DepthfirstStrategy : public IDepthfirstStrategy
{
- using TInput = typename strategy::operand_type;
- using TOutput = typename strategy::return_type;
-
- const PoolingArgs m_args; // Copy of arguments
+ unsigned int input_rows, input_cols, output_rows, output_cols;
- constexpr static unsigned int input_rows(void)
+ public:
+ DepthfirstStrategy(unsigned int window_rows, unsigned int window_cols,
+ unsigned int stride_rows, unsigned int stride_cols,
+ unsigned int output_rows, unsigned int output_cols)
+ : input_rows(output_rows + (window_rows - 1) * stride_rows),
+ input_cols(output_cols + (window_cols - 1) * stride_cols),
+ output_rows(output_rows), output_cols(output_cols)
{
- return (strategy::out_rows() - 1)*strategy::stride_rows() + strategy::pool_rows();
}
- constexpr static unsigned int input_cols(void)
- {
- return (strategy::out_cols() - 1)*strategy::stride_cols() + strategy::pool_cols();
- }
+ unsigned int get_input_rows() const override { return input_rows; }
+ unsigned int get_input_cols() const override { return input_cols; }
+ unsigned int get_output_rows() const override { return output_rows; }
+ unsigned int get_output_cols() const override { return output_cols; }
+
+ typedef void (*KernelType)(
+ unsigned int n_channels,
+ const TInput *const *,
+ TOutput *const *,
+ bool exclude_padding,
+ unsigned int pad_left,
+ unsigned int pad_top,
+ unsigned int pad_right,
+ unsigned int pad_bottom
+ );
+ virtual KernelType get_kernel(void) const = 0;
+};
+
+
+struct WorkingSpace
+{
+ void *input_buffer;
+ void *output_buffer;
+};
+
+template <typename TInput, typename TOutput=TInput, class OutputStage=Nothing>
+class PoolingDepthfirst : public DepthfirstDriver<TInput, TOutput>
+{
size_t sizeof_input_buffer(void) const
{
- return sizeof(TInput) * m_args.n_channels;
+ return sizeof(TInput) * this->m_args.n_channels;
}
size_t sizeof_output_buffer(void) const
{
- return sizeof(TOutput) * m_args.n_channels;
+ return sizeof(TOutput) * this->m_args.n_channels;
}
- public:
- PoolingDepthfirst(const PoolingArgs &args) : m_args(args)
+ protected:
+ /* Compute the amount of working space required for a single thread. */
+ size_t get_working_size_per_thread() const override
{
+ return sizeof(WorkingSpace) + this->m_args.n_channels * (sizeof(TInput) + sizeof(TOutput));
}
- PoolingDepthfirst(PoolingDepthfirst &) = delete;
- PoolingDepthfirst &operator=(PoolingDepthfirst &) = delete;
-
- size_t get_working_size(unsigned int num_threads) const override
+ /* Initialise the working space for a thread. */
+ void initialise_working_space(void *raw_ws) const override
{
- // We require a channel-length vector of input padding values
- // (to be shared amongst all threads) and (for each thread) a
- // channel-length vector in which to dump surplus output.
- return sizeof_input_buffer() + num_threads * sizeof_output_buffer();
+ auto ws = reinterpret_cast<WorkingSpace *>(raw_ws);
+ ws->input_buffer = ws + 1;
+ ws->output_buffer = reinterpret_cast<char *>(ws + 1) + sizeof(TInput) * this->m_args.n_channels;
+
+ // Fill the input buffer with an appropriate value
+ TInput fill_val = 0;
+ if (this->m_args.pool_type == PoolingType::MAX)
+ {
+ using limits = std::numeric_limits<TInput>;
+ if (limits::has_infinity)
+ {
+ fill_val = -limits::infinity();
+ }
+ else
+ {
+ fill_val = limits::min();
+ }
+ }
+
+ auto ptr = reinterpret_cast<TInput *>(ws->input_buffer);
+ auto n_channels = this->m_args.n_channels;
+ for (; n_channels; n_channels--)
+ {
+ *(ptr++) = fill_val;
+ }
}
- void execute(
- const void *const input,
- void *const output,
- void *const working_space,
- unsigned int thread_id,
- unsigned int num_threads
+ /* Compute a portion of the output tensor with padding. */
+ void compute_tile_padded(
+ unsigned int output_i, unsigned int output_j,
+ unsigned int channel_start, unsigned int channel_end,
+ const TensorSpec<const TInput *> &input,
+ const TensorSpec<TOutput *> &output,
+ void *working_space
) const override
{
- const size_t ld_input_col = m_args.n_channels;
- const size_t ld_input_row = ld_input_col * m_args.input_cols;
- const size_t ld_input_batch = ld_input_row * m_args.input_rows;
- const size_t ld_output_col = ld_input_col;
- const size_t ld_output_row = ld_output_col * m_args.output_cols;
- const size_t ld_output_batch = ld_output_row * m_args.output_rows;
-
- execute(
- input, ld_input_col, ld_input_row, ld_input_batch,
- output, ld_output_col, ld_output_row, ld_output_batch,
- working_space,
- thread_id, num_threads
+ const auto kern = reinterpret_cast<const DepthfirstStrategy<TInput, TOutput> *>(
+ this->m_strat.get())->get_kernel();
+
+ // Get the working space, and some space on the stack for pointer arrays
+ auto ws = reinterpret_cast<WorkingSpace *>(working_space);
+ auto inptr_array = reinterpret_cast<const TInput **>(alloca(
+ sizeof(TInput *) * this->m_strat->get_input_rows() * this->m_strat->get_input_cols()));
+ auto outptr_array = reinterpret_cast<TOutput **>(alloca(
+ sizeof(TOutput *) * this->m_strat->get_output_rows() * this->m_strat->get_output_cols()));
+
+ // Prepare the input pointers
+ const int ii = static_cast<int>(output_i * this->m_args.pool_stride.rows) - this->m_args.padding.top;
+ const auto input_pad_top = static_cast<unsigned int>(ii < 0 ? -ii : 0);
+ const auto input_i = static_cast<unsigned int>(ii < 0 ? 0 : ii);
+
+ const unsigned int end_ii = ii + this->m_strat->get_input_rows();
+ const auto input_pad_bottom = end_ii < this->m_args.input_rows ? 0 : end_ii - this->m_args.input_rows;
+
+ const int ij = static_cast<int>(output_j * this->m_args.pool_stride.cols) - this->m_args.padding.left;
+ const auto input_pad_left = static_cast<unsigned int>(ij < 0 ? -ij : 0);
+ const auto input_j = static_cast<unsigned int>(ij < 0 ? 0 : ij);
+
+ const unsigned int end_ij = ij + this->m_strat->get_input_cols();
+ const auto input_pad_right = end_ij < this->m_args.input_cols ? 0 : end_ij - this->m_args.input_cols;
+
+ fill_pointer_array<const TInput>(
+ inptr_array, this->m_strat->get_input_rows(), this->m_strat->get_input_cols(),
+ input.base + input_i*input.ld_row + input_j*input.ld_col + channel_start,
+ input.ld_row, input.ld_col,
+ reinterpret_cast<const TInput *>(ws->input_buffer),
+ input_pad_top, this->m_args.input_rows - input_i,
+ input_pad_left, this->m_args.input_cols - input_j
);
- }
- void execute(
- const void *const input,
- size_t ld_input_col,
- size_t ld_input_row,
- size_t ld_input_batch,
- void *const output,
- size_t ld_output_col,
- size_t ld_output_row,
- size_t ld_output_batch,
- void *const working_space,
- unsigned int thread_id,
- unsigned int num_threads
- ) const override
- {
- execute(
- m_args.n_batches, m_args.input_rows, m_args.input_cols,
- m_args.n_channels,
- input, ld_input_col, ld_input_row, ld_input_batch,
- m_args.padding,
- m_args.output_rows, m_args.output_cols,
- output, ld_output_col, ld_output_row, ld_output_batch,
- working_space,
- thread_id, num_threads
+ // Prepare the output pointers
+ fill_pointer_array(
+ outptr_array, this->m_strat->get_output_rows(), this->m_strat->get_output_cols(),
+ output.base + output_i*output.ld_row + output_j*output.ld_col + channel_start,
+ output.ld_row, output.ld_col,
+ reinterpret_cast<TOutput *>(ws->output_buffer),
+ 0, this->m_args.output_rows - output_i, // Top padding, # valid rows
+ 0, this->m_args.output_cols - output_j // Left padding, # valid columns
+ );
+
+ // Call the kernel
+ kern(
+ channel_end - channel_start, inptr_array, outptr_array,
+ this->m_args.exclude_padding,
+ input_pad_left, input_pad_top,
+ input_pad_right, input_pad_bottom
);
}
- void execute(
- unsigned int batches,
- unsigned int height,
- unsigned int width,
- unsigned int channels,
- const void *const _input,
- size_t ld_input_col,
- size_t ld_input_row,
- size_t ld_input_batch,
- const PaddingValues &padding,
- unsigned int output_height,
- unsigned int output_width,
- void *const _output,
- size_t ld_output_col,
- size_t ld_output_row,
- size_t ld_output_batch,
- void *const _working_space,
- unsigned int thread_id,
- unsigned int num_threads
+ // Compute a portion of the work with only top/bottom padding.
+ void compute_row_padded_tile_row(
+ const unsigned int output_i, unsigned int output_j, unsigned int n_tile_cols,
+ const unsigned int channel_start, const unsigned int channel_end,
+ const TensorSpec<const TInput *> &input,
+ const TensorSpec<TOutput *> &output,
+ void *working_space
) const override
{
- ARM_COMPUTE_UNUSED(batches, ld_input_batch, ld_output_batch);
- strategy strat(m_args.cpu_info);
-#ifdef CYCLE_PROFILING
- arm_gemm::profiler prof;
-#endif // CYCLE_PROFILING
-
- // Cast input and output pointers into the right types
- const TInput *const inptr = static_cast<const TInput *>(_input);
- TOutput *const outptr = static_cast<TOutput *>(_output);
-
- const unsigned int roundup_output_rows = roundup(output_height, num_threads);
- const unsigned int rows_per_thread = roundup_output_rows / num_threads;
- const int start_out_height = static_cast<int>(thread_id * rows_per_thread);
- const int end_out_height = std::min<int>(output_height, static_cast<int>((thread_id + 1) * rows_per_thread));
-
- // Create an array for the input pointers
- const TInput * _inptr_array[input_rows() * input_cols()];
- const TInput **const inptr_array = _inptr_array;
-
- // Create an array for the output pointers
- TOutput * _outptr_array[strategy::out_rows() * strategy::out_cols()];
- TOutput **const outptr_array = _outptr_array;
-
- // Allocate portions of the working space
- uint8_t *const working_space = static_cast<uint8_t *>(_working_space);
- TOutput *const output_buffer = reinterpret_cast<TOutput *>(working_space + thread_id * sizeof_output_buffer());
- TInput *const input_buffer = reinterpret_cast<TInput *>(working_space + num_threads * sizeof_output_buffer());
-
- // Initialise the input buffer
- for (unsigned int c = 0; c < channels; c++)
- {
- TInput &val = input_buffer[c];
+ const auto kern = reinterpret_cast<const DepthfirstStrategy<TInput, TOutput> *>(
+ this->m_strat.get())->get_kernel();
+
+ // Get the working space, and some space on the stack for pointer arrays
+ auto ws = reinterpret_cast<WorkingSpace *>(working_space);
+ auto inptr_array = reinterpret_cast<const TInput **>(alloca(
+ sizeof(TInput *) * this->m_strat->get_input_rows() * this->m_strat->get_input_cols()));
+ auto outptr_array = reinterpret_cast<TOutput **>(alloca(
+ sizeof(TOutput *) * this->m_strat->get_output_rows() * this->m_strat->get_output_cols()));
+
+ // Prepare the initial input pointers
+ const int ii = static_cast<int>(output_i * this->m_args.pool_stride.rows) - this->m_args.padding.top;
+ const auto input_pad_top = static_cast<unsigned int>(ii < 0 ? -ii : 0);
+ const auto input_i = static_cast<unsigned int>(ii < 0 ? 0 : ii);
+
+ const unsigned int end_ii = ii + this->m_strat->get_input_rows();
+ const auto input_pad_bottom = end_ii < this->m_args.input_rows ? 0 : end_ii - this->m_args.input_rows;
+
+ const int ij = static_cast<int>(output_j * this->m_args.pool_stride.cols) - this->m_args.padding.left;
+ const auto input_j = static_cast<unsigned int>(ij < 0 ? 0 : ij);
+
+ const auto end_oi = output_i + this->m_strat->get_output_cols();
+ const auto output_pad_bottom = end_oi < this->m_args.output_rows ? 0 : end_oi - this->m_args.output_rows;
+
+ fill_pointer_array<const TInput>(
+ inptr_array, this->m_strat->get_input_rows(), this->m_strat->get_input_cols(),
+ input.base + input_i*input.ld_row + input_j*input.ld_col + channel_start,
+ input.ld_row, input.ld_col,
+ reinterpret_cast<const TInput *>(ws->input_buffer),
+ input_pad_top, this->m_args.input_rows - input_i,
+ 0, this->m_args.input_cols - input_j
+ );
- if (strategy::pooling_type() == PoolingType::AVERAGE)
- {
- val = static_cast<TInput>(0);
- }
- else if (strategy::pooling_type() == PoolingType::MAX)
- {
-#if defined(__aarch64__)
- using InputType = typename std::conditional<std::is_same<TInput, __fp16>::value, arm_compute::half, TInput>::type;
- using limits = std::numeric_limits<InputType>;
-#else // defined(__aarch64__)
- using limits = std::numeric_limits<TInput>;
-#endif // defined(__aarch64__)
- if (limits::has_infinity)
- {
- val = -limits::infinity();
- }
- else
- {
- val = limits::min();
- }
- }
- }
+ // Prepare the initial output pointers
+ fill_pointer_array(
+ outptr_array, this->m_strat->get_output_rows(), this->m_strat->get_output_cols(),
+ output.base + output_i*output.ld_row + output_j*output.ld_col + channel_start,
+ output.ld_row, output.ld_col,
+ reinterpret_cast<TOutput *>(ws->output_buffer),
+ 0, this->m_args.output_rows - output_i, // Top padding, # valid rows
+ 0, this->m_args.output_cols - output_j // Left padding, # valid columns
+ );
- // For each output tile, construct the requisite set of pointers and call
- // into the kernel.
- for (unsigned int batch = 0; batch < batches; batch++)
+ // Call the kernel
+ for (; n_tile_cols; n_tile_cols--)
{
- // Get batch pointers
- const auto inptr_batch = inptr + batch * ld_input_batch;
- const auto outptr_batch = outptr + batch * ld_output_batch;
+ kern(
+ channel_end - channel_start, inptr_array, outptr_array,
+ this->m_args.exclude_padding,
+ 0, input_pad_top,
+ 0, input_pad_bottom
+ );
+
+ // Progress the input and output pointer arrays
+ const auto input_col_stride = input.ld_col * this->m_strat->get_output_cols() * this->m_args.pool_stride.cols;
+ for (
+ auto n = input_pad_top * this->m_strat->get_input_cols();
+ n < (this->m_strat->get_input_rows() - input_pad_bottom) * this->m_strat->get_input_cols();
+ n++
+ )
+ {
+ inptr_array[n] += input_col_stride;
+ }
- for (int start_out_i = start_out_height;
- start_out_i < end_out_height;
- start_out_i += static_cast<int>(strategy::out_rows()))
+ const auto output_col_stride = output.ld_col * this->m_strat->get_output_cols();
+ for (
+ auto n = 0u;
+ n < (this->m_strat->get_output_rows() - output_pad_bottom) * this->m_strat->get_output_cols();
+ n++
+ )
{
- const int end_out_i = start_out_i + strategy::out_rows();
- const int start_in_i = start_out_i * strategy::stride_rows() - padding.top;
- const int end_in_i = start_in_i + input_rows();
-
- // Compute top/bottom padding - TODO Is this right for average pooling?
- const auto pad_top = static_cast<unsigned int>(-std::min(start_in_i, 0));
- const auto pad_bottom = static_cast<unsigned int>(-std::min(static_cast<int>(height) - end_in_i, 0));
- const unsigned int valid_output_rows = std::min(
- end_out_i - start_out_i,
- static_cast<int>(end_out_height) - start_out_i
- );
-
- // Fill the input pointer array with padding values
- for (auto index = 0u; index < input_rows() * input_cols(); index++)
- {
- inptr_array[index] = input_buffer;
- }
-
- for (int start_out_j = 0, start_in_j = -padding.left;
- start_out_j < static_cast<int>(output_width);
- start_out_j += static_cast<int>(strategy::out_cols()),
- start_in_j += static_cast<int>(strategy::out_cols()) * strategy::stride_cols())
- {
- const int end_out_j = start_out_j + strategy::out_cols();
- const int end_in_j = start_in_j + input_cols();
-
- // Compute left/right padding - TODO Is this right for average pooling?
- const auto pad_left = static_cast<unsigned int>(-std::min(start_in_j, 0));
- const auto pad_right = static_cast<unsigned int>(-std::min(static_cast<int>(width) - end_in_j, 0));
-
- const unsigned int valid_output_cols = std::min(
- end_out_j - start_out_j,
- static_cast<int>(output_width) - start_out_j
- );
-
- // Construct the input pointer array - fill the array with pointers to
- // the input buffer and then fill in the required values.
- for (auto i = pad_top; i < input_rows() - pad_bottom; i++)
- {
- // Can skip over the left padding because we will have either the
- // same or less than the previous tile.
- unsigned int j = pad_left;
- const TInput *colptr = inptr_batch + (start_in_i + i) * ld_input_row + (start_in_j + j) * ld_input_col;
- const TInput **ptrs = inptr_array + i * input_cols() + j;
- for (; j < input_cols() - pad_right; j++)
- {
- *(ptrs++) = colptr;
- colptr += ld_input_col;
- }
- for (; j < input_cols(); j++)
- {
- *(ptrs++) = input_buffer;
- }
- }
-
- // Construct the output pointer array.
- TOutput **outptr_pos = outptr_array;
- for (auto i = 0u; i < valid_output_rows; i++)
- {
- unsigned int j = 0u;
- TOutput *colptr = outptr_batch + (start_out_i + i) * ld_output_row + start_out_j * ld_output_col;
- for (; j < valid_output_cols; j++)
- {
- *(outptr_pos++) = colptr;
- colptr += ld_output_col;
- }
- for (; j < strategy::out_cols(); j++)
- {
- *(outptr_pos++) = output_buffer;
- }
- }
- for (auto i = valid_output_rows; i < strategy::out_rows(); i++)
- {
- for (auto j = 0u; j < strategy::out_cols(); j++)
- {
- *(outptr_pos++) = output_buffer;
- }
- }
-
-#ifdef CYCLE_PROFILING
- // TODO Work number
- auto p = prof.ScopedProfiler(PROFILE_KERNEL, (unsigned long)(strategy::out_rows() * strategy::out_cols() * strategy::pool_rows() * strategy::pool_cols()));
-#endif
- strat.kernel(
- channels, inptr_array, outptr_array,
- m_args.exclude_padding, pad_left, pad_top, pad_right, pad_bottom
- );
- }
+ outptr_array[n] += output_col_stride;
}
}
}
+
+ public:
+ PoolingDepthfirst(const DepthfirstStrategy<TInput, TOutput> *strat,
+ const PoolingArgs &args, const OutputStage &os = {})
+ : DepthfirstDriver<TInput, TOutput>(strat, args)
+ {
+ ARM_COMPUTE_UNUSED(os);
+ }
};
} // namespace pooling
diff --git a/src/core/NEON/kernels/arm_conv/pooling/pooling_depthfirst_cache_oblivious.hpp b/src/core/NEON/kernels/arm_conv/pooling/pooling_depthfirst_cache_oblivious.hpp
deleted file mode 100644
index 4aabd957cd..0000000000
--- a/src/core/NEON/kernels/arm_conv/pooling/pooling_depthfirst_cache_oblivious.hpp
+++ /dev/null
@@ -1,312 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#pragma once
-
-#include "pool_common.hpp"
-
-#include <stack>
-#include <vector>
-
-namespace arm_conv {
-namespace pooling {
-
-template <class strategy>
-class PoolingDepthfirstCacheOblivious : public PoolingCommon<typename strategy::operand_type, typename strategy::return_type>
-{
- using TInput = typename strategy::operand_type;
- using TOutput = typename strategy::return_type;
-
- const PoolingArgs m_args; // Copy of arguments
-
- constexpr static unsigned int input_rows(void)
- {
- return (strategy::out_rows() - 1)*strategy::stride_rows() + strategy::pool_rows();
- }
-
- constexpr static unsigned int input_cols(void)
- {
- return (strategy::out_cols() - 1)*strategy::stride_cols() + strategy::pool_cols();
- }
-
- size_t sizeof_input_buffer(void) const
- {
- return sizeof(TInput) * m_args.n_channels;
- }
-
- size_t sizeof_output_buffer(void) const
- {
- return sizeof(TOutput) * m_args.n_channels;
- }
-
- public:
- PoolingDepthfirstCacheOblivious(const PoolingArgs &args) : m_args(args)
- {
- }
-
- PoolingDepthfirstCacheOblivious(PoolingDepthfirstCacheOblivious &) = delete;
- PoolingDepthfirstCacheOblivious &operator=(PoolingDepthfirstCacheOblivious &) = delete;
-
- size_t get_working_size(void) const override
- {
- // We require an array of pointers for the inputs and outputs, a
- // channel-length vector in which to dump surplus output, and a
- // channel-length vector of padding values.
- return sizeof_input_buffer() + sizeof_output_buffer();
- }
-
- void execute(
- const void *const input,
- void *const output,
- void *const working_space
- ) const override
- {
- const size_t ld_input_col = m_args.n_channels;
- const size_t ld_input_row = ld_input_col * m_args.input_cols;
- const size_t ld_input_batch = ld_input_row * m_args.input_rows;
- const size_t ld_output_col = ld_input_col;
- const size_t ld_output_row = ld_output_col * m_args.output_cols;
- const size_t ld_output_batch = ld_output_row * m_args.output_rows;
-
- execute(
- input, ld_input_col, ld_input_row, ld_input_batch,
- output, ld_output_col, ld_output_row, ld_output_batch,
- working_space
- );
- }
-
- void execute(
- const void *const input,
- size_t ld_input_col,
- size_t ld_input_row,
- size_t ld_input_batch,
- void *const output,
- size_t ld_output_col,
- size_t ld_output_row,
- size_t ld_output_batch,
- void *const working_space
- ) const override
- {
- execute(
- m_args.n_batches, m_args.input_rows, m_args.input_cols,
- m_args.n_channels,
- input, ld_input_col, ld_input_row, ld_input_batch,
- m_args.padding,
- m_args.output_rows, m_args.output_cols,
- output, ld_output_col, ld_output_row, ld_output_batch,
- working_space
- );
- }
-
- void execute(
- unsigned int batches,
- unsigned int input_height,
- unsigned int input_width,
- unsigned int channels,
- const void *const _input,
- size_t ld_input_col,
- size_t ld_input_row,
- size_t ld_input_batch,
- const PaddingValues &padding,
- unsigned int output_height,
- unsigned int output_width,
- void *const _output,
- size_t ld_output_col,
- size_t ld_output_row,
- size_t ld_output_batch,
- void *const _working_space
- ) const override
- {
- strategy strat(m_args.cpu_info);
-#ifdef CYCLE_PROFILING
- arm_gemm::profiler prof;
-#endif // CYCLE_PROFILING
-
- // Cast input and output pointers into the right types
- const TInput *const inptr = static_cast<const TInput *>(_input);
- TOutput *const outptr = static_cast<TOutput *>(_output);
-
- // Allocate portions of the working space
- uint8_t *const working_space = static_cast<uint8_t *>(_working_space);
- TOutput *const output_buffer = reinterpret_cast<TOutput *>(working_space);
- TInput *const input_buffer = reinterpret_cast<TInput *>(working_space + sizeof_output_buffer());
-
- // Fill the input buffer
- const TInput pad_value = (m_args.pool_type == PoolingType::AVERAGE)
- ? static_cast<TInput>(0)
- : (std::numeric_limits<TInput>::has_infinity
- ? -std::numeric_limits<TInput>::infinity()
- : std::numeric_limits<TInput>::lowest());
- for (unsigned int i = 0; i < channels; i++)
- {
- input_buffer[i] = pad_value;
- }
-
- // Keep subdividing the output plane across the longest dimension until we
- // reach the size of the tile. Queue items for later processing. Note - we
- // can determine the largest size of the queue a priori from the input
- // tensor size, this would allow us to allocate memory within the working
- // space and improve performance.
- struct WorkItem
- {
- unsigned int output_i, output_j;
- unsigned int output_height, output_width;
-
- WorkItem(unsigned int i, unsigned int j, unsigned int height, unsigned int width)
- : output_i(i), output_j(j), output_height(height), output_width(width) {}
- };
-
- auto execute = [&] (const WorkItem &item) {
- // Create an array for the output pointers
- TOutput * _outptr_array[strategy::out_rows() * strategy::out_cols()];
- TOutput **const outptr_array = _outptr_array;
-
- // Construct the output pointer array
- {
- const auto output_pad_right = strategy::out_rows() - item.output_width;
- auto outptr_element = outptr_array;
- auto outptr_row = outptr + item.output_i * ld_output_row + item.output_j * ld_output_col;
-
- // Fill the array with pointers to the output buffer
- for (unsigned int i = 0; i < strategy::out_rows() * strategy::out_cols(); i++)
- {
- outptr_array[i] = output_buffer;
- }
-
- // Fill in the valid portion of the array
- for (unsigned int i = 0; i < item.output_height; i++)
- {
- auto outptr_col = outptr_row;
- for (unsigned int j = 0; j < item.output_width; j++)
- {
- *(outptr_element++) = outptr_col;
- outptr_col += ld_output_col;
- }
- outptr_element += output_pad_right;
- outptr_row += ld_output_row;
- }
- }
-
- const int start_i = item.output_i * strategy::stride_rows() - padding.top;
- const int end_i = start_i + input_rows();
- const unsigned int pad_top = std::max(0, 0 - start_i);
- const unsigned int pad_bottom = std::max(0, end_i - static_cast<int>(input_height));
-
- const int start_j = item.output_j * strategy::stride_cols() - padding.left;
- const int end_j = start_j + input_cols();
- const unsigned int pad_left = std::max(0, 0 - start_j);
- const unsigned int pad_right = std::max(0, end_j - static_cast<int>(input_width));
-
- // Create an array for the input pointers
- const TInput * _inptr_array[input_rows() * input_cols()];
- const TInput **const inptr_array = _inptr_array;
- {
- const unsigned int row_padding = pad_top + pad_bottom;
- const unsigned int valid_rows = input_rows() - row_padding;
-
- const unsigned int col_padding = pad_left + pad_right;
- const unsigned int valid_cols = input_cols() - col_padding;
-
- // Fill the array with pointers to the input buffer
- for (unsigned int i = 0; i < input_rows() * input_cols(); i++)
- {
- inptr_array[i] = input_buffer;
- }
-
- // Compute valid initial pointer
- auto inptr_row = inptr + std::max(start_i, 0) * ld_input_row + std::max(start_j, 0) * ld_input_col;
-
- // Fill in the valid portion of the input array
- auto inptr_element = inptr_array + pad_top * input_cols() + pad_left;
- for (unsigned int i = 0; i < valid_rows; i++)
- {
- auto inptr_col = inptr_row;
- for (unsigned int j = 0; j < valid_cols; j++)
- {
- *(inptr_element++) = inptr_col;
- inptr_col += ld_input_col;
- }
-
- inptr_row += ld_input_row;
- inptr_element += col_padding; // Skip the padding elements
- }
- }
-
- // Call the kernel
-#ifdef CYCLE_PROFILING
- // TODO Work number
- auto p = prof.ScopedProfiler(PROFILE_KERNEL, (unsigned long)(item.output_height * item.output_width * strategy::pool_rows() * strategy::pool_cols()));
-#endif // CYCLE_PROFILING
- strat.kernel(channels, inptr_array, outptr_array,
- pad_left, pad_top, pad_right, pad_bottom);
- };
-
- // Add the initial work item to the stack of work.
- std::stack<WorkItem, std::vector<WorkItem>> stack;
- stack.push(WorkItem(0, 0, output_height, output_width));
- while (!stack.empty())
- {
- // Pop an item from the stack, bisect the largest dimension and either
- // execute the resulting tiles or add them to the stack if they are too
- // large.
- const WorkItem item(stack.top());
- stack.pop();
-
- if (item.output_height <= strategy::out_rows() &&
- item.output_width <= strategy::out_cols())
- {
- execute(item);
- }
- else
- {
- // Split the largest dimension, such that we get an exact number of
- // tiles in the first partition.
- if (item.output_height >= item.output_width)
- {
- const unsigned int height_in_tiles = (item.output_height + strategy::out_rows() - 1) / strategy::out_rows();
- const unsigned int tiles_first = height_in_tiles - height_in_tiles / 2;
-
- const unsigned int height_first = tiles_first * strategy::out_rows();
- const unsigned int height_second = item.output_height - height_first;
-
- stack.push(WorkItem(item.output_i + height_first, item.output_j, height_second, item.output_width));
- stack.push(WorkItem(item.output_i, item.output_j, height_first, item.output_width));
- }
- else
- {
- const unsigned int width_in_tiles = item.output_width / strategy::out_cols();
- const unsigned int tiles_first = width_in_tiles - width_in_tiles / 2;
-
- const unsigned int width_first = tiles_first * strategy::out_cols();
- const unsigned int width_second = item.output_width - width_first;
-
- stack.push(WorkItem(item.output_i, item.output_j + width_first, item.output_height, width_second));
- stack.push(WorkItem(item.output_i, item.output_j, item.output_height, width_first));
- }
- }
- }
- }
-};
-
-} // namespace pooling
-} // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/pooling/pooling_depthfirst_generic.hpp b/src/core/NEON/kernels/arm_conv/pooling/pooling_depthfirst_generic.hpp
index 5979862ed8..ded2c75127 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/pooling_depthfirst_generic.hpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/pooling_depthfirst_generic.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,236 +24,264 @@
#pragma once
-#include "pool_common.hpp"
+#include "depthfirst_driver.hpp"
#include "utils.hpp"
+#if !defined(_WIN64) && !defined(__OpenBSD__)
+#include <alloca.h>
+#endif /* !defined(_WIN64) && !defined(__OpenBSD__) */
namespace arm_conv {
namespace pooling {
-template <class strategy>
-class PoolingDepthfirstGeneric : public PoolingCommon<typename strategy::operand_type, typename strategy::return_type>
+template <typename TInput, typename TOutput, typename OutputStage = Nothing>
+class IGenericDepthfirstStrategy;
+
+template <typename TInput, typename TOutput>
+class IGenericDepthfirstStrategy<TInput, TOutput, Nothing>
{
- using TInput = typename strategy::operand_type;
- using TOutput = typename strategy::return_type;
+ public:
+ virtual ~IGenericDepthfirstStrategy() = default;
- const PoolingArgs m_args; // Copy of arguments
+ typedef void (*KernelType)(
+ uint64_t window_cells,
+ uint64_t n_valid_cells,
+ uint64_t n_channels,
+ const TInput *const *,
+ TOutput *
+ );
- unsigned int input_rows(void) const
- {
- return m_args.pool_window.rows;
- }
-
- unsigned int input_cols(void) const
- {
- return m_args.pool_window.cols;
- }
+ virtual KernelType get_kernel(void) const = 0;
+};
+template <typename TInput, typename TOutput>
+class IGenericDepthfirstStrategy<TInput, TOutput, Requantize32>
+{
public:
- PoolingDepthfirstGeneric(const PoolingArgs &args) : m_args(args)
- {
- }
+ virtual ~IGenericDepthfirstStrategy() = default;
+
+ typedef void (*KernelType)(
+ uint64_t window_cells,
+ uint64_t n_valid_cells,
+ uint64_t n_channels,
+ const TInput *const *,
+ TOutput *,
+ const Requantize32 &
+ );
+
+ virtual KernelType get_kernel(void) const = 0;
+};
- PoolingDepthfirstGeneric(PoolingDepthfirstGeneric &) = delete;
- PoolingDepthfirstGeneric &operator=(PoolingDepthfirstGeneric &) = delete;
+template <typename TInput, typename TOutput, typename OutputStage>
+struct Invoker;
- size_t sizeof_input_pointer_array(void) const
+template <typename TInput, typename TOutput>
+struct Invoker<TInput, TOutput, Nothing>
+{
+ static inline void invoke(
+ const typename IGenericDepthfirstStrategy<TInput, TOutput, Nothing>::KernelType kern,
+ uint64_t window_cells,
+ uint64_t n_valid_cells,
+ uint64_t n_channels,
+ const TInput *const *inptrs,
+ TOutput *outptr,
+ const Nothing &
+ )
{
- return sizeof(TInput *) * input_rows() * input_cols();
+ kern(window_cells, n_valid_cells, n_channels, inptrs, outptr);
}
+};
- size_t get_working_size(unsigned int num_threads) const override
+template <typename TInput, typename TOutput>
+struct Invoker<TInput, TOutput, Requantize32>
+{
+ static inline void invoke(
+ const typename IGenericDepthfirstStrategy<TInput, TOutput, Requantize32>::KernelType kern,
+ uint64_t window_cells,
+ uint64_t n_valid_cells,
+ uint64_t n_channels,
+ const TInput *const *inptrs,
+ TOutput *outptr,
+ const Requantize32 &qp
+ )
{
- return num_threads * sizeof_input_pointer_array();
+ kern(window_cells, n_valid_cells, n_channels, inptrs, outptr, qp);
}
+};
- void execute(
- const void *const input,
- void *const output,
- void *const working_space,
- unsigned int thread_id,
- unsigned int num_threads
- ) const override
+template <typename TInput, typename TOutput, typename OutputStage>
+class GenericDepthfirstWrapper : public IDepthfirstStrategy
+{
+ using StratType = IGenericDepthfirstStrategy<TInput, TOutput, OutputStage>;
+
+ std::unique_ptr<const StratType> m_strat;
+ const unsigned int window_rows, window_cols;
+
+ public:
+ GenericDepthfirstWrapper(const StratType *strat, const PoolingArgs &args)
+ : m_strat(strat), window_rows(args.pool_window.rows), window_cols(args.pool_window.cols)
{
- const size_t ld_input_col = m_args.n_channels;
- const size_t ld_input_row = ld_input_col * m_args.input_cols;
- const size_t ld_input_batch = ld_input_row * m_args.input_rows;
- const size_t ld_output_col = ld_input_col;
- const size_t ld_output_row = ld_output_col * m_args.output_cols;
- const size_t ld_output_batch = ld_output_row * m_args.output_rows;
-
- execute(
- input, ld_input_col, ld_input_row, ld_input_batch,
- output, ld_output_col, ld_output_row, ld_output_batch,
- working_space,
- thread_id, num_threads
- );
}
- void execute(
- const void *const input,
- size_t ld_input_col,
- size_t ld_input_row,
- size_t ld_input_batch,
- void *const output,
- size_t ld_output_col,
- size_t ld_output_row,
- size_t ld_output_batch,
- void *const working_space,
- unsigned int thread_id,
- unsigned int num_threads
+ unsigned int get_input_rows(void) const override { return window_rows; }
+ unsigned int get_input_cols(void) const override { return window_cols; }
+ unsigned int get_output_rows(void) const override { return 1; }
+ unsigned int get_output_cols(void) const override { return 1; }
+
+ typename StratType::KernelType get_kernel(void) const { return m_strat->get_kernel(); }
+};
+
+template <typename TInput, typename TOutput=TInput, typename OutputStage=Nothing>
+class PoolingDepthfirstGeneric : public DepthfirstDriver<TInput, TOutput>
+{
+ const OutputStage m_os;
+
+ protected:
+ size_t get_working_size_per_thread() const override { return 0; }
+ void initialise_working_space(void *) const override { /* Nothing */ }
+
+ /* Compute a portion of the output tensor with padding. */
+ void compute_tile_padded(
+ unsigned int output_i, unsigned int output_j,
+ unsigned int channel_start, unsigned int channel_end,
+ const TensorSpec<const TInput *> &input,
+ const TensorSpec<TOutput *> &output,
+ void *
) const override
{
- execute(
- m_args.n_batches, m_args.input_rows, m_args.input_cols,
- m_args.n_channels,
- input, ld_input_col, ld_input_row, ld_input_batch,
- m_args.padding,
- m_args.output_rows, m_args.output_cols,
- output, ld_output_col, ld_output_row, ld_output_batch,
- working_space,
- thread_id, num_threads
+ // Determine start position and padding
+ const int start_i = static_cast<int>(output_i * this->m_args.pool_stride.rows) - this->m_args.padding.top;
+ const auto input_i = static_cast<unsigned int>(start_i < 0 ? 0 : start_i);
+ const auto pad_top = static_cast<unsigned int>(start_i < 0 ? -start_i : 0);
+ const int end_i = start_i + this->m_args.pool_window.rows;
+ const auto pad_bottom = static_cast<unsigned int>((unsigned int) end_i < this->m_args.input_rows ? 0 : end_i - this->m_args.input_rows);
+ const auto valid_rows = this->m_args.pool_window.rows - (pad_top + pad_bottom);
+
+ const int start_j = static_cast<int>(output_j * this->m_args.pool_stride.cols) - this->m_args.padding.left;
+ const auto input_j = static_cast<unsigned int>(start_j < 0 ? 0 : start_j);
+ const auto pad_left = static_cast<unsigned int>(start_j < 0 ? -start_j : 0);
+ const int end_j = start_j + this->m_args.pool_window.cols;
+ const auto pad_right = static_cast<unsigned int>((unsigned int) end_j < this->m_args.input_cols ? 0 : end_j - this->m_args.input_cols);
+ const auto valid_cols = this->m_args.pool_window.cols - (pad_left + pad_right);
+
+ // Determine the number of valid cells and prepare the pointers
+ const auto n_valid_cells = valid_rows * valid_cols;
+ auto inptrs = reinterpret_cast<const TInput **>(alloca(n_valid_cells * sizeof(TInput *)));
+ {
+ auto my_ptr = inptrs;
+ auto row_ptr = input.base + input_i*input.ld_row + input_j*input.ld_col + channel_start;
+ for (auto i = valid_rows; i; i--)
+ {
+ auto ptr = row_ptr;
+ row_ptr += input.ld_row;
+
+ for (auto j = valid_cols; j; j--)
+ {
+ *(my_ptr++) = ptr;
+ ptr += input.ld_col;
+ }
+ }
+ }
+
+ auto outptr = output.base + output_i*output.ld_row + output_j*output.ld_col + channel_start;
+
+ // Some padding variants include (or exclude) the padding values; we handle
+ // this by computing the extent of the padded input tensor and hence
+ // computing the total number of cells captured in the pooling window.
+ const auto bottom_padded_height = this->m_args.input_rows + this->m_args.padding.bottom;
+ const auto captured_rows = std::min<int>(end_i, bottom_padded_height) - start_i;
+ const auto right_padded_width = this->m_args.input_cols + this->m_args.padding.right;
+ const auto captured_cols = std::min<int>(end_j, right_padded_width) - start_j;
+ const auto captured_cells = captured_rows * captured_cols;
+ const auto window_cells = this->m_args.exclude_padding ? n_valid_cells : captured_cells;
+
+ // Execute the kernel
+ Invoker<TInput, TOutput, OutputStage>::invoke(
+ reinterpret_cast<const GenericDepthfirstWrapper<TInput, TOutput, OutputStage> *>(this->m_strat.get())->get_kernel(),
+ window_cells, n_valid_cells, channel_end - channel_start, inptrs, outptr, m_os
);
}
- void execute(
- unsigned int batches,
- unsigned int height,
- unsigned int width,
- unsigned int channels,
- const void *const _input,
- size_t ld_input_col,
- size_t ld_input_row,
- size_t ld_input_batch,
- const PaddingValues &padding,
- unsigned int output_height,
- unsigned int output_width,
- void *const _output,
- size_t ld_output_col,
- size_t ld_output_row,
- size_t ld_output_batch,
- void *const _working_space,
- unsigned int thread_id,
- unsigned int num_threads
+ // Compute a portion of the work with only top/bottom padding.
+ void compute_row_padded_tile_row(
+ const unsigned int output_i, unsigned int output_j, unsigned int n_tile_cols,
+ const unsigned int channel_start, const unsigned int channel_end,
+ const TensorSpec<const TInput *> &input,
+ const TensorSpec<TOutput *> &output,
+ void *
) const override
{
- strategy strat(m_args.cpu_info);
-#ifdef CYCLE_PROFILING
- arm_gemm::profiler prof;
-#endif // CYCLE_PROFILING
-
- const unsigned int roundup_output_rows = roundup(output_height, num_threads);
- const unsigned int rows_per_thread = roundup_output_rows / num_threads;
- int start_out_height = static_cast<int>(thread_id * rows_per_thread);
- int end_out_height = std::min<int>(output_height, static_cast<int>((thread_id + 1) * rows_per_thread));
-
- unsigned int start_channel = 0;
- unsigned int end_channel = channels;
- if(output_height == 1)
+ // Determine start position and padding
+ const int start_i = static_cast<int>(output_i * this->m_args.pool_stride.rows) - this->m_args.padding.top;
+ const auto input_i = static_cast<unsigned int>(start_i < 0 ? 0 : start_i);
+ const auto pad_top = static_cast<unsigned int>(start_i < 0 ? -start_i : 0);
+ const int end_i = start_i + this->m_args.pool_window.rows;
+ const auto pad_bottom = static_cast<unsigned int>((unsigned int) end_i < this->m_args.input_rows ? 0 : end_i - this->m_args.input_rows);
+ const auto valid_rows = this->m_args.pool_window.rows - (pad_top + pad_bottom);
+
+ const int start_j = static_cast<int>(output_j * this->m_args.pool_stride.cols) - this->m_args.padding.left;
+ const auto input_j = static_cast<unsigned int>(start_j < 0 ? 0 : start_j);
+ const auto valid_cols = this->m_args.pool_window.cols;
+
+ // Determine the number of valid cells and prepare the pointers
+ const auto n_valid_cells = valid_rows * valid_cols;
+ auto inptrs = reinterpret_cast<const TInput **>(alloca(n_valid_cells * sizeof(TInput *)));
{
- const unsigned int channels_per_thread = roundup(channels, num_threads) / num_threads;
- start_channel = thread_id * channels_per_thread;
- end_channel = std::min(start_channel + channels_per_thread, channels);
-
- // Reset start and end rows
- start_out_height = 0;
- end_out_height = output_height;
- }
+ auto my_ptr = inptrs;
+ auto row_ptr = input.base + input_i*input.ld_row + input_j*input.ld_col + channel_start;
+ for (auto i = valid_rows; i; i--)
+ {
+ auto ptr = row_ptr;
+ row_ptr += input.ld_row;
- if(start_channel >= end_channel)
- {
- // Early exit in case of multiple threads parallelising on channels
- return;
+ for (auto j = valid_cols; j; j--)
+ {
+ *(my_ptr++) = ptr;
+ ptr += input.ld_col;
+ }
+ }
}
- // Cast input and output pointers into the right types
- const TInput *const inptr = static_cast<const TInput *>(_input) + start_channel;
- TOutput *const outptr = static_cast<TOutput *>(_output) + start_channel;
+ auto outptr = output.base + output_i*output.ld_row + output_j*output.ld_col + channel_start;
- // Grab the input pointer array
- uint8_t *const working_space = static_cast<uint8_t *>(_working_space);
- const TInput **const inptr_array = reinterpret_cast<const TInput **>(working_space + thread_id * sizeof_input_pointer_array());
+ // Some padding variants include (or exclude) the padding values; we handle
+ // this by computing the extent of the padded input tensor and hence
+ // computing the total number of cells captured in the pooling window.
+ const auto bottom_padded_height = this->m_args.input_rows + this->m_args.padding.bottom;
+ const auto captured_rows = std::min<int>(end_i, bottom_padded_height) - start_i;
+ const auto captured_cells = captured_rows * valid_cols;
+ const auto window_cells = this->m_args.exclude_padding ? n_valid_cells : captured_cells;
- // For each output tile, construct the requisite set of pointers and call
- // into the kernel.
- for (unsigned int batch = 0; batch < batches; batch++)
+ for (; n_tile_cols; n_tile_cols--)
{
- // Get batch pointers
- const auto inptr_batch = inptr + batch * ld_input_batch;
- auto outptr_row = outptr + batch * ld_output_batch + start_out_height * ld_output_row;
-
- for (int out_i = start_out_height; out_i < end_out_height; out_i++)
+ // Execute the kernel
+ Invoker<TInput, TOutput, OutputStage>::invoke(
+ reinterpret_cast<const GenericDepthfirstWrapper<TInput, TOutput, OutputStage> *>(this->m_strat.get())->get_kernel(),
+ window_cells, n_valid_cells, channel_end - channel_start, inptrs, outptr, m_os
+ );
+
+ // Update the pointers; the output strides by a column and the inputs
+ // stride by a number of columns.
+ outptr += output.ld_col;
+ for (auto n = 0u; n < n_valid_cells; n++)
{
- const int start_in_i = out_i * m_args.pool_stride.rows - padding.top;
- const int end_in_i = start_in_i + m_args.pool_window.rows;
-
- // Compute top/bottom padding
- const auto pad_top = static_cast<unsigned int>(std::max(0 - start_in_i, 0));
- const auto pad_bottom = static_cast<unsigned int>(std::max<int>(end_in_i - height, 0));
- const auto valid_rows = input_rows() - pad_top - pad_bottom;
-
- // Compute the number of pooling window rows which are contained in
- // either the valid region of the input tensor, or the padding.
- const auto padded_bottom = std::min<unsigned int>(
- start_in_i + m_args.pool_window.rows, height + padding.bottom
- );
- const auto n_total_rows = padded_bottom - start_in_i;
-
- auto outptr_col = outptr_row;
- auto inptr_row = inptr_batch + (start_in_i + pad_top) * ld_input_row;
-
- for (int out_j = 0, start_in_j = -padding.left;
- out_j < static_cast<int>(output_width);
- out_j++, start_in_j += m_args.pool_stride.cols)
- {
- const int end_in_j = start_in_j + m_args.pool_window.cols;
-
- // Compute left/right padding
- const auto pad_left = static_cast<unsigned int>(std::max(0 - start_in_j, 0));
- const auto pad_right = static_cast<unsigned int>(std::max<int>(0, end_in_j - width));
- const auto valid_cols = input_cols() - pad_left - pad_right;
-
- // Compute the number of pooling window columns which are contained
- // in either the valid region of the input tensor, or the padding.
- const auto padded_right = std::min<unsigned int>(
- start_in_j + m_args.pool_window.cols, width + padding.right
- );
- const auto n_total_cols = padded_right - start_in_j;
-
- // Construct the input pointer array - fill in all valid points
- // contiguously.
- const TInput **ptrs = inptr_array;
- const TInput *rowptr = inptr_row + (start_in_j + pad_left) * ld_input_col;
- for (auto i = 0u; i < valid_rows; i++)
- {
- const TInput *colptr = rowptr;
- for (auto j = 0u; j < valid_cols; j++)
- {
- *(ptrs++) = colptr;
- colptr += ld_input_col;
- }
- rowptr += ld_input_row;
- }
-
- // Compute the number of valid cells
- const auto valid_cells = valid_rows * valid_cols;
- const auto cells_in_range = n_total_rows * n_total_cols;
- const auto window_cells = m_args.exclude_padding ? valid_cells : cells_in_range;
-
- // Get the output pointer for this call
- TOutput *outptr = outptr_col;
- outptr_col += ld_output_col;
-
-#ifdef CYCLE_PROFILING
- // TODO Work number
- auto p = prof.ScopedProfiler(PROFILE_KERNEL, (unsigned long)(strategy::out_rows() * strategy::out_cols() * strategy::pool_rows() * strategy::pool_cols()));
-#endif // CYCLE_PROFILING
- strat.kernel(window_cells, valid_cells, end_channel - start_channel, inptr_array, outptr);
- }
-
- outptr_row += ld_output_row;
+ inptrs[n] += this->m_args.pool_stride.cols * input.ld_col;
}
}
}
+
+ public:
+ PoolingDepthfirstGeneric(
+ const IGenericDepthfirstStrategy<TInput, TOutput, OutputStage> *strat,
+ const PoolingArgs &args,
+ const OutputStage &os = {}
+ )
+ : DepthfirstDriver<TInput, TOutput>(
+ new GenericDepthfirstWrapper<TInput, TOutput, OutputStage>(strat, args),
+ args
+ ),
+ m_os(os)
+ {
+ }
};
} // namespace pooling
diff --git a/src/core/NEON/kernels/arm_conv/pooling/pooling_depthfirst_generic_quantized.hpp b/src/core/NEON/kernels/arm_conv/pooling/pooling_depthfirst_generic_quantized.hpp
deleted file mode 100644
index f3cb9a1d1f..0000000000
--- a/src/core/NEON/kernels/arm_conv/pooling/pooling_depthfirst_generic_quantized.hpp
+++ /dev/null
@@ -1,256 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#pragma once
-
-#include "pool_common.hpp"
-#include "utils.hpp"
-
-namespace arm_conv {
-namespace pooling {
-
-template <class strategy>
-class PoolingDepthfirstGenericQuantized : public PoolingCommon<typename strategy::operand_type, typename strategy::return_type, Requantize32>
-{
- using TInput = typename strategy::operand_type;
- using TOutput = typename strategy::return_type;
-
- const PoolingArgs m_args; // Copy of arguments
- const Requantize32 m_requant; // Quantization parameters
-
- unsigned int input_rows(void) const
- {
- return m_args.pool_window.rows;
- }
-
- unsigned int input_cols(void) const
- {
- return m_args.pool_window.cols;
- }
-
- public:
- PoolingDepthfirstGenericQuantized(const PoolingArgs &args, const Requantize32 &rq) : m_args(args), m_requant(rq)
- {
- }
-
- PoolingDepthfirstGenericQuantized(PoolingDepthfirstGenericQuantized &) = delete;
- PoolingDepthfirstGenericQuantized &operator=(PoolingDepthfirstGenericQuantized &) = delete;
-
- size_t sizeof_input_pointer_array(void) const
- {
- return sizeof(TInput *) * input_rows() * input_cols();
- }
-
- size_t get_working_size(unsigned int num_threads) const override
- {
- return num_threads * sizeof_input_pointer_array();
- }
-
- void execute(
- const void *const input,
- void *const output,
- void *const working_space,
- unsigned int thread_id,
- unsigned int num_threads
- ) const override
- {
- const size_t ld_input_col = m_args.n_channels;
- const size_t ld_input_row = ld_input_col * m_args.input_cols;
- const size_t ld_input_batch = ld_input_row * m_args.input_rows;
- const size_t ld_output_col = ld_input_col;
- const size_t ld_output_row = ld_output_col * m_args.output_cols;
- const size_t ld_output_batch = ld_output_row * m_args.output_rows;
-
- execute(
- input, ld_input_col, ld_input_row, ld_input_batch,
- output, ld_output_col, ld_output_row, ld_output_batch,
- working_space,
- thread_id, num_threads
- );
- }
-
- void execute(
- const void *const input,
- size_t ld_input_col,
- size_t ld_input_row,
- size_t ld_input_batch,
- void *const output,
- size_t ld_output_col,
- size_t ld_output_row,
- size_t ld_output_batch,
- void *const working_space,
- unsigned int thread_id,
- unsigned int num_threads
- ) const override
- {
- execute(
- m_args.n_batches, m_args.input_rows, m_args.input_cols,
- m_args.n_channels,
- input, ld_input_col, ld_input_row, ld_input_batch,
- m_args.padding,
- m_args.output_rows, m_args.output_cols,
- output, ld_output_col, ld_output_row, ld_output_batch,
- working_space,
- thread_id, num_threads
- );
- }
-
- void execute(
- unsigned int batches,
- unsigned int height,
- unsigned int width,
- unsigned int channels,
- const void *const _input,
- size_t ld_input_col,
- size_t ld_input_row,
- size_t ld_input_batch,
- const PaddingValues &padding,
- unsigned int output_height,
- unsigned int output_width,
- void *const _output,
- size_t ld_output_col,
- size_t ld_output_row,
- size_t ld_output_batch,
- void *const _working_space,
- unsigned int thread_id,
- unsigned int num_threads
- ) const override
- {
- strategy strat(m_args.cpu_info);
-#ifdef CYCLE_PROFILING
- arm_gemm::profiler prof;
-#endif // CYCLE_PROFILING
-
- const unsigned int roundup_output_rows = roundup(output_height, num_threads);
- const unsigned int rows_per_thread = roundup_output_rows / num_threads;
- int start_out_height = static_cast<int>(thread_id * rows_per_thread);
- int end_out_height = std::min<int>(output_height, static_cast<int>((thread_id + 1) * rows_per_thread));
-
- unsigned int start_channel = 0;
- unsigned int end_channel = channels;
- if(output_height == 1)
- {
- const unsigned int channels_per_thread = roundup(channels, num_threads) / num_threads;
- start_channel = thread_id * channels_per_thread;
- end_channel = std::min(start_channel + channels_per_thread, channels);
-
- // Reset start and end rows
- start_out_height = 0;
- end_out_height = output_height;
- }
-
- if(start_channel >= end_channel)
- {
- // Early exit in case of multiple threads parallelising on channels
- return;
- }
-
- // Cast input and output pointers into the right types
- const TInput *const inptr = static_cast<const TInput *>(_input) + start_channel;
- TOutput *const outptr = static_cast<TOutput *>(_output) + start_channel;
-
- // Grab the input pointer array
- uint8_t *const working_space = static_cast<uint8_t *>(_working_space);
- const TInput **const inptr_array = reinterpret_cast<const TInput **>(working_space + thread_id * sizeof_input_pointer_array());
-
- // For each output tile, construct the requisite set of pointers and call
- // into the kernel.
- for (unsigned int batch = 0; batch < batches; batch++)
- {
- // Get batch pointers
- const auto inptr_batch = inptr + batch * ld_input_batch;
- const auto outptr_batch = outptr + batch * ld_output_batch;
-
- for (int out_i = start_out_height; out_i < end_out_height; out_i++)
- {
- const int start_in_i = out_i * m_args.pool_stride.rows - padding.top;
- const int end_in_i = start_in_i + m_args.pool_window.rows;
-
- // Compute top/bottom padding
- const auto pad_top = static_cast<unsigned int>(-std::min(start_in_i, 0));
- const auto pad_bottom = static_cast<unsigned int>(-std::min(static_cast<int>(height) - end_in_i, 0));
-
- // Compute the number of pooling window rows which are contained in
- // either the valid region of the input tensor, or the padding.
- const auto padded_bottom = std::min<unsigned int>(
- start_in_i + m_args.pool_window.rows, height + padding.bottom
- );
- const auto n_total_rows = padded_bottom - start_in_i;
-
- for (int out_j = 0, start_in_j = -padding.left;
- out_j < static_cast<int>(output_width);
- out_j++, start_in_j += m_args.pool_stride.cols)
- {
- const int end_in_j = start_in_j + m_args.pool_window.cols;
-
- // Compute left/right padding
- const auto pad_left = static_cast<unsigned int>(-std::min(start_in_j, 0));
- const auto pad_right = static_cast<unsigned int>(-std::min(static_cast<int>(width) - end_in_j, 0));
-
- // Compute the number of pooling window columns which are contained
- // in either the valid region of the input tensor, or the padding.
- const auto padded_right = std::min<unsigned int>(
- start_in_j + m_args.pool_window.cols, width + padding.right
- );
- const auto n_total_cols = padded_right - start_in_j;
-
- // Construct the input pointer array - fill in all valid points
- // contiguously.
- const TInput **ptrs = inptr_array;
- for (auto i = pad_top; i < input_rows() - pad_bottom; i++)
- {
- // Can skip over the left padding because we will have either the
- // same or less than the previous tile.
- unsigned int j = pad_left;
- const TInput *colptr = inptr_batch + (start_in_i + i) * ld_input_row + (start_in_j + j) * ld_input_col;
- for (; j < input_cols() - pad_right; j++)
- {
- *(ptrs++) = colptr;
- colptr += ld_input_col;
- }
- }
-
- // Compute the number of valid cells
- const auto valid_rows = input_rows() - pad_top - pad_bottom;
- const auto valid_cols = input_cols() - pad_left - pad_right;
- const auto valid_cells = valid_rows * valid_cols;
- const auto cells_in_range = n_total_rows * n_total_cols;
- const auto window_cells = m_args.exclude_padding ? valid_cells : cells_in_range;
-
- // Get the output pointer for this call
- TOutput *outptr = outptr_batch + out_i * ld_output_row + out_j * ld_output_col;
-
-#ifdef CYCLE_PROFILING
- // TODO Work number
- auto p = prof.ScopedProfiler(PROFILE_KERNEL, (unsigned long) 0);
-#endif
- strat.kernel(window_cells, valid_cells, end_channel - start_channel, inptr_array, outptr, m_requant);
- }
- }
- }
- }
-};
-
-} // namespace pooling
-} // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/pooling/pooling_fp16.cpp b/src/core/NEON/kernels/arm_conv/pooling/pooling_fp16.cpp
index 094c6aa301..a7f3dd3a93 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/pooling_fp16.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/pooling_fp16.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -33,12 +33,18 @@
#include "kernels/cpp_nhwc_1x1_stride_any_depthfirst.hpp"
#if defined(__aarch64__)
-#if defined(__ARM_FEATURE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SME)
+#include "kernels/sme_fp16_nhwc_max_2x2_s1_output2x2_depthfirst.hpp"
+#include "kernels/sme_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst.hpp"
+#include "kernels/sme_fp16_nhwc_avg_generic_depthfirst.hpp"
+#include "kernels/sme_fp16_nhwc_max_generic_depthfirst.hpp"
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
#include "kernels/sve_fp16_nhwc_max_2x2_s1_output2x2_depthfirst.hpp"
#include "kernels/sve_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst.hpp"
#include "kernels/sve_fp16_nhwc_avg_generic_depthfirst.hpp"
#include "kernels/sve_fp16_nhwc_max_generic_depthfirst.hpp"
-#endif // defined(__ARM_FEATURE_SVE)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
#include "kernels/a64_fp16_nhwc_max_2x2_s1_output2x2_depthfirst.hpp"
#include "kernels/a64_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst.hpp"
#include "kernels/a64_fp16_nhwc_avg_generic_depthfirst.hpp"
@@ -48,19 +54,6 @@
namespace arm_conv {
namespace pooling {
-namespace
-{
- template <class Strategy>
- bool is_supported(const PoolingArgs &args, const Nothing &)
- {
- return ((args.pool_type == Strategy::pooling_type()) &&
- (args.pool_window.rows == Strategy::pool_rows()) &&
- (args.pool_window.cols == Strategy::pool_cols()) &&
- (args.pool_stride.rows == Strategy::stride_rows()) &&
- (args.pool_stride.cols == Strategy::stride_cols()));
- }
-}
-
static const PoolingImplementation<__fp16, __fp16> pooling_fp16_methods[] = {
{
PoolingMethod::DEPTHFIRST,
@@ -70,48 +63,115 @@ static const PoolingImplementation<__fp16, __fp16> pooling_fp16_methods[] = {
},
nullptr,
[] (const PoolingArgs &args, const Nothing &) -> PoolingCommon<__fp16, __fp16> * {
- return new PoolingDepthfirstGeneric<cpp_nhwc_1x1_stride_any_depthfirst<__fp16>>(args);
+ auto strat = new cpp_nhwc_1x1_stride_any_depthfirst<__fp16>(args.cpu_info);
+ return new PoolingDepthfirstGeneric<__fp16>(strat, args);
},
},
#if defined(__aarch64__)
-#if defined(__ARM_FEATURE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SME)
+ {
+ PoolingMethod::DEPTHFIRST,
+ "sme_fp16_nhwc_max_2x2_s1_output2x2_depthfirst",
+ [] (const PoolingArgs &args, const Nothing &os) -> bool {
+ return args.cpu_info->has_sme() &&
+ is_supported<sme_fp16_nhwc_max_2x2_s1_output2x2_depthfirst>(args, os);
+ },
+ nullptr,
+ [] (const PoolingArgs &args, const Nothing &) -> PoolingCommon<__fp16, __fp16> * {
+ auto strat = new sme_fp16_nhwc_max_2x2_s1_output2x2_depthfirst(args.cpu_info);
+ return new PoolingDepthfirst<__fp16>(strat, args);
+ },
+ },
+ {
+ PoolingMethod::DEPTHFIRST,
+ "sme_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst",
+ [] (const PoolingArgs &args, const Nothing &os) -> bool {
+ return args.cpu_info->has_sme() &&
+ is_supported<sme_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst>(args, os);
+ },
+ nullptr,
+ [] (const PoolingArgs &args, const Nothing &) -> PoolingCommon<__fp16, __fp16> * {
+ auto strat = new sme_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst(args.cpu_info);
+ return new PoolingDepthfirst<__fp16>(strat, args);
+ },
+ },
+ {
+ PoolingMethod::DEPTHFIRST,
+ "sme_fp16_nhwc_avg_generic_depthfirst",
+ [] (const PoolingArgs &args, const Nothing &) -> bool {
+ return args.cpu_info->has_sme() && args.pool_type == PoolingType::AVERAGE;
+ },
+ nullptr,
+ [] (const PoolingArgs &args, const Nothing &) -> PoolingCommon<__fp16, __fp16> * {
+ auto strat = new sme_fp16_nhwc_avg_generic_depthfirst(args.cpu_info);
+ return new PoolingDepthfirstGeneric<__fp16>(strat, args);
+ },
+ },
+ {
+ PoolingMethod::DEPTHFIRST,
+ "sme_fp16_nhwc_max_generic_depthfirst",
+ [] (const PoolingArgs &args, const Nothing &) -> bool {
+ return args.cpu_info->has_sme() && args.pool_type == PoolingType::MAX;
+ },
+ nullptr,
+ [] (const PoolingArgs &args, const Nothing &) -> PoolingCommon<__fp16, __fp16> * {
+ auto strat = new sme_fp16_nhwc_max_generic_depthfirst(args.cpu_info);
+ return new PoolingDepthfirstGeneric<__fp16>(strat, args);
+ },
+ },
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
{
PoolingMethod::DEPTHFIRST,
"sve_fp16_nhwc_max_2x2_s1_output2x2_depthfirst",
- is_supported<sve_fp16_nhwc_max_2x2_s1_output2x2_depthfirst>,
+ [] (const PoolingArgs &args, const Nothing &os) -> bool {
+ return args.cpu_info->has_sve() &&
+ is_supported<sve_fp16_nhwc_max_2x2_s1_output2x2_depthfirst>(args, os);
+ },
nullptr,
[] (const PoolingArgs &args, const Nothing &) -> PoolingCommon<__fp16, __fp16> * {
- return new PoolingDepthfirst<sve_fp16_nhwc_max_2x2_s1_output2x2_depthfirst>(args);
+ auto strat = new sve_fp16_nhwc_max_2x2_s1_output2x2_depthfirst(args.cpu_info);
+ return new PoolingDepthfirst<__fp16>(strat, args);
},
},
{
PoolingMethod::DEPTHFIRST,
"sve_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst",
- is_supported<sve_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst>,
+ [] (const PoolingArgs &args, const Nothing &os) -> bool {
+ return args.cpu_info->has_sve() &&
+ is_supported<sve_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst>(args, os);
+ },
nullptr,
[] (const PoolingArgs &args, const Nothing &) -> PoolingCommon<__fp16, __fp16> * {
- return new PoolingDepthfirst<sve_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst>(args);
+ auto strat = new sve_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst(args.cpu_info);
+ return new PoolingDepthfirst<__fp16>(strat, args);
},
},
{
PoolingMethod::DEPTHFIRST,
"sve_fp16_nhwc_avg_generic_depthfirst",
- [] (const PoolingArgs &args, const Nothing &) -> bool { return args.pool_type == PoolingType::AVERAGE; },
+ [] (const PoolingArgs &args, const Nothing &) -> bool {
+ return args.cpu_info->has_sve() && args.pool_type == PoolingType::AVERAGE;
+ },
nullptr,
[] (const PoolingArgs &args, const Nothing &) -> PoolingCommon<__fp16, __fp16> * {
- return new PoolingDepthfirstGeneric<sve_fp16_nhwc_avg_generic_depthfirst>(args);
+ auto strat = new sve_fp16_nhwc_avg_generic_depthfirst(args.cpu_info);
+ return new PoolingDepthfirstGeneric<__fp16>(strat, args);
},
},
{
PoolingMethod::DEPTHFIRST,
"sve_fp16_nhwc_max_generic_depthfirst",
- [] (const PoolingArgs &args, const Nothing &) -> bool { return args.pool_type == PoolingType::MAX; },
+ [] (const PoolingArgs &args, const Nothing &) -> bool {
+ return args.cpu_info->has_sve() && args.pool_type == PoolingType::MAX;
+ },
nullptr,
[] (const PoolingArgs &args, const Nothing &) -> PoolingCommon<__fp16, __fp16> * {
- return new PoolingDepthfirstGeneric<sve_fp16_nhwc_max_generic_depthfirst>(args);
+ auto strat = new sve_fp16_nhwc_max_generic_depthfirst(args.cpu_info);
+ return new PoolingDepthfirstGeneric<__fp16>(strat, args);
},
},
-#endif // defined(__ARM_FEATURE_SVE)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
{
PoolingMethod::DEPTHFIRST,
@@ -119,7 +179,8 @@ static const PoolingImplementation<__fp16, __fp16> pooling_fp16_methods[] = {
is_supported<a64_fp16_nhwc_max_2x2_s1_output2x2_depthfirst>,
nullptr,
[] (const PoolingArgs &args, const Nothing &) -> PoolingCommon<__fp16, __fp16> * {
- return new PoolingDepthfirst<a64_fp16_nhwc_max_2x2_s1_output2x2_depthfirst>(args);
+ auto strat = new a64_fp16_nhwc_max_2x2_s1_output2x2_depthfirst(args.cpu_info);
+ return new PoolingDepthfirst<__fp16>(strat, args);
},
},
{
@@ -128,7 +189,8 @@ static const PoolingImplementation<__fp16, __fp16> pooling_fp16_methods[] = {
is_supported<a64_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst>,
nullptr,
[] (const PoolingArgs &args, const Nothing &) -> PoolingCommon<__fp16, __fp16> * {
- return new PoolingDepthfirst<a64_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst>(args);
+ auto strat = new a64_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst(args.cpu_info);
+ return new PoolingDepthfirst<__fp16>(strat, args);
},
},
{
@@ -137,7 +199,8 @@ static const PoolingImplementation<__fp16, __fp16> pooling_fp16_methods[] = {
[] (const PoolingArgs &args, const Nothing &) -> bool { return args.pool_type == PoolingType::AVERAGE; },
nullptr,
[] (const PoolingArgs &args, const Nothing &) -> PoolingCommon<__fp16, __fp16> * {
- return new PoolingDepthfirstGeneric<a64_fp16_nhwc_avg_generic_depthfirst>(args);
+ auto strat = new a64_fp16_nhwc_avg_generic_depthfirst(args.cpu_info);
+ return new PoolingDepthfirstGeneric<__fp16>(strat, args);
},
},
{
@@ -146,7 +209,8 @@ static const PoolingImplementation<__fp16, __fp16> pooling_fp16_methods[] = {
[] (const PoolingArgs &args, const Nothing &) -> bool { return args.pool_type == PoolingType::MAX; },
nullptr,
[] (const PoolingArgs &args, const Nothing &) -> PoolingCommon<__fp16, __fp16> * {
- return new PoolingDepthfirstGeneric<a64_fp16_nhwc_max_generic_depthfirst>(args);
+ auto strat = new a64_fp16_nhwc_max_generic_depthfirst(args.cpu_info);
+ return new PoolingDepthfirstGeneric<__fp16>(strat, args);
},
},
#endif // defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/pooling_fp32.cpp b/src/core/NEON/kernels/arm_conv/pooling/pooling_fp32.cpp
index 002115d78c..99d106583e 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/pooling_fp32.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/pooling_fp32.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -30,12 +30,18 @@
#include "kernels/cpp_nhwc_1x1_stride_any_depthfirst.hpp"
#if defined(__aarch64__)
-#if defined(__ARM_FEATURE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SME)
+#include "kernels/sme_fp32_nhwc_max_2x2_s1_output2x2_depthfirst.hpp"
+#include "kernels/sme_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst.hpp"
+#include "kernels/sme_fp32_nhwc_avg_generic_depthfirst.hpp"
+#include "kernels/sme_fp32_nhwc_max_generic_depthfirst.hpp"
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
#include "kernels/sve_fp32_nhwc_max_2x2_s1_output2x2_depthfirst.hpp"
#include "kernels/sve_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst.hpp"
#include "kernels/sve_fp32_nhwc_avg_generic_depthfirst.hpp"
#include "kernels/sve_fp32_nhwc_max_generic_depthfirst.hpp"
-#endif // defined(__ARM_FEATURE_SVE)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
#include "kernels/a64_fp32_nhwc_max_2x2_s1_output2x2_depthfirst.hpp"
#include "kernels/a64_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst.hpp"
#include "kernels/a64_fp32_nhwc_avg_generic_depthfirst.hpp"
@@ -45,19 +51,6 @@
namespace arm_conv {
namespace pooling {
-namespace
-{
- template <class Strategy>
- bool is_supported(const PoolingArgs &args, const Nothing &)
- {
- return ((args.pool_type == Strategy::pooling_type()) &&
- (args.pool_window.rows == Strategy::pool_rows()) &&
- (args.pool_window.cols == Strategy::pool_cols()) &&
- (args.pool_stride.rows == Strategy::stride_rows()) &&
- (args.pool_stride.cols == Strategy::stride_cols()));
- }
-}
-
static const PoolingImplementation<float, float> pooling_fp32_methods[] = {
{
PoolingMethod::DEPTHFIRST,
@@ -67,55 +60,123 @@ static const PoolingImplementation<float, float> pooling_fp32_methods[] = {
},
nullptr,
[] (const PoolingArgs &args, const Nothing &) -> PoolingCommon<float, float> * {
- return new PoolingDepthfirstGeneric<cpp_nhwc_1x1_stride_any_depthfirst<float>>(args);
+ auto strat = new cpp_nhwc_1x1_stride_any_depthfirst<float>(args.cpu_info);
+ return new PoolingDepthfirstGeneric<float, float, Nothing>(strat, args);
},
},
#if defined(__aarch64__)
-#if defined(__ARM_FEATURE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SME)
+ {
+ PoolingMethod::DEPTHFIRST,
+ "sme_fp32_nhwc_max_2x2_s1_output2x2_depthfirst",
+ [] (const PoolingArgs &args, const Nothing &os) -> bool {
+ return args.cpu_info->has_sme() &&
+ is_supported<sme_fp32_nhwc_max_2x2_s1_output2x2_depthfirst>(args, os);
+ },
+ nullptr,
+ [] (const PoolingArgs &args, const Nothing &) -> PoolingCommon<float, float> * {
+ auto strat = new sme_fp32_nhwc_max_2x2_s1_output2x2_depthfirst(args.cpu_info);
+ return new PoolingDepthfirst<float>(strat, args);
+ },
+ },
+ {
+ PoolingMethod::DEPTHFIRST,
+ "sme_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst",
+ [] (const PoolingArgs &args, const Nothing &os) -> bool {
+ return args.cpu_info->has_sme() &&
+ is_supported<sme_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst>(args, os);
+ },
+ nullptr,
+ [] (const PoolingArgs &args, const Nothing &) -> PoolingCommon<float, float> * {
+ auto strat = new sme_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst(args.cpu_info);
+ return new PoolingDepthfirst<float>(strat, args);
+ },
+ },
+ {
+ PoolingMethod::DEPTHFIRST,
+ "sme_fp32_nhwc_avg_generic_depthfirst",
+ [] (const PoolingArgs &args, const Nothing &) -> bool {
+ return args.cpu_info->has_sme() && args.pool_type == PoolingType::AVERAGE;
+ },
+ nullptr,
+ [] (const PoolingArgs &args, const Nothing &) -> PoolingCommon<float, float> * {
+ auto strat = new sme_fp32_nhwc_avg_generic_depthfirst(args.cpu_info);
+ return new PoolingDepthfirstGeneric<float>(strat, args);
+ },
+ },
+ {
+ PoolingMethod::DEPTHFIRST,
+ "sme_fp32_nhwc_max_generic_depthfirst",
+ [] (const PoolingArgs &args, const Nothing &) -> bool {
+ return args.cpu_info->has_sme() && args.pool_type == PoolingType::MAX;
+ },
+ nullptr,
+ [] (const PoolingArgs &args, const Nothing &) -> PoolingCommon<float, float> * {
+ auto strat = new sme_fp32_nhwc_max_generic_depthfirst(args.cpu_info);
+ return new PoolingDepthfirstGeneric<float>(strat, args);
+ },
+ },
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
{
PoolingMethod::DEPTHFIRST,
"sve_fp32_nhwc_max_2x2_s1_output2x2_depthfirst",
- is_supported<sve_fp32_nhwc_max_2x2_s1_output2x2_depthfirst>,
+ [] (const PoolingArgs &args, const Nothing &os) -> bool {
+ return args.cpu_info->has_sve() &&
+ is_supported<sve_fp32_nhwc_max_2x2_s1_output2x2_depthfirst>(args, os);
+ },
nullptr,
[] (const PoolingArgs &args, const Nothing &) -> PoolingCommon<float, float> * {
- return new PoolingDepthfirst<sve_fp32_nhwc_max_2x2_s1_output2x2_depthfirst>(args);
+ auto strat = new sve_fp32_nhwc_max_2x2_s1_output2x2_depthfirst(args.cpu_info);
+ return new PoolingDepthfirst<float>(strat, args);
},
},
{
PoolingMethod::DEPTHFIRST,
"sve_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst",
- is_supported<sve_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst>,
+ [] (const PoolingArgs &args, const Nothing &os) -> bool {
+ return args.cpu_info->has_sve() &&
+ is_supported<sve_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst>(args, os);
+ },
nullptr,
[] (const PoolingArgs &args, const Nothing &) -> PoolingCommon<float, float> * {
- return new PoolingDepthfirst<sve_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst>(args);
+ auto strat = new sve_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst(args.cpu_info);
+ return new PoolingDepthfirst<float>(strat, args);
},
},
{
PoolingMethod::DEPTHFIRST,
"sve_fp32_nhwc_avg_generic_depthfirst",
- [] (const PoolingArgs &args, const Nothing &) -> bool { return args.pool_type == PoolingType::AVERAGE; },
+ [] (const PoolingArgs &args, const Nothing &) -> bool {
+ return args.cpu_info->has_sve() && args.pool_type == PoolingType::AVERAGE;
+ },
nullptr,
[] (const PoolingArgs &args, const Nothing &) -> PoolingCommon<float, float> * {
- return new PoolingDepthfirstGeneric<sve_fp32_nhwc_avg_generic_depthfirst>(args);
+ auto strat = new sve_fp32_nhwc_avg_generic_depthfirst(args.cpu_info);
+ return new PoolingDepthfirstGeneric<float>(strat, args);
},
},
{
PoolingMethod::DEPTHFIRST,
"sve_fp32_nhwc_max_generic_depthfirst",
- [] (const PoolingArgs &args, const Nothing &) -> bool { return args.pool_type == PoolingType::MAX; },
+ [] (const PoolingArgs &args, const Nothing &) -> bool {
+ return args.cpu_info->has_sve() && args.pool_type == PoolingType::MAX;
+ },
nullptr,
[] (const PoolingArgs &args, const Nothing &) -> PoolingCommon<float, float> * {
- return new PoolingDepthfirstGeneric<sve_fp32_nhwc_max_generic_depthfirst>(args);
+ auto strat = new sve_fp32_nhwc_max_generic_depthfirst(args.cpu_info);
+ return new PoolingDepthfirstGeneric<float>(strat, args);
},
},
-#endif // defined(__ARM_FEATURE_SVE)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
{
PoolingMethod::DEPTHFIRST,
"a64_fp32_nhwc_max_2x2_s1_output2x2_depthfirst",
is_supported<a64_fp32_nhwc_max_2x2_s1_output2x2_depthfirst>,
nullptr,
[] (const PoolingArgs &args, const Nothing &) -> PoolingCommon<float, float> * {
- return new PoolingDepthfirst<a64_fp32_nhwc_max_2x2_s1_output2x2_depthfirst>(args);
+ auto strat = new a64_fp32_nhwc_max_2x2_s1_output2x2_depthfirst(args.cpu_info);
+ return new PoolingDepthfirst<float>(strat, args);
},
},
{
@@ -124,7 +185,8 @@ static const PoolingImplementation<float, float> pooling_fp32_methods[] = {
is_supported<a64_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst>,
nullptr,
[] (const PoolingArgs &args, const Nothing &) -> PoolingCommon<float, float> * {
- return new PoolingDepthfirst<a64_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst>(args);
+ auto strat = new a64_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst(args.cpu_info);
+ return new PoolingDepthfirst<float>(strat, args);
},
},
{
@@ -133,7 +195,8 @@ static const PoolingImplementation<float, float> pooling_fp32_methods[] = {
[] (const PoolingArgs &args, const Nothing &) -> bool { return args.pool_type == PoolingType::AVERAGE; },
nullptr,
[] (const PoolingArgs &args, const Nothing &) -> PoolingCommon<float, float> * {
- return new PoolingDepthfirstGeneric<a64_fp32_nhwc_avg_generic_depthfirst>(args);
+ auto strat = new a64_fp32_nhwc_avg_generic_depthfirst(args.cpu_info);
+ return new PoolingDepthfirstGeneric<float>(strat, args);
},
},
{
@@ -142,7 +205,8 @@ static const PoolingImplementation<float, float> pooling_fp32_methods[] = {
[] (const PoolingArgs &args, const Nothing &) -> bool { return args.pool_type == PoolingType::MAX; },
nullptr,
[] (const PoolingArgs &args, const Nothing &) -> PoolingCommon<float, float> * {
- return new PoolingDepthfirstGeneric<a64_fp32_nhwc_max_generic_depthfirst>(args);
+ auto strat = new a64_fp32_nhwc_max_generic_depthfirst(args.cpu_info);
+ return new PoolingDepthfirstGeneric<float>(strat, args);
},
},
#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/pooling_implementation.hpp b/src/core/NEON/kernels/arm_conv/pooling/pooling_implementation.hpp
index 3d968b84e5..235aa1b635 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/pooling_implementation.hpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/pooling_implementation.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -39,7 +39,7 @@ struct PoolingImplementation
const char * name;
std::function<bool(const PoolingArgs &, const OutputStage &)> is_supported;
std::function<uint64_t(const PoolingArgs &, const OutputStage &)> cycle_estimate;
- std::function<PoolingCommon<TInput, TOutput, OutputStage> *(const PoolingArgs &, const OutputStage &)> initialise;
+ std::function<PoolingCommon<TInput, TOutput> *(const PoolingArgs &, const OutputStage &)> initialise;
bool get_is_supported(const PoolingArgs &args, const OutputStage &os) const
{
@@ -51,12 +51,15 @@ struct PoolingImplementation
return (cycle_estimate == nullptr) ? 0 : cycle_estimate(args, os);
}
- PoolingCommon<TInput, TOutput, OutputStage> *get_instance(const PoolingArgs &args, const OutputStage &os) const
+ PoolingCommon<TInput, TOutput> *get_instance(const PoolingArgs &args, const OutputStage &os) const
{
return initialise(args, os);
}
};
+/**
+ * \relates PoolingImplementation
+ */
template <typename TInput, typename TOutput, class OutputStage = Nothing>
const PoolingImplementation<TInput, TOutput, OutputStage> *pooling_implementation_list();
@@ -92,11 +95,21 @@ bool find_implementation(
}
template <typename TInput, typename TOutput, class OutputStage>
-UniquePoolingCommon<TInput, TOutput, OutputStage> pooling(const PoolingArgs &args, const OutputStage &os)
+UniquePoolingCommon<TInput, TOutput> pooling(const PoolingArgs &args, const OutputStage &os)
{
const PoolingImplementation<TInput, TOutput, OutputStage> *impl = nullptr;
const bool success = find_implementation<TInput, TOutput, OutputStage>(args, os, impl);
- return UniquePoolingCommon<TInput, TOutput, OutputStage>(success ? impl->get_instance(args, os) : nullptr);
+ return UniquePoolingCommon<TInput, TOutput>(success ? impl->get_instance(args, os) : nullptr);
+}
+
+template <class Strategy>
+bool is_supported(const PoolingArgs &args, const Nothing &)
+{
+ return ((args.pool_type == Strategy::pooling_type) &&
+ (args.pool_window.rows == Strategy::pool_rows) &&
+ (args.pool_window.cols == Strategy::pool_cols) &&
+ (args.pool_stride.rows == Strategy::stride_rows) &&
+ (args.pool_stride.cols == Strategy::stride_cols));
}
} // namespace pooling
diff --git a/src/core/NEON/kernels/arm_conv/pooling/pooling_s8.cpp b/src/core/NEON/kernels/arm_conv/pooling/pooling_s8.cpp
index 490fc0d863..8d08ddc43f 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/pooling_s8.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/pooling_s8.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -30,13 +30,16 @@
#include "kernels/cpp_nhwc_1x1_stride_any_depthfirst.hpp"
#if defined(__aarch64__)
-#if defined(__ARM_FEATURE_SVE)
-#if defined(SVE2)
+#if defined(ARM_COMPUTE_ENABLE_SME)
+#include "kernels/sme_s8_nhwc_avg_generic_depthfirst.hpp"
+#include "kernels/sme_s8_nhwc_max_2x2_s1_output2x2_depthfirst.hpp"
+#include "kernels/sme_s8_nhwc_max_generic_depthfirst.hpp"
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
#include "kernels/sve_s8_nhwc_avg_generic_depthfirst.hpp"
-#endif // defined(SVE2)
#include "kernels/sve_s8_nhwc_max_2x2_s1_output2x2_depthfirst.hpp"
#include "kernels/sve_s8_nhwc_max_generic_depthfirst.hpp"
-#endif // defined(__ARM_FEATURE_SVE)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
#include "kernels/a64_s8_nhwc_max_2x2_s1_output2x2_depthfirst.hpp"
#include "kernels/a64_s8_nhwc_avg_generic_depthfirst.hpp"
#include "kernels/a64_s8_nhwc_max_generic_depthfirst.hpp"
@@ -47,19 +50,6 @@
namespace arm_conv {
namespace pooling {
-namespace
-{
- template <class Strategy>
- bool is_supported(const PoolingArgs &args, const Nothing &)
- {
- return ((args.pool_type == Strategy::pooling_type()) &&
- (args.pool_window.rows == Strategy::pool_rows()) &&
- (args.pool_window.cols == Strategy::pool_cols()) &&
- (args.pool_stride.rows == Strategy::stride_rows()) &&
- (args.pool_stride.cols == Strategy::stride_cols()));
- }
-}
-
static const PoolingImplementation<int8_t, int8_t> pooling_s8_methods[] = {
{
PoolingMethod::DEPTHFIRST,
@@ -69,48 +59,97 @@ static const PoolingImplementation<int8_t, int8_t> pooling_s8_methods[] = {
},
nullptr,
[] (const PoolingArgs &args, const Nothing &) -> PoolingCommon<int8_t, int8_t> * {
- return new PoolingDepthfirstGeneric<cpp_nhwc_1x1_stride_any_depthfirst<int8_t>>(args);
+ auto strat = new cpp_nhwc_1x1_stride_any_depthfirst<int8_t>(args.cpu_info);
+ return new PoolingDepthfirstGeneric<int8_t>(strat, args);
},
},
#if defined(__aarch64__)
-#if defined(__ARM_FEATURE_SVE)
-#if defined(SVE2)
+#if defined(ARM_COMPUTE_ENABLE_SME)
{
PoolingMethod::DEPTHFIRST,
- "sve_s8_nhwc_avg_generic_depthfirst",
- [] (const PoolingArgs &args, const Nothing &) -> bool { return args.pool_type == PoolingType::AVERAGE; },
+ "sme_s8_nhwc_max_2x2_s1_output2x2_depthfirst",
+ [] (const PoolingArgs &args, const Nothing &os) -> bool {
+ return args.cpu_info->has_sme() &&
+ is_supported<sme_s8_nhwc_max_2x2_s1_output2x2_depthfirst>(args, os);
+ },
+ nullptr,
+ [] (const PoolingArgs &args, const Nothing &) -> PoolingCommon<int8_t, int8_t> * {
+ auto strat = new sme_s8_nhwc_max_2x2_s1_output2x2_depthfirst(args.cpu_info);
+ return new PoolingDepthfirst<int8_t>(strat, args);
+ },
+ },
+ {
+ PoolingMethod::DEPTHFIRST,
+ "sme_s8_nhwc_avg_generic_depthfirst",
+ [] (const PoolingArgs &args, const Nothing &) -> bool {
+ return args.cpu_info->has_sme2() && args.pool_type == PoolingType::AVERAGE;
+ },
+ nullptr,
+ [] (const PoolingArgs &args, const Nothing &) -> PoolingCommon<int8_t, int8_t> * {
+ auto strat = new sme_s8_nhwc_avg_generic_depthfirst(args.cpu_info);
+ return new PoolingDepthfirstGeneric<int8_t>(strat, args);
+ },
+ },
+ {
+ PoolingMethod::DEPTHFIRST,
+ "sme_s8_nhwc_max_generic_depthfirst",
+ [] (const PoolingArgs &args, const Nothing &) -> bool {
+ return args.cpu_info->has_sme() && args.pool_type == PoolingType::MAX;
+ },
nullptr,
[] (const PoolingArgs &args, const Nothing &) -> PoolingCommon<int8_t, int8_t> * {
- return new PoolingDepthfirstGeneric<sve_s8_nhwc_avg_generic_depthfirst>(args);
+ auto strat = new sme_s8_nhwc_max_generic_depthfirst(args.cpu_info);
+ return new PoolingDepthfirstGeneric<int8_t>(strat, args);
},
},
-#endif // defined(SVE2)
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
{
PoolingMethod::DEPTHFIRST,
"sve_s8_nhwc_max_2x2_s1_output2x2_depthfirst",
- is_supported<sve_s8_nhwc_max_2x2_s1_output2x2_depthfirst>,
+ [] (const PoolingArgs &args, const Nothing &os) -> bool {
+ return args.cpu_info->has_sve() &&
+ is_supported<sve_s8_nhwc_max_2x2_s1_output2x2_depthfirst>(args, os);
+ },
+ nullptr,
+ [] (const PoolingArgs &args, const Nothing &) -> PoolingCommon<int8_t, int8_t> * {
+ auto strat = new sve_s8_nhwc_max_2x2_s1_output2x2_depthfirst(args.cpu_info);
+ return new PoolingDepthfirst<int8_t>(strat, args);
+ },
+ },
+ {
+ PoolingMethod::DEPTHFIRST,
+ "sve_s8_nhwc_avg_generic_depthfirst",
+ [] (const PoolingArgs &args, const Nothing &) -> bool {
+ return args.cpu_info->has_sve2() && args.pool_type == PoolingType::AVERAGE;
+ },
nullptr,
[] (const PoolingArgs &args, const Nothing &) -> PoolingCommon<int8_t, int8_t> * {
- return new PoolingDepthfirst<sve_s8_nhwc_max_2x2_s1_output2x2_depthfirst>(args);
+ auto strat = new sve_s8_nhwc_avg_generic_depthfirst(args.cpu_info);
+ return new PoolingDepthfirstGeneric<int8_t>(strat, args);
},
},
{
PoolingMethod::DEPTHFIRST,
"sve_s8_nhwc_max_generic_depthfirst",
- [] (const PoolingArgs &args, const Nothing &) -> bool { return args.pool_type == PoolingType::MAX; },
+ [] (const PoolingArgs &args, const Nothing &) -> bool {
+ return args.cpu_info->has_sve() && args.pool_type == PoolingType::MAX;
+ },
nullptr,
[] (const PoolingArgs &args, const Nothing &) -> PoolingCommon<int8_t, int8_t> * {
- return new PoolingDepthfirstGeneric<sve_s8_nhwc_max_generic_depthfirst>(args);
+ auto strat = new sve_s8_nhwc_max_generic_depthfirst(args.cpu_info);
+ return new PoolingDepthfirstGeneric<int8_t>(strat, args);
},
},
-#endif // defined(__ARM_FEATURE_SVE)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
{
PoolingMethod::DEPTHFIRST,
"a64_s8_nhwc_max_2x2_s1_output2x2_depthfirst",
is_supported<a64_s8_nhwc_max_2x2_s1_output2x2_depthfirst>,
nullptr,
[] (const PoolingArgs &args, const Nothing &) -> PoolingCommon<int8_t, int8_t> * {
- return new PoolingDepthfirst<a64_s8_nhwc_max_2x2_s1_output2x2_depthfirst>(args);
+ auto strat = new a64_s8_nhwc_max_2x2_s1_output2x2_depthfirst(args.cpu_info);
+ return new PoolingDepthfirst<int8_t>(strat, args);
},
},
{
@@ -119,7 +158,8 @@ static const PoolingImplementation<int8_t, int8_t> pooling_s8_methods[] = {
[] (const PoolingArgs &args, const Nothing &) -> bool { return args.pool_type == PoolingType::AVERAGE; },
nullptr,
[] (const PoolingArgs &args, const Nothing &) -> PoolingCommon<int8_t, int8_t> * {
- return new PoolingDepthfirstGeneric<a64_s8_nhwc_avg_generic_depthfirst>(args);
+ auto strat = new a64_s8_nhwc_avg_generic_depthfirst(args.cpu_info);
+ return new PoolingDepthfirstGeneric<int8_t>(strat, args);
},
},
{
@@ -128,7 +168,8 @@ static const PoolingImplementation<int8_t, int8_t> pooling_s8_methods[] = {
[] (const PoolingArgs &args, const Nothing &) -> bool { return args.pool_type == PoolingType::MAX; },
nullptr,
[] (const PoolingArgs &args, const Nothing &) -> PoolingCommon<int8_t, int8_t> * {
- return new PoolingDepthfirstGeneric<a64_s8_nhwc_max_generic_depthfirst>(args);
+ auto strat = new a64_s8_nhwc_max_generic_depthfirst(args.cpu_info);
+ return new PoolingDepthfirstGeneric<int8_t>(strat, args);
},
},
#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/pooling_s8q.cpp b/src/core/NEON/kernels/arm_conv/pooling/pooling_s8q.cpp
index fd4e045035..dcb3c8f57c 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/pooling_s8q.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/pooling_s8q.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,13 +25,17 @@
#include "arm_gemm_local.hpp"
#include "pooling_implementation.hpp"
-#include "pooling_depthfirst_generic_quantized.hpp"
+#include "pooling_depthfirst_generic.hpp"
#if defined(__aarch64__)
-#if defined(__ARM_FEATURE_SVE) && defined(SVE2)
+#if defined(ARM_COMPUTE_ENABLE_SME)
+#include "kernels/sme_s8q_nhwc_avg_generic_depthfirst.hpp"
+#include "kernels/sme_s8q_nhwc_max_generic_depthfirst.hpp"
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
#include "kernels/sve_s8q_nhwc_avg_generic_depthfirst.hpp"
#include "kernels/sve_s8q_nhwc_max_generic_depthfirst.hpp"
-#endif // defined(__ARM_FEATURE_SVE) && defined(SVE2)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
#include "kernels/a64_s8q_nhwc_avg_generic_depthfirst.hpp"
#include "kernels/a64_s8q_nhwc_max_generic_depthfirst.hpp"
#endif // defined(__aarch64__)
@@ -41,30 +45,60 @@
namespace arm_conv {
namespace pooling {
-static const PoolingImplementation<int8_t, int8_t, Requantize32> pooling_u8_methods[] = {
+static const PoolingImplementation<int8_t, int8_t, Requantize32> pooling_s8q_methods[] = {
#if defined(__aarch64__)
-#if defined(__ARM_FEATURE_SVE) && defined(SVE2)
+#if defined(ARM_COMPUTE_ENABLE_SME)
+ {
+ PoolingMethod::DEPTHFIRST,
+ "sme_s8q_nhwc_avg_generic_depthfirst",
+ [] (const PoolingArgs &args, const Requantize32 &) -> bool {
+ return args.cpu_info->has_sme2() && args.pool_type == PoolingType::AVERAGE;
+ },
+ nullptr,
+ [] (const PoolingArgs &args, const Requantize32 &rq) -> PoolingCommon<int8_t, int8_t> * {
+ auto strat = new sme_s8q_nhwc_avg_generic_depthfirst(args.cpu_info);
+ return new PoolingDepthfirstGeneric<int8_t, int8_t, Requantize32>(strat, args, rq);
+ },
+ },
+ {
+ PoolingMethod::DEPTHFIRST,
+ "sme_s8q_nhwc_max_generic_depthfirst",
+ [] (const PoolingArgs &args, const Requantize32 &) -> bool {
+ return args.cpu_info->has_sme2() && args.pool_type == PoolingType::MAX;
+ },
+ nullptr,
+ [] (const PoolingArgs &args, const Requantize32 &rq) -> PoolingCommon<int8_t, int8_t> * {
+ auto strat = new sme_s8q_nhwc_max_generic_depthfirst(args.cpu_info);
+ return new PoolingDepthfirstGeneric<int8_t, int8_t, Requantize32>(strat, args, rq);
+ },
+ },
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
{
PoolingMethod::DEPTHFIRST,
"sve_s8q_nhwc_avg_generic_depthfirst",
[] (const PoolingArgs &args, const Requantize32 &) -> bool {
- return args.pool_type == PoolingType::AVERAGE;
+ return args.cpu_info->has_sve2() && args.pool_type == PoolingType::AVERAGE;
},
nullptr,
- [] (const PoolingArgs &args, const Requantize32 &rq) -> PoolingCommon<int8_t, int8_t, Requantize32> * {
- return new PoolingDepthfirstGenericQuantized<sve_s8q_nhwc_avg_generic_depthfirst>(args, rq);
+ [] (const PoolingArgs &args, const Requantize32 &rq) -> PoolingCommon<int8_t, int8_t> * {
+ auto strat = new sve_s8q_nhwc_avg_generic_depthfirst(args.cpu_info);
+ return new PoolingDepthfirstGeneric<int8_t, int8_t, Requantize32>(strat, args, rq);
},
},
{
PoolingMethod::DEPTHFIRST,
"sve_s8q_nhwc_max_generic_depthfirst",
- [] (const PoolingArgs &args, const Requantize32 &) -> bool { return args.pool_type == PoolingType::MAX; },
+ [] (const PoolingArgs &args, const Requantize32 &) -> bool {
+ return args.cpu_info->has_sve2() && args.pool_type == PoolingType::MAX;
+ },
nullptr,
- [] (const PoolingArgs &args, const Requantize32 &rq) -> PoolingCommon<int8_t, int8_t, Requantize32> * {
- return new PoolingDepthfirstGenericQuantized<sve_s8q_nhwc_max_generic_depthfirst>(args, rq);
+ [] (const PoolingArgs &args, const Requantize32 &rq) -> PoolingCommon<int8_t, int8_t> * {
+ auto strat = new sve_s8q_nhwc_max_generic_depthfirst(args.cpu_info);
+ return new PoolingDepthfirstGeneric<int8_t, int8_t, Requantize32>(strat, args, rq);
},
},
-#endif // defined(__ARM_FEATURE_SVE) && defined(SVE2)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
{
PoolingMethod::DEPTHFIRST,
"a64_s8q_nhwc_avg_generic_depthfirst",
@@ -72,8 +106,9 @@ static const PoolingImplementation<int8_t, int8_t, Requantize32> pooling_u8_meth
return args.pool_type == PoolingType::AVERAGE;
},
nullptr,
- [] (const PoolingArgs &args, const Requantize32 &rq) -> PoolingCommon<int8_t, int8_t, Requantize32> * {
- return new PoolingDepthfirstGenericQuantized<a64_s8q_nhwc_avg_generic_depthfirst>(args, rq);
+ [] (const PoolingArgs &args, const Requantize32 &rq) -> PoolingCommon<int8_t, int8_t> * {
+ auto strat = new a64_s8q_nhwc_avg_generic_depthfirst(args.cpu_info);
+ return new PoolingDepthfirstGeneric<int8_t, int8_t, Requantize32>(strat, args, rq);
},
},
{
@@ -81,8 +116,9 @@ static const PoolingImplementation<int8_t, int8_t, Requantize32> pooling_u8_meth
"a64_s8q_nhwc_max_generic_depthfirst",
[] (const PoolingArgs &args, const Requantize32 &) -> bool { return args.pool_type == PoolingType::MAX; },
nullptr,
- [] (const PoolingArgs &args, const Requantize32 &rq) -> PoolingCommon<int8_t, int8_t, Requantize32> * {
- return new PoolingDepthfirstGenericQuantized<a64_s8q_nhwc_max_generic_depthfirst>(args, rq);
+ [] (const PoolingArgs &args, const Requantize32 &rq) -> PoolingCommon<int8_t, int8_t> * {
+ auto strat = new a64_s8q_nhwc_max_generic_depthfirst(args.cpu_info);
+ return new PoolingDepthfirstGeneric<int8_t, int8_t, Requantize32>(strat, args, rq);
},
},
#endif // defined(__aarch64__)
@@ -92,10 +128,10 @@ static const PoolingImplementation<int8_t, int8_t, Requantize32> pooling_u8_meth
template <>
const PoolingImplementation<int8_t, int8_t, Requantize32> *pooling_implementation_list()
{
- return pooling_u8_methods;
+ return pooling_s8q_methods;
}
-template UniquePoolingCommon<int8_t, int8_t, Requantize32> pooling(const PoolingArgs &, const Requantize32 &);
+template UniquePoolingCommon<int8_t, int8_t> pooling(const PoolingArgs &, const Requantize32 &);
} // namespace pooling
} // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/pooling/pooling_u8.cpp b/src/core/NEON/kernels/arm_conv/pooling/pooling_u8.cpp
index 052354922e..ee5a79b4ff 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/pooling_u8.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/pooling_u8.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -30,13 +30,16 @@
#include "kernels/cpp_nhwc_1x1_stride_any_depthfirst.hpp"
#if defined(__aarch64__)
-#if defined(__ARM_FEATURE_SVE)
-#if defined(SVE2)
+#if defined(ARM_COMPUTE_ENABLE_SME)
+#include "kernels/sme_u8_nhwc_avg_generic_depthfirst.hpp"
+#include "kernels/sme_u8_nhwc_max_2x2_s1_output2x2_depthfirst.hpp"
+#include "kernels/sme_u8_nhwc_max_generic_depthfirst.hpp"
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
#include "kernels/sve_u8_nhwc_avg_generic_depthfirst.hpp"
-#endif // defined(SVE2)
#include "kernels/sve_u8_nhwc_max_2x2_s1_output2x2_depthfirst.hpp"
#include "kernels/sve_u8_nhwc_max_generic_depthfirst.hpp"
-#endif // defined(__ARM_FEATURE_SVE)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
#include "kernels/a64_u8_nhwc_max_2x2_s1_output2x2_depthfirst.hpp"
#include "kernels/a64_u8_nhwc_avg_generic_depthfirst.hpp"
#include "kernels/a64_u8_nhwc_max_generic_depthfirst.hpp"
@@ -47,19 +50,6 @@
namespace arm_conv {
namespace pooling {
-namespace
-{
- template <class Strategy>
- bool is_supported(const PoolingArgs &args, const Nothing &)
- {
- return ((args.pool_type == Strategy::pooling_type()) &&
- (args.pool_window.rows == Strategy::pool_rows()) &&
- (args.pool_window.cols == Strategy::pool_cols()) &&
- (args.pool_stride.rows == Strategy::stride_rows()) &&
- (args.pool_stride.cols == Strategy::stride_cols()));
- }
-}
-
static const PoolingImplementation<uint8_t, uint8_t> pooling_u8_methods[] = {
{
PoolingMethod::DEPTHFIRST,
@@ -69,15 +59,28 @@ static const PoolingImplementation<uint8_t, uint8_t> pooling_u8_methods[] = {
},
nullptr,
[] (const PoolingArgs &args, const Nothing &) -> PoolingCommon<uint8_t, uint8_t> * {
- return new PoolingDepthfirstGeneric<cpp_nhwc_1x1_stride_any_depthfirst<uint8_t>>(args);
+ auto strat = new cpp_nhwc_1x1_stride_any_depthfirst<uint8_t>(args.cpu_info);
+ return new PoolingDepthfirstGeneric<uint8_t>(strat, args);
},
},
#if defined(__aarch64__)
-#if defined(__ARM_FEATURE_SVE)
-#if defined(SVE2)
+#if defined(ARM_COMPUTE_ENABLE_SME)
{
PoolingMethod::DEPTHFIRST,
- "sve_u8_nhwc_avg_generic_depthfirst",
+ "sme_u8_nhwc_max_2x2_s1_output2x2_depthfirst",
+ [] (const PoolingArgs &args, const Nothing &os) -> bool {
+ return args.cpu_info->has_sme() &&
+ is_supported<sme_u8_nhwc_max_2x2_s1_output2x2_depthfirst>(args, os);
+ },
+ nullptr,
+ [] (const PoolingArgs &args, const Nothing &) -> PoolingCommon<uint8_t, uint8_t> * {
+ auto strat = new sme_u8_nhwc_max_2x2_s1_output2x2_depthfirst(args.cpu_info);
+ return new PoolingDepthfirst<uint8_t>(strat, args);
+ },
+ },
+ {
+ PoolingMethod::DEPTHFIRST,
+ "sme_u8_nhwc_avg_generic_depthfirst",
[] (const PoolingArgs &args, const Nothing &) -> bool {
// This kernel can only be used when there is either no padding, or we don't care
// about the value of the padding. Otherwise, we would need to pass in the zero-point
@@ -85,40 +88,82 @@ static const PoolingImplementation<uint8_t, uint8_t> pooling_u8_methods[] = {
return (args.exclude_padding ||
(args.padding.top == 0 && args.padding.bottom == 0 &&
args.padding.left == 0 && args.padding.right == 0)
- ) && args.pool_type == PoolingType::AVERAGE;
+ ) && args.pool_type == PoolingType::AVERAGE &&
+ args.cpu_info->has_sme2();
+ },
+ nullptr,
+ [] (const PoolingArgs &args, const Nothing &) -> PoolingCommon<uint8_t, uint8_t> * {
+ auto strat = new sme_u8_nhwc_avg_generic_depthfirst(args.cpu_info);
+ return new PoolingDepthfirstGeneric<uint8_t>(strat, args);
+ },
+ },
+ {
+ PoolingMethod::DEPTHFIRST,
+ "sme_u8_nhwc_max_generic_depthfirst",
+ [] (const PoolingArgs &args, const Nothing &) -> bool {
+ return args.cpu_info->has_sme() && args.pool_type == PoolingType::MAX;
},
nullptr,
[] (const PoolingArgs &args, const Nothing &) -> PoolingCommon<uint8_t, uint8_t> * {
- return new PoolingDepthfirstGeneric<sve_u8_nhwc_avg_generic_depthfirst>(args);
+ auto strat = new sme_u8_nhwc_max_generic_depthfirst(args.cpu_info);
+ return new PoolingDepthfirstGeneric<uint8_t>(strat, args);
},
},
-#endif // defined(SVE2)
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
{
PoolingMethod::DEPTHFIRST,
"sve_u8_nhwc_max_2x2_s1_output2x2_depthfirst",
- is_supported<sve_u8_nhwc_max_2x2_s1_output2x2_depthfirst>,
+ [] (const PoolingArgs &args, const Nothing &os) -> bool {
+ return args.cpu_info->has_sve() &&
+ is_supported<sve_u8_nhwc_max_2x2_s1_output2x2_depthfirst>(args, os);
+ },
+ nullptr,
+ [] (const PoolingArgs &args, const Nothing &) -> PoolingCommon<uint8_t, uint8_t> * {
+ auto strat = new sve_u8_nhwc_max_2x2_s1_output2x2_depthfirst(args.cpu_info);
+ return new PoolingDepthfirst<uint8_t>(strat, args);
+ },
+ },
+ {
+ PoolingMethod::DEPTHFIRST,
+ "sve_u8_nhwc_avg_generic_depthfirst",
+ [] (const PoolingArgs &args, const Nothing &) -> bool {
+ // This kernel can only be used when there is either no padding, or we don't care
+ // about the value of the padding. Otherwise, we would need to pass in the zero-point
+ // for the quantization regime.
+ return (args.exclude_padding ||
+ (args.padding.top == 0 && args.padding.bottom == 0 &&
+ args.padding.left == 0 && args.padding.right == 0)
+ ) && args.pool_type == PoolingType::AVERAGE &&
+ args.cpu_info->has_sve2();
+ },
nullptr,
[] (const PoolingArgs &args, const Nothing &) -> PoolingCommon<uint8_t, uint8_t> * {
- return new PoolingDepthfirst<sve_u8_nhwc_max_2x2_s1_output2x2_depthfirst>(args);
+ auto strat = new sve_u8_nhwc_avg_generic_depthfirst(args.cpu_info);
+ return new PoolingDepthfirstGeneric<uint8_t>(strat, args);
},
},
{
PoolingMethod::DEPTHFIRST,
"sve_u8_nhwc_max_generic_depthfirst",
- [] (const PoolingArgs &args, const Nothing &) -> bool { return args.pool_type == PoolingType::MAX; },
+ [] (const PoolingArgs &args, const Nothing &) -> bool {
+ return args.cpu_info->has_sve() && args.pool_type == PoolingType::MAX;
+ },
nullptr,
[] (const PoolingArgs &args, const Nothing &) -> PoolingCommon<uint8_t, uint8_t> * {
- return new PoolingDepthfirstGeneric<sve_u8_nhwc_max_generic_depthfirst>(args);
+ auto strat = new sve_u8_nhwc_max_generic_depthfirst(args.cpu_info);
+ return new PoolingDepthfirstGeneric<uint8_t>(strat, args);
},
},
-#endif // defined(__ARM_FEATURE_SVE)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
{
PoolingMethod::DEPTHFIRST,
"a64_u8_nhwc_max_2x2_s1_output2x2_depthfirst",
is_supported<a64_u8_nhwc_max_2x2_s1_output2x2_depthfirst>,
nullptr,
[] (const PoolingArgs &args, const Nothing &) -> PoolingCommon<uint8_t, uint8_t> * {
- return new PoolingDepthfirst<a64_u8_nhwc_max_2x2_s1_output2x2_depthfirst>(args);
+ auto strat = new a64_u8_nhwc_max_2x2_s1_output2x2_depthfirst(args.cpu_info);
+ return new PoolingDepthfirst<uint8_t>(strat, args);
},
},
{
@@ -135,7 +180,8 @@ static const PoolingImplementation<uint8_t, uint8_t> pooling_u8_methods[] = {
},
nullptr,
[] (const PoolingArgs &args, const Nothing &) -> PoolingCommon<uint8_t, uint8_t> * {
- return new PoolingDepthfirstGeneric<a64_u8_nhwc_avg_generic_depthfirst>(args);
+ auto strat = new a64_u8_nhwc_avg_generic_depthfirst(args.cpu_info);
+ return new PoolingDepthfirstGeneric<uint8_t>(strat, args);
},
},
{
@@ -144,7 +190,8 @@ static const PoolingImplementation<uint8_t, uint8_t> pooling_u8_methods[] = {
[] (const PoolingArgs &args, const Nothing &) -> bool { return args.pool_type == PoolingType::MAX; },
nullptr,
[] (const PoolingArgs &args, const Nothing &) -> PoolingCommon<uint8_t, uint8_t> * {
- return new PoolingDepthfirstGeneric<a64_u8_nhwc_max_generic_depthfirst>(args);
+ auto strat = new a64_u8_nhwc_max_generic_depthfirst(args.cpu_info);
+ return new PoolingDepthfirstGeneric<uint8_t>(strat, args);
},
},
#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/pooling_u8q.cpp b/src/core/NEON/kernels/arm_conv/pooling/pooling_u8q.cpp
index 41303fb418..cd1b02889c 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/pooling_u8q.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/pooling_u8q.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,13 +25,17 @@
#include "arm_gemm_local.hpp"
#include "pooling_implementation.hpp"
-#include "pooling_depthfirst_generic_quantized.hpp"
+#include "pooling_depthfirst_generic.hpp"
#if defined(__aarch64__)
-#if defined(__ARM_FEATURE_SVE) && defined(SVE2)
+#if defined(ARM_COMPUTE_ENABLE_SME)
+#include "kernels/sme_u8q_nhwc_avg_generic_depthfirst.hpp"
+#include "kernels/sme_u8q_nhwc_max_generic_depthfirst.hpp"
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
#include "kernels/sve_u8q_nhwc_avg_generic_depthfirst.hpp"
#include "kernels/sve_u8q_nhwc_max_generic_depthfirst.hpp"
-#endif // defined(__ARM_FEATURE_SVE) && defined(SVE2)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
#include "kernels/a64_u8q_nhwc_avg_generic_depthfirst.hpp"
#include "kernels/a64_u8q_nhwc_max_generic_depthfirst.hpp"
#endif // defined(__aarch64__)
@@ -41,30 +45,60 @@
namespace arm_conv {
namespace pooling {
-static const PoolingImplementation<uint8_t, uint8_t, Requantize32> pooling_u8_methods[] = {
+static const PoolingImplementation<uint8_t, uint8_t, Requantize32> pooling_u8q_methods[] = {
#if defined(__aarch64__)
-#if defined(__ARM_FEATURE_SVE) && defined(SVE2)
+#if defined(ARM_COMPUTE_ENABLE_SME)
+ {
+ PoolingMethod::DEPTHFIRST,
+ "sme_u8q_nhwc_avg_generic_depthfirst",
+ [] (const PoolingArgs &args, const Requantize32 &) -> bool {
+ return args.cpu_info->has_sme2() && args.pool_type == PoolingType::AVERAGE;
+ },
+ nullptr,
+ [] (const PoolingArgs &args, const Requantize32 &rq) -> PoolingCommon<uint8_t, uint8_t> * {
+ auto strat = new sme_u8q_nhwc_avg_generic_depthfirst(args.cpu_info);
+ return new PoolingDepthfirstGeneric<uint8_t, uint8_t, Requantize32>(strat, args, rq);
+ },
+ },
+ {
+ PoolingMethod::DEPTHFIRST,
+ "sme_u8q_nhwc_max_generic_depthfirst",
+ [] (const PoolingArgs &args, const Requantize32 &) -> bool {
+ return args.cpu_info->has_sme2() && args.pool_type == PoolingType::MAX;
+ },
+ nullptr,
+ [] (const PoolingArgs &args, const Requantize32 &rq) -> PoolingCommon<uint8_t, uint8_t> * {
+ auto strat = new sme_u8q_nhwc_max_generic_depthfirst(args.cpu_info);
+ return new PoolingDepthfirstGeneric<uint8_t, uint8_t, Requantize32>(strat, args, rq);
+ },
+ },
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
{
PoolingMethod::DEPTHFIRST,
"sve_u8q_nhwc_avg_generic_depthfirst",
[] (const PoolingArgs &args, const Requantize32 &) -> bool {
- return args.pool_type == PoolingType::AVERAGE;
+ return args.cpu_info->has_sve2() && args.pool_type == PoolingType::AVERAGE;
},
nullptr,
- [] (const PoolingArgs &args, const Requantize32 &rq) -> PoolingCommon<uint8_t, uint8_t, Requantize32> * {
- return new PoolingDepthfirstGenericQuantized<sve_u8q_nhwc_avg_generic_depthfirst>(args, rq);
+ [] (const PoolingArgs &args, const Requantize32 &rq) -> PoolingCommon<uint8_t, uint8_t> * {
+ auto strat = new sve_u8q_nhwc_avg_generic_depthfirst(args.cpu_info);
+ return new PoolingDepthfirstGeneric<uint8_t, uint8_t, Requantize32>(strat, args, rq);
},
},
{
PoolingMethod::DEPTHFIRST,
"sve_u8q_nhwc_max_generic_depthfirst",
- [] (const PoolingArgs &args, const Requantize32 &) -> bool { return args.pool_type == PoolingType::MAX; },
+ [] (const PoolingArgs &args, const Requantize32 &) -> bool {
+ return args.cpu_info->has_sve2() && args.pool_type == PoolingType::MAX;
+ },
nullptr,
- [] (const PoolingArgs &args, const Requantize32 &rq) -> PoolingCommon<uint8_t, uint8_t, Requantize32> * {
- return new PoolingDepthfirstGenericQuantized<sve_u8q_nhwc_max_generic_depthfirst>(args, rq);
+ [] (const PoolingArgs &args, const Requantize32 &rq) -> PoolingCommon<uint8_t, uint8_t> * {
+ auto strat = new sve_u8q_nhwc_max_generic_depthfirst(args.cpu_info);
+ return new PoolingDepthfirstGeneric<uint8_t, uint8_t, Requantize32>(strat, args, rq);
},
},
-#endif // defined(__ARM_FEATURE_SVE) && defined(SVE2)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
{
PoolingMethod::DEPTHFIRST,
"a64_u8q_nhwc_avg_generic_depthfirst",
@@ -72,8 +106,9 @@ static const PoolingImplementation<uint8_t, uint8_t, Requantize32> pooling_u8_me
return args.pool_type == PoolingType::AVERAGE;
},
nullptr,
- [] (const PoolingArgs &args, const Requantize32 &rq) -> PoolingCommon<uint8_t, uint8_t, Requantize32> * {
- return new PoolingDepthfirstGenericQuantized<a64_u8q_nhwc_avg_generic_depthfirst>(args, rq);
+ [] (const PoolingArgs &args, const Requantize32 &rq) -> PoolingCommon<uint8_t, uint8_t> * {
+ auto strat = new a64_u8q_nhwc_avg_generic_depthfirst(args.cpu_info);
+ return new PoolingDepthfirstGeneric<uint8_t, uint8_t, Requantize32>(strat, args, rq);
},
},
{
@@ -81,8 +116,9 @@ static const PoolingImplementation<uint8_t, uint8_t, Requantize32> pooling_u8_me
"a64_u8q_nhwc_max_generic_depthfirst",
[] (const PoolingArgs &args, const Requantize32 &) -> bool { return args.pool_type == PoolingType::MAX; },
nullptr,
- [] (const PoolingArgs &args, const Requantize32 &rq) -> PoolingCommon<uint8_t, uint8_t, Requantize32> * {
- return new PoolingDepthfirstGenericQuantized<a64_u8q_nhwc_max_generic_depthfirst>(args, rq);
+ [] (const PoolingArgs &args, const Requantize32 &rq) -> PoolingCommon<uint8_t, uint8_t> * {
+ auto strat = new a64_u8q_nhwc_max_generic_depthfirst(args.cpu_info);
+ return new PoolingDepthfirstGeneric<uint8_t, uint8_t, Requantize32>(strat, args, rq);
},
},
#endif // defined(__aarch64__)
@@ -92,10 +128,10 @@ static const PoolingImplementation<uint8_t, uint8_t, Requantize32> pooling_u8_me
template <>
const PoolingImplementation<uint8_t, uint8_t, Requantize32> *pooling_implementation_list()
{
- return pooling_u8_methods;
+ return pooling_u8q_methods;
}
-template UniquePoolingCommon<uint8_t, uint8_t, Requantize32> pooling(const PoolingArgs &, const Requantize32 &);
+template UniquePoolingCommon<uint8_t, uint8_t> pooling(const PoolingArgs &, const Requantize32 &);
} // namespace pooling
} // namespace arm_conv