Diffstat (limited to 'src/core/NEON/kernels/arm_conv/depthwise/working_space.hpp')
-rw-r--r--   src/core/NEON/kernels/arm_conv/depthwise/working_space.hpp   461
1 files changed, 461 insertions, 0 deletions
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/working_space.hpp b/src/core/NEON/kernels/arm_conv/depthwise/working_space.hpp
new file mode 100644
index 0000000000..9805fd354f
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/working_space.hpp
@@ -0,0 +1,461 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+/* Depthwise kernel drivers commonly require a per-thread blob of working space
+ * in which to store parameters required by the depthwise implementations. The
+ * composition of this working space varies with the driver, kernel, and data
+ * types -- but the tasks of requesting sufficient space, allocating buffer
+ * space, and performing initialisation of the working space are common.
+ *
+ * This file provides a number of working space "Elements" (which are logical
+ * units of functionality) and a Workspace type which allows for compile-time
+ * composition of elements into a single working space type.
+ *
+ * Creating a workspace
+ * ====================
+ *
+ * A new workspace type can be created by combining Elements as an argument to
+ * the Workspace class. For instance:
+ *
+ * Workspace<
+ * depthwise_depthfirst::InputArrayElement<float>,
+ * InputBufferElement<float>,
+ * OutputArrayElement<float>
+ * >
+ *
+ * creates a new Workspace consisting of the given elements. The workspace type
+ * contained within this class (`Workspace<...>::WorkspaceType`) is equivalent to:
+ *
+ * struct WorkspaceType
+ * {
+ * const float **inptr_array; // From InputArrayElement<float>
+ * float *input_buffer; // From InputBufferElement<float>
+ * float **outptr_array; // From OutputArrayElement<float>
+ * float *output_buffer; // From OutputArrayElement<float>
+ * };
+ *
+ * Calling `Workspace<...>::get_sizeof_workspace(...)` will return the amount
+ * of space required to store the above struct and the elements contained
+ * within it. Once this space has been allocated, the workspace can be
+ * initialised by calling `Workspace<...>::initialise` with a pointer to the
+ * buffer and the same arguments. This will place a struct of type
+ * `Workspace<...>::WorkspaceType` at the start of the buffer, and share the
+ * remaining space between the specified elements. As this is all done at
+ * compile time, later code can access elements from the `WorkspaceType` by
+ * name.
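+ *
+ * As a hypothetical usage sketch, assuming a pointer `strategy` to an
+ * `IDepthfirstStrategy` and a populated `DepthwiseArgs` named `dwargs` are
+ * already in scope, sizing and initialising such a workspace might look like:
+ *
+ *   using WS = Workspace<
+ *     InputBufferElement<float>,
+ *     OutputArrayElement<float>
+ *   >;
+ *
+ *   WorkspaceArgs<IDepthfirstStrategy> args(strategy, dwargs);
+ *   const auto sz = WS::get_sizeof_workspace(args);  // Bytes required
+ *   void *buffer = operator new(sz);                 // One blob per thread
+ *   WS::initialise(buffer, args);
+ *
+ *   // Elements are then accessible by name through the composed struct:
+ *   auto ws = reinterpret_cast<WS::WorkspaceType *>(buffer);
+ *   float *pad_row = ws->input_buffer;    // From InputBufferElement<float>
+ *   float **outptrs = ws->outptr_array;   // From OutputArrayElement<float>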
+ *
+ * Writing a new element
+ * =====================
+ *
+ * Each Element must provide:
+ * - A struct called "Workspace" containing the variables contained within
+ * this portion of the workspace.
+ * - A static method called `get_element_size` which returns the amount of
+ * buffer space required by this element of the workspace (NOT including the
+ * size of the Workspace struct). For example, an element which stores a
+ *   vector of pointers will return the amount of space required to store the
+ * vector.
+ * - A static method called `initialise` which accepts a pointer to a struct
+ * which will be composed of the Element's `Workspace` struct (along with
+ * other elements), a pointer to the start of the buffer allocated for this
+ * portion of the workspace, and arguments to be used to initialise the
+ * workspace. The Element should consume as much of the buffer as it
+ * requires, initialise the Workspace, and then return the pointer to the
+ * next free byte of the buffer.
+ *
+ * See the below elements for an example of how this should work.
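+ *
+ * For illustration, a minimal hypothetical element (the names below are
+ * purely illustrative) which provides a scratch vector of one float per
+ * input channel might be written as:
+ *
+ *   struct ScratchVectorElement
+ *   {
+ *     struct Workspace { float *scratch; };
+ *
+ *     template <class S, class OS>
+ *     static size_t get_element_size(const WorkspaceArgs<S, OS> &args)
+ *     {
+ *       return sizeof(float) * args.depthwise_args.input_channels;
+ *     }
+ *
+ *     template <class WorkspaceType, class S, class OS>
+ *     static void *initialise(WorkspaceType *ws, void *buffer,
+ *                             const WorkspaceArgs<S, OS> &args)
+ *     {
+ *       ws->scratch = reinterpret_cast<float *>(buffer);
+ *       return reinterpret_cast<char *>(buffer) + get_element_size(args);
+ *     }
+ *   };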
+ */
+
+#pragma once
+
+#include "depthwise.hpp"
+#include "depthfirst_driver.hpp"
+#include "utils.hpp"
+
+namespace arm_conv {
+namespace depthwise {
+namespace { // anonymous because we expect this to appear in several compilation units
+
+/* Arguments to use to size and initialise a workspace.
+ */
+template <class StratType, class OutputStage=Nothing>
+struct WorkspaceArgs
+{
+ const StratType *strategy;
+ const DepthwiseArgs &depthwise_args;
+ const OutputStage &output_stage;
+
+ WorkspaceArgs(const StratType *strat, const DepthwiseArgs &dwargs, const OutputStage &os = {})
+ : strategy(strat), depthwise_args(dwargs), output_stage(os)
+ {
+ }
+};
+
+
+/* Sometimes we use templated structs to fill in workspace types; the Empty
+ * element is useful when a blank element is required for some sets of
+ * parameters.
+ */
+struct EmptyElement
+{
+ struct Workspace {};
+
+ template <class StratType, class OutputStage>
+ static size_t get_element_size(const WorkspaceArgs<StratType, OutputStage> &) { return 0; }
+
+ template <class WorkspaceType, class StratType, class OutputStage>
+ static void *initialise(WorkspaceType *, void *buffer, const WorkspaceArgs<StratType, OutputStage> &)
+ {
+ return buffer;
+ }
+};
+
+
+/* Store fused activations for a kernel.
+ *
+ * Activations are set based on the DepthwiseArgs.
+ */
+template <typename T, class OutputStage=Nothing>
+class ActivationsElement
+{
+ public:
+ struct Workspace
+ {
+ T activation_min, activation_max;
+ };
+
+ template <typename StratType>
+ static size_t get_element_size(const WorkspaceArgs<StratType, OutputStage> &)
+ {
+ return 0;
+ }
+
+ template <class WorkspaceType, class StratType>
+ static void *initialise(WorkspaceType *ws, void *buffer, const WorkspaceArgs<StratType, OutputStage> &args)
+ {
+ ws->activation_min = static_cast<T>(-std::numeric_limits<float>::infinity());
+ ws->activation_max = static_cast<T>(std::numeric_limits<float>::infinity());
+
+ switch (args.depthwise_args.activation.type)
+ {
+ case arm_gemm::Activation::Type::BoundedReLU:
+ ws->activation_max = static_cast<T>(args.depthwise_args.activation.param1);
+ // Fall through
+ case arm_gemm::Activation::Type::ReLU:
+ ws->activation_min = static_cast<T>(0);
+ break;
+ default:
+ break;
+ }
+
+ return buffer;
+ }
+};
+
+/* Activation clamps are contained within `arm_gemm::Requantize32`, so if the
+ * output stage is one of these we substitute in an empty workspace element.
+ */
+template <typename T>
+class ActivationsElement<T, arm_gemm::Requantize32> : public EmptyElement
+{
+};
+
+
+/* Get the value with which to fill an input buffer. This defaults to `0`
+ * (which we return as a `char` since it gets used by `memset`).
+ */
+template <typename OutputStage>
+char get_input_buffer_fill_value(const OutputStage &)
+{
+ return 0;
+}
+
+/* In the case of kernels operating on quantized data, we need to fill the
+ * input buffer with the zero offset of the input tensor.
+ */
+template <> char get_input_buffer_fill_value(const arm_gemm::Requantize32 &qp) __attribute__ ((unused));
+template <> char get_input_buffer_fill_value(const arm_gemm::Requantize32 &qp)
+{
+ return qp.a_offset;
+}
+
+
+/* Container for a vector of padding values which can be safely consumed by the
+ * depthwise kernel. The padding values are initialised to either `0` or the
+ * zero offset of the input tensor (if quantized).
+ */
+template <typename T>
+class InputBufferElement
+{
+ public:
+ struct Workspace
+ {
+ T *input_buffer;
+ };
+
+ template <typename StratType, typename OutputStage>
+ static size_t get_element_size(const WorkspaceArgs<StratType, OutputStage> &args)
+ {
+ return sizeof(T) * args.depthwise_args.input_channels * args.depthwise_args.channel_multiplier;
+ }
+
+ template <class WorkspaceType, typename StratType, typename OutputStage>
+ static void *initialise(WorkspaceType *ws, void *buffer, const WorkspaceArgs<StratType, OutputStage> &args)
+ {
+ ws->input_buffer = reinterpret_cast<T*>(buffer);
+ memset(ws->input_buffer, get_input_buffer_fill_value(args.output_stage), get_element_size(args));
+ return reinterpret_cast<char *>(buffer) + get_element_size(args);
+ }
+};
+
+
+/* Container for an array of output pointers, and a buffer which can be used as
+ * a destination for unnecessary writes.
+ */
+template <typename T>
+class OutputArrayElement
+{
+ public:
+ struct Workspace
+ {
+ T **outptr_array;
+ T *output_buffer;
+ };
+
+ template <typename OutputStage>
+ static size_t get_element_size(const WorkspaceArgs<IDepthfirstStrategy, OutputStage> &args)
+ {
+ return sizeof_outptr_array(args) + sizeof_output_buffer(args);
+ }
+
+ template <class WorkspaceType, typename OutputStage>
+ static void *initialise(WorkspaceType *ws, void *buffer, const WorkspaceArgs<IDepthfirstStrategy, OutputStage> &args)
+ {
+ char *buffer_bytes = reinterpret_cast<char *>(buffer);
+
+ ws->outptr_array = reinterpret_cast<T **>(buffer_bytes);
+ buffer_bytes += sizeof_outptr_array(args);
+
+ ws->output_buffer = reinterpret_cast<T *>(buffer_bytes);
+ buffer_bytes += sizeof_output_buffer(args);
+
+ return buffer_bytes;
+ }
+
+ protected:
+ template <typename OutputStage>
+ static size_t sizeof_outptr_array(const WorkspaceArgs<IDepthfirstStrategy, OutputStage> &args)
+ {
+ return sizeof(T **) * args.strategy->get_output_rows() * args.strategy->get_output_cols();
+ }
+
+ template <typename OutputStage>
+ static size_t sizeof_output_buffer(const WorkspaceArgs<IDepthfirstStrategy, OutputStage> &args)
+ {
+ return sizeof(T) * args.depthwise_args.input_channels * args.depthwise_args.channel_multiplier;
+ }
+};
+
+
+/* Intermediate array to store results of premultiplication.
+ * Used as input to the kernel instead of the original input array.
+ */
+template <typename T>
+class IntermediateBufferElement
+{
+public:
+ struct Workspace
+ {
+ T *intermediate_buffer;
+ };
+
+ template <typename StratType, typename OutputStage>
+ static size_t get_element_size(const WorkspaceArgs<StratType, OutputStage> &args)
+ {
+ auto cols = args.depthwise_args.input_cols + args.depthwise_args.kernel_cols;
+ auto rows = args.strategy->get_input_rows() + args.depthwise_args.kernel_rows;
+ auto channels = args.depthwise_args.input_channels * args.depthwise_args.channel_multiplier;
+ return sizeof(T) * cols * rows * channels;
+ }
+
+ template <class WorkspaceType, typename StratType, typename OutputStage>
+ static void *initialise(WorkspaceType *ws, void *buffer, const WorkspaceArgs<StratType, OutputStage> &args)
+ {
+ ws->intermediate_buffer = reinterpret_cast<T*>(buffer);
+ return reinterpret_cast<char *>(buffer) + get_element_size(args);
+ }
+};
+
+
+/* Container for requantization parameters.
+ *
+ * This removes the distinction between per-layer and per-channel
+ * requantization parameters by always providing vectors of parameters: when
+ * per-layer requantization is selected, the vectors are filled with the
+ * per-layer values.
+ */
+class RequantizationParametersElement
+{
+ public:
+ struct Workspace
+ {
+ const int32_t *bias, *requant_muls, *requant_shifts;
+ };
+
+ template <typename StratType>
+ static size_t get_element_size(const WorkspaceArgs<StratType, arm_gemm::Requantize32> &args)
+ {
+ return sizeof_bias(args) + sizeof_requant_muls(args) + sizeof_requant_shifts(args);
+ }
+
+ template <typename WorkspaceType, typename StratType>
+ static void *initialise(WorkspaceType *ws, void *buffer, const WorkspaceArgs<StratType, arm_gemm::Requantize32> &args)
+ {
+ const auto n_output_channels = args.depthwise_args.input_channels * args.depthwise_args.channel_multiplier;
+ char *buffer_bytes = reinterpret_cast<char *>(buffer);
+
+ ws->bias = args.output_stage.bias;
+ ws->requant_muls = args.output_stage.per_channel_muls;
+ ws->requant_shifts = args.output_stage.per_channel_right_shifts;
+
+ if (ws->bias == nullptr)
+ {
+ ws->bias = reinterpret_cast<const int32_t *>(buffer_bytes);
+ memset(buffer_bytes, 0, sizeof_bias(args));
+ buffer_bytes += sizeof_bias(args);
+ }
+
+ if (ws->requant_muls == nullptr)
+ {
+ ws->requant_muls = reinterpret_cast<const int32_t *>(buffer_bytes);
+ auto muls = reinterpret_cast<int32_t *>(buffer_bytes);
+ buffer_bytes += sizeof_requant_muls(args);
+
+ for (auto n = 0u; n < n_output_channels; n++)
+ {
+ muls[n] = args.output_stage.per_layer_mul;
+ }
+ }
+
+ if (ws->requant_shifts == nullptr)
+ {
+      ws->requant_shifts = reinterpret_cast<const int32_t *>(buffer_bytes);
+ auto shifts = reinterpret_cast<int32_t *>(buffer_bytes);
+ buffer_bytes += sizeof_requant_shifts(args);
+
+ for (auto n = 0u; n < n_output_channels; n++)
+ {
+ shifts[n] = args.output_stage.per_layer_right_shift;
+ }
+ }
+
+ return buffer_bytes;
+ }
+
+ protected:
+ template <typename StratType>
+ static size_t sizeof_bias(const WorkspaceArgs<StratType, arm_gemm::Requantize32> &args)
+ {
+ return args.output_stage.bias != nullptr ?
+ 0 : sizeof(int32_t) * args.depthwise_args.channel_multiplier * args.depthwise_args.input_channels;
+ }
+
+ template <typename StratType>
+ static size_t sizeof_requant_muls(const WorkspaceArgs<StratType, arm_gemm::Requantize32> &args)
+ {
+ return args.output_stage.per_channel_muls != nullptr ?
+ 0 : sizeof(int32_t) * args.depthwise_args.channel_multiplier * args.depthwise_args.input_channels;
+ }
+
+ template <typename StratType>
+ static size_t sizeof_requant_shifts(const WorkspaceArgs<StratType, arm_gemm::Requantize32> &args)
+ {
+ return args.output_stage.per_channel_right_shifts != nullptr ?
+ 0 : sizeof(int32_t) * args.depthwise_args.channel_multiplier * args.depthwise_args.input_channels;
+ }
+};
+
+
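+/* Compile-time composition of the elements above into a single workspace
+ * type.
+ *
+ * `WorkspaceType` inherits from each element's `Workspace` struct so that all
+ * members are accessible by name from a single object placed at the start of
+ * the buffer. Sizing and initialisation recurse over the parameter pack: each
+ * element reports its buffer requirement, consumes its share of the buffer
+ * during initialisation, and returns the pointer to the next free byte for
+ * the remaining elements.
+ */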
+template <typename ...Elements>
+class Workspace;
+
+template <typename Element, typename ...Elements>
+class Workspace<Element, Elements...>
+{
+ public:
+ struct WorkspaceType : Element::Workspace, Workspace<Elements...>::WorkspaceType
+ {
+ };
+
+ template <class S, class T>
+ static void initialise(void *buffer, const WorkspaceArgs<S, T> &args)
+ {
+    // Place the workspace struct at the start of the (already allocated)
+    // buffer, then initialise each of the elements in turn using the space
+    // which follows it.
+ auto ws = reinterpret_cast<WorkspaceType *>(buffer);
+ initialise_elements(ws, ws + 1, args);
+ }
+
+ template <class S, class T=Nothing>
+ static size_t get_sizeof_workspace(const WorkspaceArgs<S, T> &args)
+ {
+ return sizeof(WorkspaceType) + get_element_sizes(args);
+ }
+
+ template <class S, class T>
+ static inline size_t get_element_sizes(const WorkspaceArgs<S, T> &args)
+ {
+ return Element::get_element_size(args) + Workspace<Elements...>::get_element_sizes(args);
+ }
+
+ template <class WorkspaceType, class S, class T>
+ static void initialise_elements(WorkspaceType *ws, void *buffer, const WorkspaceArgs<S, T> &args)
+ {
+ buffer = Element::initialise(ws, buffer, args); // Get the next buffer
+ Workspace<Elements...>::initialise_elements(ws, buffer, args);
+ }
+};
+
+template <>
+class Workspace<>
+{
+ public:
+ struct WorkspaceType
+ {
+ };
+
+ template <class S, class T>
+ static inline size_t get_element_sizes(const WorkspaceArgs<S, T> &)
+ {
+ return 0;
+ }
+
+ template <class WorkspaceType, class S, class T>
+ static void initialise_elements(WorkspaceType *, void *, const WorkspaceArgs<S, T> &)
+ {
+ }
+};
+
+} // namespace {anonymous}
+} // namespace depthwise
+} // namespace arm_conv