Depthwise channel pre-multiplication

Resolves: COMPMID-6337 Change-Id: Ie9097b3f56e8071426c621386a5988bd7f7e8ef2 Signed-off-by: Michael Tyler <michael.tyler@arm.com> Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/9852 Tested-by: Arm Jenkins <bsgcomp@arm.com> Reviewed-by: Viet-Hoa Do <viet-hoa.do@arm.com> Benchmark: Arm Jenkins <bsgcomp@arm.com>
author: Michael Tyler <michael.tyler@arm.com> 2023-06-30 11:26:05 +0100
committer: michael.tyler <michael.tyler@arm.com> 2023-07-04 14:34:58 +0000
commit: 8deee9bd9b9137c256c23b86be11dbf0466f3aa8 (patch)
tree: ac80b3bdd992552b65e306b77f061484da0591ca /src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst.hpp
parent: 19844f605f5e5b71d05164711dee13f8652adafe (diff)
download: ComputeLibrary-8deee9bd9b9137c256c23b86be11dbf0466f3aa8.tar.gz
1 files changed, 152 insertions, 44 deletions
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst.hpp
index 2620b48e17..7b00c9a7af 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst.hpp
@@ -115,7 +115,7 @@ class DepthwiseDepthfirstStrategy<TInput, TWeight, TOutput, int32_t>
   {
     return interleaves::PackingArguments(
       this->get_kernel_rows(), this->get_kernel_cols(), sizeof(TWeight),
-      false, sizeof(int32_t),  // Don't pack the bias
+      false, sizeof(int32_t), this->uses_premultiply(),  // Don't pack the bias
       this->get_vl_type(), sizeof(int32_t), this->get_accumulator_depth_vl(),
       [this] (unsigned int idx, unsigned int &x, unsigned int &y) -> bool
       { return this->get_kernel_packing_point(idx, x, y); }
@@ -162,6 +162,64 @@ class DepthwiseDepthfirstCommon : public DepthfirstDriver<TInput, TWeight, TOutp
   inline OutputStage &get_output_stage(void) { return m_os; }
   inline const OutputStage &get_output_stage(void) const { return m_os; }
 
+  bool uses_intermediate_array() const
+  {
+    return this->m_args.channel_multiplier != 1 && this->uses_premultiply();
+  }
+
+  virtual void fill_inptr_array(const DepthwiseArgs &args,
+    const TensorSpec<const TInput *> &input,
+    const TInput **inptr_array, TInput *input_buffer,
+    const unsigned int input_i, const unsigned int input_j,
+    const unsigned int input_pad_top, const unsigned int input_pad_left) const = 0;
+
+  void initialise_inptr_array(const DepthwiseArgs &args,
+      unsigned int output_channel_start, unsigned int output_channel_end,
+      const TensorSpec<const TInput *> &input,
+      const TInput **inptr_array, TInput *input_buffer, TInput *intermediate_buffer,
+      const unsigned int input_i, const unsigned int input_j,
+      const unsigned int input_pad_top, const unsigned int input_pad_left,
+      Tile<TInput> &multiplied_input
+  ) const
+  {
+    // Compute the input pointer array
+    const auto input_channel_start = output_channel_start / args.channel_multiplier;
+
+    const auto last_valid_row = std::min(input_pad_top + args.input_rows - input_i, this->m_strat->get_input_rows());
+    const auto last_valid_col = std::min(input_pad_left + args.input_cols - input_j, this->m_strat->get_input_cols());
+
+    const auto tile_rows = last_valid_row - input_pad_top;
+    const auto tile_cols = last_valid_col - input_pad_left;
+
+    const auto tile_channels = output_channel_end - output_channel_start;
+
+    TensorSpec<const TInput *> tile_tensor(0, 0, 0);
+    if (this->uses_intermediate_array()) {
+      multiplied_input = Tile<TInput>(intermediate_buffer, tile_rows, tile_cols, tile_channels);
+      multiplied_input.load_from(input.base, input.ld_row, input.ld_col,
+                                 args.input_rows, args.input_cols,
+                                 input_i, input_j, args.channel_multiplier);
+
+      tile_tensor = TensorSpec<const TInput *>(
+        multiplied_input.array,
+        tile_cols * tile_channels, tile_channels
+      );
+    } else {
+      tile_tensor = TensorSpec<const TInput *>(
+        input.base + input_i*input.ld_row + input_j*input.ld_col + input_channel_start,
+        input.ld_row, input.ld_col
+      );
+    }
+
+    fill_inptr_array(args,
+      tile_tensor,
+      inptr_array, input_buffer,
+      input_i, input_j,
+      input_pad_top,
+      input_pad_left
+    );
+  }
+
   public:
   DepthwiseDepthfirstCommon(StratType *const strat, const DepthwiseArgs &args, const OutputStage &os)
   : DepthfirstDriver<TInput, TWeight, TOutput>(strat, args), m_os(os)
@@ -321,6 +379,7 @@ class DepthwiseDepthfirst
     OutputArrayElement<TOutput>,
     depthwise_depthfirst::InputArrayElement<TInput>,
     InputBufferElement<TInput>,
+    IntermediateBufferElement<TInput>,
     typename depthwise_depthfirst::WorkspaceFinalElement<TAccum, OutputStage>::Element
   >;
   using WorkingSpace = typename WorkspaceManager::WorkspaceType;
@@ -347,25 +406,46 @@ class DepthwiseDepthfirst
     depthwise_depthfirst::stash_bias(this->get_output_stage(), biases);
   }
 
-  size_t get_working_size_per_thread(const unsigned int n_input_channels) const override
+  size_t get_working_size_per_thread() const override
   {
     DepthwiseArgs args(this->m_args);
-    args.input_channels = n_input_channels;
     return WorkspaceManager::get_sizeof_workspace(
       WorkspaceArgs<IDepthfirstStrategy, OutputStage>(this->m_strat.get(), args, this->get_output_stage())
     );
   }
 
-  void initialise_working_space(void *buffer, unsigned int n_input_channels) const override
+  void initialise_working_space(void *buffer) const override
   {
     DepthwiseArgs args(this->m_args);
-    args.input_channels = n_input_channels;
     WorkspaceManager::initialise(
       buffer, WorkspaceArgs<IDepthfirstStrategy, OutputStage>(this->m_strat.get(), args, this->get_output_stage())
     );
   }
 
+  virtual bool supports_direct_padding() const override
+  {
+    using Invoker = depthwise_depthfirst::Invoke<TInput, TWeight, TOutput, TAccum, OutputStage>;
+    return Invoker::supports_direct_kernel && this->uses_intermediate_array();
+  }
+
   protected:
+
+  void fill_inptr_array(const DepthwiseArgs &args,
+    const TensorSpec<const TInput *> &input,
+    const TInput **inptr_array, TInput *input_buffer,
+    const unsigned int input_i, const unsigned int input_j,
+    const unsigned int input_pad_top, const unsigned int input_pad_left) const override
+  {
+    fill_pointer_array<const TInput>(
+      inptr_array, this->m_strat->get_input_rows(), this->m_strat->get_input_cols(),
+      input.base,
+      input.ld_row, input.ld_col,
+      input_buffer,
+      input_pad_top, args.input_rows - input_i,
+      input_pad_left, args.input_cols - input_j
+    );
+  }
+
   void compute_tile_padded(
     const DepthwiseArgs &args,
     unsigned int output_i, unsigned int output_j,
@@ -380,8 +460,6 @@ class DepthwiseDepthfirst
     auto ws = reinterpret_cast<WorkingSpace *>(working_space_raw);
 
     // Compute the input pointer array
-    const auto input_channel_start = output_channel_start / args.channel_multiplier;
-
     const int ii = static_cast<int>(output_i * args.stride_rows) - args.padding.top;
     const auto input_pad_top = static_cast<unsigned int>(ii < 0 ? -ii : 0);
     const auto input_i = static_cast<unsigned int>(ii < 0 ? 0 : ii);
@@ -390,14 +468,10 @@ class DepthwiseDepthfirst
     const auto input_pad_left = static_cast<unsigned int>(ij < 0 ? -ij : 0);
     const auto input_j = static_cast<unsigned int>(ij < 0 ? 0 : ij);
 
-    fill_pointer_array<const TInput>(
-      ws->inptr_array, this->m_strat->get_input_rows(), this->m_strat->get_input_cols(),
-      input.base + input_i*input.ld_row + input_j*input.ld_col + input_channel_start,
-      input.ld_row, input.ld_col,
-      ws->input_buffer,
-      input_pad_top, args.input_rows - input_i,
-      input_pad_left, args.input_cols - input_j
-    );
+    Tile<TInput> multiplied_input;
+    this->initialise_inptr_array(args, output_channel_start, output_channel_end, input,
+      ws->inptr_array, ws->input_buffer, ws->intermediate_buffer,
+      input_i, input_j, input_pad_top, input_pad_left, multiplied_input);
 
     // Compute the output pointer array
     fill_pointer_array(
@@ -432,12 +506,11 @@ class DepthwiseDepthfirst
     const auto os = this->get_output_stage();
 
     // Compute top and bottom padding; hence fill in the initial pointer arrays.
-    const auto input_channel_start = output_channel_start / args.channel_multiplier;
     const int ii = static_cast<int>(output_i * args.stride_rows) - args.padding.top;
     const auto input_pad_top = static_cast<unsigned int>(ii < 0 ? -ii : 0);
 
     const auto input_i = static_cast<unsigned int>(ii < 0 ? 0 : ii);
-    const auto input_j = output_j * args.stride_cols - args.padding.left;
+    auto input_j = output_j * args.stride_cols - args.padding.left;
 
     // Valid input rows is the smallest of the input rows that aren't padding for this tile, and the number of rows
     // available.
@@ -447,14 +520,10 @@ class DepthwiseDepthfirst
     const auto input_point_stride = input.ld_col * this->m_strat->get_output_cols() * args.stride_cols;
     const auto output_point_stride = output.ld_col * this->m_strat->get_output_cols();
 
-    fill_pointer_array<const TInput>(
-      ws->inptr_array, this->m_strat->get_input_rows(), this->m_strat->get_input_cols(),
-      input.base + input_i*input.ld_row + input_j*input.ld_col + input_channel_start,
-      input.ld_row, input.ld_col,
-      ws->input_buffer,
-      input_pad_top, args.input_rows - input_i,
-      0, args.input_cols - input_j  // No left padding
-    );
+    Tile<TInput> multiplied_input;
+    this->initialise_inptr_array(args, output_channel_start, output_channel_end, input,
+      ws->inptr_array, ws->input_buffer, ws->intermediate_buffer,
+      input_i, input_j, input_pad_top, 0, multiplied_input);
 
     fill_pointer_array(
       ws->outptr_array, this->m_strat->get_output_rows(), this->m_strat->get_output_cols(),
@@ -473,16 +542,25 @@ class DepthwiseDepthfirst
       );
 
       // Update all unpadded pointers
-      {
-        auto ptr = ws->inptr_array + strat->get_input_cols() * input_pad_top;
-        for (auto n = input_pad_top; n < (valid_input_rows + input_pad_top); n++)
+      if (this->uses_intermediate_array()) {
+        input_j += input_point_stride / input.ld_col;
+        multiplied_input.load_from(input.base,
+          input.ld_row, input.ld_col,
+          args.input_rows, args.input_cols,
+          input_i, input_j, args.channel_multiplier);
+      } else {
         {
-          for (auto m = 0u; m < strat->get_input_cols(); m++)
+          auto ptr = ws->inptr_array + strat->get_input_cols() * input_pad_top;
+          for (auto n = input_pad_top; n < (valid_input_rows + input_pad_top); n++)
           {
-            *(ptr++) += input_point_stride;
+            for (auto m = 0u; m < strat->get_input_cols(); m++)
+            {
+              *(ptr++) += input_point_stride;
+            }
           }
         }
       }
+
       {
         auto ptr = ws->outptr_array;
         for (auto n = 0u; n < valid_output_rows * strat->get_output_cols(); n++)
@@ -511,6 +589,13 @@ class DepthwiseDepthfirst
 
     if (Invoker::supports_direct_kernel)
     {
+      PaddingValues tile_padding = {
+              args.kernel_cols / 2,
+              args.kernel_rows / 2,
+              args.kernel_cols / 2,
+              args.kernel_rows / 2
+      };
+
       // If the direct kernel is supported, then use it.
       // Compute the base pointers we'll use in the tile.
       auto outptr = output.base + output_channel_start + output_i * output.ld_row + output_j * output.ld_col;
@@ -518,11 +603,31 @@ class DepthwiseDepthfirst
       const int start_input_j = output_j * args.stride_cols - args.padding.left;
       auto inptr = input.base + output_channel_start + start_input_i * input.ld_row + start_input_j * input.ld_col;
 
+      auto ld_row = input.ld_row;
+      auto ld_col = input.ld_col;
+
+      const auto tile_rows = this->m_strat->get_output_rows() * args.stride_rows * n_tile_rows + tile_padding.top + tile_padding.bottom;
+      const auto tile_cols = this->m_strat->get_output_cols() * args.stride_cols * n_tile_cols + tile_padding.left + tile_padding.right;
+      const auto tile_channels = output_channel_end - output_channel_start;
+
+      Tile<TInput> multiplied_input;
+      if (this->uses_intermediate_array()) {
+        multiplied_input = Tile<TInput>(ws->intermediate_buffer, tile_rows, tile_cols, tile_channels);
+        multiplied_input.load_from(input.base,
+          input.ld_row, input.ld_col,
+          args.input_rows, args.input_cols,
+          start_input_i, start_input_j, args.channel_multiplier);
+
+        ld_row = tile_cols * tile_channels;
+        ld_col = tile_channels;
+        inptr = multiplied_input.array;
+      }
+
       // Execute the kernel
       Invoker::direct(
         strat, ws, os,
         n_tile_rows, n_tile_cols,
-        inptr, input.ld_row, input.ld_col,
+        inptr, ld_row, ld_col,
         outptr, output.ld_row, output.ld_col,
         parameters, output_channel_end - output_channel_start
       );
@@ -531,7 +636,6 @@ class DepthwiseDepthfirst
     {
       // Otherwise, we repeatedly call the padded kernel but use our knowledge
       // of the tensor structure to avoid recomputing the pointer array.
-      const auto input_channel_start = output_channel_start / args.channel_multiplier;
 
       const auto n_input_pointers = this->m_strat->get_input_rows() * this->m_strat->get_input_cols();
       const auto input_point_stride = input.ld_col * this->m_strat->get_output_cols() * args.stride_cols;
@@ -543,16 +647,12 @@ class DepthwiseDepthfirst
       for (unsigned int tile_i = 0; tile_i < n_tile_rows; tile_i++)
       {
         const int input_i = static_cast<int>(output_i * args.stride_rows) - args.padding.top;
-        const int input_j = static_cast<int>(output_j * args.stride_cols) - args.padding.left;
+        int input_j = static_cast<int>(output_j * args.stride_cols) - args.padding.left;
 
-        fill_pointer_array<const TInput>(
-          ws->inptr_array, this->m_strat->get_input_rows(), this->m_strat->get_input_cols(),
-          input.base + input_i*input.ld_row + input_j*input.ld_col + input_channel_start,
-          input.ld_row, input.ld_col,
-          ws->input_buffer,
-          0, args.input_rows,
-          0, args.input_cols
-        );
+        Tile<TInput> multiplied_input;
+        this->initialise_inptr_array(args, output_channel_start, output_channel_end, input,
+          ws->inptr_array, ws->input_buffer, ws->intermediate_buffer,
+          input_i, input_j, 0, 0, multiplied_input);
 
         // Compute the output pointer array
         fill_pointer_array(
@@ -572,10 +672,18 @@ class DepthwiseDepthfirst
           );
 
           // Progress the pointers
-          for (auto i = 0u; i < n_input_pointers; i++)
-          {
-            ws->inptr_array[i] += input_point_stride;
+          if (this->uses_intermediate_array()) {
+            input_j += input_point_stride / input.ld_col;
+            multiplied_input.load_from(input.base,
+              input.ld_row, input.ld_col,
+              args.input_rows, args.input_cols, input_i, input_j, args.channel_multiplier);
+          } else {
+            for (auto i = 0u; i < n_input_pointers; i++)
+            {
+              ws->inptr_array[i] += input_point_stride;
+            }
           }
+
           for (auto i = 0u; i < n_output_pointers; i++)
           {
             ws->outptr_array[i] += output_point_stride;
author	Michael Tyler <michael.tyler@arm.com>	2023-06-30 11:26:05 +0100
committer	michael.tyler <michael.tyler@arm.com>	2023-07-04 14:34:58 +0000
commit	8deee9bd9b9137c256c23b86be11dbf0466f3aa8 (patch)
tree	ac80b3bdd992552b65e306b77f061484da0591ca /src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst.hpp
parent	19844f605f5e5b71d05164711dee13f8652adafe (diff)
download	ComputeLibrary-8deee9bd9b9137c256c23b86be11dbf0466f3aa8.tar.gz