Update Neon™ pooling kernel

- Reduce duplication and simplify overall structure.
- Improve multi-threaded performance by sharing more data in lower-level caches.

Partially Resolves: COMPMID-5054

Signed-off-by: Ramy Elgammal <ramy.elgammal@arm.com>
Change-Id: I5f4dc50913401d5c1cbfc10b866fae9490cbc4d7
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/7404
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Andrew Mundy
Reviewed-by: Sheri Zhang <sheri.zhang@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
author: ramelg01 <ramy.elgammal@arm.com> 2022-04-08 03:52:28 +0100
committer: Ramy Elgammal <ramy.elgammal@arm.com> 2022-04-25 15:35:59 +0000
commit: c827e99fc46521f43719b0c2d1b6f05d66abf68c (patch)
tree: 31df1002673b2a4c4aae66608ad85b1ad6517050 /src/core/NEON/kernels/assembly
parent: 0a3948394e7e77344201b8732e9c20fcb5fa9a38 (diff)
download: ComputeLibrary-c827e99fc46521f43719b0c2d1b6f05d66abf68c.tar.gz
2 files changed, 130 insertions, 18 deletions
diff --git a/src/core/NEON/kernels/assembly/pool_common.hpp b/src/core/NEON/kernels/assembly/pool_common.hpp
index b6a0a0abed..599e18ac59 100644
--- a/src/core/NEON/kernels/assembly/pool_common.hpp
+++ b/src/core/NEON/kernels/assembly/pool_common.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -23,8 +23,9 @@
  */
 
 #pragma once
-
-#include "common.hpp"
+#ifdef CYCLE_PROFILING
+#include "profiler.hpp"
+#endif
 
 namespace arm_conv
 {
@@ -53,6 +54,11 @@ struct PoolingStride
     unsigned int rows, cols;
 };
 
+struct PaddingValues
+{
+    unsigned int left, top, right, bottom;
+};
+
 class IPoolingCommon
 {
 public:
@@ -60,6 +66,7 @@ public:
 
     // Determine the amount of working space required.
     virtual size_t get_working_size(unsigned int num_threads) const = 0;
+    virtual size_t get_working_size(unsigned int num_threads, unsigned int n_channels) const = 0;
 
     // Execute pooling over the specified area of memory.
     virtual void execute(
@@ -103,14 +110,5 @@ public:
         unsigned int num_threads) const = 0;
 };
 
-struct Nothing
-{
-};
-
-template <typename TInput, typename TOutput, class OutputStage = Nothing>
-class PoolingCommon : public IPoolingCommon
-{
-};
-
 } // namespace pooling
 } // namespace arm_conv
diff --git a/src/core/NEON/kernels/assembly/pooling.hpp b/src/core/NEON/kernels/assembly/pooling.hpp
index 2325bd08ca..1b47853eaf 100644
--- a/src/core/NEON/kernels/assembly/pooling.hpp
+++ b/src/core/NEON/kernels/assembly/pooling.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -27,8 +27,6 @@
 #include "arm_gemm_local.hpp"
 #include "pool_common.hpp"
 
-#include <memory>
-
 namespace arm_conv
 {
 namespace pooling
@@ -89,6 +87,10 @@ struct PoolingArgs
     }
 };
 
+struct Nothing
+{
+};
+
 struct Requantize32
 {
     int32_t input_offset  = 0;
@@ -106,12 +108,124 @@ struct Requantize32
     }
 };
 
-template <typename TInput, typename TOutput, class OutputStage = Nothing>
-using UniquePoolingCommon = std::unique_ptr<PoolingCommon<TInput, TOutput, OutputStage>>;
+template <typename TInput, typename TOutput>
+class PoolingCommon : public IPoolingCommon
+{
+protected:
+    const PoolingArgs m_args;
+
+public:
+    PoolingCommon(const PoolingArgs &args)
+        : m_args(args)
+    {
+    }
+    PoolingCommon(PoolingCommon &) = delete;
+    PoolingCommon &operator=(PoolingCommon &) = delete;
+
+    size_t get_working_size(unsigned int, unsigned int) const override = 0;
+    size_t get_working_size(unsigned int n_threads) const override
+    {
+        return this->get_working_size(n_threads, m_args.n_channels);
+    }
+
+    // Execute pooling over the specified area of memory.
+    void execute(
+        const void *const input,
+        void *const       output,
+        void             *working_space,
+        unsigned int      thread_id,
+        unsigned int      num_threads) const override
+    {
+        this->execute(
+            input,
+            m_args.n_channels,
+            m_args.n_channels * m_args.input_cols,
+            m_args.n_channels * m_args.input_cols * m_args.input_rows,
+            output,
+            m_args.n_channels,
+            m_args.n_channels * m_args.output_cols,
+            m_args.n_channels * m_args.output_cols * m_args.output_rows,
+            working_space,
+            thread_id, num_threads);
+    }
+
+    void execute(
+        const void *const input,
+        size_t            ld_input_col,
+        size_t            ld_input_row,
+        size_t            ld_input_batch,
+        void *const       output,
+        size_t            ld_output_col,
+        size_t            ld_output_row,
+        size_t            ld_output_batch,
+        void             *working_space,
+        unsigned int      thread_id,
+        unsigned int      num_threads) const override
+    {
+        this->execute(
+            m_args.n_batches, m_args.input_rows, m_args.input_cols, m_args.n_channels,
+            input, ld_input_col, ld_input_row, ld_input_batch,
+            m_args.padding, m_args.output_rows, m_args.output_cols,
+            output, ld_output_col, ld_output_row, ld_output_batch,
+            working_space, thread_id, num_threads);
+    }
+
+    void execute(
+        unsigned int         batches,
+        unsigned int         height,
+        unsigned int         width,
+        unsigned int         channels,
+        const void *const    input,
+        size_t               ld_input_col,
+        size_t               ld_input_row,
+        size_t               ld_input_batch,
+        const PaddingValues &padding,
+        unsigned int         output_height,
+        unsigned int         output_width,
+        void *const          output,
+        size_t               ld_output_col,
+        size_t               ld_output_row,
+        size_t               ld_output_batch,
+        void                *working_space,
+        unsigned int         thread_id,
+        unsigned int         num_threads) const override
+    {
+        this->execute_internal(
+            batches, height, width, channels, padding,
+            input, ld_input_col, ld_input_row, ld_input_batch,
+            output_height, output_width,
+            output, ld_output_col, ld_output_row, ld_output_batch,
+            working_space, thread_id, num_threads);
+    }
+
+protected:
+    virtual void execute_internal(
+        unsigned int batches,
+        unsigned int height,
+        unsigned int width,
+        unsigned int channels,
+        const PaddingValues &,
+        const void *const input,
+        size_t            ld_input_col,
+        size_t            ld_input_row,
+        size_t            ld_input_batch,
+        unsigned int      output_height,
+        unsigned int      output_width,
+        void *const       output,
+        size_t            ld_output_col,
+        size_t            ld_output_row,
+        size_t            ld_output_batch,
+        void             *working_space,
+        unsigned int      thread_id,
+        unsigned int      num_threads) const = 0;
+};
+
+template <typename TInput, typename TOutput>
+using UniquePoolingCommon = std::unique_ptr<PoolingCommon<TInput, TOutput>>;
 
 // Get a pooling engine
 template <typename TInput, typename TOutput = TInput, class OutputStage = Nothing>
-UniquePoolingCommon<TInput, TOutput, OutputStage> pooling(const PoolingArgs &, const OutputStage & = {});
+UniquePoolingCommon<TInput, TOutput> pooling(const PoolingArgs &, const OutputStage & = {});
 
 } // namespace pooling
 } // namespace arm_conv
author	ramelg01 <ramy.elgammal@arm.com>	2022-04-08 03:52:28 +0100
committer	Ramy Elgammal <ramy.elgammal@arm.com>	2022-04-25 15:35:59 +0000
commit	c827e99fc46521f43719b0c2d1b6f05d66abf68c (patch)
tree	31df1002673b2a4c4aae66608ad85b1ad6517050 /src/core/NEON/kernels/assembly
parent	0a3948394e7e77344201b8732e9c20fcb5fa9a38 (diff)
download	ComputeLibrary-c827e99fc46521f43719b0c2d1b6f05d66abf68c.tar.gz