From 72219330fd85b1271e714d4ba894d6d8e26340c9 Mon Sep 17 00:00:00 2001
From: Georgios Pinitas
Date: Tue, 5 Jun 2018 14:56:06 +0100
Subject: COMPMID-1145: (API) Introduce prepare() stage (NEON/CL/GLES)

Change-Id: I5b46764f9c3154ec3e3b9c951cc9e6dfbcb81dfb
Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/134255
Reviewed-by: Anthony Barbier
Tested-by: Jenkins
Reviewed-by: Pablo Tello
Reviewed-by: Michele DiGiorgio
---
 arm_compute/runtime/NEON/AssemblyHelper.h | 52 +++++++++++++++-------
 .../runtime/NEON/functions/NEConvolutionLayer.h | 1 +
 .../runtime/NEON/functions/NEDeconvolutionLayer.h | 2 +
 .../NEON/functions/NEDepthwiseConvolutionLayer.h | 3 +-
 .../NEDepthwiseSeparableConvolutionLayer.h | 3 +-
 .../runtime/NEON/functions/NEFullyConnectedLayer.h | 7 +--
 arm_compute/runtime/NEON/functions/NEGEMM.h | 15 +++++--
 .../NEON/functions/NEGEMMConvolutionLayer.h | 2 +
 .../NEON/functions/NEGEMMLowpMatrixMultiplyCore.h | 12 ++++-
 .../NEON/functions/NELocallyConnectedLayer.h | 3 +-
 .../NEON/functions/NEWinogradConvolutionLayer.h | 3 +-
 11 files changed, 77 insertions(+), 26 deletions(-)

(limited to 'arm_compute/runtime/NEON')

diff --git a/arm_compute/runtime/NEON/AssemblyHelper.h b/arm_compute/runtime/NEON/AssemblyHelper.h
index 3aa43ec96e..c4ba1a584e 100644
--- a/arm_compute/runtime/NEON/AssemblyHelper.h
+++ b/arm_compute/runtime/NEON/AssemblyHelper.h
@@ -51,7 +51,7 @@ public:
     using TypeResult = TypeOutput;
     /** Default constructor. */
     AssemblyKernelGlue()
-        : _gemm_kernel_asm(nullptr), _optimised_kernel(nullptr), _a(nullptr), _b(nullptr), _d(nullptr), _workspace(nullptr), _pretranspose(nullptr)
+        : _gemm_kernel_asm(nullptr), _optimised_kernel(nullptr), _a(nullptr), _b(nullptr), _d(nullptr), _workspace(nullptr), _pretranspose(nullptr), _is_prepared(false)
     {
     }
     /** Assembly Gemm */
@@ -76,6 +76,31 @@ public:
     ITensor *_workspace;
     /** Pre-transpose tensor */
     ITensor *_pretranspose;
+    /** Prepared flag */
+    bool _is_prepared;
+
+    /** Runs a preparation step, usually for pre-transposing matrix b */
+    void prepare()
+    {
+        // Pretranspose B if required
+        if(_gemm_kernel_asm->B_pretranspose_required())
+        {
+            const int ldb = _b->info()->strides_in_bytes().y() / sizeof(TypeInput);
+            const auto in1_ptr = reinterpret_cast<const TypeInput *>(_b->buffer());
+            const int multi_stride_b = _b->info()->strides_in_bytes().z() / sizeof(TypeInput);
+
+            // Forcing 128-byte alignment (required by 32-bit kernels)
+            const unsigned int alignment = 128;
+            void *raw_ptr = reinterpret_cast<void *>(_pretranspose->buffer());
+            size_t space = _pretranspose->info()->total_size();
+            void *aligned_ptr = support::cpp11::align(alignment, _gemm_kernel_asm->get_B_pretransposed_array_size(), raw_ptr, space);
+            ARM_COMPUTE_ERROR_ON(_pretranspose == nullptr || _pretranspose->buffer() == nullptr);
+            _gemm_kernel_asm->pretranspose_B_array(aligned_ptr, in1_ptr, ldb, multi_stride_b);
+            _b->mark_as_unused();
+        }
+
+        _is_prepared = true;
+    }

     /** Configures the arrays pointers and strides in the assembly kernel and executes the assembly kernel.
      *  The call to set_arrays is needed to deal with the input sizes containing batches (dims > 2)
@@ -102,28 +127,25 @@ public:
         const auto in1_ptr = reinterpret_cast<const TypeInput *>(_b->buffer());
         auto out_ptr = reinterpret_cast<TypeOutput *>(_d->buffer());

-        // Set workspace if needed
+        // Set workspace if needed and reset number of threads as buffer manager gets re-created with max_threads
         if(_workspace != nullptr)
         {
             _gemm_kernel_asm->set_working_space(reinterpret_cast<void *>(_workspace->buffer()));
+            const unsigned int window_size = _gemm_kernel_asm->get_window_size();
+            unsigned int num_threads = NEScheduler::get().num_threads();
+            if(window_size < num_threads)
+            {
+                num_threads = window_size;
+                _gemm_kernel_asm->set_nthreads(num_threads);
+            }
         }

+        // Prepare assembly kernel
+        prepare();
+
         // Set gemm parameters
         _gemm_kernel_asm->set_arrays(in0_ptr, lda, batch_stride_a, multi_stride_a, in1_ptr, ldb, multi_stride_b, out_ptr, ldd, batch_stride_d, multi_stride_d);

-        // Pretranspose B if required
-        if(_gemm_kernel_asm->B_pretranspose_required())
-        {
-            // Forcing 128-byte alignment (required by 32-bit kernels)
-            const unsigned int alignment = 128;
-            void *raw_ptr = reinterpret_cast<void *>(_pretranspose->buffer());
-            size_t space = _pretranspose->info()->total_size();
-            void *aligned_ptr = support::cpp11::align(alignment, _gemm_kernel_asm->get_B_pretransposed_array_size(), raw_ptr, space);
-            ARM_COMPUTE_ERROR_ON(_pretranspose == nullptr || _pretranspose->buffer() == nullptr);
-            _gemm_kernel_asm->pretranspose_B_array(aligned_ptr, in1_ptr, ldb, multi_stride_b);
-            _b->mark_as_unused();
-        }
-
         // Schedule assembly kernel
         NEScheduler::get().schedule(_optimised_kernel.get(), Window::DimX);
     }
diff --git a/arm_compute/runtime/NEON/functions/NEConvolutionLayer.h b/arm_compute/runtime/NEON/functions/NEConvolutionLayer.h
index ff41f0c985..e143814a4e 100644
--- a/arm_compute/runtime/NEON/functions/NEConvolutionLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEConvolutionLayer.h
@@ -112,6 +112,7 @@ public:
                            const WeightsInfo &weights_info = WeightsInfo(), const Size2D &dilation = Size2D(1U, 1U), const ActivationLayerInfo &act_info = ActivationLayerInfo(), bool enable_fast_math = false);
     // Inherited methods overridden:
     void run() override;
+    void prepare() override;

 private:
     std::shared_ptr<IMemoryManager> _memory_manager;
diff --git a/arm_compute/runtime/NEON/functions/NEDeconvolutionLayer.h b/arm_compute/runtime/NEON/functions/NEDeconvolutionLayer.h
index 66c6d427ba..3e527168c1 100644
--- a/arm_compute/runtime/NEON/functions/NEDeconvolutionLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEDeconvolutionLayer.h
@@ -108,6 +108,7 @@ public:

     // Inherited methods overridden:
     void run() override;
+    void prepare() override;

 private:
     MemoryGroup _memory_group;
@@ -117,6 +118,7 @@ private:
     ITensor *_input;
     PadStrideInfo _info;
     std::pair<unsigned int, unsigned int> _inner_border;
+    bool _is_prepared;
 };
 } // arm_compute
 #endif /* __ARM_COMPUTE_NEDECONVOLUTIONLAYER_H__ */
diff --git a/arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h b/arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h
index b80fb7f2c8..aa4cace7c2 100644
--- a/arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h
@@ -122,6 +122,7 @@ public:

     // Inherited methods overriden:
     void run() override;
+    void prepare() override;

 private:
     NEDepthwiseIm2ColKernel _im2col_kernel;
@@ -135,7 +136,7 @@ private:
     Tensor _weights_reshaped;
     Tensor _v2mm_output;
     Tensor _output_reshaped;
-    bool _is_first_run;
+    bool _is_prepared;
     bool _is_quantized;
     const ITensor *_original_weights;
 };
diff --git a/arm_compute/runtime/NEON/functions/NEDepthwiseSeparableConvolutionLayer.h b/arm_compute/runtime/NEON/functions/NEDepthwiseSeparableConvolutionLayer.h
index 0562c07515..99e93ccece 100644
--- a/arm_compute/runtime/NEON/functions/NEDepthwiseSeparableConvolutionLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEDepthwiseSeparableConvolutionLayer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -70,6 +70,7 @@ public:

     // Inherited methods overriden:
     void run() override;
+    void prepare() override;

 private:
     NEDepthwiseConvolutionLayer _depthwise_conv;
diff --git a/arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h b/arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h
index 071eecc3f7..2739f5ebef 100644
--- a/arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h
@@ -127,22 +127,23 @@ public:

     //Inherited methods override
     void run() override;
+    void prepare() override;

 private:
     MemoryGroup _memory_group;
     NEIm2ColKernel _im2col_kernel;
-    NEFullyConnectedLayerReshapeWeights _reshape_weights_kernel;
+    NEFullyConnectedLayerReshapeWeights _reshape_weights_function;
     NEGEMMInterleave4x4Kernel _interleave4x4_kernel;
     NEGEMMMatrixMultiplyKernel _mm_kernel;
     NEGEMMMatrixAccumulateBiasesKernel _accumulate_biases_kernel;
     Tensor _im2col_output;
     Tensor _interleave4x4_output;
     Tensor _reshape_weights_output;
-    bool _are_weights_reshaped;
+    const ITensor *_original_weights;
     bool _is_batched_fc_layer;
     bool _linearize_input;
     bool _accumulate_biases;
-    const ITensor *_original_weights;
+    bool _is_prepared;
 };
 } // namespace arm_compute
 #endif /* __ARM_COMPUTE_NEFULLYCONNECTEDLAYER_H__ */
diff --git a/arm_compute/runtime/NEON/functions/NEGEMM.h b/arm_compute/runtime/NEON/functions/NEGEMM.h
index e2263c2307..5d108b2c14 100644
--- a/arm_compute/runtime/NEON/functions/NEGEMM.h
+++ b/arm_compute/runtime/NEON/functions/NEGEMM.h
@@ -53,7 +53,14 @@ class NEGEMM : public IFunction
 public:
     /** Constructor */
     NEGEMM(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
-
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEGEMM(const NEGEMM &) = delete;
+    /** Default move constructor */
+    NEGEMM(NEGEMM &&) = default;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEGEMM &operator=(const NEGEMM &) = delete;
+    /** Default move assignment operator */
+    NEGEMM &operator=(NEGEMM &&) = default;
     /** Initialise the kernel's inputs, output
      *
      * @note GEMM: General Matrix Multiply - [alpha * A * B + beta * C].
@@ -72,6 +79,7 @@ public:

     // Inherited methods overridden:
     void run() override;
+    void prepare() override;

 private:
     MemoryGroup _memory_group;
@@ -84,10 +92,11 @@ private:
     Tensor _tmp_b;
     Tensor _workspace;
     Tensor _B_pretransposed;
+    const ITensor *_original_b;
     bool _run_vector_matrix_multiplication;
     bool _run_addition;
-    bool _is_first_run;
     bool _reshape_b_only_on_first_run;
+    bool _is_prepared;
 };
-}
+} // namespace arm_compute
 #endif /*__ARM_COMPUTE_NEGEMM_H__ */
diff --git a/arm_compute/runtime/NEON/functions/NEGEMMConvolutionLayer.h b/arm_compute/runtime/NEON/functions/NEGEMMConvolutionLayer.h
index d64fd9e771..7075becf75 100644
--- a/arm_compute/runtime/NEON/functions/NEGEMMConvolutionLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEGEMMConvolutionLayer.h
@@ -153,6 +153,7 @@ public:

     // Inherited methods overridden:
     void run() override;
+    void prepare() override;

 private:
     /** Configures the appropriate matrix multiply routine
@@ -197,6 +198,7 @@ private:
     bool _is_interleaved;
     bool _is_activationlayer_enabled;
     bool _skip_im2col;
+    bool _is_prepared;
 };
 }
 #endif /* __ARM_COMPUTE_NECONVOLUTIONGEMMLAYER_H__ */
diff --git a/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h b/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h
index adcddb8263..f32eb3c757 100644
--- a/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h
+++ b/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h
@@ -56,6 +56,14 @@ class NEGEMMLowpMatrixMultiplyCore : public IFunction
 public:
     /** Constructor */
     NEGEMMLowpMatrixMultiplyCore(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEGEMMLowpMatrixMultiplyCore(const NEGEMMLowpMatrixMultiplyCore &) = delete;
+    /** Default move constructor */
+    NEGEMMLowpMatrixMultiplyCore(NEGEMMLowpMatrixMultiplyCore &&) = default;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEGEMMLowpMatrixMultiplyCore &operator=(const NEGEMMLowpMatrixMultiplyCore &) = delete;
+    /** Default move assignment operator */
+    NEGEMMLowpMatrixMultiplyCore &operator=(NEGEMMLowpMatrixMultiplyCore &&) = default;
     /** Initialise the kernel's inputs, output
      *
      * @note GEMM_LOWP: low precision GEMM kernel
@@ -86,6 +94,7 @@ public:

     // Inherited methods overridden
     void run() override;
+    void prepare() override;

 private:
     MemoryGroup _memory_group;
@@ -103,12 +112,13 @@ private:
     Tensor _tmp_b;
     Tensor _workspace;
     Tensor _B_pretranspose;
+    const ITensor *_original_b;
     int32_t _a_offset;
     int32_t _b_offset;
     bool _run_vector_matrix_multiplication;
     bool _dot_product_path;
-    bool _is_first_run;
     bool _reshape_b_only_on_first_run;
+    bool _is_prepared;
 };
 }
 #endif /*__ARM_COMPUTE_NEGEMMLOWPMATRIXMULTIPLYCORE_H__ */
diff --git a/arm_compute/runtime/NEON/functions/NELocallyConnectedLayer.h b/arm_compute/runtime/NEON/functions/NELocallyConnectedLayer.h
index 18cd27414e..7d1f124bb3 100644
--- a/arm_compute/runtime/NEON/functions/NELocallyConnectedLayer.h
+++ b/arm_compute/runtime/NEON/functions/NELocallyConnectedLayer.h
@@ -90,6 +90,7 @@ public:

     // Inherited methods overridden:
     void run() override;
+    void prepare() override;

 private:
     MemoryGroup _memory_group;
@@ -100,7 +101,7 @@ private:
     Tensor _input_im2col_reshaped;
     Tensor _weights_reshaped;
     Tensor _gemm_output;
-    bool _is_first_run;
+    bool _is_prepared;
     const ITensor *_original_weights;
 };
 }
diff --git a/arm_compute/runtime/NEON/functions/NEWinogradConvolutionLayer.h b/arm_compute/runtime/NEON/functions/NEWinogradConvolutionLayer.h
index 55921f78f3..c1260977c0 100644
--- a/arm_compute/runtime/NEON/functions/NEWinogradConvolutionLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEWinogradConvolutionLayer.h
@@ -74,6 +74,7 @@ public:

     // Inherited methods overridden:
     void run() override;
+    void prepare() override;

     /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMConvolutionLayer
      *
@@ -122,7 +123,7 @@ private:
     const ITensor *_input;
     const ITensor *_weights;
     ITensor *_output;
-    bool _reshaped_kernel;
+    bool _is_prepared;
     bool _is_activationlayer_enabled;
 };
 }
--
cgit v1.2.1
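
Note (editorial, not part of the patch): the headers above only declare the new prepare() stage. The intent, as the AssemblyKernelGlue changes show, is that one-off work such as reshaping weights or pretransposing B is separated from the per-run work, so it can be triggered ahead of time and the original weights released afterwards. A minimal caller-side sketch, assuming the existing NEGEMM configure()/run() flow; the tensor shapes and the explicit prepare() call are illustrative only:

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/NEON/functions/NEGEMM.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

int main()
{
    Tensor a, b, d;
    // A: M=32 x K=64, B: K=64 x N=16, D: M=32 x N=16 (TensorShape is given as (x, y) = (cols, rows))
    a.allocator()->init(TensorInfo(TensorShape(64U, 32U), 1, DataType::F32));
    b.allocator()->init(TensorInfo(TensorShape(16U, 64U), 1, DataType::F32));
    d.allocator()->init(TensorInfo(TensorShape(16U, 32U), 1, DataType::F32));

    NEGEMM gemm;
    gemm.configure(&a, &b, nullptr, &d, 1.0f, 0.0f);

    a.allocator()->allocate();
    b.allocator()->allocate();
    d.allocator()->allocate();
    // ... fill a and b with data ...

    gemm.prepare(); // one-off stage, e.g. pretransposing B; the original B can then be marked unused
    gemm.run();     // first and subsequent runs reuse the prepared data
    return 0;
}

If prepare() is not called explicitly, run() is expected to trigger it on first execution, in the same way AssemblyKernelGlue::run() calls prepare() in the hunk above.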
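
The matching .cpp changes are not part of this excerpt (it is limited to 'arm_compute/runtime/NEON'), so the following is only a hypothetical sketch of the prepare()/run() split suggested by the NEGEMM header members (_is_prepared, _original_b, _reshape_b_only_on_first_run) and by AssemblyKernelGlue::prepare(); it is not the implementation from this commit:

// Hypothetical sketch only; member names are taken from the NEGEMM header in this patch.
void NEGEMM::prepare()
{
    if(!_is_prepared)
    {
        if(_reshape_b_only_on_first_run)
        {
            // One-off work: reshape/pretranspose B into the persistent buffers
            // (_tmp_b / _B_pretransposed), after which the original B tensor is
            // no longer needed at run time.
            _original_b->mark_as_unused();
        }
        _is_prepared = true;
    }
}

void NEGEMM::run()
{
    prepare(); // ensure the one-off stage has executed before the first run

    _memory_group.acquire();
    // ... schedule the interleave/transpose, matrix multiply and addition kernels ...
    _memory_group.release();
}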