about · summary · refs · log · tree · commit · diff
path: root/arm_compute/core/NEON/kernels
diff options
context:
space:
mode:
authorGeorgios Pinitas <georgios.pinitas@arm.com>2018-01-22 16:29:17 +0000
committerAnthony Barbier <anthony.barbier@arm.com>2018-11-02 16:45:00 +0000
commitd05dce46a14a7b67f322328ecd95bf96bdd30bae (patch)
tree6e001f539969a1a669241a72e78ff5a62998a984 /arm_compute/core/NEON/kernels
parent5d9d019b2c7ca3dc59bfbb44b3169ee5cd71dc79 (diff)
downloadComputeLibrary-d05dce46a14a7b67f322328ecd95bf96bdd30bae.tar.gz
COMPMID-791: Generic Depthwise Convolution Layer NEON QASYMM8
Change-Id: I33cf54e68f6c097ac58b6f16c3f9a720978f09cd Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/117289 Tested-by: Jenkins <bsgcomp@arm.com> Reviewed-by: Michalis Spyrou <michalis.spyrou@arm.com> Reviewed-by: Anthony Barbier <anthony.barbier@arm.com>
Diffstat (limited to 'arm_compute/core/NEON/kernels')
-rw-r--r-- arm_compute/core/NEON/kernels/NEDepthwiseIm2ColKernel.h | 26
-rw-r--r-- arm_compute/core/NEON/kernels/NEDepthwiseVectorToTensorKernel.h | 20
-rw-r--r-- arm_compute/core/NEON/kernels/NEDepthwiseWeightsReshapeKernel.h | 12
-rw-r--r-- arm_compute/core/NEON/kernels/NEGEMMMatrixVectorMultiplyKernel.h | 29
4 files changed, 70 insertions, 17 deletions
diff --git a/arm_compute/core/NEON/kernels/NEDepthwiseIm2ColKernel.h b/arm_compute/core/NEON/kernels/NEDepthwiseIm2ColKernel.h
index 8d59ba3248..ca10bfaab2 100644
--- a/arm_compute/core/NEON/kernels/NEDepthwiseIm2ColKernel.h
+++ b/arm_compute/core/NEON/kernels/NEDepthwiseIm2ColKernel.h
@@ -55,7 +55,7 @@ public:
/** Set the input and output of the kernel.
*
* @param[in] input The input tensor to convert. 3 lower dimensions represent a single input [width, height, IFM],
- * while every optional dimension from 4 and above represent a batch of inputs. Data types supported: F32
+ * while every optional dimension from 4 and above represent a batch of inputs. Data types supported: QASYMM8, F32
* @param[out] output The output tensor. First 3 lower dimensions represent a transform of each 3D input,
* while every dimension above 3 represents a batch. Data types supported: Same as @p input
* @param[in] kernel_dims The kernel dimensions (width and height).
@@ -68,11 +68,25 @@ public:
void run(const Window &window, const ThreadInfo &info) override;
private:
- const ITensor *_input;
- ITensor *_output;
- Size2D _kernel_dims;
- PadStrideInfo _conv_info;
- bool _has_bias;
+ /** Template function to run the im2col used for the depthwise convolution layer case
+ *
+ * @param[in] window Region on which to execute the kernel. (Must be a valid region of the window returned by window()).
+ */
+ template <typename T>
+ void run_generic(const Window &window);
+ /** Common signature for all the specialised depthwise im2col functions
+ *
+ * @param[in] window Region on which to execute the kernel.
+ */
+ using DepthwiseIm2ColFunctionPtr = void (NEDepthwiseIm2ColKernel::*)(const Window &window);
+
+private:
+ DepthwiseIm2ColFunctionPtr _func;
+ const ITensor *_input;
+ ITensor *_output;
+ Size2D _kernel_dims;
+ PadStrideInfo _conv_info;
+ bool _has_bias;
};
} // arm_compute
#endif /*__ARM_COMPUTE_NEDEPTHWISEIM2COLKERNEL_H__ */
diff --git a/arm_compute/core/NEON/kernels/NEDepthwiseVectorToTensorKernel.h b/arm_compute/core/NEON/kernels/NEDepthwiseVectorToTensorKernel.h
index 19000905b0..458cbd7812 100644
--- a/arm_compute/core/NEON/kernels/NEDepthwiseVectorToTensorKernel.h
+++ b/arm_compute/core/NEON/kernels/NEDepthwiseVectorToTensorKernel.h
@@ -56,7 +56,7 @@ public:
NEDepthwiseVectorToTensorKernel &operator=(NEDepthwiseVectorToTensorKernel &&) = default;
/** Set the input and output of the kernel.
*
- * @param[in] input The input vector to convert. Data type supported: F32.
+ * @param[in] input The input vector to convert. Data type supported: QASYMM8/S32/F32.
* @param[out] output The output tensor. 3 lower dimensions represent a single input [width, height, IFM]. Data type supported: same as @p input.
* @param[in] conv_w The converted tensor's width.
* @param[in] conv_h The converted tensor's height.
@@ -67,8 +67,22 @@ public:
void run(const Window &window, const ThreadInfo &info) override;
private:
- const ITensor *_input;
- ITensor *_output;
+ /** Template function to run the vector to tensor reshape used for the depthwise convolution layer case
+ *
+ * @param[in] window Region on which to execute the kernel. (Must be a valid region of the window returned by window()).
+ */
+ template <typename T>
+ void vector_to_tensor(const Window &window);
+ /** Common signature for all the specialised depthwise vector to tensor functions
+ *
+ * @param[in] window Region on which to execute the kernel.
+ */
+ using DepthwiseVectorToTensorFunctionPtr = void (NEDepthwiseVectorToTensorKernel::*)(const Window &window);
+
+private:
+ DepthwiseVectorToTensorFunctionPtr _func;
+ const ITensor *_input;
+ ITensor *_output;
std::pair<size_t, size_t> _conv_dims;
};
} // arm_compute
diff --git a/arm_compute/core/NEON/kernels/NEDepthwiseWeightsReshapeKernel.h b/arm_compute/core/NEON/kernels/NEDepthwiseWeightsReshapeKernel.h
index 4d23b8bd65..d00e8a46ed 100644
--- a/arm_compute/core/NEON/kernels/NEDepthwiseWeightsReshapeKernel.h
+++ b/arm_compute/core/NEON/kernels/NEDepthwiseWeightsReshapeKernel.h
@@ -53,7 +53,7 @@ public:
NEDepthwiseWeightsReshapeKernel &operator=(NEDepthwiseWeightsReshapeKernel &&) = default;
/** Set the input and output of the kernel.
*
- * @param[in] input The input tensor to convert. 3 lower dimensions represent a single input [width, height, IFM]. Data type supported: F32.
+ * @param[in] input The input tensor to convert. 3 lower dimensions represent a single input [width, height, IFM]. Data type supported: QASYMM8, F32.
* @param[out] output The output tensor. Data type supported: same as @p input.
* @param[in] biases (Optional) The input biases to add. Shape [IFM]. Data type supported: same as @p input.
*/
@@ -63,9 +63,13 @@ public:
void run(const Window &window, const ThreadInfo &info) override;
private:
- const ITensor *_input;
- ITensor *_output;
- const ITensor *_biases;
+ using DepthwiseWeightsReshapeFunction = void(const ITensor *input, const ITensor *bias, ITensor *output, const Window &window);
+
+private:
+ DepthwiseWeightsReshapeFunction *_func;
+ const ITensor *_input;
+ ITensor *_output;
+ const ITensor *_biases;
};
} // arm_compute
#endif /*__ARM_COMPUTE_NEDEPTHWISEWEIGHTSRESHAPEKERNEL_H__ */
diff --git a/arm_compute/core/NEON/kernels/NEGEMMMatrixVectorMultiplyKernel.h b/arm_compute/core/NEON/kernels/NEGEMMMatrixVectorMultiplyKernel.h
index 5ea83901f4..95fe916a3c 100644
--- a/arm_compute/core/NEON/kernels/NEGEMMMatrixVectorMultiplyKernel.h
+++ b/arm_compute/core/NEON/kernels/NEGEMMMatrixVectorMultiplyKernel.h
@@ -49,7 +49,7 @@ public:
NEGEMMMatrixVectorMultiplyKernel &operator=(NEGEMMMatrixVectorMultiplyKernel &&) = default;
/** Initialise the kernel's input and output.
*
- * @param[in] input0 First Input tensor. Data types supported: F16/F32
+ * @param[in] input0 First Input tensor. Data types supported: QASYMM8/F32
* @param[in] input1 Second Input tensor. Data types supported: same as @p input.
* @param[out] output Output tensor which stores the interleaved matrix. Data type supported: same as @p input.
*/
@@ -57,11 +57,32 @@ public:
// Inherited methods overridden:
void run(const Window &window, const ThreadInfo &info) override;
+ BorderSize border_size() const override;
private:
- const ITensor *_input0;
- const ITensor *_input1;
- ITensor *_output;
+ /** Template function to run the matrix vector multiplication
+ *
+ * @tparam I0 Input 0 type
+ * @tparam I1 Input 1 type
+ * @tparam O Output type
+ *
+ * @param[in] window_in Input region. (Must be a valid region of the window returned by window()).
+ * @param[in] window_w Weights region. (Must be a valid region of the window returned by window()).
+ * @param[in] window_out Output region.(Must be a valid region of the window returned by window()).
+ */
+ template <typename I0, typename I1, typename O>
+ void matrix_vector_multiply(const Window &window_in, const Window &window_w, const Window &window_out);
+ /** Common signature for all the specialised matrix vector multiplication functions */
+ using GEMMMatrixVectorMultiplyFunctionPtr = void (NEGEMMMatrixVectorMultiplyKernel::*)(const Window &window_in,
+ const Window &window_w,
+ const Window &window_out);
+
+private:
+ GEMMMatrixVectorMultiplyFunctionPtr _func;
+ const ITensor *_input0;
+ const ITensor *_input1;
+ ITensor *_output;
+ BorderSize _border_size;
};
} // namespace arm_compute
#endif /*__ARM_COMPUTE_NEGEMMMATRIXVECTORMULTIPLYKERNEL_H_*/