author    Georgios Pinitas <georgios.pinitas@arm.com>  2018-01-22 16:29:17 +0000
committer Anthony Barbier <anthony.barbier@arm.com>    2018-11-02 16:45:00 +0000
commit    d05dce46a14a7b67f322328ecd95bf96bdd30bae (patch)
tree      6e001f539969a1a669241a72e78ff5a62998a984 /arm_compute
parent    5d9d019b2c7ca3dc59bfbb44b3169ee5cd71dc79 (diff)
COMPMID-791: Generic Depthwise Convolution Layer NEON QASYMM8
Change-Id: I33cf54e68f6c097ac58b6f16c3f9a720978f09cd
Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/117289
Tested-by: Jenkins <bsgcomp@arm.com>
Reviewed-by: Michalis Spyrou <michalis.spyrou@arm.com>
Reviewed-by: Anthony Barbier <anthony.barbier@arm.com>
Diffstat (limited to 'arm_compute')
-rw-r--r--  arm_compute/core/NEON/kernels/NEDepthwiseIm2ColKernel.h          | 26
-rw-r--r--  arm_compute/core/NEON/kernels/NEDepthwiseVectorToTensorKernel.h  | 20
-rw-r--r--  arm_compute/core/NEON/kernels/NEDepthwiseWeightsReshapeKernel.h  | 12
-rw-r--r--  arm_compute/core/NEON/kernels/NEGEMMMatrixVectorMultiplyKernel.h | 29
-rw-r--r--  arm_compute/core/utils/misc/ShapeCalculator.h                    |  5
-rw-r--r--  arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h | 23
6 files changed, 87 insertions(+), 28 deletions(-)
diff --git a/arm_compute/core/NEON/kernels/NEDepthwiseIm2ColKernel.h b/arm_compute/core/NEON/kernels/NEDepthwiseIm2ColKernel.h
index 8d59ba3248..ca10bfaab2 100644
--- a/arm_compute/core/NEON/kernels/NEDepthwiseIm2ColKernel.h
+++ b/arm_compute/core/NEON/kernels/NEDepthwiseIm2ColKernel.h
@@ -55,7 +55,7 @@ public:
/** Set the input and output of the kernel.
*
* @param[in] input The input tensor to convert. 3 lower dimensions represent a single input [width, height, IFM],
- * while every optional dimension from 4 and above represent a batch of inputs. Data types supported: F32
+ * while every optional dimension from 4 and above represents a batch of inputs. Data types supported: QASYMM8/F32
* @param[out] output The output tensor. First 3 lower dimensions represent a transform of each 3D input,
* while every dimension above 3 represents a batch. Data types supported: Same as @p input
* @param[in] kernel_dims The kernel dimensions (width and height).
@@ -68,11 +68,25 @@ public:
void run(const Window &window, const ThreadInfo &info) override;
private:
- const ITensor *_input;
- ITensor *_output;
- Size2D _kernel_dims;
- PadStrideInfo _conv_info;
- bool _has_bias;
+ /** Template function to run the im2col used for the depthwise convolution layer case
+ *
+ * @param[in] window Region on which to execute the kernel. (Must be a valid region of the window returned by window()).
+ */
+ template <typename T>
+ void run_generic(const Window &window);
+ /** Common signature for all the specialised depthwise im2col functions
+ *
+ * @param[in] window Region on which to execute the kernel.
+ */
+ using DepthwiseIm2ColFunctionPtr = void (NEDepthwiseIm2ColKernel::*)(const Window &window);
+
+private:
+ DepthwiseIm2ColFunctionPtr _func;
+ const ITensor *_input;
+ ITensor *_output;
+ Size2D _kernel_dims;
+ PadStrideInfo _conv_info;
+ bool _has_bias;
};
} // arm_compute
#endif /*__ARM_COMPUTE_NEDEPTHWISEIM2COLKERNEL_H__ */
diff --git a/arm_compute/core/NEON/kernels/NEDepthwiseVectorToTensorKernel.h b/arm_compute/core/NEON/kernels/NEDepthwiseVectorToTensorKernel.h
index 19000905b0..458cbd7812 100644
--- a/arm_compute/core/NEON/kernels/NEDepthwiseVectorToTensorKernel.h
+++ b/arm_compute/core/NEON/kernels/NEDepthwiseVectorToTensorKernel.h
@@ -56,7 +56,7 @@ public:
NEDepthwiseVectorToTensorKernel &operator=(NEDepthwiseVectorToTensorKernel &&) = default;
/** Set the input and output of the kernel.
*
- * @param[in] input The input vector to convert. Data type supported: F32.
+ * @param[in] input The input vector to convert. Data type supported: QASYMM8/S32/F32.
* @param[out] output The output tensor. 3 lower dimensions represent a single input [width, height, IFM]. Data type supported: same as @p input.
* @param[in] conv_w The converted tensor's width.
* @param[in] conv_h The converted tensor's height.
@@ -67,8 +67,22 @@ public:
void run(const Window &window, const ThreadInfo &info) override;
private:
- const ITensor *_input;
- ITensor *_output;
+ /** Template function to run the vector to tensor reshape used for the depthwise convolution layer case
+ *
+ * @param[in] window Region on which to execute the kernel. (Must be a valid region of the window returned by window()).
+ */
+ template <typename T>
+ void vector_to_tensor(const Window &window);
+ /** Common signature for all the specialised depthwise vector to tensor functions
+ *
+ * @param[in] window Region on which to execute the kernel.
+ */
+ using DepthwiseVectorToTensorFunctionPtr = void (NEDepthwiseVectorToTensorKernel::*)(const Window &window);
+
+private:
+ DepthwiseVectorToTensorFunctionPtr _func;
+ const ITensor *_input;
+ ITensor *_output;
std::pair<size_t, size_t> _conv_dims;
};
} // arm_compute
diff --git a/arm_compute/core/NEON/kernels/NEDepthwiseWeightsReshapeKernel.h b/arm_compute/core/NEON/kernels/NEDepthwiseWeightsReshapeKernel.h
index 4d23b8bd65..d00e8a46ed 100644
--- a/arm_compute/core/NEON/kernels/NEDepthwiseWeightsReshapeKernel.h
+++ b/arm_compute/core/NEON/kernels/NEDepthwiseWeightsReshapeKernel.h
@@ -53,7 +53,7 @@ public:
NEDepthwiseWeightsReshapeKernel &operator=(NEDepthwiseWeightsReshapeKernel &&) = default;
/** Set the input and output of the kernel.
*
- * @param[in] input The input tensor to convert. 3 lower dimensions represent a single input [width, height, IFM]. Data type supported: F32.
+ * @param[in] input The input tensor to convert. 3 lower dimensions represent a single input [width, height, IFM]. Data type supported: QASYMM8/F32.
* @param[out] output The output tensor. Data type supported: same as @p input.
* @param[in] biases (Optional) The input biases to add. Shape [IFM]. Data type supported: same as @p input.
*/
@@ -63,9 +63,13 @@ public:
void run(const Window &window, const ThreadInfo &info) override;
private:
- const ITensor *_input;
- ITensor *_output;
- const ITensor *_biases;
+ using DepthwiseWeightsReshapeFunction = void(const ITensor *input, const ITensor *bias, ITensor *output, const Window &window);
+
+private:
+ DepthwiseWeightsReshapeFunction *_func;
+ const ITensor *_input;
+ ITensor *_output;
+ const ITensor *_biases;
};
} // arm_compute
#endif /*__ARM_COMPUTE_NEDEPTHWISEWEIGHTSRESHAPEKERNEL_H__ */
diff --git a/arm_compute/core/NEON/kernels/NEGEMMMatrixVectorMultiplyKernel.h b/arm_compute/core/NEON/kernels/NEGEMMMatrixVectorMultiplyKernel.h
index 5ea83901f4..95fe916a3c 100644
--- a/arm_compute/core/NEON/kernels/NEGEMMMatrixVectorMultiplyKernel.h
+++ b/arm_compute/core/NEON/kernels/NEGEMMMatrixVectorMultiplyKernel.h
@@ -49,7 +49,7 @@ public:
NEGEMMMatrixVectorMultiplyKernel &operator=(NEGEMMMatrixVectorMultiplyKernel &&) = default;
/** Initialise the kernel's input and output.
*
- * @param[in] input0 First Input tensor. Data types supported: F16/F32
+ * @param[in] input0 First Input tensor. Data types supported: QASYMM8/F32
* @param[in] input1 Second Input tensor. Data types supported: same as @p input.
* @param[out] output Output tensor which stores the interleaved matrix. Data type supported: same as @p input.
*/
@@ -57,11 +57,32 @@ public:
// Inherited methods overridden:
void run(const Window &window, const ThreadInfo &info) override;
+ BorderSize border_size() const override;
private:
- const ITensor *_input0;
- const ITensor *_input1;
- ITensor *_output;
+ /** Template function to run the matrix vector multiplication
+ *
+ * @tparam I0 Input 0 type
+ * @tparam I1 Input 1 type
+ * @tparam O Output type
+ *
+ * @param[in] window_in Input region. (Must be a valid region of the window returned by window()).
+ * @param[in] window_w Weights region. (Must be a valid region of the window returned by window()).
+ * @param[in] window_out Output region. (Must be a valid region of the window returned by window()).
+ */
+ template <typename I0, typename I1, typename O>
+ void matrix_vector_multiply(const Window &window_in, const Window &window_w, const Window &window_out);
+ /** Common signature for all the specialised matrix vector multiplication functions */
+ using GEMMMatrixVectorMultiplyFunctionPtr = void (NEGEMMMatrixVectorMultiplyKernel::*)(const Window &window_in,
+ const Window &window_w,
+ const Window &window_out);
+
+private:
+ GEMMMatrixVectorMultiplyFunctionPtr _func;
+ const ITensor *_input0;
+ const ITensor *_input1;
+ ITensor *_output;
+ BorderSize _border_size;
};
} // namespace arm_compute
#endif /*__ARM_COMPUTE_NEGEMMMATRIXVECTORMULTIPLYKERNEL_H_*/
diff --git a/arm_compute/core/utils/misc/ShapeCalculator.h b/arm_compute/core/utils/misc/ShapeCalculator.h
index 6ecfdf0323..26384651f1 100644
--- a/arm_compute/core/utils/misc/ShapeCalculator.h
+++ b/arm_compute/core/utils/misc/ShapeCalculator.h
@@ -116,8 +116,9 @@ inline TensorShape compute_depthwise_convolution_shape(const ITensorInfo &input,
unsigned int output_width = 0;
unsigned int output_height = 0;
- std::tie(output_width, output_height) = scaled_dimensions(input_shape.x(), input_shape.y(), weights_shape.x(),
- weights_shape.y(), conv_info);
+ std::tie(output_width, output_height) = scaled_dimensions(input_shape.x(), input_shape.y(),
+ weights_shape.x(), weights_shape.y(),
+ conv_info);
TensorShape output_shape{ input_shape };
output_shape.set(0, output_width);
diff --git a/arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h b/arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h
index 2100828f0d..e89ef88562 100644
--- a/arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h
@@ -54,7 +54,7 @@ public:
NEDepthwiseConvolutionLayer3x3();
/** Initialize the function's source, destination, kernels and border_size.
*
- * @param[in, out] input Source tensor. Data type supported: QASYMM8, F32. (Written to only for border filling).
+ * @param[in, out] input Source tensor. Data type supported: QASYMM8/F32. (Written to only for border filling).
* @param[in] weights Weights tensor. These are 3D tensors with shape [3, 3, IFM]. Data type supported: Same as @p input.
* @param[in] biases (Optional) Biases tensor. A 1D tensor with shape [IFM]. Must be nullptr if not needed.
* Data type supported: Same as @p input.
@@ -90,7 +90,7 @@ public:
NEDepthwiseConvolutionLayer();
/** Initialize the function's source, destination, weights and convolution information.
*
- * @param[in, out] input Source tensor. Data type supported: F32. (Written to only for border filling).
+ * @param[in, out] input Source tensor. Data type supported: QASYMM8/F32. (Written to only for border filling).
* @param[out] output Destination tensor. Data type supported: same as @p input.
* @param[in] weights Weights tensor. These are 3D tensors with shape [kernel_x, kernel_y, IFM]. Data type supported: Same as @p input.
* @param[in] biases (Optional) Biases tensor. A 1D tensor with shape [IFM]. Must be nullptr if not needed.
@@ -103,13 +103,18 @@ public:
void run() override;
private:
- NEDepthwiseIm2ColKernel _im2col_kernel;
- NEDepthwiseWeightsReshapeKernel _weights_reshape_kernel;
- NEGEMMMatrixVectorMultiplyKernel _v2mm_kernel;
- NEDepthwiseVectorToTensorKernel _vector_to_tensor_kernel;
- Tensor _input_reshaped;
- Tensor _weights_reshaped;
- Tensor _v2mm_output;
+ NEDepthwiseIm2ColKernel _im2col_kernel;
+ NEDepthwiseWeightsReshapeKernel _weights_reshape_kernel;
+ NEGEMMMatrixVectorMultiplyKernel _v2mm_kernel;
+ NEDepthwiseVectorToTensorKernel _vector_to_tensor_kernel;
+ NEDirectConvolutionLayerOutputStageKernel _output_stage_kernel;
+ NEFillBorderKernel _v2mm_input_fill_border;
+ NEFillBorderKernel _v2mm_weights_fill_border;
+ Tensor _input_reshaped;
+ Tensor _weights_reshaped;
+ Tensor _v2mm_output;
+ Tensor _output_reshaped;
+ bool _is_quantized;
};
}
#endif /* __ARM_COMPUTE_NEDEPTHWISECONVOLUTION_H__ */
\ No newline at end of file