From adfcacc8e39888a9a62e33c178041642d0a3047a Mon Sep 17 00:00:00 2001
From: Annop Wongwathanarat
Date: Wed, 1 Mar 2023 15:19:50 +0000
Subject: Add weights_info as optional input for NEDeconvolutionLayer

This is so that we can leverage fixed format kernels when using the
GEMM convolution method.

Partially resolves: [ONCPUML-1129]

Change-Id: I61ffa74f5cd9d75579dbc1f9aa187371f855e932
Signed-off-by: Annop Wongwathanarat
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/9248
Reviewed-by: Jakub Sujak
Reviewed-by: Gunes Bayir
Comments-Addressed: Arm Jenkins
Tested-by: Arm Jenkins
Benchmark: Arm Jenkins
---
 .../runtime/NEON/functions/NEDeconvolutionLayer.h  | 19 +++++++-----
 .../NEON/functions/NEDeconvolutionLayer.cpp        | 35 +++++++++++-----------
 2 files changed, 30 insertions(+), 24 deletions(-)

diff --git a/arm_compute/runtime/NEON/functions/NEDeconvolutionLayer.h b/arm_compute/runtime/NEON/functions/NEDeconvolutionLayer.h
index 869df69f11..cdc3a636b0 100644
--- a/arm_compute/runtime/NEON/functions/NEDeconvolutionLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEDeconvolutionLayer.h
@@ -110,12 +110,14 @@ public:
      * @param[in]  bias             Optional, ignored if NULL. The biases have one dimension.
      *                              Data type supported: Data types supported: S32 for QASYMM8/QASYMM8_SIGNED input, F32 for F32 input, F16 for F16 input.
      * @param[out] output           Output tensor. The output has the same number of dimensions as the @p input.
-     * @param[in]  info             Contains padding and policies to be used in the deconvolution, this is decribed in @ref PadStrideInfo.
+     * @param[in]  info             Contains padding and policies to be used in the deconvolution, this is described in @ref PadStrideInfo.
      * @param[in]  enable_fast_math (Optional) Enable fast math computation. In case this flag were set, the function could dispatch the fastest implementation
-     *                              available which may introduce a drop of accuracy as well. Default is false
+     *                              available which may introduce a drop of accuracy as well. Default is false
+     * @param[in]  weights_info     (Optional) Specifies the weight format. Default is unspecified. This parameter can be used to specify the weight format that is optimal for
+     *                              the GEMM convolution.
      *
      */
-    void configure(ITensor *input, const ITensor *weights, const ITensor *bias, ITensor *output, const PadStrideInfo &info, bool enable_fast_math = false);
+    void configure(ITensor *input, const ITensor *weights, const ITensor *bias, ITensor *output, const PadStrideInfo &info, bool enable_fast_math = false, const WeightsInfo &weights_info = WeightsInfo());
     /** Static function to check if given info will lead to a valid configuration of @ref NEDeconvolutionLayer
      *
      * @param[in] input            Input tensor info. 3 lower dimensions represent a single input, and an optional 4th dimension for batch of inputs.
@@ -124,13 +126,16 @@ public:
      *                             Data type supported: Same as @p input, also could be QSYMM8_PER_CHANNEL if input is QASYMM8/QASYMM8_SIGNED.
      * @param[in] bias             (Optional) The biases have one dimension. Data type supported: Data types supported: S32 for QASYMM8/QASYMM8_SIGNED input, F32 for F32 input, F16 for F16 input.
      * @param[in] output           Output tensor info. The output has the same number of dimensions as the @p input.
-     * @param[in] info             Contains padding and policies to be used in the deconvolution, this is decribed in @ref PadStrideInfo.
+     * @param[in] info             Contains padding and policies to be used in the deconvolution, this is described in @ref PadStrideInfo.
      * @param[in] enable_fast_math (Optional) Enable fast math computation. In case this flag were set, the function could dispatch the fastest implementation
-     *                             available which may introduce a drop of accuracy as well. Default is false
+     *                             available which may introduce a drop of accuracy as well. Default is false
+     * @param[in] weights_info     (Optional) Specifies the weight format. Default is unspecified. This parameter can be used to specify the weight format that is optimal for
+     *                             the GEMM convolution.
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *bias, const ITensorInfo *output, const PadStrideInfo &info, bool enable_fast_math = false);
+    static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *bias, const ITensorInfo *output, const PadStrideInfo &info,
+                           bool enable_fast_math = false, const WeightsInfo &weights_info = WeightsInfo());
 
     // Inherited methods overridden:
     void run() override;
@@ -150,5 +155,5 @@ private:
     bool _is_prepared;
     bool _do_upsampling;
 };
-} // arm_compute
+} // namespace arm_compute
 #endif /* ARM_COMPUTE_NEDECONVOLUTIONLAYER_H */
diff --git a/src/runtime/NEON/functions/NEDeconvolutionLayer.cpp b/src/runtime/NEON/functions/NEDeconvolutionLayer.cpp
index 96ffedd5b4..8534d2a8f3 100644
--- a/src/runtime/NEON/functions/NEDeconvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEDeconvolutionLayer.cpp
@@ -82,7 +82,8 @@ NEDeconvolutionLayer::NEDeconvolutionLayer(std::shared_ptr<IMemoryManager> memor
 {
 }
 
-Status NEDeconvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *bias, const ITensorInfo *output, const PadStrideInfo &info, bool enable_fast_math)
+Status NEDeconvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *bias, const ITensorInfo *output, const PadStrideInfo &info,
+                                      bool enable_fast_math, const WeightsInfo &weights_info)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::F16, DataType::QASYMM8, DataType::QASYMM8_SIGNED);
@@ -152,26 +153,26 @@ Status NEDeconvolutionLayer::validate(const ITensorInfo *input, const ITensorInf
     ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(batches_idx) != scale_out_info.dimension(batches_idx));
     ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(channel_idx) != scale_out_info.dimension(channel_idx));
 
-    if (do_upsampling)
+    if(do_upsampling)
     {
         const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL);
-        ARM_COMPUTE_RETURN_ON_ERROR(NEConvolutionLayer::validate(&scale_out_info, weights, bias, output, conv_info, WeightsInfo(), Size2D(1U, 1U), ActivationLayerInfo(), enable_fast_math));
+        ARM_COMPUTE_RETURN_ON_ERROR(NEConvolutionLayer::validate(&scale_out_info, weights, bias, output, conv_info, weights_info, Size2D(1U, 1U), ActivationLayerInfo(), enable_fast_math));
     }
     else
     {
         const PadStrideInfo conv_info(1, 1, upsample_info.pad_left(), upsample_info.pad_right(), upsample_info.pad_top(), upsample_info.pad_bottom(), DimensionRoundingType::CEIL);
-        ARM_COMPUTE_RETURN_ON_ERROR(NEConvolutionLayer::validate(input, weights, bias, output, conv_info, WeightsInfo(), Size2D(1U, 1U), ActivationLayerInfo(), enable_fast_math));
+        ARM_COMPUTE_RETURN_ON_ERROR(NEConvolutionLayer::validate(input, weights, bias, output, conv_info, weights_info, Size2D(1U, 1U), ActivationLayerInfo(), enable_fast_math));
     }
 
     return Status{};
 }
 
-void NEDeconvolutionLayer::configure(ITensor *input, const ITensor *weights, const ITensor *bias, ITensor *output, const PadStrideInfo &info, bool enable_fast_math)
+void NEDeconvolutionLayer::configure(ITensor *input, const ITensor *weights, const ITensor *bias, ITensor *output, const PadStrideInfo &info, bool enable_fast_math, const WeightsInfo &weights_info)
 {
     // Perform validation step
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
-    ARM_COMPUTE_ERROR_THROW_ON(NEDeconvolutionLayer::validate(input->info(), weights->info(), (bias == nullptr) ? nullptr : bias->info(), output->info(), info, enable_fast_math));
-    ARM_COMPUTE_LOG_PARAMS(input, weights, bias, output, info, enable_fast_math);
+    ARM_COMPUTE_ERROR_THROW_ON(NEDeconvolutionLayer::validate(input->info(), weights->info(), (bias == nullptr) ? nullptr : bias->info(), output->info(), info, enable_fast_math, weights_info));
+    ARM_COMPUTE_LOG_PARAMS(input, weights, bias, output, info, enable_fast_math, weights_info);
 
     const DataLayout data_layout = input->info()->data_layout();
     const unsigned int width_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
@@ -198,12 +199,12 @@ void NEDeconvolutionLayer::configure(ITensor *input, const ITensor *weights, con
     _flip_weights.configure(weights, &_weights_flipped, &_flip_axis);
 
     // setup the function to convolve the upscaled output
-    uint32_t     deconv_pad_x = 0;
-    uint32_t     deconv_pad_y = 0;
-    const TensorShape scale_out_shape = compute_deconvolution_upsampled_shape(*input->info(), *weights->info(),
-                                                                              stride_x, stride_y,
-                                                                              out_dims, deconv_pad_x, deconv_pad_y);
-    const PadStrideInfo upsample_info = compute_upsample_info(info, deconv_pad_x, deconv_pad_y);
+    uint32_t            deconv_pad_x = 0;
+    uint32_t            deconv_pad_y = 0;
+    const TensorShape   scale_out_shape = compute_deconvolution_upsampled_shape(*input->info(), *weights->info(),
+                                                                                stride_x, stride_y,
+                                                                                out_dims, deconv_pad_x, deconv_pad_y);
+    const PadStrideInfo upsample_info   = compute_upsample_info(info, deconv_pad_x, deconv_pad_y);
 
     // Do not perform upsampling when the operation uses unit stride in all dimensions
     _do_upsampling = stride_x != 1 || stride_y != 1;
@@ -215,12 +216,12 @@ void NEDeconvolutionLayer::configure(ITensor *input, const ITensor *weights, con
     axis_data[1] = static_cast<uint32_t>(height_idx);
 
     // Setup convolution and upsampling, if needed
-    if (_do_upsampling)
+    if(_do_upsampling)
    {
         _memory_group.manage(&_scaled_output);
 
         const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL);
-        TensorInfo scale_out_info(scale_out_shape, 1, input->info()->data_type(), input->info()->quantization_info());
+        TensorInfo          scale_out_info(scale_out_shape, 1, input->info()->data_type(), input->info()->quantization_info());
         scale_out_info.set_data_layout(data_layout);
         _scaled_output.allocator()->init(scale_out_info);
 
@@ -228,14 +229,14 @@ void NEDeconvolutionLayer::configure(ITensor *input, const ITensor *weights, con
         // The padding amount can be given as input to the convolution layer.
         _upsample_f.configure(input, &_scaled_output, upsample_info);
 
-        _conv_f.configure(&_scaled_output, &_weights_flipped, bias, output, conv_info, WeightsInfo(), Size2D(1U, 1U), ActivationLayerInfo(), enable_fast_math);
+        _conv_f.configure(&_scaled_output, &_weights_flipped, bias, output, conv_info, weights_info, Size2D(1U, 1U), ActivationLayerInfo(), enable_fast_math);
 
         _scaled_output.allocator()->allocate();
     }
     else
     {
         const PadStrideInfo conv_info(1, 1, upsample_info.pad_left(), upsample_info.pad_right(), upsample_info.pad_top(), upsample_info.pad_bottom(), DimensionRoundingType::CEIL);
-        _conv_f.configure(input, &_weights_flipped, bias, output, conv_info, WeightsInfo(), Size2D(1U, 1U), ActivationLayerInfo(), enable_fast_math);
+        _conv_f.configure(input, &_weights_flipped, bias, output, conv_info, weights_info, Size2D(1U, 1U), ActivationLayerInfo(), enable_fast_math);
     }
 }
-- 
cgit v1.2.1
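
Usage note (editorial addition, not part of the patch): the sketch below shows roughly how a caller might exercise the new weights_info parameter. Treat every concrete value in it as an assumption rather than something taken from this change: the tensor shapes are arbitrary, WeightFormat::OHWIo8 merely stands in for whatever blocking the target CPU actually prefers, and the WeightsInfo constructor used follows the library's public headers around this release. In practice the favourable format is usually discovered by querying the backend (for example via NEGEMMConvolutionLayer::has_opt_impl with WeightFormat::ANY) rather than hard-coded, and when a fixed format is requested the weights buffer must already be laid out accordingly.

// Hedged usage sketch for the new weights_info parameter of
// NEDeconvolutionLayer. Shapes and the requested weight format are
// illustrative assumptions, not values mandated by the patch.
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/NEON/functions/NEDeconvolutionLayer.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

int main()
{
    // NHWC FP32 tensors: 8x8 input with 16 channels, 3x3 kernels, 32 output channels.
    // Unit stride with 1-pixel padding keeps the spatial size: (8 - 1) * 1 + 3 - 2 * 1 = 8.
    Tensor input, weights, bias, output;

    TensorInfo in_info(TensorShape(16U, 8U, 8U, 1U), 1, DataType::F32);
    in_info.set_data_layout(DataLayout::NHWC);
    input.allocator()->init(in_info);

    TensorInfo w_info(TensorShape(16U, 3U, 3U, 32U), 1, DataType::F32);
    w_info.set_data_layout(DataLayout::NHWC);
    weights.allocator()->init(w_info);

    bias.allocator()->init(TensorInfo(TensorShape(32U), 1, DataType::F32));

    TensorInfo out_info(TensorShape(32U, 8U, 8U, 1U), 1, DataType::F32);
    out_info.set_data_layout(DataLayout::NHWC);
    output.allocator()->init(out_info);

    const PadStrideInfo deconv_info(1, 1, 1, 1);

    // Request a fixed weight format through the new optional argument.
    // OHWIo8 is an assumed example; the optimal blocking is CPU-dependent and
    // the weights tensor must already hold data in the requested layout.
    const WeightsInfo weights_info(false, 3U, 3U, 32U, false, WeightFormat::OHWIo8);

    // Validate first: an unsupported format/data-type combination comes back
    // as a Status here instead of asserting inside configure().
    const Status status = NEDeconvolutionLayer::validate(input.info(), weights.info(), bias.info(), output.info(),
                                                         deconv_info, /* enable_fast_math */ false, weights_info);

    NEDeconvolutionLayer deconv;
    if(status.error_code() == ErrorCode::OK)
    {
        deconv.configure(&input, &weights, &bias, &output, deconv_info, false, weights_info);
    }
    else
    {
        // Fall back to the pre-existing behaviour: weights_info defaults to
        // WeightsInfo(), i.e. an unspecified weight format.
        deconv.configure(&input, &weights, &bias, &output, deconv_info);
    }

    input.allocator()->allocate();
    weights.allocator()->allocate();
    bias.allocator()->allocate();
    output.allocator()->allocate();

    // Fill input/weights/bias with real data before running.
    deconv.run();
    return 0;
}

Because validate() now forwards weights_info to NEConvolutionLayer::validate(), this probe-then-commit pattern is cheap: the caller only opts into the fixed-format path when the backend accepts it, and otherwise keeps the default unspecified-format behaviour.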