From 5264b7d5555ec980f9c52c719122479d0d676af8 Mon Sep 17 00:00:00 2001
From: Pablo Tello <pablo.tello@arm.com>
Date: Mon, 21 Oct 2019 14:25:41 +0100
Subject: COMPMID-2576: Fuse activation in Winograd output transform.

Change-Id: I26dd1307847adeaaefae0a7374b9858c07d71372
Signed-off-by: Pablo Tello <pablo.tello@arm.com>
Reviewed-on: https://review.mlplatform.org/c/2172
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Gian Marco Iodice <gianmarco.iodice@arm.com>
---
 .../NEON/functions/NEWinogradConvolutionLayer.cpp  | 68 ++++++++++++++++------
 1 file changed, 50 insertions(+), 18 deletions(-)

(limited to 'src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp')
diff --git a/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp b/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp
index e699ad1815..6983c1c01b 100644
--- a/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp
@@ -33,6 +33,7 @@
 #include "arm_compute/runtime/NEON/functions/NEGEMMAssemblyDispatch.h"
 #include "support/ToolchainSupport.h"
 
+#include "arm_compute/core/NEON/kernels/convolution/common/utils.hpp"
 #include "arm_compute/core/NEON/kernels/convolution/winograd/winograd.hpp"
 
 namespace arm_compute
@@ -232,6 +233,31 @@ bool check_support_fast_math(const Size2D &output_tile, const Size2D &kernel_siz
     return std::find(fast_math_winograd.begin(), fast_math_winograd.end(), p) != fast_math_winograd.end();
 }
 
+inline bool fuse_function_supported(const ActivationLayerInfo &act_info)
+{
+    return act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU ||
+           act_info.activation() == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU;
+}
+
+arm_gemm::Activation arm_gemm_activation_from_acl_activation(const ActivationLayerInfo &act_info)
+{
+        switch(act_info.activation())
+        {
+            case ActivationLayerInfo::ActivationFunction::RELU:
+            {
+                return arm_gemm::Activation(arm_gemm::Activation::Type::ReLU, act_info.a(), act_info.b());
+            }
+            case ActivationLayerInfo::ActivationFunction::BOUNDED_RELU:
+            {
+                return arm_gemm::Activation(arm_gemm::Activation::Type::BoundedReLU, act_info.a(), act_info.b());
+            }
+            default:
+            {
+                return arm_gemm::Activation(arm_gemm::Activation::Type::None);
+            }
+        }
+}
+
 } //namespace
 
 NEWinogradConvolutionLayer::NEWinogradConvolutionLayer(const std::shared_ptr<IMemoryManager> &memory_manager)
@@ -257,6 +283,8 @@ void NEWinogradConvolutionLayer::configure(const ITensor *input, const ITensor *
     const Size2D kernel_size = Size2D(weights->info()->dimension(width_idx), weights->info()->dimension(height_idx));
     const Size2D output_tile = winograd_output_tile(input_dims, kernel_size);
 
+
+
     // Check if the Winograd configuration requires fast math
     if(!enable_fast_math)
     {
@@ -388,21 +416,15 @@ void NEWinogradConvolutionLayer::configure(const ITensor *input, const ITensor *
                                       * data_type_size;
 
     // Output storage
-    const size_t output_storage_size = transform_output_kernel->get_output_storage_size(in_shape.n_batches, in_shape.n_rows, in_shape.n_cols, out_channels,
-                                                                                        use_same_padding)
-                                       * data_type_size;
-    ;
-    const KernelShape kernel_shape({ out_channels, static_cast<int>(kernel_size.height), static_cast<int>(kernel_size.width), in_channels });
-    const int         kernel_matrix_stride = transform_weights_kernel->get_matrix_stride(kernel_shape);
-
-    const int  output_matrix_stride = transform_output_kernel->get_matrix_stride(kernel_shape, in_shape, use_padding_type);
-    const auto output_shape(transform_output_kernel->get_output_shape(kernel_shape, in_shape, use_padding_type));
-
-    const int input_matrix_stride = transform_input_kernel->get_matrix_stride(kernel_shape, in_shape, use_padding_type);
+    const size_t output_storage_size  = transform_output_kernel->get_output_storage_size(in_shape.n_batches, in_shape.n_rows, in_shape.n_cols, out_channels) * data_type_size;
+    const int    kernel_matrix_stride = transform_weights_kernel->get_matrix_stride(out_channels, in_channels);
+    const int    output_matrix_stride = transform_output_kernel->get_matrix_stride(in_shape.n_batches, in_shape.n_rows, in_shape.n_cols, out_channels);
+    const auto   output_shape         = transform_output_kernel->get_output_shape(in_shape.n_rows, in_shape.n_cols, use_padding_type == PADDING_SAME);
+    const int    input_matrix_stride  = transform_input_kernel->get_matrix_stride(in_shape.n_batches, in_channels, in_shape.n_rows, in_shape.n_cols, use_padding_type == PADDING_SAME);
 
     // Configure GEMM
-    const int tile_rows                = iceildiv(output_shape.n_rows, output_tile.height);
-    const int tile_cols                = iceildiv(output_shape.n_cols, output_tile.width);
+    const int tile_rows                = iceildiv(output_shape.first, output_tile.height);
+    const int tile_cols                = iceildiv(output_shape.second, output_tile.width);
     const int m                        = in_shape.n_batches * tile_rows * tile_cols;
     const int k                        = in_shape.n_channels;
     const int n                        = out_channels;
@@ -489,9 +511,19 @@ void NEWinogradConvolutionLayer::configure(const ITensor *input, const ITensor *
         _memory_group.manage(&_output_nhwc);
         output_to_use = &_output_nhwc;
     }
-    transform_output_kernel->configure(biases, &_output_transformed,
-                                       output_matrix_stride, output_to_use,
-                                       in_shape.n_batches, output_shape.n_rows, output_shape.n_cols, out_channels, &_output_workspace);
+    const  arm_gemm::Activation activation = arm_gemm_activation_from_acl_activation(act_info);
+
+    transform_output_kernel->configure(biases,
+                                       &_output_transformed,
+                                       output_matrix_stride,
+                                       output_to_use,
+                                       in_shape.n_batches,
+                                       output_shape.first,
+                                       output_shape.second,
+                                       out_channels,
+                                       &_output_workspace,
+                                       activation);
+
     const size_t output_workspace_size = transform_output_kernel->get_working_space_size(max_num_threads);
     TensorInfo   output_workspace_info(TensorShape(output_workspace_size), 1, _output->info()->data_type());
     _output_workspace.allocator()->init(output_workspace_info);
@@ -510,7 +542,7 @@ void NEWinogradConvolutionLayer::configure(const ITensor *input, const ITensor *
     _transform_output_kernel  = std::move(transform_output_kernel);
 
     //Configure Activation Layer
-    _is_activationlayer_enabled = act_info.enabled();
+    _is_activationlayer_enabled = act_info.enabled() && ! fuse_function_supported(act_info);
     if(_is_activationlayer_enabled)
     {
         _activationlayer_function.configure(_output, nullptr, act_info);
@@ -546,7 +578,7 @@ void NEWinogradConvolutionLayer::run()
         _permute_output.run();
     }
 
-    if(_is_activationlayer_enabled)
+    if(_is_activationlayer_enabled )
     {
         _activationlayer_function.run();
     }
-- 
cgit v1.2.1