From 980a9168b81d778f4902973b4920b54c103907e0 Mon Sep 17 00:00:00 2001
From: Georgios Pinitas <georgios.pinitas@arm.com>
Date: Wed, 3 Jun 2020 20:16:46 +0100
Subject: COMPMID-3177: Remove padding from NEBatchNormalizationLayer

Signed-off-by: Georgios Pinitas <georgios.pinitas@arm.com>
Change-Id: I9be23e6ef1f552eb159e39fda16c82fa20124094
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/3307
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Gian Marco Iodice <gianmarco.iodice@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
---
 .../NEON/kernels/NEBatchNormalizationLayerKernel.h |  29 ++----
 .../kernels/detail/NEActivationFunctionDetail.h    | 108 +++++++++++++++++----
 2 files changed, 96 insertions(+), 41 deletions(-)
(limited to 'arm_compute/core/NEON')

diff --git a/arm_compute/core/NEON/kernels/NEBatchNormalizationLayerKernel.h b/arm_compute/core/NEON/kernels/NEBatchNormalizationLayerKernel.h
index d59ed7baf0..7371e3c177 100644
--- a/arm_compute/core/NEON/kernels/NEBatchNormalizationLayerKernel.h
+++ b/arm_compute/core/NEON/kernels/NEBatchNormalizationLayerKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2020 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -28,6 +28,7 @@
 
 namespace arm_compute
 {
+// Forward declarations
 class ITensor;
 
 /** Interface for the batch normalization layer kernel.
@@ -97,40 +98,26 @@ private:
     /** Configure execution function in case of fused activation **/
     void configure_fused();
 
-    /** Template function to run batch normalization on fp16
-     *
-     * @tparam fused_activation Boolean that flags if its a fused activation or not
-     *
-     * @param[in] window Region on which to execute the kernel. (Must be a valid region of the window returned by window()).
-     */
-    template <bool fused_activation>
-    void batch_normalization_fp16_nchw(const Window &window);
-    /** Template function to run batch normalization on fp16 on tensors with NHWC format
-     *
-     * @tparam fused_activation Boolean that flags if its a fused activation or not
-     *
-     * @param[in] window Region on which to execute the kernel. (Must be a valid region of the window returned by window()).
-     */
-    template <bool fused_activation>
-    void batch_normalization_fp16_nhwc(const Window &window);
     /** Template function to run batch normalization on fp32
      *
+     * @tparam T                Specialization data type
      * @tparam fused_activation Boolean that flags if its a fused activation or not
      * @tparam F                Activation function functor to run
      *
      * @param[in] window Region on which to execute the kernel. (Must be a valid region of the window returned by window()).
      */
-    template <bool fused_activation, typename F>
-    void batch_normalization_fp32_nchw(const Window &window);
+    template <typename T, bool fused_activation, typename F>
+    void batch_normalization_nchw(const Window &window);
     /** Template function to run batch normalization on fp32 on tensors with NHWC format
      *
+     * @tparam T                Specialization data type
      * @tparam fused_activation Boolean that flags if its a fused activation or not
      * @tparam F                Activation function functor to run
      *
      * @param[in] window Region on which to execute the kernel. (Must be a valid region of the window returned by window()).
      */
-    template <bool fused_activation, typename F>
-    void batch_normalization_fp32_nhwc(const Window &window);
+    template <typename T, bool fused_activation, typename F>
+    void batch_normalization_nhwc(const Window &window);
     /** Common signature for all the batch normalization functions
      *
      * @param[in] window Region on which to execute the kernel.
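
The hunk above collapses the four per-precision member templates into one type-generic pair: the element type T moves into the template parameter list, so the fp16 and fp32 paths share a single implementation. A minimal sketch of how such a kernel could be selected at configure time; the configure_non_fused() helper and the _func member-function pointer are assumptions for illustration, not necessarily the library's actual internals:

    // Hedged sketch: dispatch the merged template on the tensor's element type.
    // detail::dummy<T, S> is the no-op activation functor from the second file in
    // this patch; 8 fp16 lanes / 4 fp32 lanes fill one 128-bit NEON register.
    void NEBatchNormalizationLayerKernel::configure_non_fused()
    {
        switch(_input->info()->data_type())
        {
    #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
            case DataType::F16:
                _func = &NEBatchNormalizationLayerKernel::batch_normalization_nchw<float16_t, false, detail::dummy<float16_t, 8>>;
                break;
    #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
            case DataType::F32:
                _func = &NEBatchNormalizationLayerKernel::batch_normalization_nchw<float, false, detail::dummy<float, 4>>;
                break;
            default:
                ARM_COMPUTE_ERROR("Element size not supported");
        }
    }
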
diff --git a/arm_compute/core/NEON/kernels/detail/NEActivationFunctionDetail.h b/arm_compute/core/NEON/kernels/detail/NEActivationFunctionDetail.h
index 4861559695..7945418ac5 100644
--- a/arm_compute/core/NEON/kernels/detail/NEActivationFunctionDetail.h
+++ b/arm_compute/core/NEON/kernels/detail/NEActivationFunctionDetail.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2020 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -45,6 +45,7 @@ struct dummy
     {
         ARM_COMPUTE_UNUSED(act_info);
     }
+
     /** Run activation function.
      *
      * @param[in] vval Vector of values.
@@ -53,6 +54,15 @@ struct dummy
     {
         ARM_COMPUTE_UNUSED(vval);
     }
+
+    /** Run activation function.
+     *
+     * @param[in] val Scalar value.
+     */
+    void operator()(T &val)
+    {
+        ARM_COMPUTE_UNUSED(val);
+    }
 };
 /** Linear activation object */
 template <typename T, int S>
@@ -68,8 +78,10 @@ struct linear
      * @param[in] act_info Activation layer information.
      */
     explicit linear(ActivationLayerInfo act_info)
-        : valpha(wrapper::vdup_n(static_cast<T>(act_info.a()), ExactTagType{})),
-          vbeta(wrapper::vdup_n(static_cast<T>(act_info.b()), ExactTagType{}))
+        : alpha(act_info.a()),
+          beta(act_info.b()),
+          valpha(wrapper::vdup_n(static_cast<T>(alpha), ExactTagType{})),
+          vbeta(wrapper::vdup_n(static_cast<T>(beta), ExactTagType{}))
     {
     }
 
@@ -79,13 +91,22 @@ struct linear
      */
     void operator()(ExactType &vval)
     {
-        vval = wrapper::vmla(vval, valpha, vbeta);
+        vval = wrapper::vmla(vbeta, vval, valpha);
     }
 
-    /** Vector of alphas. */
-    const ExactType valpha;
-    /** Vector of betas. */
-    const ExactType vbeta;
+    /** Run activation function.
+     *
+     * @param[in] val Scalar value.
+     */
+    void operator()(T &val)
+    {
+        val = alpha * val + beta;
+    }
+
+    const T         alpha;  /**< Scalar alpha */
+    const T         beta;   /**< Scalar beta */
+    const ExactType valpha; /**< Vector of alphas. */
+    const ExactType vbeta;  /**< Vector of betas. */
 };
 /** Square activation object */
 template <typename T, int S>
@@ -113,6 +134,15 @@ struct square
     {
         vval = wrapper::vmul(vval, vval);
     }
+
+    /** Run activation function.
+     *
+     * @param[in] val Scalar value.
+     */
+    void operator()(T &val)
+    {
+        val = val * val;
+    }
 };
 /** Logistic activation object */
 template <typename T, int S>
@@ -128,7 +158,7 @@ struct logistic
      * @param[in] act_info Activation layer information.
      */
     explicit logistic(ActivationLayerInfo act_info)
-        : vone(wrapper::vdup_n(static_cast<T>(1.f), ExactTagType{}))
+        : vone(wrapper::vdup_n(static_cast<T>(1), ExactTagType{}))
     {
         ARM_COMPUTE_UNUSED(act_info);
     }
@@ -142,6 +172,15 @@ struct logistic
         vval = wrapper::vinv(wrapper::vadd(vone, wrapper::vexpq(wrapper::vneg(vval))));
     }
 
+    /** Run activation function.
+     *
+     * @param[in] val Scalar value.
+     */
+    void operator()(T &val)
+    {
+        val = 1 / (1 + std::exp(-val));
+    }
+
     /** Vector of ones. */
     const ExactType vone;
 };
@@ -159,7 +198,7 @@ struct relu
      * @param[in] act_info Activation layer information.
      */
     explicit relu(ActivationLayerInfo act_info)
-        : vzero(wrapper::vdup_n(static_cast<T>(0.f), ExactTagType{}))
+        : vzero(wrapper::vdup_n(static_cast<T>(0), ExactTagType{}))
     {
         ARM_COMPUTE_UNUSED(act_info);
     }
@@ -173,6 +212,15 @@ struct relu
         vval = wrapper::vmax(vzero, vval);
     }
 
+    /** Run activation function.
+     *
+     * @param[in] val Scalar value.
+     */
+    void operator()(T &val)
+    {
+        val = std::max(static_cast<T>(0), val);
+    }
+
     /** Vector of zeroes. */
     const ExactType vzero;
 };
@@ -190,7 +238,8 @@ struct brelu
      * @param[in] act_info Activation layer information.
      */
     explicit brelu(ActivationLayerInfo act_info)
-        : vzero(wrapper::vdup_n(static_cast<T>(0.f), ExactTagType{})),
+        : alpha(act_info.a()),
+          vzero(wrapper::vdup_n(static_cast<T>(0), ExactTagType{})),
           valpha(wrapper::vdup_n(static_cast<T>(act_info.a()), ExactTagType{}))
     {
     }
@@ -204,10 +253,18 @@ struct brelu
         vval = wrapper::vmin(valpha, wrapper::vmax(vzero, vval));
     }
 
-    /** Vector of zeroes. */
-    const ExactType vzero;
-    /** Vector of alphas. */
-    const ExactType valpha;
+    /** Run activation function.
+     *
+     * @param[in] val Scalar value.
+     */
+    void operator()(T &val)
+    {
+        val = std::min(alpha, std::max(static_cast<T>(0), val));
+    }
+
+    const T         alpha;  /**< Scalar alpha */
+    const ExactType vzero;  /**< Vector of zeroes. */
+    const ExactType valpha; /**< Vector of alphas. */
 };
 /** Lower-Upper Bounded RELU activation object */
 template <typename T, int S>
@@ -223,7 +280,9 @@ struct lubrelu
      * @param[in] act_info Activation layer information.
      */
     explicit lubrelu(ActivationLayerInfo act_info)
-        : valpha(wrapper::vdup_n(static_cast<T>(act_info.a()), ExactTagType{})),
+        : alpha(act_info.a()),
+          beta(act_info.b()),
+          valpha(wrapper::vdup_n(static_cast<T>(act_info.a()), ExactTagType{})),
           vbeta(wrapper::vdup_n(static_cast<T>(act_info.b()), ExactTagType{}))
     {
     }
@@ -237,10 +296,19 @@ struct lubrelu
         vval = wrapper::vmin(valpha, wrapper::vmax(vbeta, vval));
     }
 
-    /** Vector of alphas. */
-    const ExactType valpha;
-    /** Vector of betas. */
-    const ExactType vbeta;
+    /** Run activation function.
+     *
+     * @param[in] val Scalar value.
+     */
+    void operator()(T &val)
+    {
+        val = std::min(alpha, std::max(beta, val));
+    }
+
+    const T         alpha;  /**< Scalar alpha */
+    const T         beta;   /**< Scalar beta */
+    const ExactType valpha; /**< Vector of alphas. */
+    const ExactType vbeta;  /**< Vector of betas. */
 };
 } // namespace detail
 } // namespace arm_compute
--
cgit v1.2.1
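
The scalar operator() overloads added throughout this header are what make the padding removal possible: a kernel can now process the bulk of each row with full NEON vectors and hand the leftover tail elements to the same functor one value at a time, instead of reading and writing past the row into padded borders. A self-contained sketch of that loop structure, using plain NEON intrinsics and a hypothetical relu functor rather than the library's templated wrappers:

    #include <arm_neon.h>
    #include <algorithm>
    #include <cstddef>

    // Illustrative stand-in for detail::relu<float, 4>: one overload per NEON
    // vector, one per scalar, so both loops below share the same logic.
    struct relu_f32x4
    {
        float32x4_t vzero = vdupq_n_f32(0.f);
        void operator()(float32x4_t &v) { v = vmaxq_f32(vzero, v); } // vector path
        void operator()(float &s)       { s = std::max(0.f, s); }    // scalar tail path
    };

    void apply_activation(float *data, std::size_t len)
    {
        relu_f32x4  act;
        std::size_t x = 0;
        // Vector loop over whole multiples of the vector length.
        for(; x + 4 <= len; x += 4)
        {
            float32x4_t v = vld1q_f32(data + x);
            act(v);
            vst1q_f32(data + x, v);
        }
        // Scalar loop over the leftover elements -- no padded border required.
        for(; x < len; ++x)
        {
            act(data[x]);
        }
    }
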